├── .babelrc
├── .env.example
├── .eslintignore
├── .eslintrc
├── .gitignore
├── .prettierrc
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── package-lock.json
├── package.json
├── src
│   ├── assets
│   │   └── img
│   │       ├── bubble-arrow.png
│   │       ├── fox-transparent.png
│   │       ├── fox.png
│   │       ├── icon-128.png
│   │       ├── icon-34.png
│   │       ├── spinner-orange.svg
│   │       ├── spinner.svg
│   │       ├── ss1.png
│   │       └── ss2.png
│   ├── components
│   │   ├── common
│   │   │   ├── Checkbox.js
│   │   │   ├── Error.jsx
│   │   │   ├── GlobalError.jsx
│   │   │   ├── HelpBar.jsx
│   │   │   ├── Input.jsx
│   │   │   ├── Loading.jsx
│   │   │   └── Pills.jsx
│   │   ├── fox
│   │   │   └── FoxSays.jsx
│   │   ├── newscrape
│   │   │   └── NewScrape.jsx
│   │   ├── openai
│   │   │   └── OpenAiKeyEntry.jsx
│   │   ├── pagination
│   │   │   └── Pagination.jsx
│   │   ├── perpage
│   │   │   └── PerPage.jsx
│   │   ├── prompt
│   │   │   └── InputPrompt.jsx
│   │   ├── report
│   │   │   └── Report.jsx
│   │   ├── scrape
│   │   │   ├── Results.jsx
│   │   │   ├── Scrape.css
│   │   │   ├── Scrape.jsx
│   │   │   ├── ScrapeStep.jsx
│   │   │   ├── UrlsStep.jsx
│   │   │   └── shared.js
│   │   └── share
│   │       └── Share.jsx
│   ├── containers
│   │   └── Greetings
│   │       └── Greetings.jsx
│   ├── lib
│   │   ├── ai.mjs
│   │   ├── browser.mjs
│   │   ├── cache.mjs
│   │   ├── constants.mjs
│   │   ├── controller.mjs
│   │   ├── csv.mjs
│   │   ├── errors.mjs
│   │   ├── gather.mjs
│   │   ├── gen.mjs
│   │   ├── job.mjs
│   │   ├── navigation.mjs
│   │   ├── report.mjs
│   │   ├── scrape.mjs
│   │   ├── share.mjs
│   │   ├── store.mjs
│   │   ├── templates.mjs
│   │   └── util.mjs
│   ├── manifest.json
│   ├── pages
│   │   ├── Background
│   │   │   └── index.js
│   │   ├── Content
│   │   │   ├── content.styles.css
│   │   │   ├── index.js
│   │   │   └── modules
│   │   │       └── print.js
│   │   ├── Devtools
│   │   │   ├── index.html
│   │   │   └── index.js
│   │   ├── Newtab
│   │   │   ├── Newtab.css
│   │   │   ├── Newtab.scss
│   │   │   ├── index.css
│   │   │   ├── index.html
│   │   │   └── index.jsx
│   │   ├── Options
│   │   │   ├── Options.css
│   │   │   ├── Options.tsx
│   │   │   ├── index.css
│   │   │   ├── index.html
│   │   │   └── index.jsx
│   │   ├── Panel
│   │   │   ├── Panel.css
│   │   │   ├── Panel.jsx
│   │   │   ├── index.css
│   │   │   ├── index.html
│   │   │   └── index.jsx
│   │   └── Popup
│   │       ├── Popup.css
│   │       ├── Popup.jsx
│   │       ├── index.css
│   │       ├── index.html
│   │       └── index.jsx
│   └── state
│       ├── errors.js
│       ├── gather.js
│       ├── jobs.js
│       ├── navigation.js
│       ├── openai.js
│       ├── storage.js
│       └── util.js
├── test
│   ├── data
│   │   ├── amazonsoap.1.gather.json
│   │   ├── amazonsoap.1.mjs
│   │   ├── ebay.1.gather.json
│   │   ├── ebay.1.mjs
│   │   ├── etsy.1.gather.json
│   │   ├── etsy.1.mjs
│   │   ├── hackernews-comments.1.scrape.json
│   │   ├── hackernews-comments.2.scrape.json
│   │   ├── hackernews-comments.3.scrape.json
│   │   ├── hackernews.1.gather.json
│   │   ├── linkedin.1.gather.json
│   │   ├── linkedin.1.mjs
│   │   ├── reddit.1.gather.json
│   │   ├── redfin.1.gather.json
│   │   ├── redfin.1.mjs
│   │   ├── wayfair.1.gather.json
│   │   └── zillow.1.gather.json
│   ├── testAiGather.mjs
│   └── testAiScrape.mjs
├── tsconfig.json
├── utils
│   ├── build.js
│   ├── env.js
│   └── webserver.js
└── webpack.config.js
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | // "@babel/preset-env"
4 | "@babel/preset-react"
5 | // "react-app"
6 | ],
7 | "plugins": [
8 | // "@babel/plugin-proposal-class-properties",
9 | ]
10 | }
11 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | SENTRY_AUTH_TOKEN=
2 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | src/lib/cache.mjs
2 |
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "react-app",
3 | "globals": {
4 | "chrome": "readonly"
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/ignore-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 |
6 | # testing
7 | /coverage
8 |
9 | # production
10 | /build
11 |
12 | # zip
13 | /zip
14 |
15 | # misc
16 | .DS_Store
17 | .env.production
18 | .env.dev
19 |
20 | .env.local
21 | .env.development.local
22 | .env.test.local
23 | .env.production.local
24 | .history
25 |
26 | # secrets
27 | secrets.*.js
28 |
29 | Makefile
30 |
31 | # Sentry Config File
32 | .env.sentry-build-plugin
33 |
34 | .plasmo
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "singleQuote": true,
3 | "trailingComma": "es5",
4 | "requirePragma": false,
5 | "arrowParens": "always"
6 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "editor.formatOnSave": true
3 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Hi! I'm FetchFox
4 |
5 | FetchFox is an AI-powered web scraper. It takes the raw text of a website and uses AI to extract the data the user is looking for. It runs as a Chrome extension, and the user describes the desired data in plain English.
6 |
7 | You can use FetchFox to quickly gather data: build a list of leads, assemble research data, or scope out a market segment.
8 |
9 | By scraping raw text with AI, FetchFox lets you circumvent anti-scraping measures on sites like LinkedIn and Facebook. Even complicated HTML structures are possible to parse with FetchFox.
10 |
11 | # Get the extension
12 |
13 | You can get the extension for free in the [Chrome Extension Store](https://chromewebstore.google.com/detail/fetchfox/meieeikgpmlhmfjmjgciiclgmbcocfnk?authuser=0&hl=en).
14 |
15 | # Contributing
16 |
17 | Contributions are welcome! Please be aware the extension is under active construction. [Join us on Discord](https://discord.gg/mM54bwdu59) for more info.
18 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "fetchfox",
3 | "version": "1.0.50",
4 | "description": "FetchFox lets you scrape any site for any data",
5 | "license": "none",
6 | "repository": {
7 | "type": "git",
8 | "url": "https://github.com/fetchfox/fetchfox.git"
9 | },
10 | "scripts": {
11 | "build": "node utils/build.js",
12 | "build-dev": "NODE_ENV=dev node utils/build.js",
13 | "start": "node utils/webserver.js",
14 | "prettier": "prettier --write '**/*.{js,jsx,ts,tsx,json,css,scss,md}'"
15 | },
16 | "dependencies": {
17 | "@fortawesome/react-fontawesome": "^0.2.0",
18 | "@plasmohq/storage": "^1.12.0",
19 | "@sentry/react": "^8.32.0",
20 | "@sentry/webpack-plugin": "^2.22.4",
21 | "dotenv": "^16.4.5",
22 | "export-to-csv": "^1.3.0",
23 | "js-sha256": "^0.11.0",
24 | "json5": "^2.2.3",
25 | "jsonic": "^1.0.1",
26 | "openai": "^4.29.2",
27 | "openai-tokens": "^2.3.3",
28 | "pdfjs-dist": "^4.6.82",
29 | "radash": "^12.1.0",
30 | "react": "^18.2.0",
31 | "react-copy-to-clipboard": "^5.1.0",
32 | "react-dom": "^18.2.0",
33 | "react-expanding-textarea": "^2.3.6",
34 | "react-icons": "^5.0.1"
35 | },
36 | "devDependencies": {
37 | "@babel/core": "^7.20.12",
38 | "@babel/plugin-proposal-class-properties": "^7.18.6",
39 | "@babel/preset-env": "^7.20.2",
40 | "@babel/preset-react": "^7.18.6",
41 | "@pmmmwh/react-refresh-webpack-plugin": "^0.5.10",
42 | "@types/chrome": "^0.0.202",
43 | "@types/react": "^18.0.26",
44 | "@types/react-dom": "^18.0.10",
45 | "babel-eslint": "^10.1.0",
46 | "babel-loader": "^9.1.2",
47 | "babel-preset-react-app": "^10.0.1",
48 | "clean-webpack-plugin": "^4.0.0",
49 | "copy-webpack-plugin": "^11.0.0",
50 | "css-loader": "^6.7.3",
51 | "eslint": "^8.31.0",
52 | "eslint-config-react-app": "^7.0.1",
53 | "eslint-plugin-flowtype": "^8.0.3",
54 | "eslint-plugin-import": "^2.27.4",
55 | "eslint-plugin-jsx-a11y": "^6.7.1",
56 | "eslint-plugin-react": "^7.32.0",
57 | "eslint-plugin-react-hooks": "^4.6.0",
58 | "file-loader": "^6.2.0",
59 | "fs-extra": "^11.1.0",
60 | "html-loader": "^4.2.0",
61 | "html-webpack-plugin": "^5.5.0",
62 | "mocha": "^10.4.0",
63 | "prettier": "^2.8.3",
64 | "react-refresh": "^0.14.0",
65 | "react-refresh-typescript": "^2.0.7",
66 | "sass": "^1.57.1",
67 | "sass-loader": "^13.2.0",
68 | "source-map-loader": "^3.0.1",
69 | "style-loader": "^3.3.1",
70 | "terser-webpack-plugin": "^5.3.6",
71 | "ts-loader": "^9.4.2",
72 | "type-fest": "^3.5.2",
73 | "typescript": "^4.9.4",
74 | "webpack": "^5.75.0",
75 | "webpack-cli": "^4.10.0",
76 | "webpack-dev-server": "^4.11.1",
77 | "zip-webpack-plugin": "^4.0.1"
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/assets/img/bubble-arrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/bubble-arrow.png
--------------------------------------------------------------------------------
/src/assets/img/fox-transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/fox-transparent.png
--------------------------------------------------------------------------------
/src/assets/img/fox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/fox.png
--------------------------------------------------------------------------------
/src/assets/img/icon-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/icon-128.png
--------------------------------------------------------------------------------
/src/assets/img/icon-34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/icon-34.png
--------------------------------------------------------------------------------
/src/assets/img/spinner-orange.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/assets/img/spinner.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/assets/img/ss1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/ss1.png
--------------------------------------------------------------------------------
/src/assets/img/ss2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/assets/img/ss2.png
--------------------------------------------------------------------------------
/src/components/common/Checkbox.js:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { IoIosCheckmark } from "react-icons/io";
3 | import {
4 | IoCheckmarkCircle,
5 | IoEllipseOutline,
6 | } from 'react-icons/io5';
7 | import { mainColor } from '../../lib/constants.mjs';
8 |
9 |
10 | export const Checkbox = ({ size, checked, disabled, onClick, children }) => {
11 | size ||= 18;
12 | return (
13 |
onChange(child.key)}
10 | key={child.key}
11 | >
12 | {child}
13 |
14 | );
15 | });
16 |
17 | return (
18 | updateMirror([['urls.perPage', val]])}
205 | />
206 | );
207 |
208 | const questionNode = (
209 |
210 |
What kinds of {job?.urls.action == 'gather' ? 'links' : 'items' } should we look for?
211 |
updateMirror([['urls.question', e.target.value]])} />
222 |
223 | );
224 |
225 | const gatherNode = (
226 |
227 |
Find links on current page
228 |
229 |
230 |
238 | {mirror?.urls.currentUrl}
239 |
240 |
241 |
242 |
243 | {questionNode}
244 | {perPageNode}
245 |
246 |
247 |
252 | Run Only Crawl
253 |
254 |
255 |
256 | );
257 |
258 | const currentNode = (
259 |
260 |
We will only scrape the current page
261 |
262 |
270 | {mirror?.urls.currentUrl}
271 |
272 |
273 |
274 |
updateMirror([['urls.pagination', val]])}
277 | follow={mirror?.urls.pagination?.follow || false}
278 | count={mirror?.urls.pagination?.count || 0}
279 | />
280 |
281 | {/*{JSON.stringify(job.urls?.pagination, null, 2)} */}
282 |
283 | {questionNode}
284 | {perPageNode}
285 |
286 | );
287 |
288 | const manualNode = (
289 |
290 |
Enter the URLs you would like to scrape (one per row)
291 |
292 |
309 |
310 | {questionNode}
311 | {perPageNode}
312 |
313 | );
314 |
315 | if (!job || !mirror) return null;
316 |
317 | return (
318 |
319 |
320 | {/*
321 |
job
322 |
{JSON.stringify(job?.urls, null, 2)}
323 |
mirror
324 |
{JSON.stringify(mirror?.urls, null, 2)}
325 | */}
326 |
327 |
What page do you want to scrape?
328 |
handleAction(val)}>
329 | Current Page Only
330 | Linked Pages
331 | Manually Enter URLs
332 |
333 |
334 | {mirror?.urls.action == 'current' && currentNode}
335 | {mirror?.urls.action == 'gather' && gatherNode}
336 | {mirror?.urls.action == 'manual' && manualNode}
337 |
338 |
339 |
340 | );
341 | };
342 |
--------------------------------------------------------------------------------
/src/components/scrape/shared.js:
--------------------------------------------------------------------------------
1 | import { getActiveTab } from '../../lib/navigation.mjs';
2 |
3 | export const blankJob = {
4 | id: 'draft',
5 | urls: {
6 | action: 'gather',
7 | url: '',
8 | question: '',
9 | list: [],
10 | },
11 | scrape: {
12 | action: 'scrape',
13 | questions: [''],
14 | },
15 | };
16 |
17 | export const mainStyle = {
18 | padding: 10,
19 | paddingBottom: 100,
20 | color: 'white',
21 | width: '100%',
22 | };
23 |
24 | export const stepStyle = {
25 | borderRadius: 5,
26 | padding: 10,
27 | background: '#fff2',
28 | marginBottom: 20,
29 | };
30 |
31 | export const stepHeaderStyle = {
32 | fontSize: 18,
33 | fontWeight: 'bold',
34 | marginBottom: 10,
35 | };
36 |
37 | export const smallButtonStyle = {
38 | fontSize: 12,
39 | };
40 |
41 | export const maybeOpenPanel = async (job) => {
42 | let shouldOpen = true;
43 |
44 | if (!(job.scrape?.concurrency < 0)) shouldOpen = false;
45 | if (job.urls?.action == 'current') shouldOpen = false;
46 | if (job.urls?.pagination?.follow) shouldOpen = true;
47 |
48 | if (shouldOpen) openPanel();
49 | }
50 |
51 | export const openPanel = async () => {
52 | const activeTab = await getActiveTab();
53 | chrome.sidePanel.open(
54 | { windowId: activeTab.windowId },
55 | () => {
56 | // TODO: remove need for setTimeout
57 | setTimeout(
58 | () => { window.close() },
59 | 50);
60 | });
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/src/components/share/Share.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { FaShareFromSquare } from 'react-icons/fa6';
3 | import { shareResults } from '../../lib/share.mjs';
4 | import { bgColor } from '../../lib/constants.mjs';
5 | import { CopyToClipboard } from 'react-copy-to-clipboard';
6 | import { LuCopy, LuCopyCheck } from 'react-icons/lu';
7 | import { Loading } from '../common/Loading';
8 |
9 | const ShareModal = ({ id, onDone }) => {
10 | const [copied, setCopied] = useState();
11 |
12 | const handleCopy = () => {
13 | setCopied(true);
14 | setTimeout(() => setCopied(false), 2000);
15 | }
16 |
17 | const url = 'https://fetchfoxai.com/s/' + id;
18 |
19 | let body;
20 | if (id == 'loading') {
21 | body =
;
22 | } else {
23 | body = (
24 |
33 |
34 | {url}
35 |
36 |
37 |
39 |
40 | Copy{' '}
41 | {copied && }
42 | {!copied && }
43 |
44 |
45 |
46 |
47 | );
48 | }
49 |
50 | return (
51 |
64 |
65 |
e.stopPropagation()}
73 | >
74 | {body}
75 |
76 |
77 | );
78 | }
79 |
80 | export const Share = ({ job }) => {
81 | const [id, setId] = useState();
82 |
83 | const handleShare = async () => {
84 | setId('loading');
85 | const { id } = await shareResults(job);
86 | setId(id);
87 | }
88 |
89 | return (
90 |
91 | {id && setId(null)} />}
92 |
97 | Share Results
98 |
99 |
100 | );
101 | }
102 |
--------------------------------------------------------------------------------
/src/containers/Greetings/Greetings.jsx:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import icon from '../../assets/img/icon-128.png';
3 |
4 | class GreetingComponent extends Component {
5 | state = {
6 | name: 'dev',
7 | };
8 |
9 | render() {
10 | return (
11 |
12 |
Hello, {this.state.name}!
13 |
14 |
15 | );
16 | }
17 | }
18 |
19 | export default GreetingComponent;
20 |
--------------------------------------------------------------------------------
/src/lib/ai.mjs:
--------------------------------------------------------------------------------
1 | import jsonic from 'jsonic';
2 | import JSON5 from 'json5'
3 | import { createClient } from 'openai-tokens';
4 | import OpenAI from 'openai';
5 | import { apiHost } from './constants.mjs';
6 | import { setKey, getKey, setStatus } from './store.mjs';
7 | import { getCache, setCache } from './cache.mjs';
8 | import { getRoundId } from './controller.mjs';
9 | import { setGlobalError } from './errors.mjs';
10 | import { getTemplate } from './templates.mjs';
11 | import { sleep, parseJsonl } from './util.mjs';
12 |
13 | const recommendModel = 'gpt-4o-mini';
14 | const queryLogCutoff = 1; // 1 minute
15 | let queryLog = [];
16 | let observedRateLimit = 1000000; // tpm, a guess that adjusts
17 | let tpmTimeoutId;
18 |
19 | export async function estimateTokens(prompt) {
20 | // TODO: more accurate token estimation
21 | return prompt.length / 4;
22 | }
23 |
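// Run a named prompt template with caching and retries. Free-plan requests go
// through the FetchFox mirror API; otherwise the user's OpenAI key is used,
// with adaptive rate limiting based on observedRateLimit.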
24 | export async function exec(name, args, cb, modelOverride) {
25 | const model = modelOverride ? modelOverride : await getModel();
26 | const plan = await getKey('openAiPlan');
27 |
28 | const keys = [model, plan, name, args];
29 | const cached = await getCache('ai', keys);
30 | if (cached) return cached;
31 |
32 | let answer;
33 | let askAI;
34 |
35 | if (plan == 'free') {
36 | console.log('AI using Free');
37 |
38 | // Run via mirror API
39 | askAI = async (name, args) => {
40 | const url = apiHost + '/api/mirror';
41 | const body = JSON.stringify({ template: name, ...args });
42 | console.log('making mirror request', url, name);
43 | const resp = await fetch(url, { method: 'POST', body });
44 | const data = await resp.json();
45 | console.log('got data response', data);
46 | if (data.error) {
47 | throw data.error;
48 | }
49 |
50 | return {
51 | answer: data.answer,
52 | usage: data.usage,
53 | }
54 | }
55 |
56 | } else {
57 | console.log('AI using OpenAI key');
58 |
59 | // Run via user's API key
60 | askAI = async (name, args) => {
61 | const prompt = render(name, args);
62 | console.log('Sending prompt to openai:', prompt);
63 | console.log('modelOverride?', modelOverride);
64 | const resp = await stream(
65 | prompt,
66 | (text) => cb && cb(parseAnswer(text)),
67 | model);
68 |
69 | return { answer: resp.result, usage: resp.usage };
70 | }
71 | }
72 |
73 | let retries = 3;
74 | while (true) {
75 | const rate = await checkRateLimit(render(name, args), plan);
76 |
77 | setKey('tpm', rate);
78 | if (tpmTimeoutId) { clearTimeout(tpmTimeoutId); tpmTimeoutId = null; }
79 | tpmTimeoutId = setTimeout(() => setKey('tpm', null), 15000);
80 | console.log('Check rate limit gave', rate, 'limit:', observedRateLimit);
81 |
82 | let hitRateLimit = false;
83 | let resp;
84 |
85 | try {
86 | resp = await askAI(name, args);
87 | console.log('AI resp:', resp);
88 |
89 | } catch(e) {
90 | console.error('AI error:', e);
91 | console.log('AI error retries left:', retries);
92 |
93 | if (e.code == 'insufficient_quota') {
94 | setGlobalError(
95 | 'You have no OpenAI quota. Add credits, or switch to the FetchFox backend.');
96 | return;
97 |
98 | } else if (e.code == 'rate_limit_exceeded' && retries > 0) {
99 | observedRateLimit *= 0.9;
100 | console.log('Query rate limit hit, set new observedRateLimit:', observedRateLimit);
101 | hitRateLimit = true;
102 |
103 | } else if (e.code == 'rate_limit_exceeded' && plan == 'free' && retries <= 0) {
104 |
105 | console.error('Too many errors, giving up on AI query');
106 | setGlobalError(
107 | 'High load! ' +
108 | 'Please try again later, or enter your OpenAI API key in settings.');
109 | return;
110 |
111 | } else {
112 | setGlobalError('' + e);
113 | return;
114 | }
115 | }
116 |
117 | if (hitRateLimit) {
118 | console.log('Check rate limit RETRY');
119 | setStatus('AI rate limit hit, slowing down...');
120 | retries--;
121 | await sleep(2000);
122 |
123 | } else {
124 | answer = resp.answer;
125 | await addUsage(resp.usage);
126 | // Slowly grow rate limit until we hit it again
127 | observedRateLimit *= 1.005;
128 | break;
129 | }
130 | }
131 |
132 | const out = parseAnswer(answer);
133 | setCache('ai', keys, out);
134 | return out;
135 | }
136 |
137 | export async function stream(prompt, cb, model) {
138 | const openai = new OpenAI({
139 | apiKey: await getKey('openAiKey'),
140 | dangerouslyAllowBrowser: true,
141 | });
142 |
143 | console.log('Using model:', model);
144 |
145 | const stream = await openai.chat.completions.create({
146 | model,
147 | messages: [{ role: 'user', content: prompt }],
148 | stream: true,
149 | stream_options: { include_usage: true },
150 | });
151 |
152 | let result = '';
153 | let usage;
154 | for await (const chunk of stream) {
155 | if (chunk.usage) {
156 | usage = chunk.usage;
157 | }
158 |
159 | if (chunk.choices?.length) {
160 | const delta = chunk.choices[0].delta.content;
161 | if (delta) result += delta;
162 | cb && cb(result);
163 | }
164 | }
165 |
166 | console.log('AI gave result:', result);
167 | console.log('AI gave stream:', stream)
168 | console.log('clip final result', result, usage);
169 |
170 | const out = { result, usage };
171 | return out;
172 | }
173 |
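// Block until the estimated tokens for this prompt fit under the observed
// tokens-per-minute budget, then record the query and return the current rate.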
174 | const checkRateLimit = async (prompt, plan) => {
175 | // Rate limit check
176 | const count = await estimateTokens(prompt);
177 | let timestamp;
178 | let total;
179 | while (true) {
180 | timestamp = Math.floor((new Date()).getTime() / 1000);
181 | queryLog = queryLog.filter(q => q.timestamp >= timestamp - (queryLogCutoff * 60));
182 | total = queryLog.reduce((acc, q) => acc + q.count, 0);
183 |
184 | // If below rate limit, continue with query
185 | if ((total + count) / queryLogCutoff < observedRateLimit) break;
186 |
187 | setStatus('Waiting for AI rate limit...');
188 | console.log('Check rate limit WAITING');
189 |
190 | await sleep(3000);
191 | }
192 |
193 | queryLog.push({ count, timestamp });
194 | const rate = Math.round((total + count) / queryLogCutoff);
195 |
196 | console.log('Query rate count:', rate, 'tokens per', queryLogCutoff, 'minutes');
197 |
198 | return rate;
199 | }
200 |
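// Fill in the named prompt template, replacing each {{key}} placeholder with args[key].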
201 | const render = (name, args) => {
202 | const template = getTemplate(name);
203 | let prompt = template;
204 | for (const key of Object.keys(args)) {
205 | const val = (args[key] || '');
206 | prompt = prompt.replaceAll('{{' + key + '}}', val);
207 | }
208 | return prompt;
209 | }
210 |
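// Best-effort parse of a (possibly partial) model response: strip code fences,
// try JSON5, then fall back to JSONL.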
211 | const parseAnswer = (text) => {
212 | if (!text) return;
213 | const clean = text
214 | .replace(/```jsonl?/, '')
215 | .replace('```', '')
216 | .replaceAll(/^`+|`+$/g, '');
217 |
218 | // Try to parse it as JSON
219 | try {
220 | return JSON5.parse(clean);
221 | } catch(e) {
222 | // It was not JSON
223 | }
224 |
225 | // Try to parse it as JSONL
226 | let data;
227 | try {
228 | data = parseJsonl(clean);
229 | } catch (e) {
230 | console.warn('Unable to parse partial response:', clean, e);
231 | }
232 | if (data && data.length > 0) {
233 | return data;
234 | }
235 |
236 | // We don't know what it is, return null
237 | return null;
238 | }
239 |
240 | export const getAvailableModels = async () => {
241 | const apiKey = await getKey('openAiKey');
242 |
243 | if (!apiKey) {
244 | return [];
245 | }
246 |
247 | console.log('fetch models with', apiKey);
248 | const resp = await fetch(
249 | 'https://api.openai.com/v1/models',
250 | { headers: {Authorization: 'Bearer ' + apiKey }});
251 | const data = await resp.json();
252 |
253 | if (!data || !data.data) {
254 | setGlobalError(`We couldn't find any available models. Double check your API key`);
255 | return [];
256 | }
257 |
258 | const models = data.data.map(m => m.id)
259 | .sort((a, b) => {
260 | if (a == recommendModel) return -1;
261 | if (b == recommendModel) return 1;
262 |
263 | const [ma, mb] = [
264 | a.match(/gpt-([0-9]+)/),
265 | b.match(/gpt-([0-9]+)/)];
266 |
267 | if (ma && !mb) return -1;
268 | if (!ma && mb) return 1;
269 | if (!ma && !mb) return b.localeCompare(a);
270 | return parseInt(mb[1]) - parseInt(ma[1]);
271 | });
272 |
273 | console.log('Available models:', models);
274 |
275 | return models
276 | }
277 |
278 | export const getModel = async () => {
279 | const model = await getKey('model');
280 |
281 | if (model) {
282 | return model;
283 | }
284 |
285 | const models = await getAvailableModels();
286 | const use = models[0];
287 | console.log('Setting model:', use);
288 | setKey('model', use);
289 | return use;
290 | }
291 |
292 | const addUsage = async (usage) => {
293 | const roundId = await getRoundId();
294 | const key = 'roundUsage_' + roundId;
295 | console.log('Set usage for:', key);
296 | const current = await getKey(key) || { prompt: 0, completion: 0, total: 0 };
297 | console.log('Got previous usage:', current);
298 | console.log('Adding new usage:', usage);
299 | current.prompt += usage.prompt_tokens;
300 | current.completion += usage.completion_tokens;
301 | current.total += usage.total_tokens;
302 | console.log('Setting new usage', key, current);
303 | return setKey(key, current);
304 | }
305 |
--------------------------------------------------------------------------------
/src/lib/browser.mjs:
--------------------------------------------------------------------------------
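// Promise wrappers around chrome.tabs callbacks that resolve instead of
// throwing when a tab is missing.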
1 | export const checkIfTabExists = async (tabId) => {
2 | return new Promise((ok) => {
3 | chrome.tabs.get(tabId, () => ok(!chrome.runtime.lastError))
4 | });
5 | }
6 |
7 | export const closeTabIfExists = async (tabId) => {
8 | return new Promise((ok) => {
9 | chrome.tabs.get(tabId, (tab) => {
10 | if (chrome.runtime.lastError) {
11 | return;
12 | }
13 | chrome.tabs.remove(tab.id, ok);
14 | })
15 | });
16 | }
17 |
18 | export const getTabUrl = async (tabId) => {
19 | return new Promise((ok) => {
20 | chrome.tabs.get(tabId, (tab) => {
21 | if (chrome.runtime.lastError || !tab) {
22 | ok(null)
23 | } else {
24 | ok(tab.url)
25 | }
26 | })
27 | });
28 | }
29 |
--------------------------------------------------------------------------------
/src/lib/cache.mjs:
--------------------------------------------------------------------------------
1 | import { sha256 } from 'js-sha256';
2 | import { getKey, setKey } from './store.mjs';
3 |
4 | const getTtl = (part) => {
5 | return 24 * 3600;
6 | }
7 |
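// Build a storage key from the part name and key material: a truncated
// readable slug plus a truncated SHA-256 digest.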
8 | export const cacheKey = (part, keys) => {
9 | const keyStr = JSON.stringify(keys);
10 | return `${part}-${keyStr.replace(/[^A-Za-z0-9]+/g, '-').substr(0, 40)}-${sha256(part + keyStr).substr(0, 40)}`;
11 | }
12 |
13 | export const getCache = async (part, keys) => {
14 | const key = cacheKey(part, keys);
15 | const cache = (await getKey('cache')) || {};
16 | const data = cache[key];
17 | if (!data) return;
18 | if (Date.now() > data.expiresAt || data.val == undefined) {
19 | delete cache[key];
20 | setKey('cache', cache);
21 | return;
22 | }
23 |
24 | console.log('cache hit', key, data);
25 |
26 | return data.val;
27 | }
28 |
29 | export const setCache = async (part, keys, val) => {
30 | const key = cacheKey(part, keys);
31 |
32 | console.log('set cache', part, keys, key, val);
33 |
34 | const ttl = getTtl(part)
35 | const cache = (await getKey('cache')) || {};
36 | cache[key] = { val, expiresAt: Date.now() + ttl * 1000};
37 |
38 | // Expire items based on TTL
39 | for (const k of Object.keys(cache)) {
40 | const data = cache[k];
41 | if (Date.now() > data.expiresAt || data.val == undefined) {
42 | delete cache[k];
43 | }
44 | }
45 |
46 | return await setKey('cache', cache);
47 | }
48 |
--------------------------------------------------------------------------------
/src/lib/constants.mjs:
--------------------------------------------------------------------------------
1 | export const mainColor = '#df6546';
2 | export const bgColor = '#282c34';
3 | export const errorColor = '#e00';
4 |
5 | export const apiHost = 'https://fetchfoxai.com';
6 |
7 | export const discordUrl = 'https://discord.gg/mM54bwdu59';
8 | export const gitHubIssuesUrl = 'https://github.com/fetchfox/fetchfox/issues';
9 |
10 | export const sentryDsn = 'https://a049c0bd8d2be740747cdc18c9a1198f@o4507944397570048.ingest.us.sentry.io/4507944399077376';
11 |
--------------------------------------------------------------------------------
/src/lib/controller.mjs:
--------------------------------------------------------------------------------
1 | import { useEffect, useState } from 'react';
2 | import { stopActiveJob } from './store.mjs';
3 |
4 | let listeners = [];
5 |
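// React hook returning the current round ID, kept in sync via chrome.storage change events.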
6 | export const useRoundId = () => {
7 | const [roundId, setRoundId] = useState(null);
8 |
9 | const update = (changes) => {
10 | if (changes.roundId) {
11 | setRoundId(changes.roundId.newValue);
12 | }
13 | };
14 |
15 | useEffect(() => {
16 | getRoundId().then(setRoundId);
17 | chrome.storage.onChanged.addListener(update);
18 | return () => chrome.storage.onChanged.removeListener(update);
19 | }, []);
20 |
21 | return roundId;
22 | }
23 |
24 | export const getRoundId = async () => {
25 | return chrome.storage.local.get('roundId')
26 |   .then(r => {
27 |     return r['roundId'] || 1;
28 |   });
29 | }
30 |
31 | export const isActive = async (r) => {
32 | return r == await getRoundId();
33 | }
34 |
35 | export const addListener = async (f) => {
36 | if (listeners.includes(f)) {
37 | return;
38 | }
39 | listeners.push(f);
40 | }
41 |
42 | export const removeListener = async (f) => {
43 | const index = listeners.indexOf(f);
44 | if (index == -1) return;
45 | listeners.splice(index, 1);
46 | }
47 |
48 | export const runStopListeners = () => {
49 | listeners.forEach(l => l());
50 | listeners = [];
51 | }
52 |
53 | export const advanceRound = async () => {
54 | const roundId = await getRoundId();
55 |
56 | const changes = {
57 | inFlight: 0,
58 | roundId: roundId + 1,
59 | };
60 |
61 | await chrome.storage.local.set(changes);
62 |
63 | runStopListeners();
64 |
65 | return stopActiveJob();
66 | }
67 |
--------------------------------------------------------------------------------
/src/lib/csv.mjs:
--------------------------------------------------------------------------------
1 | import {
2 | mkConfig,
3 | generateCsv,
4 | download,
5 | } from 'export-to-csv';
6 |
7 | export const downloadJobCsv = (job) => {
8 | console.log('downloadJobCsv ->', job);
9 |
10 | const filename = 'FetchFox - ' + job.name;
11 | const csvConfig = mkConfig({ useKeysAsHeaders: true, filename });
12 | const rows = toRows(job);
13 | console.log('CSV rows:', rows);
14 | const csv = generateCsv(csvConfig)(rows);
15 | download(csvConfig)(csv);
16 | }
17 |
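// Flatten job results into CSV rows: a header row (answerHeaders, or the union
// of answer keys), then one row per answer item per target.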
18 | export const toRows = (job) => {
19 | if (!job?.results?.targets) return [[]];
20 |
21 | let answerHeaders;
22 | if (job.results?.answerHeaders) {
23 | answerHeaders = job.results?.answerHeaders;
24 | } else {
25 | const answerNames = {};
26 | for (const target of job.results.targets) {
27 | for (const key of Object.keys(target.answer || {})) {
28 | answerNames[key] = true;
29 | }
30 | }
31 | answerHeaders = Object.keys(answerNames);
32 | }
33 |
34 | const headers = [
35 | 'URL',
36 | 'Link Text',
37 | 'Status',
38 | ...(answerHeaders),
39 | ];
40 |
41 | const rows = [headers];
42 |
43 | for (const target of job.results.targets) {
44 | const answer = target.answer || [{}];
45 | for (const a of answer) {
46 | const answerValues = answerHeaders.map(h => a[h]);
47 | const row = [
48 | target.url,
49 | target.text,
50 | target.status,
51 | ...answerValues,
52 | ];
53 | rows.push(row);
54 | }
55 | }
56 |
57 | return rows;
58 | }
59 |
--------------------------------------------------------------------------------
/src/lib/errors.mjs:
--------------------------------------------------------------------------------
1 | import * as Sentry from '@sentry/react';
2 | import { sentryDsn } from './constants.mjs';
3 | import { setKey, getKey } from './store.mjs';
4 |
5 | let timeoutId = null;
6 |
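// Surface an error banner to the UI; it auto-clears after 5 seconds.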
7 | export const setGlobalError = async (message) => {
8 | if (timeoutId) clearTimeout(timeoutId);
9 | timeoutId = setTimeout(clearGlobalError, 5000);
10 | return setKey('globalError', { message });
11 | }
12 |
13 | export const clearGlobalError = async () => {
14 | return setKey('globalError', null);
15 | }
16 |
17 | export const getGlobalError = async () => {
18 | return getKey('globalError');
19 | }
20 |
21 | export const initSentry = () => {
22 | // https://stackoverflow.com/questions/12830649/check-if-chrome-extension-installed-in-unpacked-mode
23 | if (!('update_url' in chrome.runtime.getManifest())) {
24 | return;
25 | }
26 |
27 | Sentry.init({
28 | dsn: sentryDsn,
29 | beforeSend(event, hint) {
30 | const err = hint.originalException;
31 | console.error(err);
32 | setGlobalError('We noticed an error: ' + err.message);
33 | return event;
34 | }
35 | });
36 | }
37 |
--------------------------------------------------------------------------------
/src/lib/gather.mjs:
--------------------------------------------------------------------------------
1 | import { stream, exec } from './ai.mjs';
2 | import { sleep, shuffle } from './util.mjs';
3 | import {
4 | getRoundId,
5 | isActive,
6 | addListener,
7 | removeListener,
8 | } from './controller.mjs';
9 | import { getActiveJob, setJobResults, setStatus } from './store.mjs';
10 | import { getCache, setCache } from './cache.mjs';
11 | import { gatherTemplate } from './templates.mjs';
12 |
13 |
14 | export const cleanLinks = (l) => {
15 | const clean = [];
16 | const seen = {};
17 | console.log('clean links:', l);
18 | for (let item of l) {
19 | if (!item.url) {
20 | console.warn('got invalid link:', item);
21 | continue;
22 | }
23 |
24 | // Strip anchor fragments for now. May want to revisit this later.
25 | item.url = item.url.split('#')[0];
26 | clean.push(item);
27 | }
28 | return clean;
29 | }
30 |
31 | export const dedupeLinks = (l) => {
32 | const u = [];
33 | const seen = {};
34 | for (let item of cleanLinks(l)) {
35 | if (seen[item.url]) continue;
36 | seen[item.url] = true;
37 | u.push(item);
38 | }
39 | return u;
40 | }
41 |
42 | const chunkList = (list, maxBytes) => {
43 | const chunks = [];
44 | let current = [];
45 | for (let item of list) {
46 | current.push(item);
47 | if (JSON.stringify(current, null, 2).length > maxBytes) {
48 | chunks.push(current);
49 | current = [];
50 | }
51 | }
52 | if (current.length) {
53 | chunks.push(current);
54 | }
55 | return chunks;
56 | };
57 |
58 | const slimmer = item => ({
59 | id: item.id,
60 | html: item.html.substr(0, 200),
61 | text: item.text,
62 | url: item.url,
63 | });
64 |
65 | const expander = (page, item) => {
66 | const m = page.links.filter(x => x.id == item.id);
67 | return m.length > 0 ? m[0] : item;
68 | }
69 |
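// Find pagination links on a page: sort likely candidates first, ask the AI
// over chunks of links, then re-run on the combined result. Cached per URL.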
70 | export const findPagination = async (page) => {
71 | const roundId = await getRoundId();
72 |
73 | const cached = await getCache('pagination', [page.url]);
74 | if (cached) return cached;
75 |
76 | const likelyPagination = (url) => {
77 | const regexes = [
78 | /page=/i,
79 | /offset=/i,
80 | /start=/i,
81 | /p=\d+/i,
82 | /pg=\d+/i,
83 | /page-\d+/i,
84 | /page_\d+/i,
85 | /start=\d+/i,
86 | /(\?|&)pageno=\d+/i,
87 | /(\?|&)paging=\d+/i,
88 | /(\?|&)limitstart=\d+/i,
89 | /(\?|&)skip=\d+/i,
90 | /part=\d+/i,
91 | /section=\d+/i,
92 | /count=\d+/i,
93 | ];
94 | for (const regex of regexes) {
95 | if (url.match(regex)) return true;
96 | }
97 | return false;
98 | };
99 | const links = JSON.parse(JSON.stringify(page.links));
100 | links.sort((a, b) => {
101 | const [la, lb] = [likelyPagination(a.url), likelyPagination(b.url)];
102 | if ((la && lb) || (!la && !lb)) return 0;
103 | if (la) return -1;
104 | if (lb) return 1;
105 | });
106 | console.log('pagination sorted links:', links);
107 |
108 | const limit = 10000;
109 | const chunked = chunkList(links.map(slimmer), limit);
110 |
111 | let next = [];
112 | let pages = [];
113 |
114 | const max = Math.min(4, chunked.length);
115 |
116 | for (let i = 0; i < max; i++) {
117 | if (!await isActive(roundId)) break;
118 | const chunk = chunked[i];
119 | console.log('find pagination from chunk:', chunk);
120 |
121 | const answer = await exec(
122 | 'pagination',
123 | { list: JSON.stringify(chunk.map(slimmer), null, 2) });
124 |
125 | console.log('ai pagination gave answer:', answer);
126 |
127 | if (answer.hasPagination != 'yes') continue;
128 |
129 | if (answer.pageLinks) {
130 | for (const l of answer.pageLinks) {
131 | const expanded = expander(page, l);
132 | expanded.pageNumber = l.pageNumber;
133 | pages.push(expanded);
134 | }
135 | }
136 |
137 | if (answer.nextLink) {
138 | next.push(expander(page, { id: answer.nextLink }));
139 | }
140 |
141 | if (pages.length >= 10) break;
142 | }
143 |
144 | if (pages.length > 0) {
145 | // Run it again to check for dupes, etc.
146 | const answer = await exec(
147 | 'pagination',
148 | { list: JSON.stringify(pages.slice(0, 50).map(slimmer), null, 2) });
149 |
150 | pages = [];
151 | for (const l of (answer.pageLinks || [])) {
152 | const expanded = expander(page, l);
153 | expanded.pageNumber = l.pageNumber;
154 | pages.push(expanded);
155 | }
156 | pages.sort((a, b) => (parseInt(a.pageNumber) || 99) - (parseInt(b.pageNumber) || 99));
157 | pages.unshift({ url: page.url, pageNumber: 0 });
158 | }
159 |
160 | // Disabled, since we are not using the "Next" field right now
161 |
162 | // if (next.length > 1) {
163 | // const answer = await exec(
164 | // 'paginationPickNext',
165 | // { list: JSON.stringify(next.map(slimmer), null, 2) });
166 | // console.log('pick next pagination answer', answer);
167 | // if (answer?.id) {
168 | // next = [expander(page, answer)];
169 | // } else {
170 | // next = [];
171 | // }
172 | // }
173 |
174 | const result = { pages: pages.slice(0, 20), next: next[0] };
175 | setCache('pagination', [page.url], result);
176 | return result;
177 | }
178 |
179 | export const parseLinks = async (page, question, cb, templateName) => {
180 | const roundId = await getRoundId();
181 |
182 | const links = shuffle(page.links);
183 | const limit = 6000;
184 | const chunked = chunkList(links.map(slimmer), limit);
185 |
186 | let matches = [];
187 |
188 | for (let i = 0; i < chunked.length; i++) {
189 | if (!await isActive(roundId)) break;
190 | const chunk = chunked[i];
191 |
192 | const answer = (await exec(
193 | 'gather',
194 | {
195 | question,
196 | list: JSON.stringify(chunk.map(slimmer), null, 2),
197 | })) || [];
198 |
199 | const expanded = answer.map(item => expander(page, item));
200 | if (!await isActive(roundId)) return [];
201 | matches = dedupeLinks(matches.concat(expanded));
202 | if (cb) cb(cleanLinks(matches), i / chunked.length);
203 |
204 | await setStatus(`Crawl stage working, ${i+1}/${chunked.length} chunks`);
205 | }
206 |
207 | return dedupeLinks(matches);
208 | }
209 |
--------------------------------------------------------------------------------
/src/lib/gen.mjs:
--------------------------------------------------------------------------------
1 | import { exec, query, stream } from './ai.mjs';
2 | import { sleep } from './util.mjs';
3 | import { setStatus, nextId } from './store.mjs';
4 | import { genJobTemplate } from './templates.mjs';
5 | import { sendNextIdMessage } from './job.mjs';
6 | import { getAvailableModels } from './ai.mjs';
7 | import { findPagination } from './gather.mjs';
8 |
9 | const domainRules = {
10 | 'www.google.com': [
11 | `Determine if the user is interested in OFFSITE links, e.g. search results. If yes, the "itemDescription" and "gatherPrompt" should IGNORE links containing www.google.com in them.`,
12 | ]
13 | };
14 |
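// Ask the AI to draft a scrape job from the user's prompt and the page
// content, picking single-page or multi-page (crawl) settings.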
15 | export const genJob = async (scrapePrompt, url, page) => {
16 | const text = page?.text || '';
17 | const html = page?.html || '';
18 |
19 | console.log('gen job got url:', url);
20 |
21 | const hostname = (new URL(url)).hostname;
22 | let extraRules = domainRules[hostname];
23 | if (extraRules) {
24 | extraRules = `Follow these IMPORTANT instructions SPECIFIC to ${hostname}:\n${extraRules}`;
25 | }
26 |
27 | const available = await getAvailableModels();
28 | console.log('available models for gen job:', available);
29 | const modelOverride = available.includes('gpt-4o') ? 'gpt-4o' : null;
30 | console.log('using modelOverride for gen job:', modelOverride);
31 |
32 | const answer = await exec(
33 | 'genJob2',
34 | {
35 | url,
36 | prompt: scrapePrompt || '(not given, guess based on the page content)',
37 | text: text.substr(0, 30000),
38 | html: html.substr(0, 6000),
39 | extraRules,
40 | },
41 | null,
42 | modelOverride);
43 |
44 | console.log('GEN JOB 2 GAVE', answer);
45 |
46 | if (!answer) {
47 | throw new Error('No answer for generated job');
48 | }
49 |
50 | const job = {
51 | id: await sendNextIdMessage(),
52 | name: (new URL(url)).hostname + ' - ' + (answer?.itemDescription || ''),
53 | urls: {
54 | manualUrls: url,
55 | url: url, // TODO: remove this field
56 | currentUrl: url,
57 | },
58 | scrape: {
59 | action: 'scrape',
60 | questions: (answer?.detailFields || ['Error: try again']),
61 | },
62 | };
63 |
64 | if (answer?.scrapeType === "singlePage") {
65 | job.urls.action = "current";
66 | job.urls.question = answer.itemDescription;
67 | job.urls.perPage = answer.perPage || "multiple";
68 | job.scrape.concurrency = -1;
69 | } else if (answer?.scrapeType === "multiPage") {
70 | job.urls.action = "gather";
71 | job.urls.question = answer.itemDescription + ": " + answer.gatherPrompt;
72 | job.urls.perPage = "single";
73 | job.scrape.concurrency = 3;
74 | }
75 |
76 | return job;
77 | }
78 |
79 | export const genBlankJob = async () => {
80 | return {
81 | id: await sendNextIdMessage(),
82 | name: 'Untitled Scrape',
83 | urls: {
84 | action: 'gather',
85 | url: '',
86 | list: [],
87 | question: '',
88 | },
89 | scrape: {
90 | action: 'scrape',
91 | questions: [],
92 | },
93 | };
94 | }
95 |
96 | export const genJobFromUrls = async (prompt, urls) => {
97 | const unique = [];
98 | const seen = {};
99 | const validUrls = [];
100 | for (const url of urls) {
101 | if (seen[url]) continue;
102 | try { new URL(url) } catch(e) { continue }
103 | unique.push(url);
104 | seen[url] = true;
105 | }
106 | const urlsString = unique.join('\n') + '\n';
107 |
108 | const job = await genJob(
109 | prompt,
110 | unique[0],
111 | {
112 | text: `not available, guess context based on these urls: ${urlsString}`,
113 | html: 'not available',
114 | });
115 |
116 | job.urls.action = 'manual';
117 | job.urls.manualUrls = urlsString;
118 | job.scrape.concurrency = 3;
119 |
120 | return job;
121 | }
122 |
--------------------------------------------------------------------------------
/src/lib/job.mjs:
--------------------------------------------------------------------------------
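// Thin wrappers that hand work to the background service worker via
// chrome.runtime messages.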
1 | export const runJob = async (job) => {
2 | const [current] = await chrome.tabs.query({ active: true, currentWindow: true });
3 | const tabId = current.url == job.urls?.url ? current.id : null;
4 | return chrome.runtime.sendMessage({ action: 'runJob', job, tabId });
5 | }
6 |
7 | export const runGather = async (job) => {
8 | const [current] = await chrome.tabs.query({ active: true, currentWindow: true });
9 | const tabId = current.url == job.urls?.url ? current.id : null;
10 | return chrome.runtime.sendMessage({ action: 'runGather', job, tabId });
11 | };
12 |
13 | export const runScrape = async (job, urls) => {
14 | return chrome.runtime.sendMessage({ action: 'runScrape', job, urls });
15 | }
16 |
17 | export const sendStopMessage = async () => {
18 | return chrome.runtime.sendMessage({ action: 'stop' });
19 | }
20 |
21 | // TODO: use this pattern for most/all future functions in this file
22 | // TODO: rename this file to something reflecting this new pattern
23 | export const sendNextIdMessage = async () => {
24 | return new Promise(ok => chrome.runtime.sendMessage({ action: 'nextId' }, ok));
25 | }
26 |
--------------------------------------------------------------------------------
/src/lib/navigation.mjs:
--------------------------------------------------------------------------------
1 | import { setStatus, setKey } from './store.mjs';
2 | import { getRoundId, isActive, addListener, removeListener } from './controller.mjs';
3 | import { sleep } from './util.mjs';
4 | import { getTabUrl, closeTabIfExists } from './browser.mjs';
5 | import { apiHost } from './constants.mjs';
6 | import { setGlobalError } from './errors.mjs';
7 |
8 | const loadSleepTimes = {};
9 |
10 | const maxPageAttempts = 2;
11 | const maxTabAttempts = 3;
12 |
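// Load a URL and return { url, html, text, links }. PDFs are extracted via the
// backend PDF API; everything else loads in a tab, with retries.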
13 | export const getPageData = async (url, options) => {
14 | console.log('get page data got options (sleep)', options);
15 |
16 | const isPdf = await checkIfPdf(url);
17 | if (isPdf) {
18 | // TODO: handle large PDFs. Vercel caps body size at 1MB or 4.5MB
19 | const pdfResp = await fetch(url);
20 | const buf = await pdfResp.arrayBuffer();
21 | const base64 = btoa(
22 | new Uint8Array(buf)
23 | .reduce((data, byte) => data + String.fromCharCode(byte), '')
24 | );
25 | const body = JSON.stringify({ base64 });
26 | const resp = await fetch(apiHost + '/api/pdf', { method: 'POST', body });
27 | return {
28 | url,
29 | html: '',
30 | text: await resp.text(),
31 | links: [],
32 | };
33 | }
34 |
35 | const roundId = await getRoundId();
36 |
37 | let result;
38 | for (let i = 0; i < maxPageAttempts; i++) {
39 | if (!await isActive(roundId)) return;
40 |
41 | result = await getPageDataIteration(url, options);
42 | if (!result || result.error) {
43 | console.error(`Got page data error ${url} (${i}/${maxPageAttempts}):`, result);
44 | await sleep(2000);
45 | continue;
46 | } else {
47 | return result;
48 | }
49 | }
50 | return result?.error ? result : { error: 'Could not get page data for ' + url };
51 | }
52 |
53 | const getPageDataIteration = async (url, options) => {
54 | const { active, onCreate, sleepTime } = options || {};
55 | const tabWithUrl = await getTabWithUrl(url);
56 |
57 | if (tabWithUrl) {
58 | return getTabData(tabWithUrl.id, { shouldClose: false, sleepTime });
59 | }
60 |
61 | let tab;
62 | if (active) {
63 | tab = await chrome.tabs.create({ url, active: true });
64 |
65 | // if (activeTab) {
66 | // tab = await chrome.tabs.update(activeTab.id, { url });
67 | // } else {
68 | // tab = await chrome.tabs.create({ url, active: true });
69 | // }
70 | } else {
71 | tab = await chrome.tabs.create({ url, active: false });
72 | }
73 |
74 | if (onCreate) onCreate(tab);
75 |
76 | let handleStop;
77 | let errorHandleStop;
78 | let error;
79 |
80 | const errorLoad = new Promise((ok, bad) => {
81 | // addListener returns undefined, so keep a reference to the callback for removal
82 | const listener = (details) => {
83 | if (details.tabId === tab.id && details.frameType == 'outermost_frame') {
84 | error = details;
85 | ok('error');
86 | }
87 | };
88 | chrome.webNavigation.onErrorOccurred.addListener(listener);
89 |
90 | errorHandleStop = () => {
91 | chrome.webNavigation.onErrorOccurred.removeListener(listener);
92 | if (!active) closeTabIfExists(tab.id);
93 | }
94 | addListener(errorHandleStop);
95 | });
96 |
97 | const pageLoad = new Promise((ok, bad) => {
98 | const listener = (tabId, info) => {
99 | if (tabId == tab.id && info.status == 'complete') {
100 | chrome.tabs.onUpdated.removeListener(listener);
101 | ok('ok');
102 | }
103 | };
104 | chrome.tabs.onUpdated.addListener(listener);
105 | handleStop = () => {
106 | chrome.tabs.onUpdated.removeListener(listener);
107 | if (!active) closeTabIfExists(tab.id);
108 | }
109 | addListener(handleStop);
110 | });
111 |
112 | const outcome = await Promise.any([pageLoad, errorLoad]);
113 | setStatus('Loaded (' + outcome + ') ' + url);
114 | removeListener(handleStop);
115 |
116 | let results;
117 | if (!error) {
118 | results = await getTabData(tab.id, { shouldClose: true, sleepTime });
119 | if (!results) {
120 | error = 'No tab results';
121 | }
122 | }
123 |
124 | if (error) {
125 | if (!active) closeTabIfExists(tab.id);
126 | return { error };
127 | }
128 |
129 | return results;
130 | }
131 |
132 | export const getTabData = async (tabId, options) => {
133 | const roundId = await getRoundId();
134 |
135 | const { shouldClose, sleepTime } = options || {};
136 |
137 | console.log('get tab data got options (sleep)', options);
138 |
139 | if (!tabId) {
140 | tabId = (await getActiveTab()).id;
141 | }
142 | let url = await getTabUrl(tabId);
143 |
144 | const handleStop = () => {
145 | if (shouldClose) closeTabIfExists(tabId);
146 | }
147 | addListener(handleStop);
148 |
149 | let error;
150 | let results;
151 | // Retry a few times, mainly for redirects
152 | for (let i = 0; i < maxTabAttempts; i++) {
153 | if (!await isActive(roundId)) return;
154 |
155 | // get the html + text
156 | console.log('=> Inject:', tabId, i);
157 | url = await getTabUrl(tabId);
158 | if (!url) {
159 | console.warn(`No URL found when trying to get tab data for ${tabId}`);
160 | }
161 |
162 | if ((url || '').indexOf('https://chromewebstore.google.com/') != -1) {
163 | error = 'Due to Google policy, cannot scrape Chrome Extension Store';
164 | break;
165 | }
166 |
167 | console.log('Got sleep time:', sleepTime);
168 | let args;
169 | if (sleepTime && !isNaN(Number(sleepTime))) {
170 | console.log('Using given sleep time:', sleepTime);
171 | args = [Number(sleepTime), false];
172 | } else {
173 | console.log('Auto suggesting sleep time');
174 | args = suggestSleep(url);
175 | }
176 | console.log('sleep args', tabId, args);
177 |
178 | const frames = await new Promise((ok) => {
179 | chrome.webNavigation.getAllFrames(
180 | { tabId },
181 | (frames) => {
182 | ok(frames);
183 | });
184 | });
185 |
186 | for (const frame of (frames || [])) {
187 | console.log('- Frame:', tabId, frame.url, frame);
188 | }
189 |
190 | try {
191 | results = await chrome.scripting.executeScript({
192 | target: { tabId },
193 | injectImmediately: true,
194 | args,
195 | func: injectFunction,
196 | });
197 | } catch (e) {
198 | console.error(`Got error during injection for ${url} ${tabId}: ${e}, results: ${results}`);
199 | }
200 |
201 | console.log('Results from navigation are:', tabId, results);
202 | if (results && results[0].result) break;
203 |
204 | console.error(`Got no results, sleep and try again (${i}/${maxTabAttempts}): ${url} ${tabId}`);
205 |
206 | await sleep(1000);
207 | }
208 |
209 | removeListener(handleStop);
210 |
211 | if (shouldClose) closeTabIfExists(tabId);
212 |
213 | if (!results || error) {
214 | console.error(`Giving up for ${url}, return error`);
215 | return { error: error || `Could not get tab data for ${url}` };
216 | }
217 |
218 | console.log('Getting result from', results);
219 |
220 | const result = results[0].result;
221 | console.log('Success', result);
222 |
223 | if (result.redir) {
224 | console.log('Handle redir', result.redir);
225 | return getPageData(result.redir, options);
226 | } else {
227 | return result;
228 | }
229 | }
230 |
231 | export const getActiveTab = async () => {
232 | return new Promise((ok) => {
233 | chrome.tabs.query(
234 | { active: true, currentWindow: true },
235 | (tabs) => ok(tabs[0] ? tabs[0] : null));
236 | });
237 | }
238 |
239 | export const getTabWithUrl = async (url) => {
240 | let u = new URL(url);
241 | // Query without hash
242 | const noHash = url.replace(u.hash, '');
243 | return new Promise((ok) => {
244 | chrome.tabs.query(
245 | { url: noHash },
246 | (tabs) => {
247 | console.log('got tabs after query', url, tabs);
248 | // Check for hash match
249 | for (let tab of (tabs || [])) {
250 | if (tab.url == url) { ok(tab); return; }
251 | }
252 | ok(null);
253 | });
254 | });
255 | }
256 |
257 | export const reportSleep = async (url, msec) => {
258 | const hostname = (new URL(url)).hostname;
259 | if (!loadSleepTimes[hostname]) {
260 | loadSleepTimes[hostname] = {
261 | times: [],
262 | };
263 | }
264 | const t = loadSleepTimes[hostname].times;
265 | t.unshift(msec);
266 | loadSleepTimes[hostname].times = t.slice(0, 10);
267 | console.log('nav loadSleepTimes', hostname, loadSleepTimes[hostname].times);
268 |
269 | setKey('loadSleepTimes', loadSleepTimes);
270 | }
271 |
272 | export const suggestSleep = (url) => {
273 | if (!url) {
274 | // No URL: No suggested sleep time, and don't check for loads
275 | return [null, false];
276 | }
277 |
278 | const hostname = (new URL(url)).hostname;
279 | const data = loadSleepTimes[hostname];
280 | if (!data || data.times.length < 2) {
281 | return [null, true];
282 | }
283 | const suggested = Math.min(
284 | 15*1000, // Hard cap 15 seconds sleep
285 | Math.max(...(data.times)) * 1.1);
286 |
287 | // Check it less as time goes on, min 5% of the time
288 | const shouldCheckLoad = Math.random() < Math.max(
289 | 0.05,
290 | 0.80 - data.times.length / 20);
291 |
292 | return [suggested, shouldCheckLoad];
293 | }
294 |
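// Runs inside the page via chrome.scripting.executeScript: waits for content
// (optionally polling the background's 'checkLoading' action), then extracts
// text, slimmed HTML, and links.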
295 | const injectFunction = async (sleepTime, shouldCheckLoad) => {
296 | const defaultSleep = shouldCheckLoad ? 500 : (sleepTime || 1500);
297 | const dynamicSleep = 2000;
298 |
299 | // Max 20 seconds per page, then give up with a timeout error
300 | // TODO: test/fix this
301 | const x = await Promise.any([
302 | new Promise((ok) =>
303 | setTimeout(() => {
304 | console.error(`Injection timeout ${window.location.href}`);
305 | ok({ error: 'timeout' });
306 | }, 20*1000)),
307 |
308 | new Promise(async (ok) => {
309 | const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
310 |
311 | const start = (new Date()).getTime();
312 |
313 | // Sleep a bit for dynamic content
314 | await sleep(defaultSleep);
315 |
316 | // via https://chatgpt.com/share/ef8bcaec-6fb1-478b-a074-1ae22c908ae2
317 | const getText = (node) => {
318 | let t = '';
319 | if (node.nodeType === Node.TEXT_NODE) {
320 | t += ' ' + node.textContent.trim();
321 | } else if (node.nodeType === Node.ELEMENT_NODE) {
322 | if (!['script', 'style'].includes(node.nodeName.toLowerCase())) {
323 | if (node.shadowRoot) {
324 | t += ' ' + getText(node.shadowRoot);
325 | }
326 | node.childNodes.forEach(child => {
327 | t += ' ' + getText(child);
328 | });
329 | }
330 | }
331 | return t;
332 | }
333 |
334 | // Via https://chatgpt.com/share/e9a142ab-775d-4f1d-8a84-69f829ffc45c
335 | const getHtml = (node) => {
336 | let clone = node.cloneNode(true);
337 |
338 | const removeTags = ['style', 'path', 'svg'];
339 | // Remove LinkedIn junk
340 | // TODO: more resilient solution
341 | if (url.indexOf('https://www.linkedin.com') != -1) {
342 | removeTags.push('code');
343 | }
344 |
345 | for (const tagName of removeTags) {
346 | clone
347 | .querySelectorAll(tagName)
348 | .forEach(el => el.remove());
349 | }
350 |
351 | const removeIfLargeAttributes = [
352 | ['img', 'src', 1000],
353 | ['*', 'class', 100],
354 | ];
355 | for (const [tagName, attr, cutoff] of removeIfLargeAttributes) {
356 | clone
357 | .querySelectorAll(tagName)
358 | .forEach(el => {
359 | const val = (el.getAttribute(attr) || '')
360 | if (val.length > cutoff) {
361 | console.log('remove!!', tagName, attr, cutoff, val.length);
362 | el.setAttribute(attr, '');
363 | }
364 | });
365 | }
366 |
367 | // Remove hidden elements, LinkedIn puts in a bunch of these
368 | const els = clone.querySelectorAll('*');
369 | els.forEach((el) => {
370 | const style = window.getComputedStyle(el);
371 | if (style.display == 'none') el.remove();
372 | });
373 |
374 | return clone.outerHTML;
375 | }
376 |
377 | const url = window.location.href;
378 | let text = getText(document.body) || '';
379 | let html = getHtml(document.body) || '';
380 |
381 | const maxDynamicWaits = 1;
382 | let i;
383 | for (i = 0; shouldCheckLoad && i < maxDynamicWaits; i++) {
384 | // Check if its loaded
385 | console.log('== check if loaded ==', { text, html });
386 | const resp = await new Promise((ok) => {
387 | chrome.runtime.sendMessage(
388 | {
389 | action: 'checkLoading',
390 | text,
391 | html,
392 | },
393 | (resp) => {
394 | console.log('checkloading said:', resp);
395 | ok(resp);
396 | });
397 | });
398 |
399 | if (resp?.answer?.status == 'done' || resp?.status == 'error') {
400 | console.log('== checkLoading done! break ==');
401 |
402 | if (i > 0) {
403 | chrome.runtime.sendMessage({
404 | action: 'setStatus',
405 | message: 'Loaded dynamic content on ' + url,
406 | });
407 | }
408 | break;
409 | }
410 |
411 | // Page maybe not loaded... let's wait and try again
412 | chrome.runtime.sendMessage({
413 | action: 'setStatus',
414 | message: 'Waiting for dynamic content on ' + url,
415 | });
416 | console.log('== checkLoading waiting ==');
417 | await sleep(dynamicSleep);
418 |
419 | if (i + 1 == maxDynamicWaits) {
420 | chrome.runtime.sendMessage({
421 | action: 'setStatus',
422 | message: 'Stop waiting for dynamic content on ' + url,
423 | });
424 | }
425 |
426 | text = getText(document.body) || '';
427 | html = getHtml(document.body) || '';
428 | }
429 |
430 | const took = (new Date()).getTime() - start;
431 |
432 | if (shouldCheckLoad) {
433 | chrome.runtime.sendMessage({
434 | action: 'reportSleep',
435 | url,
436 | msec: took,
437 | });
438 | }
439 |
440 | console.log('check for redir', text);
441 |
442 | // Special case Archive.org redirects
443 | if (url.indexOf('https://web.archive.org') == 0 &&
444 | text.match(/Got an HTTP 30[0-9] response at crawl time/)) {
445 |
446 | console.log('archive org redir, find url');
447 | const m = html.match(/ {
460 | const tags = document.querySelectorAll('a');
461 | return Array.from(tags)
462 | .filter(a => a.href)
463 | .map(a => ({
464 | id: id++,
465 | html: a.outerHTML.substr(0, 1000),
466 | text: a.innerText.substr(0, 200),
467 | url: a.href,
468 | }));
469 | }
470 | const links = getLinks(document);
471 |
472 | const fetchWithTimeout = async (urls) => {
473 | const timeout = new Promise((ok) => setTimeout(() => ok('timeout'), 2000));
474 | const fetches = urls.map(url => fetch(url));
475 | const result = await Promise.race([
476 | timeout,
477 | Promise.allSettled(fetches),
478 | ]);
479 |
480 | let settled;
481 | if (result == 'timeout') {
482 | const isSettled = (p) => Promise
483 | .race([p, Promise.resolve('pending')])
484 | .then(r => r == 'pending' ? 'pending' : 'fulfilled', () => 'rejected');
485 | const statuses = await Promise.all(fetches.map(isSettled));
486 | // Mirror the { status, value } shape that Promise.allSettled() returns
487 | settled = await Promise.all(fetches.map(async (val, index) => {
488 | if (statuses[index] == 'fulfilled') {
489 | return { status: 'fulfilled', value: await val };
490 | } else {
491 | return { status: 'pending', value: null };
492 | }
493 | }));
494 | } else {
495 | settled = result;
496 | }
497 | return settled.map(x => x.value);
498 | }
499 |
500 | const fetchTitles = async (urls) => {
501 | const fetches = await fetchWithTimeout(urls);
502 | const texts = await Promise.all(fetches.map(async (resp) => {
503 | if (!resp) return '[no response]';
504 | return await resp.text();
505 | }));
506 | return await Promise.all(texts.map((text) => {
507 | const node = document.createElement('div');
508 | node.innerHTML = text;
509 | const title = node.querySelector('title');
510 | node.remove();
511 | return title ? title.innerText : '[no title]';
512 | }));
513 | }
514 |
515 | const iframes = document.querySelectorAll('iframe');
516 | const iframeLinks = Array.from(iframes)
517 | .filter(iframe => iframe.src)
518 | .map(iframe => ({
519 | html: '',
520 | text: '',
521 | url: iframe.src,
522 | iframe: true,
523 | }));
524 |
525 | const iframeTitles = await fetchTitles(iframeLinks.map(l => l.url));
526 |
527 | for (let i = 0; i < iframeTitles.length; i++) {
528 | const title = iframeTitles[i];
529 | const url = iframeLinks[i].url;
530 | iframeLinks[i].html = ` ${title} `;
531 | iframeLinks[i].text = title;
532 | }
533 |
534 | console.log('iframeLinks', iframeLinks);
535 | links.push(...iframeLinks);
536 |
537 | ok({ url, text, html, links });
538 | })
539 | ]);
540 |
541 | console.log('inject response gave:', x);
542 |
543 | return x;
544 | }
545 |
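// A minimal usage sketch (assumed wiring; the actual caller lives elsewhere in
// this file): the background script would inject this function into a tab via
// chrome.scripting.executeScript and read back the page payload:
//
//   const results = await chrome.scripting.executeScript({
//     target: { tabId },
//     func: injectFunction,
//     args: [sleepTime, shouldCheckLoad],
//   });
//   const { text, html, links } = results[0].result;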
546 | const checkIfPdf = async (url) => {
547 | if (url.indexOf(apiHost) == -1 && url.toLowerCase().endsWith(".pdf")) {
548 | return true;
549 | }
550 |
551 | const resp = await fetch(url, { method: 'HEAD' });
552 | const type = resp.headers.get('Content-Type');
553 | if (('' + type).toLowerCase().includes('application/pdf')) {
554 | return true;
555 | }
556 |
557 | return false;
558 | }
559 |
--------------------------------------------------------------------------------
/src/lib/report.mjs:
--------------------------------------------------------------------------------
1 | import { apiHost } from './constants.mjs';
2 | import { getActiveJob } from './store.mjs';
3 |
4 | export const sendReport = async (logs) => {
5 | const maxBytes = 900000;
6 | const l = logs.length;
7 | if (l > maxBytes) {
8 | logs = logs.substr(l - maxBytes);
9 | }
10 |
11 | const job = Object.assign({}, (await getActiveJob()));
12 | if (job.results?.targets) {
13 | // Don't need too many of these
14 | job.results.targets = job.results.targets.slice(0, 50);
15 | }
16 | const url = apiHost + '/api/report';
17 | const report = {
18 | manifest: chrome.runtime.getManifest(),
19 | job,
20 | logs,
21 | }
22 | console.log('sending report:', logs.substr(0, 100), report);
23 | let body = JSON.stringify({ report });
24 |
25 | // Stay under Vercel cap
26 | // https://vercel.com/docs/errors/FUNCTION_PAYLOAD_TOO_LARGE
27 | if (body.length > 4000000) {
28 | body = body.substr(0, 4000000);
29 | }
30 |
31 | const resp = await fetch(url, { method: 'POST', body });
32 | return resp.json();
33 | }
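// Usage sketch: the background page calls this with its buffered console
// output, e.g. sendReport(messages.join('\n')), and awaits the JSON ack.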
34 |
--------------------------------------------------------------------------------
/src/lib/scrape.mjs:
--------------------------------------------------------------------------------
1 | import { exec } from './ai.mjs';
2 | import { sleep } from './util.mjs';
3 | import { setStatus } from './store.mjs';
4 | import { getRoundId, isActive } from './controller.mjs';
5 | import { scrapeTemplate } from './templates.mjs';
6 |
7 | export const scrapePage = async (
8 | page,
9 | questions,
10 | perPage,
11 | itemDescription,
12 | extraRules,
13 | cb) =>
14 | {
15 |
16 | const roundId = await getRoundId();
17 | if (!await isActive(roundId)) return;
18 |
19 | const translateToHeaders = (questions) => {
20 | const result = {};
21 | let i = 0;
22 | for (let q of questions) {
23 | if (q == '') continue;
24 | result[q] = '';
25 | }
26 | return result;
27 | };
28 |
29 |
30 | const clip = 60000;
31 | const len = page.text.length + page.html.length;
32 | const percentHtml = page.html.length / len;
33 | const textChunkSize = Math.floor(clip * (1 - percentHtml));
34 | const htmlChunkSize = Math.floor(clip * percentHtml);
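// e.g. text=30,000 chars and html=90,000 chars gives len=120,000 and
// percentHtml=0.75, so each pass reads floor(60000 * 0.25) = 15,000 text
// chars and floor(60000 * 0.75) = 45,000 html chars.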
35 |
36 | console.log('clip page:', page);
37 | console.log('clip should we clip?', len, clip);
38 | console.log('clip len text', page.text.length);
39 | console.log('clip len html', page.html.length);
40 |
41 | let expectedItemCount;
42 |
43 | const scrapeInner = async (offset, existing, cb) => {
44 | const text = page.text.slice(
45 | offset * textChunkSize,
46 | (offset + 1) * textChunkSize);
47 |
48 | const html = page.html.slice(
49 | offset * htmlChunkSize,
50 | (offset + 1) * htmlChunkSize);
51 |
52 | console.log('building prompt using perPage', perPage);
53 |
54 | let perPageCopy;
55 | if (perPage == 'single') {
56 | perPageCopy = 'You should look for a SINGLE item on this page, expect itemCount == 1';
57 | } else if (perPage == 'multiple') {
58 | perPageCopy = 'You should look for MULTIPLE items on this page, expect itemCount > 1. Be sure to FIND ALL THE ITEMS'
59 | } else {
60 | perPageCopy = 'The user wants you to GUESS how many items are on this page, itemCount may be 1 or more than 1';
61 | }
62 |
63 | const context = {
64 | url: page.url,
65 | questions: JSON.stringify(translateToHeaders(questions), null, 2),
66 | itemDescription,
67 | perPageCopy,
68 | text,
69 | html,
70 | extraRules,
71 | count: '',
72 | };
73 |
74 | if (itemDescription) {
75 | context.itemDescription = (
76 | 'You are looking for this type of item(s):\n\n' +
77 | itemDescription);
78 | }
79 |
80 | console.log('manual scrape sending context', context);
81 |
82 | let prevLength = 1; // set it to 1 to ignore itemCount row
83 | const a = await exec(
84 | 'scrape',
85 | context,
86 | async (partial) => {
87 | if (partial?.length && partial[0].itemCount) {
88 | expectedItemCount = partial[0].itemCount;
89 | }
90 |
91 | if (cb && partial && partial.length > prevLength) {
92 | if (!await isActive(roundId)) return;
93 | const items = existing.concat(partial.slice(1));
94 |
95 | let percent = (offset/numChunks) + (1/numChunks) * (items.length / expectedItemCount);
96 | // Slow down percent above cap in case AI mis-estimated
97 | const cap = 0.7;
98 | if (percent > cap) {
99 | percent = cap + ((percent - cap) * 0.5);
100 | }
101 |
102 | cb({ items, percent });
103 | console.log(`clip partial ${items.length} of expected ${expectedItemCount}`);
104 | prevLength = partial.length;
105 | }
106 | });
107 |
108 | const ensureArray = (x) => {
109 | if (!x) return [];
110 | if (!Array.isArray(x)) return [x];
111 | return x;
112 | }
113 |
114 | console.log('Scrape answer:', a);
115 |
116 | const localAnswer = ensureArray(a)
117 | .filter(i => i.itemCount == undefined ||
118 | Object.keys(i).length > 1);
119 |
120 | let single = false;
121 | if (localAnswer.length == 1) {
122 | single = true;
123 | for (const key of Object.keys(localAnswer[0])) {
124 | if (!localAnswer[0][key]) single = false;
125 | }
126 | }
127 |
128 | return {
129 | answer: localAnswer,
130 | single,
131 | more: (page.text.length > (offset + 1) * textChunkSize ||
132 | page.html.length > (offset + 1) * htmlChunkSize),
133 | };
134 | }
135 |
136 | let answer = [];
137 | let offset = 0;
138 |
139 | const max = perPage == 'single' ? 3 : 20;
140 | const numTextChunks = page.text.length / textChunkSize;
141 | const numHtmlChunks = page.html.length / htmlChunkSize;
142 | const numChunks = Math.ceil(Math.min(max, Math.max(numTextChunks, numHtmlChunks)));
143 | for (let i = 0; i < max; i++) {
144 | console.log(`clip iteration ==> ${offset}/${numChunks}`);
145 | const result = await scrapeInner(offset++, answer, cb);
146 |
147 | console.log('clip iteration result gave:', result);
148 | console.log('clip scrape inner gave:', result.answer);
149 | console.log('clip is there more?', result.more);
150 |
151 | if (!await isActive(roundId)) return answer;
152 | answer = answer.concat(result.answer);
153 |
154 | console.log('clip combined answer:', answer);
155 |
156 | if (!result.more) break;
157 | if (result.single) break;
158 | }
159 |
160 | setStatus('Result: ' + JSON.stringify(
161 | Object.values(answer[0] || {})
162 | ));
163 | return answer;
164 | }
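// Usage sketch, mirroring how the background page drives it (argument values
// here are illustrative):
//
//   const answer = await scrapePage(
//     page,                       // { url, text, html } from the page fetch
//     ['What is the title?'],     // questions
//     'multiple',                 // perPage: 'single', 'multiple', or a guess
//     '',                         // itemDescription
//     '',                         // extraRules
//     ({ items, percent }) => console.log(items.length, percent));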
165 |
--------------------------------------------------------------------------------
/src/lib/share.mjs:
--------------------------------------------------------------------------------
1 | import { apiHost } from './constants.mjs';
2 |
3 | export const shareResults = async (job) => {
4 | console.log('share results', job);
5 | const url = apiHost + '/api/share';
6 | const body = JSON.stringify({ job });
7 | const resp = await fetch(url, { method: 'POST', body });
8 | return resp.json();
9 | }
10 |
--------------------------------------------------------------------------------
/src/lib/templates.mjs:
--------------------------------------------------------------------------------
1 | export const getTemplate = (name) => {
2 | return {
3 | genJob: genJobTemplate,
4 | genJob2: genJob2Template,
5 |
6 | pagination: paginationTemplate,
7 | paginationPickNext: paginationPickNextTemplate,
8 |
9 | gather: gatherTemplate,
10 | scrape: scrapeTemplate,
11 | name: nameTemplate,
12 | checkLoading: checkLoadingTemplate,
13 | }[name];
14 | }
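// The templates below are plain strings with {{placeholder}} slots;
// exec() in ai.mjs (not shown here) presumably looks them up via
// getTemplate(name) and substitutes values from its context argument.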
15 |
16 | export const genJobTemplate = `You are part of a web scraping program, and your job is to take a "master prompt" from the user, and generate the JSON job definition for the web scrape.
17 |
18 | The web scrape job you are generating starts at a target page which the user provides, and it navigates 1 link away from that target page to detail pages. On the detail pages, the job extracts 1 item per detail page.
19 |
20 | For the job definition, you will need to output the following fields:
21 |
22 | - itemSummary: This tells the job what items we are looking for. It is based on the master prompt. These items may be present on the starting page, or they may be present on pages linked from the starting page. Be specific.
23 |
24 | - gatherPrompt: This is based on itemSummary, and it gives a little more detail. It describes how to find items on a page, and what to ignore. Exclusions are important to clear up confusion.
25 |
26 | - detailFields: This is a list of detail prompts for data extraction for each item. It is also based on the master prompt. Each detail prompt corresponds to an output field on the item.
27 |
28 | Here is an example input and output:
29 |
30 | Example input: Scrape https://www.nytimes.com for articles. Find author, title, date, key people, and 2-5 word summary
31 |
32 | Example output 1:
33 | {
34 | itemSummary: "News articles",
35 | gatherPrompt: "Find news articles. Only find news articles. Only include links that match specific articles, not general pages. Avoid advertisements and general category links.",
36 | detailFields: [
37 | "Who is the author of this article?",
38 | "What is the title of this article?",
39 | "What is the date of this article? Format: YYYY-MM-DD",
40 | "List the key people in this article",
41 | "Give a 2-5 word summary of this article."
42 | ]
43 | }
44 |
45 | Example output 2:
46 | {
47 | itemSummary: "Biographies",
48 | gatherPrompt: "Find links to pages about individual people. Only pages that match specific people, not general pages.",
49 | detailFields: [
50 | "Who is this page about? Give the name",
51 | "Where was this person born?",
52 | "When was this person born? Format: YYYY-MM-DD",
53 | "List a 5-10 word summary of their key accomplishments"
54 | ]
55 | }
56 |
57 | Example output 3:
58 | {
59 | itemSummary: "Software engineers",
60 | gatherPrompt: "Find links to employee profiles, for hiring software engineers. Ignore site navigation links.",
61 | detailFields: [
62 | "Name of the person",
63 | "Most recent job experience, format: Company Name, Title, Start Date-End Date",
64 | "Key skills, including programming languages. Exclude fluff.",
65 | "University and degree, or N/A if none"
66 | ]
67 | }
68 |
69 | Follow this guidance:
70 |
71 | - If the prompt is ambiguous, take your best guess at the likely fields. Output a JSON object of the results.
72 | - Important: try to figure out unique aspects of the site, and focus on those
73 | - Only output a JSON object, and make sure it is parseable.
74 | - The user can delete fields he doesn't want, so err on the side of giving too many fields, typically in the range of 2-5.
75 | - Specify the format if necessary: dates, say the date format. For numbers, specify the output should be a number. For emails, request deobfuscation. For subjective or text fields, give a reasonable word limit, usually no more than 20 words
76 | - If reasonable, tailor the fields to the distinct qualities of the target site
77 |
78 | Below is the information from the user and website. Prompt directive lines are preceded by >>>>
79 |
80 | >>>> The master prompt is below:
81 | Scrape {{url}} for {{prompt}}
82 |
83 | >>>> To help, here is the innerText from the page:
84 | {{text}}
85 |
86 | >>>> HTML text from innerHTML of the page (first {{count}} characters):
87 | {{html}}
88 | `;
89 |
90 | export const genJob2Template = `You are part of a web scraping program, and your job is to take a "master prompt" from the user, and generate the JSON job definition for the web scrape.
91 |
92 | You will receive a prompt from the user, describing what they wish to scrape.
93 |
94 | You will also receive information about the starting page of the scrape, including its URL, HTML, and text.
95 |
96 | You will return a JSON object with the following fields, in this order:
97 |
98 | - "intentAnalysis": A 10-30 word summary of the user's likely intent, given the user prompt and the page information.
99 |
100 | - "itemDescription": A 2-5 word description of the items the user is trying to scrape. Try to infer the general item based on the intent.
101 |
102 | - "detailFields": An array defining the field(s) the user is looking for, based on "intentAnalysis" and the prompt. IF THERE IS A PROMPT GIVEN, base this off the prompt.
103 | - For "detailFields", follow these guidelines:
104 | - For URLs and links, that detail field SHOULD SPECIFY absolute URL format
105 |
106 | - "dataAvailability": For each item in "detailFields", say whether it is likely present on the current page ("currentPage"), or a page LINKED from the current page ("linkedPage")
107 |
108 | - "scrapeTypeGuess": Either "singlePage" or "multiPage". Respond with "singlePage" if the current page has all the items, and all the DETAILS the user wants to scrape. Respond with "multiPage" if these items are LINKED from the current page, and the LINKED pages are needed to get ALL the details the user is looking for. This is your first guess, which you will have a chance to revise.
109 |
110 | - "scrapeTypeReason": Explain in 5-15 words why you made your guess in "scrapeTypeGuess".
111 |
112 | - "scrapeType": Your FINAL answer for the scrape type, either "singlePage" or "multiPage". Change only if you need to after thinking about it in "scrapeTypeReason"
113 |
114 | - "perPage": IF scrapeType is "singlePage", answer either "single" or "multiple". Answer "single" if there is only one item on the page to scrape. Answer "multiple" if there are multiple items on the page to scrape.
115 |
116 | - "gatherPrompt": If this is "singlePage", return "". If this is "multiPage", describe how to find the linked pages that contain all the detail fields. Exclusions are important to clear up confusion.
117 |
118 | {{extraRules}}
119 |
120 | Example output 1:
121 | {
122 | "intentAnalysis": "The user is likely looking for ratings and information about products to evaluate which one to buy",
123 | "itemDescription": "Product reviews and information",
124 | "detailFields": [
125 | "What is the name of this product?",
126 | "What is the rating of this product? Format: X.X/X",
127 | "Who made this product?",
128 | "What is the URL of this product? Format: full absolute URL",
129 | "What is the price of this product? Format: $XX.XX"
130 | ],
131 | "dataAvailability": {
132 | "What is the name of this product?": "currentPage",
133 | "What is the rating of this product? Format: X.X/X": "linkedPage",
134 | "Who made this product?": "currentPage",
135 | "What is the price of this product? Format: $XX.XX": "linkedPage"
136 | },
137 | "scrapeTypeGuess": "multiPage",
138 | "scrapeTypeReason": "The product rating and price are only available on the indivdiual pages",
139 | "scrapeType": "multiPage",
140 | "gatherPrompt": "Find links to products. Ignore category links, page navigation, and advertisements"
141 | }
142 |
143 | Example output 2:
144 | {
145 | "intentAnalysis": "The user wants to find candidates for a job based on the results from a search page",
146 | "itemDescription": "Job applicant candidates",
147 | "detailFields": [
148 | "What is the name of this person?",
149 | "What is this person's current employer?",
150 | "What is this peerson's current job title?",
151 | "What is the full URL of their profile?"
152 | ],
153 | "dataAvailability": {
154 | "What is the name of this person?": "currentPage",
155 | "What is this person's current employer?": "currentPage",
156 | "What is this peerson's current job title?": "currentPage",
157 | "What is the full URL of their profile?": "currentPage"
158 | },
159 | "scrapeTypeGuess": "singlePage",
160 | "scrapeTypeReason": "All the data is available on the current page, so I don't need to load extra pages",
161 | "scrapeType": "singlePage",
162 | "gatherPrompt": ""
163 | }
164 |
165 | Page URL: {{url}}
166 |
167 | Page HTML: {{html}}
168 |
169 | Page text: {{text}}
170 |
171 | User prompt: {{prompt}}
172 |
173 | You MUST respond with ONLY the JSON object, no comments, no explanation. Otherwise you fail the task.
174 |
175 | `;
176 |
177 | export const gatherTemplate = `You are part of a web crawling program, and your goal is to pick out relevant links in a list. The list contains the inner text of links, and also their URLs. You will take this list, look for links that match the user prompt, and generate a new list of only the matching items.
178 |
179 | Your response will be ONLY the "id" field of matching items. The "id" field will be used to generate the results later; you only need to include the "id" field.
180 |
181 | Follow these important rules:
182 | - The entire array should be JSON array
183 | - Do not wrap the response in an array, individual dictionaries only.
184 | - Do not include any markdown formatting. Only include JSON.
185 | - Respond with [] if nothing matches the prompt.
186 | - Generally avoid links with no link text.
187 | - Find all the matches, and err on the side of overmatching, especially if the user prompt is short
188 |
189 | Follow these site specific rules:
190 | - For amazon.com, product links have the product name in the link text. For amazon.com, ONLY include products where the link text has the product name.
191 |
192 | Example of valid output:
193 | [
194 | { "id": 3 },
195 | { "id": 18 },
196 | { "id": 45 }
197 | ]
198 |
199 | The user is looking for: {{question}}
200 |
201 | The list to find this is below:
202 | {{list}}
203 | `;
204 |
205 | export const scrapeTemplate = `You are a web scraping extraction program. You will receive text and HTML content from a webpage. Your goal is to extract one or more items matching a user's prompt. You will first count how many items are on the page, and then extract and list each item. The page will either contain a single item, or multiple similar items.
206 |
207 | If you're unable to answer a question fill in the value "(not found)", but make your best guess. Prefer to give an answer if one seems plausible.
208 |
209 | Your response will be parsed by a computer program, so respond ONLY with valid JSONL. Each line must be parseable JSON.
210 |
211 | The first JSON object you return will have one field, "itemCount", indicating how many items are to come.
212 |
213 | The remaining JSON objects you return will be items. There will be one item per line. Each field in these objects corresponds to one of the questions.
214 |
215 | Follow these important rules:
216 | - Please make sure the response is valid JSONL. Only ONE JSON object per line. Remove any \n characters in questions and answers.
217 | - Use the SAME keys for each item as you find in the questions dictionary.
218 | - Do NOT fix spelling errors in the item keys. If the questions contain typos, spelling errors, or other mistakes, keep those in the item dictionary keys.
219 |
220 | {{extraRules}}
221 |
222 | Example of a valid response with multiple items:
223 | {"itemCount": 2}
224 | {"What is the author's name?": "Ernest Hemingway", "What is the book's name?": "The Old Man and the Sea"}
225 | {"What is the author's name?": "George Orwell", "What is the book's name?": "1984"}
226 |
227 | Example of a valid response with a single item:
228 | {"itemCount": 1}
229 | {"What is the article's title?": "New Find at the Great Wall of China", "What is the article's date in YYYY-MM-DD format?": "2024-02-04"}
230 |
231 | Below are the user prompts. Prompt directive lines are preceded by >>>>
232 |
233 | >>>> {{itemDescription}}
234 |
235 | >>>> {{perPageCopy}}
236 |
237 | >>>> Below are the questions for each item(s):
238 |
239 | {{questions}}
240 |
241 | >>>> The URL of the website:
242 | {{url}}
243 |
244 | >>>> Raw text from innerText of the page:
245 | {{text}}
246 |
247 | >>>> HTML text from innerHTML of the page (first {{count}} characters):
248 | {{html}}
249 | `;
250 |
251 | export const nameTemplate = `Below is the JSON definition of a web scraping job. Your task is to summarize this job in roughly 30 characters. The summary should be user friendly, intended for a human power user. It should highlight the unique aspects of the job, such as the scraping target and the type of information being extracted. Ideally your summary will be between 2 to 5 words, and never more than 40 characters.
252 |
253 | Follow these directives:
254 | - Respond with valid JSON, with the 'name' field representing your answer. Only give JSON, no explanation or markup. Your response will be parsed using JSON.parse()
255 | - Do not include words like "Scrape", "Extract", "Collect", "Monitor", etc. in the answer, since that is implied based on context. The response should be a noun-phrase.
256 | - Prepend the domain to the summary, for example if the url is https://www.cnn.com/politics, prepend "cnn.com - " to the summary you come up with. Do NOT include "www" subdomain. Include other subdomains if they are informative.
257 |
258 | Example of valid response:
259 |
260 | { name: "site.com - Article author and name" }
261 |
262 | The job definition, in JSON, is:
263 |
264 | {{job}}`;
265 |
266 | export const checkLoadingTemplate = `Below is text and HTML from a webpage. Your job is to check if the main content on the page is loaded, or if it is not yet available due to dynamic requests like ajax and other async dynamic content. The main content is information relevant to the user's questions, which are shown below.
267 |
268 | Follow these guidelines:
269 | - If the main content of the page is missing, reply "loading"
270 | - If the main content is available, reply "done"
271 | - If the main content is available but small parts are loading, reply "done"
272 | - Your response MUST be valid json, with the key "status" and either "loading" or "done" as the value
273 | - Summarize the main content of the page in 2-10 words before deciding if it is loaded
274 |
275 | Example of valid responses:
276 |
277 | { contentSummary: "user profile page with emails", status: "loading" }
278 | { contentSummary: "real estate listing page with price history", status: "done" }
279 |
280 | Below is the user's questions:
281 |
282 | {{questions}}
283 |
284 | Below is the text from the page:
285 |
286 | {{text}}
287 |
288 | Below is the HTML from the page:
289 |
290 | {{html}}
291 | `;
292 |
293 | export const paginationTemplate = `You are part of a web scraping program, and your goal is to look for pagination on a page. You will be given a list of links, and your goal is to find the ones that are related to pagination. The list contains the inner text of links, and also their URLs. You will take this list, look for pagination links, and generate a new list of only the matching items.
294 |
295 | Your response will be ONLY the "id" field of matching items, and the "pageNumber" if applicable. Do NOT include the actual URL. The "id" field will be used to generate the results later; you only need to include the "id" and "pageNumber" fields.
296 |
297 | You will return a JSON object with the following fields, in this order:
298 |
299 | - "hasPagination": Either "yes" or "no". Answer "yes" if there is pagination in the links you got, otherwise answer "no"
300 |
301 | - "nextLink": The ID of the link to go the next page, if any. Only include links that relate to the next pagination result, not other links that show more results.
302 |
303 | - "pageLinks": Array of IDs linking to specific pages. Include the ID and the pageNumber in a JSON object. Determine the pageNumber by looking at the "text", "url", and "html" fields in the link.
304 |
305 | Examples of valid output:
306 |
307 | {
308 | "hasPagination": "yes",
309 | "nextLink": 11,
310 | "pageLinks": [
311 | { "id": 15, "pageNumber": 5 },
312 | { "id": 16, "pageNumber": 17 },
313 | { "id": 17, "pageNumber": 18 },
314 | { "id": 19, "pageNumber": 20 }
315 | ]
316 | }
317 |
318 | {
319 | "hasPagination": "no"
320 | }
321 |
322 | The list of links is below:
323 | {{list}}
324 | `;
325 |
326 | export const paginationPickNextTemplate = `You are part of a web scraping program, and your goal is to find the "Next" page pagination link out of a list. You will receive multiple candidate links, and your goal is to pick the one MOST LIKELY to be the "Next" page pagination link.
327 |
328 | Your response will be ONLY the "id" field of link, as a JSON object.
329 |
330 | Example 1 of valid output:
331 |
332 | { "id": 3 }
333 |
334 | Example 2 of valid output:
335 |
336 | { "id": 75 }
337 |
338 | The list of candidates is below:
339 | {{list}}
340 | `;
341 |
--------------------------------------------------------------------------------
/src/lib/util.mjs:
--------------------------------------------------------------------------------
1 | import JSON5 from 'json5'
2 |
3 | export const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms))
4 |
5 | export const shuffle = (a) => {
6 | for (let i = a.length - 1; i > 0; i--) {
7 | const j = Math.floor(Math.random() * (i + 1));
8 | [a[i], a[j]] = [a[j], a[i]];
9 | }
10 | return a;
11 | }
12 |
13 | export const formatNumber = (number, abbrev) => {
14 | if (abbrev && number >= 1000000) {
15 | return formatNumber(Math.round(number / 100000) / 10, false) + 'M';
16 | } else if (abbrev && number >= 1000) {
17 | return formatNumber(Math.round(number / 1000), false) + 'k';
18 | } else {
19 | return ('' + number).replace(/\B(?=(\d{3})+(?!\d))/g, ',');
20 | }
21 | }
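// e.g. formatNumber(1234567, true) => '1.2M', formatNumber(5400, true) => '5k',
// formatNumber(1234567, false) => '1,234,567'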
22 |
23 | export const splitUrls = (str) => {
24 | return str
25 | .split('\n')
26 | .map(x => x.trim())
27 | .filter(x => !!x && x != '');
28 | }
29 |
30 | export const parseJsonl = (str) => {
31 | const lines = str.split('\n');
32 | // console.log('parseJsonl', lines);
33 | const result = [];
34 | for (const line of lines) {
35 | try {
36 | result.push(JSON5.parse(line));
37 | } catch(e) {
38 | // console.warn('skipping invalid jsonl:', line);
39 | }
40 | }
41 | return result;
42 | }
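// e.g. parseJsonl('{"itemCount": 1}\n{"a": "b"}\nnot json') drops the
// unparseable line and returns [{ itemCount: 1 }, { a: 'b' }]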
43 |
44 | export const getJobColumn = (job, header) => {
45 | const col = [];
46 | for (const target of job?.results?.targets || []) {
47 | if (header == 'URL') {
48 | col.push(target.url);
49 | } else {
50 | for (const a of (target.answer || [])) {
51 | col.push(a[header] || '');
52 | }
53 | }
54 | }
55 |
56 | return col;
57 | }
58 |
59 | export const getJobUrl = (job) => {
60 | if (job.urls?.action == 'gather') {
61 | return job.urls?.url;
62 | } else if (job.urls?.action == 'current') {
63 | return job.urls?.currentUrl;
64 | } else if (job.urls?.action == 'manual') {
65 | return job.urls?.manualUrls;
66 | }
67 | return '';
68 | }
69 |
--------------------------------------------------------------------------------
/src/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "manifest_version": 3,
3 | "name": "FetchFox",
4 | "description": "FetchFox lets you scrape any site for any data, using AI",
5 | "options_page": "options.html",
6 | "background": { "service_worker": "background.bundle.js" },
7 | "action": {
8 | "default_popup": "popup.html",
9 | "default_icon": "icon-34.png"
10 | },
11 | "browser_action": {
12 | "default_icon": "icon-34.png",
13 | "default_title": "FetchFox"
14 | },
15 | "side_panel": {
16 | "default_path": "panel.html"
17 | },
18 | "permissions": [
19 | "webNavigation",
20 | "sidePanel",
21 | "tabs",
22 | "scripting",
23 | "storage"
24 | ],
25 | "host_permissions": [""],
26 | "icons": {
27 | "128": "icon-128.png"
28 | },
29 | "content_scripts": [
30 | {
31 | "matches": [""],
32 | "js": ["contentScript.bundle.js"],
33 | "css": ["content.styles.css"]
34 | }
35 | ],
36 | "devtools_page": "devtools.html",
37 | "web_accessible_resources": [
38 | {
39 | "resources": ["content.styles.css", "icon-128.png", "icon-34.png"],
40 | "matches": []
41 | }
42 | ]
43 | }
44 |
--------------------------------------------------------------------------------
/src/pages/Background/index.js:
--------------------------------------------------------------------------------
1 | import { GATHER_TARGETS_PROMPT, EXAMPLE_LINKS } from '../../lib/constants.mjs';
2 | import { sleep, splitUrls } from '../../lib/util.mjs';
3 | import { exec } from '../../lib/ai.mjs';
4 | import {
5 | getKey,
6 | setKey,
7 | nextId,
8 | saveJob,
9 | getActiveJob,
10 | setJobField,
11 | setActiveJob,
12 | setJobResults,
13 | setStatus,
14 | setPercent,
15 | setScrapeStatus,
16 | setScrapeAnswer,
17 | pushConsole,
18 | } from '../../lib/store.mjs';
19 | import { getPageData, getTabData, getActiveTab, reportSleep } from '../../lib/navigation.mjs';
20 | import { parseLinks, cleanLinks, dedupeLinks } from '../../lib/gather.mjs';
21 | import { scrapePage } from '../../lib/scrape.mjs';
22 | import { getRoundId, isActive, runStopListeners, advanceRound } from '../../lib/controller.mjs';
23 | import { nameTemplate } from '../../lib/templates.mjs';
24 | import { sendReport } from '../../lib/report.mjs';
25 | import { initSentry } from '../../lib/errors.mjs';
26 | import icon34 from '../../assets/img/icon-34.png';
27 |
28 | initSentry();
29 |
30 | let iconInterval;
31 | let loadingRoundId;
32 | chrome.storage.onChanged.addListener(async (changes) => {
33 | if (changes.inFlight) {
34 | if (changes.inFlight.newValue == 0) {
35 | if (iconInterval) {
36 | clearInterval(iconInterval);
37 | iconInterval = null;
38 | }
39 | chrome.action.setIcon({ path: icon34 });
40 | } else if (iconInterval == null) {
41 |
42 | const context = (new OffscreenCanvas(100, 100)).getContext('2d');
43 | const start = new Date();
44 | const lines = 16;
45 | const cW = 40;
46 | const cH = 40;
47 |
48 | loadingRoundId = await getRoundId();
49 |
50 | iconInterval = setInterval(() => {
51 | const rotation = parseInt(((new Date() - start) / 1000) * lines) / lines;
52 | context.save();
53 | context.clearRect(0, 0, cW, cH);
54 | context.translate(cW / 2, cH / 2);
55 | context.rotate(Math.PI * 2 * rotation);
56 | for (var i = 0; i < lines; i++) {
57 | context.beginPath();
58 | context.rotate(Math.PI * 2 / lines);
59 | context.moveTo(cW / 10, 0);
60 | context.lineTo(cW / 4, 0);
61 | context.lineWidth = cW / 30;
62 | context.strokeStyle = 'rgba(0, 0, 0,' + i / lines + ')';
63 | // context.strokeStyle = 'rgba(223, 101, 70,' + i / lines + ')';
64 | context.stroke();
65 | }
66 | const imageData = context.getImageData(10, 10, 19, 19);
67 | chrome.action.setIcon({ imageData });
68 | context.restore();
69 | }, 1000/30);
70 | }
71 | }
72 | });
73 |
74 | chrome.runtime.onMessage.addListener(function (req, sender, sendResponse) {
75 | if (req.action != 'console') console.log('bg got message:', req);
76 |
77 | if (req.action == 'runJob') runJob(req.job, req.tabId);
78 | else if (req.action == 'runGather') runGather(req.job, req.tabId);
79 | else if (req.action == 'runScrape') runScrape(req.job, req.urls);
80 | else if (req.action == 'stop') runStopListeners();
81 |
82 | else if (req.action == 'checkLoading') {
83 | checkLoading(req.text, req.html).then(sendResponse);
84 | return true;
85 | }
86 |
87 | else if (req.action == 'reportSleep') {
88 | reportSleep(req.url, req.msec);
89 | }
90 |
91 | else if (req.action == 'setStatus') {
92 | setStatus(req.message);
93 | }
94 |
95 | else if (req.action == 'console') {
96 | saveConsole(req.key, req.args);
97 | }
98 |
99 | else if (req.action == 'reportBug') {
100 | reportBug().then(sendResponse);
101 | return true;
102 | }
103 |
104 | else if (req.action == 'nextId') {
105 | nextId().then(sendResponse);
106 | return true;
107 | }
108 | });
109 |
110 | const runJob = async (job, tabId) => {
111 | const roundId = await getRoundId();
112 |
113 | job = await maybeNameJob(job);
114 | if (!await isActive(roundId)) return;
115 |
116 | await setStatus('Run job ' + job.name, roundId, 1);
117 |
118 | let targets;
119 | let gatherShare = 0.25;
120 |
121 | const mergeTargets = (newTargets) => {
122 | // Merge with existing pagination results, if any
123 | const merged = [];
124 | const existing = job.results?.targets || [];
125 | const partialComplete = existing.filter(x => x.status != 'scraped').length > 0;
126 | for (const nt of newTargets) {
127 | const e = existing
128 | .filter(t => t.url == nt.url)
129 | .filter(t => t.text == nt.text)
130 | .filter(t => t.status == 'scraped');
131 | if (partialComplete && e.length > 0) {
132 | // Job is partially complete, and we already scraped this one. Skip it.
133 | } else {
134 | merged.push(nt);
135 | }
136 | }
137 | return merged;
138 | }
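// e.g. if a paused run already has { url, text, status: 'scraped' } for a
// target, re-running the job drops that target from the new list and only
// queues entries that were not fully scraped before.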
139 |
140 | if (job.urls.action == 'manual') {
141 | gatherShare = 0;
142 | const manualTargets = splitUrls(job.urls.manualUrls)
143 | .map(url => ({ url, text: '(manual)' }));
144 | targets = mergeTargets(manualTargets);
145 | await setJobResults(job.id, { targets });
146 |
147 | } else if (job.urls.action == 'current') {
148 | const active = await getActiveTab();
149 | let url;
150 | if (active) {
151 | url = active.url;
152 | // Save it for next time, in case Chrome can't find it
153 | setJobField(job.id, 'urls', Object.assign({}, job.urls, { currentUrl: url }));
154 | } else if (job.urls.currentUrl) {
155 | url = job.urls.currentUrl;
156 | }
157 | gatherShare = 0;
158 |
159 | if (job.urls.pagination?.follow) {
160 | const paginationTargets = [];
161 | for (const link of job.urls.pagination.links) {
162 | console.log('look at pagination link', link);
163 | const text = link.pageNumber == 0 ? '(current)' : `(page ${link.pageNumber})`;
164 | paginationTargets.push({ url: link.url, text });
165 | }
166 | targets = mergeTargets(paginationTargets);
167 | console.log('pagination gave targets:', targets);
168 | } else {
169 | targets = [{ url, text: '(current)' }];
170 | }
171 |
172 | await setJobResults(job.id, { targets });
173 |
174 | } else {
175 | gatherShare = 0.25;
176 | targets = await runGather(job, tabId, gatherShare);
177 | targets = targets.concat(job.results?.targets || []);
178 | }
179 |
180 | if (!await isActive(roundId)) return;
181 |
182 | console.log('Call runScrape');
183 | await runScrape(
184 | job,
185 | targets.map(t => t.url),
186 | gatherShare);
187 | if (!await isActive(roundId)) return;
188 |
189 | console.log('all done, lets advance the round just in case');
190 | await advanceRound();
191 | await setStatus('Completed job ' + job.name);
192 | }
193 |
194 | const maybeNameJob = async (job) => {
195 | if (job.name.indexOf('Untitled') != -1 || job.name.indexOf('undefined') != -1) {
196 | const name = '' + (await runGetName(job));
197 |
198 | console.log('maybeNameJob got name:', name);
199 |
200 | job.name = name;
201 | await saveJob(job);
202 | }
203 | return job;
204 | }
205 |
206 | const runGetName = async (job) => {
207 | const slim = {};
208 | slim.scrape = job.scrape;
209 | slim.urls = job.urls;
210 | return exec('name', { job: JSON.stringify(slim, null, 2) })
211 | .then(x => x.name);
212 | };
213 |
214 | const runGather = async (job, tabId, percentFactor) => {
215 | if (!percentFactor) percentFactor = 1;
216 | const roundId = await getRoundId();
217 | await setStatus('Start job', roundId, 1);
218 | await setPercent(0.01);
219 | job = await maybeNameJob(job);
220 |
221 | console.log('runGather got tabId:', tabId);
222 |
223 | let tabUrl;
224 | if (tabId) {
225 | const activeTab = await getActiveTab();
226 | tabUrl = activeTab?.url;
227 | }
228 |
229 | const urlsList = splitUrls(job.urls.url);
230 |
231 | console.log('urlsList', urlsList);
232 | let links = [];
233 | for (let i = 0; i < urlsList.length; i++) {
234 | const url = urlsList[i];
235 | console.log('gather from url:', url);
236 | if (!url) continue;
237 |
238 | if (!await isActive(roundId)) return links;
239 |
240 | setStatus('Crawl URLs from ' + url);
241 |
242 | let page;
243 |
244 | console.log('gather current tab?', tabId, tabUrl, url);
245 |
246 | if (tabId && tabUrl == url) {
247 | page = await getTabData(tabId, { shouldClose: false });
248 | } else {
249 | page = await getPageData(
250 | url,
251 | {
252 | active: job.scrape?.concurrency < 0,
253 | sleepTime: job.scrape?.sleepTime,
254 | });
255 | }
256 |
257 | console.log('gather got page:', page);
258 |
259 | if (page?.error) {
260 | console.error('Error, skipping ' + url, page?.error);
261 | await setScrapeStatus(job.id, roundId, [url], 'error');
262 | continue;
263 | }
264 |
265 | const factor = (i + 1) / urlsList.length;
266 | const partial = await parseLinks(
267 | page,
268 | job.urls.question,
269 | (targets, percent) => {
270 | console.log('changes percent cb', targets, percent);
271 | setJobResults(job.id, { targets });
272 | setPercent(percent * percentFactor * factor);
273 | });
274 |
275 | console.log('got partial', partial);
276 |
277 | if (partial) {
278 | links = cleanLinks(dedupeLinks(links.concat(partial)));
279 | setJobResults(job.id, { targets: links });
280 | console.log('links is now:', links);
281 | }
282 | }
283 |
284 | console.log('links:', links);
285 |
286 | setStatus('AI found URLs:\n' + JSON.stringify(links, null, 2), roundId, -1);
287 | if (percentFactor == 1) setPercent(null);
288 |
289 | return links;
290 | };
291 |
292 | const checkLoading = async (text, html) => {
293 | // TODO: re-enable this after dev done
294 | // if (true) {
295 | // return { status: 'ok', answer: { status: 'done' } };
296 | // }
297 |
298 |
299 | const job = await getActiveJob();
300 | if (!job) {
301 | // Hack...
302 | return { status: 'ok', answer: { status: 'done' } };
303 | }
304 |
305 | const answer = await exec(
306 | 'checkLoading',
307 | {
308 | text: text.substr(0, 10000),
309 | html: html.substr(0, 30000),
310 | questions: JSON.stringify(job.scrape?.questions || []),
311 | });
312 |
313 | if (!answer) {
314 | return { status: 'error' };
315 | } else {
316 | return { status: 'ok', answer };
317 | }
318 | }
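// e.g. resolves to { status: 'ok', answer: { contentSummary: '...', status: 'done' } }
// on success, or { status: 'error' } if the AI call returned nothing.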
319 |
320 | const runScrape = async (job, urls, percentAdd) => {
321 | if (!percentAdd) percentAdd = 0;
322 | console.log('bg got runscrape', job, urls);
323 |
324 | job = await maybeNameJob(job);
325 |
326 | const usingActive = (job.scrape?.concurrency || 0) < 0;
327 |
328 | const roundId = await getRoundId();
329 | const maxConc = Math.abs(job.scrape?.concurrency || 3);
330 |
331 | console.log('bg running with maxConc', maxConc);
332 |
333 | await setStatus(
334 | 'Queue ' + (urls.length == 1 ? urls[0] : urls.length + ' URLs'),
335 | roundId,
336 | urls.length);
337 |
338 | await setScrapeStatus(
339 | job.id,
340 | roundId,
341 | urls,
342 | 'queued');
343 |
344 | const extraRules = (job.urls.action == 'gather'
345 | ? 'Important: For this scrape, ONLY find exactly 1 item. So itemCount will always be 1, and you will return only 1 result after that.'
346 | : '');
347 |
348 | const itemDescription = (job.urls.action == 'manual'
349 | ? job.urls.question
350 | : '');
351 |
352 | const fn = async (url, index, cb) => {
353 | console.log('bg runscrape got url', index, url);
354 | await setScrapeStatus(job.id, roundId, [url], 'scraping');
355 |
356 | if (!await isActive(roundId)) return [null, null, null];
357 | console.log('bg runscrape getting page data', url);
358 |
359 | let timeoutId;
360 | const options = {
361 | active: job.scrape?.concurrency < 0,
362 | sleepTime: job.scrape?.sleepTime,
363 | onCreate: (tab) => {
364 | timeoutId = setTimeout(
365 | () => {
366 | try { chrome.tabs.remove(tab.id) } catch(e) {};
367 | },
368 | 15*1000);
369 | }
370 | };
371 | const page = await getPageData(url, options);
372 |
373 | if (timeoutId) clearTimeout(timeoutId);
374 |
375 | console.log('bg runscrape got page data', url, page);
376 |
377 | if (page.error) {
378 | return [index, url, { error: page.error }];
379 | }
380 |
381 | if (!await isActive(roundId)) return [null, null, null];
382 | console.log('bg runscrape scraping', url);
383 | let result;
384 | try {
385 | result = await scrapePage(
386 | page,
387 | job.scrape.questions,
388 | job.urls?.perPage,
389 | itemDescription,
390 | extraRules,
391 | cb);
392 | } catch(e) {
393 | console.error('scrapePage gave error:', e);
394 | throw e;
395 | }
396 | console.log('bg runscrape scraped', url, result);
397 |
398 | if (!await isActive(roundId)) return [null, null, null];
399 |
400 | return [index, url, result];
401 | };
402 |
403 | let next = 0;
404 | let done = 0;
405 |
406 | let p = [];
407 |
408 | while (next <= urls.length) {
409 | for (let i = p.filter(x => !!x).length; i < maxConc && next < urls.length; i++) {
410 | const url = urls[next++];
411 | const index = p.length;
412 | console.log('bg nnn runscrape start:', next, index, url);
413 | p.push(fn(
414 | url,
415 | index,
416 | ({ items, percent }) => {
417 | console.log('partial partialItems', items, percent);
418 | setScrapeAnswer(job.id, url, items);
419 |
420 | if (percent) {
421 | setPercent(percent, 0, 1);
422 | }
423 | }));
424 | if (usingActive) await sleep(2000);
425 | }
426 |
427 | console.log('bg nnn runscrape wait for any', next, urls.length, p);
428 | const l = p.filter(x => !!x);
429 | if (l.length == 0) break;
430 | let [doneIndex, url, result] = [null, null, null];
431 | console.log('Promise.any', l);
432 | [doneIndex, url, result] = await Promise.any(l);
433 |
434 | if (doneIndex === null) break;
435 |
436 | console.log('bg nnn runscrape got completed:', doneIndex);
437 | console.log('bg runscrape setting results/status', url);
438 |
439 | done++;
440 |
441 | if (result.error) {
442 | await setScrapeStatus(job.id, roundId, [url], 'error');
443 | } else {
444 | await setScrapeStatus(job.id, roundId, [url], 'scraped');
445 | await setScrapeAnswer(job.id, url, result);
446 | }
447 |
448 | await setStatus(
449 | (result.error ? 'Error' : 'Scraped') +
450 | ' (' + next + '/' + urls.length + ') ' + url, roundId, -1);
451 |
452 | await setPercent(
453 | percentAdd + ((done / urls.length) * (1 - percentAdd)),
454 | done,
455 | urls.length,
456 | );
457 |
458 | p[doneIndex] = null;
459 | }
460 |
461 | setPercent(null);
462 | }
463 |
464 | let consoleMessages = [];
465 | let consoleTimeoutId = null;
466 | const saveConsole = (key, args) => {
467 | // Disable for now
468 | setKey('consoleMessages', []);
469 | return;
470 |
471 | // Do not put any console.log() statements in here
472 |
473 | const message = [
474 | '' + (new Date()),
475 | key,
476 | JSON.stringify(args),
477 | ].join('\t').substr(0, 5000); // max 5kB per message
478 | consoleMessages.push(message);
479 |
480 |
481 | // Buffer and write
482 | if (consoleTimeoutId) clearTimeout(consoleTimeoutId);
483 |
484 | consoleTimeoutId = setTimeout(
485 | async () => {
486 | const prev = (await getKey('consoleMessages')) || [];
487 | consoleMessages = prev.concat(consoleMessages);
488 |
489 | const l = consoleMessages.length;
490 | const max = 100000;
491 | if (l > max) {
492 | consoleMessages = consoleMessages.slice(l - max);
493 | }
494 |
495 | setKey('consoleMessages', consoleMessages);
496 | consoleMessages = [];
497 | }, 1000);
498 | }
499 |
500 | (() => {
501 | const devMode = !('update_url' in chrome.runtime.getManifest());
502 | if (devMode) return;
503 |
504 | for (const key of ['log', 'warn', 'error']) {
505 | const original = console[key];
506 | console[key] = (...args) => {
507 | original(...args);
508 | saveConsole(key, args);
509 | };
510 | }
511 | })();
512 |
513 | const reportBug = async () => {
514 | const messages = (await getKey('consoleMessages')) || [];
515 | return sendReport(messages.join('\n'));
516 | }
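// Usage sketch: UI pages kick off work by messaging this background script,
// matching the onMessage actions handled above, e.g.:
//
//   chrome.runtime.sendMessage({ action: 'runJob', job, tabId });
//   chrome.runtime.sendMessage({ action: 'stop' });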
517 |
--------------------------------------------------------------------------------
/src/pages/Content/content.styles.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/pages/Content/content.styles.css
--------------------------------------------------------------------------------
/src/pages/Content/index.js:
--------------------------------------------------------------------------------
1 | import { printLine } from './modules/print';
2 |
3 | console.log('Content script works!');
4 | console.log('Must reload extension for modifications to take effect.');
5 |
6 | printLine("Using the 'printLine' function from the Print Module");
7 |
--------------------------------------------------------------------------------
/src/pages/Content/modules/print.js:
--------------------------------------------------------------------------------
1 | export const printLine = (line) => {
2 | console.log('===> FROM THE PRINT MODULE:', line);
3 | };
4 |
--------------------------------------------------------------------------------
/src/pages/Devtools/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/pages/Devtools/index.js:
--------------------------------------------------------------------------------
1 | chrome.devtools.panels.create(
2 | 'Dev Tools from chrome-extension-boilerplate-react',
3 | 'icon-34.png',
4 | 'panel.html'
5 | );
6 |
--------------------------------------------------------------------------------
/src/pages/Newtab/Newtab.css:
--------------------------------------------------------------------------------
1 | .App {
2 | text-align: center;
3 | }
4 |
5 | .App-logo {
6 | height: 40vmin;
7 | pointer-events: none;
8 | }
9 |
10 | @media (prefers-reduced-motion: no-preference) {
11 | .App-logo {
12 | animation: App-logo-spin infinite 20s linear;
13 | }
14 | }
15 |
16 | .App-header {
17 | background-color: #282c34;
18 | min-height: 100vh;
19 | display: flex;
20 | flex-direction: column;
21 | align-items: center;
22 | justify-content: center;
23 | font-size: calc(10px + 2vmin);
24 | color: white;
25 | }
26 |
27 | .App-link {
28 | color: #61dafb;
29 | }
30 |
31 | @keyframes App-logo-spin {
32 | from {
33 | transform: rotate(0deg);
34 | }
35 | to {
36 | transform: rotate(360deg);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/pages/Newtab/Newtab.scss:
--------------------------------------------------------------------------------
1 | $myColor: orange;
2 |
3 | h1,
4 | h2,
5 | h3,
6 | h4,
7 | h5,
8 | h6 {
9 | color: $myColor;
10 | }
11 |
--------------------------------------------------------------------------------
/src/pages/Newtab/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/src/pages/Newtab/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Chrome Extension Boilerplate (with React 16.6+ & Webpack 4+)
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/pages/Newtab/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { createRoot } from 'react-dom/client';
3 |
4 | import Newtab from './Newtab';
5 | import './index.css';
6 |
7 | const container = document.getElementById('app-container');
8 | const root = createRoot(container); // createRoot(container!) if you use TypeScript
9 | root.render( );
10 |
--------------------------------------------------------------------------------
/src/pages/Options/Options.css:
--------------------------------------------------------------------------------
1 | .OptionsContainer {
2 | width: 100%;
3 | height: 50vh;
4 | font-size: 2rem;
5 | display: flex;
6 | align-items: center;
7 | justify-content: center;
8 | }
9 |
--------------------------------------------------------------------------------
/src/pages/Options/Options.tsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import './Options.css';
3 |
4 | interface Props {
5 | title: string;
6 | }
7 |
8 | const Options: React.FC<Props> = ({ title }: Props) => {
9 |   return <div className="OptionsContainer">{title} Page</div>;
10 | };
11 |
12 | export default Options;
13 |
--------------------------------------------------------------------------------
/src/pages/Options/index.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/pages/Options/index.css
--------------------------------------------------------------------------------
/src/pages/Options/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Settings
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/pages/Options/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { createRoot } from 'react-dom/client';
3 |
4 | import Options from './Options';
5 | import './index.css';
6 |
7 | const container = document.getElementById('app-container');
8 | const root = createRoot(container); // createRoot(container!) if you use TypeScript
9 | root.render( );
10 |
--------------------------------------------------------------------------------
/src/pages/Panel/Panel.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color: #242424;
3 | }
4 |
5 | .container {
6 | color: #ffffff;
7 | }
--------------------------------------------------------------------------------
/src/pages/Panel/Panel.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { Scrape } from '../../components/scrape/Scrape';
3 |
4 | const Panel = () => {
5 | return ;
6 | }
7 |
8 | export default Panel;
9 |
--------------------------------------------------------------------------------
/src/pages/Panel/index.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/pages/Panel/index.css
--------------------------------------------------------------------------------
/src/pages/Panel/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Dev Tools Panel
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/pages/Panel/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { createRoot } from 'react-dom/client';
3 |
4 | import Panel from './Panel';
5 | import './index.css';
6 |
7 | import { initSentry } from '../../lib/errors.mjs';
8 | initSentry();
9 |
10 | const container = document.getElementById('app-container');
11 | const root = createRoot(container);
12 |
13 | root.render( );
14 |
--------------------------------------------------------------------------------
/src/pages/Popup/Popup.css:
--------------------------------------------------------------------------------
1 | .App {
2 | color: white;
3 | height: 100%;
4 | padding: 10px;
5 | background-color: #282c34;
6 | font-size: 12px;
7 | }
8 |
9 | .App-logo {
10 | height: 30vmin;
11 | pointer-events: none;
12 | }
13 |
14 | @media (prefers-reduced-motion: no-preference) {
15 | .App-logo {
16 | animation: App-logo-spin infinite 20s linear;
17 | }
18 | }
19 |
20 | .App-header {
21 | height: 100%;
22 | display: flex;
23 | flex-direction: column;
24 | align-items: center;
25 | justify-content: center;
26 | font-size: calc(10px + 2vmin);
27 | }
28 |
29 | .App-link {
30 | color: #61dafb;
31 | }
32 |
33 | @keyframes App-logo-spin {
34 | from {
35 | transform: rotate(0deg);
36 | }
37 | to {
38 | transform: rotate(360deg);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/pages/Popup/Popup.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import { Scrape } from '../../components/scrape/Scrape';
3 |
4 | console.log('Replace console log');
5 | const original = console;
6 | // Spread original first so the intercepting log below overrides it
7 | console = Object.assign(
8 | {},
9 | original,
10 | {
11 | log: (...args) => {
12 | original.log('INTERCEPTED', args);
13 | }
14 | });
15 |
16 | const Popup = () => {
17 |   return (
18 |     <div className="App">
19 |       <Scrape />
20 |     </div>
21 |   );
22 | }
23 |
24 | export default Popup;
25 |
--------------------------------------------------------------------------------
/src/pages/Popup/index.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fetchfox/fetchfox-extension/698f8969568fc2a88a8abc10a368e37c842aa332/src/pages/Popup/index.css
--------------------------------------------------------------------------------
/src/pages/Popup/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 |   <head>
4 |     <meta charset="utf-8" />
5 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
6 |     <title>Popup</title>
7 |   </head>
8 |   <body>
9 |     <div id="app-container"></div>
10 |   </body>
11 | </html>
12 |
--------------------------------------------------------------------------------
/src/pages/Popup/index.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { createRoot } from 'react-dom/client';
3 | import Popup from './Popup';
4 | import './index.css';
5 | import { initSentry } from '../../lib/errors.mjs';
6 |
7 | initSentry();
8 |
9 | (() => {
10 |   const devMode = !('update_url' in chrome.runtime.getManifest()); // store installs have an update_url
11 | if (devMode) return;
12 |
13 | for (const key of ['log', 'warn', 'error']) {
14 | const original = console[key];
15 | console[key] = (...args) => {
16 | chrome.runtime.sendMessage({
17 | action: 'console',
18 | key,
19 | args,
20 | });
21 | original(...args);
22 | };
23 | }
24 | })();
25 |
26 | const container = document.getElementById('app-container');
27 | const root = createRoot(container);
28 | root.render(
29 |   <Popup />
30 | );
31 |
--------------------------------------------------------------------------------
/src/state/errors.js:
--------------------------------------------------------------------------------
1 | import { useLocal } from "./storage";
2 |
3 | export function useGlobalError() {
4 | const [globalError] = useLocal("globalError");
5 | return globalError;
6 | }
7 |
--------------------------------------------------------------------------------
/src/state/gather.js:
--------------------------------------------------------------------------------
1 | import { useEffect, useState, useRef } from 'react';
2 | import { findPagination } from '../lib/gather.mjs';
3 |
4 | // Finds pagination links for `page`, re-running whenever page.url
5 | // changes. Results are recorded per-URL in cacheRef, and didInit
6 | // flips to true once the first lookup resolves.
7 |
8 | export const usePagination = (page) => {
9 | const [loading, setLoading] = useState(true);
10 | const [didInit, setDidInit] = useState(false);
11 | const [links, setLinks] = useState({});
12 |
13 | const cacheRef = useRef(new Map());
14 |
15 | useEffect(() => {
16 | if (!page?.url) return;
17 |
18 | setLoading(true);
19 | findPagination(page).then((result) => {
20 | cacheRef.current.set(page.url, result);
21 | setLoading(false);
22 | setDidInit(true);
23 |       setLinks(result);
24 | });
25 | }, [page?.url]);
26 |
27 | return { loading, didInit, links };
28 | }
29 |
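A usage sketch for usePagination, pairing it with useActivePage from navigation.js further down. The component is hypothetical and the assumption that `links` maps keys to URLs is mine (findPagination is not shown in this section); import paths assume a file alongside the hooks in src/state.

    // Hypothetical consumer; the shape of `links` is an assumption.
    import React from 'react';
    import { useActivePage } from './navigation';
    import { usePagination } from './gather';

    const PaginationPreview = () => {
      const page = useActivePage();
      const { loading, didInit, links } = usePagination(page);

      // didInit distinguishes "first lookup still running" from "loaded, but empty"
      if (!didInit || loading) return <p>Finding pagination…</p>;
      return (
        <ul>
          {Object.entries(links).map(([key, url]) => (
            <li key={key}>{String(url)}</li>
          ))}
        </ul>
      );
    };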
--------------------------------------------------------------------------------
/src/state/jobs.js:
--------------------------------------------------------------------------------
1 | import { storage, useLocal } from "./storage";
2 | import { sift, sort } from "radash";
3 | import { useEffect, useMemo, useState } from "react";
4 |
5 | export const useJobs = () => {
6 | const [jobs, setJobs] = useState({});
7 | const [jobIds] = useLocal("jobs_ids");
8 |
9 | useEffect(() => {
10 | const keys = jobIds || [];
11 |
12 | const listeners = {};
13 | keys.forEach((id) => {
14 | const key = 'job_' + id;
15 | listeners[key] = async (c) => {
16 | setJobs((old) => ({ ...old, [id]: c.newValue }));
17 | };
18 | });
19 |
20 | storage.watch(listeners);
21 | return () => storage.unwatch(listeners);
22 |   }, [jobIds]);
23 |
24 | return useMemo(() => {
25 | const sortedJobIds = sort(jobIds || [], (it) => parseInt(it), true);
26 | const sortedJobs = sortedJobIds.map((id) => jobs[id]);
27 | return sift(sortedJobs);
28 | }, [jobIds, jobs]);
29 | };
30 |
31 | export const useJob = (jobId) => {
32 |   // didInit for consumers is tracked inside the result object below.
33 | const [job, setJob] = useLocal('job_' + jobId, 'loading');
34 | const [result, setResult] = useState({ job: null, setJob, didInit: false });
35 |
36 | useEffect(() => {
37 | setResult({ job: null, setJob, didInit: false });
38 | }, [jobId]);
39 |
40 | useEffect(() => {
41 | if (jobId === undefined) {
42 | setResult({ job: null, setJob, didInit: true });
43 | } else if (job == 'loading') {
44 | setResult({ job: null, setJob, didInit: false });
45 | } else {
46 | setResult({ job, setJob, didInit: true });
47 | }
48 | }, [job]);
49 |
50 | return result;
51 | };
52 |
53 | export const useActiveJob = () => {
54 | const [activeId] = useLocal("activeId");
55 | return useJob(activeId);
56 | };
57 |
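How the pieces above fit together: "jobs_ids" is an index of job ids, each job lives under its own "job_<id>" storage key, and useJobs watches every per-job key, sorting newest-first (radash sort with desc = true) and sifting out missing entries. A hypothetical consumer:

    // Hypothetical component; `name` on a job is an assumed field.
    import React from 'react';
    import { useJobs, useActiveJob } from './jobs';

    const JobList = () => {
      const jobs = useJobs();                       // newest first, gaps removed
      const { job: active, didInit } = useActiveJob();

      if (!didInit) return <p>Loading…</p>;
      return (
        <ul>
          {jobs.map((j) => (
            <li key={j.id}>
              {j.id === active?.id ? '* ' : ''}{j.name || j.id}
            </li>
          ))}
        </ul>
      );
    };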
--------------------------------------------------------------------------------
/src/state/navigation.js:
--------------------------------------------------------------------------------
1 | import { useState, useEffect } from 'react';
2 | import { getTabData } from '../lib/navigation.mjs';
3 |
4 | export const useActivePage = () => {
5 | const [page, setPage] = useState();
6 |
7 | useEffect(() => {
8 | // TODO: update on navigation
9 | getTabData()
10 | .then(resp => {
11 | console.log('use active page resp', resp);
12 | setPage(resp);
13 | });
14 | }, []);
15 |
16 | return page;
17 | }
18 |
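A consumer sketch for useActivePage (hypothetical component; per the TODO above, the value resolves once on mount and does not yet refresh on navigation). The `url` field on the resolved page is the same one the rest of this codebase relies on, e.g. usePagination keys on page.url.

    // Hypothetical; not part of this repo.
    import React from 'react';
    import { useActivePage } from './navigation';

    const ActiveUrl = () => {
      const page = useActivePage();
      return page ? <span>{page.url}</span> : null;
    };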
--------------------------------------------------------------------------------
/src/state/openai.js:
--------------------------------------------------------------------------------
1 | import { useLocal } from "./storage.js";
2 | import { useEffect, useState } from "react";
3 | import { getModel, getAvailableModels } from "../lib/ai.mjs";
4 | import { useRoundId } from "../lib/controller.mjs";
5 | import OpenAI from "openai";
8 |
9 | export const useOpenAiKey = () => {
10 | const [key, , { isLoading: keyIsLoading }] = useLocal("openAiKey");
11 | const [plan, , { isLoading: planIsLoading }] = useLocal("openAiPlan");
12 | const loading = keyIsLoading || planIsLoading;
13 |
14 | return { key, plan, loading };
15 | };
16 |
17 | export const useOpenAiModels = () => {
18 | const [model, setModel] = useState();
19 | const [available, setAvailable] = useState([]);
20 | const openai = useOpenAiKey();
21 |
22 | useEffect(() => {
23 | getAvailableModels().then(setAvailable);
24 | getModel().then(setModel);
25 | }, [openai.key]);
26 |
27 | return { model, available };
28 | };
29 |
30 | export const useUsage = () => {
31 | const roundId = useRoundId();
32 | const [usage] = useLocal("roundUsage_" + roundId);
33 | return usage || {};
34 | };
35 |
36 | export const useQuota = () => {
37 | const [quota, setQuota] = useState({ ok: true });
38 | const { key: openaiKey, plan: openaiPlan } = useOpenAiKey();
39 | const models = useOpenAiModels();
40 |
41 | useEffect(() => {
42 | if (!openaiKey) return;
43 | if (!models?.model) return;
44 |
45 | if (openaiPlan === "free") {
46 | setQuota({ credits: 1, ok: true });
47 | return;
48 | }
49 |
50 | const client = new OpenAI({
51 | apiKey: openaiKey,
52 | dangerouslyAllowBrowser: true,
53 | });
54 |
55 | // There's no endpoint for quota available, so just run
56 | // a test prompt
57 | client.chat.completions
58 | .create({
59 | model: models.model,
60 | messages: [{ role: "user", content: "test" }],
61 | })
62 | .then((resp) => {
63 | setQuota({ credits: 1, ok: true });
64 | })
65 | .catch((err) => {
66 | if (err.code === "insufficient_quota") {
67 | setQuota({ credits: 0, error: err, ok: false });
68 | } else {
69 | setQuota({ error: err, ok: false });
70 | }
71 | });
72 | }, [openaiPlan, openaiKey, models?.model]);
73 |
74 | return quota;
75 | };
76 |
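A note on useQuota: since OpenAI exposes no quota-checking endpoint, the hook spends one minimal chat completion as a probe (a real, if tiny, API call per key/model change) and maps an insufficient_quota error code to ok: false; any other failure also clears ok but keeps the error around for display. A hypothetical gate built on it:

    // Hypothetical wrapper; not part of this repo.
    import React from 'react';
    import { useQuota } from './openai';

    const QuotaGate = ({ children }) => {
      const quota = useQuota();
      if (quota.ok) return children;
      return <p>OpenAI request failed: {String(quota.error?.message || 'no quota')}</p>;
    };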
--------------------------------------------------------------------------------
/src/state/storage.js:
--------------------------------------------------------------------------------
1 | import { useStorage } from "@plasmohq/storage/hook";
2 | import { Storage } from "@plasmohq/storage";
3 |
4 | export const storage = new Storage({ area: 'local' });
5 |
6 | // Thin wrapper around Plasmo's useStorage hook, pinned to the
7 | // extension's local storage area. Returns the hook's usual
8 | // [value, setValue, { isLoading, ... }] tuple.
9 | export const useLocal = (key, initial) => {
10 |   return useStorage({ key, instance: storage }, initial);
11 | }
12 |
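The useLocal contract, per @plasmohq/storage's hook and the destructuring already used in openai.js above, is a [value, setValue, extras] tuple. A hypothetical read/write example (the 'openAiKey' key is real, the component is not):

    // Hypothetical component, sketched for illustration only.
    import React from 'react';
    import { useLocal } from './storage';

    const KeyEditor = () => {
      const [key, setKey, { isLoading }] = useLocal('openAiKey', '');
      if (isLoading) return null;
      return (
        <input
          value={key || ''}
          onChange={(e) => setKey(e.target.value)}
        />
      );
    };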
--------------------------------------------------------------------------------
/src/state/util.js:
--------------------------------------------------------------------------------
1 | import { useEffect, useState, useRef } from 'react';
2 | import { getKey } from '../lib/store.mjs';
3 | import { useActiveJob } from './jobs';
4 | import { set } from 'radash';
5 |
6 | export const useAutoSleepTime = () => {
7 | const [average, setAverage] = useState();
8 | const [times, setTimes] = useState();
9 | const [pretty, setPretty] = useState();
10 | const { job } = useActiveJob();
11 |
12 | const parse = (loadSleepTimes) => {
13 | console.log('autosleep update loadSleepTimes', job, loadSleepTimes);
14 | const times = [];
15 | for (const target of (job?.results?.targets || [])) {
16 | const hostname = (new URL(target.url)).hostname;
17 | if (loadSleepTimes[hostname]) {
18 | const values = (loadSleepTimes[hostname].times || []).map(parseFloat);
19 | times.push(...values);
20 | }
21 | }
22 | console.log('loadSleepTimes got times', times);
23 |
24 | if (times.length == 0) return;
25 |
26 | times.sort((a, b) => a - b);
27 | const lo = Math.round(times[Math.floor(times.length * 0.2)] / 1000);
28 | const hi = Math.round(times[Math.floor(times.length * 0.8)] / 1000);
29 | if (lo == hi) {
30 | setPretty(`~${lo} seconds`);
31 | } else {
32 | setPretty(`~${lo}-${hi} seconds`);
33 | }
34 | setAverage(times.reduce((acc, v) => acc + v, 0) / times.length);
35 | setTimes(times);
36 | }
37 |
38 | const update = (changes) => {
39 | if (changes.loadSleepTimes) {
40 | parse(changes.loadSleepTimes.newValue);
41 | }
42 | };
43 |
44 | useEffect(() => {
45 | if (!job?.id) return;
46 |
47 | getKey('loadSleepTimes').then(parse);
48 | chrome.storage.onChanged.addListener(update);
49 | return () => chrome.storage.onChanged.removeListener(update);
50 | }, [job?.id]);
51 |
52 | return { average, times, pretty };
53 | }
54 |
55 | export const useMirror = (orig, setOrig) => {
56 | const [mirror, setMirror] = useState();
57 |
58 | useEffect(() => {
59 | if (JSON.stringify(mirror) == JSON.stringify(orig)) return;
60 | setMirror(orig);
61 | }, [orig]);
62 |
63 | const timeoutRef = useRef();
64 | const delayedSet = (updates) => {
65 |     // Apply each [path, value] update immutably via radash set(), then hand off.
66 | const run = (copy, setter) => {
67 | for (const [keys, val] of updates) {
68 | copy = set(copy, keys, val);
69 | }
70 | setter(copy);
71 | }
72 | run({...mirror}, setMirror);
73 | if (timeoutRef.current) {
74 | clearTimeout(timeoutRef.current);
75 | }
76 | timeoutRef.current = setTimeout(
77 | () => run({...orig}, setOrig),
78 | 1000);
79 | }
80 |
81 | return [mirror, delayedSet];
82 | }
83 |
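The `pretty` range in useAutoSleepTime is a 20th–80th percentile window over observed page-load sleep times, rounded to whole seconds. Worked through on a small sample (values in ms, illustrative only):

    // times (sorted): [800, 1200, 2000, 2600, 9000]
    // lo = Math.round(times[Math.floor(5 * 0.2)] / 1000)  // times[1] = 1200 -> 1
    // hi = Math.round(times[Math.floor(5 * 0.8)] / 1000)  // times[4] = 9000 -> 9
    // pretty  -> "~1-9 seconds"
    // average -> 15600 / 5 = 3120 ms

useMirror, below it, applies edits to a local copy immediately and debounces the write-back to the real setter by one second, so typing stays responsive while storage writes are batched.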
--------------------------------------------------------------------------------
/test/data/linkedin.1.mjs:
--------------------------------------------------------------------------------
1 | export const expected = [
2 | 'https://www.linkedin.com/in/demir9?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAAH-dMBlP1n3TQ2YUqO--EODF4ArKQX7Vk',
3 | 'https://www.linkedin.com/in/ACoAAAAlCCcBIb_YZfytimGDoa4MAiK890-IUjQ?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAAlCCcBIb_YZfytimGDoa4MAiK890-IUjQ',
4 | 'https://www.linkedin.com/in/ACoAAABL-vYBNkB9XFgSG__CFxbdFu_aUQtXvmU?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAABL-vYBNkB9XFgSG__CFxbdFu_aUQtXvmU',
5 | 'https://www.linkedin.com/in/tanmaykhirwadkar?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAALyAQEByDtfiW1zsMpdc3AfQtxnDaOAimg',
6 | 'https://www.linkedin.com/in/ACoAAAABtmYBOijE8UYnK83lnmku4EdtemnEL-Q?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAABtmYBOijE8UYnK83lnmku4EdtemnEL-Q',
7 | 'https://www.linkedin.com/in/ACoAAAgSf_0BEgP3Cu_IaPrqdT3jaFjj2dYPHMo?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAgSf_0BEgP3Cu_IaPrqdT3jaFjj2dYPHMo',
8 | 'https://www.linkedin.com/in/manojganesan?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAADxwbYBTi8VNczb65rquiMEtiNCSZUDQvk',
9 | 'https://www.linkedin.com/in/ACoAAAADMdwBJ2I40IdLfDzk-SPG4pm4KlCWZB8?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAADMdwBJ2I40IdLfDzk-SPG4pm4KlCWZB8',
10 | 'https://www.linkedin.com/in/ACoAAAAE5W8B-WtXiSCpnu4WY5d2hIj2pGpYsck?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAAE5W8B-WtXiSCpnu4WY5d2hIj2pGpYsck',
11 | 'https://www.linkedin.com/in/larryleguo?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAC9H6wBvosY1C5JdXRSV1_WftbNvpi2oTo',
12 | 'https://www.linkedin.com/in/ACoAAAEW-54B6VgMytQoSWrjl7aZ1LG2EBvHB-k?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAEW-54B6VgMytQoSWrjl7aZ1LG2EBvHB-k',
13 | 'https://www.linkedin.com/in/ACoAACz99TsBl6Bl4TCVqwbeyDMFeCZduzhwMFU?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACz99TsBl6Bl4TCVqwbeyDMFeCZduzhwMFU',
14 | 'https://www.linkedin.com/in/levent-koc-38ab6810?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAJOohoBW0DDTfdhC-zgs6oKRu_U9ALMDrs',
15 | 'https://www.linkedin.com/in/tao-li-9487a644?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAlfK8gBHiplx4IJ2GBjZygjWOOJv5NBoV0',
16 | 'https://www.linkedin.com/in/ACoAAADvQF0BaxlvFsR5NlSqKtld8hD9gw-LHZU?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAADvQF0BaxlvFsR5NlSqKtld8hD9gw-LHZU',
17 | 'https://www.linkedin.com/in/ACoAAAFbfp8BxSwGht-lqUBGIOlJ9hdfXivcAnU?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAFbfp8BxSwGht-lqUBGIOlJ9hdfXivcAnU',
18 | 'https://www.linkedin.com/in/paulotanaka?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAXVWQoBpyLb2TYuQ8d7R1-D2WKqzh8wxZU',
19 | 'https://www.linkedin.com/in/ACoAABDGglgBdLuVR_oBw0pv0bTu2SiKL61kKFM?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABDGglgBdLuVR_oBw0pv0bTu2SiKL61kKFM',
20 | 'https://www.linkedin.com/in/tushar-dogra-28058557?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAwCV0ABZrf4uMND-vfpkePO-T35TRhsh9I',
21 | 'https://www.linkedin.com/in/ACoAABJMRe4BBf_DenSjxpK3ugsJx8LWLtdiCNk?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABJMRe4BBf_DenSjxpK3ugsJx8LWLtdiCNk',
22 | 'https://www.linkedin.com/in/sarvjeet?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAADHujkBYn0ZNMFfxBjHPpe_Se2aP7RgzIs',
23 | 'https://www.linkedin.com/in/ACoAAAB1nWABh-fgoV7BUFfhsOARJstgDwsgiF8?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAB1nWABh-fgoV7BUFfhsOARJstgDwsgiF8',
24 | 'https://www.linkedin.com/in/ACoAAA0XzZUB5070rEl1uF1I2U0WxiHn_7jBGpc?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAA0XzZUB5070rEl1uF1I2U0WxiHn_7jBGpc',
25 | 'https://www.linkedin.com/in/saadali?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAADC6Q0BXvFErdzKHta4Pqz0IsqVceVfwZ0',
26 | ];
27 |
--------------------------------------------------------------------------------
/test/data/redfin.1.mjs:
--------------------------------------------------------------------------------
1 | export const expected = [
2 | 'https://www.redfin.com/CA/Mountain-View/221-Cypress-Point-Dr-94043/home/672849',
3 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-10/home/623116',
4 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-128/home/622342',
5 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-131/home/1148236',
6 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-132/home/622349',
7 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-135/home/622368',
8 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-146/home/729108',
9 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-158/home/1070448',
10 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-160/home/623212',
11 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-162/home/622377',
12 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-164/home/623216',
13 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-17/home/622219',
14 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-170/home/1316401',
15 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-177/home/563911',
16 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-183/home/1173647',
17 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-188/home/622412',
18 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-190/home/868628',
19 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-195/home/1466548',
20 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-213/home/623234',
21 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-216/home/622448',
22 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-235/home/643128',
23 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-250/home/1070466',
24 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-252/home/991604',
25 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-272/home/622485',
26 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-273/home/1160302',
27 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-276/home/1186774',
28 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-28/home/1137138',
29 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-286/home/1345471',
30 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-290/home/1174523',
31 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-291/home/991613',
32 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-292/home/622499',
33 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-301/home/1397274',
34 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-44/home/1197678',
35 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-45/home/623151',
36 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-50/home/622236',
37 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-52/home/1462088',
38 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-59/home/768524',
39 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-8/home/1431842',
40 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-82/home/622303',
41 | 'https://www.redfin.com/CA/Mountain-View/505-Cypress-Point-Dr-94043/unit-85/home/1309489',
42 | ];
43 |
--------------------------------------------------------------------------------
/test/testAiGather.mjs:
--------------------------------------------------------------------------------
1 | import assert from 'assert';
2 | import { promises as fs } from 'fs';
3 | import path from 'path';
4 |
5 | import { parseLinks } from '../src/lib/gather.mjs';
6 |
7 | import { expected as expectedAmazonSoap } from './data/amazonsoap.1.mjs';
8 | import { expected as expectedLinkedIn } from './data/linkedin.1.mjs';
9 | import { expected as expectedRedfin } from './data/redfin.1.mjs';
10 | import { expected as expectedEtsy } from './data/etsy.1.mjs';
11 | import { expected as expectedEbay } from './data/ebay.1.mjs';
12 |
13 | const testFile = async (filename, question, expected) => {
14 | const page = await fs.readFile(filename, 'utf8');
15 | const data = JSON.parse(page);
16 | const links = await parseLinks(data, question, () => {});
17 |
18 | console.log('');
19 | console.log('');
20 | console.log('Results:');
21 | let [matches, extras] = [0, 0];
22 |
23 |   const norm = s => decodeURIComponent(s.replaceAll('&amp;', '&'));
24 |   if (expected) expected = expected.map(norm);
25 |
26 | for (let link of links) {
27 | console.log('- text:', link.text.replaceAll('\n', ' '));
28 | console.log('- url: ', link.url);
29 |
30 | if (expected) {
31 |
32 | if (expected.includes(norm(link.url))) {
33 | console.log('- MATCH');
34 | matches++;
35 | } else {
36 | console.log('- EXTRA');
37 | extras++;
38 | }
39 | }
40 | console.log('');
41 | }
42 |
43 | if (expected) {
44 | console.log('Summary:');
45 | console.log('- expected:', expected.length);
46 | console.log('- matches: ', matches);
47 | console.log('- extras: ', extras);
48 | }
49 |
50 | return { expected: (expected || []).length, matches, extras };
51 | };
52 |
53 | describe('AI gather', function () {
54 | this.timeout(500 * 1000);
55 | let partial;
56 | let partials = {};
57 | let combined = { expected: 0, matches: 0, extras: 0 };
58 |
59 | afterEach(function() {
60 | console.log('partial >', partial, this.currentTest.title);
61 | partials[this.currentTest.title] = partial;
62 | combined.expected += partial.expected;
63 | combined.matches += partial.matches;
64 | combined.extras += partial.extras;
65 | });
66 |
67 | after(function() {
68 | console.log('');
69 | console.log('====');
70 | for (const k of Object.keys(partials)) {
71 | console.log('partial >', partials[k], k);
72 | }
73 | console.log('');
74 | console.log('combined >', combined);
75 | });
76 |
77 | it('should gather wayfair dressers', async () => {
78 | partial = await testFile(
79 | 'test/data/wayfair.1.gather.json',
80 | 'links to dressers, ONLY links to product pages that are dressers and nothing else'
81 | );
82 | });
83 |
84 | it('should gather reddit mma', async () => {
85 | partial = await testFile(
86 | 'test/data/reddit.1.gather.json',
87 | 'article links. do NOT include links on reddit.com'
88 | );
89 | });
90 |
91 | it('should gather reddit profiles', async () => {
92 | partial = await testFile(
93 | 'test/data/reddit.1.gather.json',
94 | 'reddit user profile links',
95 | [
96 | 'https://www.reddit.com/user/AbrahamRinkin',
97 | 'https://www.reddit.com/user/AliBagovBeatKhabib',
98 | 'https://www.reddit.com/user/AutoModerator',
99 | 'https://www.reddit.com/user/Blacker_Jesus',
100 | 'https://www.reddit.com/user/BotnetUser',
101 | 'https://www.reddit.com/user/BustaTron',
102 | 'https://www.reddit.com/user/Designer-Stage1825',
103 | 'https://www.reddit.com/user/GorillaOnChest',
104 | 'https://www.reddit.com/user/Hawkeye76',
105 | 'https://www.reddit.com/user/LatterTarget7',
106 | 'https://www.reddit.com/user/Loganbaker2147',
107 | 'https://www.reddit.com/user/MarbledNightmare',
108 | 'https://www.reddit.com/user/MoustacheLightning',
109 | 'https://www.reddit.com/user/OchoMuerte-XL',
110 | 'https://www.reddit.com/user/SegaCKY',
111 | 'https://www.reddit.com/user/The_Majestic_Banana',
112 | 'https://www.reddit.com/user/TradeBrockNelson',
113 | 'https://www.reddit.com/user/XniklasX',
114 | 'https://www.reddit.com/user/YeahBishMagnets',
115 | 'https://www.reddit.com/user/Yodsanan',
116 | 'https://www.reddit.com/user/ayushc3po',
117 | 'https://www.reddit.com/user/buzznights',
118 | 'https://www.reddit.com/user/epicfishboy',
119 | 'https://www.reddit.com/user/fightsgoneby',
120 | 'https://www.reddit.com/user/random_sTp',
121 | 'https://www.reddit.com/user/realest-dawg',
122 | 'https://www.reddit.com/user/riga345',
123 | 'https://www.reddit.com/user/rmma',
124 | 'https://www.reddit.com/user/synapticrelease',
125 | 'https://www.reddit.com/user/textorix',
126 | 'https://www.reddit.com/user/thiswasnotyettaken',
127 | 'https://www.reddit.com/user/toldyouanditoldyou',
128 | ]
129 | );
130 | });
131 |
132 | it('should gather yc hackernews comment pages', async () => {
133 | partial = await testFile(
134 | 'test/data/hackernews.1.gather.json',
135 | 'links to comment pages, make sure it is the comment pages, NOT the article pages',
136 | [
137 | 'https://news.ycombinator.com/item?id=40716154',
138 | 'https://news.ycombinator.com/item?id=40723024',
139 | 'https://news.ycombinator.com/item?id=40725924',
140 | 'https://news.ycombinator.com/item?id=40725970',
141 | 'https://news.ycombinator.com/item?id=40726497',
142 | 'https://news.ycombinator.com/item?id=40727252',
143 | 'https://news.ycombinator.com/item?id=40733705',
144 | 'https://news.ycombinator.com/item?id=40735743',
145 | 'https://news.ycombinator.com/item?id=40736577',
146 | 'https://news.ycombinator.com/item?id=40736771',
147 | 'https://news.ycombinator.com/item?id=40737294',
148 | 'https://news.ycombinator.com/item?id=40737370',
149 | 'https://news.ycombinator.com/item?id=40738833',
150 | 'https://news.ycombinator.com/item?id=40739384',
151 | 'https://news.ycombinator.com/item?id=40739710',
152 | 'https://news.ycombinator.com/item?id=40739982',
153 | 'https://news.ycombinator.com/item?id=40740021',
154 | 'https://news.ycombinator.com/item?id=40740237',
155 | 'https://news.ycombinator.com/item?id=40740581',
156 | 'https://news.ycombinator.com/item?id=40741072',
157 | 'https://news.ycombinator.com/item?id=40741197',
158 | 'https://news.ycombinator.com/item?id=40741672',
159 | 'https://news.ycombinator.com/item?id=40742014',
160 | 'https://news.ycombinator.com/item?id=40742026',
161 | 'https://news.ycombinator.com/item?id=40742163',
162 | 'https://news.ycombinator.com/item?id=40742764',
163 | 'https://news.ycombinator.com/item?id=40743308',
164 | 'https://news.ycombinator.com/item?id=40743531',
165 | 'https://news.ycombinator.com/item?id=40743975',
166 | 'https://news.ycombinator.com/item?id=40744162',
167 | ]
168 | );
169 | });
170 |
171 | it('should gather yc hackernews article links', async () => {
172 | partial = await testFile(
173 | 'test/data/hackernews.1.gather.json',
174 | 'links to articles',
175 | [
176 | 'https://beyondloom.com/blog/dither.html',
177 | 'https://blog.jgc.org/2024/06/two-ways-to-use-led-as-light-sensor.html',
178 | 'https://calculatingempires.net/',
179 | 'https://doi.org/10.1016/j.jasrep.2024.104636',
180 | 'https://downdetector.com/status/docker/',
181 | 'https://duckdb.org/2024/06/20/cli-data-processing-using-duckdb-as-a-unix-tool.html',
182 | 'https://erikdemaine.org/fonts/tetris/',
183 | 'https://gaultier.github.io/blog/write_a_video_game_from_scratch_like_1987.html',
184 | 'https://github.com/madprops/curls',
185 | 'https://github.com/robertdavidgraham/wc2',
186 | 'https://jprx.io/cve-2024-27815/',
187 | 'https://kucharski.substack.com/p/the-shape-of-information',
188 | 'https://lwn.net/SubscriberLink/978463/608c876c1153fd31/',
189 | 'https://news.alvaroduran.com/p/the-prototypes-language',
190 | 'https://osrd.fr/en/',
191 | 'https://science.nasa.gov/missions/hubble/nasa-releases-hubble-image-taken-in-new-pointing-mode/',
192 | 'https://stackdiary.com/eu-council-has-withdrawn-the-vote-on-chat-control/',
193 | 'https://taxfoundation.org/research/all/federal/501c3-nonprofit-organization-tax-exempt/',
194 | 'https://www.bbc.com/news/articles/c9rrvdq3g9zo',
195 | 'https://www.bloomberg.com/news/articles/2024-06-20/gilead-shot-prevents-100-of-hiv-cases-in-trial-of-african-women',
196 | 'https://www.bloomberg.com/news/articles/2024-06-20/remote-work-helps-more-people-with-disabilities-get-employed',
197 | 'https://www.engadget.com/how-small-claims-court-became-metas-customer-service-hotline-160224479.html',
198 | 'https://www.fuzzmap.io/',
199 | 'https://www.governor.ny.gov/news/governor-hochul-joins-attorney-general-james-and-bill-sponsors-sign-nation-leading-legislation',
200 | 'https://www.jerpint.io/blog/diffusion-gol/',
201 | 'https://www.octomind.dev/blog/why-we-no-longer-use-langchain-for-building-our-ai-agents',
202 | 'https://www.rahulilango.com/coloring/',
203 | 'https://www.ycombinator.com/companies/promoted/jobs/5moymju-sales-engineer-new-grad',
204 | 'https://www.zdnet.com/article/suse-upgrades-its-distros-with-19-years-of-support-no-other-linux-comes-close/',
205 | ]
206 | );
207 | });
208 |
209 | it('should gather amazon soap', async () => {
210 | partial = await testFile(
211 | 'test/data/amazonsoap.1.gather.json',
212 | 'find all the links to soap product pages, and ONLY product pages, not pages for general items like best sellers',
213 | expectedAmazonSoap,
214 | );
215 | });
216 |
217 | it('should gather linkedin profiles', async () => {
218 | partial = await testFile(
219 | 'test/data/linkedin.1.gather.json',
220 | 'find links to profile pages, typically these will be names of people',
221 | expectedLinkedIn,
222 | );
223 | });
224 |
225 | it('should gather zillow listings', async () => {
226 | partial = await testFile(
227 | 'test/data/zillow.1.gather.json',
228 |       'find links to property listings. only listings to individual properties with an address, not general links',
229 | [
230 | 'https://www.zillow.com/homedetails/239-Cypress-Point-Dr-Mountain-View-CA-94043/19516531_zpid/',
231 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-183-Mountain-View-CA-94043/19516343_zpid/',
232 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-164-Mountain-View-CA-94043/19516324_zpid/',
233 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-44-Mountain-View-CA-94043/19516204_zpid/',
234 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-202-Mountain-View-CA-94043/19516362_zpid/',
235 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-155-Mountain-View-CA-94043/19516315_zpid/',
236 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-292-Mountain-View-CA-94043/19516451_zpid/',
237 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-47-Mountain-View-CA-94043/19516207_zpid/',
238 | 'https://www.zillow.com/homedetails/505-Cypress-Point-Dr-UNIT-8-Mountain-View-CA-94043/19516168_zpid/',
239 | ],
240 | );
241 | });
242 |
243 |   it('should gather redfin listings', async () => {
244 | partial = await testFile(
245 | 'test/data/redfin.1.gather.json',
246 |       'find links to property listings. only listings to individual properties with an address, not general links',
247 | expectedRedfin,
248 | );
249 | });
250 |
251 |   it('should gather etsy listings', async () => {
252 | partial = await testFile(
253 | 'test/data/etsy.1.gather.json',
254 | 'find links to product pages',
255 | expectedEtsy,
256 | );
257 | });
258 |
259 |
260 |   it('should gather ebay listings', async () => {
261 | partial = await testFile(
262 | 'test/data/ebay.1.gather.json',
263 |       'find links to product/item pages. only find links to specific items, not general links or general pages',
264 | expectedEbay,
265 | );
266 | });
267 |
268 | });
269 |
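These gather cases are scored rather than asserted: each prints per-case matches/extras and the after hook prints a combined tally, so regressions surface as score drift, not failures. If hard failures were wanted, a recall floor could be checked with the assert already imported at the top of this file. A hypothetical sketch; the 0.8 threshold is illustrative:

    // Hypothetical helper, e.g. called from afterEach with `partial`.
    import assert from 'assert';

    const assertRecall = ({ expected, matches }, floor = 0.8) => {
      if (!expected) return; // score-only cases have no expected list
      const recall = matches / expected;
      assert.ok(recall >= floor, `recall ${recall.toFixed(2)} below ${floor}`);
    };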
--------------------------------------------------------------------------------
/test/testAiScrape.mjs:
--------------------------------------------------------------------------------
1 | import assert from 'assert';
2 | import { promises as fs } from 'fs';
3 | import path from 'path';
4 |
5 | import { scrapePage } from '../src/lib/scrape.mjs';
6 |
7 | const testFile = async (filenames, questions, expecteds) => {
8 | let total = 0;
9 | let correct = 0;
10 | for (let i = 0; i < filenames.length; i++) {
11 | const page = JSON.parse(await fs.readFile(filenames[i], 'utf8'));
12 | const result = await scrapePage(page, questions, () => {});
13 | for (let j = 0; j < questions.length; j++) {
14 | const expected = expecteds[i][j];
15 | const actual = result[questions[j]];
16 | total++;
17 | if (expected == actual) {
18 | correct++;
19 | }
20 | }
21 | }
22 |
23 | console.log('total: ', total);
24 | console.log('correct:', correct);
25 | }
26 |
27 |
28 | describe('AI scrape', function () {
29 | this.timeout(500 * 1000);
30 |
31 | it('should scrape YC comments', async () => {
32 | await testFile(
33 | [
34 | 'test/data/hackernews-comments.1.scrape.json',
35 | 'test/data/hackernews-comments.2.scrape.json',
36 | 'test/data/hackernews-comments.3.scrape.json',
37 | ],
38 | [
39 | 'find the top comment. what is the timestamp of this top comment?',
40 | 'what is the username of the author of the top comment?',
41 | ],
42 | [
43 | ['16 hours ago', 'jmcgough'],
44 | ['1 minute ago', 'idontknowtech'],
45 | ['6 minutes ago', 'perihelions'],
46 | ]
47 | );
48 | });
49 | });
50 |
51 |
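As with the gather suite, this one reports a tally without asserting on it, and testFile currently returns nothing. A hypothetical tightening, assuming testFile is changed to `return { total, correct };` at the end:

    // Hypothetical; `assert` is already imported at the top of this file.
    import assert from 'assert';

    const expectAllCorrect = ({ total, correct }) => {
      assert.strictEqual(correct, total,
        `${total - correct} of ${total} answers diverged`);
    };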
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es5",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": false,
6 | "skipLibCheck": true,
7 | "esModuleInterop": true,
8 | "allowSyntheticDefaultImports": true,
9 | "strict": true,
10 | "forceConsistentCasingInFileNames": true,
11 | "noFallthroughCasesInSwitch": true,
12 | "module": "esnext",
13 | "moduleResolution": "node",
14 | "resolveJsonModule": true,
15 | "noEmit": false,
16 | "jsx": "react"
17 | },
18 | "include": ["src"],
19 | "exclude": ["build", "node_modules"]
20 | }
21 |
--------------------------------------------------------------------------------
/utils/build.js:
--------------------------------------------------------------------------------
1 | // Do this as the first thing so that any code reading it knows the right env.
2 | require('dotenv').config();
3 |
4 | process.env.BABEL_ENV = process.env.NODE_ENV || 'production';
5 | process.env.NODE_ENV = process.env.NODE_ENV || 'production';
6 | process.env.ASSET_PATH = '/';
7 |
8 | if (process.env.NODE_ENV == 'dev') {
9 | require('dotenv').config({ path: `.env.dev` });
10 | } else {
11 | require('dotenv').config({ path: `.env.production` });
12 | }
13 |
14 |
15 | var webpack = require('webpack'),
16 | path = require('path'),
17 | fs = require('fs'),
18 | config = require('../webpack.config'),
19 | ZipPlugin = require('zip-webpack-plugin');
20 |
21 | delete config.chromeExtensionBoilerplate;
22 |
23 | config.mode = 'production';
24 |
25 | var packageInfo = JSON.parse(fs.readFileSync('package.json', 'utf-8'));
26 |
27 | config.plugins = (config.plugins || []).concat(
28 | new ZipPlugin({
29 | filename: `${packageInfo.name}-${packageInfo.version}.zip`,
30 | path: path.join(__dirname, '../', 'zip'),
31 | })
32 | );
33 |
34 | webpack(config, function (err) {
35 | if (err) throw err;
36 | });
37 |
--------------------------------------------------------------------------------
/utils/env.js:
--------------------------------------------------------------------------------
1 | // tiny wrapper with default env vars
2 | module.exports = {
3 | NODE_ENV: process.env.NODE_ENV || 'development',
4 | PORT: process.env.PORT || 3000,
5 | };
6 |
--------------------------------------------------------------------------------
/utils/webserver.js:
--------------------------------------------------------------------------------
1 | // Do this as the first thing so that any code reading it knows the right env.
2 | process.env.BABEL_ENV = 'development';
3 | process.env.NODE_ENV = 'development';
4 | process.env.ASSET_PATH = '/';
5 |
6 | var WebpackDevServer = require('webpack-dev-server'),
7 | webpack = require('webpack'),
8 | config = require('../webpack.config'),
9 | env = require('./env'),
10 | path = require('path');
11 |
12 | var options = config.chromeExtensionBoilerplate || {};
13 | var excludeEntriesToHotReload = options.notHotReload || [];
14 |
15 | for (var entryName in config.entry) {
16 | if (excludeEntriesToHotReload.indexOf(entryName) === -1) {
17 | config.entry[entryName] = [
18 | 'webpack/hot/dev-server',
19 | `webpack-dev-server/client?hot=true&hostname=localhost&port=${env.PORT}`,
20 | ].concat(config.entry[entryName]);
21 | }
22 | }
23 |
24 | delete config.chromeExtensionBoilerplate;
25 |
26 | var compiler = webpack(config);
27 |
28 | var server = new WebpackDevServer(
29 | {
30 | https: false,
31 | hot: true,
32 | liveReload: false,
33 | client: {
34 | webSocketTransport: 'sockjs',
35 | },
36 | webSocketServer: 'sockjs',
37 | host: 'localhost',
38 | port: env.PORT,
39 | static: {
40 | directory: path.join(__dirname, '../build'),
41 | },
42 | devMiddleware: {
43 | publicPath: `http://localhost:${env.PORT}/`,
44 | writeToDisk: true,
45 | },
46 | headers: {
47 | 'Access-Control-Allow-Origin': '*',
48 | },
49 | allowedHosts: 'all',
50 | },
51 | compiler
52 | );
53 |
54 | (async () => {
55 | await server.start();
56 | })();
57 |
--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
1 | var webpack = require('webpack'),
2 | path = require('path'),
3 | fileSystem = require('fs-extra'),
4 | env = require('./utils/env'),
5 | CopyWebpackPlugin = require('copy-webpack-plugin'),
6 | HtmlWebpackPlugin = require('html-webpack-plugin'),
7 | TerserPlugin = require('terser-webpack-plugin');
8 | var { CleanWebpackPlugin } = require('clean-webpack-plugin');
9 | var ReactRefreshWebpackPlugin = require('@pmmmwh/react-refresh-webpack-plugin');
10 | var ReactRefreshTypeScript = require('react-refresh-typescript');
11 | var { sentryWebpackPlugin } = require("@sentry/webpack-plugin");
12 |
13 | const ASSET_PATH = process.env.ASSET_PATH || '/';
14 |
15 | var alias = {};
16 |
17 | // load the secrets
18 | var secretsPath = path.join(__dirname, 'secrets.' + env.NODE_ENV + '.js');
19 |
20 | var fileExtensions = [
21 | 'jpg',
22 | 'jpeg',
23 | 'png',
24 | 'gif',
25 | 'eot',
26 | 'otf',
27 | 'svg',
28 | 'ttf',
29 | 'woff',
30 | 'woff2',
31 | ];
32 |
33 | if (fileSystem.existsSync(secretsPath)) {
34 | alias['secrets'] = secretsPath;
35 | }
36 |
37 | const isDevelopment = process.env.NODE_ENV !== 'production';
38 |
39 | var options = {
40 | mode: process.env.NODE_ENV || 'development',
41 | entry: {
42 | newtab: path.join(__dirname, 'src', 'pages', 'Newtab', 'index.jsx'),
43 | options: path.join(__dirname, 'src', 'pages', 'Options', 'index.jsx'),
44 | popup: path.join(__dirname, 'src', 'pages', 'Popup', 'index.jsx'),
45 | background: path.join(__dirname, 'src', 'pages', 'Background', 'index.js'),
46 | contentScript: path.join(__dirname, 'src', 'pages', 'Content', 'index.js'),
47 | devtools: path.join(__dirname, 'src', 'pages', 'Devtools', 'index.js'),
48 | panel: path.join(__dirname, 'src', 'pages', 'Panel', 'index.jsx'),
49 | },
50 | chromeExtensionBoilerplate: {
51 | notHotReload: ['background', 'contentScript', 'devtools'],
52 | },
53 | output: {
54 | filename: '[name].bundle.js',
55 | path: path.resolve(__dirname, 'build'),
56 | clean: true,
57 | publicPath: ASSET_PATH,
58 | },
59 | module: {
60 | rules: [
61 | {
62 | // look for .css or .scss files
63 | test: /\.(css|scss)$/,
64 | // in the `src` directory
65 | use: [
66 | {
67 | loader: 'style-loader',
68 | },
69 | {
70 | loader: 'css-loader',
71 | },
72 | {
73 | loader: 'sass-loader',
74 | options: {
75 | sourceMap: true,
76 | },
77 | },
78 | ],
79 | },
80 | {
81 |         test: new RegExp('\\.(' + fileExtensions.join('|') + ')$'),
82 | type: 'asset/resource',
83 | exclude: /node_modules/,
84 | // loader: 'file-loader',
85 | // options: {
86 | // name: '[name].[ext]',
87 | // },
88 | },
89 | {
90 | test: /\.html$/,
91 | loader: 'html-loader',
92 | exclude: /node_modules/,
93 | },
94 | {
95 | test: /\.(ts|tsx)$/,
96 | exclude: /node_modules/,
97 | use: [
98 | {
99 | loader: require.resolve('ts-loader'),
100 | options: {
101 | getCustomTransformers: () => ({
102 | before: [isDevelopment && ReactRefreshTypeScript()].filter(
103 | Boolean
104 | ),
105 | }),
106 | transpileOnly: isDevelopment,
107 | },
108 | },
109 | ],
110 | },
111 | {
112 | test: /\.(js|jsx)$/,
113 | use: [
114 | {
115 | loader: 'source-map-loader',
116 | },
117 | {
118 | loader: require.resolve('babel-loader'),
119 | options: {
120 | plugins: [
121 | isDevelopment && require.resolve('react-refresh/babel'),
122 | ].filter(Boolean),
123 | },
124 | },
125 | ],
126 | exclude: /node_modules/,
127 | },
128 | ],
129 | },
130 | resolve: {
131 | alias: alias,
132 | extensions: fileExtensions
133 | .map((extension) => '.' + extension)
134 | .concat(['.js', '.jsx', '.ts', '.tsx', '.css']),
135 | },
136 | devtool: "source-map", // Source map generation must be turned on
137 | plugins: [
138 | !isDevelopment && sentryWebpackPlugin({
139 | org: "fetchfox",
140 | project: "javascript-react",
141 | authToken: process.env.SENTRY_AUTH_TOKEN,
142 | }),
143 | isDevelopment && new ReactRefreshWebpackPlugin(),
144 | new CleanWebpackPlugin({ verbose: false }),
145 | new webpack.ProgressPlugin(),
146 | // expose and write the allowed env vars on the compiled bundle
147 | new webpack.EnvironmentPlugin(['NODE_ENV']),
148 | new CopyWebpackPlugin({
149 | patterns: [
150 | {
151 | from: 'src/manifest.json',
152 | to: path.join(__dirname, 'build'),
153 | force: true,
154 | transform: function (content, path) {
155 |           // generates the manifest file using the package.json information
156 | return Buffer.from(
157 | JSON.stringify({
158 | description: process.env.npm_package_description,
159 | version: process.env.npm_package_version,
160 | ...JSON.parse(content.toString()),
161 | })
162 | );
163 | },
164 | },
165 | ],
166 | }),
167 | new CopyWebpackPlugin({
168 | patterns: [
169 | {
170 | from: 'src/pages/Content/content.styles.css',
171 | to: path.join(__dirname, 'build'),
172 | force: true,
173 | },
174 | ],
175 | }),
176 | new CopyWebpackPlugin({
177 | patterns: [
178 | {
179 | from: 'src/assets/img/icon-128.png',
180 | to: path.join(__dirname, 'build'),
181 | force: true,
182 | },
183 | ],
184 | }),
185 | new CopyWebpackPlugin({
186 | patterns: [
187 | {
188 | from: 'src/assets/img/icon-34.png',
189 | to: path.join(__dirname, 'build'),
190 | force: true,
191 | },
192 | ],
193 | }),
194 | new HtmlWebpackPlugin({
195 | template: path.join(__dirname, 'src', 'pages', 'Newtab', 'index.html'),
196 | filename: 'newtab.html',
197 | chunks: ['newtab'],
198 | cache: false,
199 | }),
200 | new HtmlWebpackPlugin({
201 | template: path.join(__dirname, 'src', 'pages', 'Options', 'index.html'),
202 | filename: 'options.html',
203 | chunks: ['options'],
204 | cache: false,
205 | }),
206 | new HtmlWebpackPlugin({
207 | template: path.join(__dirname, 'src', 'pages', 'Popup', 'index.html'),
208 | filename: 'popup.html',
209 | chunks: ['popup'],
210 | cache: false,
211 | }),
212 | new HtmlWebpackPlugin({
213 | template: path.join(__dirname, 'src', 'pages', 'Devtools', 'index.html'),
214 | filename: 'devtools.html',
215 | chunks: ['devtools'],
216 | cache: false,
217 | }),
218 | new HtmlWebpackPlugin({
219 | template: path.join(__dirname, 'src', 'pages', 'Panel', 'index.html'),
220 | filename: 'panel.html',
221 | chunks: ['panel'],
222 | cache: false,
223 | }),
224 | ].filter(Boolean),
225 | infrastructureLogging: {
226 | level: 'info',
227 | },
228 | };
229 |
230 | if (env.NODE_ENV === 'development') {
231 | options.devtool = 'cheap-module-source-map';
232 | } else {
233 | options.optimization = {
234 |     minimize: false, // note: the TerserPlugin below only runs if this is set to true
235 | minimizer: [
236 | new TerserPlugin({
237 | extractComments: false,
238 | }),
239 | ],
240 | };
241 | }
242 |
243 | module.exports = options;
244 |
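One subtlety in the manifest copy step above: the checked-in src/manifest.json is spread last, so any field it defines wins over the package.json-derived defaults. Illustrated with made-up values:

    // package.json:      { "description": "pkg desc", "version": "1.2.3" }
    // src/manifest.json: { "name": "FetchFox", "version": "0.0.1" }
    // build/manifest.json ->
    //   { description: "pkg desc", version: "0.0.1", name: "FetchFox" }
    // version comes from the manifest, because its fields are spread last.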
--------------------------------------------------------------------------------