├── .env
├── .gitignore
├── jest.config.js
├── src
├── typings.ts
├── product-overrides.ts
├── utilities.ts
├── urls.txt
├── cosmosdb.ts
└── index.ts
├── package.json
├── tests
└── utilities.test.ts
└── readme.md
/.env:
--------------------------------------------------------------------------------
1 | STORE_NAME=
2 | COSMOS_CONSTRING=
3 | COSMOS_DB_NAME=
4 | COSMOS_CONTAINER=
5 | COSMOS_PARTITION_KEY=
6 | IMAGE_UPLOAD_FUNC_URL=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 |
3 | /dist
4 |
5 | /build
6 |
7 | .env.*
8 |
9 | npm-debug.log*
10 |
11 | package-lock.json
12 |
13 | /.vscode
14 |
15 | /coverage
16 |
17 | salt
18 |
19 | *.bat
20 | *.lnk
21 | *.ps1
--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('ts-jest').JestConfigWithTsJest} */
2 | module.exports = {
3 | preset: 'ts-jest',
4 | testEnvironment: 'node',
5 | transform: {
6 | '^.+\\.(ts|tsx)?$': ['ts-jest', { diagnostics: { ignoreCodes: ['TS151001'] } }],
7 | "^.+\\.(js|jsx)$": "babel-jest",
8 | },
9 | };
--------------------------------------------------------------------------------
/src/typings.ts:
--------------------------------------------------------------------------------
/**
 * A single supermarket product as produced by the scraper.
 * cosmosdb.ts also reads and writes CosmosDB items using this shape.
 */
export interface Product {
  /** Product id; passed as the CosmosDB item id in cosmosdb.ts. */
  id: string;
  /** Product name; cosmosdb.ts passes it as the partition key value when reading an item. */
  name: string;
  /** Free-text size such as '250ml'. Optional: product-overrides.ts notes some products list no size. */
  size?: string;
  /** Current price; in the readme sample it equals the latest priceHistory entry. */
  currentPrice: number;
  // NOTE(review): presumably lastUpdated is when stored data last changed and
  // lastChecked is when the product was last scraped - confirm in buildUpdatedProduct().
  lastUpdated: Date;
  lastChecked: Date;
  /** Dated price points; the readme describes this as the history of price changes per product. */
  priceHistory: DatedPrice[];
  /** Site the product came from, e.g. 'countdown.co.nz' in the unit test fixture. */
  sourceSite: string;
  /** Category names; presumably values from validCategories in utilities.ts - TODO confirm. */
  category: string[];
  // The unit test expects addUnitPriceToProduct() to derive these three fields,
  // e.g. a $4 '250ml' product gives unitPrice 16, unitName 'L', originalUnitQuantity 250.
  unitPrice?: number;
  unitName?: string;
  originalUnitQuantity?: number;
}

/** One entry of Product.priceHistory: a price and the date it applies to. */
export interface DatedPrice {
  date: Date;
  price: number;
}

/**
 * Pairs a product with the kind of change that was detected.
 * cosmosdb.ts reads `upsertType` and `product` from the value returned by buildUpdatedProduct().
 */
export interface ProductResponse {
  upsertType: UpsertResponse;
  product: Product;
}

/** A URL together with category names; presumably parsed from a urls.txt line - TODO confirm. */
export interface CategorisedUrl {
  url: string;
  categories: string[];
}

/**
 * Outcome reported by upsertProductToCosmosDB() in cosmosdb.ts.
 * Members take the implicit numeric values 0-4 in declaration order.
 */
export const enum UpsertResponse {
  /** Returned after a product is created because no item with its id was found. */
  NewProduct,
  // PriceChanged, InfoChanged and AlreadyUpToDate are presumably chosen by
  // buildUpdatedProduct(), which is not fully visible here - TODO confirm.
  PriceChanged,
  InfoChanged,
  AlreadyUpToDate,
  /** Returned on a 409 or unexpected status code, or when the upsert throws. */
  Failed,
}
38 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "cd-scraper",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index",
6 | "scripts": {
7 | "test": "npx jest",
8 | "dev": "npx esrun src/index.ts",
9 | "db": "npx esrun src/index.ts db",
10 | "db images": "npx esrun src/index.ts db images"
11 | },
12 | "author": "",
13 | "license": "ISC",
14 | "dependencies": {
15 | "@azure/cosmos": "latest",
16 | "cheerio": "^1.0.0-rc.12",
17 | "dotenv": "latest",
18 | "lodash": "^4.17.21",
19 | "playwright": "^1.44.0"
20 | },
21 | "devDependencies": {
22 | "@types/jest": "^29.5.12",
23 | "@types/lodash": "^4.17.4",
24 | "@types/node": "^20.12.12",
25 | "esrun": "^3.2.26",
26 | "jest": "^29.7.0",
27 | "ts-jest": "^29.1.2",
28 | "typescript": "^5.4.5"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/tests/utilities.test.ts:
--------------------------------------------------------------------------------
1 | import 'jest';
2 | // jest.useFakeTimers();
3 | import * as cheerio from 'cheerio';
4 | import { playwrightElementToProduct } from '../src/index';
5 | import { CategorisedUrl, Product } from '../src/typings';
6 | import { addUnitPriceToProduct } from '../src/utilities';
7 |
8 | // Sample input
9 | const html = `
10 |
12 |
Large
13 | `;
14 |
15 | // Sample product
16 | const juiceProduct: Product = {
17 | id: '12345',
18 | name: 'Orange Juice',
19 | size: '250ml',
20 | currentPrice: 4,
21 | lastUpdated: new Date('01-20-2023'),
22 | lastChecked: new Date('01-20-2023'),
23 | priceHistory: [],
24 | sourceSite: 'countdown.co.nz',
25 | category: ['juice'],
26 | };
27 |
28 | const $ = cheerio.load(html);
29 | const productEntries = $('cdx-card a.product-entry');
30 |
31 | describe('scraping', () => {
32 | // it('extract normal product titles', async () => {
33 | // const result = playwrightElementToProduct(productEntries[0], ['test']);
34 | // expect(result!.name).toBe('yes');
35 | // });
36 |
37 | it('per unit price is derived from quantity and size', async () => {
38 | const result = addUnitPriceToProduct(juiceProduct);
39 | expect(result.unitName).toBe('L');
40 | expect(result.unitPrice).toBe(16);
41 | expect(result.originalUnitQuantity).toBe(250);
42 | });
43 | });
44 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Woolworths Scraper
2 |
3 | This project scrapes product info from the Woolworths NZ (formerly Countdown) website and optionally places the data into Azure CosmosDB.
4 |
5 | A history of price changes is stored within each product's database entry.
6 |
7 | Image files can also be scraped and placed into an Azure Storage Blob Container.
8 |
9 | ## Basic Setup
10 |
11 | With `NodeJS` installed, clone this repository, then run `npm install` to install dependencies.
12 |
13 | Playwright must also be installed when running for the first time with `npx playwright install`.
14 |
15 | The program can now be tested in dry run mode without any further setup using `npm run dev`.
16 |
17 | ## Optional Setup
18 |
19 | The `.env` file has variables that can be filled for more functionality.
20 |
21 | ```js
22 | STORE_NAME= Optional supermarket location name
23 | COSMOS_CONSTRING= Read-write CosmosDB connection string
24 | COSMOS_DB_NAME= CosmosDB Name
25 | COSMOS_CONTAINER= CosmosDB Container Name, eg. products
26 | COSMOS_PARTITION_KEY= CosmosDB Partition Key, eg. /name
27 | IMAGE_UPLOAD_FUNC_URL= Optional image upload REST API URL
28 | ```
29 |
30 | - The CosmosDB read-write connection string can be obtained from the `Azure Portal > CosmosDB > Settings > Keys`.
31 | - A list of URLs to scrape can be put in file `urls.txt`, with one url per line.
32 |
33 | ## Usage
34 |
35 | `npm run dev` - will use dry-run mode. No Azure connection is required and the results are logged to the console.
36 |
37 | `npm run db` - will scrape through the URLs and store the results into CosmosDB.
38 |
39 | `npm run db https://sampleurl` - a single url can be used as an argument. This will be scraped instead of the URLs text file.
40 |
41 | ## Other Command-Line Arguments
42 |
43 | `images` - will also upload images.
44 |
45 | `headed` - will run the browser in a visible window instead of headless mode.
46 |
47 | ## Output
48 |
49 | Sample log output when running in dry run mode:
50 |
51 | ```cmd
52 | ID | Name | Size | Price | Unit Price
53 | ----------------------------------------------------------------------------------
54 | 762844 | Ocean Blue Smoked Salmon Slices | 100g | $ 9 | $90 /kg
55 | 697201 | Clearly Premium Smoked Salmon | 200g | $ 13.5 | $67.5 /kg
56 | 830035 | Ocean Blue Smoked Salmon Slices | 180g | $ 12 | $67.7 /kg
57 | ```
58 |
59 | This is a sample of a single product stored in CosmosDB. It was re-run at multiple dates to store changing prices:
60 |
61 | ```json
62 | {
63 | "id": "123456",
64 | "name": "Sausages Precooked Chinese Honey",
65 | "currentPrice": 12.9,
66 | "size": "Prepacked 1kg pack",
67 | "priceHistory": [
68 | {
69 | "date": "Sat Jan 14 2023",
70 | "price": 10
71 | },
72 | {
73 | "date": "Thu Jan 26 2023",
74 | "price": 12.9
75 | }
76 | ]
77 | }
78 | ```
79 |
--------------------------------------------------------------------------------
/src/product-overrides.ts:
--------------------------------------------------------------------------------
1 | // This file is for manually overriding product size and category data.
2 | // It is used for products that have no listed size to scrape, or that have incorrect categories.
3 |
// Each entry is keyed by product id and supplies a replacement size and/or category.
// Ids must be plain digit strings: an id with stray punctuation (e.g. a trailing '.')
// can never equal a numeric product id, so such an entry would silently do nothing.
export const productOverrides = [
  { id: '206889', size: '180g' },
  { id: '196996', size: '300g' },
  { id: '137967', size: '420g' },
  { id: '125856', size: '450g' },
  { id: '189268', size: '1.13kg' },
  { id: '189150', size: '1.2kg' },
  { id: '190454', size: '2.1kg' },
  { id: '189078', size: '1.3kg' },
  { id: '189136', size: '1.2kg' },
  { id: '755237', size: '931g' },
  { id: '755304', size: '1.1kg' },
  { id: '755246', size: '1020g' },
  { id: '755245', size: '1.2kg' },
  { id: '112273', size: '865ml' },
  { id: '269514', size: '584ml' },
  { id: '269515', size: '584ml' },
  { id: '116518', size: '440ml' },
  { id: '151191', size: '570ml' },
  { id: '279904', size: '575ml' },
  { id: '146149', size: '1000ml' },
  { id: '791925', size: '525g' },
  { id: '774216', size: '525g' },
  { id: '784406', size: '525g' },
  { id: '791916', size: '525g' },
  { id: '306624', size: '185g' },
  { id: '156824', size: '180g' },
  { id: '9023', size: '375g' },
  { id: '266962', category: 'sweets-lollies' },
  { id: '171524', size: '230ml', category: 'baking' },
  { id: '170021', category: 'ice-blocks' },
  { id: '71164', category: 'sausages' },
  { id: '71174', category: 'sausages' },
  { id: '71168', category: 'sausages' },
  { id: '71165', category: 'sausages' },
  { id: '331560', category: 'specialty-bread' },
  { id: '679412', category: 'herbal-tea' },
  { id: '267492', category: 'herbal-tea' },
  { id: '267485', category: 'herbal-tea' },
  { id: '413302', category: 'herbal-tea' },
  { id: '267488', category: 'herbal-tea' },
  { id: '760872', category: 'herbal-tea' },
  { id: '681177', category: 'herbal-tea' },
  { id: '95091', category: 'herbal-tea' },
  { id: '761093', category: 'black-tea' },
  { id: '721661', category: 'green-tea' },
  { id: '790129', category: 'herbal-tea' },
  { id: '721034', category: 'herbal-tea' },
  { id: '184090', category: 'herbal-tea' },
  { id: '690093', category: 'green-tea' },
  { id: '780922', category: 'sauces' },
  { id: '780921', category: 'sauces' },
  { id: '72618', category: 'black-tea' },
  { id: '6053', category: 'black-tea' },
  { id: '72617', category: 'black-tea' },
  { id: '168068', category: 'black-tea' },
  { id: '6052', category: 'black-tea' },
  { id: '761436', category: 'black-tea' },
  { id: '14133', size: '390g' },
  { id: '970886', category: 'bakery-desserts' },
  { id: '775131', category: 'sweets-lollies' },
  { id: '69336', category: 'sweets-lollies' },
  { id: '790537', category: 'sweets-lollies' },
  { id: '746240', category: 'sausages' },
  { id: '351192', category: 'sausages' },
  { id: '246719', category: 'sausages' },
  { id: '282184', category: 'sausages' },
  { id: '746273', category: 'sausages' },
  { id: '282181', category: 'sausages' },
  { id: '905772', category: 'sausages' },
  { id: '905792', category: 'sausages' },
  { id: '905773', category: 'sausages' },
  { id: '905764', category: 'sausages' },
  { id: '290802', category: 'sausages' },
  { id: '563985', category: 'sausages' },
  { id: '757239', category: 'sausages' },
  { id: '761522', category: 'sausages' },
  { id: '290811', category: 'sausages' },
  { id: '290796', category: 'sausages' },
  { id: '290735', category: 'sausages' },
  { id: '761618', category: 'sausages' },
  { id: '290815', category: 'sausages' },
  { id: '681002', size: '480g' },
  { id: '681001', size: '552g' },
  { id: '681005', size: '552g' },
  { id: '285555', size: '360g' },
  { id: '822190', size: '350g' },
  { id: '680890', size: '315g' },
  { id: '681003', size: '315g' },
  { id: '822078', size: '300g' },
  { id: '822079', size: '350g' },
  { id: '391911', size: '325g' },
  { id: '917977', category: 'sauces' },
  { id: '918032', category: 'sauces' },
  { id: '279219', category: 'pizza' },
  { id: '279114', category: 'pizza' },
  { id: '279218', category: 'pizza' },
  { id: '324079', category: 'pizza' },
  { id: '279059', category: 'pizza' },
  { id: '782359', category: 'pizza' },
  { id: '63801', category: 'pizza' },
  { id: '6045946', category: 'patties-meatballs' },
  { id: '70941', category: 'sausages' },
  { id: '69865', category: 'sausages' },
  { id: '116285', category: 'sausages' },
  { id: '319272', category: 'sausages' },
];
114 |
--------------------------------------------------------------------------------
/src/utilities.ts:
--------------------------------------------------------------------------------
1 | import { Product } from './typings';
2 | import { readFileSync } from 'fs';
3 |
4 | // Set widths for table log output
5 | const tableIDWidth = 7
6 | const tableNameWidth = 60;
7 | const tableSizeWidth = 17;
8 |
// ANSI SGR escape sequences used to colour console output.
// Two-digit codes pick a basic foreground colour; the 38;5;n form picks
// entry n of the 256-colour palette.
export const colour = {
  red: '\x1b[31m',
  green: '\x1b[32m',
  yellow: '\x1b[33m',
  blue: '\x1b[38;5;117m',
  magenta: '\x1b[35m',
  cyan: '\x1b[36m',
  white: '\x1b[37m',
  // A bare '\x1b[38m' is an incomplete sequence (38 must be followed by 5;n or
  // 2;r;g;b), so a full 256-colour code close to crimson is used instead.
  crimson: '\x1b[38;5;161m',
  grey: '\x1b[90m',
  orange: '\x1b[38;5;214m',
  sky: '\x1b[38;5;153m',
};
22 |
// log()
// -----
// Prints text to the console wrapped in the given colour escape code,
// then resets the terminal colour so later output is unaffected.

export function log(colour: string, text: string) {
  const resetCode = '\x1b[0m';
  // %s keeps the text as a separate console.log argument
  const format = colour + '%s' + resetCode;
  console.log(format, text);
}
31 |
// logError()
// ----------
// Convenience wrapper around log() that always prints in red.

export function logError(text: string) {
  const { red } = colour;
  log(red, text);
}
39 |
// logProductRow()
// ---------------
// Log a single product in one row, using alternating colours for readability.
// Columns are ID, name, size, price and (when available) unit price.
// A missing size is printed as a blank, padded column so rows stay aligned.

export function logProductRow(product: Product) {
  // Unit price column is left empty when no unit price was derived
  const unitPriceString = product.unitPrice ? `$${product.unitPrice.toFixed(2)} /${product.unitName}` : ``;

  // size is optional: fall back to '' so the column is padded rather than
  // interpolating the literal text "undefined"
  const sizeString = (product.size ?? '').slice(0, tableSizeWidth).padEnd(tableSizeWidth);

  log(
    getAlternatingRowColour(colour.sky, colour.white),
    `${product.id.padStart(tableIDWidth)} | ` +
      `${product.name.slice(0, tableNameWidth).padEnd(tableNameWidth)} | ` +
      `${sizeString} | ` +
      `$ ${product.currentPrice.toFixed(2).padStart(4).padEnd(5)} | ` +
      unitPriceString
  );
}
55 |
// logTableHeader()
// ----------------
// Log the column titles for the product table, followed by a dashed rule.
// The rule is as long as the title line, so it stays in step with the
// column width constants.

export function logTableHeader() {
  const headerText =
    `${'ID'.padStart(tableIDWidth)} | ${'Name'.padEnd(tableNameWidth)} | ` +
    `${'Size'.padEnd(tableSizeWidth)} | ` +
    `${'Price'.padEnd(7)} | Unit Price`;

  log(colour.yellow, headerText);
  log(colour.yellow, '-'.repeat(headerText.length));
}
74 |
// getAlternatingRowColour()
// -------------------------
// Returns colourA and colourB in turn, switching on every call.
// The first call returns colourA. Used to stripe table rows.

let alternatingRowColour = false;
function getAlternatingRowColour(colourA: string, colourB: string) {
  // Flip the module-level toggle, then pick the colour for the new state
  alternatingRowColour = !alternatingRowColour;
  if (alternatingRowColour) {
    return colourA;
  }
  return colourB;
}
85 |
// readLinesFromTextFile()
// -----------------------
// Read a local UTF-8 text file and return its non-blank lines as a string array.
// Both \n and \r\n line endings are handled. Lines are returned untrimmed, and
// no other filtering is done, so comment lines such as '# ...' are included.
// Throws an Error naming the file (and the underlying reason) if it cannot be read.

export function readLinesFromTextFile(filename: string): string[] {
  let fileContents: string;
  try {
    fileContents = readFileSync(filename, 'utf-8');
  } catch (error) {
    // Throw a real Error (not a bare string) and keep the underlying reason
    const reason = error instanceof Error ? ': ' + error.message : '';
    throw new Error('Error reading ' + filename + reason);
  }

  return fileContents.split(/\r?\n/).filter((line) => line.trim().length > 0);
}
102 |
// getTimeElapsedSince()
// ---------------------
// Formats the time elapsed between startTime (epoch milliseconds) and now.
// Under a minute the result looks like "58s", otherwise like "12:32" (minutes:seconds).

export function getTimeElapsedSince(startTime: number): string {
  const totalSeconds = (Date.now() - startTime) / 1000;
  const isAtLeastOneMinute = totalSeconds >= 60;

  // Short durations: whole seconds with an "s" suffix
  if (!isAtLeastOneMinute) {
    return `${Math.floor(totalSeconds)}s`;
  }

  // Longer durations: minutes, then zero-padded seconds
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = Math.floor(totalSeconds % 60).toString().padStart(2, '0');
  return `${minutes}:${seconds}`;
}
123 |
// Valid category names that scraped products should be put in.
// Written as one group per store section, then flattened into a single
// list that keeps the groups in the order shown.
export const validCategories: string[] = [
  // freshCategory
  [
    'eggs', 'fruit', 'fresh-vegetables', 'salads-coleslaw', 'bread',
    'bread-rolls', 'specialty-bread', 'bakery-cakes', 'bakery-desserts',
  ],
  // chilledCategory
  [
    'milk', 'long-life-milk', 'sour-cream', 'cream', 'yoghurt',
    'butter', 'cheese', 'cheese-slices', 'salami', 'other-deli-foods',
  ],
  // meatCategory
  [
    'beef-lamb', 'chicken', 'ham', 'bacon', 'pork', 'patties-meatballs',
    'sausages', 'deli-meats', 'meat-alternatives', 'seafood', 'salmon',
  ],
  // frozenCategory
  [
    'ice-cream', 'ice-blocks', 'pastries-cheesecake', 'frozen-chips',
    'frozen-vegetables', 'frozen-fruit', 'frozen-seafood',
    'pies-sausage-rolls', 'pizza', 'other-savouries',
  ],
  // pantryCategory
  [
    'rice', 'noodles', 'pasta', 'beans-spaghetti', 'canned-fish',
    'canned-meat', 'soup', 'cereal', 'spreads', 'baking', 'sauces',
    'oils-vinegars', 'world-foods',
  ],
  // snacksCategory
  [
    'chocolate', 'boxed-chocolate', 'chips', 'crackers', 'biscuits',
    'muesli-bars', 'nuts-bulk-mix', 'sweets-lollies', 'other-snacks',
  ],
  // drinksCategory
  [
    'black-tea', 'green-tea', 'herbal-tea', 'drinking-chocolate',
    'coffee', 'soft-drinks', 'energy-drinks', 'juice',
  ],
  // petsCategory
  ['cat-food', 'cat-treats', 'dog-food', 'dog-treats'],
].flat();
209 |
// toTitleCase()
// -------------
// Capitalises each word of a string and lower-cases the rest of the word.
// A word starts at a \w character and runs up to the next whitespace.

export function toTitleCase(str: string) {
  const capitaliseWord = (word: string) => {
    const head = word.slice(0, 1).toUpperCase();
    const tail = word.slice(1).toLowerCase();
    return head + tail;
  };
  return str.replace(/\w\S*/g, (word) => capitaliseWord(word));
}
--------------------------------------------------------------------------------
/src/urls.txt:
--------------------------------------------------------------------------------
1 | # List of all URLs to be scraped (based off original WW categories as of Oct-2025)
2 | # --------------------------------------------------------------------------------
3 | # Not all products are scraped. Only popular categories are included, to save time.
4 | # Add pages=2 or similar to scrape more pages per category.
5 | # Add categories=name to replace the category if storing to a database.
6 |
7 | # Fruit & Veg
8 | woolworths.co.nz/shop/browse/fruit-veg/fruit categories=fruit pages=2
9 | woolworths.co.nz/shop/browse/fruit-veg/prepared-fruit-veg categories=salads-coleslaw
10 | woolworths.co.nz/shop/browse/fruit-veg/vegetables categories=fresh-vegetables pages=4
11 | woolworths.co.nz/shop/browse/fruit-veg/fresh-salad-herbs categories=fresh-vegetables pages=2
12 | woolworths.co.nz/shop/browse/fruit-veg/fresh-salad-herbs/slaws-salad-kits categories=salads-coleslaw
13 |
14 | # Meat & Poultry
15 | woolworths.co.nz/shop/browse/meat-poultry/beef categories=beef-lamb pages=2
16 | woolworths.co.nz/shop/browse/meat-poultry/lamb categories=beef-lamb
17 | woolworths.co.nz/shop/browse/meat-poultry/chicken-poultry categories=chicken pages=2
18 | woolworths.co.nz/shop/browse/meat-poultry/pork
19 | woolworths.co.nz/shop/browse/meat-poultry/mince-patties categories=patties-meatballs
20 | woolworths.co.nz/shop/browse/meat-poultry/sausages
21 | woolworths.co.nz/shop/browse/meat-poultry/plant-based-alternatives categories=meat-alternatives
22 |
23 | # Fish & Seafood
24 | woolworths.co.nz/shop/browse/fish-seafood categories=seafood
25 | woolworths.co.nz/shop/browse/fish-seafood/salmon categories=salmon
26 |
27 | # Fridge & Deli
28 | woolworths.co.nz/shop/browse/fridge-deli/milk pages=3
29 | woolworths.co.nz/shop/browse/fridge-deli/cream-custard categories=cream
30 | woolworths.co.nz/shop/browse/fridge-deli/milk/long-life-milk categories=long-life-milk
31 | woolworths.co.nz/shop/browse/fridge-deli/eggs-butter-spreads/butter categories=butter pages=2
32 | woolworths.co.nz/shop/browse/fridge-deli/eggs-butter-spreads/margarine-spreads categories=butter
33 | woolworths.co.nz/shop/browse/fridge-deli/cheese categories=cheese pages=2
34 | woolworths.co.nz/shop/browse/fridge-deli/cheese/block-cheese categories=cheese
35 | woolworths.co.nz/shop/browse/fridge-deli/pasta-pizza-pastry/pizza-bases categories=pizza
36 | woolworths.co.nz/shop/browse/fridge-deli/yoghurt-desserts categories=yoghurt pages=2
37 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/ham-shaved-meat categories=ham
38 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/bacon categories=bacon
39 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/salami-cured-dried-meats categories=salami
40 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/cooked-meats categories=ham
41 | woolworths.co.nz/shop/browse/fridge-deli/juice-drinks categories=juice
42 | woolworths.co.nz/shop/browse/fridge-deli/vegan-vegetarian categories=other-deli-foods
43 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/heat-and-eat-meals categories=other-deli-foods
44 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/soup-risotto categories=other-deli-foods
45 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/pies-quiche categories=pies-sausage-rolls
46 | woolworths.co.nz/shop/browse/fridge-deli/dips-hummus-nibbles categories=other-deli-foods
47 |
48 | # Bakery
49 | woolworths.co.nz/shop/browse/bakery/baked-in-store/loaves-garlic-savoury-bread categories=bread-rolls
50 | woolworths.co.nz/shop/browse/bakery/buns-rolls-bread-sticks categories=bread-rolls
51 | woolworths.co.nz/shop/browse/bakery/sliced-packaged-bread categories=bread pages=2
52 | woolworths.co.nz/shop/browse/bakery/wraps-pita-pizza-bases categories=specialty-bread
53 | woolworths.co.nz/shop/browse/bakery/bagels-crumpets-pancakes categories=specialty-bread
54 | woolworths.co.nz/shop/browse/bakery/pastries-croissants-biscuits categories=bakery-desserts
55 | woolworths.co.nz/shop/browse/bakery/cakes-muffins-desserts categories=bakery-desserts
56 | woolworths.co.nz/shop/browse/bakery/cakes-muffins-desserts/birthday-celebration-cakes categories=bakery-cakes
57 |
58 | # Frozen
59 | woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs categories=ice-cream pages=3
60 | woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/single-serve-multipacks categories=ice-cream pages=2
61 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/frozen-pies-sausage-rolls-hot-dogs categories=pies-sausage-rolls pages=2
62 | woolworths.co.nz/shop/browse/frozen/pizza-pastry-bread/frozen-pizza categories=pizza
63 | woolworths.co.nz/shop/browse/frozen/pizza-pastry-bread/frozen-pastry categories=pastries-cheesecake
64 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/spring-rolls-toppers-savouries categories=other-savouries
65 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/dumplings-wontons-steam-buns categories=other-savouries
66 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/other-frozen-vegetables categories=frozen-vegetables
67 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/frozen-peas-corn-carrots categories=frozen-vegetables
68 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/mixed-vegetables-stir-fry categories=frozen-vegetables
69 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/chips-wedges-potatoes categories=frozen-chips
70 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/hash-browns-rosti categories=other-savouries
71 | woolworths.co.nz/shop/browse/frozen/frozen-meat/frozen-chicken-poultry categories=chicken
72 | woolworths.co.nz/shop/browse/frozen/frozen-meat/frozen-burgers categories=patties-meatballs
73 | woolworths.co.nz/shop/browse/frozen/frozen-meat-alternatives categories=other-savouries
74 | woolworths.co.nz/shop/browse/frozen/frozen-seafood
75 | woolworths.co.nz/shop/browse/frozen/frozen-fruit-drink categories=frozen-fruit
76 |
77 | # Pantry
78 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/popcorn-nuts-savoury-snacks categories=other-snacks pages=2
79 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/corn-chips-salsa categories=chips
80 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/chips categories=chips pages=3
81 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/muesli-bars-snack-bars categories=muesli-bars pages=2
82 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/chocolate-bars-blocks categories=chocolate pages=4
83 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/chocolate-boxes-gifts categories=chocolate pages=4
84 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/sweets-lollies-licorice categories=sweets-lollies pages=2
85 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/gums-mints categories=sweets-lollies
86 | woolworths.co.nz/shop/browse/pantry/eggs
87 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/biscuits-cookies categories=biscuits pages=4
88 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/crackers categories=crackers pages=2
89 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/rice-cakes-crispbread categories=crackers
90 | woolworths.co.nz/shop/browse/pantry/bulk-foods/nuts-seeds categories=nuts-bulk-mix
91 | woolworths.co.nz/shop/browse/pantry/bulk-foods/dried-fruit-mixes categories=nuts-bulk-mix
92 | woolworths.co.nz/shop/browse/pantry/baking categories=baking pages=3
93 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/rice
94 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/noodles pages=2
95 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/dried-pasta categories=pasta
96 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/pasta-meals-sides categories=pasta
97 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/baked-beans-spaghetti categories=beans-spaghetti
98 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-tuna-seafood categories=canned-fish
99 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-meat categories=canned-meat
100 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-soup-soup-mix categories=soup
101 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/cereal pages=2
102 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/nut-butter categories=spreads
103 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/honey categories=spreads
104 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/jam categories=spreads
105 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/other-spreads categories=spreads
106 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/muesli-oats categories=cereal
107 | woolworths.co.nz/shop/browse/pantry/sauces-pastes categories=sauces
108 |
109 | # Beer & Wine
110 |
111 | # Drinks
112 | woolworths.co.nz/shop/browse/drinks/coffee pages=3
113 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/black-breakfast-tea categories=black-tea pages=2
114 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/green-tea categories=green-tea
115 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/herbal-fruit-teas categories=herbal-tea pages=2
116 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/drinking-chocolate-malt categories=drinking-chocolate
117 | woolworths.co.nz/shop/browse/drinks/juice-cordial categories=juice pages=3
118 | woolworths.co.nz/shop/browse/drinks/soft-drinks-sports-drinks/soft-drinks pages=2
119 | woolworths.co.nz/shop/browse/drinks/soft-drinks-sports-drinks/energy-drinks pages=2
120 |
121 | # Health & Body
122 |
123 | # Household
124 |
125 | # Baby & Child
126 |
127 | # Pet
128 | woolworths.co.nz/shop/browse/pet/cats/dry-cat-food categories=cat-food pages=2
129 | woolworths.co.nz/shop/browse/pet/cats/wet-cat-food categories=cat-food pages=3
130 | woolworths.co.nz/shop/browse/pet/cats/cat-milk-treats categories=cat-treats
131 | woolworths.co.nz/shop/browse/pet/dogs/dog-chews-bones-treats categories=dog-treats
132 | woolworths.co.nz/shop/browse/pet/dogs/dry-dog-food categories=dog-food
133 | woolworths.co.nz/shop/browse/pet/dogs/wet-dog-food categories=dog-food
134 | woolworths.co.nz/shop/browse/pet/dogs/chilled-or-frozen-dog-food categories=dog-food
135 |
--------------------------------------------------------------------------------
/src/cosmosdb.ts:
--------------------------------------------------------------------------------
1 | // Used by index.ts for creating and accessing items stored in Azure CosmosDB
2 |
3 | import * as dotenv from "dotenv";
4 | dotenv.config();
5 | dotenv.config({ path: `.env.local`, override: true });
6 |
7 | import { CosmosClient, Container, Database, FeedOptions, SqlQuerySpec } from "@azure/cosmos";
8 | import { logError, log, colour, validCategories } from "./utilities";
9 | import { Product, UpsertResponse, ProductResponse } from "./typings";
10 |
11 | let cosmosClient: CosmosClient;
12 | let database: Database;
13 | let container: Container;
14 |
// establishCosmosDB()
// -------------------
// Connects to CosmosDB using the settings from the environment and assigns the
// module-level client, database and container. The database and container are
// created if they do not exist yet.
// Throws an Error if any required environment variable is missing or empty,
// or if the connection cannot be established.

export async function establishCosmosDB() {
  // Get CosmosDB connection string stored in .env
  const COSMOS_CONSTRING = process.env.COSMOS_CONSTRING;
  if (!COSMOS_CONSTRING) {
    throw Error(
      "CosmosDB connection string COSMOS_CONSTRING not found in .env"
    );
  }

  // Fail early on the remaining settings. Without these checks a missing
  // partition key would become "/undefined", and a missing database or
  // container name would only surface as an opaque SDK error.
  const databaseName = process.env.COSMOS_DB_NAME;
  if (!databaseName) {
    throw Error("CosmosDB database name COSMOS_DB_NAME not found in .env");
  }
  const containerName = process.env.COSMOS_CONTAINER;
  if (!containerName) {
    throw Error("CosmosDB container name COSMOS_CONTAINER not found in .env");
  }
  const partitionKey = process.env.COSMOS_PARTITION_KEY;
  if (!partitionKey) {
    throw Error("CosmosDB partition key COSMOS_PARTITION_KEY not found in .env");
  }

  // Ensure partition key is in correct format (a path with a leading slash)
  const validatedPartitionKey = partitionKey.startsWith("/") ? partitionKey : "/" + partitionKey;

  // Establish CosmosDB Client, Database, Container
  try {
    cosmosClient = new CosmosClient(COSMOS_CONSTRING);

    const databaseResponse = await cosmosClient.databases.createIfNotExists({
      id: databaseName,
    });

    database = databaseResponse.database;

    const containerResponse = await database.containers.createIfNotExists({
      id: containerName,
      partitionKey: { paths: [validatedPartitionKey] },
    });

    container = containerResponse.container;
  } catch (error) {
    throw Error(error + "\n\nInvalid CosmosDB connection - check for valid connection string");
  }
}
48 |
// upsertProductToCosmosDB()
// -------------------------
// Inserts or updates a product object to CosmosDB,
// returns an UpsertResponse based on if and how the Product was updated.
//
// Lookup order:
//  1. Read the item by id, using the product name as the partition key value.
//  2. On a 404, query by id alone to find a product stored under a different name.
//  3. If nothing matches, create the product as new.
// Errors are not rethrown: they are logged and reported as UpsertResponse.Failed.

export async function upsertProductToCosmosDB(
  scrapedProduct: Product
): Promise<UpsertResponse> {
  try {
    // Check CosmosDB for any existing item using id and name as the partition key
    const cosmosResponse = await container
      .item(scrapedProduct.id, scrapedProduct.name)
      .read();

    // If an existing item was found in CosmosDB, check for update values before uploading
    if (cosmosResponse.statusCode === 200) {
      const dbProduct = cosmosResponse.resource as Product;
      const response = buildUpdatedProduct(scrapedProduct, dbProduct);

      // Send updated product to CosmosDB
      await container.items.upsert(response.product);
      return response.upsertType;
    }

    // If product with ID and exact name doesn't yet exist in CosmosDB
    if (cosmosResponse.statusCode === 404) {
      // First check if there is an existing product with the same ID but different name(partition key)
      const querySpec = {
        query: `SELECT * FROM products p WHERE p.id = @id`,
        parameters: [
          {
            name: "@id",
            value: scrapedProduct.id,
          },
        ],
      };
      const { resources } = await container.items.query(querySpec).fetchAll();

      // If no existing ID was found, create a new product
      if (resources.length === 0) {
        await container.items.create(scrapedProduct);

        console.log(
          ` New Product: ${scrapedProduct.name.slice(0, 47).padEnd(47)}` +
            ` | $ ${scrapedProduct.currentPrice}`
        );

        return UpsertResponse.NewProduct;
      }

      // An existing ID was found: update the DB with the new name
      const dbProduct = resources[0] as Product;
      const response = buildUpdatedProduct(scrapedProduct, dbProduct);
      response.product.name = scrapedProduct.name;

      // NOTE(review): if the container is partitioned by name, this upsert writes
      // under the new name and the item stored under the old name presumably
      // remains - confirm whether the old item should be deleted here.
      await container.items.upsert(response.product);
      return response.upsertType;
    }

    // Manage any failed cosmos updates
    if (cosmosResponse.statusCode === 409) {
      logError(`Conflicting ID found for product ${scrapedProduct.name}`);
      return UpsertResponse.Failed;
    }

    // If CosmoDB returns a status code other than 200 or 404, manage other errors here
    logError(`CosmosDB returned status code: ${cosmosResponse.statusCode}`);
    return UpsertResponse.Failed;
  } catch (e: unknown) {
    // logError() takes a string: narrow the caught value and say which product failed
    const detail = e instanceof Error ? e.stack ?? e.message : String(e);
    logError(`Failed to upsert product ${scrapedProduct.id} (${scrapedProduct.name}): ${detail}`);
    return UpsertResponse.Failed;
  }
}
125 |
// toIsoDay()
// ----------
// Returns the yyyy-mm-dd day portion of a date.
// Dates read back from CosmosDB arrive as ISO strings rather than Date objects,
// so both representations are accepted.

function toIsoDay(value: Date | string): string {
  return typeof value === "string"
    ? value.slice(0, 10)
    : value.toISOString().slice(0, 10);
}

// buildUpdatedProduct()
// ---------------------
// This takes a freshly scraped product and compares it with a found database product.
// It returns an updated product with data from both product versions,
// along with an UpsertResponse describing what kind of change was detected.
// Note: both passed-in products may be mutated while merging.

function buildUpdatedProduct(
  scrapedProduct: Product,
  dbProduct: Product
): ProductResponse {
  // Compare dates by calendar day only, in format yyyy-mm-dd
  const dbDay = toIsoDay(dbProduct.lastUpdated);
  const scrapedDay = toIsoDay(scrapedProduct.lastUpdated);

  // Measure the price difference between the new scraped product and the old db product
  const priceDifference = Math.abs(
    dbProduct.currentPrice - scrapedProduct.currentPrice
  );

  // If price has changed by more than $0.05, and not on the same day
  if (priceDifference > 0.05 && dbDay !== scrapedDay) {
    // Push scraped priceHistory into existing priceHistory array
    dbProduct.priceHistory.push(scrapedProduct.priceHistory[0]);

    // Set the scrapedProduct to use the updated priceHistory
    scrapedProduct.priceHistory = dbProduct.priceHistory;

    // Return completed Product ready for uploading
    logPriceChange(dbProduct, scrapedProduct.currentPrice);
    return {
      upsertType: UpsertResponse.PriceChanged,
      product: scrapedProduct,
    };
  }

  // Older documents may have no category stored at all, so the null check
  // has to come before any array method is called on it.
  const dbCategories: string[] | null | undefined = dbProduct.category;
  const hasInvalidCategories =
    dbCategories == null ||
    !dbCategories.every((category) => validCategories.includes(category));

  // If any db categories are not included within the list of valid ones, update to scraped ones
  if (hasInvalidCategories) {
    console.log(
      ` Categories Changed: ${scrapedProduct.name
        .padEnd(40)
        .substring(0, 40)}` +
        ` - ${(dbCategories ?? []).join(" ")} > ${scrapedProduct.category.join(
          " "
        )}`
    );

    // Update everything but priceHistory and lastUpdated
    scrapedProduct.priceHistory = dbProduct.priceHistory;
    scrapedProduct.lastUpdated = dbProduct.lastUpdated;

    // Return completed Product ready for uploading
    return {
      upsertType: UpsertResponse.InfoChanged,
      product: scrapedProduct,
    };
  }

  // Update other info
  const infoChanged =
    dbProduct.sourceSite !== scrapedProduct.sourceSite ||
    dbProduct.category.join(" ") !== scrapedProduct.category.join(" ") ||
    dbProduct.size !== scrapedProduct.size ||
    dbProduct.unitPrice !== scrapedProduct.unitPrice ||
    dbProduct.unitName !== scrapedProduct.unitName ||
    dbProduct.originalUnitQuantity !== scrapedProduct.originalUnitQuantity;

  if (infoChanged) {
    // Update everything but priceHistory and lastUpdated
    scrapedProduct.priceHistory = dbProduct.priceHistory;
    scrapedProduct.lastUpdated = dbProduct.lastUpdated;

    // Return completed Product ready for uploading
    return {
      upsertType: UpsertResponse.InfoChanged,
      product: scrapedProduct,
    };
  }

  // Nothing has changed, only update lastChecked
  dbProduct.lastChecked = scrapedProduct.lastChecked;
  return {
    upsertType: UpsertResponse.AlreadyUpToDate,
    product: dbProduct,
  };
}
216 |
// logPriceChange()
// ----------------
// Prints a single-line price change message for a product.
// The line is coloured red when the new price is higher than the stored price,
// and green otherwise.

export function logPriceChange(product: Product, newPrice: number) {
  const wentUp = newPrice > product.currentPrice;
  const lineColour = wentUp ? colour.red : colour.green;
  const direction = wentUp ? "Up : " : "Down : ";

  // Fixed-width columns keep consecutive log lines aligned
  const nameColumn = product.name.slice(0, 47).padEnd(47);
  const oldPriceColumn = product.currentPrice.toString().padStart(4);

  log(
    lineColour,
    ` Price ${direction}${nameColumn} | $${oldPriceColumn} > $${newPrice}`
  );
}
235 |
// customQuery()
// -------------
// Function for running custom DB queries - used primarily for debugging.
// Currently pages through every product in batches, removes priceHistory entries
// that differ from the previous entry by less than $0.04, and re-uploads any
// product whose history was changed.
// Batches are fetched sequentially with a fixed delay between them, and each
// upsert is awaited so failures reject this function instead of being lost.

export async function customQuery(): Promise<void> {
  const batchSize = 30;
  const options: FeedOptions = {
    maxItemCount: batchSize,
  };
  const secondsDelayBetweenBatches = 5;
  const maxBatchCount = 900;
  const querySpec: SqlQuerySpec = {
    query: "SELECT * FROM products p",
  };

  log(colour.yellow, "Custom Query \n" + querySpec.query);

  const response = container.items.query(querySpec, options);

  let batchCount = 0;
  let continueFetching = true;

  while (response.hasMoreResults() && continueFetching) {
    // Wait between batches
    await new Promise<void>((resolve) =>
      setTimeout(resolve, secondsDelayBetweenBatches * 1000)
    );

    console.log(
      "Batch " +
        batchCount +
        " - Items [" +
        batchCount * batchSize +
        " - " +
        (batchCount + 1) * batchSize +
        "]"
    );

    const batch = await response.fetchNext();
    const products = (batch.resources ?? []) as Product[];

    for (const p of products) {
      let oldDatedPrice = 0;
      let requiresUpdate = false;

      // Mark entries that barely differ from the previous entry by zeroing their price
      p.priceHistory.forEach((datedPrice) => {
        const newDatedPrice = datedPrice.price;
        if (Math.abs(oldDatedPrice - newDatedPrice) < 0.04) {
          console.log(p.name);
          console.log(
            " - Tiny price difference detected on " +
              // presumably dates read from CosmosDB are ISO strings, so re-parse first
              new Date(datedPrice.date).toDateString() +
              " - " +
              oldDatedPrice +
              " - " +
              newDatedPrice
          );
          datedPrice.price = 0;
          requiresUpdate = true;
        }
        oldDatedPrice = newDatedPrice;
      });

      if (!requiresUpdate) continue;

      // Drop the entries marked above
      const updatedPriceHistory = p.priceHistory.filter(
        (datedPrice) => datedPrice.price > 0
      );

      console.log(
        " - Old price history length: " +
          p.priceHistory.length +
          " - new length: " +
          updatedPriceHistory.length
      );

      p.priceHistory = updatedPriceHistory;

      const uploadRes = await container.items.upsert(p);
      console.log(
        " - Uploaded updated product with status code: " +
          uploadRes.statusCode
      );
    }

    // Safety cap on the number of batches processed in one run
    if (batchCount++ === maxBatchCount) continueFetching = false;
  }

  console.log("Custom Query Complete");
}
343 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import * as dotenv from "dotenv";
2 | dotenv.config();
3 | dotenv.config({ path: `.env.local`, override: true });
4 |
5 | import playwright from "playwright";
6 | import * as cheerio from "cheerio";
7 | import _ from "lodash";
8 | import { setTimeout } from "timers/promises";
9 |
10 | import { establishCosmosDB, upsertProductToCosmosDB } from "./cosmosdb.js";
11 | import { productOverrides } from "./product-overrides.js";
12 | import { CategorisedUrl, DatedPrice, Product, UpsertResponse } from "./typings";
13 | import {
14 | log, colour, logProductRow, logError, readLinesFromTextFile, getTimeElapsedSince,
15 | logTableHeader, toTitleCase,
16 | } from "./utilities.js";
17 |
// Woolworths / Countdown Scraper
// ------------------------------
// Scrapes pricing and other info from Woolworths NZ's website.

// Set a reasonable delay between each page load to reduce load on the server.
const pageLoadDelaySeconds = 7;

// Set a delay when logging each product per page to the console.
const productLogDelayMilliSeconds = 20;

// Record start time for logging purposes
const startTime = Date.now();

// Load URLs from text file 'urls.txt'
let categorisedUrls: CategorisedUrl[] = loadUrlsFile();

// Handle command-line arguments, ex: 'db', 'images', or single urls
export let databaseMode = false;
export let uploadImagesMode = false;
let headlessMode = true;
categorisedUrls = handleArguments(categorisedUrls);

// Establish CosmosDB connection if being used.
// Awaited so the container is ready before the first upsert, and so that
// a bad connection string stops the run before the browser is launched.
if (databaseMode) await establishCosmosDB();

// Establish playwright browser
let browser: playwright.Browser;
let page: playwright.Page;
browser = await establishPlaywrightPage(headlessMode);

// Select store location
await selectStoreByLocationName();

// Main Loop - Scrape through each page
await scrapeAllPageURLs();

// Program End and Cleanup
await browser.close();
log(
  colour.sky,
  `\nAll Pages Completed = Total Time Elapsed ${getTimeElapsedSince(startTime)} \n`
);
// -----------------------
62 |
// loadUrlsFile
// ------------
// Reads a text file of URLs (one per line) and returns every line that
// could be parsed, expanded into CategorisedUrl objects.

function loadUrlsFile(filePath: string = "src/urls.txt"): CategorisedUrl[] {
  const collectedUrls: CategorisedUrl[] = [];

  // Each line may expand into several page URLs, or into nothing if it is not a valid URL
  for (const line of readLinesFromTextFile(filePath)) {
    const parsedUrls = parseAndCategoriseURL(line);
    if (parsedUrls !== undefined) {
      collectedUrls.push(...parsedUrls);
    }
  }

  return collectedUrls;
}
83 |
// scrapeAllPageURLs
// ---------------
// Loops through each page URL and scrapes pricing and other info.
// This is the main function that calls the other functions.
// A page that fails to load or scrape is logged and skipped; a refused
// connection aborts the whole run.

async function scrapeAllPageURLs() {
  // Log loop start
  log(
    colour.yellow,
    `${categorisedUrls.length} pages to be scraped`.padEnd(35) +
      `${pageLoadDelaySeconds}s delay between scrapes`.padEnd(35) +
      (databaseMode ? "(Database Mode)" : "(Dry Run Mode)")
  );

  // Loop through each page URL to scrape
  for (let i = 0; i < categorisedUrls.length; i++) {
    // Extract url from CategorisedUrl object
    const categorisedUrl: CategorisedUrl = categorisedUrls[i];
    const url: string = categorisedUrl.url;

    // Log current scrape sequence and the total number of pages to scrape
    const shortUrl = url.replace("https://", "");
    log(colour.yellow, `\n[${i + 1}/${categorisedUrls.length}] ${shortUrl}`);

    try {
      // Open page with up to 3 attempts on failure
      let retries = 0;
      const maxRetries = 3;
      const retryDelay = 2000; // 2 seconds

      while (retries < maxRetries) {
        try {
          await page.goto(url);

          // Set page timeout to 8 seconds
          page.setDefaultTimeout(8000);

          // Wait for product-price h3 html element to dynamically load in,
          // this is required to see product data
          await page.waitForSelector("product-price h3");

          break; // If successful, exit the retry loop
        } catch (error) {
          retries++;
          if (retries === maxRetries) {
            throw error; // If all retries failed, throw the error
          }
          log(colour.yellow, `Retry ${retries}/${maxRetries} for ${url}`);
          await setTimeout(retryDelay);
        }
      }

      // Wait and page down multiple times to further trigger any lazy loads
      for (let pageDown = 0; pageDown < 5; pageDown++) {
        // create a random number between 500 and 1500
        const timeBetweenPgDowns = Math.random() * 1000 + 500;
        await page.waitForTimeout(timeBetweenPgDowns);
        await page.keyboard.press("PageDown");
      }

      // If url has page= query parameter, check to see that page is available
      let desiredPageNumber = 1;
      let numPagesAvailable = 1;
      if (categorisedUrl.url.includes("page=")) {
        const currentPageMatch = categorisedUrl.url.match(/page=(\d+)/);
        if (currentPageMatch) {
          desiredPageNumber = parseInt(currentPageMatch[1], 10);

          try {
            // Detect number of pages available
            const paginationUL = await page.innerHTML("ul.pagination");
            const $$ = cheerio.load(paginationUL);
            numPagesAvailable = $$("li").length - 2; // exclude prev/next buttons
          } catch {
            numPagesAvailable = 1; // if no pagination found, only 1 page exists
          }

          if (desiredPageNumber > numPagesAvailable) {
            log(
              colour.yellow,
              `Page ${desiredPageNumber} does not exist, only ${numPagesAvailable} pages available. Skipping..`
            );
            continue; // Skip this page as it doesn't exist
          }
        }
      }

      // Load html into Cheerio for DOM selection
      const html = await page.innerHTML("product-grid");
      const $ = cheerio.load(html);

      // Find all product entries
      const allProductEntries = $("cdx-card product-stamp-grid div.product-entry");

      // Find advertisement product entries not normally part of this product category
      const advertisementEntries = $(
        "div.carousel-track div cdx-card product-stamp-grid div.product-entry"
      );
      const adHrefs: string[] = advertisementEntries
        .map((index, element) => {
          return $(element).find("a").first().attr("href");
        })
        .toArray();

      // Filter out product entries that match the found advertisements
      const productEntries = allProductEntries.filter((index, element) => {
        const productHref = $(element).find("a").first().attr("href");
        return !adHrefs.includes(productHref!);
      });

      // Log the number of products found, time elapsed, category, pages
      log(
        colour.yellow,
        `${productEntries.length} product entries found`.padEnd(38) +
          `Time Elapsed: ${getTimeElapsedSince(startTime)}`.padEnd(35) +
          `Category: ${_.startCase(categorisedUrl.categories.join(" - ")).padEnd(20)}` +
          `Page: ${desiredPageNumber}/${numPagesAvailable}`
      );

      // Log table header
      if (!databaseMode) logTableHeader();

      // Store number of items processed for logging purposes
      let perPageLogStats = {
        newProducts: 0,
        priceChanged: 0,
        infoUpdated: 0,
        alreadyUpToDate: 0,
      };

      // Start nested loop which loops through each product entry
      perPageLogStats = await processFoundProductEntries(
        categorisedUrl,
        productEntries,
        perPageLogStats
      );

      // After scraping every item is complete, log how many products were scraped
      if (databaseMode) {
        log(
          colour.blue,
          `CosmosDB: ${perPageLogStats.newProducts} new products, ` +
            `${perPageLogStats.priceChanged} updated prices, ` +
            `${perPageLogStats.infoUpdated} updated info, ` +
            `${perPageLogStats.alreadyUpToDate} already up-to-date`
        );
      }

      // Delay between each page load
      await setTimeout(pageLoadDelaySeconds * 1000);
    } catch (error: unknown) {
      // Thrown values are normally Error objects, so compare against their
      // string form rather than expecting a raw string.
      const errorText = String(error);

      if (errorText.includes("NS_ERROR_CONNECTION_REFUSED")) {
        logError("Connection Failed - Check Firewall\n" + errorText);
        return;
      }
      logError("Page Timeout or scrape error - Skipping this page\n" + errorText);
    }
  }
}
243 |
// processFoundProductEntries
// --------------------------
// Loops through each product entry and scrapes pricing and other info.
// In database mode each valid product is upserted to CosmosDB (and optionally
// has its image uploaded); otherwise it is logged as a table row.
// Returns the passed-in perPageLogStats object with its counters updated.
// This function is called by scrapeAllPageURLs.

async function processFoundProductEntries(
  categorisedUrl: CategorisedUrl,
  productEntries: cheerio.Cheerio<cheerio.Element>,
  perPageLogStats: {
    newProducts: number;
    priceChanged: number;
    infoUpdated: number;
    alreadyUpToDate: number;
  }
) {
  // Loop through each product entry
  for (let i = 0; i < productEntries.length; i++) {
    const productEntryElement = productEntries[i];

    // undefined means the entry failed validation and is skipped
    const product = playwrightElementToProduct(
      productEntryElement,
      categorisedUrl.categories
    );

    if (databaseMode && product !== undefined) {
      // Insert or update item into azure cosmosdb
      const response = await upsertProductToCosmosDB(product);

      // Use response to update logging counters
      switch (response) {
        case UpsertResponse.AlreadyUpToDate:
          perPageLogStats.alreadyUpToDate++;
          break;
        case UpsertResponse.InfoChanged:
          perPageLogStats.infoUpdated++;
          break;
        case UpsertResponse.NewProduct:
          perPageLogStats.newProducts++;
          break;
        case UpsertResponse.PriceChanged:
          perPageLogStats.priceChanged++;
          break;
        default:
          // Failed upserts are not counted
          break;
      }

      // Upload image to Azure Function
      if (uploadImagesMode) {
        // Get image url using provided base url, product ID, and hi-res query parameters
        const imageUrlBase = "https://assets.woolworths.com.au/images/2010/";
        const imageUrlExtensionAndQueryParams =
          ".jpg?impolicy=wowcdxwbjbx&w=900&h=900";
        const imageUrl =
          imageUrlBase + product.id + imageUrlExtensionAndQueryParams;

        await uploadImageRestAPI(imageUrl, product);
      }
    } else if (!databaseMode && product !== undefined) {
      // When doing a dry run, log product name - size - price in table format
      logProductRow(product);
    }

    // Add a tiny delay between each product loop.
    // This makes printing the log more readable
    await setTimeout(productLogDelayMilliSeconds);
  }

  // Return log stats for completed page
  return perPageLogStats;
}
316 |
// uploadImageRestAPI()
// --------------------
// Send image url to an Azure Function API.
// Resolves to false when the image url or product id is rejected up front,
// and to true once the request has been made (whatever the function replied).
// Throws if IMAGE_UPLOAD_FUNC_URL is missing or does not look like a url.

async function uploadImageRestAPI(
  imgUrl: string,
  product: Product
): Promise<boolean> {
  // Check if passed in url is valid, return if not
  if (imgUrl === undefined || !imgUrl.includes("http") || product.id.length < 4) {
    log(colour.grey, ` Image ${product.id} has invalid url: ${imgUrl}`);
    return false;
  }

  // Get IMAGE_UPLOAD_FUNC_URL from env.
  // The product id is appended directly to this value, followed by &source=<image url>
  const funcBaseUrl = process.env.IMAGE_UPLOAD_FUNC_URL;

  // Check funcBaseUrl is valid
  if (!funcBaseUrl?.includes("http")) {
    throw Error(
      "\nIMAGE_UPLOAD_FUNC_URL in .env is invalid. Should be in .env :\n\n" +
        "IMAGE_UPLOAD_FUNC_URL=https://.azurewebsites.net/api/ImageToS3?code=\n\n"
    );
  }

  // NOTE(review): imgUrl is appended unencoded, so any '&' inside it is read as a
  // separate query parameter by the receiver - confirm what the function expects
  // before switching to encodeURIComponent.
  const restUrl = `${funcBaseUrl}${product.id}&source=${imgUrl}`;

  // Perform http get
  const res = await fetch(new URL(restUrl), { method: "GET" });
  const responseMsg = await res.text();

  if (responseMsg.includes("S3 Upload of Full-Size")) {
    // Log for successful upload
    log(
      colour.grey,
      ` New Image : ${(product.id + ".webp").padEnd(11)} | ` +
        `${product.name.padEnd(40).slice(0, 40)}`
    );
  } else if (responseMsg.includes("already exists")) {
    // Do not log for existing images
  } else if (responseMsg.includes("Unable to download:")) {
    // Log for missing images
    log(colour.grey, ` Image ${product.id} unavailable to be downloaded`);
  } else if (responseMsg.includes("unable to be processed")) {
    log(colour.grey, ` Image ${product.id} unable to be processed`);
  } else {
    // Log any other errors that may have occurred
    console.log(responseMsg);
  }
  return true;
}
369 |
// handleArguments()
// -----------------
// Handle command line arguments: 'db', 'images', 'headless', 'headed', 'reverse',
// or a custom url containing .co.nz
// Sets the matching module-level mode flags as a side effect, and returns the
// list of URLs to scrape (the passed-in list is not modified).

function handleArguments(categorisedUrls: CategorisedUrl[]): CategorisedUrl[] {
  // Slice out the first 2 arguments, as they are not user-provided
  const userArgs = process.argv.slice(2);
  if (userArgs.length === 0) return categorisedUrls;

  // Loop through all args and find any matching keywords
  let potentialUrl = "";
  for (const arg of userArgs) {
    if (arg === "db") databaseMode = true;
    else if (arg === "images") uploadImagesMode = true;
    else if (arg === "headless") headlessMode = true; // is already default
    else if (arg === "headed") headlessMode = false;
    // Any arg containing .co.nz will replace the URLs text file to be scraped.
    else if (arg.includes(".co.nz")) potentialUrl += arg;
    // Reverse the order of the URLs to be scraped, starting from the bottom
    else if (arg === "reverse") categorisedUrls = [...categorisedUrls].reverse();
    // else if (arg === "custom") {
    //   categorisedUrls = [];
    //   await customQuery();
    //   process.exit();
    // }
  }

  // Try to parse the potential new url, which takes priority over the text file
  const parsedUrl = parseAndCategoriseURL(potentialUrl);
  if (parsedUrl !== undefined) categorisedUrls = parsedUrl;

  return categorisedUrls;
}
405 |
// establishPlaywrightPage()
// -------------------------
// Launches a playwright Firefox browser, opens a page in it and applies the
// ad/tracking url exclusions. The browser and page are stored in the
// module-level variables, and the browser is also returned.

async function establishPlaywrightPage(headless = true) {
  const userArgCount = process.argv.length - 2;
  const argSummary =
    userArgCount > 0 ? "(" + userArgCount + " arguments found)" : "";
  log(colour.yellow, "Launching Browser.. " + argSummary);

  browser = await playwright.firefox.launch({ headless });
  page = await browser.newPage();

  // Reject unnecessary ad/tracking urls
  await routePlaywrightExclusions();

  return browser;
}
428 |
// selectStoreByLocationName()
// ---------------------------
// Picks a store location by typing an address into the site's change-address
// form and choosing the first suggestion. Falls back to STORE_NAME from .env
// when no name is passed in, and does nothing if neither is available.

async function selectStoreByLocationName(locationName: string = "") {
  // Prefer the passed-in name, otherwise fall back to STORE_NAME
  const targetLocation =
    locationName !== "" ? locationName : process.env.STORE_NAME;

  // With no location from either source, skip store location selection
  if (!targetLocation) return;

  const changeAddressButton = "fieldset div div p button";
  const addressInput = "form-suburb-autocomplete form-input input";

  log(colour.yellow, "Selecting Store Location..");

  // Open store selection page
  try {
    await page.setDefaultTimeout(12000);
    await page.goto("https://www.woolworths.co.nz/bookatimeslot", {
      waitUntil: "domcontentloaded",
    });
    await page.waitForSelector(changeAddressButton);
  } catch (error) {
    logError("Location selection page timed out - Using default location instead");
    return;
  }

  // Remember the currently selected location for the log message below
  const oldLocation = await page
    .locator("fieldset div div p strong")
    .innerText();

  // Click change address modal
  await page.locator(changeAddressButton).click();
  await page.waitForSelector(addressInput);

  try {
    // Type in address, wait 1.5s for auto-complete to populate entries
    await page.locator(addressInput).type(targetLocation);
    await page.waitForTimeout(1500);

    // Select first matched entry, wait for validation
    await page.keyboard.press("ArrowDown");
    await page.waitForTimeout(300);
    await page.keyboard.press("Enter");
    await page.waitForTimeout(1000);

    // Click save location button
    await page.getByText("Save and Continue Shopping").click();
    log(
      colour.yellow,
      "Changed Location from " + oldLocation + " to " + targetLocation + "\n"
    );

    // Ensure location is saved before moving on
    await page.waitForTimeout(2000);
  } catch {
    // Catch timeout if no locations are found using the provided value.
    logError(
      `Store Location:${targetLocation} not found. Using default instead.`
    );
  }
}
491 |
// playwrightElementToProduct()
// ----------------------------
// Takes a cheerio html element for a product entry, builds and returns a Product.
// Returns undefined (after logging) if the scraped values fail validation.

export function playwrightElementToProduct(
  element: cheerio.Element,
  categories: string[]
): Product | undefined {
  const $ = cheerio.load(element);

  // Find the <h3> tag with an id containing "-title"
  // This holds the product ID, name and size
  const idNameSizeH3 = $(element)
    .find("h3")
    .filter((_i, h3) => $(h3).attr("id")?.includes("-title") ?? false);

  const product: Product = {
    // ID
    // -------
    // Extract product ID from h3 id attribute, and remove non-numbers.
    // May be undefined at runtime if no title was found; validation rejects that below.
    id: idNameSizeH3.attr("id")?.replace(/\D/g, "") as string,

    // Source Site - set where the source of information came from
    sourceSite: "countdown.co.nz", // use countdown for consistency with old data

    // Categories
    category: categories, // already obtained from url/text file

    // Store today's date
    lastChecked: new Date(),
    lastUpdated: new Date(),

    // These values will later be overwritten
    name: "",
    priceHistory: [],
    currentPrice: 0,
  };

  // Name & Size
  // ------------
  // Try to extract combined name and size from h3 tag inner text,
  // and clean unnecessary words from titles
  const rawNameAndSize = idNameSizeH3
    .text()
    .trim()
    .toLowerCase()
    // NOTE(review): collapses a double space; this literal appeared as a
    // single-space no-op in the reviewed copy - confirm the intended pattern
    .replace("  ", " ")
    .replace("fresh fruit", "")
    .replace("fresh vegetable", "")
    .trim();

  // Try to regex match a size section such as:
  // 100g, 150ml, 16pack, 0.5-1.5kg, tray 1kg, etc
  const tryMatchSize = rawNameAndSize.match(
    /(tray\s\d+)|(\d+(\.\d+)?(\-\d+\.\d+)?\s?(g|kg|l|ml|pack))\b/g
  );

  if (!tryMatchSize) {
    // Capitalise and set name
    product.name = toTitleCase(rawNameAndSize);

    // No size was found in name, size can be derived from unit price later
    product.size = "";
  } else {
    // If a size was found, get the index to split the string into name and size
    const indexOfSizeSection = rawNameAndSize.indexOf(tryMatchSize[0]);

    // Capitalise and set name
    product.name = toTitleCase(rawNameAndSize.slice(0, indexOfSizeSection)).trim();

    // Clean up and set size
    let cleanedSize = rawNameAndSize.slice(indexOfSizeSection).trim();

    // Capitalise L for litres (only the 'l' directly following a digit)
    cleanedSize = cleanedSize.replace(/(\d)l\b/, "$1L");

    // String.replace returns a new string, so the result must be assigned
    cleanedSize = cleanedSize.replace("tray", "Tray");
    product.size = cleanedSize;
  }

  // Price
  // ------
  // Is originally displayed with dollars in an <em>, cents in a <span>,
  // and potentially a kg unit name inside the <span> for some meat products.
  // The 2 numbers are joined, parsed, and non-number chars are removed.
  const dollarString: string = $(element)
    .find("product-price div h3 em")
    .text()
    .trim();
  let centString: string = $(element)
    .find("product-price div h3 span")
    .text()
    .trim();
  if (centString.includes("kg")) product.size = "per kg";
  centString = centString.replace(/\D/g, "");
  product.currentPrice = Number(dollarString + "." + centString);

  // Create a date object for now, but with minutes and seconds set to 0
  const today = new Date();
  today.setMinutes(0, 0);

  // Create a DatedPrice object, which may be added into the product if needed
  const todaysDatedPrice: DatedPrice = {
    date: today,
    price: product.currentPrice,
  };
  product.priceHistory = [todaysDatedPrice];

  // Unit Price
  // -----------
  // Try to extract from span.cupPrice, ex: $2.52 / 100mL
  const rawUnitPrice = $(element).find("span.cupPrice").text().trim();

  // Text without a '/' cannot be split into price and unit, so it is ignored
  if (rawUnitPrice.includes("/")) {
    const [pricePart, amountPart] = rawUnitPrice.split("/");

    // Extract and parse unit price, ex: 2.52
    let unitPrice = Number.parseFloat(pricePart.replace("$", "").trim());

    // Extract amount and unit, ex: 100mL
    const amountAndUnit = amountPart.trim();

    // First word of the amount-and-unit text, ex: 100mL or 1kg
    let unit = amountAndUnit.match(/\w+/g)?.[0] || "";

    // Normalize units to kg or L
    if (amountAndUnit === "100g") {
      unitPrice = unitPrice * 10;
      unit = "kg";
    } else if (amountAndUnit === "100mL") {
      unitPrice = unitPrice * 10;
      unit = "L";
    }

    // Cleanup 1kg to just kg
    unit = unit.replace("1kg", "kg");
    unit = unit.replace("1L", "L");

    // Set finalised unit price and name
    product.unitPrice = unitPrice;
    product.unitName = unit;
  }

  // Overrides
  // ----------
  // Check .ts file for manually overridden product data
  for (const override of productOverrides) {
    // Only overrides registered against this product ID apply
    if (override.id !== product.id) continue;

    // Check for size override
    if (override.size !== undefined) {
      product.size = override.size;
    }

    // Check for category override
    if (override.category !== undefined) {
      product.category = [override.category];
    }
  }

  // Validation
  // ----------
  // If product values pass validation, return product
  if (validateProduct(product)) return product;

  try {
    logError(
      ` Unable to Scrape: ${product.id.padStart(6)} | ${product.name} | ` +
        `$${product.currentPrice}`
    );
  } catch {
    // product.id is undefined when no title element was found
    logError(" Unable to Scrape ID from product");
  }
  return undefined;
}
678 |
// validateProduct()
// -----------------
// Sanity-checks the values of a scraped product.
// Returns true only when all of the following hold:
//  - the name does not contain price-like text such as "$ 12"
//  - the name is between 4 and 100 characters long
//  - the id is between 2 and 20 characters long
//  - the current price is a real number, greater than 0 and at most 999
// Any error thrown while reading the fields (for example a missing name or id)
// is treated as a failed validation and returns false.

function validateProduct(product: Product): boolean {
  try {
    const { name, id, currentPrice: price } = product;

    // A "$" followed by whitespace and digits suggests price text leaked into the name
    const nameContainsPrice = name.match(/\$\s\d+/) !== null;
    const nameLengthInvalid = name.length < 4 || name.length > 100;
    const idLengthInvalid = id.length < 2 || id.length > 20;

    // Price must be present, numeric, and within the accepted range
    const priceMissing =
      price === null || price === undefined || Number.isNaN(price);
    const priceOutOfRange = price <= 0 || price > 999;

    return !(
      nameContainsPrice ||
      nameLengthInvalid ||
      idLengthInvalid ||
      priceMissing ||
      priceOutOfRange
    );
  } catch (error) {
    return false;
  }
}
702 |
// parseAndCategoriseURL()
// -----------------------
// Parses a single line of text containing a URL, plus optional
// whitespace-separated 'categories=a,b' and 'pages=N' sections, which may
// appear in any order.
// Returns one CategorisedUrl per requested page, or undefined if the line does
// not contain a woolworths.co.nz URL.
// - Search URLs (containing '?search=') are returned as-is with no categories.
// - 'https://' is prepended when the URL has no http(s) prefix.
// - Any existing query string is replaced with '?page=N&inStockProductsOnly=true'.
// - If no categories are specified, the last non-empty /section of the URL
//   path is used as the category.
// - A 'pages=' value that is not a number is ignored (1 page is used).
// Example Input:
//    woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs categories=ice-cream pages=2
// Example Return:
//  [
//    {
//        url: "https://woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs?page=1&inStockProductsOnly=true"
//        categories: ["ice-cream"]
//    },
//    {
//        url: "https://woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs?page=2&inStockProductsOnly=true"
//        categories: ["ice-cream"]
//    }
//  ]

export function parseAndCategoriseURL(
  line: string
): CategorisedUrl[] | undefined {
  // If line doesn't contain desired url section, return undefined
  if (!line.includes("woolworths.co.nz")) return undefined;

  // If line is a search url, return as-is
  if (line.includes("?search=")) return [{ url: line, categories: [] }];

  let baseUrl = "";
  let categories: string[] = [];
  let numPagesPerURL = 1;

  // Split line by whitespace, look for url, optional page amount & categories
  for (const section of line.trim().split(/\s+/)) {
    if (section.includes("woolworths.co.nz")) {
      // Strip any existing query options off of this section only, so text
      // elsewhere on the line can never leak into the URL
      const queryStart = section.indexOf("?");
      const withoutQuery =
        queryStart === -1 ? section : section.substring(0, queryStart);

      // Ensure URL has http:// or https://
      baseUrl = withoutQuery.startsWith("http")
        ? withoutQuery
        : "https://" + withoutQuery;
    } else if (section.startsWith("categories=")) {
      categories = section.substring("categories=".length).split(",");
    } else if (section.startsWith("pages=")) {
      const parsedPages = Number.parseInt(
        section.substring("pages=".length),
        10
      );
      // Keep the default of 1 page when the value is not a number
      if (!Number.isNaN(parsedPages)) numPagesPerURL = parsedPages;
    }
  }

  // If no category was specified, derive one from the last url /section.
  // This runs after every section has been read, so it does not depend on
  // the order of sections in the line.
  if (categories.length === 0) {
    const pathSegments = baseUrl
      .split("/")
      .filter((segment) => segment !== "");
    categories = [pathSegments[pathSegments.length - 1] ?? ""];
  }

  // Build one entry per page using optimised query parameters,
  // such as limiting to in-stock only
  const parsedUrls: CategorisedUrl[] = [];
  for (let pageNumber = 1; pageNumber <= numPagesPerURL; pageNumber++) {
    parsedUrls.push({
      url: `${baseUrl}?page=${pageNumber}&inStockProductsOnly=true`,
      // Copy so that each entry owns its own categories array
      categories: [...categories],
    });
  }

  return parsedUrls;
}
794 |
// routePlaywrightExclusions()
// ---------------------------
// Registers a route handler on `page` (defined outside this function) that
// covers every request ("**/*"). A request is aborted when its resource type
// is image, media or font, or when its URL contains any of the listed
// ad/tracking fragments. All other requests continue unchanged.

async function routePlaywrightExclusions() {
  // Bandwidth intensive resource types that are never downloaded
  const blockedResourceTypes = ["image", "media", "font"];

  // Substrings identifying ad, tracking and analytics requests
  const blockedUrlFragments = [
    "googleoptimize.com",
    "gtm.js",
    "visitoridentification.js",
    "js-agent.newrelic.com",
    "cquotient.com",
    "googletagmanager.com",
    "cloudflareinsights.com",
    "dwanalytics",
    "facebook.net",
    "chatWidget",
    "edge.adobedc.net",
    "/Content/Banners/",
    "algolia.io",
    "algoliaradar.com",
    "go-mpulse.net"
  ];

  // Route with exclusions processed
  await page.route("**/*", async (route) => {
    const request = route.request();
    const requestUrl = request.url();

    const matchesBlockedUrl = blockedUrlFragments.some((fragment) =>
      requestUrl.includes(fragment)
    );
    const matchesBlockedType = blockedResourceTypes.includes(
      request.resourceType()
    );

    if (matchesBlockedUrl || matchesBlockedType) {
      await route.abort();
    } else {
      await route.continue();
    }
  });
}
844 |
845 |
--------------------------------------------------------------------------------