├── .env ├── .gitignore ├── jest.config.js ├── src ├── typings.ts ├── product-overrides.ts ├── utilities.ts ├── urls.txt ├── cosmosdb.ts └── index.ts ├── package.json ├── tests └── utilities.test.ts └── readme.md /.env: -------------------------------------------------------------------------------- 1 | STORE_NAME= 2 | COSMOS_CONSTRING= 3 | COSMOS_DB_NAME= 4 | COSMOS_CONTAINER= 5 | COSMOS_PARTITION_KEY= 6 | IMAGE_UPLOAD_FUNC_URL= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | 3 | /dist 4 | 5 | /build 6 | 7 | .env.* 8 | 9 | npm-debug.log* 10 | 11 | package-lock.json 12 | 13 | /.vscode 14 | 15 | /coverage 16 | 17 | salt 18 | 19 | *.bat 20 | *.lnk 21 | *.ps1 -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('ts-jest').JestConfigWithTsJest} */ 2 | module.exports = { 3 | preset: 'ts-jest', 4 | testEnvironment: 'node', 5 | transform: { 6 | '^.+\\.(ts|tsx)?$': ['ts-jest', { diagnostics: { ignoreCodes: ['TS151001'] } }], 7 | "^.+\\.(js|jsx)$": "babel-jest", 8 | }, 9 | }; -------------------------------------------------------------------------------- /src/typings.ts: -------------------------------------------------------------------------------- 1 | export interface Product { 2 | id: string; 3 | name: string; 4 | size?: string; 5 | currentPrice: number; 6 | lastUpdated: Date; 7 | lastChecked: Date; 8 | priceHistory: DatedPrice[]; 9 | sourceSite: string; 10 | category: string[]; 11 | unitPrice?: number; 12 | unitName?: string; 13 | originalUnitQuantity?: number; 14 | } 15 | 16 | export interface DatedPrice { 17 | date: Date; 18 | price: number; 19 | } 20 | 21 | export interface ProductResponse { 22 | upsertType: UpsertResponse; 23 | product: Product; 24 | } 25 | 26 | export 
interface CategorisedUrl { 27 | url: string; 28 | categories: string[]; 29 | } 30 | 31 | export const enum UpsertResponse { 32 | NewProduct, 33 | PriceChanged, 34 | InfoChanged, 35 | AlreadyUpToDate, 36 | Failed, 37 | } 38 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cd-scraper", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index", 6 | "scripts": { 7 | "test": "npx jest", 8 | "dev": "npx esrun src/index.ts", 9 | "db": "npx esrun src/index.ts db", 10 | "db images": "npx esrun src/index.ts db images" 11 | }, 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@azure/cosmos": "latest", 16 | "cheerio": "^1.0.0-rc.12", 17 | "dotenv": "latest", 18 | "lodash": "^4.17.21", 19 | "playwright": "^1.44.0" 20 | }, 21 | "devDependencies": { 22 | "@types/jest": "^29.5.12", 23 | "@types/lodash": "^4.17.4", 24 | "@types/node": "^20.12.12", 25 | "esrun": "^3.2.26", 26 | "jest": "^29.7.0", 27 | "ts-jest": "^29.1.2", 28 | "typescript": "^5.4.5" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/utilities.test.ts: -------------------------------------------------------------------------------- 1 | import 'jest'; 2 | // jest.useFakeTimers(); 3 | import * as cheerio from 'cheerio'; 4 | import { playwrightElementToProduct } from '../src/index'; 5 | import { CategorisedUrl, Product } from '../src/typings'; 6 | import { addUnitPriceToProduct } from '../src/utilities'; 7 | 8 | // Sample input 9 | const html = ` 10 |

Product Name fresh fruit

11 |
12 |

Large

13 | `; 14 | 15 | // Sample product 16 | const juiceProduct: Product = { 17 | id: '12345', 18 | name: 'Orange Juice', 19 | size: '250ml', 20 | currentPrice: 4, 21 | lastUpdated: new Date('01-20-2023'), 22 | lastChecked: new Date('01-20-2023'), 23 | priceHistory: [], 24 | sourceSite: 'countdown.co.nz', 25 | category: ['juice'], 26 | }; 27 | 28 | const $ = cheerio.load(html); 29 | const productEntries = $('cdx-card a.product-entry'); 30 | 31 | describe('scraping', () => { 32 | // it('extract normal product titles', async () => { 33 | // const result = playwrightElementToProduct(productEntries[0], ['test']); 34 | // expect(result!.name).toBe('yes'); 35 | // }); 36 | 37 | it('per unit price is derived from quantity and size', async () => { 38 | const result = addUnitPriceToProduct(juiceProduct); 39 | expect(result.unitName).toBe('L'); 40 | expect(result.unitPrice).toBe(16); 41 | expect(result.originalUnitQuantity).toBe(250); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Woolworths Scraper 2 | 3 | This project scrapes product info from Woolworths (Formerly Countdown) NZ website and optionally places the data into Azure CosmosDB. 4 | 5 | A history of price changes is stored within each product's database entry. 6 | 7 | Image files can also be scraped and placed into an Azure Storage Blob Container. 8 | 9 | ## Basic Setup 10 | 11 | With `NodeJS` installed, clone this repository, then run `npm install` to install dependencies. 12 | 13 | Playwright must also be installed when running for the first time with `npx playwright install`. 14 | 15 | The program can now be tested in dry run mode without any further setup using `npm run dev`. 16 | 17 | ## Optional Setup 18 | 19 | The `.env` file has variables that can be filled for more functionality. 
20 | 21 | ```js 22 | STORE_NAME= Optional supermarket location name 23 | COSMOS_CONSTRING= Read-write CosmosDB connection string 24 | COSMOS_DB_NAME= CosmosDB Name 25 | COSMOS_CONTAINER= CosmosDB Container Name, e.g. products 26 | COSMOS_PARTITION_KEY= CosmosDB Partition Key, e.g. /name 27 | IMAGE_UPLOAD_FUNC_URL= Optional image upload REST API URL 28 | ``` 29 | 30 | - The CosmosDB read-write connection string can be obtained from the `Azure Portal > CosmosDB > Settings > Keys`. 31 | - A list of URLs to scrape can be put in file `urls.txt`, with one URL per line. 32 | 33 | ## Usage 34 | 35 | `npm run dev` - will use dry-run mode; no Azure connection is required and the results are logged to the console. 36 | 37 | `npm run db` - will scrape through the URLs and store the results into CosmosDB. 38 | 39 | `npm run db https://sampleurl` - a single URL can be used as an argument. This will be scraped instead of the URLs text file. 40 | 41 | ## Other Command-Line Arguments 42 | 43 | `images` - will also upload images. 44 | 45 | `headed` - will run the browser in a visible window instead of headless mode. 46 | 47 | ## Output 48 | 49 | Sample log output when running in dry run mode: 50 | 51 | ```cmd 52 | ID | Name | Size | Price | Unit Price 53 | ---------------------------------------------------------------------------------- 54 | 762844 | Ocean Blue Smoked Salmon Slices | 100g | $ 9 | $90 /kg 55 | 697201 | Clearly Premium Smoked Salmon | 200g | $ 13.5 | $67.5 /kg 56 | 830035 | Ocean Blue Smoked Salmon Slices | 180g | $ 12 | $67.7 /kg 57 | ``` 58 | 59 | This is a sample of a single product stored in CosmosDB. 
/*
(readme.md, continued)

It was re-run at multiple dates to store changing prices:

```json
{
  "id": "123456",
  "name": "Sausages Precooked Chinese Honey",
  "currentPrice": 12.9,
  "size": "Prepacked 1kg pack",
  "priceHistory": [
    {
      "date": "Sat Jan 14 2023",
      "price": 10
    },
    {
      "date": "Thu Jan 26 2023",
      "price": 12.9
    }
  ]
}
```
*/

// --------------------------------------------------------------------------------
// /src/product-overrides.ts:
// --------------------------------------------------------------------------------

// This file is for manually overriding product size and category data.
// Is used for products that do not have listed sizes to be scraped, or have incorrect categories.

// Shape of a single manual override. `id` is the product id as a string;
// `size` and/or `category` hold the replacement value(s) for that product.
export interface ProductOverride {
  id: string;
  size?: string;
  category?: string;
}

// Every id must be digits only. Entries that carried a stray trailing period
// ('95091.', '70941.', '69865.') were removed: the same ids are already listed
// correctly with the same category, so the dotted ones were unreachable duplicates.
export const productOverrides: ProductOverride[] = [
  { id: '206889', size: '180g' },
  { id: '196996', size: '300g' },
  { id: '137967', size: '420g' },
  { id: '125856', size: '450g' },
  { id: '189268', size: '1.13kg' },
  { id: '189150', size: '1.2kg' },
  { id: '190454', size: '2.1kg' },
  { id: '189078', size: '1.3kg' },
  { id: '189136', size: '1.2kg' },
  { id: '755237', size: '931g' },
  { id: '755304', size: '1.1kg' },
  { id: '755246', size: '1020g' },
  { id: '755245', size: '1.2kg' },
  { id: '112273', size: '865ml' },
  { id: '269514', size: '584ml' },
  { id: '269515', size: '584ml' },
  { id: '116518', size: '440ml' },
  { id: '151191', size: '570ml' },
  { id: '279904', size: '575ml' },
  { id: '146149', size: '1000ml' },
  { id: '791925', size: '525g' },
  { id: '774216', size: '525g' },
  { id: '784406', size: '525g' },
  { id: '791916', size: '525g' },
  { id: '306624', size: '185g' },
  { id: '156824', size: '180g' },
  { id: '9023', size: '375g' },
  { id: '266962', category: 'sweets-lollies' },
  { id: '171524', size: '230ml', category: 'baking' },
  { id: '170021', category: 'ice-blocks' },
  { id: '71164', category: 'sausages' },
  { id: '71174', category: 'sausages' },
  { id: '71168', category: 'sausages' },
  { id: '71165', category: 'sausages' },
  { id: '331560', category: 'specialty-bread' },
  { id: '679412', category: 'herbal-tea' },
  { id: '267492', category: 'herbal-tea' },
  { id: '267485', category: 'herbal-tea' },
  { id: '413302', category: 'herbal-tea' },
  { id: '267488', category: 'herbal-tea' },
  { id: '760872', category: 'herbal-tea' },
  { id: '681177', category: 'herbal-tea' },
  { id: '95091', category: 'herbal-tea' },
  { id: '761093', category: 'black-tea' },
  { id: '721661', category: 'green-tea' },
  { id: '790129', category: 'herbal-tea' },
  { id: '721034', category: 'herbal-tea' },
  { id: '184090', category: 'herbal-tea' },
  { id: '690093', category: 'green-tea' },
  { id: '780922', category: 'sauces' },
  { id: '780921', category: 'sauces' },
  { id: '72618', category: 'black-tea' },
  { id: '6053', category: 'black-tea' },
  { id: '72617', category: 'black-tea' },
  { id: '168068', category: 'black-tea' },
  { id: '6052', category: 'black-tea' },
  { id: '761436', category: 'black-tea' },
  { id: '14133', size: '390g' },
  { id: '970886', category: 'bakery-desserts' },
  { id: '775131', category: 'sweets-lollies' },
  { id: '69336', category: 'sweets-lollies' },
  { id: '790537', category: 'sweets-lollies' },
  { id: '746240', category: 'sausages' },
  { id: '351192', category: 'sausages' },
  { id: '246719', category: 'sausages' },
  { id: '282184', category: 'sausages' },
  { id: '746273', category: 'sausages' },
  { id: '282181', category: 'sausages' },
  { id: '905772', category: 'sausages' },
  { id: '905792', category: 'sausages' },
  { id: '905773', category: 'sausages' },
  { id: '905764', category: 'sausages' },
  { id: '290802', category: 'sausages' },
  { id: '563985', category: 'sausages' },
  { id: '757239', category: 'sausages' },
  { id: '761522', category: 'sausages' },
  { id: '290811', category: 'sausages' },
  { id: '290796', category: 'sausages' },
  { id: '290735', category: 'sausages' },
  { id: '761618', category: 'sausages' },
  { id: '290815', category: 'sausages' },
  { id: '681002', size: '480g' },
  { id: '681001', size: '552g' },
  { id: '681005', size: '552g' },
  { id: '285555', size: '360g' },
  { id: '822190', size: '350g' },
  { id: '680890', size: '315g' },
  { id: '681003', size: '315g' },
  { id: '822078', size: '300g' },
  { id: '822079', size: '350g' },
  { id: '391911', size: '325g' },
  { id: '917977', category: 'sauces' },
  { id: '918032', category: 'sauces' },
  { id: '279219', category: 'pizza' },
  { id: '279114', category: 'pizza' },
  { id: '279218', category: 'pizza' },
  { id: '324079', category: 'pizza' },
  { id: '279059', category: 'pizza' },
  { id: '782359', category: 'pizza' },
  { id: '63801', category: 'pizza' },
  { id: '6045946', category: 'patties-meatballs' },
  { id: '70941', category: 'sausages' },
  { id: '69865', category: 'sausages' },
  { id: '116285', category: 'sausages' },
  { id: '319272', category: 'sausages' },
];

// --------------------------------------------------------------------------------
// /src/utilities.ts:
// --------------------------------------------------------------------------------

import type { Product } from './typings';
import { readFileSync } from 'fs';

// Set widths for table log output
const tableIDWidth = 7;
const tableNameWidth = 60;
const tableSizeWidth = 17;

// ANSI escape sequences used to colour console output
export const colour = {
  red: '\x1b[31m',
  green: '\x1b[32m',
  yellow: '\x1b[33m',
  blue: '\x1b[38;5;117m',
  magenta: '\x1b[35m',
  cyan: '\x1b[36m',
  white: '\x1b[37m',
  crimson: '\x1b[38m',
  grey: '\x1b[90m',
  orange: '\x1b[38;5;214m',
  sky: '\x1b[38;5;153m',
};

// log()
// -----
// Console log with specified colour.
// `colour` is an escape sequence (see the `colour` table above); `text` is
// substituted for the %s placeholder and the colour is reset afterwards.

export function log(colour: string, text: string): void {
  const clear = '\x1b[0m';
  console.log(`${colour}%s${clear}`, text);
}

// logError()
// ----------
// Shorthand function for logging with red colour

export function logError(text: string): void {
  log(colour.red, text);
}

// logProductRow()
// ---------------
// Log a single product in one row, using alternating colours for readability.
// Columns: id | name | size | price | unit price. Name and size are truncated
// and padded to their fixed widths. A product without a size gets a blank,
// fully padded size column (previously the text "undefined" was printed and
// the following columns were misaligned). The unit price column is only
// filled in when `unitPrice` is truthy.

export function logProductRow(product: Product): void {
  const unitPriceString = product.unitPrice
    ? `$${product.unitPrice.toFixed(2)} /${product.unitName}`
    : ``;
  const idColumn = product.id.padStart(tableIDWidth);
  const nameColumn = product.name.slice(0, tableNameWidth).padEnd(tableNameWidth);
  // `size` is optional on Product, so fall back to an empty string before padding
  const sizeColumn = (product.size ?? '').slice(0, tableSizeWidth).padEnd(tableSizeWidth);
  const priceColumn = product.currentPrice.toFixed(2).padStart(4).padEnd(5);

  log(
    // getAlternatingRowColour() is a hoisted function declared further down in utilities.ts
    getAlternatingRowColour(colour.sky, colour.white),
    `${idColumn} | ` + `${nameColumn} | ` + `${sizeColumn} | ` + `$ ${priceColumn} | ` + unitPriceString
  );
}

// logTableHeader()
// ----------------
// Log the yellow column titles followed by a dashed rule. The rule is as wide
// as the title line itself, so it stays in step if a column width changes.

export function logTableHeader(): void {
  const headerText =
    `${'ID'.padStart(tableIDWidth)} | ${'Name'.padEnd(tableNameWidth)} | ` +
    `${'Size'.padEnd(tableSizeWidth)} | ` +
    `${'Price'.padEnd(7)} | Unit Price`;

  log(colour.yellow, headerText);
  log(colour.yellow, '-'.repeat(headerText.length));
}

// getAlternatingRowColour()
// -------------------------
// Takes 2 colours and flip-flops between them on each function call.
// Is used for printing tables with better readability.
// Module-level toggle remembered between calls to getAlternatingRowColour()
let alternatingRowColour = false;
function getAlternatingRowColour(colourA: string, colourB: string): string {
  // Flip first, then pick: the very first call returns colourA
  alternatingRowColour = !alternatingRowColour;
  return alternatingRowColour ? colourA : colourB;
}

// readLinesFromTextFile()
// -----------------------
// Read from local text file containing one url per line, return as string array.
// Lines are split on \n or \r\n; empty and whitespace-only lines are dropped,
// all other lines are returned untrimmed.
// Throws an Error with message 'Error reading <filename>' if the file cannot be
// read; the underlying file-system error is attached as `cause`.

export function readLinesFromTextFile(filename: string): string[] {
  let fileContents: string;
  try {
    fileContents = readFileSync(filename, 'utf-8');
  } catch (error) {
    // Throw a real Error (keeps a stack trace) instead of a bare string,
    // and keep the original failure reachable for debugging.
    const readError: Error & { cause?: unknown } = new Error('Error reading ' + filename);
    readError.cause = error;
    throw readError;
  }
  return fileContents.split(/\r?\n/).filter((line) => line.trim().length > 0);
}

// getTimeElapsedSince()
// ---------------------
// Get time difference in between startTime and now. Returns in 58s or 12:32 format.
// `startTime` is a millisecond timestamp as produced by Date.now().

export function getTimeElapsedSince(startTime: number): string {
  const elapsedTimeSeconds = (Date.now() - startTime) / 1000;

  // If over 60 secs, print as 1:23 format
  if (elapsedTimeSeconds >= 60) {
    const minutes = Math.floor(elapsedTimeSeconds / 60);
    const seconds = Math.floor(elapsedTimeSeconds % 60)
      .toString()
      .padStart(2, '0');
    return minutes + ':' + seconds;
  }

  // Else print in 40s format
  return Math.floor(elapsedTimeSeconds).toString() + 's';
}

// List of valid category names that scraped products should be put in
export const validCategories: string[] = [
  // freshCategory
  'eggs',
  'fruit',
  'fresh-vegetables',
  'salads-coleslaw',
  'bread',
  'bread-rolls',
  'specialty-bread',
  'bakery-cakes',
  'bakery-desserts',
  // chilledCategory
  'milk',
  'long-life-milk',
  'sour-cream',
  'cream',
  'yoghurt',
  'butter',
  'cheese',
  'cheese-slices',
  'salami',
  'other-deli-foods',
  // meatCategory
  'beef-lamb',
  'chicken',
  'ham',
  'bacon',
  'pork',
  'patties-meatballs',
  'sausages',
  'deli-meats',
  'meat-alternatives',
  'seafood',
  'salmon',
  // frozenCategory
  'ice-cream',
  'ice-blocks',
  'pastries-cheesecake',
  'frozen-chips',
  'frozen-vegetables',
  'frozen-fruit',
  'frozen-seafood',
  'pies-sausage-rolls',
  'pizza',
  'other-savouries',
  // pantryCategory
  'rice',
  'noodles',
  'pasta',
  'beans-spaghetti',
  'canned-fish',
  'canned-meat',
  'soup',
  'cereal',
  'spreads',
  'baking',
  'sauces',
  'oils-vinegars',
  'world-foods',
  // snacksCategory
  'chocolate',
  'boxed-chocolate',
  'chips',
  'crackers',
  'biscuits',
  'muesli-bars',
  'nuts-bulk-mix',
  'sweets-lollies',
  'other-snacks',
  // drinksCategory
  'black-tea',
  'green-tea',
  'herbal-tea',
  'drinking-chocolate',
  'coffee',
  'soft-drinks',
  'energy-drinks',
  'juice',
  // petsCategory
  'cat-food',
  'cat-treats',
  'dog-food',
  'dog-treats',
];

// toTitleCase()
// -------------
// Convert a string to title case: each run of non-whitespace that starts with
// a word character gets an upper-case first letter and a lower-case remainder.

export function toTitleCase(str: string): string {
  return str.replace(
    /\w\S*/g,
    (word) => word.charAt(0).toUpperCase() + word.substring(1).toLowerCase()
  );
}

// --------------------------------------------------------------------------------
// /src/urls.txt:
// --------------------------------------------------------------------------------
// # List of all URLs to be scraped (based off original WW categories as of Oct-2025)
// # --------------------------------------------------------------------------------
// # Not all products are scraped.
Only popular categories are scraped, to save time. 4 | # add pages=2 or similar to scrape more pages per category. 5 | # add categories=name to replace the category if storing to a database. 6 | 7 | # Fruit & Veg 8 | woolworths.co.nz/shop/browse/fruit-veg/fruit categories=fruit pages=2 9 | woolworths.co.nz/shop/browse/fruit-veg/prepared-fruit-veg categories=salads-coleslaw 10 | woolworths.co.nz/shop/browse/fruit-veg/vegetables categories=fresh-vegetables pages=4 11 | woolworths.co.nz/shop/browse/fruit-veg/fresh-salad-herbs categories=fresh-vegetables pages=2 12 | woolworths.co.nz/shop/browse/fruit-veg/fresh-salad-herbs/slaws-salad-kits categories=salads-coleslaw 13 | 14 | # Meat & Poultry 15 | woolworths.co.nz/shop/browse/meat-poultry/beef categories=beef-lamb pages=2 16 | woolworths.co.nz/shop/browse/meat-poultry/lamb categories=beef-lamb 17 | woolworths.co.nz/shop/browse/meat-poultry/chicken-poultry categories=chicken pages=2 18 | woolworths.co.nz/shop/browse/meat-poultry/pork 19 | woolworths.co.nz/shop/browse/meat-poultry/mince-patties categories=patties-meatballs 20 | woolworths.co.nz/shop/browse/meat-poultry/sausages 21 | woolworths.co.nz/shop/browse/meat-poultry/plant-based-alternatives categories=meat-alternatives 22 | 23 | # Fish & Seafood 24 | woolworths.co.nz/shop/browse/fish-seafood categories=seafood 25 | woolworths.co.nz/shop/browse/fish-seafood/salmon categories=salmon 26 | 27 | # Fridge & Deli 28 | woolworths.co.nz/shop/browse/fridge-deli/milk pages=3 29 | woolworths.co.nz/shop/browse/fridge-deli/cream-custard categories=cream 30 | woolworths.co.nz/shop/browse/fridge-deli/milk/long-life-milk categories=long-life-milk 31 | woolworths.co.nz/shop/browse/fridge-deli/eggs-butter-spreads/butter categories=butter pages=2 32 | woolworths.co.nz/shop/browse/fridge-deli/eggs-butter-spreads/margarine-spreads categories=butter 33 | woolworths.co.nz/shop/browse/fridge-deli/cheese categories=cheese pages=2 34 | woolworths.co.nz/shop/browse/fridge-deli/cheese/block-cheese 
categories=cheese 35 | woolworths.co.nz/shop/browse/fridge-deli/pasta-pizza-pastry/pizza-bases categories=pizza 36 | woolworths.co.nz/shop/browse/fridge-deli/yoghurt-desserts categories=yoghurt pages=2 37 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/ham-shaved-meat categories=ham 38 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/bacon categories=bacon 39 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/salami-cured-dried-meats categories=salami 40 | woolworths.co.nz/shop/browse/fridge-deli/deli-meats-seafood/cooked-meats categories=ham 41 | woolworths.co.nz/shop/browse/fridge-deli/juice-drinks categories=juice 42 | woolworths.co.nz/shop/browse/fridge-deli/vegan-vegetarian categories=other-deli-foods 43 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/heat-and-eat-meals categories=other-deli-foods 44 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/soup-risotto categories=other-deli-foods 45 | woolworths.co.nz/shop/browse/fridge-deli/prepared-meals-sides/pies-quiche categories=pies-sausage-rolls 46 | woolworths.co.nz/shop/browse/fridge-deli/dips-hummus-nibbles categories=other-deli-foods 47 | 48 | # Bakery 49 | woolworths.co.nz/shop/browse/bakery/baked-in-store/loaves-garlic-savoury-bread categories=bread-rolls 50 | woolworths.co.nz/shop/browse/bakery/buns-rolls-bread-sticks categories=bread-rolls 51 | woolworths.co.nz/shop/browse/bakery/sliced-packaged-bread categories=bread pages=2 52 | woolworths.co.nz/shop/browse/bakery/wraps-pita-pizza-bases categories=specialty-bread 53 | woolworths.co.nz/shop/browse/bakery/bagels-crumpets-pancakes categories=specialty-bread 54 | woolworths.co.nz/shop/browse/bakery/pastries-croissants-biscuits categories=bakery-desserts 55 | woolworths.co.nz/shop/browse/bakery/cakes-muffins-desserts categories=bakery-desserts 56 | woolworths.co.nz/shop/browse/bakery/cakes-muffins-desserts/birthday-celebration-cakes categories=bakery-cakes 57 | 58 | # Frozen 59 | 
woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs categories=ice-cream pages=3 60 | woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/single-serve-multipacks categories=ice-cream pages=2 61 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/frozen-pies-sausage-rolls-hot-dogs categories=pies-sausage-rolls pages=2 62 | woolworths.co.nz/shop/browse/frozen/pizza-pastry-bread/frozen-pizza categories=pizza 63 | woolworths.co.nz/shop/browse/frozen/pizza-pastry-bread/frozen-pastry categories=pastries-cheesecake 64 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/spring-rolls-toppers-savouries categories=other-savouries 65 | woolworths.co.nz/shop/browse/frozen/frozen-meals-snacks/dumplings-wontons-steam-buns categories=other-savouries 66 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/other-frozen-vegetables categories=frozen-vegetables 67 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/frozen-peas-corn-carrots categories=frozen-vegetables 68 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/mixed-vegetables-stir-fry categories=frozen-vegetables 69 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/chips-wedges-potatoes categories=frozen-chips 70 | woolworths.co.nz/shop/browse/frozen/frozen-vegetables/hash-browns-rosti categories=other-savouries 71 | woolworths.co.nz/shop/browse/frozen/frozen-meat/frozen-chicken-poultry categories=chicken 72 | woolworths.co.nz/shop/browse/frozen/frozen-meat/frozen-burgers categories=patties-meatballs 73 | woolworths.co.nz/shop/browse/frozen/frozen-meat-alternatives categories=other-savouries 74 | woolworths.co.nz/shop/browse/frozen/frozen-seafood 75 | woolworths.co.nz/shop/browse/frozen/frozen-fruit-drink categories=frozen-fruit 76 | 77 | # Pantry 78 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/popcorn-nuts-savoury-snacks categories=other-snacks pages=2 79 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/corn-chips-salsa categories=chips 80 | 
woolworths.co.nz/shop/browse/pantry/snacks-sweets/chips categories=chips pages=3 81 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/muesli-bars-snack-bars categories=muesli-bars pages=2 82 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/chocolate-bars-blocks categories=chocolate pages=4 83 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/chocolate-boxes-gifts categories=chocolate pages=4 84 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/sweets-lollies-licorice categories=sweets-lollies pages=2 85 | woolworths.co.nz/shop/browse/pantry/snacks-sweets/gums-mints categories=sweets-lollies 86 | woolworths.co.nz/shop/browse/pantry/eggs 87 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/biscuits-cookies categories=biscuits pages=4 88 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/crackers categories=crackers pages=2 89 | woolworths.co.nz/shop/browse/pantry/biscuits-crackers/rice-cakes-crispbread categories=crackers 90 | woolworths.co.nz/shop/browse/pantry/bulk-foods/nuts-seeds categories=nuts-bulk-mix 91 | woolworths.co.nz/shop/browse/pantry/bulk-foods/dried-fruit-mixes categories=nuts-bulk-mix 92 | woolworths.co.nz/shop/browse/pantry/baking categories=baking pages=3 93 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/rice 94 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/noodles pages=2 95 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/dried-pasta categories=pasta 96 | woolworths.co.nz/shop/browse/pantry/pasta-noodles-grains/pasta-meals-sides categories=pasta 97 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/baked-beans-spaghetti categories=beans-spaghetti 98 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-tuna-seafood categories=canned-fish 99 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-meat categories=canned-meat 100 | woolworths.co.nz/shop/browse/pantry/tinned-foods-packets/tinned-soup-soup-mix categories=soup 101 | 
woolworths.co.nz/shop/browse/pantry/cereals-spreads/cereal pages=2 102 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/nut-butter categories=spreads 103 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/honey categories=spreads 104 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/jam categories=spreads 105 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/other-spreads categories=spreads 106 | woolworths.co.nz/shop/browse/pantry/cereals-spreads/muesli-oats categories=cereal 107 | woolworths.co.nz/shop/browse/pantry/sauces-pastes categories=sauces 108 | 109 | # Beer & Wine 110 | 111 | # Drinks 112 | woolworths.co.nz/shop/browse/drinks/coffee pages=3 113 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/black-breakfast-tea categories=black-tea pages=2 114 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/green-tea categories=green-tea 115 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/herbal-fruit-teas categories=herbal-tea pages=2 116 | woolworths.co.nz/shop/browse/drinks/tea-milk-drinks/drinking-chocolate-malt categories=drinking-chocolate 117 | woolworths.co.nz/shop/browse/drinks/juice-cordial categories=juice pages=3 118 | woolworths.co.nz/shop/browse/drinks/soft-drinks-sports-drinks/soft-drinks pages=2 119 | woolworths.co.nz/shop/browse/drinks/soft-drinks-sports-drinks/energy-drinks pages=2 120 | 121 | # Health & Body 122 | 123 | # Household 124 | 125 | # Baby & Child 126 | 127 | # Pet 128 | woolworths.co.nz/shop/browse/pet/cats/dry-cat-food categories=cat-food pages=2 129 | woolworths.co.nz/shop/browse/pet/cats/wet-cat-food categories=cat-food pages=3 130 | woolworths.co.nz/shop/browse/pet/cats/cat-milk-treats categories=cat-treats 131 | woolworths.co.nz/shop/browse/pet/dogs/dog-chews-bones-treats categories=dog-treats 132 | woolworths.co.nz/shop/browse/pet/dogs/dry-dog-food categories=dog-food 133 | woolworths.co.nz/shop/browse/pet/dogs/wet-dog-food categories=dog-food 134 | 
woolworths.co.nz/shop/browse/pet/dogs/chilled-or-frozen-dog-food categories=dog-food
135 |
--------------------------------------------------------------------------------
/src/cosmosdb.ts:
--------------------------------------------------------------------------------
// Used by index.ts for creating and accessing items stored in Azure CosmosDB

import * as dotenv from "dotenv";
dotenv.config();
dotenv.config({ path: `.env.local`, override: true });

import { CosmosClient, Container, Database, FeedOptions, SqlQuerySpec } from "@azure/cosmos";
import { logError, log, colour, validCategories } from "./utilities";
import { Product, UpsertResponse, ProductResponse } from "./typings";

let cosmosClient: CosmosClient;
let database: Database;
let container: Container;

// establishCosmosDB()
// -------------------
// Creates the CosmosDB client, database and container handles used by every
// other function in this file. Reads COSMOS_CONSTRING, COSMOS_DB_NAME,
// COSMOS_CONTAINER and COSMOS_PARTITION_KEY from the environment.
// Throws an Error if any of them is missing or the connection cannot be made.

export async function establishCosmosDB() {
  const { COSMOS_CONSTRING, COSMOS_DB_NAME, COSMOS_CONTAINER, COSMOS_PARTITION_KEY } =
    process.env;

  // Get CosmosDB connection string stored in .env
  if (!COSMOS_CONSTRING) {
    throw Error(
      "CosmosDB connection string COSMOS_CONSTRING not found in .env"
    );
  }

  // Fail fast on the remaining settings - previously a missing partition key
  // was silently turned into the path "/undefined"
  if (!COSMOS_DB_NAME) throw Error("CosmosDB database name COSMOS_DB_NAME not found in .env");
  if (!COSMOS_CONTAINER) throw Error("CosmosDB container name COSMOS_CONTAINER not found in .env");
  if (!COSMOS_PARTITION_KEY) {
    throw Error("CosmosDB partition key COSMOS_PARTITION_KEY not found in .env");
  }

  // Ensure partition key is in correct format, it must start with a slash
  const validatedPartitionKey = COSMOS_PARTITION_KEY.startsWith("/")
    ? COSMOS_PARTITION_KEY
    : "/" + COSMOS_PARTITION_KEY;

  // Establish CosmosDB Client, Database, Container
  try {
    cosmosClient = new CosmosClient(COSMOS_CONSTRING);

    const databaseResponse = await cosmosClient.databases.createIfNotExists({
      id: COSMOS_DB_NAME,
    });
    database = databaseResponse.database;

    const containerResponse = await database.containers.createIfNotExists({
      id: COSMOS_CONTAINER,
      partitionKey: { paths: [validatedPartitionKey] },
    });
    container = containerResponse.container;
  } catch (error) {
    throw Error(error + "\n\nInvalid CosmosDB connection - check for valid connection string");
  }
}

// upsertProductToCosmosDB()
// -------------------------
// Inserts or updates a product object to CosmosDB,
// returns an UpsertResponse based on if and how the Product was updated.
// Never throws: any failure is logged and reported as UpsertResponse.Failed.

export async function upsertProductToCosmosDB(
  scrapedProduct: Product
): Promise<UpsertResponse> {
  try {
    // Check CosmosDB for any existing item using id and name as the partition key
    const cosmosResponse = await container
      .item(scrapedProduct.id as string, scrapedProduct.name)
      .read();

    switch (cosmosResponse.statusCode) {
      // An existing item was found, check for updated values before uploading
      case 200: {
        const dbProduct = cosmosResponse.resource as Product;
        const response = buildUpdatedProduct(scrapedProduct, dbProduct);

        // Send updated product to CosmosDB
        await container.items.upsert(response.product);
        return response.upsertType;
      }

      // Product with this ID and exact name doesn't yet exist in CosmosDB
      case 404: {
        // First check if there is an existing product with the same ID
        // but a different name (partition key)
        const querySpec: SqlQuerySpec = {
          query: `SELECT * FROM products p WHERE p.id = @id`,
          parameters: [{ name: "@id", value: scrapedProduct.id }],
        };
        const { resources } = await container.items.query(querySpec).fetchAll();

        if (resources.length > 0) {
          // An existing ID was found, update the DB with the new name
          const dbProduct = resources[0] as Product;
          const response = buildUpdatedProduct(scrapedProduct, dbProduct);
          response.product.name = scrapedProduct.name;

          // NOTE(review): name appears to be the partition key, so this upsert
          // presumably writes a new item and leaves the old-named item in
          // place - confirm whether the old item should be deleted.
          await container.items.upsert(response.product);
          return response.upsertType;
        }

        // If no existing ID was found, create a new product
        await container.items.create(scrapedProduct);

        console.log(
          ` New Product: ${scrapedProduct.name.slice(0, 47).padEnd(47)}` +
          ` | $ ${scrapedProduct.currentPrice}`
        );
        return UpsertResponse.NewProduct;
      }

      // Manage any failed cosmos updates
      case 409:
        logError(`Conflicting ID found for product ${scrapedProduct.name}`);
        return UpsertResponse.Failed;

      // Any status code other than 200, 404 or 409
      default:
        logError(`CosmosDB returned status code: ${cosmosResponse.statusCode}`);
        return UpsertResponse.Failed;
    }
  } catch (e: any) {
    logError(e);
    return UpsertResponse.Failed;
  }
}

// toIsoDay()
// ----------
// Returns the first 10 characters (yyyy-mm-dd) of a date. Dates read back
// from CosmosDB arrive as ISO strings, freshly scraped ones are Date objects.

function toIsoDay(value: Date | string): string {
  const asText = value instanceof Date ? value.toISOString() : String(value);
  return asText.slice(0, 10);
}

// buildUpdatedProduct()
// ---------------------
// This takes a freshly scraped product and compares it with a found database product.
// It returns an updated product with data from both product versions,
// together with the kind of change that was detected.
// Note that both arguments may be mutated.

function buildUpdatedProduct(
  scrapedProduct: Product,
  dbProduct: Product
): ProductResponse {
  const dbDay = toIsoDay(dbProduct.lastUpdated);
  const scrapedDay = toIsoDay(scrapedProduct.lastUpdated);

  // Measure the price difference between the new scraped product and the old db product
  const priceDifference = Math.abs(
    dbProduct.currentPrice - scrapedProduct.currentPrice
  );

  // If price has changed by more than $0.05, and not on the same day
  if (priceDifference > 0.05 && dbDay != scrapedDay) {
    // Push scraped priceHistory into existing priceHistory array,
    // then let the scraped product carry the combined history
    dbProduct.priceHistory.push(scrapedProduct.priceHistory[0]);
    scrapedProduct.priceHistory = dbProduct.priceHistory;

    logPriceChange(dbProduct, scrapedProduct.currentPrice);
    return {
      upsertType: UpsertResponse.PriceChanged,
      product: scrapedProduct,
    };
  }

  // If db categories are missing, or any of them is not included within the
  // list of valid ones, update to the scraped ones.
  // The null check must come first, otherwise .every() throws on a null value.
  const dbCategories: string[] | null | undefined = dbProduct.category;
  const hasInvalidCategories =
    dbCategories == null ||
    !dbCategories.every((category) => validCategories.includes(category));

  if (hasInvalidCategories) {
    console.log(
      ` Categories Changed: ${scrapedProduct.name.padEnd(40).substring(0, 40)}` +
      ` - ${(dbCategories ?? []).join(" ")} > ${scrapedProduct.category.join(" ")}`
    );
  }

  // Has any other stored info drifted from what was scraped?
  const infoChanged =
    hasInvalidCategories ||
    dbProduct.sourceSite !== scrapedProduct.sourceSite ||
    dbProduct.category.join(" ") !== scrapedProduct.category.join(" ") ||
    dbProduct.size !== scrapedProduct.size ||
    dbProduct.unitPrice !== scrapedProduct.unitPrice ||
    dbProduct.unitName !== scrapedProduct.unitName ||
    dbProduct.originalUnitQuantity !== scrapedProduct.originalUnitQuantity;

  if (infoChanged) {
    // Update everything but priceHistory and lastUpdated
    scrapedProduct.priceHistory = dbProduct.priceHistory;
    scrapedProduct.lastUpdated = dbProduct.lastUpdated;

    return {
      upsertType: UpsertResponse.InfoChanged,
      product: scrapedProduct,
    };
  }

  // Nothing has changed, only update lastChecked
  dbProduct.lastChecked = scrapedProduct.lastChecked;
  return {
    upsertType: UpsertResponse.AlreadyUpToDate,
    product: dbProduct,
  };
}

// logPriceChange()
// ----------------
// Log a per product price change message,
// coloured green for price reduction, red for price increase.
// product holds the old price, newPrice is the freshly scraped one.

export function logPriceChange(product: Product, newPrice: number) {
  const priceIncreased = newPrice > product.currentPrice;
  log(
    priceIncreased ? colour.red : colour.green,
    " Price " +
    (priceIncreased ? "Up : " : "Down : ") +
    product.name.slice(0, 47).padEnd(47) +
    " | $" +
    product.currentPrice.toString().padStart(4) +
    " > $" +
    newPrice
  );
}

// customQuery()
// -------------
// Function for running custom DB queries - used primarily for debugging.
// Currently walks every product in batches and removes price history entries
// that differ from the previous entry by less than $0.04.

export async function customQuery(): Promise<void> {
  const itemsPerBatch = 30;
  const options: FeedOptions = {
    maxItemCount: itemsPerBatch,
  };
  const secondsDelayBetweenBatches = 5;
  const maxBatchCount = 900;
  const querySpec: SqlQuerySpec = {
    query: "SELECT * FROM products p",
  };

  log(colour.yellow, "Custom Query \n" + querySpec.query);

  const response = container.items.query(querySpec, options);

  let batchCount = 0;
  while (response.hasMoreResults()) {
    // Pause before every batch to keep the request rate low
    await new Promise<void>((resolve) =>
      setTimeout(resolve, secondsDelayBetweenBatches * 1000)
    );

    console.log(
      "Batch " +
      batchCount +
      " - Items [" +
      batchCount * itemsPerBatch +
      " - " +
      (batchCount + 1) * itemsPerBatch +
      "]"
    );

    const batch = await response.fetchNext();
    const products = (batch.resources ?? []) as Product[];

    // Process sequentially so every upsert is awaited before the next batch
    for (const p of products) {
      await removeTinyPriceChanges(p);
    }

    // Stop after the batch numbered maxBatchCount has been processed
    if (batchCount++ === maxBatchCount) break;
  }

  console.log("Custom Query Complete");
}

// removeTinyPriceChanges()
// ------------------------
// Helper for customQuery(). Drops price history entries that differ from the
// preceding entry by less than $0.04, and uploads the product if any were dropped.

async function removeTinyPriceChanges(p: Product): Promise<void> {
  let oldDatedPrice = 0;
  let requiresUpdate = false;

  p.priceHistory.forEach((datedPrice) => {
    const newDatedPrice = datedPrice.price;
    if (Math.abs(oldDatedPrice - newDatedPrice) < 0.04) {
      console.log(p.name);
      console.log(
        " - Tiny price difference detected on " +
        // Dates come back from CosmosDB as strings, so re-parse before formatting
        new Date(datedPrice.date).toDateString() +
        " - " +
        oldDatedPrice +
        " - " +
        newDatedPrice
      );
      // Mark the entry for removal
      datedPrice.price = 0;
      requiresUpdate = true;
    }
    oldDatedPrice = newDatedPrice;
  });

  if (!requiresUpdate) return;

  const updatedPriceHistory = p.priceHistory.filter(
    (datedPrice) => datedPrice.price > 0
  );

  console.log(
    " - Old price history length: " +
    p.priceHistory.length +
    " - new length: " +
    updatedPriceHistory.length
  );

  p.priceHistory = updatedPriceHistory;

  const uploadRes = await container.items.upsert(p);
  console.log(
    " - Uploaded updated product with status code: " +
    uploadRes.statusCode
  );

  // item.name = item.name.replace('  ', ' ').trim();
  // let p: Product = item as Product;

  // const res = await container.item(item.id, item.name).delete();
  // console.log('delete ' + res.statusCode);

  // const uploadRes = await container.items.upsert(p);
  // console.log('upload ' + uploadRes.statusCode);
}

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
import * as dotenv from "dotenv";
dotenv.config();
dotenv.config({ path: `.env.local`, override: true });

import playwright from "playwright";
import * as cheerio from "cheerio";
import _ from "lodash";
import { setTimeout } from "timers/promises";

import { establishCosmosDB, upsertProductToCosmosDB } from "./cosmosdb.js";
import { productOverrides } from "./product-overrides.js";
import { CategorisedUrl, DatedPrice, Product, UpsertResponse } from "./typings";
import {
  log, colour, logProductRow, logError, readLinesFromTextFile, getTimeElapsedSince,
  logTableHeader, toTitleCase,
} from "./utilities.js";

// Woolworths / Countdown Scraper
// ------------------------------
// Scrapes pricing and other info from Woolworths NZ's website.

// Set a reasonable delay between each page load to reduce load on the server.
const pageLoadDelaySeconds = 7;

// Set a delay when logging each product per page to the console.
const productLogDelayMilliSeconds = 20;

// Record start time for logging purposes
const startTime = Date.now();

// Load URLs from text file 'urls.txt'
let categorisedUrls: CategorisedUrl[] = loadUrlsFile();

// Handle command-line arguments, ex: 'db', 'images', or single urls
export let databaseMode = false;
export let uploadImagesMode = false;
let headlessMode = true;
categorisedUrls = handleArguments(categorisedUrls);

// Establish CosmosDB connection if being used.
// This must be awaited: the scrape loop upserts into the container handle
// that is only assigned once the connection has completed.
if (databaseMode) await establishCosmosDB();

// Establish playwright browser
let browser: playwright.Browser;
let page: playwright.Page;
browser = await establishPlaywrightPage(headlessMode);

// Select store location
await selectStoreByLocationName();

// Main Loop - Scrape through each page
await scrapeAllPageURLs();

// Program End and Cleanup - wait for the browser to shut down before finishing
await browser.close();
log(
  colour.sky,
  `\nAll Pages Completed = Total Time Elapsed ${getTimeElapsedSince(startTime)} \n`
);
// -----------------------


// loadUrlsFile
// ------------
// Loads and validates URLs from a txt file to be scraped.
// Each line may expand into several CategorisedUrl objects,
// lines that do not parse as a valid URL are dropped.

function loadUrlsFile(filePath: string = "src/urls.txt"): CategorisedUrl[] {
  // Try to read file urls.txt or other file for a list of URLs
  const rawLinesFromFile: string[] = readLinesFromTextFile(filePath);

  // Parse and optimise URLs, flattening the per-line results into one array
  return rawLinesFromFile.flatMap((line) => parseAndCategoriseURL(line) ?? []);
}

// scrapeAllPageURLs
// ---------------
// Loops through each page URL and scrapes pricing and other info.
// This is the main function that calls the other functions.
// After every page - whether scraped, skipped or failed - it waits
// pageLoadDelaySeconds before moving on, to reduce load on the server.
// A refused connection aborts the whole loop.

async function scrapeAllPageURLs() {
  // Timeout applied to playwright actions while scraping a page.
  // Kept local: this function runs before later module-level constants exist.
  const pageTimeoutMilliSeconds = 8000;

  // Log loop start
  log(
    colour.yellow,
    `${categorisedUrls.length} pages to be scraped`.padEnd(35) +
    `${pageLoadDelaySeconds}s delay between scrapes`.padEnd(35) +
    (databaseMode ? "(Database Mode)" : "(Dry Run Mode)")
  );

  // Loop through each page URL to scrape
  for (let i = 0; i < categorisedUrls.length; i++) {
    const categorisedUrl: CategorisedUrl = categorisedUrls[i];

    // Log current scrape sequence and the total number of pages to scrape
    const shortUrl = categorisedUrl.url.replace("https://", "");
    log(
      colour.yellow,
      `\n[${i + 1}/${categorisedUrls.length}] ${shortUrl}`
    );

    try {
      await scrapeSinglePage(categorisedUrl, pageTimeoutMilliSeconds);
    } catch (error: unknown) {
      // Playwright rejects with Error objects, so inspect the message text
      const message = error instanceof Error ? error.message : String(error);
      if (message.includes("NS_ERROR_CONNECTION_REFUSED")) {
        logError("Connection Failed - Check Firewall\n" + error);
        return;
      }
      logError(
        `Page Timeout after ${pageTimeoutMilliSeconds / 1000} seconds - Skipping this page\n` +
        error
      );
    }

    // Delay between each page load
    await setTimeout(pageLoadDelaySeconds * 1000);
  }
}

// openPageWithRetries
// -------------------
// Navigates to url and waits for product prices to render,
// with up to 3 attempts. Rethrows the error of the final failed attempt.

async function openPageWithRetries(url: string, timeoutMilliSeconds: number) {
  const maxRetries = 3;
  const retryDelay = 2000; // 2 seconds

  for (let attempt = 1; ; attempt++) {
    try {
      await page.goto(url);

      // Set page timeout for all following playwright actions
      page.setDefaultTimeout(timeoutMilliSeconds);

      // Wait for product-price h3 html element to dynamically load in,
      // this is required to see product data
      await page.waitForSelector("product-price h3");
      return;
    } catch (error) {
      // If all retries failed, throw the error
      if (attempt === maxRetries) throw error;

      log(colour.yellow, `Retry ${attempt}/${maxRetries} for ${url}`);
      await setTimeout(retryDelay);
    }
  }
}

// scrapeSinglePage
// ----------------
// Opens one categorised URL, finds its product entries and hands them to
// processFoundProductEntries. Returns early if the requested page number
// is higher than the number of pages available.

async function scrapeSinglePage(
  categorisedUrl: CategorisedUrl,
  pageTimeoutMilliSeconds: number
) {
  const url: string = categorisedUrl.url;

  await openPageWithRetries(url, pageTimeoutMilliSeconds);

  // Wait and page down multiple times to further trigger any lazy loads
  for (let pageDown = 0; pageDown < 5; pageDown++) {
    // create a random number between 500 and 1500
    const timeBetweenPgDowns = Math.random() * 1000 + 500;
    await page.waitForTimeout(timeBetweenPgDowns);
    await page.keyboard.press("PageDown");
  }

  // If url has page= query parameter, check to see that page is available
  let desiredPageNumber = 1;
  let numPagesAvailable = 1;
  const currentPageMatch = url.match(/page=(\d+)/);
  if (currentPageMatch) {
    desiredPageNumber = parseInt(currentPageMatch[1], 10);

    try {
      // Detect number of pages available
      const paginationUL = await page.innerHTML("ul.pagination");
      const $$ = cheerio.load(paginationUL);
      numPagesAvailable = $$("li").length - 2; // exclude prev/next buttons
    } catch {
      numPagesAvailable = 1; // if no pagination found, only 1 page exists
    }

    if (desiredPageNumber > numPagesAvailable) {
      log(colour.yellow, `Page ${desiredPageNumber} does not exist, only ${numPagesAvailable} pages available. Skipping..`);
      return; // Skip this page as it doesn't exist
    }
  }

  // Load html into Cheerio for DOM selection
  const html = await page.innerHTML("product-grid");
  const $ = cheerio.load(html);

  // Find all product entries
  const allProductEntries = $("cdx-card product-stamp-grid div.product-entry");

  // Find advertisement product entries not normally part of this product category
  const advertisementEntries = $("div.carousel-track div cdx-card product-stamp-grid div.product-entry");
  const adHrefs: string[] = advertisementEntries.map((index, element) => {
    return $(element).find("a").first().attr("href");
  }).toArray();

  // Filter out product entries that match the found advertisements
  const productEntries = allProductEntries.filter((index, element) => {
    const productHref = $(element).find("a").first().attr("href");
    return !adHrefs.includes(productHref!);
  });

  // Log the number of products found, time elapsed, category, pages
  log(
    colour.yellow,
    `${productEntries.length} product entries found`.padEnd(38) +
    `Time Elapsed: ${getTimeElapsedSince(startTime)}`.padEnd(35) +
    `Category: ${_.startCase(categorisedUrl.categories.join(" - ")).padEnd(20)}` +
    `Page: ${desiredPageNumber}/${numPagesAvailable}`
  );

  // Log table header
  if (!databaseMode) logTableHeader();

  // Store number of items processed for logging purposes,
  // then start nested loop which loops through each product entry
  const perPageLogStats = await processFoundProductEntries(
    categorisedUrl,
    productEntries,
    {
      newProducts: 0,
      priceChanged: 0,
      infoUpdated: 0,
      alreadyUpToDate: 0,
    }
  );

  // After scraping every item is complete, log how many products were scraped
  if (databaseMode) {
    log(
      colour.blue,
      `CosmosDB: ${perPageLogStats.newProducts} new products, ` +
      `${perPageLogStats.priceChanged} updated prices, ` +
      `${perPageLogStats.infoUpdated} updated info, ` +
      `${perPageLogStats.alreadyUpToDate} already up-to-date`
    );
  }
}

// processFoundProductEntries
// --------------------------
// Loops through each product entry and scrapes pricing and other info.
// This function is called by scrapeAllPageURLs.
// Takes the categorised url being scraped, the cheerio selection of product
// entries found on the page, and the per-page counters to add to.
// Returns the same counters object after every entry has been handled.

async function processFoundProductEntries(
  categorisedUrl: CategorisedUrl,
  productEntries: cheerio.Cheerio<cheerio.Element>,
  perPageLogStats: {
    newProducts: number;
    priceChanged: number;
    infoUpdated: number;
    alreadyUpToDate: number;
  }
) {
  // Loop through each product entry
  for (let i = 0; i < productEntries.length; i++) {
    const product = playwrightElementToProduct(
      productEntries[i],
      categorisedUrl.categories
    );

    if (product !== undefined) {
      if (databaseMode) {
        // Insert or update item into azure cosmosdb
        const response = await upsertProductToCosmosDB(product);

        // Use response to update logging counters, failed upserts are not counted
        switch (response) {
          case UpsertResponse.AlreadyUpToDate:
            perPageLogStats.alreadyUpToDate++;
            break;
          case UpsertResponse.InfoChanged:
            perPageLogStats.infoUpdated++;
            break;
          case UpsertResponse.NewProduct:
            perPageLogStats.newProducts++;
            break;
          case UpsertResponse.PriceChanged:
            perPageLogStats.priceChanged++;
            break;
          default:
            break;
        }

        // Upload image to Azure Function
        if (uploadImagesMode) {
          // Get image url using provided base url, product ID, and hi-res query parameters
          const imageUrlBase =
            "https://assets.woolworths.com.au/images/2010/";
          const imageUrlExtensionAndQueryParams =
            ".jpg?impolicy=wowcdxwbjbx&w=900&h=900";
          const imageUrl =
            imageUrlBase + product.id + imageUrlExtensionAndQueryParams;

          await uploadImageRestAPI(imageUrl, product);
        }
      } else {
        // When doing a dry run, log product name - size - price in table format
        logProductRow(product);
      }
    }

    // Add a tiny delay between each product loop.
    // This makes printing the log more readable
    await setTimeout(productLogDelayMilliSeconds);
  }

  // Return log stats for completed page
  return perPageLogStats;
}

// uploadImageRestAPI()
// --------------------
// Send image url to an Azure Function API.
// Returns false if the url or product id is invalid or the request itself
// could not be made, true once the API has responded.
// Throws an Error if IMAGE_UPLOAD_FUNC_URL is not configured correctly.

async function uploadImageRestAPI(
  imgUrl: string,
  product: Product
): Promise<boolean> {
  // Check if passed in url is valid, return if not
  if (imgUrl === undefined || !imgUrl.includes("http") || product.id.length < 4) {
    log(colour.grey, ` Image ${product.id} has invalid url: ${imgUrl}`);
    return false;
  }

  // Get IMAGE_UPLOAD_FUNC_URL from env
  // Example format:
  // https://.azurewebsites.net/api/ImageToS3?code=
  const funcBaseUrl = process.env.IMAGE_UPLOAD_FUNC_URL;

  // Check funcBaseUrl is valid
  if (!funcBaseUrl?.includes("http")) {
    throw Error(
      "\nIMAGE_UPLOAD_FUNC_URL in .env is invalid. Should be in .env :\n\n" +
      "IMAGE_UPLOAD_FUNC_URL=https://.azurewebsites.net/api/ImageToS3?code=\n\n"
    );
  }

  // NOTE(review): imgUrl is appended without URL-encoding, so its own
  // '&w=900&h=900' parameters presumably reach the function as separate
  // query parameters - confirm what the function expects before encoding.
  const restUrl = `${funcBaseUrl}${product.id}&source=${imgUrl}`;

  // Perform http get. A failed request must not abort scraping of the
  // remaining products on the page, so it is logged and reported as false.
  let responseMsg: string;
  try {
    const res = await fetch(new URL(restUrl), { method: "GET" });
    responseMsg = await res.text();
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    logError(` Image ${product.id} upload request failed: ${message}`);
    return false;
  }

  if (responseMsg.includes("S3 Upload of Full-Size")) {
    // Log for successful upload
    log(
      colour.grey,
      ` New Image : ${(product.id + ".webp").padEnd(11)} | ` +
      `${product.name.padEnd(40).slice(0, 40)}`
    );
  } else if (responseMsg.includes("already exists")) {
    // Do not log for existing images
  } else if (responseMsg.includes("Unable to download:")) {
    // Log for missing images
    log(colour.grey, ` Image ${product.id} unavailable to be downloaded`);
  } else if (responseMsg.includes("unable to be processed")) {
    log(colour.grey, ` Image ${product.id} unable to be processed`);
  } else {
    // Log any other errors that may have occurred
    console.log(responseMsg);
  }
  return true;
}

// handleArguments()
// -----------------
// Handle command line arguments.
// Can be reverse mode, dry-run-mode, custom url, or categories.
// Sets the module-level mode flags as a side effect and returns
// the list of URLs that should be scraped.

function handleArguments(categorisedUrls: CategorisedUrl[]): CategorisedUrl[] {
  // Slice out the first 2 arguments, as they are not user-provided
  const userArgs = process.argv.slice(2);
  if (userArgs.length === 0) return categorisedUrls;

  // Loop through all args and find any matching keywords
  let potentialUrl = "";
  for (const arg of userArgs) {
    if (arg === "db") databaseMode = true;
    else if (arg === "images") uploadImagesMode = true;
    else if (arg === "headless") headlessMode = true; // is already default
    else if (arg === "headed") headlessMode = false;

    // Any arg containing .co.nz will replace the URLs text file to be scraped.
    else if (arg.includes(".co.nz")) potentialUrl += arg;

    // Reverse the order of the URLs to be scraped, starting from the bottom
    else if (arg === "reverse") categorisedUrls = categorisedUrls.reverse();
    // else if (arg === "custom") {
    //   categorisedUrls = [];
    //   await customQuery();
    //   process.exit();
    // }
  }

  // Try to parse the potential new url
  const parsedUrl = parseAndCategoriseURL(potentialUrl);
  if (parsedUrl !== undefined) categorisedUrls = parsedUrl;

  return categorisedUrls;
}

// establishPlaywrightPage()
// -------------------------
// Create a playwright browser and page, stored in the module-level
// browser and page variables. Returns the browser.

async function establishPlaywrightPage(headless = true) {
  const numUserArgs = process.argv.length - 2;
  log(
    colour.yellow,
    "Launching Browser.. " +
    (numUserArgs > 0 ? "(" + numUserArgs + " arguments found)" : "")
  );

  browser = await playwright.firefox.launch({
    headless: headless,
  });
  page = await browser.newPage();

  // Reject unnecessary ad/tracking urls
  await routePlaywrightExclusions();

  return browser;
}

// selectStoreByLocationName()
// ---------------------------
// Selects a store location by typing in the specified location address.
// Falls back to the site's default location if anything goes wrong.

async function selectStoreByLocationName(locationName: string = "") {
  // If no location was passed in, also check .env for STORE_NAME
  if (locationName === "") {
    if (process.env.STORE_NAME) locationName = process.env.STORE_NAME;
    // If STORE_NAME is also not present, skip store location selection
    else return;
  }

  log(colour.yellow, "Selecting Store Location..");

  // Open store selection page
  try {
    page.setDefaultTimeout(12000);
    await page.goto("https://www.woolworths.co.nz/bookatimeslot", {
      waitUntil: "domcontentloaded",
    });
    await page.waitForSelector("fieldset div div p button");
  } catch (error) {
    logError("Location selection page timed out - Using default location instead");
    return;
  }

  // Every step below can time out, so all of them are guarded -
  // a failure here should never stop the scrape from running
  try {
    const oldLocation = await page
      .locator("fieldset div div p strong")
      .innerText();

    // Click change address modal
    await page.locator("fieldset div div p button").click();
    await page.waitForSelector("form-suburb-autocomplete form-input input");

    // Type in address, wait 1.5s for auto-complete to populate entries
    await page
      .locator("form-suburb-autocomplete form-input input")
      .type(locationName);
    await page.waitForTimeout(1500);

    // Select first matched entry, wait for validation
    await page.keyboard.press("ArrowDown");
    await page.waitForTimeout(300);
    await page.keyboard.press("Enter");
    await page.waitForTimeout(1000);

    // Click save location button
    await page.getByText("Save and Continue Shopping").click();
    log(
      colour.yellow,
      "Changed Location from " + oldLocation + " to " + locationName + "\n"
    );

    // Ensure location is saved before moving on
    await page.waitForTimeout(2000);
  } catch {
    // Catch timeout if no locations are found using the provided env value.
    logError(
      `Store Location:${locationName} not found. Using default instead.`
    );
  }
}

// playwrightElementToProduct()
// ----------------------------
// Takes a playwright html element for 'a.product-entry', builds and returns a Product.
// Returns undefined if the scraped values do not pass validateProduct().

export function playwrightElementToProduct(
  element: cheerio.Element,
  categories: string[]
): Product | undefined {
  const $ = cheerio.load(element);

  // Find the h3 tag with an id containing "-title"
  // This holds the product ID, name and size
  const idNameSizeH3 = $(element)
    .find("h3")
    .filter((i, h3) => $(h3).attr("id")?.includes("-title") ?? false);

  const product: Product = {

    // ID
    // -------
    // Extract product ID from h3 id attribute, and remove non-numbers
    id: idNameSizeH3.attr("id")?.replace(/\D/g, "") as string,

    // Source Site - set where the source of information came from
    sourceSite: "countdown.co.nz", // use countdown for consistency with old data

    // Categories
    category: categories, // already obtained from url/text file

    // Store today's date
    lastChecked: new Date(),
    lastUpdated: new Date(),

    // These values will later be overwritten
    name: "",
    priceHistory: [],
    currentPrice: 0,
  };

  // Name & Size
  // ------------
  // Try to extract combined name and size from h3 tag inner text
  let rawNameAndSize = idNameSizeH3.text().trim();

  // Clean doubled spaces and unnecessary words from titles
  rawNameAndSize = rawNameAndSize
    .toLowerCase()
    .replace("  ", " ")
    .replace("fresh fruit", "")
    .replace("fresh vegetable", "")
    .trim();

  // Try to regex match a size section such as:
  // 100g, 150ml, 16pack, 0.5-1.5kg, tray 1kg, etc
  const tryMatchSize =
    rawNameAndSize.match(/(tray\s\d+)|(\d+(\.\d+)?(\-\d+\.\d+)?\s?(g|kg|l|ml|pack))\b/g);

  if (!tryMatchSize) {
    // Capitalise and set name
    product.name = toTitleCase(rawNameAndSize);

    // No size was found in name, size can be derived from unit price later
    product.size = "";
  } else {
    // If a size was found, get the index to split the string into name and size
    const indexOfSizeSection = rawNameAndSize.indexOf(tryMatchSize[0]);

    // Capitalise and set name
    product.name = toTitleCase(rawNameAndSize.slice(0, indexOfSizeSection)).trim();

    // Clean up and set size
    product.size = rawNameAndSize
      .slice(indexOfSizeSection)
      .trim()
      // Capitalise L for litres - only the 'l' that directly follows a digit
      .replace(/(\d)l\b/, "$1L")
      .replace("tray", "Tray");
  }

  // Price
  // ------
  // Is originally displayed with dollars in an em tag, cents in a span tag,
  // and potentially a kg unit name inside the span for some meat products.
  // The 2 numbers are joined, parsed, and non-number chars are removed.
  const dollarString: string = $(element)
    .find("product-price div h3 em")
    .text()
    .trim();
  let centString: string = $(element)
    .find("product-price div h3 span")
    .text()
    .trim();
  if (centString.includes("kg")) product.size = "per kg";
  centString = centString.replace(/\D/g, "");
  product.currentPrice = Number(dollarString + "." + centString);

  // Create a date object for now, but with minutes, seconds and
  // milliseconds set to 0
  const today = new Date();
  today.setMinutes(0, 0, 0);

  // Create a DatedPrice object, which may be added into the product if needed
  const todaysDatedPrice: DatedPrice = {
    date: today,
    price: product.currentPrice,
  };
  product.priceHistory = [todaysDatedPrice];

  // Unit Price
  // -----------
  // Try to extract from span.cupPrice, ex: $2.52 / 100mL
  const rawUnitPrice = $(element).find("span.cupPrice").text().trim();

  if (rawUnitPrice) {
    // Split into price and amount sections, either may be missing
    const [priceSection = "", amountSection = ""] = rawUnitPrice.split("/");

    // Extract and parse unit price, ex: 2.52
    let unitPrice = Number.parseFloat(priceSection.replace("$", "").trim());

    // Extract amount and unit, ex: 100mL
    const amountAndUnit = amountSection.trim();
    let unit = amountAndUnit.match(/\w+/g)?.[0] || "";

    // Normalize units to kg or L
    if (amountAndUnit == "100g") {
      unitPrice = unitPrice * 10;
      unit = "kg";
    } else if (amountAndUnit == "100mL") {
      unitPrice = unitPrice * 10;
      unit = "L";
    }

    // Cleanup 1kg to just kg
    unit = unit.replace("1kg", "kg");
    unit = unit.replace("1L", "L");

    // Set finalised unit price and name, but only if a real number was parsed
    if (Number.isFinite(unitPrice)) {
      product.unitPrice = unitPrice;
      product.unitName = unit;
    }
  }

  // Overrides
  // ----------
  // Check .ts file for manually overridden product data
  productOverrides.forEach((override) => {
    // First check if product ID has any overrides
    if (override.id !== product.id) return;

    // Check for size override
    if (override.size !== undefined) {
      product.size = override.size;
    }

    // Check for category override
    if (override.category !== undefined) {
      product.category = [override.category];
    }
  });

  // Validation
  // ----------
  // If product values pass validation, return product
  if (validateProduct(product)) return product;

  try {
    logError(
      ` Unable to Scrape: ${product.id.padStart(6)} | ${product.name} | ` +
      `$${product.currentPrice}`
    );
  } catch {
    // product.id is undefined when no matching h3 id was found
    logError(" Unable to Scrape ID from product");
  }
  return undefined;
}

// validateProduct()
// -----------------
// Checks scraped product values are within reasonable ranges.
// Returns false instead of throwing if any value is missing.

function validateProduct(product: Product): boolean {
  try {
    // A name containing a price such as '$ 12' means the wrong text was scraped
    if (product.name.match(/\$\s\d+/)) return false;
    if (product.name.length < 4 || product.name.length > 100) return false;
    if (product.id.length < 2 || product.id.length > 20) return false;
    if (
      product.currentPrice <= 0 ||
      product.currentPrice === null ||
      product.currentPrice === undefined ||
      Number.isNaN(product.currentPrice) ||
      product.currentPrice > 999
    ) {
      return false;
    }
    return true;
  } catch (error) {
    return false;
  }
}

// parseAndCategoriseURL()
// -----------------------
// Parses a URL string, an optional category, optional number of pages to scrape
// from a single line of text.
// parseAndCategoriseURL()
// -----------------------
// Parses a URL, optional categories, and an optional number of pages to scrape
// from a single line of text. Sections are separated by whitespace and may
// appear in any order.
// Returns undefined if the line does not contain a woolworths.co.nz URL.
// Search URLs (containing "?search=") are returned as-is, with no categories.
// If no categories are specified, the last /section/ of the URL path is used.
// Example Input:
//    woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs categories=ice-cream pages=2
// Example Return:
//  [
//    {
//        url: "https://woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs?page=1&inStockProductsOnly=true"
//        categories: ["ice-cream"]
//    },
//    {
//        url: "https://woolworths.co.nz/shop/browse/frozen/ice-cream-sorbet/tubs?page=2&inStockProductsOnly=true"
//        categories: ["ice-cream"]
//    }
//  ]

export function parseAndCategoriseURL(
  line: string
): CategorisedUrl[] | undefined {
  // If line doesn't contain desired url section, return undefined
  if (!line.includes("woolworths.co.nz")) return undefined;

  // If line is a search url, return as-is
  if (line.includes("?search=")) return [{ url: line, categories: [] }];

  let baseUrl = "";
  let categories: string[] = [];
  let numPagesPerURL = 1;

  // Split line by whitespace, look for url, optional page amount & categories.
  // Trimming first avoids empty sections from leading/trailing whitespace.
  for (const section of line.trim().split(/\s+/)) {
    // Parse URL
    if (section.includes("woolworths.co.nz")) {
      // Ensure URL has http:// or https://
      const urlWithProtocol = section.startsWith("http")
        ? section
        : "https://" + section;

      // Strip any existing query options off of the URL itself (not the whole
      // line), so that an added protocol prefix is preserved
      const queryStart = urlWithProtocol.indexOf("?");
      baseUrl =
        queryStart === -1
          ? urlWithProtocol
          : urlWithProtocol.substring(0, queryStart);

      // Parse Categories, ex: categories=fruit,veg
    } else if (section.startsWith("categories=")) {
      categories = section
        .replace("categories=", "")
        .split(",")
        .filter((category) => category.length > 0);

      // Parse number of pages, ex: pages=3
    } else if (section.startsWith("pages=")) {
      const parsedPages = Number.parseInt(
        section.substring("pages=".length),
        10
      );
      // A non-numeric value is ignored, leaving the default of 1 page
      if (!Number.isNaN(parsedPages)) numPagesPerURL = parsedPages;
    }
  }

  // If no category was specified, derive one from the last url /section.
  // This is done only after every section has been read, so that the order of
  // sections in the line cannot affect the result.
  if (categories.length === 0) {
    // Ignore empty segments, such as the one produced by a trailing slash
    const slashSections = baseUrl
      .split("/")
      .filter((slashSection) => slashSection.length > 0);
    const lastSection = slashSections[slashSections.length - 1];
    if (lastSection !== undefined) categories = [lastSection];
  }

  // Build one entry per page, replacing any original query parameters with
  // optimised ones, such as limiting to in-stock products only
  const parsedUrls: CategorisedUrl[] = [];
  for (let pageNum = 1; pageNum <= numPagesPerURL; pageNum++) {
    parsedUrls.push({
      url: `${baseUrl}?page=${pageNum}&inStockProductsOnly=true`,
      // Copy the array so that each entry owns its own categories
      categories: [...categories],
    });
  }

  return parsedUrls;
}
// routePlaywrightExclusions()
// ---------------------------
// Excludes ads, tracking, and bandwidth intensive resources from being downloaded by Playwright.
// Registers a catch-all route handler on `page` which aborts any request whose
// resource type or URL matches an exclusion below, and lets all others continue.
// NOTE(review): `page` is not declared in this function - presumably the
// module-level Playwright page; confirm it is initialised before this is called.

async function routePlaywrightExclusions(): Promise<void> {
  // Resource types that are never needed for scraping
  const typeExclusions = ["image", "media", "font"];

  // Substrings of request URLs to block.
  // Keep these free of invisible characters (ex: a zero-width space), as any
  // such character silently prevents the entry from ever matching a URL.
  const urlExclusions = [
    "googleoptimize.com",
    "gtm.js",
    "visitoridentification.js",
    "js-agent.newrelic.com",
    "cquotient.com",
    "googletagmanager.com",
    "cloudflareinsights.com",
    "dwanalytics",
    "facebook.net",
    "chatWidget",
    "edge.adobedc.net",
    "/Content/Banners/",
    "algolia.io",
    "algoliaradar.com",
    "go-mpulse.net",
  ];

  // Route with exclusions processed
  await page.route("**/*", async (route) => {
    const req = route.request();
    const requestUrl = req.url();
    //let trimmedUrl = requestUrl.length > 120 ? requestUrl.substring(0, 120) + '...' : requestUrl;

    const excludeThisRequest =
      typeExclusions.includes(req.resourceType()) ||
      urlExclusions.some((excludedURL) => requestUrl.includes(excludedURL));

    if (excludeThisRequest) {
      //logError(`${req.method()} ${req.resourceType()} - ${trimmedUrl}`);
      await route.abort();
    } else {
      //log(colour.white, `${req.method()} ${req.resourceType()} - ${trimmedUrl}`);
      await route.continue();
    }
  });
}