├── .gitignore ├── LICENSE ├── README.md ├── bigquery-schema.json ├── config.json.sample ├── config.schema.json ├── gce-install.sh ├── index.js ├── package.json └── test ├── config.test.js ├── config.test.json └── index.unit.test.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .idea 3 | *.iml 4 | package-lock.json 5 | secret 6 | 7 | test.json 8 | config.json 9 | 10 | .nyc_output 11 | coverage -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Simo Ahava 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-scraper-gcp 2 | Scrape all the pages and links of a given domain and write the results to Google Cloud BigQuery. 3 | 4 | # Steps 5 | 6 | 1. Clone the repo locally 7 | 2. Create a Google Cloud Platform project, enable Compute Engine API and BigQuery API 8 | 3. Install latest version of gcloud SDK 9 | 4. Authenticate against gcloud SDK and set the project to the one you created 10 | 5. Edit config.json.sample 11 | -- a. Update "domain" to match what you consider an "internal" domain pattern 12 | -- b. Update "startUrl" to give the entry point for the crawl 13 | -- c. Update "projectId" to the GCP project ID 14 | -- d. Update "bigQuery.datasetId" and "bigQuery.tableId" to a dataset ID and table ID you want the script to create and write the results to. 15 | 6. If you want to use e.g. GCP Memorystore, set "redis.active" to true and update the host and port to match the Redis instance 16 | 7. Save the config.json.sample to config.json, and upload it to a Google Cloud Storage bucket 17 | 8. Edit gce-install.sh and update the `bucket` variable to the URL to the config file in Google Cloud Storage 18 | 9. Once ready, run 19 | 20 | ``` 21 | gcloud compute instances create web-scraper-gcp \ 22 | --machine-type=n1-standard-16 \ 23 | --metadata-from-file=startup-script=./gce-install.sh \ 24 | --scopes=bigquery,cloud-platform \ 25 | --zone=europe-north1-a 26 | ``` 27 | 28 | Feel free to change `machine-type` to something more or less powerful if you wish. Feel free to change the zone, too. 
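
As a reference for steps 7 and 8, uploading the configuration to Cloud Storage could look roughly like the following sketch. The bucket name `web-scraper-config` is only an example; whatever bucket you use has to match the `bucket` variable in `gce-install.sh`.

```
# Rename the edited sample and push it to a Cloud Storage bucket
cp config.json.sample config.json
gsutil mb gs://web-scraper-config          # create the bucket if it doesn't exist yet
gsutil cp config.json gs://web-scraper-config/config.json
```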
29 | 30 | This will create a new Compute Engine instance called "web-scraper-gcp", which will run the crawl as soon as the instance is started. Once the crawl is over, the instance is automatically stopped. 31 | -------------------------------------------------------------------------------- /bigquery-schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "description": "The URL initially requested for crawling.", 4 | "name": "requested_url", 5 | "type": "STRING" 6 | }, 7 | { 8 | "description": "The actual URL that was crawled (e.g. due to redirect).", 9 | "name": "final_url", 10 | "type": "STRING" 11 | }, 12 | { 13 | "description": "The HTTP status code.", 14 | "name": "http_status", 15 | "type": "INT64" 16 | }, 17 | { 18 | "description": "The Content-Type header of the response.", 19 | "name": "content_type", 20 | "type": "STRING" 21 | }, 22 | { 23 | "description": "If the URL was external to the crawled domain.", 24 | "name": "external", 25 | "type": "BOOLEAN" 26 | }, 27 | { 28 | "description": "The URL of the page which linked to the crawled URL.", 29 | "name": "previous_url", 30 | "type": "STRING" 31 | }, 32 | { 33 | "description": "Cookies set on the page", 34 | "name": "cookies", 35 | "type": "RECORD", 36 | "mode": "REPEATED", 37 | "fields": [ 38 | { 39 | "name": "name", 40 | "type": "string" 41 | }, 42 | { 43 | "name": "value", 44 | "type": "string" 45 | }, 46 | { 47 | "name": "domain", 48 | "type": "string" 49 | }, 50 | { 51 | "name": "path", 52 | "type": "string" 53 | }, 54 | { 55 | "name": "expires", 56 | "type": "timestamp" 57 | }, 58 | { 59 | "name": "size", 60 | "type": "int64" 61 | }, 62 | { 63 | "name": "httpOnly", 64 | "type": "bool" 65 | }, 66 | { 67 | "name": "secure", 68 | "type": "bool" 69 | }, 70 | { 71 | "name": "session", 72 | "type": "bool" 73 | }, 74 | { 75 | "name": "sameSite", 76 | "type": "string" 77 | } 78 | ] 79 | }, 80 | { 81 | "description": "localStorage keys set on the page", 82 | "name": "localStorage", 83 | "type": "RECORD", 84 | "mode": "REPEATED", 85 | "fields": [ 86 | { 87 | "name": "key", 88 | "type": "string" 89 | }, 90 | { 91 | "name": "value", 92 | "type": "string" 93 | } 94 | ] 95 | }, 96 | { 97 | "description": "Document title.", 98 | "name": "document_title", 99 | "type": "STRING" 100 | }, 101 | { 102 | "description": "Meta Description.", 103 | "name": "meta_description", 104 | "type": "STRING" 105 | } 106 | ] 107 | -------------------------------------------------------------------------------- /config.json.sample: -------------------------------------------------------------------------------- 1 | { 2 | "domain": "gtmtools.com", 3 | "sitemap": { 4 | "active": false, 5 | "url": "https://www.example.com/sitemap.xml" 6 | }, 7 | "startUrl": "https://www.gtmtools.com/", 8 | "projectId": "web-scraper-gcp", 9 | "skipExternal": true, 10 | "clearStorage": true, 11 | "bigQuery": { 12 | "datasetId": "web_scraper_gcp", 13 | "tableId": "crawl_results" 14 | }, 15 | "redis": { 16 | "active": false, 17 | "host": "10.0.0.3", 18 | "port": 6379 19 | }, 20 | "puppeteerArgs": ["--no-sandbox"], 21 | "crawlerOptions": { 22 | "maxConcurrency": 50, 23 | "skipRequestedRedirect": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /config.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "/Config", 3 | "type": "object", 4 | "properties": { 5 | "domain": { 6 | "type": "string" 7 | }, 8 | "startUrl": { 9 
| "type": "string" 10 | }, 11 | "projectId": { 12 | "type": "string" 13 | }, 14 | "bigQuery": { 15 | "type": "object", 16 | "properties": { 17 | "datasetId": { 18 | "type": "string" 19 | }, 20 | "tableId": { 21 | "type": "string" 22 | } 23 | }, 24 | "required": ["datasetId", "tableId"] 25 | }, 26 | "redis": { 27 | "type": "object", 28 | "properties": { 29 | "active": { 30 | "type": "boolean" 31 | }, 32 | "host": { 33 | "type": "string" 34 | }, 35 | "port": { 36 | "type": "number" 37 | } 38 | }, 39 | "required": ["active", "host", "port"] 40 | }, 41 | "puppeteerArgs": { 42 | "type": "array", 43 | "items": { 44 | "type": "string" 45 | } 46 | }, 47 | "crawlerOptions": { 48 | "type": "object" 49 | } 50 | }, 51 | "required": ["domain", "startUrl", "projectId", "bigQuery", "redis", "puppeteerArgs", "crawlerOptions"] 52 | } 53 | -------------------------------------------------------------------------------- /gce-install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bucket='gs://web-scraper-config/config.json' 4 | 5 | set -v 6 | 7 | apt-get update 8 | apt-get install -y chromium 9 | apt-get install -y libgbm-dev 10 | 11 | curl -sL https://deb.nodesource.com/setup_12.x | bash - 12 | apt-get install -yq git libgconf-2-4 nodejs 13 | 14 | git clone https://github.com/sahava/web-scraper-gcp.git 15 | 16 | cd web-scraper-gcp 17 | sudo npm install 18 | gsutil cp ${bucket} . 19 | node index.js 20 | 21 | shutdown -h now 22 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * MIT License 3 | * 4 | * Copyright (c) 2018 Simo Ahava 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | const HCCrawler = require(`headless-chrome-crawler`); 26 | const RedisCache = require(`headless-chrome-crawler/cache/redis`); 27 | const extend = require(`lodash/extend`); 28 | const {Validator} = require(`jsonschema`); 29 | const Sitemapper = require('sitemapper'); 30 | 31 | const {BigQuery} = require(`@google-cloud/bigquery`); 32 | 33 | const configFile = require(`./config.json`); 34 | const configSchema = require(`./config.schema.json`); 35 | const bigQuerySchema = require(`./bigquery-schema.json`); 36 | 37 | const sitemap = new Sitemapper(); 38 | 39 | let sitemapPages; 40 | if (configFile.sitemap.active) { 41 | sitemap.fetch(configFile.sitemap.url).then(data => { 42 | sitemapPages = data.sites; 43 | }); 44 | } 45 | 46 | // Initialize new BigQuery client 47 | const bigquery = new BigQuery({ 48 | projectId: configFile.projectId 49 | }); 50 | 51 | const validator = new Validator; 52 | 53 | // Only set cache if the configuration file has redis set to active 54 | let cache = null; 55 | 56 | let count = 0; 57 | let start = null; 58 | 59 | /** 60 | * Checks if given URL is external. 61 | * 62 | * @param {string} urlString The URL string to check. 63 | * @returns {boolean} Returns true if external. 64 | */ 65 | function checkIfUrlExternal(urlString) { 66 | const domain = new RegExp(`^https?://(www\.)?${configFile.domain}/`); 67 | return !domain.test(urlString); 68 | } 69 | 70 | /** 71 | * Writes the crawl result to a BigQuery table. 72 | * 73 | * @param {object} result The object returned for each crawled page. 74 | */ 75 | async function writeToBigQuery(result) { 76 | console.log(`Crawled ${result.response.url}`); 77 | count += 1; 78 | 79 | const ls = JSON.parse(result.result.localStorage); 80 | 81 | const item = { 82 | requested_url: result.options.url, 83 | final_url: result.response.url, 84 | http_status: result.response.status, 85 | content_type: result.response.headers['content-type'], 86 | external: checkIfUrlExternal(result.response.url), 87 | previous_url: result.previousUrl, 88 | cookies: !!checkIfUrlExternal(result.response.url) ? [] : result.cookies.map(c => ({ 89 | name: c.name, 90 | value: c.value, 91 | domain: c.domain, 92 | path: c.path, 93 | expires: new Date(c.expires * 1000).toISOString(), 94 | size: c.size, 95 | httpOnly: c.httpOnly, 96 | secure: c.secure, 97 | session: c.session, 98 | sameSite: c.sameSite || null 99 | })), 100 | localStorage: Object.keys(ls).map(k => ({ 101 | name: k, 102 | value: ls[k] 103 | })), 104 | document_title: result.result.title, 105 | meta_description: result.result.metaDescription, 106 | }; 107 | 108 | await bigquery 109 | .dataset(configFile.bigQuery.datasetId) 110 | .table(configFile.bigQuery.tableId) 111 | .insert([item]); 112 | } 113 | 114 | /** 115 | * Checks if the crawled URL is external. if it is, only crawl the current page but not any of its links. 116 | * 117 | * @param {object} options The options object for each crawled page. 118 | * @returns {boolean} Returns true after setting the new maxDepth. 119 | */ 120 | function preRequest(options) { 121 | if (checkIfUrlExternal(options.url)) { 122 | if (configFile.skipExternal) return false; 123 | options.maxDepth = 1; 124 | } 125 | return true; 126 | } 127 | 128 | /** 129 | * Use jQuery to return title and Meta Description content of each crawled page. 130 | * 131 | * Ignored from tests due to use of jQuery. 132 | * 133 | * @returns {object} The object containing title and metaDescription data. 
134 | */ 135 | /* istanbul ignore next */ 136 | function evaluatePage() { 137 | const ls = JSON.stringify(window.localStorage); 138 | window.localStorage.clear(); 139 | return { 140 | title: $('title').text(), 141 | metaDescription: $('meta[name="description"]').attr('content'), 142 | localStorage: ls 143 | }; 144 | } 145 | 146 | /** 147 | * Custom crawler that fetches ALL cookies. 148 | * 149 | * @param page Page object. 150 | * @param crawl Crawl API. 151 | * @returns {Promise<*>} 152 | */ 153 | async function customCrawl(page, crawl) { 154 | const result = await crawl(); 155 | const cookies = await page._client.send('Network.getAllCookies'); 156 | result.cookies = cookies.cookies; 157 | if (configFile.clearStorage) { 158 | const client = await page.target().createCDPSession(); 159 | await client.send('Network.clearBrowserCookies'); 160 | await client.send('Network.clearBrowserCache'); 161 | } 162 | return result; 163 | } 164 | 165 | /** 166 | * Launches the crawler. 167 | * 168 | * @returns {Promise} 169 | */ 170 | async function launchCrawler() { 171 | try { 172 | start = new Date().getTime(); 173 | console.log(`Creating table ${configFile.bigQuery.tableId} in dataset ${configFile.bigQuery.datasetId}`); 174 | 175 | try { 176 | await bigquery.createDataset(configFile.bigQuery.datasetId); 177 | } catch(e) {} 178 | try { 179 | await bigquery 180 | .dataset(configFile.bigQuery.datasetId) 181 | .createTable(configFile.bigQuery.tableId, { 182 | schema: { 183 | fields: bigQuerySchema 184 | }, 185 | timePartitioning: { 186 | type: 'DAY' 187 | } 188 | }); 189 | } catch(e) {} 190 | 191 | const options = extend({ 192 | args: configFile.puppeteerArgs, 193 | onSuccess: writeToBigQuery, 194 | customCrawl, 195 | preRequest, 196 | evaluatePage, 197 | cache, 198 | skipRequestedRedirect: true 199 | }, configFile.crawlerOptions); 200 | 201 | const crawler = await HCCrawler.launch(options); 202 | 203 | if (configFile.sitemap.active) { 204 | console.log(`Crawling sitemap ${configFile.sitemap.url}`); 205 | await crawler.queue({url: sitemapPages[0], maxDepth: 999999}); 206 | } else { 207 | console.log(`Starting crawl from ${configFile.startUrl}`); 208 | await crawler.queue({ 209 | url: configFile.startUrl, 210 | maxDepth: 999999 211 | }); 212 | } 213 | 214 | await crawler.onIdle(); 215 | const finish = new Date().getTime(); 216 | console.log(`Crawl took ${finish - start} milliseconds.`); 217 | console.log(`Crawled ${count} files.`); 218 | await crawler.close(); 219 | } catch(e) { 220 | console.error(e); 221 | } 222 | } 223 | 224 | /** 225 | * Validates the configuration file. 226 | */ 227 | function init() { 228 | const result = validator.validate(configFile, configSchema); 229 | if (result.errors.length) { 230 | throw new Error(`Error(s) in configuration file: ${JSON.stringify(result.errors, null, " ")}`); 231 | } else { 232 | cache = configFile.redis.active ? new RedisCache({ host: configFile.redis.host, port: configFile.redis.port }) : null; 233 | console.log(`Configuration validated successfully`); 234 | } 235 | } 236 | 237 | /** 238 | * Runs the intiialization and crawler unless in test, in which case only the module exports are done for the test suite. 239 | * 240 | * Ignored from test coverage. 
241 | */ 242 | /* istanbul ignore next */ 243 | (async () => { 244 | try { 245 | if (process.env.NODE_ENV !== 'test') { 246 | init(); 247 | await launchCrawler(); 248 | } else { 249 | // For testing 250 | module.exports = { 251 | _init: init, 252 | _writeToBigQuery: writeToBigQuery, 253 | _preRequest: preRequest, 254 | _evaluatePage: evaluatePage 255 | }; 256 | } 257 | module.exports.launchCrawler = launchCrawler; 258 | } catch(e) { 259 | console.error(e); 260 | } 261 | })(); 262 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-scraper-gcp", 3 | "version": "0.0.1", 4 | "description": "Node.js web crawler designed to operate in the Google Cloud Platform.", 5 | "main": "index.js", 6 | "scripts": { 7 | "run-dev": "npm run start-redis && npm run run-script && npm run stop-redis", 8 | "run-script": "NODE_ENV=dev GOOGLE_APPLICATION_CREDENTIALS='./secret/secret.json' node index.js", 9 | "start-redis": "redis-server ~/Documents/Projects/redis-stable/redis.conf --daemonize yes", 10 | "stop-redis": "redis-cli shutdown", 11 | "test": "NODE_ENV=test nyc --reporter=lcov ava --verbose ./test/*.test.js && nyc report" 12 | }, 13 | "author": "Simo Ahava", 14 | "license": "MIT", 15 | "dependencies": { 16 | "headless-chrome-crawler": "git+https://github.com/sahava/headless-chrome-crawler.git#master", 17 | "jsonschema": "^1.2.6", 18 | "lodash": "^4.17.20", 19 | "redis": "^3.0.2", 20 | "sitemapper": "^3.0.5" 21 | }, 22 | "devDependencies": { 23 | "@google-cloud/bigquery": "^5.2.0", 24 | "@google-cloud/nodejs-repo-tools": "^3.3.0", 25 | "ava": "^3.12.1", 26 | "nyc": "^15.1.0", 27 | "proxyquire": "^2.1.3", 28 | "sinon": "^9.0.3" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test/config.test.js: -------------------------------------------------------------------------------- 1 | const test = require(`ava`); 2 | const tools = require(`@google-cloud/nodejs-repo-tools`); 3 | const {Validator} = require(`jsonschema`); 4 | const configSchema = require(`../config.schema.json`); 5 | 6 | const mockConfig = require(`./config.test.json`); 7 | let config; 8 | 9 | test.beforeEach(() => { 10 | config = JSON.parse(JSON.stringify(mockConfig)); 11 | tools.stubConsole(); 12 | }); 13 | test.afterEach.always(tools.restoreConsole); 14 | 15 | test.serial(`should fail without domain`, async t => { 16 | delete config['domain']; 17 | const validator = new Validator; 18 | console.error(validator.validate(config, configSchema).errors[0]); 19 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "domain"'); 20 | }); 21 | 22 | test.serial(`should fail without startUrl`, async t => { 23 | delete config['startUrl']; 24 | const validator = new Validator; 25 | console.error(validator.validate(config, configSchema).errors[0]); 26 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "startUrl"'); 27 | }); 28 | 29 | test.serial(`should fail without projectId`, async t => { 30 | delete config['projectId']; 31 | const validator = new Validator; 32 | console.error(validator.validate(config, configSchema).errors[0]); 33 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "projectId"'); 34 | }); 35 | 36 | test.serial(`should fail without bigQuery`, async t => { 37 | delete config['bigQuery']; 38 | const validator = new Validator; 39 | console.error(validator.validate(config, 
configSchema).errors[0]); 40 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "bigQuery"'); 41 | }); 42 | 43 | test.serial(`should fail without bigQuery.datasetId`, async t => { 44 | delete config['bigQuery']['datasetId']; 45 | const validator = new Validator; 46 | console.error(validator.validate(config, configSchema).errors[0]); 47 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "datasetId"'); 48 | }); 49 | 50 | test.serial(`should fail without bigQuery.tableId`, async t => { 51 | delete config['bigQuery']['tableId']; 52 | const validator = new Validator; 53 | console.error(validator.validate(config, configSchema).errors[0]); 54 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "tableId"'); 55 | }); 56 | 57 | test.serial(`should fail without redis`, async t => { 58 | delete config['redis']; 59 | const validator = new Validator; 60 | console.error(validator.validate(config, configSchema).errors[0]); 61 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "redis"'); 62 | }); 63 | 64 | test.serial(`should fail without redis.active`, async t => { 65 | delete config['redis']['active']; 66 | const validator = new Validator; 67 | console.error(validator.validate(config, configSchema).errors[0]); 68 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "active"'); 69 | }); 70 | 71 | test.serial(`should fail without redis.host`, async t => { 72 | delete config['redis']['host']; 73 | const validator = new Validator; 74 | console.error(validator.validate(config, configSchema).errors[0]); 75 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "host"'); 76 | }); 77 | 78 | test.serial(`should fail without redis.port`, async t => { 79 | delete config['redis']['port']; 80 | const validator = new Validator; 81 | console.error(validator.validate(config, configSchema).errors[0]); 82 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "port"'); 83 | }); 84 | 85 | test.serial(`should fail without puppeteerArgs`, async t => { 86 | delete config['puppeteerArgs']; 87 | const validator = new Validator; 88 | console.error(validator.validate(config, configSchema).errors[0]); 89 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "puppeteerArgs"'); 90 | }); 91 | 92 | test.serial(`should fail if puppeteerArgs is not an array`, async t => { 93 | config['puppeteerArgs'] = '--no-sandbox'; 94 | const validator = new Validator; 95 | console.error(validator.validate(config, configSchema).errors[0]); 96 | t.deepEqual(console.error.firstCall.args[0].message, 'is not of a type(s) array'); 97 | }); 98 | 99 | test.serial(`should fail without crawlerOptions`, async t => { 100 | delete config['crawlerOptions']; 101 | const validator = new Validator; 102 | console.error(validator.validate(config, configSchema).errors[0]); 103 | t.deepEqual(console.error.firstCall.args[0].message, 'requires property "crawlerOptions"'); 104 | }); 105 | -------------------------------------------------------------------------------- /test/config.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "domain": "simoahava.com", 3 | "startUrl": "https://www.simoahava.com/", 4 | "sitemap": { 5 | "active": false, 6 | "url": "https://www.example.com/sitemap.xml" 7 | }, 8 | "projectId": "web-scraper-gcp", 9 | "skipExternal": true, 10 | "clearStorage": true, 11 | "bigQuery": { 12 | "datasetId": "web_scraper_gcp", 13 | "tableId": 
"crawl_results" 14 | }, 15 | "redis": { 16 | "active": false, 17 | "host": "10.0.0.3", 18 | "port": 6379 19 | }, 20 | "puppeteerArgs": ["--no-sandbox"], 21 | "crawlerOptions": { 22 | "maxConcurrency": 50, 23 | "skipRequestedRedirect": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/index.unit.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * MIT License 3 | * 4 | * Copyright (c) 2018 Simo Ahava 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | */ 24 | 25 | 'use strict'; 26 | 27 | const sinon = require(`sinon`); 28 | const test = require(`ava`); 29 | const proxyquire = require(`proxyquire`).noCallThru(); 30 | const tools = require(`@google-cloud/nodejs-repo-tools`); 31 | 32 | const mockConfig = require(`./config.test.json`); 33 | const mockResult = { 34 | options: { 35 | url: 'https://www.simoahava.com/page/' 36 | }, 37 | response: { 38 | url: 'https://www.simoahava.com/redirect-page/', 39 | status: 200, 40 | headers: { 41 | 'content-type': 'test-content-type' 42 | }, 43 | }, 44 | previousUrl: 'https://www.simoahava.com/', 45 | result: { 46 | title: 'Test title', 47 | metaDescription: null 48 | }, 49 | cookies: [{ 50 | name: 'test cookie', 51 | value: 'test value', 52 | domain: 'testdomain.com', 53 | path: '/', 54 | expires: 1591696664, 55 | size: 1, 56 | httpOnly: false, 57 | secure: false, 58 | session: false, 59 | sameSite: 'Lax' 60 | }] 61 | }; 62 | 63 | let config; 64 | 65 | function getSample() { 66 | const tableMock = { 67 | insert: sinon.stub().returns(Promise.resolve()) 68 | }; 69 | const datasetMock = { 70 | createTable: sinon.stub().returns(Promise.resolve([])), 71 | table: sinon.stub().returns(tableMock) 72 | }; 73 | const bigqueryMock = { 74 | createDataset: sinon.stub().returns(Promise.resolve([])), 75 | dataset: sinon.stub().returns(datasetMock) 76 | }; 77 | const BigQueryMock = sinon.stub().returns(bigqueryMock); 78 | const crawlerMock = { 79 | queue: sinon.stub().returns(Promise.resolve()), 80 | onIdle: sinon.stub().returns(Promise.resolve()), 81 | close: sinon.stub().returns(Promise.resolve()) 82 | }; 83 | const HCCrawlerMock = { 84 | launch: sinon.stub().returns(Promise.resolve(crawlerMock)) 85 | }; 86 | const RedisCacheMock = sinon.stub().returns(); 87 | return { 88 | program: proxyquire(`../`, { 89 | './config.json': config, 90 | '@google-cloud/bigquery': {BigQuery: 
BigQueryMock}, 91 | 'headless-chrome-crawler': HCCrawlerMock, 92 | 'headless-chrome-crawler/cache/redis': RedisCacheMock 93 | }), 94 | mocks: { 95 | config: config, 96 | BigQuery: BigQueryMock, 97 | bigquery: bigqueryMock, 98 | HCCrawler: HCCrawlerMock, 99 | crawler: crawlerMock, 100 | redis: RedisCacheMock 101 | } 102 | }; 103 | } 104 | 105 | test.beforeEach(() => { 106 | config = JSON.parse(JSON.stringify(mockConfig)); 107 | tools.stubConsole(); 108 | }); 109 | test.afterEach.always(tools.restoreConsole); 110 | 111 | test.serial(`should initialize with proper config`, t => { 112 | // Initialize mocks 113 | const sample = getSample(); 114 | 115 | // Call function and verify behavior 116 | sample.program._init(); 117 | t.deepEqual(console.log.args[0], ['Configuration validated successfully']); 118 | }); 119 | 120 | test.serial(`should fail if config doesn't pass validation`, t => { 121 | // Initialize mocks 122 | delete config['projectId']; 123 | const sample = getSample(); 124 | 125 | // Call function and verify behavior 126 | t.throws(() => { 127 | sample.program._init(); 128 | }); 129 | }); 130 | 131 | test.serial(`should initialize Redis cache if configured to do so`, t => { 132 | // Initialize mocks 133 | config.redis.active = true; 134 | const sample = getSample(); 135 | 136 | // Call function and verify behavior 137 | sample.program._init(); 138 | t.deepEqual(sample.mocks.redis.args[0], [{host: config.redis.host, port: config.redis.port}]); 139 | }); 140 | 141 | test.serial(`should write proper item to BigQuery`, async t => { 142 | // Initialize mocks 143 | const sample = getSample(); 144 | const expected = { 145 | requested_url: mockResult.options.url, 146 | final_url: mockResult.response.url, 147 | http_status: mockResult.response.status, 148 | content_type: mockResult.response.headers['content-type'], 149 | external: mockResult.response.url.indexOf(mockConfig.domain) === -1, 150 | previous_url: mockResult.previousUrl, 151 | document_title: mockResult.result.title, 152 | meta_description: mockResult.result.metaDescription, 153 | cookies: mockResult.cookies.map(c => ({ 154 | name: c.name, 155 | value: c.value, 156 | domain: c.domain, 157 | path: c.path, 158 | expires: new Date(c.expires * 1000).toISOString(), 159 | size: c.size, 160 | httpOnly: c.httpOnly, 161 | secure: c.secure, 162 | session: c.session, 163 | sameSite: c.sameSite || null 164 | })) 165 | }; 166 | 167 | // Call function and verify behavior 168 | await sample.program._writeToBigQuery(mockResult); 169 | t.deepEqual(sample.mocks.bigquery.dataset().table().insert.args[0], [[expected]]); 170 | }); 171 | 172 | test.serial(`should write proper item to BigQuery without cookie`, async t => { 173 | // Initialize mocks 174 | const sample = getSample(); 175 | mockResult.response.url = 'https://www.external.com/'; 176 | const expected = { 177 | requested_url: mockResult.options.url, 178 | final_url: mockResult.response.url, 179 | http_status: mockResult.response.status, 180 | content_type: mockResult.response.headers['content-type'], 181 | external: mockResult.response.url.indexOf(mockConfig.domain) === -1, 182 | previous_url: mockResult.previousUrl, 183 | document_title: mockResult.result.title, 184 | meta_description: mockResult.result.metaDescription, 185 | cookies: [] 186 | }; 187 | 188 | // Call function and verify behavior 189 | await sample.program._writeToBigQuery(mockResult); 190 | t.deepEqual(sample.mocks.bigquery.dataset().table().insert.args[0], [[expected]]); 191 | }); 192 | 193 | test.serial(`should return 
undefined if dataset already exists`, async t => { 194 | // Initialize mocks 195 | const bigQueryMock = { 196 | createDataset: sinon.stub().returns(Promise.reject(new Error('Already Exists'))) 197 | }; 198 | const BigQueryMock = sinon.stub().returns(bigQueryMock); 199 | const newSample = { 200 | program: proxyquire(`../`, { 201 | './config.json': config, 202 | '@google-cloud/bigquery': {BigQuery: BigQueryMock} 203 | }), 204 | mocks: { 205 | bigQuery: bigQueryMock 206 | } 207 | }; 208 | 209 | const result = await newSample.program._createBigQueryDataset(); 210 | t.deepEqual(result, undefined); 211 | }); 212 | 213 | test.serial(`should throw error if dataset creation failed`, async t => { 214 | // Initialize mocks 215 | const bigQueryMock = { 216 | createDataset: sinon.stub().returns(Promise.reject(new Error('Some Error'))) 217 | }; 218 | const BigQueryMock = sinon.stub().returns(bigQueryMock); 219 | const newSample = { 220 | program: proxyquire(`../`, { 221 | './config.json': config, 222 | '@google-cloud/bigquery': {BigQuery: BigQueryMock} 223 | }), 224 | mocks: { 225 | bigQuery: bigQueryMock 226 | } 227 | }; 228 | 229 | await t.throwsAsync(async() => { 230 | await newSample.program._createBigQueryDataset(); 231 | }); 232 | }); 233 | 234 | test.serial(`should create dataset with datasetId`, async t => { 235 | // Initialize mocks 236 | const sample = getSample(); 237 | 238 | // Call function and verify behavior 239 | await sample.program._createBigQueryDataset(); 240 | t.deepEqual(sample.mocks.bigquery.createDataset.args[0], [mockConfig.bigQuery.datasetId]); 241 | }); 242 | 243 | test.serial(`should return undefined if table already exists`, async t => { 244 | // Initialize mocks 245 | const datasetMock = { 246 | createTable: sinon.stub().returns(Promise.reject(new Error('Already Exists'))) 247 | }; 248 | const bigQueryMock = { 249 | dataset: sinon.stub().returns(datasetMock) 250 | }; 251 | const BigQueryMock = sinon.stub().returns(bigQueryMock); 252 | const newSample = { 253 | program: proxyquire(`../`, { 254 | './config.json': config, 255 | '@google-cloud/bigquery': {BigQuery: BigQueryMock} 256 | }) 257 | }; 258 | 259 | const result = await newSample.program._createBigQueryTable(); 260 | t.deepEqual(result, undefined); 261 | }); 262 | 263 | test.serial(`should throw error if table creation failed`, async t => { 264 | // Initialize mocks 265 | const datasetMock = { 266 | createTable: sinon.stub().returns(Promise.reject(new Error('Some Error'))) 267 | }; 268 | const bigQueryMock = { 269 | dataset: sinon.stub().returns(datasetMock) 270 | }; 271 | const BigQueryMock = sinon.stub().returns(bigQueryMock); 272 | const newSample = { 273 | program: proxyquire(`../`, { 274 | './config.json': config, 275 | '@google-cloud/bigquery': {BigQuery: BigQueryMock} 276 | }) 277 | }; 278 | 279 | await t.throwsAsync(async() => { 280 | await newSample.program._createBigQueryTable(); 281 | }); 282 | }); 283 | 284 | test.serial(`should create table with tableId`, async t => { 285 | // Initialize mocks 286 | const sample = getSample(); 287 | 288 | // Call function and verify behavior 289 | await sample.program._createBigQueryTable(); 290 | t.deepEqual(sample.mocks.bigquery.dataset().createTable.args[0][0], mockConfig.bigQuery.tableId); 291 | }); 292 | 293 | test.serial(`Calling preRequest should return true for external urls`, t => { 294 | // Initialize mocks 295 | const sample = getSample(); 296 | 297 | // Call function and verify behavior 298 | const result = sample.program._preRequest({url: 
'https://some-external-url.com'}); 299 | t.true(result); 300 | }); 301 | 302 | test.serial(`Calling preRequest should return true for internal urls`, t => { 303 | // Initialize mocks 304 | const sample = getSample(); 305 | 306 | // Call function and verify behavior 307 | const result = sample.program._preRequest({url: config.startUrl}); 308 | t.true(result); 309 | }); 310 | 311 | test.serial(`should catch errors when running crawler`, async t => { 312 | // Initialize mocks 313 | delete config['bigQuery']; 314 | const sample = getSample(); 315 | 316 | // Call function and verify behavior 317 | await sample.program.launchCrawler(); 318 | t.deepEqual(console.error.args[0], [new TypeError('Cannot read property \'tableId\' of undefined')]); 319 | }); 320 | 321 | test.serial(`should run crawler`, async t => { 322 | // Initialize mocks 323 | const sample = getSample(); 324 | 325 | // Call function and verify behavior 326 | await sample.program.launchCrawler(); 327 | t.deepEqual(console.log.callCount, 4); 328 | t.deepEqual(console.log.args[0], [`Creating table ${config.bigQuery.tableId} in dataset ${config.bigQuery.datasetId}`]); 329 | t.deepEqual(console.log.args[1], [`Starting crawl from ${config.startUrl}`]); 330 | t.deepEqual(console.log.args[3], [`Crawled 0 files.`]); 331 | t.regex(console.log.args[2][0], /^Crawl took \d milliseconds\.$/); 332 | t.true(sample.mocks.bigquery.createDataset.calledWith(config.bigQuery.datasetId)); 333 | t.true(sample.mocks.bigquery.dataset().createTable.calledWith(config.bigQuery.tableId)); 334 | t.deepEqual(sample.mocks.HCCrawler.launch.callCount, 1); 335 | t.true(sample.mocks.crawler.queue.calledWith({url: config.startUrl, maxDepth: 9999999})); 336 | t.deepEqual(sample.mocks.crawler.onIdle.callCount, 1); 337 | t.deepEqual(sample.mocks.crawler.close.callCount, 1); 338 | }); 339 | 340 | 341 | /* 342 | test.serial(`should fail without valid pubsub message`, async t => { 343 | // Initialize mocks 344 | const sample = getSample(); 345 | const event = { 346 | data: Buffer.from('invalid_message').toString('base64') 347 | }; 348 | const expectedMsg = 'No valid message found!'; 349 | 350 | // Call function and verify behavior 351 | await sample.program.launchLighthouse(event); 352 | t.deepEqual(console.error.firstCall.args, [expectedMsg]); 353 | }); 354 | 355 | test.serial(`should convert object to ndJson string`, t => { 356 | // Initialize mocks 357 | const sample = getSample(); 358 | const expected = '{"item1":"value1"}\n{"item2":"value2"}\n{"item3":"value3"}\n'; 359 | const mockObj = [{item1: 'value1'},{item2: 'value2'},{item3: 'value3'}]; 360 | 361 | // Call function and verify behavior 362 | const result = sample.program._toNdJson(mockObj); 363 | t.deepEqual(result, expected); 364 | }); 365 | 366 | test.serial(`should convert lhr to bigquery schema`, t => { 367 | // Initialize mocks 368 | const sample = getSample(); 369 | const expected = require(`./mock.parsed_lhr.json`); 370 | 371 | // Call function and verify behavior 372 | const result = sample.program._createJSON(mockLhr, 'googlesearch'); 373 | t.deepEqual(result, expected); 374 | }); 375 | 376 | test.serial(`should launch puppeteer and lighthouse without lighthouseFlags`, async t => { 377 | // Initialize mocks 378 | const sample = getSample(); 379 | delete config.lighthouseFlags; 380 | const id = 'googlesearch'; 381 | const url = 'https://www.google.com/'; 382 | 383 | // Call function and verify behavior 384 | await sample.program._launchBrowserWithLighthouse(id, url); 385 | t.deepEqual(console.log.callCount, 
5); 386 | t.deepEqual(console.log.args, [ 387 | [`${id}: Starting browser for ${url}`], 388 | [`${id}: Browser started for ${url}`], 389 | [`${id}: Starting lighthouse for ${url}`], 390 | [`${id}: Lighthouse done for ${url}`], 391 | [`${id}: Browser closed for ${url}`] 392 | ]); 393 | }); 394 | 395 | test.serial(`should launch puppeteer and lighthouse with lighthouseFlags`, async t => { 396 | // Initialize mocks 397 | const sample = getSample(); 398 | const id = 'googlesearch'; 399 | const url = 'https://www.google.com/'; 400 | 401 | // Call function and verify behavior 402 | await sample.program._launchBrowserWithLighthouse(id, url); 403 | t.deepEqual(console.log.callCount, 5); 404 | t.deepEqual(console.log.args, [ 405 | [`${id}: Starting browser for ${url}`], 406 | [`${id}: Browser started for ${url}`], 407 | [`${id}: Starting lighthouse for ${url}`], 408 | [`${id}: Lighthouse done for ${url}`], 409 | [`${id}: Browser closed for ${url}`] 410 | ]); 411 | }); 412 | 413 | test.serial(`should trigger pubsub for all config ids`, async t => { 414 | // Initialize mocks 415 | const sample = getSample(); 416 | const ids = sample.mocks.config.source.map(obj => obj.id); 417 | 418 | // Call function and verify behavior 419 | await sample.program._sendAllPubSubMsgs(ids); 420 | t.deepEqual(console.log.callCount, 4); 421 | t.true(sample.mocks.pubsub.topic.calledWithExactly(sample.mocks.config.pubsubTopicId)); 422 | t.deepEqual(sample.mocks.pubsub.topic().publisher().publish.callCount, 2); 423 | t.deepEqual(sample.mocks.pubsub.topic().publisher().publish.firstCall.args, [Buffer.from(sample.mocks.config.source[0].id)]); 424 | t.deepEqual(sample.mocks.pubsub.topic().publisher().publish.secondCall.args, [Buffer.from(sample.mocks.config.source[1].id)]); 425 | t.deepEqual(console.log.args, [ 426 | [`${ids[0]}: Sending init PubSub message`], 427 | [`${ids[1]}: Sending init PubSub message`], 428 | [`${ids[0]}: Init PubSub message sent`], 429 | [`${ids[1]}: Init PubSub message sent`] 430 | ]); 431 | }); 432 | 433 | test.serial(`should return active state if trigger fired < ${mockConfig.minTimeBetweenTriggers/1000}s ago`, async t => { 434 | // Initialize mocks 435 | const sample = getSample(); 436 | const expected = {active: true, delta: 10}; 437 | 438 | // Call function and verify behavior 439 | const result = await sample.program._checkEventState('googlesearch', new Date().getTime() - mockConfig.minTimeBetweenTriggers + 10000); 440 | t.deepEqual(result, expected); 441 | }); 442 | 443 | test.serial(`should return inactive state if trigger fired >= ${mockConfig.minTimeBetweenTriggers/1000}s ago`, async t => { 444 | // Initialize mocks 445 | const sample = getSample(); 446 | const expected = {active: false}; 447 | 448 | // Call function and verify behavior 449 | const result = await sample.program._checkEventState('googlesearch', new Date().getTime()); 450 | t.deepEqual(result, expected); 451 | }); 452 | 453 | test.serial(`should abort main function if trigger fired < ${mockConfig.minTimeBetweenTriggers/1000}s ago`, async t => { 454 | // Initialize mocks 455 | const sample = getSample({eventTriggerActive: true}); 456 | const event = { 457 | data: Buffer.from('googlesearch').toString('base64') 458 | }; 459 | 460 | // Call function and verify behavior 461 | await sample.program.launchLighthouse(event); 462 | t.true(console.log.calledWith(`googlesearch: Found active event (0s < ${mockConfig.minTimeBetweenTriggers/1000}s), aborting...`)); 463 | }); 464 | 465 | test.serial(`should write only object log to gcs bucket 
if output not defined`, async t => { 466 | // Initialize mocks 467 | const sample = getSample(); 468 | delete config.lighthouseFlags.output; 469 | const mockObj = { 470 | lhr: {fetchTime: "2018-12-17T10:56:56.420Z"} 471 | }; 472 | const id = 'ebay'; 473 | 474 | // Call function and verify behavior 475 | await sample.program._writeLogAndReportsToStorage(mockObj, id); 476 | t.deepEqual(sample.mocks.storage.bucket().file().save.callCount, 1); 477 | t.true(sample.mocks.storage.bucket.calledWith('lighthouse-reports')); 478 | t.true(sample.mocks.storage.bucket().file.calledWith(`${id}/log_${mockObj.lhr.fetchTime}.json`)); 479 | t.deepEqual(sample.mocks.storage.bucket().file().save.firstCall.args, [JSON.stringify(mockObj.lhr, null, " "), {metadata: {contentType: 'application/json'}}]); 480 | }); 481 | 482 | test.serial(`should write object reports and log to gcs bucket if output defined`, async t => { 483 | // Initialize mocks 484 | const sample = getSample(); 485 | const mockObj = { 486 | report: ['report1', 'report2', 'report3'], 487 | lhr: {fetchTime: "2018-12-17T10:56:56.420Z"} 488 | }; 489 | const id = 'ebay'; 490 | 491 | // Call function and verify behavior 492 | await sample.program._writeLogAndReportsToStorage(mockObj, id); 493 | t.deepEqual(sample.mocks.storage.bucket().file().save.callCount, 4); 494 | t.true(sample.mocks.storage.bucket.calledWith('lighthouse-reports')); 495 | t.true(sample.mocks.storage.bucket().file.calledWith(`${id}/report_${mockObj.lhr.fetchTime}.html`)); 496 | t.deepEqual(sample.mocks.storage.bucket().file().save.firstCall.args, ['report1', {metadata: {contentType: 'text/html'}}]); 497 | t.true(sample.mocks.storage.bucket().file.calledWith(`${id}/report_${mockObj.lhr.fetchTime}.csv`)); 498 | t.deepEqual(sample.mocks.storage.bucket().file().save.secondCall.args, ['report2', {metadata: {contentType: 'text/csv'}}]); 499 | t.true(sample.mocks.storage.bucket().file.calledWith(`${id}/report_${mockObj.lhr.fetchTime}.json`)); 500 | t.deepEqual(sample.mocks.storage.bucket().file().save.thirdCall.args, ['report3', {metadata: {contentType: 'application/json'}}]); 501 | t.true(sample.mocks.storage.bucket().file.calledWith(`${id}/log_${mockObj.lhr.fetchTime}.json`)); 502 | t.deepEqual(sample.mocks.storage.bucket().file().save.lastCall.args, [JSON.stringify(mockObj.lhr, null, " "), {metadata: {contentType: 'application/json'}}]); 503 | }); 504 | 505 | test.serial(`should fire all pubsub triggers with 'all' message`, async t => { 506 | // Initialize mocks 507 | const sample = getSample(); 508 | const event = { 509 | data: Buffer.from('all').toString('base64') 510 | }; 511 | 512 | // Call function and verify behavior 513 | await sample.program.launchLighthouse(event); 514 | t.true(sample.mocks.pubsub.topic().publisher().publish.calledWith(Buffer.from('googlesearch'))); 515 | t.true(sample.mocks.pubsub.topic().publisher().publish.calledWith(Buffer.from('ebay'))); 516 | }); 517 | 518 | test.serial(`should catch error`, async t => { 519 | // Initialize mocks 520 | const sample = getSample(); 521 | delete config.source; 522 | const event = { 523 | data: Buffer.from('all').toString('base64') 524 | }; 525 | 526 | // Call function and verify behavior 527 | await sample.program.launchLighthouse(event); 528 | t.deepEqual(console.error.firstCall.args, [new TypeError('Cannot read property \'map\' of undefined')]); 529 | }); 530 | 531 | test.serial(`should call bigquery load for id when called with id in pubsub message`, async t => { 532 | // Initialize mocks, test live environment 533 | 
process.env.NODE_ENV = 'live'; 534 | const sample = getSample(); 535 | const event = { 536 | data: Buffer.from(sample.mocks.config.source[0].id).toString('base64') 537 | }; 538 | 539 | // Call function and verify behavior 540 | await sample.program.launchLighthouse(event); 541 | t.deepEqual(sample.mocks.bigquery.dataset().table().load.callCount, 1); 542 | });*/ 543 | --------------------------------------------------------------------------------
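
Once a crawl has finished and the instance has shut down, the results can be inspected directly in BigQuery. As a rough sketch, assuming the dataset and table IDs from `config.json.sample`, pages that returned an error status could be listed with the `bq` CLI like this:

```
# List crawled URLs that came back with an error status code
bq query --use_legacy_sql=false \
  'SELECT previous_url, requested_url, final_url, http_status
   FROM `web_scraper_gcp.crawl_results`
   WHERE http_status >= 400
   ORDER BY http_status DESC'
```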