├── .circleci └── config.yml ├── .gitignore ├── .npmignore ├── LICENSE ├── README.md ├── data ├── search_categories.json └── search_features.json ├── index.js ├── main.js ├── package-lock.json ├── package.json ├── selenium ├── src └── chrome-web-store-scraper.js └── test ├── package-lock.json ├── package.json └── test.js /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Javascript Node CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-javascript/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | 9 | docker: 10 | # specify the version you desire here 11 | - image: circleci/node:10.8.0 12 | # Specify service dependencies here if necessary 13 | # CircleCI maintains a library of pre-built images 14 | # documented at https://circleci.com/docs/2.0/circleci-images/ 15 | # - image: circleci/mongo:3.4.4 16 | 17 | working_directory: ~/repo 18 | 19 | steps: 20 | - checkout 21 | 22 | # Download and cache dependencies 23 | - restore_cache: 24 | keys: 25 | - v1-dependencies-{{ checksum "package.json" }} 26 | # fallback to using the latest cache if no exact match is found 27 | - v1-dependencies- 28 | 29 | - run: 30 | name: Install Detault JRE 31 | command: sudo apt-get install default-jre 32 | 33 | - run: 34 | name: Install Chromium 35 | command: wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add - && echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' | sudo tee /etc/apt/sources.list.d/google-chrome.list && sudo apt-get update && sudo apt-get install google-chrome-stable 36 | 37 | - run: 38 | name: Download Selenium 39 | command: curl -O http://selenium-release.storage.googleapis.com/3.5/selenium-server-standalone-3.5.3.jar && sudo cp selenium-server-standalone-3.5.3.jar /bin/selenium.jar && sudo cp ./selenium /bin/selenium && sudo chmod +x /bin/selenium 40 | 41 | - run: 42 | name: Download Chrome Driver 43 | 
command: wget https://chromedriver.storage.googleapis.com/2.41/chromedriver_linux64.zip && unzip ./chromedriver_linux64.zip -d ~/ && rm ./chromedriver_linux64.zip && sudo mv -f ~/chromedriver /usr/local/bin/chromedriver && sudo chown root:root /usr/local/bin/chromedriver && sudo chmod 0755 /usr/local/bin/chromedriver 44 | 45 | - run: 46 | name: install mocha 47 | command: sudo npm install -g mocha 48 | 49 | - run: npm -v && node -v 50 | 51 | - run: npm install 52 | 53 | - save_cache: 54 | paths: 55 | - node_modules 56 | key: v1-dependencies-{{ checksum "package.json" }} 57 | 58 | # run tests! 59 | - run: npm test 60 | 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (https://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # TypeScript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | # next.js build output 61 | 
.next 62 | 63 | 64 | # Sonarqube 65 | 66 | sonar.sh -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .circleci/ 2 | main.js 3 | test/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Adam Slack 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NOTICE 2 | It has been a very long time since i looked at this repo. i would guess it no longer functions. It might not take too much effort to fix up though. 
3 | 4 | ---------- 5 | [![CircleCI](https://circleci.com/gh/AdamSlack/chrome-web-store-scraper.svg?style=shield)](https://circleci.com/gh/AdamSlack/chrome-web-store-scraper) 6 | [![npm version](https://badge.fury.io/js/chrome-web-store-scraper.svg)](https://badge.fury.io/js/chrome-web-store-scraper) 7 | [![GitHub license](https://img.shields.io/github/license/Naereen/StrapDown.js.svg)](https://github.com/Naereen/StrapDown.js/blob/master/LICENSE) 8 | [![sonarcloud](https://sonarcloud.io/api/project_badges/measure?project=AdamSlack_chrome-web-store-scraper&metric=alert_status)](https://sonarcloud.io/api/project_badges/measure?project=AdamSlack_chrome-web-store-scraper&metric=alert_status) 9 | # chrome-web-store-scraper 10 | A node js package for scraping the chrome web store. 11 | 12 | 13 | # Requirements 14 | 15 | This project requires selenium, a Web Browser Automation tool. The latest version of the Selenium Standalone Server can be downloaded from [seleniumhq](https://www.seleniumhq.org/download/). 16 | 17 | Selenium Server must also be installed as `selenium` on the system PATH. For linux, a `selenium` bash script is included that can be paired with the `selenium.jar` for ease of use. 18 | 19 | The [selenium-webdriver](https://www.npmjs.com/package/selenium-webdriver) npm package has some details on what is required. 20 | 21 | # Selenium Setup 22 | 23 | The Selenium server must be on the system path as `selenium`. The easiest way to set it up to work with the chrome web store scraper is to make the `selenium` bash script (included in this project) executable with `chmod +x selenium`, and then copy that file, along with the selenium server `.jar` file, to `/bin/` or somewhere similar. 24 | 25 | When copying the selenium server `.jar`, make sure it is renamed from `selenium-server-standalone-3.14.0.jar` (or whatever it is currently called) to just `selenium.jar`. 
26 | 27 | ## chromedriver 28 | 29 | As well as selenium, you're going to need the latest [chromedriver](http://chromedriver.chromium.org/) installed. 30 | 31 | ## chrome-browser-stable 32 | 33 | A chrome browser is also required. You can get the latest [chrome-browser-stable](https://www.chromium.org/getting-involved/dev-channel) from chromium. 34 | 35 | # How To Use 36 | 37 | You can use this to scrape search results for chrome extensions, or to scrape store information for a specific extension. 38 | 39 | To include the scraper in your project: 40 | ```js 41 | const ChromeWebScraper = require('chrome-web-store-scraper'); 42 | const scraper = new ChromeWebScraper(); 43 | ``` 44 | 45 | ## Search 46 | 47 | The most basic search just requires you to provide a search term. 48 | ```js 49 | scraper.search('some-search-term').then( 50 | (res) => console.log(res), 51 | (err) => console.log(err) 52 | ); 53 | ``` 54 | 55 | Example Response 56 | ```json 57 | [ 58 | { 59 | "title": "Data Scraper - Easy Web Scraping", 60 | "description": "Data Scraper extracts data out of HTML web pages and imports it into Microsoft Excel spreadsheets", 61 | "author": "", 62 | "category": "Productivity", 63 | "rating": 4.107231920199501, 64 | "numberOfRatings": 401, 65 | "storeURL": "https://chrome.google.com/webstore/detail/data-scraper-easy-web-scr/nndknepjnldbdbepjfgmncbggmopgden" 66 | }, 67 | ... 68 | ] 69 | ``` 70 | 71 | There are additional options that can be used to perform a more directed search. 72 | 73 | ### Search categories 74 | You can provide a category, which the scraper will then use when building a search request. Only one category can be provided. 
75 | 76 | Valid Categories 77 | ```json 78 | all 79 | accessibility 80 | blogging 81 | byGoogle 82 | developerTools 83 | fun 84 | newsAndWeather 85 | photos 86 | productivity 87 | searchTools 88 | shopping 89 | socialAndCommunication 90 | sports 91 | ``` 92 | 93 | Categories can be provided in an options JSON object as demonstrated below: 94 | ```js 95 | const options = {searchCategory : 'newsAndWeather'} 96 | 97 | scraper.search('searchString', options).then( 98 | (res) => console.log(res), 99 | (err) => console.log(err) 100 | ); 101 | ``` 102 | 103 | ### Search Features 104 | 105 | Search features can be provided as a means of specifying select features that a chrome extension must have. 106 | 107 | Passed as an array of strings in an options JSON object, the features can be any combination of the following: 108 | ```json 109 | offline 110 | byGoogle 111 | free 112 | android 113 | googleDrive 114 | ``` 115 | 116 | 117 | features can be provided in an options JSON object as demonstrated below: 118 | ```js 119 | const options = { 120 | searchFeatures : ['free', 'offline','byGoogle'] 121 | } 122 | 123 | scraper.search('searchString', options).then( 124 | (res) => console.log(res), 125 | (err) => console.log(err) 126 | ); 127 | ``` 128 | 129 | ### Features and Categories 130 | 131 | Searching can be performed with categories and features together in the same options JSON object. 132 | 133 | ```js 134 | const options = { 135 | searchCategory : 'newsAndWeather', 136 | searchFeatures : ['free', 'byGoogle'] 137 | } 138 | 139 | scraper.search('searchString', options).then( 140 | (res) => console.log(res), 141 | (err) => console.log(err) 142 | ); 143 | 144 | ``` 145 | 146 | ## Search Options 147 | 148 | As well as the categories and feature filters, additional options in the form of `locale` and `scrollAttempts` can also be used. 
149 | 150 | The `scrollAttempts` option is used to specify how many attempts are made to retrieve additional search results, by scrolling down the page loaded by selenium. The larger the number, the more the page will be scrolled down. It is worth noting that each scroll attempt is paired with a 50ms wait, so an excessively large number will result in longer processing times. 151 | 152 | The `locale` option can be set by passing a locale string as an option. The locale string is used as the `hl` URL option by the chrome extension store. For example, to scrape search results in French, Danish, or Italian, the locale strings `'fr'`, `'da'`, or `'it'` could be used. 153 | 154 | ### Example Search with locale and scrollAttempts 155 | ```js 156 | scraper.search('scraper',{scrollAttempts:200, locale:'da'}).then( 157 | (res) => console.log(res[0]), 158 | (err) => console.log(err) 159 | ); 160 | ``` 161 | 162 | ## Extension Scraping 163 | 164 | In order to scrape the store page for a specific chrome extension, this scraper requires a direct url to that page. These urls are to be passed as a parameter to the `scrapeApp` function. 165 | 166 | ```js 167 | scraper.scrapeApp('url-to-some-app').then( 168 | (res) => console.log(res), 169 | (err) => console.log(err) 170 | ); 171 | ``` 172 | 173 | Example Response 174 | 175 | ```json 176 | { 177 | "header": { 178 | "title": "Autosave webpage", 179 | "offeredBy": "offered by mtcutler1", 180 | "userCount": "48", 181 | "rating": "3.5", 182 | "ratingCount": 4, 183 | "imgURL": "https://lh3.googleusercontent.com/4jyS9mGYDUFs2KL52Xfg_I9EzkUIzlCboTp5Dvqv-vKrUWhoz9tNCWR4lPfNFneM2JFmgNrkCkc=w26-h26-e365" 184 | }, 185 | "overview": { 186 | "summary": "Save ... a scheduled…", 187 | "description": "Save ... 
stay updated", 188 | "version": "0.1", 189 | "lastUpdatedDate": "January 24, 2018", 190 | "size": "178KiB", 191 | "language": "English (United States)", 192 | "screenshotURLs": [ 193 | "https://lh3.googleusercontent.com/nBXzgn-La5s3HyynhHWmnJwAasC1KUMK8GfqCVnOqL-CEGhLOcVNGaNPYUQBv180-ypWPQN2xc8=w640-h400-e365", 194 | "https://lh3.googleusercontent.com/nBXzgn-La5s3HyynhHWmnJwAasC1KUMK8GfqCVnOqL-CEGhLOcVNGaNPYUQBv180-ypWPQN2xc8=w640-h400-e365", 195 | "https://lh3.googleusercontent.com/nBXzgn-La5s3HyynhHWmnJwAasC1KUMK8GfqCVnOqL-CEGhLOcVNGaNPYUQBv180-ypWPQN2xc8=w120-h90-e365" 196 | ], 197 | "additionalInfo": [] 198 | }, 199 | "reviews": [ 200 | { 201 | "displayName": "Jeffrey", 202 | "profileImageURL": "//www.gstatic.com/s2/contacts/images/NoPicture.gif", 203 | "displayNameURL": "https://plus.google.com/110338040265199312388", 204 | "timestamp": "Modified Mar 21, 2018", 205 | "ratingString": "4 stars (Liked it)", 206 | "rating": 4, 207 | "comment": "Seemed to only work with one tab...would be perfect if it works on multiple tabs simultaneously" 208 | } 209 | ] 210 | } 211 | } 212 | ``` 213 | -------------------------------------------------------------------------------- /data/search_categories.json: -------------------------------------------------------------------------------- 1 | { 2 | "all" : "extensions", 3 | "accessibility" : "ext/22-accessibility", 4 | "blogging" : "ext/10-blogging", 5 | "byGoogle" : "ext/15-by-google", 6 | "developerTools" : "ext/11-web-development", 7 | "fun" : "ext/14-fun", 8 | "newsAndWeather" : "ext/6-news", 9 | "photos" : "ext/28-photos", 10 | "productivity" : "ext/7-productivity", 11 | "searchTools" : "ext/38-search-tools", 12 | "shopping" : "ext/12-shopping", 13 | "socialAndCommunication" : "ext/1-communication", 14 | "sports" : "" 15 | } -------------------------------------------------------------------------------- /data/search_features.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"offline" : "_feature=offline", 3 | "byGoogle" : "_feature=google", 4 | "free" : "_feature=free", 5 | "android" : "_feature=android", 6 | "googleDrive" : "_feature=drive" 7 | } -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Export cheerio (with ) 3 | */ 4 | 5 | exports = module.exports = require('./src/chrome-web-store-scraper.js'); 6 | 7 | /* 8 | Export the version 9 | */ 10 | 11 | exports.version = require('./package.json').version; 12 | -------------------------------------------------------------------------------- /main.js: -------------------------------------------------------------------------------- 1 | const ChromeWebScraper = require('./src/chrome-web-store-scraper') 2 | const scraper = new ChromeWebScraper() 3 | 4 | const fs = require('fs') 5 | 6 | async function main () { 7 | // scraper.search('scraper',{scrollAttempts:200,locale:'da'}).then( 8 | // (res) => console.log(res[0]), 9 | // (err) => console.log(err) 10 | // ); 11 | 12 | // scraper.scrapeApp('https://chrome.google.com/webstore/detail/vidiq-vision-for-youtube/pachckjkecffpdphbpmfolblodfkgbhl') 13 | // scraper.scrapeApp('https://chrome.google.com/webstore/detail/scraper-crawler-v3/kbhidgghgflkbalnkoeokcipocmigkfh') 14 | scraper.scrapeApp('https://chrome.google.com/webstore/detail/restlet-client-rest-api-t/aejoelaoggembcahagimdiliamlcdmfm') 15 | .then( 16 | (res) => { 17 | fs.writeFile(`${res.header.title.replace(' ', '_')}_scraping.json`,JSON.stringify(res,null,2),() => console.log('Output Saved')), 18 | console.log('Complete:', res.reviews, 'Number of reviews:', res.reviews.length) 19 | }, 20 | (err) => console.log(err) 21 | ) 22 | } 23 | 24 | main() 25 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"chrome-web-store-scraper", 3 | "version": "1.1.1", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "@types/node": { 8 | "version": "10.5.8", 9 | "resolved": "https://registry.npmjs.org/@types/node/-/node-10.5.8.tgz", 10 | "integrity": "sha512-sWSjw+bYW/2W+1V3m8tVsm9PKJcxk3NHN7oRqNUfEdofKg0Imbdu1dQbFvLKjZQXEDXRN6IfSMACjJ7Wv4NGCQ==" 11 | }, 12 | "assertion-error": { 13 | "version": "1.1.0", 14 | "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", 15 | "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==" 16 | }, 17 | "balanced-match": { 18 | "version": "1.0.0", 19 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", 20 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" 21 | }, 22 | "boolbase": { 23 | "version": "1.0.0", 24 | "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", 25 | "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" 26 | }, 27 | "brace-expansion": { 28 | "version": "1.1.11", 29 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", 30 | "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", 31 | "requires": { 32 | "balanced-match": "^1.0.0", 33 | "concat-map": "0.0.1" 34 | } 35 | }, 36 | "chai": { 37 | "version": "4.1.2", 38 | "resolved": "https://registry.npmjs.org/chai/-/chai-4.1.2.tgz", 39 | "integrity": "sha1-D2RYS6ZC8PKs4oBiefTwbKI61zw=", 40 | "requires": { 41 | "assertion-error": "^1.0.1", 42 | "check-error": "^1.0.1", 43 | "deep-eql": "^3.0.0", 44 | "get-func-name": "^2.0.0", 45 | "pathval": "^1.0.0", 46 | "type-detect": "^4.0.0" 47 | } 48 | }, 49 | "check-error": { 50 | "version": "1.0.2", 51 | "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz", 52 | "integrity": "sha1-V00xLt2Iu13YkS6Sht1sCu1KrII=" 53 | }, 54 | "cheerio": { 55 | "version": "1.0.0-rc.2", 56 | 
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.2.tgz", 57 | "integrity": "sha1-S59TqBsn5NXawxwP/Qz6A8xoMNs=", 58 | "requires": { 59 | "css-select": "~1.2.0", 60 | "dom-serializer": "~0.1.0", 61 | "entities": "~1.1.1", 62 | "htmlparser2": "^3.9.1", 63 | "lodash": "^4.15.0", 64 | "parse5": "^3.0.1" 65 | } 66 | }, 67 | "concat-map": { 68 | "version": "0.0.1", 69 | "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", 70 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" 71 | }, 72 | "core-js": { 73 | "version": "2.3.0", 74 | "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.3.0.tgz", 75 | "integrity": "sha1-+rg/uwstjchfpjbEudNMdUIMbWU=" 76 | }, 77 | "core-util-is": { 78 | "version": "1.0.2", 79 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 80 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 81 | }, 82 | "css-select": { 83 | "version": "1.2.0", 84 | "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", 85 | "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=", 86 | "requires": { 87 | "boolbase": "~1.0.0", 88 | "css-what": "2.1", 89 | "domutils": "1.5.1", 90 | "nth-check": "~1.0.1" 91 | } 92 | }, 93 | "css-what": { 94 | "version": "2.1.0", 95 | "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", 96 | "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" 97 | }, 98 | "deep-eql": { 99 | "version": "3.0.1", 100 | "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz", 101 | "integrity": "sha512-+QeIQyN5ZuO+3Uk5DYh6/1eKO0m0YmJFGNmFHGACpf1ClL1nmlV/p4gNgbl2pJGxgXb4faqo6UE+M5ACEMyVcw==", 102 | "requires": { 103 | "type-detect": "^4.0.0" 104 | } 105 | }, 106 | "dom-serializer": { 107 | "version": "0.1.0", 108 | "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", 109 | "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=", 110 | "requires": { 111 | "domelementtype": "~1.1.1", 112 | "entities": "~1.1.1" 113 | }, 114 
| "dependencies": { 115 | "domelementtype": { 116 | "version": "1.1.3", 117 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz", 118 | "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs=" 119 | } 120 | } 121 | }, 122 | "domelementtype": { 123 | "version": "1.3.0", 124 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.0.tgz", 125 | "integrity": "sha1-sXrtguirWeUt2cGbF1bg/BhyBMI=" 126 | }, 127 | "domhandler": { 128 | "version": "2.4.2", 129 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", 130 | "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", 131 | "requires": { 132 | "domelementtype": "1" 133 | } 134 | }, 135 | "domutils": { 136 | "version": "1.5.1", 137 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", 138 | "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", 139 | "requires": { 140 | "dom-serializer": "0", 141 | "domelementtype": "1" 142 | } 143 | }, 144 | "entities": { 145 | "version": "1.1.1", 146 | "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.1.tgz", 147 | "integrity": "sha1-blwtClYhtdra7O+AuQ7ftc13cvA=" 148 | }, 149 | "es6-promise": { 150 | "version": "3.0.2", 151 | "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.0.2.tgz", 152 | "integrity": "sha1-AQ1YWEI6XxGJeWZfRkhqlcbuK7Y=" 153 | }, 154 | "fs.realpath": { 155 | "version": "1.0.0", 156 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 157 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" 158 | }, 159 | "get-func-name": { 160 | "version": "2.0.0", 161 | "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", 162 | "integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE=" 163 | }, 164 | "glob": { 165 | "version": "7.1.2", 166 | "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", 167 | "integrity": 
"sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", 168 | "requires": { 169 | "fs.realpath": "^1.0.0", 170 | "inflight": "^1.0.4", 171 | "inherits": "2", 172 | "minimatch": "^3.0.4", 173 | "once": "^1.3.0", 174 | "path-is-absolute": "^1.0.0" 175 | } 176 | }, 177 | "htmlparser2": { 178 | "version": "3.9.2", 179 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz", 180 | "integrity": "sha1-G9+HrMoPP55T+k/M6w9LTLsAszg=", 181 | "requires": { 182 | "domelementtype": "^1.3.0", 183 | "domhandler": "^2.3.0", 184 | "domutils": "^1.5.1", 185 | "entities": "^1.1.1", 186 | "inherits": "^2.0.1", 187 | "readable-stream": "^2.0.2" 188 | } 189 | }, 190 | "immediate": { 191 | "version": "3.0.6", 192 | "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", 193 | "integrity": "sha1-nbHb0Pr43m++D13V5Wu2BigN5ps=" 194 | }, 195 | "inflight": { 196 | "version": "1.0.6", 197 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 198 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", 199 | "requires": { 200 | "once": "^1.3.0", 201 | "wrappy": "1" 202 | } 203 | }, 204 | "inherits": { 205 | "version": "2.0.3", 206 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 207 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 208 | }, 209 | "isarray": { 210 | "version": "1.0.0", 211 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 212 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 213 | }, 214 | "jszip": { 215 | "version": "3.1.5", 216 | "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.1.5.tgz", 217 | "integrity": "sha512-5W8NUaFRFRqTOL7ZDDrx5qWHJyBXy6velVudIzQUSoqAAYqzSh2Z7/m0Rf1QbmQJccegD0r+YZxBjzqoBiEeJQ==", 218 | "requires": { 219 | "core-js": "~2.3.0", 220 | "es6-promise": "~3.0.2", 221 | "lie": "~3.1.0", 222 | "pako": "~1.0.2", 223 | "readable-stream": "~2.0.6" 224 | }, 225 | "dependencies": { 226 | "process-nextick-args": { 227 
| "version": "1.0.7", 228 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", 229 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=" 230 | }, 231 | "readable-stream": { 232 | "version": "2.0.6", 233 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.0.6.tgz", 234 | "integrity": "sha1-j5A0HmilPMySh4jaz80Rs265t44=", 235 | "requires": { 236 | "core-util-is": "~1.0.0", 237 | "inherits": "~2.0.1", 238 | "isarray": "~1.0.0", 239 | "process-nextick-args": "~1.0.6", 240 | "string_decoder": "~0.10.x", 241 | "util-deprecate": "~1.0.1" 242 | } 243 | }, 244 | "string_decoder": { 245 | "version": "0.10.31", 246 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", 247 | "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" 248 | } 249 | } 250 | }, 251 | "lie": { 252 | "version": "3.1.1", 253 | "resolved": "https://registry.npmjs.org/lie/-/lie-3.1.1.tgz", 254 | "integrity": "sha1-mkNrLMd0bKWd56QfpGmz77dr2H4=", 255 | "requires": { 256 | "immediate": "~3.0.5" 257 | } 258 | }, 259 | "lodash": { 260 | "version": "4.17.10", 261 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.10.tgz", 262 | "integrity": "sha512-UejweD1pDoXu+AD825lWwp4ZGtSwgnpZxb3JDViD7StjQz+Nb/6l093lx4OQ0foGWNRoc19mWy7BzL+UAK2iVg==" 263 | }, 264 | "minimatch": { 265 | "version": "3.0.4", 266 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", 267 | "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", 268 | "requires": { 269 | "brace-expansion": "^1.1.7" 270 | } 271 | }, 272 | "nth-check": { 273 | "version": "1.0.1", 274 | "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.1.tgz", 275 | "integrity": "sha1-mSms32KPwsQQmN6rgqxYDPFJquQ=", 276 | "requires": { 277 | "boolbase": "~1.0.0" 278 | } 279 | }, 280 | "once": { 281 | "version": "1.4.0", 282 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 
283 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 284 | "requires": { 285 | "wrappy": "1" 286 | } 287 | }, 288 | "os-tmpdir": { 289 | "version": "1.0.2", 290 | "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", 291 | "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=" 292 | }, 293 | "pako": { 294 | "version": "1.0.6", 295 | "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.6.tgz", 296 | "integrity": "sha512-lQe48YPsMJAig+yngZ87Lus+NF+3mtu7DVOBu6b/gHO1YpKwIj5AWjZ/TOS7i46HD/UixzWb1zeWDZfGZ3iYcg==" 297 | }, 298 | "parse5": { 299 | "version": "3.0.3", 300 | "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", 301 | "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", 302 | "requires": { 303 | "@types/node": "*" 304 | } 305 | }, 306 | "path-is-absolute": { 307 | "version": "1.0.1", 308 | "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", 309 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" 310 | }, 311 | "pathval": { 312 | "version": "1.1.0", 313 | "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz", 314 | "integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA=" 315 | }, 316 | "process-nextick-args": { 317 | "version": "2.0.0", 318 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", 319 | "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" 320 | }, 321 | "readable-stream": { 322 | "version": "2.3.6", 323 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", 324 | "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", 325 | "requires": { 326 | "core-util-is": "~1.0.0", 327 | "inherits": "~2.0.3", 328 | "isarray": "~1.0.0", 329 | "process-nextick-args": "~2.0.0", 330 | "safe-buffer": "~5.1.1", 331 | "string_decoder": "~1.1.1", 
332 | "util-deprecate": "~1.0.1" 333 | } 334 | }, 335 | "rimraf": { 336 | "version": "2.6.2", 337 | "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", 338 | "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", 339 | "requires": { 340 | "glob": "^7.0.5" 341 | } 342 | }, 343 | "safe-buffer": { 344 | "version": "5.1.2", 345 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", 346 | "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" 347 | }, 348 | "sax": { 349 | "version": "1.2.4", 350 | "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", 351 | "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" 352 | }, 353 | "selenium-webdriver": { 354 | "version": "4.0.0-alpha.1", 355 | "resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.0.0-alpha.1.tgz", 356 | "integrity": "sha512-z88rdjHAv3jmTZ7KSGUkTvo4rGzcDGMq0oXWHNIDK96Gs31JKVdu9+FMtT4KBrVoibg8dUicJDok6GnqqttO5Q==", 357 | "requires": { 358 | "jszip": "^3.1.3", 359 | "rimraf": "^2.5.4", 360 | "tmp": "0.0.30", 361 | "xml2js": "^0.4.17" 362 | } 363 | }, 364 | "string_decoder": { 365 | "version": "1.1.1", 366 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", 367 | "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", 368 | "requires": { 369 | "safe-buffer": "~5.1.0" 370 | } 371 | }, 372 | "tmp": { 373 | "version": "0.0.30", 374 | "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.30.tgz", 375 | "integrity": "sha1-ckGdSovn1s51FI/YsyTlk6cRwu0=", 376 | "requires": { 377 | "os-tmpdir": "~1.0.1" 378 | } 379 | }, 380 | "type-detect": { 381 | "version": "4.0.8", 382 | "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", 383 | "integrity": 
"sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==" 384 | }, 385 | "util-deprecate": { 386 | "version": "1.0.2", 387 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 388 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 389 | }, 390 | "wrappy": { 391 | "version": "1.0.2", 392 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 393 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" 394 | }, 395 | "xml2js": { 396 | "version": "0.4.19", 397 | "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.4.19.tgz", 398 | "integrity": "sha512-esZnJZJOiJR9wWKMyuvSE1y6Dq5LCuJanqhxslH2bxM6duahNZ+HMpCLhBQGZkbX6xRf8x1Y2eJlgt2q3qo49Q==", 399 | "requires": { 400 | "sax": ">=0.6.0", 401 | "xmlbuilder": "~9.0.1" 402 | } 403 | }, 404 | "xmlbuilder": { 405 | "version": "9.0.7", 406 | "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz", 407 | "integrity": "sha1-Ey7mPS7FVlxVfiD0wi35rKaGsQ0=" 408 | } 409 | } 410 | } 411 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "chrome-web-store-scraper", 3 | "version": "1.1.1", 4 | "description": "A scraper for the Google Chrome extension metadata on the chrome web store", 5 | "main": "index.js", 6 | "directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "test": "mocha test/test.js" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git+https://github.com/AdamSlack/chrome-web-store-scraper.git" 15 | }, 16 | "author": "Adam Slack", 17 | "license": "MIT", 18 | "keywords": [ 19 | "google", 20 | "chrome", 21 | "extensions", 22 | "selenium", 23 | "scraper", 24 | "web-scraper", 25 | "web scraper", 26 | "chrome-extensions" 27 | ], 28 | "bugs": { 29 | "url": "https://github.com/AdamSlack/chrome-web-store-scraper/issues" 30 | }, 31 | "homepage": 
"https://github.com/AdamSlack/chrome-web-store-scraper#readme", 32 | "dependencies": { 33 | "chai": "^4.1.2", 34 | "cheerio": "^1.0.0-rc.2", 35 | "selenium-webdriver": "^4.0.0-alpha.1" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /selenium: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C) 2007 The Android Open Source Project 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script was adapted from the one provided by apktool. 18 | # https://ibotpeaches.github.io/Apktool/ 19 | 20 | prog="$0" 21 | 22 | while [ -h "${prog}" ]; do 23 | newProg=`/bin/ls -ld "${prog}"` 24 | 25 | newProg=`expr "${newProg}" : ".* -> \(.*\)$"` 26 | if expr "x${newProg}" : 'x/' >/dev/null; then 27 | prog="${newProg}" 28 | else 29 | progdir=`dirname "${prog}"` 30 | prog="${progdir}/${newProg}" 31 | fi 32 | done 33 | oldwd=`pwd` 34 | progdir=`dirname "${prog}"` 35 | cd "${progdir}" 36 | progdir=`pwd` 37 | prog="${progdir}"/`basename "${prog}"` 38 | cd "${oldwd}" 39 | 40 | jarfile=selenium.jar 41 | libdir="$progdir" 42 | if [ ! -r "$libdir/$jarfile" ] 43 | then 44 | echo `basename "$prog"`": can't find $jarfile" 45 | exit 1 46 | fi 47 | 48 | javaOpts="" 49 | 50 | # If you want DX to have more memory when executing, uncomment the following 51 | # line and adjust the value accordingly. 
const cheerio = require('cheerio')

const { Builder, By, Capabilities, until } = require('selenium-webdriver')

// Maximum time, in milliseconds, to wait for an element to appear on a page.
const WAIT_THRESHOLD = 10000

/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (time) => new Promise((resolve) => setTimeout(resolve, time))

/**
 * Wait until at least one element matches the locator.
 *
 * @param {object} driver - Selenium WebDriver instance.
 * @param {object} locator - Selenium `By` locator.
 * @param {number} [timeout=WAIT_THRESHOLD] - Maximum wait in milliseconds.
 * @returns {Promise<Array>} The matching elements, or an empty array if the
 *   wait timed out. Any other driver error is re-thrown.
 */
async function waitForElements (driver, locator, timeout = WAIT_THRESHOLD) {
  try {
    return await driver.wait(until.elementsLocated(locator), timeout)
  } catch (err) {
    // A timeout is an expected "nothing there" outcome, not a failure.
    if (err && err.name === 'TimeoutError') {
      return []
    }
    throw err
  }
}

/**
 * Scrapes extension metadata and search results from the Chrome Web Store
 * using a headless Chrome instance driven by Selenium.
 */
class ChromeWebStoreScraper {
  constructor () {
    this.searchCategories = require('../data/search_categories.json')
    this.searchFeatures = require('../data/search_features.json')
    // Created lazily on first use and then reused between calls.
    this.driver = undefined
  }

  /**
   * Scrape the header, overview and reviews of a single store listing.
   *
   * @param {string} appURL - URL of the extension's store page.
   * @returns {Promise<object>} `{ header, overview, reviews }`, or `{}` when
   *   the scrape fails (the error is logged, not thrown).
   */
  async scrapeApp (appURL) {
    let details = {}
    try {
      if (!this.driver) {
        this.driver = await this.createChromeDriver()
      }
      await this.driver.get(appURL)
      details = await this.scrapeDetails(this.driver)
    } catch (err) {
      console.log('Scraping app failed', err)
    }
    return details
  }

  /**
   * Scrape every section of the listing currently loaded in the driver.
   *
   * @param {object} driver - Selenium WebDriver positioned on a listing page.
   * @returns {Promise<object>} `{ header, overview, reviews }`.
   */
  async scrapeDetails (driver) {
    // Sequential on purpose: all three share one browser page, and
    // scrapeReviews changes the active tab.
    const header = await this.scrapeAppHeader(driver)
    const overview = await this.scrapeOverview(driver)
    const reviews = await this.scrapeReviews(driver)
    return { header, overview, reviews }
  }

  /**
   * Scrape all review pages of the listing currently loaded in the driver.
   *
   * @param {object} driver - Selenium WebDriver positioned on a listing page.
   * @param {Array<object>} [previousReviews] - Reviews already collected;
   *   newly scraped reviews are appended to a copy of this list.
   * @returns {Promise<Array<object>>} Every review collected. Always an
   *   array, even when no review elements could be found.
   */
  async scrapeReviews (driver, previousReviews) {
    let collected = previousReviews || []

    // Open the reviews tab once; paging happens inside that tab.
    const reviewTabs = await driver.findElements(By.css('.e-f-b-L'))
    if (reviewTabs.length) {
      await reviewTabs[0].click()
    } else {
      console.log('Review Button Not Found')
    }

    const nextButtonLocator = By.css('.Aa.dc-se')

    while (true) {
      let reviewElements = []
      try {
        reviewElements = await waitForElements(driver, By.css('.ba-fb > div'))
        if (!reviewElements.length) {
          // The store has been seen to use a second markup for reviews.
          reviewElements = await waitForElements(driver, By.css('.ba-bc-xb'))
        }
      } catch (err) {
        console.log(`Could not find review elements: ${err}`)
        return collected
      }

      if (!reviewElements.length) {
        console.log('Could not find review elements using either selector')
        return collected
      }

      const pageReviews = []
      for (const element of reviewElements) {
        const html = await element.getAttribute('outerHTML')
        const review = this.parseAppReviewHTML(html)
        // Keep the review when at least one field carries real data
        // ('' and -1 are the parser's "missing" placeholders).
        const hasContent = Object.keys(review).some((key) => {
          const val = review[key]
          return val !== '' && val !== -1
        })
        if (hasContent) {
          pageReviews.push(review)
        }
      }

      collected = collected.concat(pageReviews)
      console.log(`Scraped ${collected.length} (+${pageReviews.length}) Reviews`)

      try {
        const nextButton = await driver.wait(until.elementLocated(nextButtonLocator), WAIT_THRESHOLD)
        await driver.wait(until.elementIsVisible(nextButton), WAIT_THRESHOLD)
        await nextButton.click()
      } catch (err) {
        // No usable "next" button: treat this as the last page.
        console.log(`Could not advance to next page of reviews: ${err}`)
        return collected
      }
      // NOTE(review): nothing here waits for the next page to render, so the
      // following iteration may re-read the previous page — confirm against
      // the live site.
    }
  }

  /**
   * Extract one review from its HTML fragment.
   *
   * @param {string} html - Outer HTML of a single review element.
   * @returns {object} Review fields; missing strings are `''` and a missing
   *   rating is `-1`.
   */
  parseAppReviewHTML (html) {
    const $ = cheerio.load(html)
    const profileImageURL = $('.Lg-ee-A-O-xb').first().attr('src')
    const displayNameField = $('.ba-bc-Xb-K').first().find('a').first()
    const displayName = displayNameField.text()
    const displayNameURL = displayNameField.attr('href')
    const timestamp = $('.ba-Eb-Nf').first().text()
    const ratingString = $('.rsw-stars').first().attr('title')
    const comment = $('.ba-Eb-ba').first().text()

    let rating = -1
    if (ratingString) {
      // Only the first character of the title is read as the rating.
      rating = parseInt(ratingString.charAt(0), 10)
    }

    return {
      displayName: displayName || '',
      profileImageURL: profileImageURL || '',
      displayNameURL: displayNameURL || '',
      timestamp: timestamp || '',
      ratingString: ratingString || '',
      rating: rating || -1, // NaN (unparsable) and 0 both fall back to -1
      comment: comment || ''
    }
  }

  /**
   * Scrape the overview section of the listing currently loaded.
   *
   * @param {object} driver - Selenium WebDriver positioned on a listing page.
   * @returns {Promise<object>} Parsed overview, or `{}` if the section did
   *   not appear within WAIT_THRESHOLD milliseconds.
   */
  async scrapeOverview (driver) {
    const sections = await waitForElements(driver, By.css('.h-e-f-b-Qe'))
    if (!sections.length) {
      return {}
    }
    const html = await sections[0].getAttribute('outerHTML')
    return this.parseAppOverviewHTML(html)
  }

  /**
   * Extract overview fields from the overview section's HTML.
   *
   * @param {string} html - Outer HTML of the overview section.
   * @returns {object} Overview fields; missing strings are `''`, missing
   *   lists are `[]`, and `language` is `'multiple'` when the page shows a
   *   "See all" link.
   */
  parseAppOverviewHTML (html) {
    const $ = cheerio.load(html)
    const summary = $('.C-b-p-j-Pb').first().text()
    const description = $('.C-b-p-j-Oa').first().text()
    const version = $('.C-b-p-D-Xe.h-C-b-p-D-md').first().text()
    const lastUpdatedDate = $('.C-b-p-D-Xe.h-C-b-p-D-xh-hh').first().text()
    const size = $('.C-b-p-D-Xe.h-C-b-p-D-za').first().text()
    const language = $('.C-b-p-D-Xe.h-C-b-p-D-Ba').first().text()

    const screenshotURLs = $('.h-A-Ce-ze-Yf.A-Ce-ze-Yf').map(function () {
      return $(this).attr('src')
    }).get()

    const additionalInfo = $('.C-b-p-rc-D-R').map(function () {
      return {
        text: $(this).text(),
        href: $(this).attr('href')
      }
    }).get()

    return {
      summary: summary || '',
      description: description || '',
      version: version || '',
      lastUpdatedDate: lastUpdatedDate || '',
      language: language ? (language.includes('See all') ? 'multiple' : language) : '',
      size: size || '',
      screenshotURLs: screenshotURLs || [],
      additionalInfo: additionalInfo || []
    }
  }

  /**
   * Scrape the header section of the listing currently loaded.
   *
   * @param {object} driver - Selenium WebDriver positioned on a listing page.
   * @returns {Promise<object>} Parsed header, or `{}` if the section did not
   *   appear within WAIT_THRESHOLD milliseconds.
   */
  async scrapeAppHeader (driver) {
    const sections = await waitForElements(driver, By.css('.e-f-o'))
    if (!sections.length) {
      return {}
    }
    const html = await sections[0].getAttribute('outerHTML')
    return this.parseAppHeaderHTML(html)
  }

  /**
   * Extract header fields from the header section's HTML.
   *
   * @param {string} html - Outer HTML of the header section.
   * @returns {object} `{ title, offeredBy, userCount, rating, ratingCount,
   *   imgURL }`. The counts are `NaN` when no digits are present; `rating`
   *   is the raw attribute string (or `undefined`).
   */
  parseAppHeaderHTML (html) {
    const $ = cheerio.load(html)
    const imgURL = $('img').first().attr('src')
    const title = $('.e-f-w').first().text()
    const offeredBy = $('.e-f-Me').first().text()
    const rating = $('.rsw-stars').first().attr('g:rating_override')
    const userCount = parseInt($('.e-f-ih').first().text().replace(/\D/g, ''), 10)
    const ratingCount = parseInt($('.KnRoYd-N-nd').first().text().replace(/\D/g, ''), 10)

    return { title, offeredBy, userCount, rating, ratingCount, imgURL }
  }

  /**
   * Extract one search result from its tile HTML.
   *
   * @param {string} html - Outer HTML of a single search-result tile.
   * @returns {object} Result fields; missing strings are `''` and missing
   *   numbers are `-1`.
   */
  parseSearchResultHTML (html) {
    const $ = cheerio.load(html)

    const title = $('.a-na-d-w').first().text()
    const description = $('.a-na-d-Oa').first().text()
    const author = $('.oc').first().text()
    const category = $('.a-na-d-ea').text()
    const storeURL = $('.h-Ja-d-Ac.a-u').first().attr('href')
    const numberOfRatings = $('.q-N-nd').first().text()
    const rating = $('.rsw-stars').first().attr('g:rating_override')

    return {
      title: title || '',
      description: description || '',
      author: author || '',
      category: category || '',
      storeURL: storeURL || '',
      rating: rating ? parseFloat(rating) : -1,
      numberOfRatings: numberOfRatings ? parseInt(numberOfRatings.replace(/\D/g, ''), 10) : -1
    }
  }

  /**
   * Scroll the loaded search page to trigger lazy loading, then parse every
   * result tile.
   *
   * @param {object} driver - Selenium WebDriver positioned on a search page.
   * @param {number} throttle - Milliseconds to wait after scrolling.
   * @param {number} scrollAttempts - Number of 1000px scroll steps.
   * @returns {Promise<Array<object>>} Parsed search results.
   * @throws {Error} If no result tiles appear within WAIT_THRESHOLD ms.
   */
  async parseSearchBody (driver, throttle, scrollAttempts) {
    console.log('Processing Selenium Search Page')

    // wait for page to load, scroll, wait scroll, wait scroll...
    for (let i = 0; i < scrollAttempts; i++) {
      await sleep(50)
      await driver.executeScript('window.scrollBy(0,1000)')
    }

    console.log('Waiting for results to load.')
    await sleep(throttle)

    const tiles = await waitForElements(
      driver,
      By.css('.a-d-na.a-d.webstore-test-wall-tile.a-d-zc.Xd.dd')
    )
    if (!tiles.length) {
      throw new Error('Unable to find search body.')
    }

    console.log(`${tiles.length} Found. Extracting Text and HTML`)
    const searchResultsJSON = []
    for (const tile of tiles) {
      if (typeof tile.getAttribute !== 'function') {
        console.log('No getAttribute Method Found a Search Result.')
        break
      }
      if (typeof tile.getText !== 'function') {
        console.log('No getText Method Found on a Search Result.')
        break
      }
      const html = await tile.getAttribute('outerHTML')
      searchResultsJSON.push(this.parseSearchResultHTML(html))
    }
    return searchResultsJSON
  }

  /**
   * Build the store search URL for a search term and filters.
   *
   * @param {string} searchString - Free-text search term.
   * @param {object} [options]
   * @param {string} [options.searchCategory='all'] - Key of `searchCategories`.
   * @param {string[]} [options.searchFeatures=[]] - Keys of `searchFeatures`.
   * @param {string} [options.locale='en-gb'] - Value of the `hl` parameter.
   * @returns {string} Encoded search URL.
   */
  buildSearchURLString (searchString, options = {}) {
    const {
      searchCategory = 'all',
      searchFeatures = [],
      locale = 'en-gb'
    } = options

    const baseURL = 'https://chrome.google.com/webstore/search'
    let query = `hl=${locale}&_category=${this.searchCategories[searchCategory]}`

    if (searchFeatures.length > 0) {
      query = `${query}&${searchFeatures.map((f) => this.searchFeatures[f]).join('&')}`
    }

    // The term is a single path segment, so reserved characters in it
    // ('?', '#', '/', '&') must be escaped or they would cut the URL short.
    return `${baseURL}/${encodeURIComponent(searchString)}?${encodeURI(query)}`
  }

  /**
   * Search the store and return the parsed result tiles.
   *
   * @param {string} searchString - Free-text search term.
   * @param {object} [options]
   * @param {string} [options.searchCategory='all'] - Key of `searchCategories`.
   * @param {string[]} [options.searchFeatures=[]] - Keys of `searchFeatures`.
   * @param {number} [options.throttle=3000] - Milliseconds to wait after
   *   scrolling before reading results.
   * @param {number} [options.scrollAttempts=100] - Number of scroll steps.
   * @param {string} [options.locale='en-gb'] - Value of the `hl` parameter.
   * @returns {Promise<Array<object>>} Search results; `[]` if the scrape
   *   itself fails (the error is logged).
   * @throws {Error} (as a rejection) when a filter or category is invalid.
   */
  async search (searchString, options = {}) {
    const {
      searchCategory = 'all',
      searchFeatures = [],
      throttle = 3000,
      scrollAttempts = 100,
      locale = 'en-gb'
    } = options

    // Check options are valid.
    if (!Array.isArray(searchFeatures)) {
      throw new Error('Search Filter Must Be Provided as an Array')
    }

    if (searchFeatures.some((f) => !this.searchFeatures[f])) {
      throw new Error('Invalid Search Filter Provided.')
    }

    if (!this.searchCategories[searchCategory]) {
      throw new Error('Invalid Search Category Provided.')
    }

    // build the encoded search URL.
    const searchURL = this.buildSearchURLString(searchString, {
      searchCategory,
      searchFeatures,
      locale
    })

    let searchResults = []
    try {
      if (!this.driver) {
        this.driver = await this.createChromeDriver()
      }
      await this.driver.get(searchURL)
      searchResults = await this.parseSearchBody(this.driver, throttle, scrollAttempts)
    } catch (err) {
      console.log(err)
    }
    return searchResults
  }

  /**
   * Build a headless Chrome WebDriver.
   *
   * @returns {Promise<object>} A Selenium WebDriver instance.
   */
  async createChromeDriver () {
    const chromeOptions = {
      args: ['--headless', '--disable-gpu', '--no-sandbox']
    }
    const chromeCapabilities = Capabilities.chrome()
    // NOTE(review): newer ChromeDriver releases expect the vendor-prefixed
    // 'goog:chromeOptions' key — confirm against the driver version in use.
    chromeCapabilities.set('chromeOptions', chromeOptions)

    console.log('Building Selenium Driver')

    return new Builder()
      .forBrowser('chrome')
      .withCapabilities(chromeCapabilities)
      .build()
  }
}

module.exports = ChromeWebStoreScraper
"sha512-f+qQR5lLCB8UPhtk8Xm8RQvbR4ycD7MOsdiuAEQCYpz44bBx2g7JL0+iYBcjl9J7d0KT1sX2g0VGNeHfw+GXpg==" 11 | }, 12 | "assertion-error": { 13 | "version": "1.1.0", 14 | "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", 15 | "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==" 16 | }, 17 | "balanced-match": { 18 | "version": "1.0.0", 19 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", 20 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" 21 | }, 22 | "boolbase": { 23 | "version": "1.0.0", 24 | "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", 25 | "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" 26 | }, 27 | "brace-expansion": { 28 | "version": "1.1.11", 29 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", 30 | "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", 31 | "requires": { 32 | "balanced-match": "^1.0.0", 33 | "concat-map": "0.0.1" 34 | } 35 | }, 36 | "chai": { 37 | "version": "4.1.2", 38 | "resolved": "https://registry.npmjs.org/chai/-/chai-4.1.2.tgz", 39 | "integrity": "sha1-D2RYS6ZC8PKs4oBiefTwbKI61zw=", 40 | "requires": { 41 | "assertion-error": "^1.0.1", 42 | "check-error": "^1.0.1", 43 | "deep-eql": "^3.0.0", 44 | "get-func-name": "^2.0.0", 45 | "pathval": "^1.0.0", 46 | "type-detect": "^4.0.0" 47 | } 48 | }, 49 | "check-error": { 50 | "version": "1.0.2", 51 | "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz", 52 | "integrity": "sha1-V00xLt2Iu13YkS6Sht1sCu1KrII=" 53 | }, 54 | "cheerio": { 55 | "version": "1.0.0-rc.2", 56 | "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.2.tgz", 57 | "integrity": "sha1-S59TqBsn5NXawxwP/Qz6A8xoMNs=", 58 | "requires": { 59 | "css-select": "~1.2.0", 60 | "dom-serializer": "~0.1.0", 61 | "entities": "~1.1.1", 62 | "htmlparser2": "^3.9.1", 
63 | "lodash": "^4.15.0", 64 | "parse5": "^3.0.1" 65 | } 66 | }, 67 | "chrome-web-store-scraper": { 68 | "version": "1.0.7", 69 | "resolved": "https://registry.npmjs.org/chrome-web-store-scraper/-/chrome-web-store-scraper-1.0.7.tgz", 70 | "integrity": "sha512-Tjy7QmTBeLL6VjjP+vXw/NNHglDLBYbRSNSB4fthLMrOi39O/xR74UST7DpMkpIPW++uK5VyHNOgcY6piCqI+w==", 71 | "requires": { 72 | "chai": "^4.1.2", 73 | "cheerio": "^1.0.0-rc.2", 74 | "selenium-webdriver": "^4.0.0-alpha.1" 75 | } 76 | }, 77 | "concat-map": { 78 | "version": "0.0.1", 79 | "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", 80 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" 81 | }, 82 | "core-js": { 83 | "version": "2.3.0", 84 | "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.3.0.tgz", 85 | "integrity": "sha1-+rg/uwstjchfpjbEudNMdUIMbWU=" 86 | }, 87 | "core-util-is": { 88 | "version": "1.0.2", 89 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 90 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 91 | }, 92 | "css-select": { 93 | "version": "1.2.0", 94 | "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", 95 | "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=", 96 | "requires": { 97 | "boolbase": "~1.0.0", 98 | "css-what": "2.1", 99 | "domutils": "1.5.1", 100 | "nth-check": "~1.0.1" 101 | } 102 | }, 103 | "css-what": { 104 | "version": "2.1.0", 105 | "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", 106 | "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" 107 | }, 108 | "deep-eql": { 109 | "version": "3.0.1", 110 | "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz", 111 | "integrity": "sha512-+QeIQyN5ZuO+3Uk5DYh6/1eKO0m0YmJFGNmFHGACpf1ClL1nmlV/p4gNgbl2pJGxgXb4faqo6UE+M5ACEMyVcw==", 112 | "requires": { 113 | "type-detect": "^4.0.0" 114 | } 115 | }, 116 | "dom-serializer": { 117 | "version": "0.1.0", 118 | "resolved": 
"https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", 119 | "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=", 120 | "requires": { 121 | "domelementtype": "~1.1.1", 122 | "entities": "~1.1.1" 123 | }, 124 | "dependencies": { 125 | "domelementtype": { 126 | "version": "1.1.3", 127 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz", 128 | "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs=" 129 | } 130 | } 131 | }, 132 | "domelementtype": { 133 | "version": "1.3.0", 134 | "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.0.tgz", 135 | "integrity": "sha1-sXrtguirWeUt2cGbF1bg/BhyBMI=" 136 | }, 137 | "domhandler": { 138 | "version": "2.4.2", 139 | "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", 140 | "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", 141 | "requires": { 142 | "domelementtype": "1" 143 | } 144 | }, 145 | "domutils": { 146 | "version": "1.5.1", 147 | "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", 148 | "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", 149 | "requires": { 150 | "dom-serializer": "0", 151 | "domelementtype": "1" 152 | } 153 | }, 154 | "entities": { 155 | "version": "1.1.1", 156 | "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.1.tgz", 157 | "integrity": "sha1-blwtClYhtdra7O+AuQ7ftc13cvA=" 158 | }, 159 | "es6-promise": { 160 | "version": "3.0.2", 161 | "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.0.2.tgz", 162 | "integrity": "sha1-AQ1YWEI6XxGJeWZfRkhqlcbuK7Y=" 163 | }, 164 | "fs.realpath": { 165 | "version": "1.0.0", 166 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 167 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" 168 | }, 169 | "get-func-name": { 170 | "version": "2.0.0", 171 | "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", 172 | "integrity": 
"sha1-6td0q+5y4gQJQzoGY2YCPdaIekE=" 173 | }, 174 | "glob": { 175 | "version": "7.1.2", 176 | "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", 177 | "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", 178 | "requires": { 179 | "fs.realpath": "^1.0.0", 180 | "inflight": "^1.0.4", 181 | "inherits": "2", 182 | "minimatch": "^3.0.4", 183 | "once": "^1.3.0", 184 | "path-is-absolute": "^1.0.0" 185 | } 186 | }, 187 | "htmlparser2": { 188 | "version": "3.9.2", 189 | "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz", 190 | "integrity": "sha1-G9+HrMoPP55T+k/M6w9LTLsAszg=", 191 | "requires": { 192 | "domelementtype": "^1.3.0", 193 | "domhandler": "^2.3.0", 194 | "domutils": "^1.5.1", 195 | "entities": "^1.1.1", 196 | "inherits": "^2.0.1", 197 | "readable-stream": "^2.0.2" 198 | } 199 | }, 200 | "immediate": { 201 | "version": "3.0.6", 202 | "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", 203 | "integrity": "sha1-nbHb0Pr43m++D13V5Wu2BigN5ps=" 204 | }, 205 | "inflight": { 206 | "version": "1.0.6", 207 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 208 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", 209 | "requires": { 210 | "once": "^1.3.0", 211 | "wrappy": "1" 212 | } 213 | }, 214 | "inherits": { 215 | "version": "2.0.3", 216 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 217 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 218 | }, 219 | "isarray": { 220 | "version": "1.0.0", 221 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 222 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 223 | }, 224 | "jszip": { 225 | "version": "3.1.5", 226 | "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.1.5.tgz", 227 | "integrity": "sha512-5W8NUaFRFRqTOL7ZDDrx5qWHJyBXy6velVudIzQUSoqAAYqzSh2Z7/m0Rf1QbmQJccegD0r+YZxBjzqoBiEeJQ==", 228 | "requires": { 229 | "core-js": "~2.3.0", 230 
| "es6-promise": "~3.0.2", 231 | "lie": "~3.1.0", 232 | "pako": "~1.0.2", 233 | "readable-stream": "~2.0.6" 234 | }, 235 | "dependencies": { 236 | "process-nextick-args": { 237 | "version": "1.0.7", 238 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", 239 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=" 240 | }, 241 | "readable-stream": { 242 | "version": "2.0.6", 243 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.0.6.tgz", 244 | "integrity": "sha1-j5A0HmilPMySh4jaz80Rs265t44=", 245 | "requires": { 246 | "core-util-is": "~1.0.0", 247 | "inherits": "~2.0.1", 248 | "isarray": "~1.0.0", 249 | "process-nextick-args": "~1.0.6", 250 | "string_decoder": "~0.10.x", 251 | "util-deprecate": "~1.0.1" 252 | } 253 | }, 254 | "string_decoder": { 255 | "version": "0.10.31", 256 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", 257 | "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" 258 | } 259 | } 260 | }, 261 | "lie": { 262 | "version": "3.1.1", 263 | "resolved": "https://registry.npmjs.org/lie/-/lie-3.1.1.tgz", 264 | "integrity": "sha1-mkNrLMd0bKWd56QfpGmz77dr2H4=", 265 | "requires": { 266 | "immediate": "~3.0.5" 267 | } 268 | }, 269 | "lodash": { 270 | "version": "4.17.10", 271 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.10.tgz", 272 | "integrity": "sha512-UejweD1pDoXu+AD825lWwp4ZGtSwgnpZxb3JDViD7StjQz+Nb/6l093lx4OQ0foGWNRoc19mWy7BzL+UAK2iVg==" 273 | }, 274 | "minimatch": { 275 | "version": "3.0.4", 276 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", 277 | "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", 278 | "requires": { 279 | "brace-expansion": "^1.1.7" 280 | } 281 | }, 282 | "nth-check": { 283 | "version": "1.0.1", 284 | "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.1.tgz", 285 | "integrity": 
"sha1-mSms32KPwsQQmN6rgqxYDPFJquQ=", 286 | "requires": { 287 | "boolbase": "~1.0.0" 288 | } 289 | }, 290 | "once": { 291 | "version": "1.4.0", 292 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 293 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 294 | "requires": { 295 | "wrappy": "1" 296 | } 297 | }, 298 | "os-tmpdir": { 299 | "version": "1.0.2", 300 | "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", 301 | "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=" 302 | }, 303 | "pako": { 304 | "version": "1.0.6", 305 | "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.6.tgz", 306 | "integrity": "sha512-lQe48YPsMJAig+yngZ87Lus+NF+3mtu7DVOBu6b/gHO1YpKwIj5AWjZ/TOS7i46HD/UixzWb1zeWDZfGZ3iYcg==" 307 | }, 308 | "parse5": { 309 | "version": "3.0.3", 310 | "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", 311 | "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", 312 | "requires": { 313 | "@types/node": "*" 314 | } 315 | }, 316 | "path-is-absolute": { 317 | "version": "1.0.1", 318 | "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", 319 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" 320 | }, 321 | "pathval": { 322 | "version": "1.1.0", 323 | "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz", 324 | "integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA=" 325 | }, 326 | "process-nextick-args": { 327 | "version": "2.0.0", 328 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", 329 | "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" 330 | }, 331 | "readable-stream": { 332 | "version": "2.3.6", 333 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", 334 | "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", 
335 | "requires": { 336 | "core-util-is": "~1.0.0", 337 | "inherits": "~2.0.3", 338 | "isarray": "~1.0.0", 339 | "process-nextick-args": "~2.0.0", 340 | "safe-buffer": "~5.1.1", 341 | "string_decoder": "~1.1.1", 342 | "util-deprecate": "~1.0.1" 343 | } 344 | }, 345 | "rimraf": { 346 | "version": "2.6.2", 347 | "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", 348 | "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", 349 | "requires": { 350 | "glob": "^7.0.5" 351 | } 352 | }, 353 | "safe-buffer": { 354 | "version": "5.1.2", 355 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", 356 | "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" 357 | }, 358 | "sax": { 359 | "version": "1.2.4", 360 | "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", 361 | "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" 362 | }, 363 | "selenium-webdriver": { 364 | "version": "4.0.0-alpha.1", 365 | "resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.0.0-alpha.1.tgz", 366 | "integrity": "sha512-z88rdjHAv3jmTZ7KSGUkTvo4rGzcDGMq0oXWHNIDK96Gs31JKVdu9+FMtT4KBrVoibg8dUicJDok6GnqqttO5Q==", 367 | "requires": { 368 | "jszip": "^3.1.3", 369 | "rimraf": "^2.5.4", 370 | "tmp": "0.0.30", 371 | "xml2js": "^0.4.17" 372 | } 373 | }, 374 | "string_decoder": { 375 | "version": "1.1.1", 376 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", 377 | "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", 378 | "requires": { 379 | "safe-buffer": "~5.1.0" 380 | } 381 | }, 382 | "tmp": { 383 | "version": "0.0.30", 384 | "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.30.tgz", 385 | "integrity": "sha1-ckGdSovn1s51FI/YsyTlk6cRwu0=", 386 | "requires": { 387 | 
"os-tmpdir": "~1.0.1" 388 | } 389 | }, 390 | "type-detect": { 391 | "version": "4.0.8", 392 | "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", 393 | "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==" 394 | }, 395 | "util-deprecate": { 396 | "version": "1.0.2", 397 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 398 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 399 | }, 400 | "wrappy": { 401 | "version": "1.0.2", 402 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 403 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" 404 | }, 405 | "xml2js": { 406 | "version": "0.4.19", 407 | "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.4.19.tgz", 408 | "integrity": "sha512-esZnJZJOiJR9wWKMyuvSE1y6Dq5LCuJanqhxslH2bxM6duahNZ+HMpCLhBQGZkbX6xRf8x1Y2eJlgt2q3qo49Q==", 409 | "requires": { 410 | "sax": ">=0.6.0", 411 | "xmlbuilder": "~9.0.1" 412 | } 413 | }, 414 | "xmlbuilder": { 415 | "version": "9.0.7", 416 | "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz", 417 | "integrity": "sha1-Ey7mPS7FVlxVfiD0wi35rKaGsQ0=" 418 | } 419 | } 420 | } 421 | -------------------------------------------------------------------------------- /test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "test.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "chrome-web-store-scraper": "^1.0.7" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | const ChromeWebStoreScraper = require('../src/chrome-web-store-scraper') 2 | //const 
// ChromeWebStoreScraper = require('chrome-web-store-scraper')
// ^ Tail of the commented-out alternative import that starts on the previous
//   line. NOTE(review): presumably kept so the suite can be pointed at the
//   published package instead of ../src — confirm before removing.

const { assert } = require('chai')

/**
 * Mocha suite for ChromeWebStoreScraper.
 *
 * - #constructor(): the instance exposes non-empty `searchCategories` and
 *   `searchFeatures` objects.
 * - #search(): invalid options reject with an Error; a known search term
 *   resolves to a non-empty array of result objects with a fixed key set.
 *   The last two tests call `search()` with a raised 15s timeout
 *   (NOTE(review): presumably because they hit the live store — confirm).
 * - #buildSearchURLString(): the produced string is parseable by `new URL()`.
 */
describe ('Scraper', function () {
  describe ('#constructor()', function () {
    it ('Should have constructed correctly with searchCategories and searchFeatures defined', function () {
      const scraper = new ChromeWebStoreScraper()
      assert.notEqual(scraper.searchCategories, undefined)
      assert.notEqual(scraper.searchFeatures, undefined)
    })

    it ('Should have a json of search categories with categories inside of it.', function () {
      const scraper = new ChromeWebStoreScraper()
      assert.isOk(Object.keys(scraper.searchCategories).length > 0)
    })

    it ('Should have a json of search filters with filters inside of it.', function () {
      const scraper = new ChromeWebStoreScraper()
      assert.isOk(Object.keys(scraper.searchFeatures).length > 0)
    })
  })

  describe ('#search()', function () {
    /**
     * Asserts that `promise` rejects with an Error instance.
     *
     * The two-argument form of `then` is deliberate: the "expected to reject"
     * error raised in the fulfilment handler must NOT be routed into the
     * rejection handler, otherwise an unexpected success would pass the test.
     *
     * @param {Promise<*>} promise - promise returned by `scraper.search(...)`
     * @returns {Promise<void>} resolves when the promise rejected with an
     *   Error; rejects otherwise (Mocha reports the rejection as a failure).
     */
    const assertRejectsWithError = promise =>
      promise.then(
        () => Promise.reject(new Error('Expected method to reject.')),
        err => assert.instanceOf(err, Error)
      )

    // Title fixed: this case passes an unknown *feature* (filter), not a
    // category; the category case is covered two tests below.
    it ('Must reject if invalid search feature is provided', function () {
      const scraper = new ChromeWebStoreScraper()
      const fakeFilter = { searchFeatures: ['notARealFilter'] }

      return assertRejectsWithError(scraper.search('searchString', fakeFilter))
    })

    it ('Must reject if search filter not passed as array', function () {
      const scraper = new ChromeWebStoreScraper()
      const invalidFilter = { searchFeatures: 'notAnArray' }

      return assertRejectsWithError(scraper.search('searchString', invalidFilter))
    })

    it ('Must reject if invalid search category is provided', function () {
      const scraper = new ChromeWebStoreScraper()
      const fakeCategory = { searchCategory: 'notARealCategory' }

      return assertRejectsWithError(scraper.search('searchString', fakeCategory))
    })

    it ('must return an array with more than 0 elements for a known search term', function () {
      // `function` (not an arrow) is required so `this` is the Mocha context.
      this.timeout(15000)
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'addiction'

      return scraper.search(searchTerm).then(
        results => assert.isAbove(results.length, 0),
        // Reject with an Error (not a bare string) so Mocha prints a stack.
        err =>
          Promise.reject(
            new Error(`Searching Failed to get an array of JSON ${err}`)
          )
      )
    })

    it ('JSON results should all have the following keys: title, author, description, category, numberOfRatings, storeURL, rating', function () {
      this.timeout(15000)
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'addiction'
      const expectedKeys = [
        'title',
        'author',
        'description',
        'category',
        'numberOfRatings',
        'storeURL',
        'rating'
      ]

      return scraper.search(searchTerm).then(
        results => {
          // Guard against a vacuous pass on an empty result list.
          assert.isAbove(results.length, 0)
          // forEach, not every(): the previous callback returned undefined,
          // so every() stopped after the first element and the remaining
          // results were never checked.
          results.forEach(result => {
            // Same members in any order: exactly the expected keys, no more,
            // no fewer.
            assert.sameMembers(Object.keys(result), expectedKeys)
          })
        },
        err => Promise.reject(new Error(`Searching Failed: ${err}`))
      )
    })
  })

  // Suite title matches the method actually exercised below.
  describe ('#buildSearchURLString()', function () {
    it ('Must produce a valid URL with only a searchString', function () {
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'A Test String'

      const searchURL = scraper.buildSearchURLString(searchTerm)
      // `new URL()` throws a TypeError on an unparseable string.
      assert.doesNotThrow(() => new URL(searchURL))
    })

    it ('Must produce a valid URL with only one search filter option and search string', function () {
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'A Test String'
      const options = { searchFeatures: ['free'] }

      const searchURL = scraper.buildSearchURLString(searchTerm, options)
      assert.doesNotThrow(() => new URL(searchURL))
    })

    it ('Must produce a valid URL with multiple search filter option and search string', function () {
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'A Test String'
      const options = { searchFeatures: ['offline', 'byGoogle', 'free'] }

      const searchURL = scraper.buildSearchURLString(searchTerm, options)
      assert.doesNotThrow(() => new URL(searchURL))
    })

    it ('Must produce a valid URL with only search category option and search string', function () {
      const scraper = new ChromeWebStoreScraper()
      const searchTerm = 'A Test String'
      const options = { searchCategory: 'fun' }

      const searchURL = scraper.buildSearchURLString(searchTerm, options)
      assert.doesNotThrow(() => new URL(searchURL))
    })
  })
})