├── .babelrc ├── .eslintrc ├── .gitignore ├── .gitmodules ├── .npmignore ├── LICENSE ├── README.md ├── docs ├── CSS selector.md ├── Installation.md ├── Open Web Scraper.md ├── Scraping a site.md ├── Selectors.md ├── Selectors │ ├── Element attribute selector.md │ ├── Element click selector.md │ ├── Element scroll down selector.md │ ├── Element selector.md │ ├── Grouped selector.md │ ├── HTML selector.md │ ├── Image selector.md │ ├── Link popup selector.md │ ├── Link selector.md │ ├── Table selector.md │ └── Text selector.md ├── Storage backends.md └── images │ ├── chrome-store-logo-920x680.png │ ├── chrome-store-logo-920x680.xcf │ ├── chrome-store-logo.png │ ├── chrome-store-logo.xcf │ ├── open-web-scraper │ └── open-web-scraper.png │ ├── scraping-a-site │ ├── news-site-selector-graph.png │ ├── news-site-sitemap.png │ └── news-site.png │ ├── selectors │ ├── element-click │ │ ├── click-more.png │ │ └── click-once.png │ ├── link │ │ ├── multiple-level-link-selectors.png │ │ ├── pagination-link-selectors.png │ │ └── pagination-selector-graph.png │ ├── table │ │ ├── selectors.png │ │ └── table.png │ └── text │ │ ├── text-selector-multiple-elements-with-text-selectors.png │ │ ├── text-selector-multiple-per-page.png │ │ └── text-selector-multiple-single-text-selectors-in-one-page.png │ ├── sitemap-tree.png │ └── store-logo-sources.txt ├── extension ├── assets │ ├── ICanHaz.js │ ├── LICENSE-d3-js │ ├── LICENSE-icanhaz-js │ ├── LICENSE-jquery-js │ ├── LICENSE-pouchdb-js │ ├── LICENSE-sugar-js │ ├── base64.js │ ├── bootstrap-3.0.0 │ │ ├── css │ │ │ ├── bootstrap-theme.css │ │ │ ├── bootstrap-theme.min.css │ │ │ ├── bootstrap.css │ │ │ └── bootstrap.min.css │ │ ├── fonts │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ ├── glyphicons-halflings-regular.svg │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ └── glyphicons-halflings-regular.woff │ │ └── js │ │ │ ├── bootstrap.js │ │ │ └── bootstrap.min.js │ ├── d3.v3.js │ ├── d3.v3.min.js │ ├── images │ │ ├── LICENSE │ │ ├── 
icon128.png │ │ ├── icon16.png │ │ ├── icon19.png │ │ ├── icon38.png │ │ └── icon48.png │ ├── jquery-2.0.3.js │ ├── jquery.bootstrapvalidator │ │ ├── bootstrapValidator.css │ │ └── bootstrapValidator.js │ ├── jquery.whencallsequentially.js │ ├── pouchdb-nightly.min.js │ └── sugar-1.4.1.js ├── background_page │ └── background_script.js ├── content_script │ ├── contentScraperHeadlessBundler.js │ ├── content_scraper.js │ ├── content_scraper_browser.js │ ├── content_script.css │ └── content_script.js ├── devtools │ ├── devtools_init_page.html │ ├── devtools_init_page.js │ ├── devtools_scraper_panel.css │ ├── devtools_scraper_panel.html │ └── views │ │ ├── DataPreview.html │ │ ├── SelectorEdit.html │ │ ├── SelectorEditTableColumn.html │ │ ├── SelectorList.html │ │ ├── SelectorListItem.html │ │ ├── SitemapBrowseData.html │ │ ├── SitemapCreate.html │ │ ├── SitemapEditMetadata.html │ │ ├── SitemapExport.html │ │ ├── SitemapExportDataCSV.html │ │ ├── SitemapHeadlessScrapeConfig.html │ │ ├── SitemapImport.html │ │ ├── SitemapList.html │ │ ├── SitemapListItem.html │ │ ├── SitemapScrapeConfig.html │ │ ├── SitemapSelectorGraph.html │ │ ├── SitemapStartUrlField.html │ │ └── Viewport.html ├── generated │ └── .gitignore ├── manifest.json ├── options_page │ ├── options.html │ └── options_page.js ├── popup.html └── scripts │ ├── App.js │ ├── BackgroundScript.js │ ├── ChromeHeadlessBrowser.js │ ├── ChromePopupBrowser.js │ ├── Config.js │ ├── ContentScript.js │ ├── ContentSelector.js │ ├── Controller.js │ ├── DataExtractor.js │ ├── ElementQuery.js │ ├── InMemoryStore.js │ ├── JSDOMBrowser.js │ ├── JSDOMBrowserLoader.js │ ├── Job.js │ ├── Queue.js │ ├── Scraper.js │ ├── Selector.js │ ├── Selector │ ├── SelectorElement.js │ ├── SelectorElementAttribute.js │ ├── SelectorElementClick.js │ ├── SelectorElementScroll.js │ ├── SelectorGoogMapID.js │ ├── SelectorGroup.js │ ├── SelectorHTML.js │ ├── SelectorImage.js │ ├── SelectorLink.js │ ├── SelectorPopupLink.js │ ├── SelectorTable.js │ └── 
SelectorText.js │ ├── SelectorGraph.js │ ├── SelectorGraphv2.js │ ├── SelectorList.js │ ├── Selectors.js │ ├── Sitemap.js │ ├── Store.js │ ├── StoreDevtools.js │ ├── UniqueElementList.js │ ├── WebJSDOMBrowser.js │ ├── getBackgroundScript.js │ └── getContentScript.js ├── gulpfile.js ├── index.js ├── karma.conf.js ├── package.json ├── playgrounds ├── extension │ ├── index.html │ └── webpage.css └── sitemap-tree │ ├── index.html │ ├── sitemap.json │ └── style.css └── tests ├── ChromeAPI.js ├── FakeStore.js ├── Matchers.js ├── browserSpec.js ├── globals.js ├── jsdomSpec.js ├── spec ├── ContentSelectorSpec.js ├── DataExtractSpec.js ├── ElementQuerySpec.js ├── JobSpec.js ├── QueueSpec.js ├── ScraperSpec.js ├── Selector │ ├── SelectorElementAttributeSpec.js │ ├── SelectorElementClickSpec.js │ ├── SelectorElementScrollSpec.js │ ├── SelectorElementSpec.js │ ├── SelectorGoogMapIDSpec.js │ ├── SelectorGroupSpec.js │ ├── SelectorHTMLSpec.js │ ├── SelectorImageSpec.js │ ├── SelectorLinkSpec.js │ ├── SelectorTableSpec.js │ └── SelectorTextSpec.js ├── SelectorListSpec.js ├── SelectorSpec.js ├── SitemapSpec.js ├── UniqueElementListSpec.js ├── browser │ ├── BackgroundScriptSpec.js │ ├── ChromePopupBrowserSpec.js │ ├── ContentScriptSpec.js │ ├── ScraperSpec.js │ └── Selector │ │ ├── SelectorImageSpec.js │ │ └── SelectorPopupLinkSpec.js ├── headless │ └── browserSpec.js ├── jquery.whencallsequentiallySpec.js └── jsdom │ └── browserSpec.js └── utils.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["meaningful-logs"] 3 | } -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { "env": { 2 | "node": true 3 | }, 4 | "globals": { 5 | "d3": true, 6 | "$": true, 7 | "chrome": true, 8 | "jQuery": true, 9 | "describe": true, 10 | "it": true, 11 | "beforeEach": true, 12 | "afterEach": true, 13 | "after": 
true, 14 | "before": true 15 | }, 16 | "extends": ["standard"]} 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | projectFilesBackup 3 | extension.zip 4 | node_modules 5 | npm-debug.log -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extension/assets/css-selector"] 2 | path = extension/assets/css-selector 3 | url = https://github.com/martinsbalodis/css-selector.git 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .idea 2 | projectFilesBackup 3 | extension.zip 4 | node_modules 5 | npm-debug.log 6 | extension/assets/* 7 | extension/assets/*/ 8 | !extension/assets/jquery.whencallsequentially.js 9 | !extension/assets/base64.js 10 | docs/images/* 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web Scraper 3 | Web Scraper is a chrome browser extension and a library built for data extraction from web 4 | pages. Using this extension you can create a plan (sitemap) how a web site 5 | should be traversed and what should be extracted. Using these sitemaps the 6 | Web Scraper will navigate the site accordingly and extract all data. Scraped 7 | data later can be exported as CSV. 8 | 9 | To use it as an extension install it from [Chrome store] [chrome-store] 10 | 11 | To use it as a library do `npm i web-scraper-headless` 12 | 13 | ### Features 14 | 15 | 1. Scrape multiple pages 16 | 2. Sitemaps and scraped data are stored in browsers local storage or in CouchDB 17 | 3. Multiple data selection types 18 | 4. 
Extract data from dynamic pages (JavaScript+AJAX) 19 | 5. Browse scraped data 20 | 6. Export scraped data as CSV 21 | 7. Import, Export sitemaps 22 | 8. Depends only on Chrome browser 23 | 24 | ### Help 25 | 26 | Documentation and tutorials are available on [webscraper.io] [webscraper.io] 27 | 28 | Ask for help, submit bugs, suggest features on [google groups] [google-groups] 29 | 30 | Submit bugs and suggest features on [bug tracker] [github-issues] 31 | 32 | #### Headless mode 33 | To use it as a library you need a sitemap, you can write it by hand, but the easiest way is to use the [original extension][extension] to scrape and then click on "export sitemap". 34 | 35 | const webscraper = require('web-scraper-headless') 36 | // visit github and retrieve last commit of all trending repo. 37 | // The sitemap depends on the actual DOM of github, so it might get outdated 38 | const sitemap = { 39 | "startUrl": "https://github.com/trending", 40 | "selectors": [{ 41 | "parentSelectors": ["_root"], 42 | "type": "SelectorLink", 43 | "multiple": true, 44 | "id": "link_to_repo", 45 | "selector": "h3 a", 46 | "delay": "" 47 | }, { 48 | "parentSelectors": ["link_to_repo"], 49 | "type": "SelectorText", 50 | "multiple": false, 51 | "id": "latest_commit", 52 | "selector": "a.commit-tease-sha", 53 | "regex": "", 54 | "delay": "" 55 | }], 56 | "_id": "github_trending" 57 | } 58 | const options = {delay: 10, pageLoadDelay: 10, browser: 'headless'} // optional delay, pageLoadDelay and browser 59 | webscraper(sitemap, options) 60 | .then(function (scraped) { 61 | // This is your scraped info 62 | }) 63 | 64 | By default webscraper-headless will open [jsdom](https://github.com/jsdom/jsdom) as a browser. This is a purely JS implementation of HTML. As such it has no native dependencies and it is very lightweighted. However, it is not capable of executing js which might be a hindrance in some cases. If that is your case, you can use chrome headless as a browser. 
Note that it will consume far more resources than jsdom and you need to have some native dependencies installed in the server. To use chrome headless do the following: 65 | 66 | const sitemap = // same as previous example 67 | const options = {browser: 'headless'} 68 | webscraper(sitemap, options) 69 | .then(function (scraped) { 70 | // This is your scraped info 71 | }) 72 | 73 | #### Bugs 74 | When submitting a bug please attach an exported sitemap if possible. 75 | 76 | ## License 77 | LGPLv3 78 | 79 | ## Changelog 80 | 81 | ### v0.2 82 | * Added Element click selector 83 | * Added Element scroll down selector 84 | * Added Link popup selector 85 | * Improved table selector to work with any html markup 86 | * Added Image download 87 | * Added keyboard shortcuts when selecting elements 88 | * Added configurable delay before using selector 89 | * Added configurable delay between page visiting 90 | * Added multiple start url configuration 91 | * Added form field validation 92 | * Fixed a lot of bugs 93 | 94 | ### v0.1.3 95 | * Added Table selector 96 | * Added HTML selector 97 | * Added HTML attribute selector 98 | * Added data preview 99 | * Added ranged start urls 100 | * Fixed bug which made selector tree not to show on some operating systems 101 | 102 | [chrome-store]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn 103 | [webscraper.io]: http://webscraper.io/ 104 | [google-groups]: https://groups.google.com/forum/#!forum/web-scraper 105 | [github-issues]: https://github.com/martinsbalodis/web-scraper-chrome-extension/issues 106 | [extension]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn 107 | -------------------------------------------------------------------------------- /docs/CSS selector.md: -------------------------------------------------------------------------------- 1 | # CSS selector 2 | 3 | Web Scraper uses css selectors to find HTML elements in web pages and to extract 4 
| data from them. When selecting an element the Web Scraper will try to make its 5 | best guess what the CSS selector might be for the selected elements. But you 6 | can also write it yourself and test it by clicking "Element preview". You 7 | can use CSS selectors that are available in CSS versions 1-3 and also pseudo 8 | selectors that are additionally available in jQuery. Here are some 9 | documentation links that might help you: 10 | 11 | * [CSS Selectors] [css-selectors-wikipedia] 12 | * [jQuery CSS selectors] [css-selectors-jquery] 13 | * [w3schools CSS selector reference] [w3schools-css-selector-reference] 14 | 15 | ## Additional Web Scraper selectors 16 | It is possible to add new pseudo CSS selectors to Web Scraper. Right now there 17 | is only one CSS selector added. 18 | 19 | #### Parent selector 20 | 21 | CSS Selector `_parent_` allows a child selector of an 22 | *Element selector* to select the element that was returned by the *Element selector*. For 23 | example this CSS selector could be used in a case where you need to extract an 24 | attribute from the element that the *Element selector* returned. 25 | 26 | [css-selectors-wikipedia]: http://en.wikipedia.org/wiki/Cascading_Style_Sheets#Selector 27 | [css-selectors-jquery]: http://api.jquery.com/category/selectors/ 28 | [w3schools-css-selector-reference]: http://www.w3schools.com/cssref/css_selectors.asp -------------------------------------------------------------------------------- /docs/Installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | You can install the extension from [Chrome store] [1]. After installing it you 4 | should restart chrome to make sure the extension is fully loaded. If you don't 5 | want to restart Chrome then use the extension only in tabs that are 6 | created after installing it. 7 | 8 | ## Requirements 9 | 10 | The extension requires Chrome 31+ . There are no OS limitations. 
11 | 12 | [1]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn "Install web scraper from Chrome store" -------------------------------------------------------------------------------- /docs/Open Web Scraper.md: -------------------------------------------------------------------------------- 1 | # Open Web Scraper 2 | 3 | Web Scraper is integrated into chrome Developer tools. Figure 1 shows how you 4 | can open it. You can also use these shortcuts to open Developer tools. After 5 | opening Developer tools open *Web Scraper* tab. 6 | 7 | Shortcuts: 8 | 9 | * windows, linux: `Ctrl+Shift+I`, `f12`, open `Tools / Developer tools` 10 | * mac `Cmd+Opt+I`, open `Tools / Developer tools` 11 | 12 | ![Fig. 1: Open Web Scraper][open-web-scraper] 13 | 14 | [open-web-scraper]: images/open-web-scraper/open-web-scraper.png?raw=true -------------------------------------------------------------------------------- /docs/Scraping a site.md: -------------------------------------------------------------------------------- 1 | # Scraping a site 2 | 3 | Open the site that you want to scrape. 4 | 5 | ## Create Sitemap 6 | 7 | The first thing you need to do when creating a *sitemap* is specifying the 8 | start url. This is the url from which the scraping will start. You can also 9 | specify multiple start urls if the scraping should start from multiple places. 10 | For example if you want to scrape multiple search results then you could create 11 | a separate start url for each search result. 12 | 13 | ### Specify multiple urls with ranges 14 | 15 | In cases where a site uses numbering in pages URLs it is much simpler to create 16 | a range start url than creating *Link selectors* that would navigate the site. 17 | To specify a range url replace the numeric part of start url with a range 18 | definition - `[1-100]`. If the site uses zero padding in urls then add zero 19 | padding to the range definition - `[001-100]`. 
If you want to skip some urls 20 | then you can also specify incremental like this `[0-100:10]`. 21 | 22 | Use range url like this `http://example.com/page/[1-3]` for links like these: 23 | 24 | * `http://example.com/page/1` 25 | * `http://example.com/page/2` 26 | * `http://example.com/page/3` 27 | 28 | Use range url with zero padding like this `http://example.com/page/[001-100]` 29 | for links like these: 30 | 31 | * `http://example.com/page/001` 32 | * `http://example.com/page/002` 33 | * `http://example.com/page/003` 34 | 35 | Use range url with increment like this `http://example.com/page/[0-100:10]` for 36 | links like these: 37 | 38 | * `http://example.com/page/0` 39 | * `http://example.com/page/10` 40 | * `http://example.com/page/20` 41 | 42 | ## Create selectors 43 | 44 | After you have created the *sitemap* you can add selectors to it. In the 45 | *Selectors* panel you can add new selectors, modify them and navigate the 46 | selector tree. 47 | The selectors can be added in a tree type structure. The web scraper will 48 | execute the selectors in the order how they are organized in the tree 49 | structure. For example there is a news site and you want to scrape all articles 50 | whose links are available on the first page. In image 1 you can see this 51 | example site. 52 | 53 | ![Fig. 1: News site][image-news-site] 54 | 55 | To scrape this site you can create a *Link selector* which will extract all 56 | article links in the first page. Then as a child selector you can add a 57 | *Text selector* that will extract articles from the article pages that the 58 | *Link selector* found links to. Image below illustrates how the *sitemap* 59 | should be built for the news site. 60 | 61 | ![Fig. 2: News site sitemap][image-news-site-sitemap] 62 | 63 | Note that when creating selectors use Element preview and Data preview features 64 | to ensure that you have selected the correct elements with the correct data. 
65 | 66 | More information about selector tree building is available in selector 67 | documentation. You should at least read about these core selectors: 68 | 69 | * [Text selector][text-selector] 70 | * [Link selector][link-selector] 71 | * [Element selector][element-selector] 72 | 73 | ### Inspect selector tree 74 | 75 | After you have created selectors for the *sitemap* you can inspect the tree 76 | structure of selectors in the Selector graph panel. Image below shows an 77 | example selector graph. 78 | 79 | ![Fig. 3: News site selector graph][image-news-site-selector-graph] 80 | 81 | ## Scrape the site 82 | 83 | After you have created selectors for the *sitemap* you can start scraping. Open 84 | *Scrape* panel and start scraping. A new popup window will open in which the 85 | scraper will load pages and extract data from them. After the scraping is done 86 | the popup window will close and you will be notified with a popup message. You can view 87 | the scraped data by opening *Browse* panel and export it by opening the 88 | *Export data as CSV* panel. 89 | 90 | 91 | [image-news-site]: images/scraping-a-site/news-site.png?raw=true 92 | [image-news-site-sitemap]: images/scraping-a-site/news-site-sitemap.png?raw=true 93 | [image-news-site-selector-graph]: images/scraping-a-site/news-site-selector-graph.png?raw=true 94 | [text-selector]: Selectors/Text%20selector.md 95 | [link-selector]: Selectors/Link%20selector.md 96 | [element-selector]: Selectors/Element%20selector.md -------------------------------------------------------------------------------- /docs/Selectors.md: -------------------------------------------------------------------------------- 1 | # Selectors 2 | 3 | Web scraper has multiple selectors that can be used for different type data 4 | extraction and for different interaction with the website. The selectors can 5 | be divided in three groups: 6 | 7 | * Data extraction selectors for data extraction. 8 | * Link selectors for site navigation. 
9 | * Element selectors for element selection that separate multiple records 10 | 11 | ### Data extraction selectors 12 | 13 | Data extraction selectors simply return data from the selected element. 14 | For example [Text selector] [text-selector] extracts text from 15 | selected element. These selectors can be used as data extraction selectors: 16 | 17 | * [Text selector] [text-selector] 18 | * [Link selector] [link-selector] 19 | * [Link popup selector] [link-popup-selector] 20 | * [Image selector] [image-selector] 21 | * [Table selector] [table-selector] 22 | * [Element attribute selector] [element-attribute-selector] 23 | * [HTML selector] [html-selector] 24 | * [Grouped selector] [grouped-selector] 25 | 26 | ### Link selectors 27 | 28 | Link selectors extract URLs from links that can be later opened for data 29 | extraction. For example if in a sitemap tree there is a *Link selector* that has 30 | 3 child text selectors then the Web Scraper extract all urls with the *Link 31 | selector* and then open each link and use those child data extraction selectors 32 | to extract data. Of course a link selector might have *Link selectors* as child 33 | selectors then these child *Link selectors* would be used for further page 34 | navigation. These are currently available *Link selectors*: 35 | 36 | * [Link selector] [link-selector] 37 | * [Link popup selector] [link-popup-selector] 38 | 39 | ### Element selectors 40 | 41 | Element selectors are for element selection that contain multiple data elements. 42 | For example an element selector might be used to select a list of items in an 43 | e-commerce site. The selector will return each selected element as a parent 44 | element to its child selectors. Element selectors child selectors will 45 | extract data only within the element that the element selector gave them. 
46 | These are currently available Element selectors: 47 | 48 | * [Element selector] [element-selector] 49 | * [Element scroll down selector] [element-scroll-selector] 50 | * [Element click selector] [element-click-selector] 51 | 52 | ## Selector configuration options 53 | 54 | Each selector has configuration options. Here you can see the most common ones. 55 | Configuration options that are specific to a selector are described in 56 | selectors documentation. 57 | 58 | * selector - CSS selector that selects an element the selector will be working 59 | on. 60 | * multiple - should be checked when multiple records (data rows) are going to 61 | be extracted with this selector. Data extracted from two or more selectors with 62 | multiple checked won't be merged in a single record. 63 | * delay - delay before selector is being used. 64 | * parent selectors - configure parent selectors for this selector to make the 65 | selector tree. 66 | 67 | Note! A common mistake when using multiple configuration option is to create 68 | two selectors alongside with multiple checked and expect that the scraper will 69 | join selector values in pairs. For example if you selected pagination links and 70 | navigation links these links couldn't be logically joined in pairs. The correct 71 | way is to select a wrapper element with Element selector and add data selectors 72 | as child selectors to the element selector with multiple option not checked. 
73 | 74 | [text-selector]: Selectors/Text%20selector.md 75 | [link-selector]: Selectors/Link%20selector.md 76 | [link-popup-selector]: Selectors/Link%20popup%20selector.md 77 | [image-selector]: Selectors/Image%20selector.md 78 | [element-attribute-selector]: Selectors/Element%20attribute%20selector.md 79 | [table-selector]: Selectors/Table%20selector.md 80 | [grouped-selector]: Selectors/Grouped%20selector.md 81 | [html-selector]: Selectors/HTML%20selector.md 82 | [element-selector]: Selectors/Element%20selector.md 83 | [element-click-selector]: Selectors/Element%20click%20selector.md 84 | [element-scroll-selector]: Selectors/Element%20scroll%20down%20selector.md 85 | -------------------------------------------------------------------------------- /docs/Selectors/Element attribute selector.md: -------------------------------------------------------------------------------- 1 | # Element attribute selector 2 | Element attribute selector can extract an attribute's value of an HTML element. 3 | For example you could use this selector to extract title attribute from 4 | this link: `link`. 5 | 6 | ## Configuration options 7 | * selector - [CSS selector] [css-selector] for the element. 8 | * multiple - multiple records are being extracted. 9 | * attribute name - the attribute that is going to be extracted. For example 10 | `title`, `data-id`. 11 | 12 | ## Use cases 13 | See [Text selector] [text-selector] use cases. 14 | 15 | [text-selector]: Text%20selector.md 16 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Element click selector.md: -------------------------------------------------------------------------------- 1 | # Element click selector 2 | 3 | Element click selector works similarly to 4 | [Element selector] [element-selector]. Its main purpose also is element 5 | selection that could be given as parent elements to its child selectors. 
The only 6 | difference is that *Element click selector* can interact with the web page by 7 | clicking on buttons to load new elements. For example a page might use 8 | JavaScript and AJAX for pagination or item loading. 9 | 10 | Note! when selecting clickable elements you should select them by moving the 11 | mouse over the element and pressing "S". This kind of selection will avoid 12 | events triggered by the button. 13 | 14 | ## Configuration options 15 | * selector - [CSS selector] [css-selector] for the wrapper elements that will 16 | be used as parent elements for child selectors. 17 | * click selector - [CSS selector] [css-selector] for the buttons that need to 18 | be clicked to load more elements. 19 | * click type - type of how the selector knows when there will be no new 20 | elements and clicking should stop. 21 | * click element uniqueness - type of how selector knows which buttons are 22 | already clicked. 23 | * multiple - multiple records are being extracted (almost always should be 24 | checked). Multiple option for child selectors usually should not be checked. 25 | * delay - delay before element selection and delay between clicking. This 26 | should usually be specified because the data won't be loaded immediately from 27 | the server. More than 2000 ms might be a good choice if you don't want to 28 | lose data because the server didn't respond fast enough. 29 | * Discard initial elements - the selector will not return the elements that 30 | were available before clicking for the first time. This might be useful for 31 | duplicate removal. 32 | 33 | ### Click type 34 | #### Click Once 35 | 36 | Click Once type will click on the buttons only once. If a new button appears 37 | that can be selected it will be also clicked. For example pagination links 38 | might show pages 1 to 5 but pages 6 to 10 would appear some time later. The 39 | selector will also click on those buttons. 
40 | 41 | #### Click More 42 | 43 | Click More type makes the selector click on given buttons multiple times 44 | until there are no new elements appearing. A new element is considered an 45 | element that has unique text content. 46 | 47 | ### Click element uniqueness 48 | 49 | When using *Click Once* only unique buttons will be clicked. When using 50 | *Click More* this helps to ignore buttons that don't generate more elements. 51 | 52 | * Unique Text - buttons with identical text content are considered equal 53 | * Unique HTML+Text - buttons with identical HTML and text content are 54 | considered equal 55 | * Unique HTML - buttons with identical HTML and stripped text content are 56 | considered equal 57 | * Unique CSS Selector - buttons with identical CSS Selector are considered equal 58 | 59 | ## Use cases 60 | 61 | #### Navigate pagination using "Click once" selector type 62 | 63 | For example there is a site that displays a list of items and there are some 64 | pagination buttons that reload these items dynamically (after clicking a button 65 | the url doesn't change. changes after hash tag # doesn't count). Using *Element 66 | click selector* you can select these items and buttons that need to be clicked. 67 | The scraper during scraping phase will click these buttons to extract all 68 | elements. Also you need to add child selectors for the *Element click selector* 69 | that select data within each element. In figure 1 you can see how to configure 70 | the *Element click selector* to extract data from the described site. 71 | 72 | ![Fig. 1: Sitemap when using Click once type][image-click-once] 73 | 74 | #### Load more items in an e-commerce site by clicking "More" button 75 | 76 | This example is similar to the one above. The only difference is that in this 77 | site items are loaded by clicking a single button multiple times. In this case 78 | the *Element click selector* should be configured to use "Click more" click 79 | type. 
In figure 2 you can see how to configure the *Element click selector* 80 | to extract data from this site. 81 | 82 | ![Fig. 2: Sitemap when using Click more type][image-click-more] 83 | 84 | [image-click-more]: ../images/selectors/element-click/click-more.png?raw=true 85 | [image-click-once]: ../images/selectors/element-click/click-once.png?raw=true 86 | [element-selector]: Element%20selector.md 87 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Element scroll down selector.md: -------------------------------------------------------------------------------- 1 | # Element scroll down selector 2 | 3 | This is another Element selector that works similarly to Element selector but 4 | additionally it scrolls down the page multiple times to find those elements 5 | which are added when page is scrolled down to the bottom. Use the delay 6 | attribute to configure waiting interval between scrolling and element search. 7 | Scrolling is stopped after no new elements are found. If the page can scroll 8 | infinitely then this selector will be stuck in an infinite loop. 9 | 10 | ## Configuration options 11 | 12 | * selector - [CSS selector] [css-selector] for the element. 13 | * multiple - multiple records are being extracted (almost always should be 14 | checked). Multiple option for child selectors usually should not be checked. 15 | * delay - delay before element selection and delay between scrolling. This 16 | should usually be specified because the data won't be loaded immediately from 17 | the server after scrolling down. More than 2000 ms might be a good choice if 18 | you don't want to lose data because the server didn't respond fast enough. 19 | 20 | ## Use cases 21 | See [Element selector] [element-selector] use cases. 
22 | 23 | [element-selector]: Element%20selector.md 24 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Element selector.md: -------------------------------------------------------------------------------- 1 | # Element selector 2 | 3 | Element selector is for element selection that contain multiple data elements. 4 | For example element selector might be used to select a list of items in an 5 | e-commerce site. The selector will return each selected element as a parent 6 | element to its child selectors. Element selectors child selectors will be 7 | extracting data only within the element that the element selector gave them. 8 | 9 | Note! If the page dynamically loads new items after scrolling down or clicking 10 | on a button then you should try these selectors: 11 | 12 | * [Element scroll down selector] [element-scroll-selector] 13 | * [Element click selector] [element-click-selector] 14 | 15 | ## Configuration options 16 | * selector - [CSS selector] [css-selector] for the wrapper elements that will 17 | be used as parent elements for child selectors. 18 | * multiple - multiple records are being extracted (almost always should be 19 | checked). Multiple option for child selectors usually should not be checked. 20 | 21 | ## Use cases 22 | 23 | #### Select multiple e-commerce items from a page 24 | 25 | For example an e-commerce site has a page with a list of items. With element 26 | selector you can select the elements that wrap these items and then add 27 | multiple child selectors to it to extract data within the items wrapper 28 | element. Figure 1 shows how an element selector could be used in this 29 | situation. 30 | 31 | ![Fig. 
1: Multiple items selected with element selector] [multiple-elements-with-text-selectors] 32 | 33 | #### Extract data from tables 34 | 35 | Similarly to e-commerce item selection you can also select table rows and add 36 | child selectors for data extraction from table cells. 37 | Though [Table selector] [table-selector] might be a much better solution. 38 | 39 | [css-selector]: ../CSS%20selector.md 40 | [element-scroll-selector]: Element%20scroll%20down%20selector.md 41 | [element-click-selector]: Element%20click%20selector.md 42 | [table-selector]: Table%20selector.md 43 | [multiple-elements-with-text-selectors]: ../images/selectors/text/text-selector-multiple-elements-with-text-selectors.png?raw=true -------------------------------------------------------------------------------- /docs/Selectors/Grouped selector.md: -------------------------------------------------------------------------------- 1 | # Grouped selector 2 | 3 | Grouped selector can group text data from multiple elements into one record. 4 | The extracted data will be stored as JSON. 5 | 6 | ## Configuration options 7 | * selector - [CSS selector] [css-selector] for the elements whose text will be 8 | extracted and stored in JSON format. 9 | * attribute name - optionally this selector can extract an attribute of the 10 | selected element. If specified the extractor will also add this attribute to 11 | the resulting JSON. 12 | 13 | ## Use cases 14 | 15 | #### Extract article references 16 | 17 | For example you are extracting a news article that might have multiple 18 | reference links. If you are selecting these links with link selector with 19 | multiple checked you would get duplicate articles in the result set where each 20 | record would contain one reference link. Using grouped selector you could 21 | serialize all these reference links into one record. To do that select all 22 | reference links and set attribute name to `href` to also extract links to these 23 | sites. 
24 | 25 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/HTML selector.md: -------------------------------------------------------------------------------- 1 | # HTML selector 2 | HTML selector can extract HTML and text within the selected element. Only the 3 | inner HTML of the element will be extracted. 4 | 5 | ## Configuration options 6 | * selector - [CSS selector] [css-selector] for the element whose inner HTML 7 | will be extracted. 8 | * multiple - multiple records are being extracted. 9 | 10 | ## Use cases 11 | See [Text selector] [text-selector] use cases. 12 | 13 | [text-selector]: Text%20selector.md 14 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Image selector.md: -------------------------------------------------------------------------------- 1 | # Image selector 2 | Image selector can extract `src` attribute (URL) of an image. 3 | Optionally you can also store the images. The images will be stored in your 4 | downloads directory: 5 | 6 | `Downloads///` 7 | 8 | Note! When selecting CSS selector for image selector all the images within the 9 | site are moved to the top. If this feature somehow breaks the site's layout please 10 | report it as a bug. 11 | 12 | ## Configuration options 13 | * selector - [CSS selector] [css-selector] for the image element. 14 | * multiple - multiple records are being extracted. Usually should not be 15 | checked for Image selector. 16 | * download image - downloads and stores images on the local drive. When CouchDB 17 | storage back end is used the image is also stored locally. 18 | 19 | ## Use cases 20 | See [Text selector] [text-selector] use cases. 
21 | 22 | [text-selector]: Text%20selector.md 23 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Link popup selector.md: -------------------------------------------------------------------------------- 1 | # Link popup selector 2 | 3 | *Link popup selector* works similarly to [Link selector] [link-selector]. It can 4 | be used for url extraction and site navigation. The only difference is that 5 | *Link popup selector* should be used when clicking on a link the site opens a new 6 | window (popup) instead of loading the URL in the same tab or opening it in a 7 | new tab. This selector will catch the popup creation event and extract the URL. 8 | If the site creates a visual popup but not a real window then you should try 9 | [Element click selector] [element-click-selector]. 10 | 11 | Note! when selecting these link elements you can move the mouse over the 12 | element and press "S" to select it to prevent it from opening a popup. 13 | 14 | ## Use cases 15 | See [Link selector] [link-selector] use cases. 16 | 17 | [link-selector]: Link%20selector.md 18 | [element-click-selector]: Element%20click%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Link selector.md: -------------------------------------------------------------------------------- 1 | # Link selector 2 | 3 | Link selector is used for link selection and website navigation. If you use 4 | *Link selector* without any child selectors then it will extract the link and 5 | the href attribute of the link. If you add child selectors to *Link selector* 6 | then these child selectors will be used in the page that this link was leading 7 | to. If you are selecting multiple links then check *multiple* property. 8 | 9 | Note! Link selector works only with `<a>` tags with `href` attribute. 
If the 10 | link selector is not working for you then you can try these workarounds: 11 | 12 | 1. Check that the link in the url bar changes after clicking an item (changes 13 | only after hash tag doesn't count). If the link doesn't change then the site 14 | is probably using ajax for data loading. Instead of using link selector you 15 | should use [Element click selector] [element-click]. 16 | 2. If the site opens a popup then you should use 17 | [Link popup selector] [link-popup] 18 | 3. The site might be using JavaScript `window.location` to change the URL. Web 19 | Scraper cannot handle this kind of navigation right now. 20 | 21 | ## Configuration options 22 | 23 | * selector - [CSS selector] [css-selector] for the link element from which the 24 | link for navigation will be extracted. 25 | * multiple - multiple records are being extracted. Usually should be checked. 26 | 27 | ## Use cases 28 | 29 | **Navigate through multiple levels of navigation** 30 | 31 | For example an e-commerce site has multi level navigation - 32 | `categories -> subcategories`. To scrape data from all categories and 33 | subcategories you can create two *Link selectors*. One selector would select 34 | category links and the other selector would select subcategory links that are 35 | available in the category pages. The subcategory *Link selector* should be made 36 | as a child of the category *Link selector*. The selectors for data extraction 37 | from subcategory pages should be made as a child selectors to the subcategory 38 | selector. 39 | 40 | ![Fig. 1: Multiple link selectors for category navigation][multiple-level-link-selectors] 41 | 42 | **Handle pagination** 43 | 44 | For example an e-commerce site has multiple categories. Each category has a 45 | list of items and pagination links. Also some pages are not directly available 46 | from the category but are available from pagination pages (you can see 47 | pagination links 1-5, but not 6-8). 
You can start by building a sitemap that 48 | visits each category and extract items from category page. This sitemap will 49 | extract items only from the first pagination page. To extract items from all of 50 | the pagination links including the ones that are not visible at the beginning 51 | you need to create another *Link selector* that selects the pagination links. 52 | Figure 2 shows how the link selector should be created in the sitemap. When 53 | the scraper opens a category link it will extract items that are available in 54 | the page. After that it will find the pagination links and also visit those. If 55 | the pagination link selector is made a child to itself it will recursively 56 | discover all pagination pages. Figure 3 shows a selector graph where you can 57 | see how pagination links discover more pagination links and more data. 58 | 59 | ![Fig. 2: Sitemap with Link selector for pagination][pagination-link-selectors] 60 | ![Fig. 3: Selector graph with pagination][pagination-selector-graph] 61 | 62 | [multiple-level-link-selectors]: ../images/selectors/link/multiple-level-link-selectors.png?raw=true 63 | [pagination-link-selectors]: ../images/selectors/link/pagination-link-selectors.png?raw=true 64 | [pagination-selector-graph]: ../images/selectors/link/pagination-selector-graph.png?raw=true 65 | [element-click]: Element%20click%20selector.md 66 | [link-popup]: Link%20popup%20selector.md 67 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Table selector.md: -------------------------------------------------------------------------------- 1 | # Table selector 2 | 3 | Table selector can extract data from tables. *Table selector* has 3 4 | configurable CSS selectors. The selector is for table selection. After you have 5 | selected the selector the *Table selector* will try to guess selectors 6 | for header row and data rows. 
You can click Element preview on those selectors 7 | to see whether the *Table selector* found table header and data rows correctly. 8 | The header row selector is used to identify table columns when data is 9 | extracted from multiple pages. Also you can rename table columns. Figure 1 10 | shows what you should select when extracting data from a table. 11 | 12 | ![Fig. 1: Selectors for table selector] [table-selector-selectors] 13 | 14 | ## Configuration options 15 | * selector - [CSS selector] [css-selector] for the table element. 16 | * header row selector - [CSS selector] [css-selector] for table header row. 17 | * data rows selector - [CSS selector] [css-selector] for table data rows. 18 | * multiple - multiple records are being extracted. Usually should be 19 | checked for Table selector because you are extracting multiple rows. 20 | 21 | ## Use cases 22 | See [Text selector] [text-selector] use cases. 23 | 24 | [table-selector-selectors]: ../images/selectors/table/selectors.png?raw=true 25 | [text-selector]: Text%20selector.md 26 | [css-selector]: ../CSS%20selector.md -------------------------------------------------------------------------------- /docs/Selectors/Text selector.md: -------------------------------------------------------------------------------- 1 | # Text selector 2 | 3 | Text selector is used for text selection. The text selector will extract text 4 | from the selected element and from all its child elements. HTML will be 5 | stripped and only text will be returned. 
Selector will ignore text within 6 | ` 4 | 5 | -------------------------------------------------------------------------------- /extension/devtools/devtools_init_page.js: -------------------------------------------------------------------------------- 1 | console.log('loading devtools') 2 | chrome.devtools.panels.create('Web Scraper Headless', '../assets/images/icon48.png', 'devtools/devtools_scraper_panel.html') 3 | -------------------------------------------------------------------------------- /extension/devtools/devtools_scraper_panel.css: -------------------------------------------------------------------------------- 1 | /*body > form, body > div {*/ 2 | /*display:none;*/ 3 | /*}*/ 4 | 5 | a, tbody tr { 6 | cursor: pointer; 7 | } 8 | 9 | 10 | .selector-list-tpl, .sitemap-list-tpl { 11 | display:none 12 | } 13 | 14 | /** 15 | * Compact elements 16 | */ 17 | .navbar-nav>li>a { 18 | padding-top: 3px; 19 | padding-bottom: 3px; 20 | } 21 | 22 | .navbar-text { 23 | margin-top:4px; 24 | margin-bottom:4px; 25 | padding-right:3px; 26 | } 27 | 28 | .navbar { 29 | min-height:26px; 30 | margin-bottom: 6px; 31 | } 32 | .table-condensed tbody>tr>td { 33 | padding:1px 5px; 34 | } 35 | 36 | body { 37 | font-size: 12px; 38 | } 39 | 40 | form .form-control { 41 | font-size: 12px; 42 | padding: 3px 12px; 43 | height: 25px; 44 | } 45 | 46 | textarea.form-control { 47 | height: auto; 48 | } 49 | 50 | form .btn { 51 | font-size: 12px; 52 | padding: 3px 12px; 53 | } 54 | 55 | form .form-group { 56 | margin-bottom:5px; 57 | } 58 | 59 | form select[multiple], select[size] { 60 | height: auto; 61 | } 62 | 63 | #selector-graph .node circle { 64 | cursor: pointer; 65 | fill: #fff; 66 | stroke: steelblue; 67 | stroke-width: 1px; 68 | } 69 | 70 | #selector-graph .node text { 71 | font-size: 11px; 72 | } 73 | 74 | #selector-graph path.link { 75 | fill: none; 76 | stroke: #ccc; 77 | stroke-width: 1px; 78 | } 79 | 80 | .data-preview-modal .modal-dialog { 81 | width:auto; 82 | } 83 | 84 | 
.data-preview-modal .modal-body { 85 | overflow-y:scroll; 86 | } 87 | 88 | .data-preview-modal tbody tr { 89 | cursor: initial; 90 | } -------------------------------------------------------------------------------- /extension/devtools/devtools_scraper_panel.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /extension/devtools/views/DataPreview.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /extension/devtools/views/SelectorEditTableColumn.html: -------------------------------------------------------------------------------- 1 | 2 | {{header}} 3 | 4 | 5 | -------------------------------------------------------------------------------- /extension/devtools/views/SelectorList.html: -------------------------------------------------------------------------------- 1 |
2 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
IDSelectortypeMultipleParent selectorsActions
20 | 21 |
-------------------------------------------------------------------------------- /extension/devtools/views/SelectorListItem.html: -------------------------------------------------------------------------------- 1 | 2 | {{id}} 3 | {{selector}} 4 | {{type}} 5 | {{multiple}} 6 | {{parentSelectors}} 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /extension/devtools/views/SitemapBrowseData.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | {{#columns}} 6 | 7 | {{/columns}} 8 | 9 | 10 | 11 | 12 |
{{.}}
13 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapCreate.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | 5 |
6 | 7 |
8 |
9 |
10 | 11 |
12 |
13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 |
21 |
22 |
23 | 24 |
25 |
26 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapEditMetadata.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | 6 |
7 | 8 |
9 |
10 | {{#startUrl.push}} 11 | {{#startUrl}} 12 |
13 | 14 |
15 |
16 | 17 | 18 | 19 | 20 | 21 |
22 |
23 |
24 | {{/startUrl}} 25 | {{/startUrl.push}} 26 | {{^startUrl.push}} 27 |
28 | 29 |
30 |
31 | 32 | 33 | 34 | 35 | 36 |
37 |
38 |
39 | {{/startUrl.push}} 40 |
41 |
42 | 43 |
44 |
45 |
46 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapExport.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 |
6 |
7 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapExportDataCSV.html: -------------------------------------------------------------------------------- 1 |

2 | Export {{_id}} data as CSV.
Waiting for the download button to appear. > 3 | Download now! 4 |

-------------------------------------------------------------------------------- /extension/devtools/views/SitemapHeadlessScrapeConfig.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 |
6 | 7 |
8 |
9 |
10 | 11 |
12 | 13 |
14 |
15 | 19 | 20 |
21 |
22 | 23 |
24 |
25 |
26 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapImport.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | 5 |
6 | 7 |
8 |
9 |
10 | 11 | 12 |
13 | 14 |
15 |
16 |
17 |
18 | 19 |
20 |
21 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapList.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
IDStart URLactions
13 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapListItem.html: -------------------------------------------------------------------------------- 1 | 2 | {{_id}} 3 | 4 | {{#startUrl.push}} 5 | {{#startUrl}} 6 | {{.}}, 7 | {{/startUrl}} 8 | {{/startUrl.push}} 9 | {{^startUrl.push}} 10 | {{startUrl}} 11 | {{/startUrl.push}} 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /extension/devtools/views/SitemapScrapeConfig.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 |
6 | 7 |
8 |
9 |
10 | 11 |
12 | 13 |
14 |
15 | 19 | 20 |
21 |
22 | 23 |
24 |
25 |
26 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapSelectorGraph.html: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /extension/devtools/views/SitemapStartUrlField.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 |
6 | 7 | 8 | 9 | 10 | 11 |
12 |
13 |
-------------------------------------------------------------------------------- /extension/devtools/views/Viewport.html: -------------------------------------------------------------------------------- 1 | 2 | 34 | 35 |
36 |
-------------------------------------------------------------------------------- /extension/generated/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "version": "0.2.0.9", 4 | "name": "Web Scraper Headless", 5 | "short_name": "Web Scraper Headless", 6 | "description": "Tool for data extraction from websites", 7 | "permissions": ["", "tabs", "notifications", "storage", "unlimitedStorage", "downloads"], 8 | "icons": { 9 | "16": "assets/images/icon16.png", 10 | "48": "assets/images/icon48.png", 11 | "128": "assets/images/icon128.png" 12 | }, 13 | "browser_action": { 14 | "default_icon": { 15 | "19": "assets/images/icon19.png", 16 | "38": "assets/images/icon38.png" 17 | }, 18 | "default_title": "Web Scraper", 19 | "default_popup": "popup.html" 20 | }, 21 | "options_page": "options_page/options.html", 22 | "devtools_page": "devtools/devtools_init_page.html", 23 | "content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'", 24 | "background": { 25 | "scripts": [ 26 | "assets/jquery-2.0.3.js", 27 | "assets/pouchdb-nightly.min.js", 28 | "generated/background-scraper.js" 29 | ] 30 | }, 31 | "web_accessible_resources": [ 32 | "assets/images/icon16.png", 33 | "assets/images/icon48.png", 34 | "assets/images/icon128.png", 35 | "assets/images/icon19.png", 36 | "assets/images/icon38.png" 37 | ], 38 | "content_scripts": [ 39 | { 40 | "matches": ["*://*/*"], 41 | "js": [ 42 | "assets/jquery-2.0.3.js", 43 | "assets/sugar-1.4.1.js", 44 | "generated/content-scraper.js", 45 | "content_script/content_script.js" 46 | ], 47 | "css": [ 48 | "content_script/content_script.css" 49 | ] 50 | } 51 | ] 52 | } 
-------------------------------------------------------------------------------- /extension/options_page/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Scraper 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

Web Scraper

15 |

Options page

16 |
17 | 18 | 19 |
20 |
21 | Storage settings 22 |
23 | 24 |
25 | 29 |
30 |
31 | 32 |
33 | 34 | 35 |
36 | 37 |
38 |
39 | 40 |
41 | 42 | 43 |
44 | 45 |
46 |
47 |
48 |
49 | 50 |
51 |
52 |
53 |
54 | 55 |
56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /extension/options_page/options_page.js: -------------------------------------------------------------------------------- 1 | $(function () { 2 | console.log('opening config page') 3 | // popups for Storage setting input fields 4 | $('#sitemapDb') 5 | .popover({ 6 | title: 'Database for sitemap storage', 7 | html: true, 8 | content: 'CouchDB database url
http://example.com/scraper-sitemaps/', 9 | placement: 'bottom' 10 | }) 11 | .blur(function () { 12 | $(this).popover('hide') 13 | }) 14 | 15 | $('#dataDb') 16 | .popover({ 17 | title: 'Database for scraped data', 18 | html: true, 19 | content: 'CouchDB database url. For each sitemap a new DB will be created.
http://example.com/', 20 | placement: 'bottom' 21 | }) 22 | .blur(function () { 23 | $(this).popover('hide') 24 | }) 25 | 26 | // switch between configuration types 27 | $('select[name=storageType]').change(function () { 28 | var type = $(this).val() 29 | 30 | if (type === 'couchdb') { 31 | $('.form-group.couchdb').show() 32 | } else { 33 | $('.form-group.couchdb').hide() 34 | } 35 | }) 36 | 37 | // Extension configuration 38 | var config = new Config() 39 | 40 | // load previously synced data 41 | config.loadConfiguration(function () { 42 | $('#storageType').val(config.storageType) 43 | $('#sitemapDb').val(config.sitemapDb) 44 | $('#dataDb').val(config.dataDb) 45 | 46 | $('select[name=storageType]').change() 47 | }) 48 | 49 | // Sync storage settings 50 | $('form#storage_configuration').submit(function () { 51 | var sitemapDb = $('#sitemapDb').val() 52 | var dataDb = $('#dataDb').val() 53 | var storageType = $('#storageType').val() 54 | 55 | var newConfig 56 | 57 | if (storageType === 'local') { 58 | newConfig = { 59 | storageType: storageType, 60 | sitemapDb: ' ', 61 | dataDb: ' ' 62 | } 63 | } else { 64 | newConfig = { 65 | storageType: storageType, 66 | sitemapDb: sitemapDb, 67 | dataDb: dataDb 68 | } 69 | } 70 | 71 | config.updateConfiguration(newConfig) 72 | return false 73 | }) 74 | }) 75 | -------------------------------------------------------------------------------- /extension/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 |

13 | Open Developer tools where you will find Web Scraper tab: 14 |

25 |

26 |

27 | Documentation is available on webscraper.io 28 |

29 | 30 | -------------------------------------------------------------------------------- /extension/scripts/App.js: -------------------------------------------------------------------------------- 1 | var StoreDevtools = require('./StoreDevtools') 2 | var SitemapController = require('./Controller') 3 | 4 | $(function () { 5 | // init bootstrap alerts 6 | $('.alert').alert() 7 | 8 | var store = new StoreDevtools({$, document, window}) 9 | new SitemapController({ 10 | store: store, 11 | templateDir: 'views/' 12 | }, {$, document, window}) 13 | }) 14 | -------------------------------------------------------------------------------- /extension/scripts/BackgroundScript.js: -------------------------------------------------------------------------------- 1 | var jquery = require('jquery-deferred') 2 | const debug = require('debug')('web-scraper-headless:background-script') 3 | 4 | /** 5 | * ContentScript that can be called from anywhere within the extension 6 | */ 7 | var BackgroundScript = { 8 | 9 | dummy: function () { 10 | return jquery.Deferred().resolve('dummy').promise() 11 | }, 12 | 13 | /** 14 | * Returns the id of the tab that is visible to user 15 | * @returns jquery.Deferred() integer 16 | */ 17 | getActiveTabId: function () { 18 | var deferredResponse = jquery.Deferred() 19 | 20 | chrome.tabs.query({ 21 | active: true, 22 | currentWindow: true 23 | }, function (tabs) { 24 | if (tabs.length < 1) { 25 | debug('There seems to be no active tab in the current window. Let us try only active') 26 | chrome.tabs.query({ 27 | active: true, 28 | windowType: 'normal' 29 | }, function (tabs) { 30 | if (tabs.length < 1) { 31 | debug('Could not find tab') 32 | deferredResponse.reject("couldn't find the active tab") 33 | } else { 34 | const tabId = tabs[0].id 35 | deferredResponse.resolve(tabId) 36 | } 37 | }) 38 | // @TODO must be running within popup. maybe find another active window? 
39 | } else { 40 | var tabId = tabs[0].id 41 | deferredResponse.resolve(tabId) 42 | } 43 | }) 44 | return deferredResponse.promise() 45 | }, 46 | 47 | /** 48 | * Execute a function within the active tab within content script 49 | * @param request.fn function to call 50 | * @param request.request request that will be passed to the function 51 | */ 52 | executeContentScript: function (request) { 53 | var reqToContentScript = { 54 | contentScriptCall: true, 55 | fn: request.fn, 56 | request: request.request 57 | } 58 | var deferredResponse = jquery.Deferred() 59 | var deferredActiveTabId = this.getActiveTabId() 60 | deferredActiveTabId.done(function (tabId) { 61 | chrome.tabs.sendMessage(tabId, reqToContentScript, function (response) { 62 | deferredResponse.resolve(response) 63 | }) 64 | }) 65 | 66 | return deferredResponse 67 | } 68 | } 69 | 70 | module.exports = BackgroundScript 71 | -------------------------------------------------------------------------------- /extension/scripts/ChromeHeadlessBrowser.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer') 2 | const debug = require('debug')('web-scraper-headless:chrome-headless-browser') 3 | const {ExecutionContext} = require('puppeteer/lib/ExecutionContext') 4 | const contentSraperBundler = require('../content_script/contentScraperHeadlessBundler') 5 | const jqueryDeferred = require('jquery-deferred') 6 | const whenCallSequentially = require('../assets/jquery.whencallsequentially') 7 | 8 | class ChromeHeadlessBrowser { 9 | constructor (options) { 10 | this.pageLoadDelay = options.pageLoadDelay 11 | // constructors cannot handle asynchronous 12 | this.browserPromise = puppeteer.launch({ 13 | headless: true 14 | }) 15 | this.pagePromise = this.browserPromise.then(function (browser) { 16 | return browser.newPage() 17 | }) 18 | } 19 | async loadUrl (url) { 20 | debug('Loading url', url) 21 | const page = await this.pagePromise 22 | await 
page.goto(url, {waitUntil: 'networkidle2'}) 23 | } 24 | async close () { 25 | try { 26 | const browser = await this.browserPromise 27 | await browser.close() 28 | } catch (e) { 29 | console.error(e) 30 | } 31 | } 32 | saveImages (record, namingFunction) { 33 | var deferredResponse = jqueryDeferred.Deferred() 34 | var deferredImageStoreCalls = [] 35 | var prefixLength = '_imageBase64-'.length 36 | for (var attr in record) { 37 | if (attr.substr(0, prefixLength) === '_imageBase64-') { 38 | throw new Error('Downloading images is not yet supported') 39 | } 40 | } 41 | whenCallSequentially(deferredImageStoreCalls).done(function () { 42 | deferredResponse.resolve() 43 | }) 44 | 45 | return deferredResponse.promise() 46 | } 47 | async fetchData (url, sitemap, parentSelectorId, callback, scope) { 48 | try { 49 | const page = await this.pagePromise 50 | await this.loadUrl(url) 51 | 52 | const mainFrame = page.mainFrame() 53 | 54 | // Maybe we don't need a context each time? 55 | const isolatedWorldInfo = await page._client.send('Page.createIsolatedWorld', {frameId: mainFrame._id, worldName: 'web-scraper-headless'}) 56 | const executionContextId = isolatedWorldInfo.executionContextId 57 | const JsHandleFactory = page._frameManager.createJSHandle.bind(page._frameManager, executionContextId) 58 | 59 | const executionContext = new ExecutionContext(page._client, {id: executionContextId}, JsHandleFactory) 60 | 61 | const bundle = await contentSraperBundler.getBundle() 62 | await executionContext.evaluate(bundle) 63 | const message = { 64 | extractData: true, 65 | sitemap: JSON.parse(JSON.stringify(sitemap)), 66 | parentSelectorId: parentSelectorId 67 | } 68 | 69 | const data = await executionContext.evaluate(function (message) { 70 | return new Promise(function (resolve, reject) { 71 | window.webScraper(message, null, function (data) { 72 | resolve(data) 73 | }) 74 | }) 75 | }, message) 76 | callback.call(scope, null, data) 77 | } catch (e) { 78 | return callback(e) 79 | } 80 | } 
81 | } 82 | 83 | module.exports = ChromeHeadlessBrowser 84 | -------------------------------------------------------------------------------- /extension/scripts/Config.js: -------------------------------------------------------------------------------- 1 | var Config = function () { 2 | 3 | } 4 | 5 | Config.prototype = { 6 | 7 | sitemapDb: '', 8 | dataDb: '', 9 | 10 | defaults: { 11 | storageType: 'local', 12 | // this is where sitemap documents are stored 13 | sitemapDb: 'scraper-sitemaps', 14 | // this is where scraped data is stored. 15 | // empty for local storage 16 | dataDb: '' 17 | }, 18 | 19 | /** 20 | * Loads configuration from chrome extension sync storage 21 | */ 22 | loadConfiguration: function (callback) { 23 | chrome.storage.sync.get(['sitemapDb', 'dataDb', 'storageType'], function (items) { 24 | this.storageType = items.storageType || this.defaults.storageType 25 | if (this.storageType === 'local') { 26 | this.sitemapDb = this.defaults.sitemapDb 27 | this.dataDb = this.defaults.dataDb 28 | } else { 29 | this.sitemapDb = items.sitemapDb || this.defaults.sitemapDb 30 | this.dataDb = items.dataDb || this.defaults.dataDb 31 | } 32 | 33 | callback() 34 | }.bind(this)) 35 | }, 36 | 37 | /** 38 | * Saves configuration to chrome extension sync storage 39 | * @param {type} items 40 | * @param {type} callback 41 | * @returns {undefined} 42 | */ 43 | updateConfiguration: function (items, callback) { 44 | chrome.storage.sync.set(items, callback) 45 | } 46 | } 47 | 48 | module.exports = Config 49 | -------------------------------------------------------------------------------- /extension/scripts/ContentScript.js: -------------------------------------------------------------------------------- 1 | var ContentSelector = require('./ContentSelector') 2 | var jquery = require('jquery-deferred') 3 | const debug = require('debug')('web-scraper-headless:content-script') 4 | 5 | /** 6 | * ContentScript that can be called from anywhere within the extension 7 | */ 8 | var 
ContentScript = { 9 | 10 | /** 11 | * Fetch 12 | * @param request.CSSSelector css selector as string 13 | * @returns jquery.Deferred() 14 | */ 15 | getHTML: function (request, options) { 16 | var $ = options.$ 17 | var deferredHTML = jquery.Deferred() 18 | var html = $(request.CSSSelector).clone().wrap('

').parent().html() 19 | deferredHTML.resolve(html) 20 | debug('Send html', html) 21 | return deferredHTML.promise() 22 | }, 23 | 24 | /** 25 | * Removes current content selector if is in use within the page 26 | * @returns jquery.Deferred() 27 | */ 28 | removeCurrentContentSelector: function () { 29 | var deferredResponse = jquery.Deferred() 30 | var contentSelector = window.cs 31 | if (contentSelector === undefined) { 32 | deferredResponse.resolve() 33 | } else { 34 | contentSelector.removeGUI() 35 | window.cs = undefined 36 | deferredResponse.resolve() 37 | } 38 | 39 | return deferredResponse.promise() 40 | }, 41 | 42 | /** 43 | * Select elements within the page 44 | * @param request.parentCSSSelector 45 | * @param request.allowedElements 46 | */ 47 | selectSelector: function (request, options) { 48 | var $ = options.$ 49 | var deferredResponse = jquery.Deferred() 50 | 51 | this.removeCurrentContentSelector().done(function () { 52 | var contentSelector = new ContentSelector({ 53 | parentCSSSelector: request.parentCSSSelector, 54 | allowedElements: request.allowedElements 55 | }, {$, document, window}) 56 | window.cs = contentSelector 57 | 58 | var deferredCSSSelector = contentSelector.getCSSSelector() 59 | deferredCSSSelector.done(function (response) { 60 | this.removeCurrentContentSelector().done(function () { 61 | deferredResponse.resolve(response) 62 | window.cs = undefined 63 | }) 64 | }.bind(this)).fail(function (message) { 65 | deferredResponse.reject(message) 66 | window.cs = undefined 67 | }) 68 | }.bind(this)) 69 | 70 | return deferredResponse.promise() 71 | }, 72 | 73 | /** 74 | * Preview elements 75 | * @param request.parentCSSSelector 76 | * @param request.elementCSSSelector 77 | */ 78 | previewSelector: function (request, options) { 79 | var $ = options.$ 80 | var deferredResponse = jquery.Deferred() 81 | this.removeCurrentContentSelector().done(function () { 82 | var contentSelector = new ContentSelector({ 83 | parentCSSSelector: 
/**
 * Element selector. Uses jQuery as base and adds some more features:
 * the selector may be a comma separated list, and the special part
 * `_parent_` selects the parent element itself.
 *
 * Works both as `ElementQuery(...)` and `new ElementQuery(...)`; in either
 * case the return value is a plain array of unique DOM elements in the order
 * in which they were first matched.
 *
 * @param CSSSelector css selector as string (falsy is treated as '')
 * @param parentElement element (or jQuery-compatible value) to search within
 * @param options object carrying `$`, `document` and `window`
 * @returns {Array} matched elements without duplicates
 * @throws {Error} when `$`, `document` or `window` is missing from options
 */
var ElementQuery = function (CSSSelector, parentElement, options) {
  CSSSelector = CSSSelector || ''
  options = options || {}

  // The dependencies are kept in locals rather than on `this`: callers invoke
  // this function without `new`, where `this` is undefined (strict mode) or
  // the global object (sloppy mode).
  var $ = options.$
  if (!$) throw new Error('Missing jquery for ElementQuery')
  if (!options.document) throw new Error('Missing document')
  if (!options.window) throw new Error('Missing window')

  var selectedElements = []

  // an element matched by several selector parts is reported only once
  var addElement = function (element) {
    if (selectedElements.indexOf(element) === -1) {
      selectedElements.push(element)
    }
  }

  ElementQuery.getSelectorParts(CSSSelector).forEach(function (selector) {
    // handle special case when parent is selected
    var matched = selector === '_parent_'
      ? $(parentElement)
      : $(selector, $(parentElement))

    matched.each(function (i, element) {
      addElement(element)
    })
  })

  return selectedElements
}

/**
 * Splits a selector list on its top level commas. Commas inside double
 * quotes, single quotes or parentheses do not split the selector.
 *
 * @param CSSSelector css selector as string
 * @returns {Array} trimmed, non-empty selector parts
 */
ElementQuery.getSelectorParts = function (CSSSelector) {
  // the capture group keeps the separators and the quoted/parenthesised
  // chunks in the result, so they can be glued back onto the current part
  var selectors = CSSSelector.split(/(,|".*?"|'.*?'|\(.*?\))/)

  var resultSelectors = []
  var currentSelector = ''
  selectors.forEach(function (selector) {
    if (selector === ',') {
      if (currentSelector.trim().length) {
        resultSelectors.push(currentSelector.trim())
      }
      currentSelector = ''
    } else {
      currentSelector += selector
    }
  })
  if (currentSelector.trim().length) {
    resultSelectors.push(currentSelector.trim())
  }

  return resultSelectors
}
18 | const window = dom.window 19 | const document = window.document 20 | const $ = jQuery(dom.window) 21 | setTimeout(function () { 22 | callback(null, {$, document, window}) 23 | }, browser.pageLoadDelay) 24 | }).catch(e => callback(e)) 25 | }, 26 | close: function () { 27 | 28 | }, 29 | saveImages: function (record, namingFunction) { 30 | var deferredResponse = jqueryDeferred.Deferred() 31 | var deferredImageStoreCalls = [] 32 | var prefixLength = '_imageBase64-'.length 33 | for (var attr in record) { 34 | if (attr.substr(0, prefixLength) === '_imageBase64-') { 35 | throw new Error('Downloading images is not yet supported') 36 | } 37 | } 38 | whenCallSequentially(deferredImageStoreCalls).done(function () { 39 | deferredResponse.resolve() 40 | }) 41 | 42 | return deferredResponse.promise() 43 | }, 44 | fetchData: function (url, sitemap, parentSelectorId, callback, scope) { 45 | const browser = this 46 | debug('Init jsdom browser app') 47 | browser.loadUrl(url, function (err, options) { 48 | if (err) { 49 | return callback(err) 50 | } 51 | const {$, document, window} = options 52 | 53 | var message = { 54 | extractData: true, 55 | sitemap: JSON.parse(JSON.stringify(sitemap)), 56 | parentSelectorId: parentSelectorId 57 | } 58 | function sendResponse (data) { 59 | callback.call(scope, null, data) 60 | } 61 | contentScraper(message, null, sendResponse, {$, document, window}) 62 | }) 63 | } 64 | } 65 | 66 | module.exports = JSDOMBrowser 67 | -------------------------------------------------------------------------------- /extension/scripts/JSDOMBrowserLoader.js: -------------------------------------------------------------------------------- 1 | const JSDOMBrowser = require('./JSDOMBrowser') 2 | module.exports = function (self) { 3 | var browser = 4 | 5 | self.onerror = function (err) { 6 | self.postMessage({ 7 | err: new Error(err) 8 | }) 9 | self.close() 10 | } 11 | self.addEventListener('message', function (ev) { 12 | const data = ev.data 13 | const UUID = 
/**
 * A single page that has to be scraped.
 *
 * @param url url of the page; combined with the parent job's url when a
 *   parent job is given
 * @param parentSelector id of the selector whose children are extracted
 * @param scraper scraper owning this job (its `sitemap` is used on execute)
 * @param parentJob job that discovered this url (optional)
 * @param baseData data inherited from the parent record (optional)
 */
var Job = function (url, parentSelector, scraper, parentJob, baseData) {
  // null is treated like a missing parent instead of crashing on `.url`
  if (parentJob !== undefined && parentJob !== null) {
    this.url = this.combineUrls(parentJob.url, url)
  } else {
    this.url = url
  }
  this.parentSelector = parentSelector
  this.scraper = scraper
  this.dataItems = []
  this.baseData = baseData || {}
}

Job.prototype = {

  /**
   * Resolves a (possibly relative) child url against the parent url.
   * Parts that neither url provides are left out of the result; they are
   * never rendered as the text "undefined".
   *
   * @param parentUrl url of the page the child url was found on
   * @param childUrl url as found in the page
   * @returns {string} combined url
   */
  combineUrls: function (parentUrl, childUrl) {
    // groups: 1 protocol, 2 host (with port), 5 directory, 6 file, 7 query.
    // Every group is optional, so match() always succeeds and an absent part
    // is reported as undefined.
    var urlMatcher = new RegExp('(https?://)?([a-z0-9\\-\\.]+\\.[a-z0-9\\-]+(:\\d+)?|\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(:\\d+)?)?(\\/[^\\?]*\\/|\\/)?([^\\?]*)?(\\?.*)?', 'i')

    var parentMatches = parentUrl.match(urlMatcher)
    var childMatches = childUrl.match(urlMatcher)

    var orElse = function (value, fallback) {
      return value === undefined ? fallback : value
    }

    // special case for urls like this: ?a=1
    // (the child carries nothing but an optional query string, so the whole
    // parent location is reused)
    if (childMatches[1] === undefined && childMatches[2] === undefined && childMatches[5] === undefined && childMatches[6] === undefined) {
      return orElse(parentMatches[1], '') +
        orElse(parentMatches[2], '') +
        orElse(parentMatches[5], '/') +
        orElse(parentMatches[6], '') +
        orElse(childMatches[7], '')
    }

    // everything the child does not specify is inherited from the parent
    var protocol = orElse(childMatches[1], orElse(parentMatches[1], ''))
    var host = orElse(childMatches[2], orElse(parentMatches[2], ''))
    var directory = orElse(childMatches[5], orElse(parentMatches[5], '/'))
    var file = orElse(childMatches[6], '')
    var query = orElse(childMatches[7], '')

    return protocol + host + directory + file + query
  },

  /**
   * Fetches the page through the browser and stores the extracted records in
   * `dataItems`, filling in keys from `baseData` that a record lacks.
   *
   * @param browser object providing fetchData(url, sitemap, parentSelector, callback, scope)
   * @param callback called with (err) on failure or (null, job) on success
   * @param scope passed through to browser.fetchData
   */
  execute: function (browser, callback, scope) {
    var sitemap = this.scraper.sitemap
    var job = this
    debug('starting fetching')
    browser.fetchData(this.url, sitemap, this.parentSelector, function (err, results) {
      if (err) {
        return callback(err)
      }
      debug('finished fetching')
      // merge data with data from initialization
      for (var i in results) {
        var result = results[i]
        for (var key in this.baseData) {
          // values scraped from the page win over inherited ones
          if (!(key in result)) {
            result[key] = this.baseData[key]
          }
        }
        this.dataItems.push(result)
      }
      callback(null, job)
    }.bind(this), this)
  },

  /**
   * @returns {Array} records collected by execute()
   */
  getResults: function () {
    return this.dataItems
  }
}
/**
 * Generic selector. The type specific behaviour (for example `_getData`) is
 * copied onto the instance from the entry of `selectors` named by
 * `selector.type`.
 *
 * @param selector selector configuration (id, type, selector, parentSelectors
 *   and the features of the concrete type)
 * @param options object carrying `$`, `document` and `window`
 * @throws {Error} when a dependency is missing or the type is not defined
 */
var Selector = function (selector, options) {
  options = options || {}
  var $ = options.$
  var document = options.document
  var window = options.window
  // We don't want enumerable properties
  Object.defineProperty(this, '$', {
    value: $,
    enumerable: false
  })
  Object.defineProperty(this, 'window', {
    value: window,
    enumerable: false
  })
  Object.defineProperty(this, 'document', {
    value: document,
    enumerable: false
  })
  if (!this.$) throw new Error('Missing jquery')
  if (!this.document) throw new Error('Missing document')
  if (!this.window) throw new Error('Missing window')

  this.updateData(selector)
  this.initType()
}

Selector.prototype = {

  /**
   * Is this selector configured to return multiple items?
   * @returns {boolean}
   */
  willReturnMultipleRecords: function () {
    return this.canReturnMultipleRecords() && this.multiple
  },

  /**
   * Update current selector configuration
   * @param data
   * @throws {Error} when data.type is not a known selector type
   */
  updateData: function (data) {
    var allowedKeys = ['window', 'document', 'id', 'type', 'selector', 'parentSelectors']
    debug('data type', data.type)

    // fail with a readable message instead of a TypeError on `.getFeatures`
    var definition = selectors[data.type]
    if (definition === undefined) {
      throw new Error('Selector type not defined ' + data.type)
    }
    allowedKeys = allowedKeys.concat(definition.getFeatures())

    var key
    // update data
    for (key in data) {
      if (allowedKeys.indexOf(key) !== -1 || typeof data[key] === 'function') {
        this[key] = data[key]
      }
    }

    // remove values that are not needed for this type of selector
    for (key in this) {
      if (allowedKeys.indexOf(key) === -1 && typeof this[key] !== 'function') {
        delete this[key]
      }
    }
  },

  /**
   * CSS selector which will be used for element selection
   * @returns {string}
   */
  getItemCSSSelector: function () {
    return '*'
  },

  /**
   * override objects methods based on selector type
   */
  initType: function () {
    if (selectors[this.type] === undefined) {
      throw new Error('Selector type not defined ' + this.type)
    }

    // overrides objects methods
    for (var i in selectors[this.type]) {
      this[i] = selectors[this.type][i]
    }
  },

  /**
   * Check whether a selector is a parent selector of this selector
   * @param selectorId
   * @returns {boolean}
   */
  hasParentSelector: function (selectorId) {
    return (this.parentSelectors.indexOf(selectorId) !== -1)
  },

  /**
   * Removes the given id from the parent selectors (no-op when absent)
   * @param selectorId
   */
  removeParentSelector: function (selectorId) {
    var index = this.parentSelectors.indexOf(selectorId)
    if (index !== -1) {
      this.parentSelectors.splice(index, 1)
    }
  },

  /**
   * Replaces a parent selector id in place (no-op when absent)
   * @param originalId
   * @param replacementId
   */
  renameParentSelector: function (originalId, replacementId) {
    if (this.hasParentSelector(originalId)) {
      var pos = this.parentSelectors.indexOf(originalId)
      this.parentSelectors.splice(pos, 1, replacementId)
    }
  },

  /**
   * Elements matched by this selector within the parent element
   * @param parentElement
   * @returns {Array} all matches when `multiple` is set, otherwise at most
   *   the first match
   */
  getDataElements: function (parentElement) {
    var $ = this.$
    var document = this.document
    var window = this.window
    var elements = ElementQuery(this.selector, parentElement, {$, document, window})
    if (this.multiple) {
      return elements
    } else if (elements.length > 0) {
      return [elements[0]]
    } else {
      return []
    }
  },

  /**
   * Runs the type specific `_getData`, after `delay` milliseconds when a
   * delay is configured.
   * @param parentElement
   * @returns promise resolved with the extracted data, or rejected with the
   *   reason `_getData` was rejected with
   */
  getData: function (parentElement) {
    var d = jquery.Deferred()
    var timeout = this.delay || 0

    var extract = function () {
      this._getData(parentElement).done(function (data) {
        d.resolve(data)
      }).fail(function () {
        // without this the returned promise would stay pending forever
        d.reject.apply(d, arguments)
      })
    }.bind(this)

    // this works much faster because whenCallSequentally isn't running next data extraction immediately
    if (timeout === 0) {
      extract()
    } else {
      setTimeout(extract, timeout)
    }

    return d.promise()
  }
}
/**
 * Selector that extracts the value of one attribute (`extractAttribute`)
 * from every matched element and stores it under the selector id.
 */
var SelectorElementAttribute = {
  canReturnMultipleRecords: function () {
    return true
  },

  canHaveChildSelectors: function () {
    return false
  },

  canHaveLocalChildSelectors: function () {
    return false
  },

  canCreateNewJobs: function () {
    return false
  },
  willReturnElements: function () {
    return false
  },

  /**
   * @param parentElement element to search within
   * @returns promise resolved with one `{[id]: attributeValue}` record per
   *   matched element; a single-value selector without a match yields one
   *   record whose value is null
   */
  _getData: function (parentElement) {
    var dfd = jquery.Deferred()
    var self = this
    var elements = this.getDataElements(parentElement)

    var result = []
    self.$(elements).each(function (k, element) {
      var data = {}

      data[this.id] = self.$(element).attr(this.extractAttribute)
      result.push(data)
    }.bind(this))

    if (this.multiple === false && elements.length === 0) {
      var data = {}
      // the placeholder uses the column announced by getDataColumns(); an
      // '-src' suffixed key would never show up in the exported data
      data[this.id] = null
      result.push(data)
    }
    dfd.resolve(result)

    return dfd.promise()
  },

  /**
   * @returns {Array} names of the columns produced by this selector
   */
  getDataColumns: function () {
    return [this.id]
  },

  /**
   * @returns {Array} configuration keys understood by this selector type
   */
  getFeatures: function () {
    return ['multiple', 'extractAttribute', 'delay']
  }
}
/**
 * Element selector for pages that load more items while scrolling: keeps
 * scrolling to the bottom until the number of matched elements stops
 * changing.
 */
var SelectorElementScroll = {

  canReturnMultipleRecords: function () {
    return true
  },

  canHaveChildSelectors: function () {
    return true
  },

  canHaveLocalChildSelectors: function () {
    return true
  },

  canCreateNewJobs: function () {
    return false
  },
  willReturnElements: function () {
    return true
  },

  /**
   * Scrolls the page handed to the selector down to its current bottom.
   */
  scrollToBottom: function () {
    // use the injected window/document pair; a global `window` does not
    // exist when the scraper runs outside of a browser page
    var window = this.window
    var document = this.document
    window.scrollTo(0, document.body.scrollHeight)
  },

  /**
   * @param parentElement element to search within
   * @returns promise resolved with the array of matched elements once a
   *   scroll followed by `delay` milliseconds brought no change in their count
   */
  _getData: function (parentElement) {
    var delay = parseInt(this.delay, 10) || 0
    var deferredResponse = jquery.Deferred()
    var foundElements = []

    // initially scroll down and wait
    this.scrollToBottom()
    var nextElementSelection = (new Date()).getTime() + delay

    // infinitely scroll down and find all items
    var interval = setInterval(function () {
      var now = (new Date()).getTime()
      // sleep. wait when to extract next elements
      if (now < nextElementSelection) {
        return
      }

      var elements = this.getDataElements(parentElement)
      // no new elements found
      if (elements.length === foundElements.length) {
        clearInterval(interval)
        deferredResponse.resolve(this.$.makeArray(elements))
      } else {
        // continue scrolling and add delay
        foundElements = elements
        this.scrollToBottom()
        nextElementSelection = now + delay
      }
    }.bind(this), 50)

    return deferredResponse.promise()
  },

  /**
   * @returns {Array} this selector produces no data columns of its own
   */
  getDataColumns: function () {
    return []
  },

  /**
   * @returns {Array} configuration keys understood by this selector type
   */
  getFeatures: function () {
    return ['multiple', 'delay']
  }
}
/**
 * Selector that extracts the Google Maps feature id (`0x…:0x…`) from the
 * `pb` query parameter of an embedded Google Maps iframe.
 */
var SelectorGoogMapID = {

  canReturnMultipleRecords: function () {
    return true
  },

  canHaveChildSelectors: function () {
    return false
  },

  canHaveLocalChildSelectors: function () {
    return false
  },

  canCreateNewJobs: function () {
    return false
  },
  willReturnElements: function () {
    return false
  },

  /**
   * Reads the map iframe inside the container and pulls the feature id out
   * of its `src` url.
   * @param $container element (or jQuery object) that contains the iframe
   * @returns the `0x…:0x…` id, or '' when it cannot be determined
   */
  getMapID: function ($container) {
    const $ = this.$
    const mapSelector = this.getMapsSelector()
    const mUrl = $($container).find(mapSelector).attr('src')
    if (!mUrl) {
      debug('Goog map url was undefined')
      return ''
    }
    const mQuery = url.parse(mUrl, true).query
    const pb = mQuery ? mQuery.pb : null
    if (!pb) {
      // log the offending url, not the `url` module
      debug('Pb in query was undefined in url', mUrl)
      return ''
    }
    const match = pb.match(/0x[0-9a-f]{15,16}:0x[0-9a-f]{15,16}/)
    if (!match) {
      debug('Could not find fid in pb', pb)
      return ''
    }
    return match[0]
  },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns promise resolved with one `{<id>_FTID: mapId}` record per container
   */
  _getData: function (parentElement) {
    var dfd = jquery.Deferred()
    var $ = this.$

    // easier to select divs containing the iframe
    var containers = this.getDataElements(parentElement)
    const result = []
    var selector = this
    $(containers).each(function (k, container) {
      const mapId = selector.getMapID($(container))
      result.push({[selector.id + '_FTID']: mapId})
    })

    dfd.resolve(result)
    return dfd.promise()
  },

  /** @returns declared column names; only `<id>_FTID` is filled by _getData */
  getDataColumns: function () {
    return [this.id + '_FTID', this.id + '_PID', this.id + '_CID']
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['mapsSelectorFromDiv']
  },

  getItemCSSSelector: function () {
    // We get the container
    return '*:not(div.overlay)'
  },

  /**
   * @param html markup of a candidate container
   * @param options optional `{$}` override for the jQuery instance
   * @returns the default iframe selector when the markup contains a Google
   *   Maps embed, '' otherwise
   */
  getMapsSelectorFromDivHTML: function (html, options = {}) {
    const $ = options.$ || this.$
    const div = $(html)
    const defaultSelector = 'iframe[src*="google.com/maps/embed"]'
    if (div.find(defaultSelector).length) {
      return defaultSelector
    }
    return ''
  },

  /** @returns the configured iframe selector, or the default embed selector */
  getMapsSelector: function () {
    if (this.mapsSelectorFromDiv === undefined) {
      return 'iframe[src*="google.com/maps/embed"]'
    } else {
      return this.mapsSelectorFromDiv
    }
  }
}
/**
 * Selector that collects the text (and optionally one attribute) of every
 * matched element into a single record holding an array.
 */
var SelectorGroup = {

  canReturnMultipleRecords: function () { return false },

  canHaveChildSelectors: function () { return false },

  canHaveLocalChildSelectors: function () { return false },

  canCreateNewJobs: function () { return false },

  willReturnElements: function () { return false },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns promise resolved with `[{<id>: [...]}]`, one entry per match
   */
  _getData: function (parentElement) {
    var selector = this
    var deferred = jquery.Deferred()
    // queried directly: getDataElements depends on the *multiple* property
    var matched = selector.$(selector.selector, parentElement)
    var groupedRecords = []

    selector.$(matched).each(function (index, node) {
      var entry = {}
      entry[selector.id] = selector.$(node).text()

      var attributeName = selector.extractAttribute
      if (attributeName) {
        entry[selector.id + '-' + attributeName] = selector.$(node).attr(attributeName)
      }

      groupedRecords.push(entry)
    })

    var wrapper = {}
    wrapper[selector.id] = groupedRecords
    deferred.resolve([wrapper])

    return deferred.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id]
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['delay', 'extractAttribute']
  }
}
/**
 * Selector that returns the inner HTML of each matched element, optionally
 * narrowed to the first match of a regular expression.
 */
var SelectorHTML = {

  canReturnMultipleRecords: function () { return true },

  canHaveChildSelectors: function () { return false },

  canHaveLocalChildSelectors: function () { return false },

  canCreateNewJobs: function () { return false },

  willReturnElements: function () { return false },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns promise resolved with `{<id>: html}` records; html is null when
   *   a configured regex does not match
   */
  _getData: function (parentElement) {
    var selector = this
    var deferred = jquery.Deferred()
    var matched = selector.getDataElements(parentElement)
    var records = []

    selector.$(matched).each(function (index, node) {
      var html = selector.$(node).html()

      // the regex is compiled per element, only when one is configured
      if (selector.regex !== undefined && selector.regex.length) {
        var found = html.match(new RegExp(selector.regex))
        html = found === null ? null : found[0]
      }

      var record = {}
      record[selector.id] = html
      records.push(record)
    })

    // single-record selectors still emit a row when nothing matched
    if (selector.multiple === false && matched.length === 0) {
      var placeholder = {}
      placeholder[selector.id] = null
      records.push(placeholder)
    }

    deferred.resolve(records)
    return deferred.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id]
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['multiple', 'regex', 'delay']
  }
}
/**
 * Selector that records the `src` of matched images and, when
 * `downloadImage` is set, also downloads each image as base64.
 */
var SelectorImage = {
  canReturnMultipleRecords: function () {
    return true
  },

  canHaveChildSelectors: function () {
    return false
  },

  canHaveLocalChildSelectors: function () {
    return false
  },

  canCreateNewJobs: function () {
    return false
  },
  willReturnElements: function () {
    return false
  },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns promise resolved with one record per image: `<id>-src`, plus
   *   `_imageBase64-<id>` / `_imageMimeType-<id>` when the download succeeds
   */
  _getData: function (parentElement) {
    var self = this
    var dfd = jquery.Deferred()

    var elements = this.getDataElements(parentElement)

    var deferredDataCalls = []
    this.$(elements).each(function (i, element) {
      deferredDataCalls.push(function () {
        var deferredData = jquery.Deferred()

        var data = {}
        data[self.id + '-src'] = element.src

        // download image if required
        if (!self.downloadImage) {
          deferredData.resolve(data)
        } else {
          self.downloadImageBase64(element.src).done(function (imageResponse) {
            data['_imageBase64-' + self.id] = imageResponse.imageBase64
            data['_imageMimeType-' + self.id] = imageResponse.mimeType

            deferredData.resolve(data)
          }).fail(function () {
            // best effort: a failed download still yields the src-only record
            deferredData.resolve(data)
          })
        }

        return deferredData.promise()
      })
    })

    // `self` is used because this callback is not invoked with the selector
    // as `this`; reading `this.multiple` here never saw the selector.
    whenCallSequentially(deferredDataCalls).done(function (dataResults) {
      if (self.multiple === false && elements.length === 0) {
        var data = {}
        data[self.id + '-src'] = null
        dataResults.push(data)
      }

      dfd.resolve(dataResults)
    })

    return dfd.promise()
  },

  /**
   * @param url address of the file to fetch
   * @returns promise resolved with the response blob, or rejected with the
   *   XHR status text on a non-200 response
   */
  downloadFileAsBlob: function (url) {
    var window = this.window
    var deferredResponse = jquery.Deferred()
    var xhr = new window.XMLHttpRequest()
    xhr.onreadystatechange = function () {
      if (this.readyState === 4) {
        if (this.status === 200) {
          var blob = this.response
          deferredResponse.resolve(blob)
        } else {
          deferredResponse.reject(xhr.statusText)
        }
      }
    }
    xhr.open('GET', url)
    xhr.responseType = 'blob'
    xhr.send()

    return deferredResponse.promise()
  },

  /**
   * @param url address of the image to fetch
   * @returns promise resolved with `{mimeType, imageBase64}`, or rejected
   *   with the download error
   */
  downloadImageBase64: function (url) {
    var deferredResponse = jquery.Deferred()
    var deferredDownload = this.downloadFileAsBlob(url)
    deferredDownload.done(function (blob) {
      var mimeType = blob.type
      var deferredBlob = Base64.blobToBase64(blob)
      deferredBlob.done(function (imageBase64) {
        deferredResponse.resolve({
          mimeType: mimeType,
          imageBase64: imageBase64
        })
      })
    }).fail(function (error) {
      // reject the response; passing `deferredResponse.fail` as the handler
      // only registered another callback and left callers waiting forever
      deferredResponse.reject(error)
    })
    return deferredResponse.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id + '-src']
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['multiple', 'delay', 'downloadImage']
  },

  getItemCSSSelector: function () {
    return 'img'
  }
}
/**
 * Selector that extracts link text and href from matched anchors and marks
 * each record so the href can be followed as a new job.
 */
var SelectorLink = {
  canReturnMultipleRecords: function () { return true },

  canHaveChildSelectors: function () { return true },

  canHaveLocalChildSelectors: function () { return false },

  canCreateNewJobs: function () { return true },

  willReturnElements: function () { return false },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns deferred/promise resolved with one record per link, carrying
   *   `<id>`, `<id>-href`, `_follow` and `_followSelectorId`
   */
  _getData: function (parentElement) {
    var selector = this
    var matched = selector.getDataElements(parentElement)
    var deferred = jquery.Deferred()

    // single-record selector with no match: emit one empty record
    if (selector.multiple === false && matched.length === 0) {
      var placeholder = {}
      placeholder[selector.id] = null
      deferred.resolve([placeholder])
      return deferred
    }

    // one extraction call per link, executed one after another
    var extractionCalls = []
    selector.$(matched).each(function (index, anchor) {
      extractionCalls.push(function () {
        var deferredRecord = jquery.Deferred()

        var record = {}
        record[selector.id] = selector.$(anchor).text()
        record._followSelectorId = selector.id
        record[selector.id + '-href'] = anchor.href
        record._follow = anchor.href
        deferredRecord.resolve(record)

        return deferredRecord
      })
    })

    whenCallSequentially(extractionCalls).done(function (responses) {
      var collected = []
      responses.forEach(function (response) {
        collected.push(response)
      })
      deferred.resolve(collected)
    })

    return deferred.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id, this.id + '-href']
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['multiple', 'delay']
  },

  getItemCSSSelector: function () {
    return 'a'
  }
}
/**
 * Selector for links that open through `window.open`: clicks each matched
 * element, captures the requested url and emits it as a followable link.
 */
var SelectorPopupLink = {
  canReturnMultipleRecords: function () {
    return true
  },

  canHaveChildSelectors: function () {
    return true
  },

  canHaveLocalChildSelectors: function () {
    return false
  },

  canCreateNewJobs: function () {
    return true
  },
  willReturnElements: function () {
    return false
  },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns deferred/promise resolved with one record per element. When
   *   the popup url cannot be determined the record carries
   *   `<id>-href: null` and no follow metadata.
   */
  _getData: function (parentElement) {
    var $ = this.$
    var self = this
    var elements = this.getDataElements(parentElement)

    var dfd = jquery.Deferred()

    // return empty record if not multiple type and no elements found
    if (this.multiple === false && elements.length === 0) {
      var data = {}
      data[this.id] = null
      dfd.resolve([data])
      return dfd
    }

    // extract links one by one
    var deferredDataExtractionCalls = []
    $(elements).each(function (k, element) {
      deferredDataExtractionCalls.push(function () {
        var deferredData = jquery.Deferred()

        var data = {}
        data[self.id] = $(element).text()

        self.getPopupURL(element).done(function (url) {
          data._followSelectorId = self.id
          data[self.id + '-href'] = url
          data._follow = url
          deferredData.resolve(data)
        }).fail(function () {
          // no url was captured: keep the record but add nothing to follow,
          // so the remaining elements are still processed
          data[self.id + '-href'] = null
          deferredData.resolve(data)
        })

        return deferredData
      })
    })

    whenCallSequentially(deferredDataExtractionCalls).done(function (responses) {
      var result = []
      responses.forEach(function (dataResult) {
        result.push(dataResult)
      })
      dfd.resolve(result)
    })

    return dfd.promise()
  },

  /**
   * Gets an url from a window.open call by mocking the window.open function
   * @param element element whose click is expected to call window.open
   * @returns promise resolved with the url, or rejected with an Error when
   *   the element cannot be located or no url shows up within 5 seconds
   */
  getPopupURL: function (element) {
    var $ = this.$
    var document = this.document
    var window = this.window
    var deferredURL = jquery.Deferred()

    // locate the element from the document through a generated css selector
    var cs = new CssSelector({
      enableSmartTableSelector: false,
      parent: document.body,
      enableResultStripping: false
    })
    var cssSelector = cs.getCssSelector([element])
    debug(cssSelector)
    var el = document.querySelectorAll(cssSelector)[0]
    if (!el) {
      deferredURL.reject(new Error('Could not locate popup link element: ' + cssSelector))
      return deferredURL.promise()
    }

    // catch the window.open call and store the requested url on the element
    var originalOpen = window.open
    window.open = function () {
      var url = arguments[0]
      el.dataset.webScraperExtractUrl = url
      window.open = originalOpen
    }
    el.click()

    // wait for url to be available
    var ticksLeft = Math.ceil(5000 / 30) // 5s timeout to generate an url for popup
    var interval = setInterval(function () {
      var url = $(element).data('web-scraper-extract-url')
      if (url) {
        clearInterval(interval)
        window.open = originalOpen
        deferredURL.resolve(url)
        return
      }
      // timeout popup opening: settle the promise so callers do not wait
      // forever, and make sure the window.open mock does not leak
      if (ticksLeft-- <= 0) {
        clearInterval(interval)
        window.open = originalOpen
        deferredURL.reject(new Error('Timed out waiting for popup url'))
      }
    }, 30)

    return deferredURL.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id, this.id + '-href']
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['multiple', 'delay']
  },

  getItemCSSSelector: function () {
    return '*'
  }
}
/**
 * Selector that extracts the visible text of each matched element,
 * optionally narrowed to the first match of a regular expression.
 */
var SelectorText = {

  canReturnMultipleRecords: function () { return true },

  canHaveChildSelectors: function () { return false },

  canHaveLocalChildSelectors: function () { return false },

  canCreateNewJobs: function () { return false },

  willReturnElements: function () { return false },

  /**
   * @param parentElement element the selector is evaluated against
   * @returns promise resolved with `{<id>: text}` records; text is null when
   *   a configured regex does not match
   */
  _getData: function (parentElement) {
    var selector = this
    var $ = selector.$
    var deferred = jquery.Deferred()
    var matched = selector.getDataElements(parentElement)
    var records = []

    $(matched).each(function (index, node) {
      // work on a copy so the page itself is left untouched
      var $copy = $(node).clone()
      // script and style contents are not part of the text result
      $copy.find('script, style').remove()
      // line breaks become newlines
      $copy.find('br').after('\n')

      var text = $copy.text()
      if (selector.regex !== undefined && selector.regex.length) {
        var found = text.match(new RegExp(selector.regex))
        text = found === null ? null : found[0]
      }

      var record = {}
      record[selector.id] = text
      records.push(record)
    })

    // single-record selectors still emit a row when nothing matched
    if (selector.multiple === false && matched.length === 0) {
      var placeholder = {}
      placeholder[selector.id] = null
      records.push(placeholder)
    }

    deferred.resolve(records)
    return deferred.promise()
  },

  /** @returns names of the columns this selector writes */
  getDataColumns: function () {
    return [this.id]
  },

  /** @returns configuration features this selector supports */
  getFeatures: function () {
    return ['multiple', 'regex', 'delay']
  }
}
/**
 * PouchDB-backed storage for sitemaps and their scraped data.
 * @param config object providing `sitemapDb` and `dataDb` locations
 * @param options object providing `$`, `document` and `window`
 */
var Store = function (config, options) {
  this.config = config
  this.$ = options.$
  this.document = options.document
  this.window = options.window
  if (!this.$) {
    throw new Error('jquery required')
  }
  if (!this.document) {
    throw new Error('Missing document')
  }
  if (!this.window) {
    throw new Error('Missing window')
  }
  // database that holds the sitemap definitions
  this.sitemapDb = new PouchDB(this.config.sitemapDb)
}

/**
 * Writes scraped records into one sitemap's data db.
 * @param db database handle exposing `bulkDocs`
 */
var StoreScrapeResultWriter = function (db) {
  this.db = db
}

StoreScrapeResultWriter.prototype = {
  /**
   * Persists the documents in bulk; the callback is invoked without
   * arguments afterwards, also when the write reported an error.
   */
  writeDocs: function (docs, callback) {
    if (docs.length === 0) {
      callback()
      return
    }
    this.db.bulkDocs({docs: docs}, function (err, response) {
      if (err !== null) {
        debug('Error while persisting scraped data to db', err)
      }
      callback()
    })
  }
}

Store.prototype = {

  /** Builds a db name from a sitemap id, replacing disallowed characters. */
  sanitizeSitemapDataDbName: function (dbName) {
    var safeName = dbName.replace(/[^a-z0-9_\$\(\)\+\-/]/gi, '_')
    return 'sitemap-data-' + safeName
  },

  /** @returns location of the data db that belongs to the sitemap id */
  getSitemapDataDbLocation: function (sitemapId) {
    return this.config.dataDb + this.sanitizeSitemapDataDbName(sitemapId)
  },

  /** @returns a PouchDB handle for the sitemap's data db */
  getSitemapDataDb: function (sitemapId) {
    return new PouchDB(this.getSitemapDataDbLocation(sitemapId))
  },

  /**
   * creates or clears a sitemap db
   * @param sitemapId id of the sitemap whose data db is reset
   * @param callback receives a StoreScrapeResultWriter for the fresh db
   */
  initSitemapDataDb: function (sitemapId, callback) {
    var store = this
    var dbLocation = store.getSitemapDataDbLocation(sitemapId)

    PouchDB.destroy(dbLocation, function () {
      var freshDb = store.getSitemapDataDb(sitemapId)
      callback(new StoreScrapeResultWriter(freshDb))
    })
  },

  /**
   * Stores the sitemap and copies the new revision onto the passed object.
   * @param sitemap sitemap to persist (its `_rev` is updated)
   * @param callback receives the same sitemap object
   */
  createSitemap: function (sitemap, callback) {
    var sitemapJson = JSON.parse(JSON.stringify(sitemap))

    if (!sitemap._id) {
      debug('cannot save sitemap without an id', sitemap)
    }

    this.sitemapDb.put(sitemapJson, function (err, response) {
      // @TODO handle err
      sitemap._rev = response.rev
      callback(sitemap)
    })
  },

  saveSitemap: function (sitemap, callback) {
    // @TODO remove
    this.createSitemap(sitemap, callback)
  },

  /**
   * Removes the sitemap document and destroys its data db.
   * @param callback invoked without arguments once the data db is destroyed
   */
  deleteSitemap: function (sitemap, callback) {
    var store = this
    var plainSitemap = JSON.parse(JSON.stringify(sitemap))

    store.sitemapDb.remove(plainSitemap, function (err, response) {
      // @TODO handle err

      // delete sitemap data db
      var dbLocation = store.getSitemapDataDbLocation(plainSitemap._id)
      PouchDB.destroy(dbLocation, function () {
        callback()
      })
    })
  },

  /**
   * Loads every stored sitemap; outside of a chrome extension the documents
   * are wrapped in Sitemap instances.
   * @param callback receives the array of sitemaps
   */
  getAllSitemaps: function (callback) {
    var deps = {$: this.$, document: this.document, window: this.window}
    this.sitemapDb.allDocs({include_docs: true}, function (err, response) {
      var sitemaps = []
      for (var i in response.rows) {
        var sitemap = response.rows[i].doc
        if (!chrome.extension) {
          sitemap = new Sitemap(sitemap, deps)
        }

        sitemaps.push(sitemap)
      }
      callback(sitemaps)
    })
  },

  /**
   * @param sitemap sitemap whose scraped documents are requested
   * @param callback receives the array of stored documents
   */
  getSitemapData: function (sitemap, callback) {
    var db = this.getSitemapDataDb(sitemap._id)
    db.allDocs({include_docs: true}, function (err, response) {
      var responseData = []
      for (var i in response.rows) {
        responseData.push(response.rows[i].doc)
      }
      callback(responseData)
    })
  },

  // @TODO make this call lighter
  /** @param callback receives true when a sitemap with that id is stored */
  sitemapExists: function (sitemapId, callback) {
    this.getAllSitemaps(function (sitemaps) {
      var sitemapFound = false
      for (var i in sitemaps) {
        if (sitemaps[i]._id === sitemapId) {
          sitemapFound = true
        }
      }
      callback(sitemapFound)
    })
  }
}
/**
 * Proxy store for the devtools panel: every operation is forwarded as a
 * message through `chrome.runtime.sendMessage` instead of touching a
 * database directly.
 * @param options object providing `$`, `document` and `window`
 * @constructor
 */
var StoreDevtools = function (options) {
  this.$ = options.$
  this.document = options.document
  this.window = options.window
  if (!this.$) {
    throw new Error('jquery required')
  }
  if (!this.document) {
    throw new Error('Missing document')
  }
  if (!this.window) {
    throw new Error('Missing window')
  }
}

StoreDevtools.prototype = {
  /**
   * Sends the sitemap for storage and copies the returned revision onto
   * the passed object.
   * @param callback receives the same sitemap object
   */
  createSitemap: function (sitemap, callback) {
    var message = {
      createSitemap: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }

    chrome.runtime.sendMessage(message, function (newSitemap) {
      sitemap._rev = newSitemap._rev
      callback(sitemap)
    })
  },

  saveSitemap: function (sitemap, callback) {
    this.createSitemap(sitemap, callback)
  },

  /** @param callback invoked without arguments once the reply arrives */
  deleteSitemap: function (sitemap, callback) {
    var message = {
      deleteSitemap: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }
    chrome.runtime.sendMessage(message, function (response) {
      callback()
    })
  },

  /** @param callback receives the replies wrapped in Sitemap instances */
  getAllSitemaps: function (callback) {
    var deps = {$: this.$, document: this.document, window: this.window}
    var message = {
      getAllSitemaps: true
    }

    chrome.runtime.sendMessage(message, function (response) {
      var sitemaps = []

      for (var i in response) {
        sitemaps.push(new Sitemap(response[i], deps))
      }
      callback(sitemaps)
    })
  },

  /** @param callback receives the reply unchanged */
  getSitemapData: function (sitemap, callback) {
    var message = {
      getSitemapData: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }

    chrome.runtime.sendMessage(message, function (response) {
      callback(response)
    })
  },

  /** @param callback receives the reply unchanged */
  sitemapExists: function (sitemapId, callback) {
    var message = {
      sitemapExists: true,
      sitemapId: sitemapId
    }

    chrome.runtime.sendMessage(message, function (response) {
      callback(response)
    })
  }
}
/**
 * Array-like list that only accepts elements whose uniqueness id (text,
 * html or css selector, depending on clickElementUniquenessType) has not
 * been added before.
 * @param clickElementUniquenessType one of 'uniqueText', 'uniqueHTMLText',
 *   'uniqueHTML', 'uniqueCSSSelector'
 * @param options object providing `$`, `window` and `document`
 * @constructor
 */
function UniqueElementList (clickElementUniquenessType, options) {
  var $ = options.$
  var window = options.window
  var document = options.document

  // non-enumerable so the injected dependencies stay out of the list's keys
  Object.defineProperty(this, '$', {
    value: $,
    enumerable: false
  })
  Object.defineProperty(this, 'window', {
    value: window,
    enumerable: false
  })
  Object.defineProperty(this, 'document', {
    value: document,
    enumerable: false
  })
  if (!this.$) throw new Error('jquery required')
  if (!this.document) {
    throw new Error('Missing document')
  }
  if (!this.window) throw new Error('Missing window')
  this.clickElementUniquenessType = clickElementUniquenessType
  this.addedElements = {}
}

UniqueElementList.prototype = []

/**
 * Adds a deep clone of the element unless an element with the same
 * uniqueness id is already present.
 * @returns true when the element was added, false when it was a duplicate
 */
UniqueElementList.prototype.push = function (element) {
  var $ = this.$
  if (this.isAdded(element)) {
    return false
  } else {
    var elementUniqueId = this.getElementUniqueId(element)
    this.addedElements[elementUniqueId] = true
    Array.prototype.push.call(this, $(element).clone(true)[0])
    return true
  }
}

/**
 * Computes the id used to decide whether two elements count as the same.
 * @returns trimmed text, serialised html (with or without text nodes) or a
 *   css selector, depending on clickElementUniquenessType
 * @throws Error when clickElementUniquenessType is not a known type
 */
UniqueElementList.prototype.getElementUniqueId = function (element) {
  var $ = this.$
  var elementHTML
  if (this.clickElementUniquenessType === 'uniqueText') {
    var elementText = $(element).text().trim()
    return elementText
  } else if (this.clickElementUniquenessType === 'uniqueHTMLText') {
    // NOTE(review): the wrapper is a detached container used only to
    // serialise the element's outer html; confirm the exact wrapper markup
    // against the repository.
    elementHTML = $("<div class='-web-scraper-should-not-be-visible'>").append($(element).eq(0).clone()).html()
    return elementHTML
  } else if (this.clickElementUniquenessType === 'uniqueHTML') {
    // get element without text
    var $element = $(element).eq(0).clone()

    var removeText = function ($element) {
      $element.contents()
        .filter(function () {
          if (this.nodeType !== 3) {
            removeText($(this))
          }
          return this.nodeType === 3 // Node.TEXT_NODE
        }).remove()
    }
    removeText($element)

    // NOTE(review): same detached wrapper as above; confirm exact markup.
    elementHTML = $("<div class='-web-scraper-should-not-be-visible'>").append($element).html()
    return elementHTML
  } else if (this.clickElementUniquenessType === 'uniqueCSSSelector') {
    var cs = new CssSelector({
      enableSmartTableSelector: false,
      parent: $('body')[0],
      enableResultStripping: false
    })
    var CSSSelector = cs.getCssSelector([element])
    return CSSSelector
  } else {
    throw new Error('Invalid clickElementUniquenessType ' + this.clickElementUniquenessType)
  }
}

/**
 * @returns true when an element with the same uniqueness id was added
 */
UniqueElementList.prototype.isAdded = function (element) {
  var elementUniqueId = this.getElementUniqueId(element)
  // own-property check: `in` would also match inherited keys such as
  // "toString", wrongly reporting such elements as already added
  return Object.prototype.hasOwnProperty.call(this.addedElements, elementUniqueId)
}
this.worker.postMessage({ 32 | topic: 'init', 33 | UUID: 'init', 34 | options 35 | }) 36 | promises.init = { 37 | resolve: function () { 38 | debug('successfully created') 39 | }, 40 | reject: function (err) { 41 | console.error(err) 42 | } 43 | } 44 | } 45 | 46 | WebJSDOMBrowser.prototype = { 47 | loadUrl: function (url, callback) { 48 | const UUID = parseInt(Math.random() * 1000000).toString() 49 | let res, rej 50 | const promise = new Promise(function (resolve, reject) { 51 | res = resolve 52 | rej = reject 53 | }) 54 | this.promises[UUID] = {resolve: res, reject: rej} 55 | this.worker.postMessage({ 56 | topic: 'loadUrl', 57 | url, 58 | UUID 59 | }) 60 | promise.then(function (info) { 61 | callback() 62 | }, function (err) {callback(err)}) 63 | }, 64 | saveImages: function (record, namingFunction) { 65 | var deferredResponse = jqueryDeferred.Deferred() 66 | var deferredImageStoreCalls = [] 67 | var prefixLength = '_imageBase64-'.length 68 | for (var attr in record) { 69 | if (attr.substr(0, prefixLength) === '_imageBase64-') { 70 | throw new Error('Downloading images is not yet supported') 71 | } 72 | } 73 | whenCallSequentially(deferredImageStoreCalls).done(function () { 74 | deferredResponse.resolve() 75 | }) 76 | 77 | return deferredResponse.promise() 78 | }, 79 | fetchData: function (url, sitemap, parentSelectorId, callback, scope) { 80 | const UUID = parseInt(Math.random() * 1000000).toString() 81 | let res, rej 82 | const promise = new Promise(function (resolve, reject) { 83 | res = resolve 84 | rej = reject 85 | }) 86 | this.promises[UUID] = {resolve: res, reject: rej} 87 | this.worker.postMessage({ 88 | topic: 'fetchData', 89 | url, 90 | UUID, 91 | sitemap: JSON.parse(JSON.stringify(sitemap)), 92 | parentSelectorId 93 | }) 94 | promise.then(function (info) { 95 | callback.call(scope, null, info.results) 96 | }, function (err) { 97 | callback(err) 98 | }) 99 | }, 100 | close: function () { 101 | debug('closing webjsdom browser') 102 | if (this.worker) 
this.worker.terminate() 103 | this.worker = null 104 | } 105 | } 106 | 107 | module.exports = WebJSDOMBrowser 108 | -------------------------------------------------------------------------------- /extension/scripts/getBackgroundScript.js: -------------------------------------------------------------------------------- 1 | var jquery = require('jquery-deferred') 2 | var BackgroundScript = require('./BackgroundScript') 3 | /** 4 | * @param location configure from where the content script is being accessed (ContentScript, BackgroundPage, DevTools) 5 | * @returns BackgroundScript 6 | */ 7 | var getBackgroundScript = function (location) { 8 | // Handle calls from different places 9 | if (location === 'BackgroundScript') { 10 | return BackgroundScript 11 | } else if (location === 'DevTools' || location === 'ContentScript') { 12 | // if called within background script proxy calls to content script 13 | var backgroundScript = {} 14 | 15 | Object.keys(BackgroundScript).forEach(function (attr) { 16 | if (typeof BackgroundScript[attr] === 'function') { 17 | backgroundScript[attr] = function (request) { 18 | var reqToBackgroundScript = { 19 | backgroundScriptCall: true, 20 | fn: attr, 21 | request: request 22 | } 23 | 24 | var deferredResponse = jquery.Deferred() 25 | 26 | chrome.runtime.sendMessage(reqToBackgroundScript, function (response) { 27 | deferredResponse.resolve(response) 28 | }) 29 | 30 | return deferredResponse 31 | } 32 | } else { 33 | backgroundScript[attr] = BackgroundScript[attr] 34 | } 35 | }) 36 | 37 | return backgroundScript 38 | } else { 39 | throw new Error('Invalid BackgroundScript initialization - ' + location) 40 | } 41 | } 42 | 43 | module.exports = getBackgroundScript 44 | -------------------------------------------------------------------------------- /extension/scripts/getContentScript.js: -------------------------------------------------------------------------------- 1 | var getBackgroundScript = require('./getBackgroundScript') 2 | var 
ContentScript = require('./ContentScript') 3 | /** 4 | * 5 | * @param location configure from where the content script is being accessed (ContentScript, BackgroundPage, DevTools) 6 | * @param options 7 | * @returns ContentScript 8 | */ 9 | var getContentScript = function (location) { 10 | var contentScript 11 | 12 | // Handle calls from different places 13 | if (location === 'ContentScript') { 14 | contentScript = ContentScript 15 | contentScript.backgroundScript = getBackgroundScript('ContentScript') 16 | return contentScript 17 | } else if (location === 'BackgroundScript' || location === 'DevTools') { 18 | var backgroundScript = getBackgroundScript(location) 19 | 20 | // if called within background script proxy calls to content script 21 | contentScript = {} 22 | Object.keys(ContentScript).forEach(function (attr) { 23 | if (typeof ContentScript[attr] === 'function') { 24 | contentScript[attr] = function (request) { 25 | var reqToContentScript = { 26 | contentScriptCall: true, 27 | fn: attr, 28 | request: request 29 | } 30 | 31 | return backgroundScript.executeContentScript(reqToContentScript) 32 | } 33 | } else { 34 | contentScript[attr] = ContentScript[attr] 35 | } 36 | }) 37 | contentScript.backgroundScript = backgroundScript 38 | return contentScript 39 | } else { 40 | throw new Error('Invalid ContentScript initialization - ' + location) 41 | } 42 | } 43 | 44 | module.exports = getContentScript 45 | -------------------------------------------------------------------------------- /gulpfile.js: -------------------------------------------------------------------------------- 1 | const gulp = require('gulp') 2 | const browserify = require('browserify') 3 | const watchify = require('watchify') 4 | const source = require('vinyl-source-stream') 5 | const notify = require('gulp-notify') 6 | const Server = require('karma').Server 7 | const path = require('path') 8 | const babelify = require('babelify') 9 | const mocha = require('gulp-spawn-mocha') 10 | // We do karma 
in gulp instead of npm because we need to recompute all the generated bundles that are loaded to the browser 11 | const runTests = (function () { 12 | let builds = 0 13 | return function (done = function () {}) { 14 | builds++ 15 | // One build per bundle 16 | if (builds % 3 === 0) { 17 | runKarma(done) 18 | runNodeTests() 19 | } 20 | } 21 | })() 22 | 23 | function runKarma (done) { 24 | const server = new Server({ 25 | configFile: path.join(__dirname, 'karma.conf.js'), 26 | singleRun: true 27 | }, done) 28 | server.start() 29 | } 30 | 31 | function runNodeTests () { 32 | return gulp.src([ 33 | 'tests/jsdomSpec.js', 34 | 'tests/spec/*Spec.js', 35 | 'tests/spec/Selector/*Spec.js', 36 | 'tests/spec/jsdom/*Spec.js', 37 | 'tests/spec/headless/*Spec.js' 38 | ]) 39 | .pipe(mocha({ 40 | compilers: 'js:babel-register' 41 | }).on('error', console.error)) 42 | } 43 | 44 | gulp.task('build:watch', () => generateBuilder(true, true)) 45 | gulp.task('build', () => generateBuilder(false, false)) 46 | 47 | gulp.task('default', ['build:watch']) 48 | 49 | function generateBuilder (isWatch, debug) { 50 | const wrapper = isWatch ? 
watchify : (x) => x 51 | const bundlerBackground = wrapper(browserify({ 52 | standalone: 'backgroundScraper', 53 | entries: [ 54 | 'extension/background_page/background_script.js' 55 | ], 56 | debug 57 | })) 58 | const bundlerScraper = wrapper(browserify({ 59 | standalone: 'contentScraper', 60 | entries: [ 61 | 'extension/content_script/content_scraper_browser.js' 62 | ], 63 | debug 64 | })) 65 | const bundlerDevtools = wrapper(browserify({ 66 | standalone: 'contentScraper', 67 | entries: [ 68 | 'extension/scripts/App.js' 69 | ], 70 | debug 71 | })) 72 | 73 | setBundler(bundlerBackground, 'background-scraper.js') 74 | setBundler(bundlerScraper, 'content-scraper.js') 75 | setBundler(bundlerDevtools, 'devtools-scraper.js') 76 | function gulpBundle (bundler, file) { 77 | bundler.bundle() 78 | .on('error', function (err) { 79 | return notify().write(err) 80 | }) 81 | .pipe(source(file)) 82 | .pipe(gulp.dest('extension/generated/')) 83 | .on('error', function (e) { 84 | console.error(e) 85 | }) 86 | .on('end', function () { 87 | runTests() 88 | console.log('finished bundling') 89 | // TODO launch tests 90 | }) 91 | } 92 | 93 | function setBundler (bundler, file) { 94 | bundler 95 | .transform(babelify, {}) 96 | .on('update', function () { 97 | gulpBundle(bundler, file) 98 | }) 99 | .on('error', function (err) { 100 | return notify().write(err) 101 | }) 102 | .on('log', function (log) { 103 | console.log(log) 104 | }) 105 | return gulpBundle(bundler, file) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const Queue = require('./extension/scripts/Queue') 2 | const Sitemap = require('./extension/scripts/Sitemap') 3 | const InMemoryStore = require('./extension/scripts/InMemoryStore') 4 | const Scraper = require('./extension/scripts/Scraper') 5 | const debug = require('debug')('web-scraper-headless:index') 6 | const 
JSDOMBrowser = require('./extension/scripts/JSDOMBrowser') 7 | const ChromeHeadlessBrowser = require('./extension/scripts/ChromeHeadlessBrowser') 8 | /** 9 | * 10 | * @param sitemap 11 | * @param options 12 | * @param options.browser jsdom|headless 13 | * @param options.pageLoadDelay 14 | * @param options.delay 15 | * @return {*} 16 | */ 17 | module.exports = function (sitemap, options) { 18 | return scrape(sitemap, options) 19 | } 20 | 21 | function scrape (sitemapInfo, options = {}) { 22 | return new Promise(function (resolve, reject) { 23 | // sitemap is created twice, once in node another in the browser context. 24 | // In node we don't actually need these variables. 25 | const fakeWindow = {} 26 | const fakeDocument = {} 27 | const fake$ = {} 28 | const q = new Queue() 29 | const store = new InMemoryStore() 30 | const sitemap = new Sitemap(sitemapInfo, {$: fake$, document: fakeDocument, window: fakeWindow}) 31 | 32 | let BrowserConstructor 33 | switch (options.browser) { 34 | case 'jsdom': 35 | BrowserConstructor = JSDOMBrowser 36 | debug('Jsdom browser selected') 37 | break 38 | case 'headless': 39 | BrowserConstructor = ChromeHeadlessBrowser 40 | debug('Chrome headless browser selected') 41 | break 42 | default: 43 | debug('No browser requested so jsdom was selected as default') 44 | BrowserConstructor = JSDOMBrowser 45 | } 46 | const browser = new BrowserConstructor({ 47 | pageLoadDelay: options.pageLoadDelay || 2000 48 | }) 49 | const s = new Scraper({ 50 | queue: q, 51 | sitemap, 52 | browser, 53 | store, 54 | delay: options.delay || 500 55 | }, {}) 56 | s.run(function (err) { 57 | if (err) { 58 | reject(err) 59 | } else { 60 | resolve(store.data) 61 | } 62 | }) 63 | }) 64 | } 65 | -------------------------------------------------------------------------------- /karma.conf.js: -------------------------------------------------------------------------------- 1 | const files = ['tests/browserSpec.js', 'tests/spec/*.js', 'tests/spec/browser/*.js', 
'tests/spec/Selector/*Spec.js'] 2 | const _ = require('lodash') 3 | module.exports = function (config) { 4 | config.set({ 5 | 6 | // base path that will be used to resolve all patterns (eg. files, exclude) 7 | basePath: '', 8 | 9 | // frameworks to use 10 | // available frameworks: https://npmjs.org/browse/keyword/karma-adapter 11 | frameworks: ['browserify', 'mocha'], 12 | 13 | preprocessors: _.mapValues(_.keyBy(files), () => ['browserify']), 14 | // list of files / patterns to load in the browser 15 | files: [ 16 | 'extension/assets/sugar-1.4.1.js', 17 | 'extension/assets/pouchdb-nightly.min.js', 18 | 'tests/ChromeAPI.js', 19 | 'extension/generated/background-scraper.js', // not very nice, we need to load the background script to listen to the messages 20 | 'extension/generated/content-scraper.js', 21 | 'extension/content_script/content_script.js', 22 | 'docs/images/chrome-store-logo.png', 23 | ...files 24 | ], 25 | customLaunchers: { 26 | ChromeOutOfFocus: { 27 | base: 'Chrome', 28 | flags: ['--window-size=300,300'] 29 | } 30 | }, 31 | browserify: { 32 | debug: true, 33 | transform: [ 34 | ['babelify', {ignore: /\/node_modules\//}] 35 | ] 36 | }, 37 | 38 | // list of files to exclude 39 | exclude: [ 40 | ], 41 | // test results reporter to use 42 | // possible values: 'dots', 'progress' 43 | // available reporters: https://npmjs.org/browse/keyword/karma-reporter 44 | reporters: ['dots'], 45 | 46 | // web server port 47 | port: 9876, 48 | 49 | // enable / disable colors in the output (reporters and logs) 50 | colors: true, 51 | 52 | // level of logging 53 | // possible values: config.LOG_DISABLE || config.LOG_ERROR || config.LOG_WARN || config.LOG_INFO || config.LOG_DEBUG 54 | logLevel: config.LOG_INFO, 55 | 56 | browserConsoleLogOptions: { 57 | terminal: true, 58 | level: 'error' 59 | }, 60 | // start these browsers 61 | // available browser launchers: https://npmjs.org/browse/keyword/karma-launcher 62 | browsers: ['ChromeHeadless'], 63 | 64 | // Concurrency 
level 65 | // how many browser should be started simultaneous 66 | concurrency: Infinity, 67 | browserNoActivityTimeout: 50000000, 68 | plugins: [ 69 | 'karma-mocha', 70 | 'karma-browserify', 71 | 'karma-chrome-launcher' 72 | ] 73 | }) 74 | } 75 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-scraper-headless", 3 | "version": "1.0.7", 4 | "description": "Web Scraper Headless allows to extract data from web pages using plans (sitemaps) created with the Web Scraper browser extension. Using these sitemaps the Web Scraper will navigate the site accordingly and extract all data. Scraped data later can be exported as CSV.", 5 | "main": "index.js", 6 | "directories": { 7 | "doc": "docs", 8 | "test": "tests" 9 | }, 10 | "watch": { 11 | "generate": { 12 | "patterns": [ 13 | "extension" 14 | ], 15 | "ignore": "extension/generated", 16 | "extensions": "js" 17 | } 18 | }, 19 | "standard": { 20 | "ignore": [ 21 | "extension/generated", 22 | "extension/assets" 23 | ], 24 | "globals": [ 25 | "d3", 26 | "chrome", 27 | "describe", 28 | "it", 29 | "beforeEach", 30 | "afterEach", 31 | "after", 32 | "before" 33 | ] 34 | }, 35 | "scripts": { 36 | "build": "gulp build", 37 | "test-watch": "gulp" 38 | }, 39 | "repository": { 40 | "type": "git", 41 | "url": "git+https://github.com/geoblink/web-scraper-chrome-extension.git" 42 | }, 43 | "author": "", 44 | "license": "LGPL-3.0", 45 | "bugs": { 46 | "url": "https://github.com/geoblink/web-scraper-chrome-extension/issues" 47 | }, 48 | "homepage": "https://github.com/geoblink/web-scraper-chrome-extension#readme", 49 | "devDependencies": { 50 | "babel-plugin-meaningful-logs": "^1.0.2", 51 | "babel-register": "^6.24.1", 52 | "babelify": "^7.3.0", 53 | "chai": "^3.5.0", 54 | "chrome-remote-interface": "^0.18.0", 55 | "gulp": "^3.9.1", 56 | "gulp-notify": "^3.0.0", 57 | "gulp-spawn-mocha": 
"^3.3.0", 58 | "istanbul": "^0.4.5", 59 | "jasmine-node": "^1.14.5", 60 | "karma": "^1.6.0", 61 | "karma-browserify": "^5.1.1", 62 | "karma-chrome-launcher": "^2.0.0", 63 | "karma-mocha": "^1.3.0", 64 | "mocha": "^3.2.0", 65 | "npm-watch": "^0.1.8", 66 | "sinon": "^7.4.2", 67 | "standard": "^9.0.2", 68 | "vinyl-buffer": "^1.0.0", 69 | "vinyl-source-stream": "^1.1.0", 70 | "watchify": "^3.9.0", 71 | "webworkify": "^1.4.0" 72 | }, 73 | "dependencies": { 74 | "browserify": "^16.1.0", 75 | "css-selector": "git://github.com/furstenheim/css-selector.git#b50eb6befc4129ac56e91efba3dd1e233bb67202", 76 | "debug": "^3.1.0", 77 | "jquery": "^3.2.1", 78 | "jquery-deferred": "^0.3.1", 79 | "jsdom": "^10.1.0", 80 | "puppeteer": "1.5.0" 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /playgrounds/extension/webpage.css: -------------------------------------------------------------------------------- 1 | #webpage { 2 | height:400px; 3 | border-radius: 5px; 4 | border:3px #ccc solid; 5 | margin:10px; 6 | overflow-y:scroll; 7 | } 8 | 9 | #webpage { 10 | font-size: 14px; 11 | } 12 | 13 | #webpage .navbar-nav > li > a { 14 | padding-top: 15px; 15 | padding-bottom: 15px; 16 | } 17 | -------------------------------------------------------------------------------- /playgrounds/sitemap-tree/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /playgrounds/sitemap-tree/sitemap.json: -------------------------------------------------------------------------------- 1 | { 2 | "selectors":[ 3 | { 4 | "id": "a", 5 | "type": "SelectorElement", 6 | "parentSelectors": ["_root", "d"] 7 | }, 8 | { 9 | "id": "b", 10 | "type": "SelectorElement", 11 | "parentSelectors": ["a"] 12 | }, 13 | { 14 | "id": "c", 15 | "type": "SelectorElement", 16 | "parentSelectors": ["a"] 17 | }, 18 | { 19 | "id": "d", 20 | "type": "SelectorElement", 21 | "parentSelectors": ["a"] 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /playgrounds/sitemap-tree/style.css: -------------------------------------------------------------------------------- 1 | .node circle { 2 | cursor: pointer; 3 | fill: #fff; 4 | stroke: steelblue; 5 | stroke-width: 1.5px; 6 | } 7 | 8 | .node text { 9 | font-size: 11px; 10 | } 11 | 12 | path.link { 13 | fill: none; 14 | stroke: #ccc; 15 | stroke-width: 1.5px; 16 | } -------------------------------------------------------------------------------- /tests/FakeStore.js: -------------------------------------------------------------------------------- 1 | 2 | var FakeStore = function () { 3 | this.data = [] 4 | } 5 | 6 | FakeStore.prototype = { 7 | 8 | writeDocs: function (data, callback) { 9 | data.forEach(function (data) { 10 | this.data.push(data) 11 | }.bind(this)) 12 | callback() 13 | }, 14 | 15 | initSitemapDataDb: function (sitemapId, callback) { 16 | callback(this) 17 | } 18 | } 19 | 20 | module.exports = FakeStore 21 | -------------------------------------------------------------------------------- /tests/Matchers.js: -------------------------------------------------------------------------------- 1 | const assert = require('chai').assert 2 | var getSelectorIds = function (selectors) { 3 | var ids = [] 4 | selectors.forEach(function (selector) { 5 | 
ids.push(selector.id) 6 | }) 7 | return ids 8 | } 9 | 10 | var selectorListSorter = function (a, b) { 11 | if (a.id === b.id) { 12 | return 0 13 | } else if (a.id > b.id) { 14 | return 1 15 | } else { 16 | return -1 17 | } 18 | } 19 | 20 | var selectorMatchers = { 21 | matchSelectors: async function (actual, expectedIds) { 22 | expectedIds = expectedIds.sort() 23 | var actualIds = getSelectorIds(actual).sort() 24 | 25 | assert.deepEqual(actualIds, expectedIds) 26 | }, 27 | matchSelectorList: async function (actual, expectedSelectors) { 28 | var actualSelectors = actual 29 | assert.equal(expectedSelectors.length, actualSelectors.length) 30 | expectedSelectors.sort(selectorListSorter) 31 | actualSelectors.sort(selectorListSorter) 32 | 33 | for (const i in expectedSelectors) { 34 | console.log(expectedSelectors[i], actualSelectors[i].id) 35 | assert.equal(expectedSelectors[i].id, actualSelectors[i].id) 36 | } 37 | }, 38 | // @REFACTOR use match selector list 39 | matchSelectorTrees: async function (actual, expectedSelectorTrees) { 40 | var actualSelectorTrees = actual 41 | 42 | assert.equal(actualSelectorTrees.length, expectedSelectorTrees.length) 43 | 44 | for (var i in expectedSelectorTrees) { 45 | await selectorMatchers.matchSelectors(actualSelectorTrees[i], expectedSelectorTrees[i]) 46 | } 47 | }, 48 | deferredToEqual: function (actual, expectedData) { 49 | var deferredData = actual 50 | return deferredData 51 | .then(function (d) { 52 | assert.deepEqual(d, expectedData) 53 | }) 54 | }, 55 | deferredToFail: async function (actual) { 56 | var deferredData = actual 57 | 58 | try { 59 | await deferredData 60 | return Promise.reject(new Error('Promise not rejected')) 61 | } catch (e) { 62 | 63 | } 64 | } 65 | } 66 | 67 | module.exports = selectorMatchers 68 | -------------------------------------------------------------------------------- /tests/browserSpec.js: -------------------------------------------------------------------------------- 1 | const globals = 
require('./globals') 2 | const $ = require('jquery') 3 | const ChromePopupBrowser = require('../extension/scripts/ChromePopupBrowser') 4 | beforeEach(function () { 5 | globals.window = window 6 | globals.document = document 7 | globals.$ = $ 8 | globals.Browser = ChromePopupBrowser 9 | window.chromeAPI.reset() 10 | 11 | window.addEventListener('unhandledrejection', function (err, promise) { 12 | console.error('Unhandled error', err.reason) 13 | }) 14 | }) 15 | -------------------------------------------------------------------------------- /tests/globals.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | } 3 | -------------------------------------------------------------------------------- /tests/jsdomSpec.js: -------------------------------------------------------------------------------- 1 | const globals = require('./globals') 2 | const jsdom = require('jsdom') 3 | const jQuery = require('jquery') 4 | const Browser = require('./../extension/scripts/JSDOMBrowser') 5 | beforeEach(function () { 6 | const {JSDOM} = jsdom 7 | const dom = new JSDOM() 8 | const $ = jQuery(dom.window) 9 | const window = dom.window 10 | const document = window.document 11 | globals.document = dom.window.document 12 | globals.window = dom.window 13 | globals.$ = $ 14 | globals.Browser = Browser 15 | Browser.prototype.loadUrl = function (url, callback) { 16 | callback(null, {$, document, window}) 17 | } 18 | }) 19 | process.on('unhandledRejection', function (err) { 20 | console.error(err) 21 | }) 22 | -------------------------------------------------------------------------------- /tests/spec/ElementQuerySpec.js: -------------------------------------------------------------------------------- 1 | const ElementQuery = require('../../extension/scripts/ElementQuery') 2 | const assert = require('chai').assert 3 | const utils = require('./../utils') 4 | const globals = require('../globals') 5 | 6 | describe('ElementQuery', function () 
{ 7 | var $el 8 | let $ 9 | let document 10 | let window 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('should be able to select elements', function () { 22 | $el.innerHTML = '' 23 | 24 | var selectedElements = ElementQuery('a, span', $el, {$, document, window}) 25 | var expectedElements = Array.from($el.querySelectorAll('a, span')) 26 | 27 | assert.deepEqual(selectedElements.sort(), expectedElements) 28 | }) 29 | 30 | it('should be able to select parent', function () { 31 | $el.innerHTML = '' 32 | 33 | var selectedElements = ElementQuery('a, span, _parent_', $el, {$, document, window}) 34 | var expectedElements = Array.from($el.querySelectorAll('a, span')) 35 | expectedElements.push($el) 36 | 37 | assert.deepEqual(selectedElements.sort(), expectedElements.sort()) 38 | }) 39 | 40 | it('should should not return duplicates', function () { 41 | $el.innerHTML = '' 42 | 43 | var selectedElements = ElementQuery('*, a, span, _parent_', $el, {$, document, window}) 44 | var expectedElements = Array.from($el.querySelectorAll('a, span')) 45 | expectedElements.push($el) 46 | 47 | assert.deepEqual(selectedElements.length, 3) 48 | assert.deepEqual(selectedElements.sort(), expectedElements.sort()) 49 | }) 50 | 51 | it('should be able to select parent when parent there are multiple parents', function () { 52 | $el.innerHTML = '' 53 | 54 | var selectedElements = ElementQuery('_parent_', $el.querySelectorAll('span'), {$, document, window}) 55 | var expectedElements = Array.from($el.querySelectorAll('span')) 56 | 57 | assert.deepEqual(selectedElements.length, 2) 58 | assert.deepEqual(selectedElements.sort(), expectedElements) 59 | }) 60 | 61 | it('should be able to select element with a comma ,', function () { 62 | $el.innerHTML = ',' 63 | 64 | var 
selectedElements = ElementQuery(":contains(',')", $el, {$, document, window}) 65 | var expectedElements = Array.from($el.querySelectorAll('span')) 66 | 67 | assert.deepEqual(selectedElements.length, 1) 68 | assert.deepEqual(selectedElements.sort(), expectedElements.sort()) 69 | }) 70 | 71 | it('should preserve spaces', function () { 72 | var parts = ElementQuery.getSelectorParts('div.well li:nth-of-type(2) a') 73 | assert.deepEqual(parts, ['div.well li:nth-of-type(2) a']) 74 | }) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/spec/JobSpec.js: -------------------------------------------------------------------------------- 1 | const Job = require('./../../extension/scripts/Job') 2 | const assert = require('chai').assert 3 | 4 | describe('Job', function () { 5 | it('should be able to create correct url from parent job', function () { 6 | var parent = new Job('http://example.com/') 7 | var child = new Job('/test/', null, null, parent) 8 | assert.equal(child.url, 'http://example.com/test/') 9 | 10 | parent = new Job('http://example.com') 11 | child = new Job('test/', null, null, parent) 12 | assert.equal(child.url, 'http://example.com/test/') 13 | 14 | parent = new Job('http://example.com/asdasdad') 15 | child = new Job('tvnet.lv', null, null, parent) 16 | assert.equal(child.url, 'http://tvnet.lv/') 17 | 18 | parent = new Job('http://example.com/asdasdad') 19 | child = new Job('?test', null, null, parent) 20 | assert.equal(child.url, 'http://example.com/asdasdad?test') 21 | 22 | parent = new Job('http://example.com/1/') 23 | child = new Job('2/', null, null, parent) 24 | assert.equal(child.url, 'http://example.com/1/2/') 25 | 26 | parent = new Job('http://127.0.0.1/1/') 27 | child = new Job('2/', null, null, parent) 28 | assert.equal(child.url, 'http://127.0.0.1/1/2/') 29 | 30 | parent = new Job('http://xn--80aaxitdbjk.xn--p1ai/') 31 | child = new Job('2/', null, null, parent) 32 | 33 | assert.equal(child.url, 
'http://xn--80aaxitdbjk.xn--p1ai/2/') 34 | }) 35 | 36 | it('should be able to create correct url from parent job with slashes after question mark', function () { 37 | var parent = new Job('http://www.sportstoto.com.my/results_past.asp?date=5/1/1992') 38 | var child = new Job('popup_past_results.asp?drawNo=418/92', null, null, parent) 39 | assert.equal(child.url, 'http://www.sportstoto.com.my/popup_past_results.asp?drawNo=418/92') 40 | }) 41 | 42 | it('should be able to create correct url with a port number', function () { 43 | var parent = new Job('http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021090940/http://volitve.gov.si/lv2010/kandidati/seznam_obcin.html') 44 | var child = new Job('http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021091250/http://volitve.gov.si/lv2010/kandidati/zupani_os_celje.html', null, null, parent) 45 | assert.equal(child.url, 'http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021091250/http://volitve.gov.si/lv2010/kandidati/zupani_os_celje.html') 46 | 47 | parent = new Job('http://nukrobi2.nuk.uni-lj.si:8080') 48 | child = new Job('zupani_os_celje.html', null, null, parent) 49 | assert.equal(child.url, 'http://nukrobi2.nuk.uni-lj.si:8080/zupani_os_celje.html') 50 | }) 51 | 52 | it('should not override data with base data if it already exists', function () { 53 | var browser = { 54 | fetchData: function (url, sitemap, parentSelector, callback) { 55 | callback(null, [{a: 1, b: 2}]) 56 | } 57 | } 58 | 59 | var job = new Job(undefined, undefined, {sitemap: undefined}, undefined, {a: 'do not override', c: 3}) 60 | job.execute(browser, function () {}) 61 | var results = job.getResults() 62 | assert.deepEqual(results, [{a: 1, b: 2, c: 3}]) 63 | }) 64 | }) 65 | -------------------------------------------------------------------------------- /tests/spec/QueueSpec.js: -------------------------------------------------------------------------------- 1 | const Queue = require('./../../extension/scripts/Queue') 2 | const Job = 
require('./../../extension/scripts/Job') 3 | const assert = require('chai').assert 4 | 5 | describe('Queue', function () { 6 | var q 7 | var job 8 | 9 | beforeEach(function () { 10 | q = new Queue() 11 | job = new Job('http://test.lv/', {}) 12 | }) 13 | 14 | it('should be able to add items to queue', function () { 15 | q.add(job) 16 | assert.equal(q.getQueueSize(), 1) 17 | assert.equal(q.jobs[0].url, 'http://test.lv/') 18 | }) 19 | 20 | it('should be able to mark urls as scraped', function () { 21 | q.add(job) 22 | q.getNextJob() 23 | assert.equal(q.getQueueSize(), 0) 24 | 25 | // try to add this job again 26 | q.add(job) 27 | assert.equal(q.getQueueSize(), 0) 28 | }) 29 | 30 | it('should be able to reject documents', function () { 31 | job = new Job('http://test.lv/test.doc') 32 | 33 | var accepted = q.add(job) 34 | assert.isFalse(accepted) 35 | }) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorElementAttributeSpec.js: -------------------------------------------------------------------------------- 1 | const Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('Element Attribute Selector', function () { 7 | var $el 8 | let $ 9 | let document 10 | let window 11 | 12 | beforeEach(function () { 13 | $ = globals.$ 14 | document = globals.document 15 | window = globals.window 16 | 17 | document.body.innerHTML = utils.getTestHTML() 18 | $el = utils.createElementFromHTML("", document) 19 | document.body.appendChild($el) 20 | }) 21 | 22 | it('should extract image src tag', function (done) { 23 | var selector = new Selector({ 24 | id: 'img', 25 | type: 'SelectorElementAttribute', 26 | multiple: false, 27 | extractAttribute: 'src', 28 | selector: 'img' 29 | }, { $, document, window }) 30 | 31 | var dataDeferred = 
selector.getData(document.querySelector('#selector-image-one-image')) 32 | dataDeferred.then(function (data) { 33 | assert.deepEqual(data, [ 34 | { 35 | 'img': 'http://aa/' 36 | } 37 | ]) 38 | done() 39 | }) 40 | }) 41 | 42 | it('should extract multiple src tags', function (done) { 43 | var selector = new Selector({ 44 | id: 'img', 45 | type: 'SelectorElementAttribute', 46 | multiple: true, 47 | extractAttribute: 'src', 48 | selector: 'img' 49 | }, { $, document, window }) 50 | 51 | var dataDeferred = selector.getData(document.querySelector('#selector-image-multiple-images')) 52 | 53 | dataDeferred.then(function (data) { 54 | assert.deepEqual(data, [ 55 | { 56 | 'img': 'http://aa/' 57 | }, 58 | { 59 | 'img': 'http://bb/' 60 | } 61 | ]) 62 | done() 63 | }) 64 | }) 65 | 66 | it('should return only one data column', function () { 67 | var selector = new Selector({ 68 | id: 'id', 69 | type: 'SelectorElementAttribute', 70 | multiple: true, 71 | selector: 'img' 72 | }, { $, document, window }) 73 | 74 | var columns = selector.getDataColumns() 75 | assert.deepEqual(columns, [ 'id' ]) 76 | }) 77 | 78 | it('should return empty array when no images are found', function (done) { 79 | var selector = new Selector({ 80 | id: 'img', 81 | type: 'SelectorElementAttribute', 82 | multiple: true, 83 | selector: 'img.not-exist', 84 | extractAttribute: 'src' 85 | }, { $, document, window }) 86 | 87 | var dataDeferred = selector.getData(document.querySelector('#not-exist')) 88 | 89 | dataDeferred.then(function (data) { 90 | assert.deepEqual(data, []) 91 | done() 92 | }) 93 | }) 94 | 95 | it('should be able to select data- attributes', function (done) { 96 | var html = '
' 97 | utils.appendHTML($el, html, document) 98 | 99 | var selector = new Selector({ 100 | id: 'type', 101 | type: 'SelectorElementAttribute', 102 | multiple: true, 103 | selector: 'li', 104 | extractAttribute: 'data-type' 105 | }, { $, document, window }) 106 | 107 | var dataDeferred = selector.getData($el) 108 | 109 | dataDeferred.then(function (data) { 110 | assert.deepEqual(data, [ { 111 | 'type': 'dog' 112 | } ]) 113 | done() 114 | }) 115 | }) 116 | }) 117 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorElementScrollSpec.js: -------------------------------------------------------------------------------- 1 | var Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('Scroll Element Selector', function () { 7 | var $el 8 | let $ 9 | let document 10 | let window 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('should return one element', function (done) { 22 | $el.innerHTML = '
a
b
' 23 | var selector = new Selector({ 24 | id: 'a', 25 | type: 'SelectorElementScroll', 26 | multiple: false, 27 | selector: 'div' 28 | }, {$, document, window}) 29 | 30 | var dataDeferred = selector.getData($el) 31 | dataDeferred.then(function (data) { 32 | assert.equal(data.length, 1) 33 | assert.equal(data[0], $el.querySelectorAll('div')[0]) 34 | done() 35 | }) 36 | }) 37 | 38 | it('should return multiple elements', function (done) { 39 | $el.innerHTML = '
a
b
' 40 | var selector = new Selector({ 41 | id: 'a', 42 | type: 'SelectorElementScroll', 43 | multiple: true, 44 | selector: 'div' 45 | }, {$, document, window}) 46 | 47 | var dataDeferred = selector.getData($el) 48 | dataDeferred.then(function (data) { 49 | assert.equal(data.length, 2) 50 | assert.deepEqual(data, Array.from($el.querySelectorAll('div'))) 51 | done() 52 | }) 53 | }) 54 | 55 | it('should get elements when scrolling is not needed', function (done) { 56 | $el.innerHTML = 'a' 57 | var selector = new Selector({ 58 | id: 'a', 59 | type: 'SelectorElementScroll', 60 | multiple: true, 61 | selector: 'a', 62 | delay: 100 63 | }, {$, document, window}) 64 | var dataDeferred = selector.getData($el) 65 | dataDeferred.then(function (data) { 66 | assert.equal(data.length, 1) 67 | assert.equal(data[0], $el.querySelectorAll('a')[0]) 68 | done() 69 | }) 70 | }) 71 | 72 | it('should get elements which are added a delay', function (done) { 73 | $el.innerHTML = 'a' 74 | // add extra element after a little delay 75 | setTimeout(function () { 76 | utils.appendHTML($el, 'a', document) 77 | }, 100) 78 | 79 | var selector = new Selector({ 80 | id: 'a', 81 | type: 'SelectorElementScroll', 82 | multiple: true, 83 | selector: 'a', 84 | delay: 200 85 | }, {$, document, window}) 86 | var dataDeferred = selector.getData($el) 87 | dataDeferred.then(function (data) { 88 | assert.equal(data.length, 2) 89 | assert.deepEqual(data, Array.from($el.querySelectorAll('a'))) 90 | done() 91 | }) 92 | }) 93 | it('should return no data columns', function () { 94 | var selector = new Selector({ 95 | id: 'a', 96 | type: 'SelectorElementScroll', 97 | multiple: true, 98 | selector: 'div' 99 | }, {$, document, window}) 100 | 101 | var columns = selector.getDataColumns() 102 | assert.deepEqual(columns, []) 103 | }) 104 | }) 105 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorElementSpec.js: 
-------------------------------------------------------------------------------- 1 | var Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('Element Selector', function () { 7 | let $ 8 | let document 9 | let window 10 | beforeEach(function () { 11 | $ = globals.$ 12 | document = globals.document 13 | window = globals.window 14 | 15 | document.body.innerHTML = utils.getTestHTML() 16 | }) 17 | 18 | it('should return one element', function (done) { 19 | var selector = new Selector({ 20 | id: 'a', 21 | type: 'SelectorElement', 22 | multiple: false, 23 | selector: 'div' 24 | }, {$, document, window}) 25 | 26 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-element-nodata')[0]) 27 | dataDeferred.then(function (data) { 28 | assert.equal(data.length, 1) 29 | assert.equal(data[0], document.querySelectorAll('#selector-element-nodata div')[0]) 30 | done() 31 | }) 32 | }) 33 | 34 | it('should return multiple elements', function (done) { 35 | var selector = new Selector({ 36 | id: 'a', 37 | type: 'SelectorElement', 38 | multiple: true, 39 | selector: 'div' 40 | }, {$, document, window}) 41 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-element-nodata')[0]) 42 | dataDeferred.then(function (data) { 43 | assert.equal(data.length, 2) 44 | assert.deepEqual(data, Array.from(document.querySelectorAll('#selector-element-nodata div'))) 45 | done() 46 | }) 47 | }) 48 | 49 | it('should return no data columns', function () { 50 | var selector = new Selector({ 51 | id: 'a', 52 | type: 'SelectorElement', 53 | multiple: true, 54 | selector: 'div' 55 | }, {$, document, window}) 56 | 57 | var columns = selector.getDataColumns() 58 | assert.deepEqual(columns, []) 59 | }) 60 | }) 61 | -------------------------------------------------------------------------------- 
/tests/spec/Selector/SelectorGoogMapIDSpec.js: -------------------------------------------------------------------------------- 1 | const Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | describe('Goog Map ID Selector', function () { 6 | var $el 7 | let $ 8 | let document 9 | let window 10 | beforeEach(function () { 11 | $ = globals.$ 12 | document = globals.document 13 | window = globals.window 14 | 15 | document.body.innerHTML = utils.getTestHTML() 16 | $el = utils.createElementFromHTML("", document) 17 | document.body.appendChild($el) 18 | }) 19 | 20 | it('FTID selector', function (done) { 21 | $el.innerHTML = ` 22 | 24 | 25 |
26 |
27 | 28 |
29 | 30 |
31 | ` 32 | var selector = new Selector({ 33 | id: 'a', 34 | type: 'SelectorGoogMapID', 35 | selector: 'section', 36 | mapsSelectorFromDiv: 'iframe[src*="google.com/maps/embed"]' 37 | }, {$, document, window}) 38 | 39 | var dataDeferred = selector.getData($el) 40 | dataDeferred.then(function (data) { 41 | assert.equal(data[0].a_FTID, '0x12a4a2674531e3bd:0xf12f53af6888194e') 42 | done() 43 | }) 44 | }) 45 | }) 46 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorGroupSpec.js: -------------------------------------------------------------------------------- 1 | var Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('Group Selector', function () { 7 | let $ 8 | let document 9 | let window 10 | beforeEach(function () { 11 | $ = globals.$ 12 | document = globals.document 13 | window = globals.window 14 | document.body.innerHTML = utils.getTestHTML() 15 | 16 | }) 17 | 18 | it('should extract text data', function (done) { 19 | var selector = new Selector({ 20 | id: 'a', 21 | type: 'SelectorGroup', 22 | multiple: false, 23 | selector: 'div' 24 | }, {$, document, window}) 25 | 26 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-group-text')[0]) 27 | dataDeferred.then(function (data) { 28 | assert.equal(data.length, 1) 29 | var expected = [ 30 | { 31 | a: [ 32 | { 33 | a: 'a' 34 | }, 35 | { 36 | a: 'b' 37 | } 38 | ] 39 | } 40 | ] 41 | assert.deepEqual(data, expected) 42 | done() 43 | }) 44 | }) 45 | 46 | it('should extract link urls', function (done) { 47 | var selector = new Selector({ 48 | id: 'a', 49 | type: 'SelectorGroup', 50 | multiple: false, 51 | selector: 'a', 52 | extractAttribute: 'href' 53 | }, {$, document, window}) 54 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-group-url')[0]) 55 | 
dataDeferred.then(function (data) { 56 | assert.equal(data.length, 1) 57 | var expected = [ 58 | { 59 | a: [ 60 | { 61 | a: 'a', 62 | 'a-href': 'http://aa/' 63 | }, 64 | { 65 | a: 'b', 66 | 'a-href': 'http://bb/' 67 | } 68 | ] 69 | } 70 | ] 71 | assert.deepEqual(data, expected) 72 | done() 73 | }) 74 | }) 75 | 76 | it('should return only one data column', function () { 77 | var selector = new Selector({ 78 | id: 'id', 79 | type: 'SelectorGroup', 80 | multiple: true, 81 | selector: 'div' 82 | }, {$, document, window}) 83 | 84 | var columns = selector.getDataColumns() 85 | assert.deepEqual(columns, ['id']) 86 | }) 87 | }) 88 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorHTMLSpec.js: -------------------------------------------------------------------------------- 1 | var Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('HTML Selector', function () { 7 | let $ 8 | let document 9 | let window 10 | beforeEach(function () { 11 | $ = globals.$ 12 | document = globals.document 13 | window = globals.window 14 | 15 | document.body.innerHTML = utils.getTestHTML() 16 | }) 17 | 18 | it('should extract single html element', function (done) { 19 | var selector = new Selector({ 20 | id: 'a', 21 | type: 'SelectorHTML', 22 | multiple: false, 23 | selector: 'div' 24 | }, {$, document, window}) 25 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0]) 26 | dataDeferred.then(function (data) { 27 | assert.equal(data.length, 1) 28 | var expected = [ 29 | { 30 | a: 'aaabbbccc' 31 | } 32 | ] 33 | assert.deepEqual(data, expected) 34 | done() 35 | }) 36 | }) 37 | 38 | it('should extract multiple html elements', function (done) { 39 | var selector = new Selector({ 40 | id: 'a', 41 | type: 'SelectorHTML', 42 | multiple: true, 43 | 
selector: 'div' 44 | }, {$, document, window}) 45 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-multiple-html')[0]) 46 | dataDeferred.then(function (data) { 47 | assert.equal(data.length, 2) 48 | var expected = [ 49 | { 50 | a: 'aaabbbccc' 51 | }, 52 | { 53 | a: 'dddeeefff' 54 | } 55 | ] 56 | assert.deepEqual(data, expected) 57 | done() 58 | }) 59 | }) 60 | 61 | it('should extract null when there are no elements', function (done) { 62 | var selector = new Selector({ 63 | id: 'a', 64 | type: 'SelectorHTML', 65 | multiple: false, 66 | selector: 'div' 67 | }, {$, document, window}) 68 | console.log(document.querySelectorAll('#selector-html-single-not-exist')) 69 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-not-exist')[0]) 70 | dataDeferred.then(function (data) { 71 | assert.equal(data.length, 1) 72 | var expected = [ 73 | { 74 | a: null 75 | } 76 | ] 77 | assert.deepEqual(data, expected) 78 | done() 79 | }) 80 | }) 81 | 82 | it('should extract null when there is no regex match', function (done) { 83 | var selector = new Selector({ 84 | id: 'a', 85 | type: 'SelectorHTML', 86 | multiple: false, 87 | selector: 'div', 88 | regex: 'wontmatch' 89 | }, {$, document, window}) 90 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0]) 91 | dataDeferred.then(function (data) { 92 | assert.equal(data.length, 1) 93 | var expected = [ 94 | { 95 | a: null 96 | } 97 | ] 98 | assert.deepEqual(data, expected) 99 | done() 100 | }) 101 | }) 102 | 103 | it('should extract html+text using regex', function (done) { 104 | var selector = new Selector({ 105 | id: 'a', 106 | type: 'SelectorHTML', 107 | multiple: false, 108 | selector: 'div', 109 | regex: '\\w+' 110 | }, {$, document, window}) 111 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0]) 112 | dataDeferred.then(function (data) { 113 | assert.equal(data.length, 1) 114 | var 
expected = [ 115 | { 116 | a: 'bbb' 117 | } 118 | ] 119 | assert.deepEqual(data, expected) 120 | done() 121 | }) 122 | }) 123 | 124 | it('should return only one data column', function () { 125 | var selector = new Selector({ 126 | id: 'id', 127 | type: 'SelectorHTML', 128 | multiple: true, 129 | selector: 'div' 130 | }, {$, document, window}) 131 | 132 | var columns = selector.getDataColumns() 133 | assert.deepEqual(columns, ['id']) 134 | }) 135 | }) 136 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorImageSpec.js: -------------------------------------------------------------------------------- 1 | const Selector = require('../../../extension/scripts/Selector') 2 | const SelectorImage = require('../../../extension/scripts/Selector/SelectorImage') 3 | const utils = require('./../../utils') 4 | const assert = require('chai').assert 5 | const globals = require('../../globals') 6 | 7 | describe('Image Selector', function () { 8 | let $ 9 | let document 10 | let window 11 | var $el 12 | beforeEach(function () { 13 | $ = globals.$ 14 | document = globals.document 15 | window = globals.window 16 | 17 | document.body.innerHTML = utils.getTestHTML() 18 | $el = utils.createElementFromHTML("", document) 19 | document.body.appendChild($el) 20 | }) 21 | 22 | it('should extract single image', function (done) { 23 | var selector = new Selector({ 24 | id: 'img', 25 | type: 'SelectorImage', 26 | multiple: false, 27 | selector: 'img' 28 | }, {$, document, window}) 29 | 30 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-image-one-image')[0]) 31 | dataDeferred.then(function (data) { 32 | assert.equal(data.length, 1) 33 | var expected = [ 34 | { 35 | 'img-src': 'http://aa/' 36 | } 37 | ] 38 | assert.deepEqual(data, expected) 39 | done() 40 | }) 41 | }) 42 | 43 | it('should extract multiple images', function (done) { 44 | var selector = new Selector({ 45 | id: 'img', 46 | type: 'SelectorImage', 47 
| multiple: true, 48 | selector: 'img' 49 | }, {$, document, window}) 50 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-image-multiple-images')[0]) 51 | dataDeferred.then(function (data) { 52 | assert.equal(data.length, 2) 53 | var expected = [ 54 | { 55 | 'img-src': 'http://aa/' 56 | }, 57 | { 58 | 'img-src': 'http://bb/' 59 | } 60 | ] 61 | assert.deepEqual(data, expected) 62 | done() 63 | }) 64 | }) 65 | 66 | it('should return only src column', function () { 67 | var selector = new Selector({ 68 | id: 'id', 69 | type: 'SelectorImage', 70 | multiple: true, 71 | selector: 'img' 72 | }, {$, document, window}) 73 | 74 | var columns = selector.getDataColumns() 75 | assert.deepEqual(columns, ['id-src']) 76 | }) 77 | 78 | it('should return empty array when no images are found', function (done) { 79 | var selector = new Selector({ 80 | id: 'img', 81 | type: 'SelectorImage', 82 | multiple: true, 83 | selector: 'img.not-exist' 84 | }, {$, document, window}) 85 | var dataDeferred = selector.getData(document.querySelectorAll('#not-exist')[0]) 86 | dataDeferred.then(function (data) { 87 | assert.equal(data.length, 0) 88 | var expected = [] 89 | assert.deepEqual(data, expected) 90 | done() 91 | }) 92 | }) 93 | 94 | // base is not a real url so it does not work from jsdom. 
95 | it.skip('should be able to download image as base64', function (done) { 96 | var selector = new Selector({ 97 | id: 'img', 98 | type: 'SelectorImage' 99 | }, {$, document, window}) 100 | var deferredImage = selector.downloadImageBase64('base/docs/images/chrome-store-logo.png') 101 | 102 | deferredImage.then(function (imageResponse) { 103 | assert.isTrue(imageResponse.imageBase64.length > 100) 104 | done() 105 | }) 106 | }) 107 | 108 | it.skip('should be able to get data with image data attached', function (done) { 109 | $el.innerHTML = '' 110 | 111 | var selector = new Selector({ 112 | id: 'img', 113 | type: 'SelectorImage', 114 | multiple: true, 115 | selector: 'img', 116 | downloadImage: true 117 | }, {$, document, window}) 118 | 119 | var dataDeferred = selector.getData($el) 120 | dataDeferred.then(function (data) { 121 | assert.equal(data.length, 1) 122 | assert.isTrue(!!data[0]['_imageBase64-img']) 123 | assert.isTrue(!!data[0]['_imageMimeType-img']) 124 | done() 125 | }) 126 | }) 127 | }) 128 | -------------------------------------------------------------------------------- /tests/spec/Selector/SelectorLinkSpec.js: -------------------------------------------------------------------------------- 1 | var Selector = require('../../../extension/scripts/Selector') 2 | const utils = require('./../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../globals') 5 | 6 | describe('Link Selector', function () { 7 | var $el 8 | let $ 9 | let document 10 | let window 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('should extract single link', function (done) { 22 | var selector = new Selector({ 23 | id: 'a', 24 | type: 'SelectorLink', 25 | multiple: false, 26 | selector: 'a' 27 | }, {$, document, window}) 28 
| 29 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-follow')[0]) 30 | dataDeferred.then(function (data) { 31 | var expected = [ 32 | { 33 | a: 'a', 34 | 'a-href': 'http://example.com/a', 35 | _follow: 'http://example.com/a', 36 | _followSelectorId: 'a' 37 | } 38 | ] 39 | assert.deepEqual(data, expected) 40 | done() 41 | }) 42 | }) 43 | 44 | it('should extract multiple links', function (done) { 45 | var selector = new Selector({ 46 | id: 'a', 47 | type: 'SelectorLink', 48 | multiple: true, 49 | selector: 'a' 50 | }, {$, document, window}) 51 | var dataDeferred = selector.getData(document.querySelectorAll('#selector-follow')[0]) 52 | dataDeferred.then(function (data) { 53 | var expected = [ 54 | { 55 | a: 'a', 56 | 'a-href': 'http://example.com/a', 57 | _follow: 'http://example.com/a', 58 | _followSelectorId: 'a' 59 | }, 60 | { 61 | a: 'b', 62 | 'a-href': 'http://example.com/b', 63 | _follow: 'http://example.com/b', 64 | _followSelectorId: 'a' 65 | } 66 | ] 67 | assert.deepEqual(data, expected) 68 | done() 69 | }) 70 | }) 71 | 72 | it('should return data and url columns', function () { 73 | var selector = new Selector({ 74 | id: 'id', 75 | type: 'SelectorLink', 76 | multiple: true, 77 | selector: 'div' 78 | }, {$, document, window}) 79 | 80 | var columns = selector.getDataColumns() 81 | assert.deepEqual(columns, ['id', 'id-href']) 82 | }) 83 | 84 | it('should return empty array when no links are found', function (done) { 85 | var selector = new Selector({ 86 | id: 'a', 87 | type: 'SelectorLink', 88 | multiple: true, 89 | selector: 'a' 90 | }, {$, document, window}) 91 | var dataDeferred = selector.getData(document.querySelectorAll('#not-exist')[0]) 92 | dataDeferred.then(function (data) { 93 | var expected = [] 94 | assert.deepEqual(data, expected) 95 | done() 96 | }) 97 | }) 98 | }) 99 | -------------------------------------------------------------------------------- /tests/spec/SelectorSpec.js: 
-------------------------------------------------------------------------------- 1 | const Selector = require('./../../extension/scripts/Selector') 2 | const utils = require('./../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../globals') 5 | describe('Selector', function () { 6 | var $el 7 | let $ 8 | let document 9 | let window 10 | 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('should be able to select elements', function () { 22 | $el.innerHTML = '' 23 | var selector = new Selector({ 24 | selector: 'a', 25 | type: 'SelectorLink' 26 | }, {$, document, window}) 27 | var elements = selector.getDataElements($el) 28 | 29 | assert.deepEqual(elements, Object.values($el.querySelectorAll('a'))) 30 | }) 31 | 32 | it('should be able to select parent', function () { 33 | $el.innerHTML = '' 34 | var selector = new Selector({ 35 | selector: '_parent_', 36 | type: 'SelectorLink' 37 | }, {$, document, window}) 38 | var elements = selector.getDataElements($el) 39 | 40 | assert.deepEqual(elements, [$el]) 41 | }) 42 | 43 | it('should be able to select elements with delay', function () { 44 | var selector = new Selector({ 45 | id: 'a', 46 | selector: 'a', 47 | type: 'SelectorText', 48 | delay: 100 49 | }, {$, document, window}) 50 | var dataDeferred = selector.getData($el) 51 | 52 | // add data after data extraction called 53 | $el.innerHTML = 'a' 54 | 55 | return dataDeferred.then(function (data) { 56 | assert.deepEqual(data, [ 57 | { 58 | 'a': 'a' 59 | } 60 | ]) 61 | }) 62 | }) 63 | }) 64 | -------------------------------------------------------------------------------- /tests/spec/UniqueElementListSpec.js: -------------------------------------------------------------------------------- 1 | const UniqueElementList = 
require('../../extension/scripts/UniqueElementList') 2 | const utils = require('./../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../globals') 5 | describe('UniqueElementList', function () { 6 | var $el 7 | let $ 8 | let document 9 | let window 10 | 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('it should add only unique elements', function () { 22 | $el.innerHTML = '12' 23 | 24 | var list = new UniqueElementList('uniqueText', {$, document, window}) 25 | assert.equal(list.length, 0) 26 | 27 | var $a = $el.querySelectorAll('a') 28 | list.push($a[0]) 29 | assert.equal(list.length, 1) 30 | list.push($a[0]) 31 | assert.equal(list.length, 1) 32 | list.push($a[1]) 33 | assert.equal(list.length, 2) 34 | list.push($a[1]) 35 | assert.equal(list.length, 2) 36 | }) 37 | 38 | it('it should add only unique elements when using uniqueHTMLText type', function () { 39 | $el.innerHTML = "aa" 40 | 41 | var list = new UniqueElementList('uniqueHTMLText', {$, document, window}) 42 | assert.equal(list.length, 0) 43 | 44 | var $a = $el.querySelectorAll('a') 45 | list.push($a[0]) 46 | assert.equal(list.length, 1) 47 | list.push($a[0]) 48 | assert.equal(list.length, 1) 49 | list.push($a[1]) 50 | assert.equal(list.length, 2) 51 | list.push($a[1]) 52 | assert.equal(list.length, 2) 53 | }) 54 | 55 | it('it should add only unique elements when using uniqueHTML type', function () { 56 | $el.innerHTML = "aaabcc" 57 | 58 | var list = new UniqueElementList('uniqueHTML', {$, document, window}) 59 | assert.equal(list.length, 0) 60 | 61 | var $a = $el.querySelectorAll('a') 62 | list.push($a[0]) 63 | assert.equal(list.length, 1) 64 | list.push($a[0]) 65 | assert.equal(list.length, 1) 66 | list.push($a[1]) 67 | assert.equal(list.length, 2) 68 
| list.push($a[1]) 69 | assert.equal(list.length, 2) 70 | list.push($a[2]) 71 | assert.equal(list.length, 2) 72 | }) 73 | 74 | it('it should add only unique elements when using uniqueCSSSelector type', function () { 75 | $el.innerHTML = '' 76 | 77 | var list = new UniqueElementList('uniqueCSSSelector', {$, document, window}) 78 | assert.equal(list.length, 0) 79 | 80 | var $a = $el.querySelectorAll('a') 81 | list.push($a[0]) 82 | assert.equal(list.length, 1) 83 | list.push($a[0]) 84 | assert.equal(list.length, 1) 85 | list.push($a[1]) 86 | assert.equal(list.length, 2) 87 | list.push($a[1]) 88 | assert.equal(list.length, 2) 89 | }) 90 | }) 91 | -------------------------------------------------------------------------------- /tests/spec/browser/BackgroundScriptSpec.js: -------------------------------------------------------------------------------- 1 | const getBackgroundScript = require('../../../extension/scripts/getBackgroundScript') 2 | const getContentScript = require('../../../extension/scripts/getContentScript') 3 | const selectorMatchers = require('../../Matchers') 4 | const utils = require('../../utils') 5 | 6 | describe('BackgroundScript', function () { 7 | var backgroundScript = getBackgroundScript('BackgroundScript') 8 | var $el 9 | 10 | beforeEach(function () { 11 | document.body.innerHTML = utils.getTestHTML() 12 | $el = utils.createElementFromHTML("", document) 13 | document.body.appendChild($el) 14 | }) 15 | 16 | it('should be able to call BackgroundScript functions from background script', async function () { 17 | var deferredResponse = backgroundScript.dummy() 18 | await selectorMatchers.deferredToEqual(deferredResponse, 'dummy') 19 | await selectorMatchers.deferredToEqual(deferredResponse, 'dummy') 20 | }) 21 | 22 | it('should be able to call BackgroundScript from Devtools', async function () { 23 | var backgroundScript = getBackgroundScript('DevTools') 24 | var deferredResponse = backgroundScript.dummy() 25 | await 
selectorMatchers.deferredToEqual(deferredResponse, 'dummy') 26 | }) 27 | }) 28 | -------------------------------------------------------------------------------- /tests/spec/browser/ChromePopupBrowserSpec.js: -------------------------------------------------------------------------------- 1 | const ChromePopupBrowser = require('../../../extension/scripts/ChromePopupBrowser') 2 | const Sitemap = require('../../../extension/scripts/Sitemap') 3 | const assert = require('chai').assert 4 | const utils = require('../../utils') 5 | const globals = require('../../globals') 6 | describe('Chrome popup browser', function () { 7 | let $ 8 | let document 9 | let window 10 | beforeEach(function () { 11 | $ = globals.$ 12 | document = globals.document 13 | window = globals.window 14 | 15 | window.chromeAPI.reset() 16 | document.body.innerHTML = utils.getTestHTML() 17 | }) 18 | 19 | it('should init a popup window', function () { 20 | var browser = new ChromePopupBrowser({ 21 | pageLoadDelay: 500 22 | }) 23 | browser._initPopupWindow(function () { 24 | }) 25 | assert.deepEqual(browser.tab, {id: 0}) 26 | }) 27 | 28 | it('should load a page', function (done) { 29 | var browser = new ChromePopupBrowser({ 30 | pageLoadDelay: 500 31 | }) 32 | browser._initPopupWindow(function () { 33 | }) 34 | browser.loadUrl('http://example,com/', function () { 35 | done() 36 | }) 37 | }) 38 | 39 | it('should sendMessage to popup contentscript when data extraction is needed', function (done) { 40 | var sitemap = new Sitemap({ 41 | selectors: [ 42 | { 43 | id: 'a', 44 | selector: '#browserTest', 45 | type: 'SelectorText', 46 | multiple: false, 47 | parentSelectors: ['_root'] 48 | } 49 | ] 50 | }, {$, document, window}) 51 | 52 | var browser = new ChromePopupBrowser({ 53 | pageLoadDelay: 500 54 | }) 55 | browser._initPopupWindow(function () { 56 | }) 57 | browser.fetchData('http://example,com/', sitemap, '_root', function (err, data) { 58 | assert.isNull(err) 59 | assert.deepEqual(data, [ 60 | { 61 | 
'a': 'a' 62 | } 63 | ]) 64 | done() 65 | }) 66 | }) 67 | }) 68 | -------------------------------------------------------------------------------- /tests/spec/browser/ScraperSpec.js: -------------------------------------------------------------------------------- 1 | const Queue = require('./../../../extension/scripts/Queue') 2 | const assert = require('chai').assert 3 | 4 | const ChromePopupBrowser = require('./../../../extension/scripts/ChromePopupBrowser') 5 | const Sitemap = require('./../../../extension/scripts/Sitemap') 6 | const FakeStore = require('./../../FakeStore') 7 | const Scraper = require('./../../../extension/scripts/Scraper') 8 | const utils = require('./../../utils') 9 | const globals = require('../../globals') 10 | 11 | describe('Scraper', function () { 12 | var q, store, $el 13 | let $ 14 | let document 15 | let window 16 | let Browser 17 | 18 | beforeEach(function () { 19 | $ = globals.$ 20 | document = globals.document 21 | window = globals.window 22 | Browser = globals.Browser 23 | 24 | q = new Queue() 25 | store = new FakeStore() 26 | document.body.innerHTML = utils.getTestHTML() 27 | }) 28 | afterEach(function () { 29 | while (document.body.firstChild) document.body.removeChild(document.body.firstChild) 30 | }) 31 | 32 | it('should store images', function (done) { 33 | var record = { 34 | '_imageBase64-test': 'test', 35 | '_imageMimeType-test': 'test', 36 | 'test-src': 'http://images/image.png' 37 | } 38 | 39 | var browser = new Browser({ 40 | pageLoadDelay: 500 41 | }) 42 | 43 | var sitemap = new Sitemap({ 44 | id: 'test' 45 | }, {$, document, window}) 46 | 47 | var scraper = new Scraper({ 48 | sitemap: sitemap, 49 | browser: browser 50 | }, {$, document, window}) 51 | 52 | var deferredSave = scraper.saveImages(record) 53 | var downloadAPICalled = false 54 | chrome.downloads.onChanged.addListener(function () { 55 | downloadAPICalled = true 56 | }) 57 | assert.equal(downloadAPICalled, false) 58 | 59 | deferredSave.then(function () { 60 | 
assert.equal(record['_imageBase64-test'], undefined) 61 | assert.equal(record['_imageMimeType-test'], undefined) 62 | assert.equal(downloadAPICalled, true) 63 | done() 64 | }) 65 | .then(null, function (e) { 66 | done(e) 67 | }) 68 | }) 69 | }) 70 | -------------------------------------------------------------------------------- /tests/spec/browser/Selector/SelectorImageSpec.js: -------------------------------------------------------------------------------- 1 | const Selector = require('../../../../extension/scripts/Selector') 2 | const utils = require('./../../../utils') 3 | const assert = require('chai').assert 4 | const globals = require('../../../globals') 5 | 6 | describe('Image Selector', function () { 7 | let $ 8 | let document 9 | let window 10 | var $el 11 | beforeEach(function () { 12 | $ = globals.$ 13 | document = globals.document 14 | window = globals.window 15 | 16 | document.body.innerHTML = utils.getTestHTML() 17 | $el = utils.createElementFromHTML("", document) 18 | document.body.appendChild($el) 19 | }) 20 | 21 | it('should be able to download image as base64', function (done) { 22 | var selector = new Selector({ 23 | id: 'img', 24 | type: 'SelectorImage' 25 | }, {$, document, window}) 26 | var deferredImage = selector.downloadImageBase64('base/docs/images/chrome-store-logo.png') 27 | 28 | deferredImage.then(function (imageResponse) { 29 | assert.isTrue(imageResponse.imageBase64.length > 100) 30 | done() 31 | }) 32 | }) 33 | 34 | it('should be able to get data with image data attached', function (done) { 35 | $el.innerHTML = '' 36 | 37 | var selector = new Selector({ 38 | id: 'img', 39 | type: 'SelectorImage', 40 | multiple: true, 41 | selector: 'img', 42 | downloadImage: true 43 | }, {$, document, window}) 44 | 45 | var dataDeferred = selector.getData($el) 46 | dataDeferred.then(function (data) { 47 | assert.equal(data.length, 1) 48 | assert.isTrue(!!data[0]['_imageBase64-img']) 49 | assert.isTrue(!!data[0]['_imageMimeType-img']) 50 | done() 51 
| }) 52 | }) 53 | }) 54 | -------------------------------------------------------------------------------- /tests/spec/headless/browserSpec.js: -------------------------------------------------------------------------------- 1 | const ChromeHeadlessBrowser = require('./../../../extension/scripts/ChromeHeadlessBrowser') 2 | const sinon = require('sinon') 3 | const assert = require('chai').assert 4 | const utils = require('./../../utils') 5 | const Queue = require('./../../../extension/scripts/Queue') 6 | const Sitemap = require('./../../../extension/scripts/Sitemap') 7 | const FakeStore = require('./../../FakeStore') 8 | const Scraper = require('./../../../extension/scripts/Scraper') 9 | 10 | describe('Headless browser', function () { 11 | let sandbox 12 | beforeEach('Create sandbox', function () { 13 | sandbox = sinon.createSandbox() 14 | }) 15 | afterEach('Release sandbox', function () { 16 | if (sandbox) sandbox.restore() 17 | }) 18 | it('Scrape', function (done) { 19 | sandbox.stub(ChromeHeadlessBrowser.prototype, 'loadUrl').callsFake(async function () { 20 | const page = await this.pagePromise 21 | const html = utils.getTestHTML() 22 | await page.setContent(html) 23 | }) 24 | 25 | const fake$ = {} 26 | const fakeDocument = {} 27 | const fakeWindow = {} 28 | const q = new Queue() 29 | const store = new FakeStore() 30 | 31 | const sitemap = new Sitemap({ 32 | id: 'test', 33 | startUrl: 'http://test.lv/', 34 | selectors: [ 35 | { 36 | 'id': 'link', 37 | 'selector': '#scraper-test-child-page a', 38 | 'multiple': true, 39 | type: 'SelectorLink', 40 | 'parentSelectors': ['_root'] 41 | }, 42 | { 43 | 'id': 'b', 44 | 'selector': '#scraper-test-child-page b', 45 | 'multiple': false, 46 | type: 'SelectorText', 47 | 'parentSelectors': ['link'] 48 | } 49 | ] 50 | }, {$: fake$, document: fakeDocument, window: fakeWindow}) 51 | 52 | var browser = new ChromeHeadlessBrowser({ 53 | pageLoadDelay: 10 54 | }) 55 | 56 | var s = new Scraper({ 57 | queue: q, 58 | sitemap: sitemap, 
59 | browser: browser, 60 | store: store, 61 | delay: 0 62 | }, {$: fake$, document: fakeDocument, window: fakeWindow}) 63 | 64 | s.run(function () { 65 | assert.deepEqual(store.data, [ 66 | {'link': 'test', 'link-href': 'http://test.lv/1/', 'b': 'b'} 67 | ]) 68 | done() 69 | }) 70 | }) 71 | 72 | it('Scraping is done in a different context', function (done) { 73 | sandbox.stub(ChromeHeadlessBrowser.prototype, 'loadUrl').callsFake(async function () { 74 | const page = await this.pagePromise 75 | const html = utils.getTestHTML() 76 | await page.setContent(html) 77 | await page.evaluate(function () { 78 | const blockedProperties = ['jquery', '$', 'jQuery'] 79 | try { 80 | for (const property of blockedProperties) { 81 | Object.defineProperty(window, property, { 82 | get () { 83 | throw new Error('Wrong property: ' + property) 84 | }, 85 | set () { 86 | throw new Error('Cannot set: ' + property) 87 | } 88 | }) 89 | } 90 | } catch (e) { 91 | // This is executed once per visited page, so it can give problems 92 | } 93 | }) 94 | }) 95 | 96 | const fake$ = {} 97 | const fakeDocument = {} 98 | const fakeWindow = {} 99 | const q = new Queue() 100 | const store = new FakeStore() 101 | 102 | const sitemap = new Sitemap({ 103 | id: 'test', 104 | startUrl: 'http://test.lv/', 105 | selectors: [ 106 | { 107 | 'id': 'link', 108 | 'selector': '#scraper-test-child-page a', 109 | 'multiple': true, 110 | type: 'SelectorLink', 111 | 'parentSelectors': ['_root'] 112 | }, 113 | { 114 | 'id': 'b', 115 | 'selector': '#scraper-test-child-page b', 116 | 'multiple': false, 117 | type: 'SelectorText', 118 | 'parentSelectors': ['link'] 119 | } 120 | ] 121 | }, {$: fake$, document: fakeDocument, window: fakeWindow}) 122 | 123 | var browser = new ChromeHeadlessBrowser({ 124 | pageLoadDelay: 10 125 | }) 126 | 127 | var s = new Scraper({ 128 | queue: q, 129 | sitemap: sitemap, 130 | browser: browser, 131 | store: store, 132 | delay: 0 133 | }, {$: fake$, document: fakeDocument, window: fakeWindow}) 
134 | 135 | s.run(function () { 136 | assert.deepEqual(store.data, [ 137 | {'link': 'test', 'link-href': 'http://test.lv/1/', 'b': 'b'} 138 | ]) 139 | done() 140 | }) 141 | }) 142 | }) 143 | -------------------------------------------------------------------------------- /tests/spec/jquery.whencallsequentiallySpec.js: -------------------------------------------------------------------------------- 1 | var whenCallSequentially = require('../../extension/assets/jquery.whencallsequentially') 2 | var jquery = require('jquery-deferred') 3 | const assert = require('chai').assert 4 | 5 | describe('jQuery When call sequentially', function () { 6 | var syncCall = function () { 7 | return jquery.Deferred().resolve('sync').promise() 8 | } 9 | 10 | var asyncCall = function () { 11 | var d = jquery.Deferred() 12 | setTimeout(function () { 13 | d.resolve('async') 14 | }, 0) 15 | return d.promise() 16 | } 17 | 18 | beforeEach(function () { 19 | }) 20 | 21 | it('should return immediately empty array when no calls passed', function () { 22 | var deferred = whenCallSequentially([]) 23 | assert.equal(deferred.state(), 'resolved') 24 | var data 25 | deferred.done(function (res) { 26 | data = res 27 | }) 28 | assert.deepEqual(data, []) 29 | }) 30 | 31 | it('should return immediately with data when synchronous call passed', function () { 32 | var deferred = whenCallSequentially([syncCall]) 33 | assert.deepEqual(deferred.state(), 'resolved') 34 | var data 35 | deferred.done(function (res) { 36 | data = res 37 | }) 38 | assert.deepEqual(data, ['sync']) 39 | }) 40 | 41 | it('should return immediately with data when multiple synchronous call passed', function () { 42 | var deferred = whenCallSequentially([syncCall, syncCall, syncCall]) 43 | assert.deepEqual(deferred.state(), 'resolved') 44 | var data 45 | deferred.done(function (res) { 46 | data = res 47 | }) 48 | assert.deepEqual(data, ['sync', 'sync', 'sync']) 49 | }) 50 | 51 | it('should execute one async job', function (done) { 52 | var 
deferred = whenCallSequentially([asyncCall]) 53 | assert.deepEqual(deferred.state(), 'pending') 54 | 55 | deferred.then(function (data) { 56 | assert.deepEqual(data, ['async']) 57 | done() 58 | }) 59 | }) 60 | 61 | it('should execute multiple async jobs', function (done) { 62 | var deferred = whenCallSequentially([asyncCall, asyncCall, asyncCall]) 63 | assert.deepEqual(deferred.state(), 'pending') 64 | 65 | deferred.then(function (res) { 66 | assert.deepEqual(res, ['async', 'async', 'async']) 67 | done() 68 | }) 69 | }) 70 | 71 | it('should execute multiple sync and async jobs', function () { 72 | var deferred = whenCallSequentially([syncCall, syncCall, asyncCall, asyncCall, syncCall, asyncCall]) 73 | assert.deepEqual(deferred.state(), 'pending') 74 | 75 | deferred.done(function (data) { 76 | assert.deepEqual(data, ['sync', 'sync', 'async', 'async', 'sync', 'async']) 77 | }) 78 | }) 79 | 80 | it('should allow adding jobs to job array from an async job', function () { 81 | var jobs = [] 82 | var asyncMoreCall = function () { 83 | var d = jquery.Deferred() 84 | setTimeout(function () { 85 | d.resolve('asyncmore') 86 | jobs.push(asyncCall) 87 | }, 0) 88 | return d.promise() 89 | } 90 | jobs.push(asyncMoreCall) 91 | 92 | var deferred = whenCallSequentially(jobs) 93 | assert.deepEqual(deferred.state(), 'pending') 94 | 95 | deferred.then(function (data) { 96 | assert.deepEqual(data, ['asyncmore', 'async']) 97 | }) 98 | }) 99 | 100 | it('should allow adding jobs to job array from a sync job', function () { 101 | var jobs = [] 102 | var syncMoreCall = function () { 103 | var d = jquery.Deferred() 104 | jobs.push(syncCall) 105 | d.resolve('syncmore') 106 | return d.promise() 107 | } 108 | jobs.push(syncMoreCall) 109 | 110 | var deferred = whenCallSequentially(jobs) 111 | deferred.then(function (res) { 112 | assert.deepEqual(res, ['syncmore', 'sync']) 113 | }) 114 | }) 115 | }) 116 | -------------------------------------------------------------------------------- 
/tests/spec/jsdom/browserSpec.js: -------------------------------------------------------------------------------- 1 | const Browser = require('./../../../extension/scripts/JSDOMBrowser') 2 | 3 | it('Handle error in jsdom', function (done) { 4 | Browser.prototype.loadUrl = function (url, callback) { 5 | callback(new Error('Fake error')) 6 | } 7 | const jsdomBrowser = new Browser({}) 8 | 9 | jsdomBrowser.fetchData('a', {}, {}, function (err) { 10 | if (err) { 11 | done() 12 | } else { 13 | done(new Error('It should have failed')) 14 | } 15 | }) 16 | }) 17 | --------------------------------------------------------------------------------