├── .babelrc
├── .eslintrc
├── .gitignore
├── .gitmodules
├── .npmignore
├── LICENSE
├── README.md
├── docs
    ├── CSS selector.md
    ├── Installation.md
    ├── Open Web Scraper.md
    ├── Scraping a site.md
    ├── Selectors.md
    ├── Selectors
    │   ├── Element attribute selector.md
    │   ├── Element click selector.md
    │   ├── Element scroll down selector.md
    │   ├── Element selector.md
    │   ├── Grouped selector.md
    │   ├── HTML selector.md
    │   ├── Image selector.md
    │   ├── Link popup selector.md
    │   ├── Link selector.md
    │   ├── Table selector.md
    │   └── Text selector.md
    ├── Storage backends.md
    └── images
    │   ├── chrome-store-logo-920x680.png
    │   ├── chrome-store-logo-920x680.xcf
    │   ├── chrome-store-logo.png
    │   ├── chrome-store-logo.xcf
    │   ├── open-web-scraper
    │       └── open-web-scraper.png
    │   ├── scraping-a-site
    │       ├── news-site-selector-graph.png
    │       ├── news-site-sitemap.png
    │       └── news-site.png
    │   ├── selectors
    │       ├── element-click
    │       │   ├── click-more.png
    │       │   └── click-once.png
    │       ├── link
    │       │   ├── multiple-level-link-selectors.png
    │       │   ├── pagination-link-selectors.png
    │       │   └── pagination-selector-graph.png
    │       ├── table
    │       │   ├── selectors.png
    │       │   └── table.png
    │       └── text
    │       │   ├── text-selector-multiple-elements-with-text-selectors.png
    │       │   ├── text-selector-multiple-per-page.png
    │       │   └── text-selector-multiple-single-text-selectors-in-one-page.png
    │   ├── sitemap-tree.png
    │   └── store-logo-sources.txt
├── extension
    ├── assets
    │   ├── ICanHaz.js
    │   ├── LICENSE-d3-js
    │   ├── LICENSE-icanhaz-js
    │   ├── LICENSE-jquery-js
    │   ├── LICENSE-pouchdb-js
    │   ├── LICENSE-sugar-js
    │   ├── base64.js
    │   ├── bootstrap-3.0.0
    │   │   ├── css
    │   │   │   ├── bootstrap-theme.css
    │   │   │   ├── bootstrap-theme.min.css
    │   │   │   ├── bootstrap.css
    │   │   │   └── bootstrap.min.css
    │   │   ├── fonts
    │   │   │   ├── glyphicons-halflings-regular.eot
    │   │   │   ├── glyphicons-halflings-regular.svg
    │   │   │   ├── glyphicons-halflings-regular.ttf
    │   │   │   └── glyphicons-halflings-regular.woff
    │   │   └── js
    │   │   │   ├── bootstrap.js
    │   │   │   └── bootstrap.min.js
    │   ├── d3.v3.js
    │   ├── d3.v3.min.js
    │   ├── images
    │   │   ├── LICENSE
    │   │   ├── icon128.png
    │   │   ├── icon16.png
    │   │   ├── icon19.png
    │   │   ├── icon38.png
    │   │   └── icon48.png
    │   ├── jquery-2.0.3.js
    │   ├── jquery.bootstrapvalidator
    │   │   ├── bootstrapValidator.css
    │   │   └── bootstrapValidator.js
    │   ├── jquery.whencallsequentially.js
    │   ├── pouchdb-nightly.min.js
    │   └── sugar-1.4.1.js
    ├── background_page
    │   └── background_script.js
    ├── content_script
    │   ├── contentScraperHeadlessBundler.js
    │   ├── content_scraper.js
    │   ├── content_scraper_browser.js
    │   ├── content_script.css
    │   └── content_script.js
    ├── devtools
    │   ├── devtools_init_page.html
    │   ├── devtools_init_page.js
    │   ├── devtools_scraper_panel.css
    │   ├── devtools_scraper_panel.html
    │   └── views
    │   │   ├── DataPreview.html
    │   │   ├── SelectorEdit.html
    │   │   ├── SelectorEditTableColumn.html
    │   │   ├── SelectorList.html
    │   │   ├── SelectorListItem.html
    │   │   ├── SitemapBrowseData.html
    │   │   ├── SitemapCreate.html
    │   │   ├── SitemapEditMetadata.html
    │   │   ├── SitemapExport.html
    │   │   ├── SitemapExportDataCSV.html
    │   │   ├── SitemapHeadlessScrapeConfig.html
    │   │   ├── SitemapImport.html
    │   │   ├── SitemapList.html
    │   │   ├── SitemapListItem.html
    │   │   ├── SitemapScrapeConfig.html
    │   │   ├── SitemapSelectorGraph.html
    │   │   ├── SitemapStartUrlField.html
    │   │   └── Viewport.html
    ├── generated
    │   └── .gitignore
    ├── manifest.json
    ├── options_page
    │   ├── options.html
    │   └── options_page.js
    ├── popup.html
    └── scripts
    │   ├── App.js
    │   ├── BackgroundScript.js
    │   ├── ChromeHeadlessBrowser.js
    │   ├── ChromePopupBrowser.js
    │   ├── Config.js
    │   ├── ContentScript.js
    │   ├── ContentSelector.js
    │   ├── Controller.js
    │   ├── DataExtractor.js
    │   ├── ElementQuery.js
    │   ├── InMemoryStore.js
    │   ├── JSDOMBrowser.js
    │   ├── JSDOMBrowserLoader.js
    │   ├── Job.js
    │   ├── Queue.js
    │   ├── Scraper.js
    │   ├── Selector.js
    │   ├── Selector
    │       ├── SelectorElement.js
    │       ├── SelectorElementAttribute.js
    │       ├── SelectorElementClick.js
    │       ├── SelectorElementScroll.js
    │       ├── SelectorGoogMapID.js
    │       ├── SelectorGroup.js
    │       ├── SelectorHTML.js
    │       ├── SelectorImage.js
    │       ├── SelectorLink.js
    │       ├── SelectorPopupLink.js
    │       ├── SelectorTable.js
    │       └── SelectorText.js
    │   ├── SelectorGraph.js
    │   ├── SelectorGraphv2.js
    │   ├── SelectorList.js
    │   ├── Selectors.js
    │   ├── Sitemap.js
    │   ├── Store.js
    │   ├── StoreDevtools.js
    │   ├── UniqueElementList.js
    │   ├── WebJSDOMBrowser.js
    │   ├── getBackgroundScript.js
    │   └── getContentScript.js
├── gulpfile.js
├── index.js
├── karma.conf.js
├── package.json
├── playgrounds
    ├── extension
    │   ├── index.html
    │   └── webpage.css
    └── sitemap-tree
    │   ├── index.html
    │   ├── sitemap.json
    │   └── style.css
└── tests
    ├── ChromeAPI.js
    ├── FakeStore.js
    ├── Matchers.js
    ├── browserSpec.js
    ├── globals.js
    ├── jsdomSpec.js
    ├── spec
        ├── ContentSelectorSpec.js
        ├── DataExtractSpec.js
        ├── ElementQuerySpec.js
        ├── JobSpec.js
        ├── QueueSpec.js
        ├── ScraperSpec.js
        ├── Selector
        │   ├── SelectorElementAttributeSpec.js
        │   ├── SelectorElementClickSpec.js
        │   ├── SelectorElementScrollSpec.js
        │   ├── SelectorElementSpec.js
        │   ├── SelectorGoogMapIDSpec.js
        │   ├── SelectorGroupSpec.js
        │   ├── SelectorHTMLSpec.js
        │   ├── SelectorImageSpec.js
        │   ├── SelectorLinkSpec.js
        │   ├── SelectorTableSpec.js
        │   └── SelectorTextSpec.js
        ├── SelectorListSpec.js
        ├── SelectorSpec.js
        ├── SitemapSpec.js
        ├── UniqueElementListSpec.js
        ├── browser
        │   ├── BackgroundScriptSpec.js
        │   ├── ChromePopupBrowserSpec.js
        │   ├── ContentScriptSpec.js
        │   ├── ScraperSpec.js
        │   └── Selector
        │   │   ├── SelectorImageSpec.js
        │   │   └── SelectorPopupLinkSpec.js
        ├── headless
        │   └── browserSpec.js
        ├── jquery.whencallsequentiallySpec.js
        └── jsdom
        │   └── browserSpec.js
    └── utils.js


/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "plugins": ["meaningful-logs"]
3 | }


--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
 1 | {  "env": {
 2 |   "node": true
 3 | },
 4 |   "globals": {
 5 |     "d3": true,
 6 |     "$": true,
 7 |     "chrome": true,
 8 |     "jQuery": true,
 9 |     "describe": true,
10 |     "it": true,
11 |     "beforeEach": true,
12 |     "afterEach": true,
13 |     "after": true,
14 |     "before": true
15 |   },
16 |   "extends": ["standard"]}
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | projectFilesBackup
3 | extension.zip
4 | node_modules
5 | npm-debug.log


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "extension/assets/css-selector"]
2 | 	path = extension/assets/css-selector
3 | 	url = https://github.com/martinsbalodis/css-selector.git
4 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | projectFilesBackup
 3 | extension.zip
 4 | node_modules
 5 | npm-debug.log
 6 | extension/assets/*
 7 | extension/assets/*/
 8 | !extension/assets/jquery.whencallsequentially.js
 9 | !extension/assets/base64.js
10 | docs/images/*
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Web Scraper
  3 | Web Scraper is a chrome browser extension and a library built for data extraction from web 
  4 | pages. Using this extension you can create a plan (sitemap) how a web site 
  5 | should be traversed and what should be extracted. Using these sitemaps the 
  6 | Web Scraper will navigate the site accordingly and extract all data. Scraped 
  7 | data later can be exported as CSV.
  8 | 
  9 | To use it as an extension install it from [Chrome store] [chrome-store]
 10 | 
 11 | To use it as a library do `npm i web-scraper-headless`
 12 | 
 13 | ### Features
 14 | 
 15 |  1. Scrape multiple pages
 16 |  2. Sitemaps and scraped data are stored in the browser's local storage or in CouchDB
 17 |  3. Multiple data selection types
 18 |  4. Extract data from dynamic pages (JavaScript+AJAX)
 19 |  5. Browse scraped data
 20 |  6. Export scraped data as CSV
 21 |  7. Import, Export sitemaps
 22 |  8. Depends only on Chrome browser
 23 | 
 24 | ### Help
 25 | 
 26 |  Documentation and tutorials are available on [webscraper.io] [webscraper.io]
 27 |  
 28 |  Ask for help, submit bugs, suggest features on [google groups] [google-groups]
 29 |  
 30 |  Submit bugs and suggest features on [bug tracker] [github-issues]
 31 |  
 32 | #### Headless mode
 33 | To use it as a library you need a sitemap, you can write it by hand, but the easiest way is to use the [original extension][extension] to scrape and then click on "export sitemap".
 34 | 
 35 |     const webscraper = require('web-scraper-headless')
 36 |     // visit github and retrieve last commit of all trending repo. 
 37 |     // The sitemap depends on the actual DOM of github, so it might get outdated
 38 |     const sitemap = {
 39 | 	     "startUrl": "https://github.com/trending",
 40 | 	     "selectors": [{
 41 | 		      "parentSelectors": ["_root"],
 42 | 		      "type": "SelectorLink",
 43 | 		      "multiple": true,
 44 | 		      "id": "link_to_repo",
 45 | 		      "selector": "h3 a",
 46 | 		      "delay": ""
 47 | 	     }, {
 48 | 		      "parentSelectors": ["link_to_repo"],
 49 | 		      "type": "SelectorText",
 50 | 		      "multiple": false,
 51 | 		      "id": "latest_commit",
 52 | 		      "selector": "a.commit-tease-sha",
 53 | 		      "regex": "",
 54 | 		      "delay": ""
 55 | 	    }],
 56 | 	    "_id": "github_trending"
 57 |     }
 58 |     const options = {delay: 10, pageLoadDelay: 10, browser: 'headless'} // optional delay, pageLoadDelay and browser
 59 |     webscraper(sitemap, options)
 60 |         .then(function (scraped) {
 61 |             // This is your scraped info
 62 |         })
 63 | 
 64 | By default webscraper-headless will open [jsdom](https://github.com/jsdom/jsdom) as a browser. This is a purely JS implementation of HTML. As such it has no native dependencies and it is very lightweight. However, it is not capable of executing JavaScript, which might be a hindrance in some cases. If that is your case, you can use chrome headless as a browser. Note that it will consume far more resources than jsdom and you need to have some native dependencies installed on the server. To use chrome headless do the following:
 65 | 
 66 |     const sitemap = // same as previous example
 67 |     const options = {browser: 'headless'}
 68 |     webscraper(sitemap, options)
 69 |         .then(function (scraped) {
 70 |             // This is your scraped info
 71 |         })
 72 | 
 73 | #### Bugs
 74 | When submitting a bug please attach an exported sitemap if possible.
 75 | 
 76 | ## License
 77 | LGPLv3
 78 | 
 79 | ## Changelog
 80 | 
 81 | ### v0.2
 82 |  * Added Element click selector
 83 |  * Added Element scroll down selector
 84 |  * Added Link popup selector
 85 |  * Improved table selector to work with any html markup
 86 |  * Added Image download
 87 |  * Added keyboard shortcuts when selecting elements
 88 |  * Added configurable delay before using selector
 89 |  * Added configurable delay between page visiting
 90 |  * Added multiple start url configuration
 91 |  * Added form field validation
 92 |  * Fixed a lot of bugs
 93 | 
 94 | ### v0.1.3
 95 |  * Added Table selector
 96 |  * Added HTML selector
 97 |  * Added HTML attribute selector
 98 |  * Added data preview
 99 |  * Added ranged start urls
100 |  * Fixed bug which made selector tree not to show on some operating systems
101 | 
102 |  [chrome-store]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn
103 |  [webscraper.io]: http://webscraper.io/
104 |  [google-groups]: https://groups.google.com/forum/#!forum/web-scraper
105 |  [github-issues]: https://github.com/martinsbalodis/web-scraper-chrome-extension/issues
106 |  [extension]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn
107 | 


--------------------------------------------------------------------------------
/docs/CSS selector.md:
--------------------------------------------------------------------------------
 1 | # CSS selector
 2 | 
 3 | Web Scraper uses css selectors to find HTML elements in web pages and to extract
 4 | data from them. When selecting an element the Web Scraper will try to make its
 5 | best guess what the CSS selector might be for the selected elements. But you
 6 | can also write it yourself and test it by clicking "Element preview". You
 7 | can use CSS selectors that are available in CSS versions 1-3 and also pseudo
 8 | selectors that are additionally available in jQuery. Here are some
 9 | documentation links that might help you:
10 |  
11 |  * [CSS Selectors] [css-selectors-wikipedia]
12 |  * [jQuery CSS selectors] [css-selectors-jquery]
13 |  * [w3schools CSS selector reference] [w3schools-css-selector-reference]
14 | 
15 | ## Additional Web Scraper selectors
16 | It is possible to add new pseudo CSS selectors to Web Scraper. Right now there
17 | is only one CSS selector added.
18 | 
19 | #### Parent selector
20 | 
21 | CSS Selector `_parent_` allows a child selector of an
22 | *Element selector* to select the element that was returned by the *Element selector*. For
23 | example this CSS selector could be used in a case where you need to extract an
24 | attribute from the element that the *Element selector* returned.
25 | 
26 |  [css-selectors-wikipedia]: http://en.wikipedia.org/wiki/Cascading_Style_Sheets#Selector
27 |  [css-selectors-jquery]: http://api.jquery.com/category/selectors/
28 |  [w3schools-css-selector-reference]: http://www.w3schools.com/cssref/css_selectors.asp


--------------------------------------------------------------------------------
/docs/Installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | You can install the extension from [Chrome store] [1]. After installing it you
 4 | should restart chrome to make sure the extension is fully loaded. If you don't
 5 | want to restart Chrome then use the extension only in tabs that are
 6 | created after installing it.
 7 | 
 8 | ## Requirements
 9 | 
10 | The extension requires Chrome 31+ . There are no OS limitations. 
11 | 
12 |  [1]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn  "Install web scraper from Chrome store"


--------------------------------------------------------------------------------
/docs/Open Web Scraper.md:
--------------------------------------------------------------------------------
 1 | # Open Web Scraper
 2 | 
 3 | Web Scraper is integrated into chrome Developer tools. Figure 1 shows how you
 4 | can open it. You can also use these shortcuts to open Developer tools. After
 5 | opening Developer tools open *Web Scraper* tab.
 6 | 
 7 | Shortcuts:
 8 | 
 9 |  * windows, linux: `Ctrl+Shift+I`, `f12`, open `Tools / Developer tools`
10 |  * mac `Cmd+Opt+I`, open `Tools / Developer tools`
11 | 
12 | ![Fig. 1: Open Web Scraper][open-web-scraper]
13 | 
14 |  [open-web-scraper]: images/open-web-scraper/open-web-scraper.png?raw=true


--------------------------------------------------------------------------------
/docs/Scraping a site.md:
--------------------------------------------------------------------------------
 1 | # Scraping a site
 2 | 
 3 | Open the site that you want to scrape.
 4 | 
 5 | ## Create Sitemap
 6 | 
 7 | The first thing you need to do when creating a *sitemap* is specifying the
 8 | start url. This is the url from which the scraping will start. You can also
 9 | specify multiple start urls if the scraping should start from multiple places.
10 | For example if you want to scrape multiple search results then you could create
11 | a separate start url for each search result.
12 | 
13 | ### Specify multiple urls with ranges
14 | 
15 | In cases where a site uses numbering in pages URLs it is much simpler to create
16 | a range start url than creating *Link selectors* that would navigate the site.
17 | To specify a range url replace the numeric part of start url with a range
18 | definition - `[1-100]`. If the site uses zero padding in urls then add zero
19 | padding to the range definition - `[001-100]`. If you want to skip some urls
20 | then you can also specify an increment like this `[0-100:10]`.
21 | 
22 | Use range url like this `http://example.com/page/[1-3]` for links like these:
23 | 
24 |  * `http://example.com/page/1`
25 |  * `http://example.com/page/2`
26 |  * `http://example.com/page/3`
27 | 
28 | Use range url with zero padding like this `http://example.com/page/[001-100]`
29 | for links like these:
30 | 
31 |  * `http://example.com/page/001`
32 |  * `http://example.com/page/002`
33 |  * `http://example.com/page/003`
34 | 
35 | Use range url with increment like this `http://example.com/page/[0-100:10]` for
36 | links like these:
37 | 
38 |  * `http://example.com/page/0`
39 |  * `http://example.com/page/10`
40 |  * `http://example.com/page/20`
41 | 
42 | ## Create selectors
43 | 
44 | After you have created the *sitemap* you can add selectors to it. In the
45 | *Selectors* panel you can add new selectors, modify them and navigate the
46 | selector tree.
47 | The selectors can be added in a tree type structure. The web scraper will
48 | execute the selectors in the order how they are organized in the tree
49 | structure. For example there is a news site and you want to scrape all articles
50 | whose links are available on the first page. In image 1 you can see this
51 | example site.
52 | 
53 | ![Fig. 1: News site][image-news-site]
54 | 
55 | To scrape this site you can create a *Link selector* which will extract all
56 | article links in the first page. Then as a child selector you can add a
57 | *Text selector* that will extract articles from the article pages that the
58 | *Link selector* found links to. Image below illustrates how the *sitemap*
59 | should be built for the news site.
60 | 
61 | ![Fig. 2: News site sitemap][image-news-site-sitemap]
62 | 
63 | Note that when creating selectors use Element preview and Data preview features
64 | to ensure that you have selected the correct elements with the correct data.
65 | 
66 | More information about selector tree building is available in selector
67 | documentation. You should at least read about these core selectors:
68 | 
69 |  * [Text selector][text-selector]
70 |  * [Link selector][link-selector]
71 |  * [Element selector][element-selector]
72 | 
73 | ### Inspect selector tree
74 | 
75 | After you have created selectors for the *sitemap* you can inspect the tree
76 | structure of selectors in the Selector graph panel. Image below shows an
77 | example selector graph.
78 | 
79 | ![Fig. 3: News site selector graph][image-news-site-selector-graph]
80 | 
81 | ## Scrape the site
82 | 
83 | After you have created selectors for the *sitemap* you can start scraping. Open
84 | *Scrape* panel and start scraping. A new popup window will open in which the
85 | scraper will load pages and extract data from them. After the scraping is done
86 | the popup window will close and you will be notified with a popup message. You can view
87 | the scraped data by opening *Browse* panel and export it by opening the
88 | *Export data as CSV* panel.
89 | 
90 | 
91 | [image-news-site]: images/scraping-a-site/news-site.png?raw=true
92 | [image-news-site-sitemap]: images/scraping-a-site/news-site-sitemap.png?raw=true
93 | [image-news-site-selector-graph]: images/scraping-a-site/news-site-selector-graph.png?raw=true
94 | [text-selector]: Selectors/Text%20selector.md
95 | [link-selector]: Selectors/Link%20selector.md
96 | [element-selector]: Selectors/Element%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors.md:
--------------------------------------------------------------------------------
 1 | # Selectors
 2 | 
 3 | Web scraper has multiple selectors that can be used for different types of data
 4 | extraction and for different kinds of interaction with the website. The selectors
 5 | can be divided into three groups:
 6 | 
 7 |  * Data extraction selectors for data extraction.
 8 |  * Link selectors for site navigation.
 9 |  * Element selectors for element selection that separate multiple records
10 | 
11 | ### Data extraction selectors
12 | 
13 | Data extraction selectors simply return data from the selected element. 
14 | For example [Text selector] [text-selector] extracts text from
15 | selected element. These selectors can be used as data extraction selectors:
16 | 
17 |  * [Text selector] [text-selector]
18 |  * [Link selector] [link-selector]
19 |  * [Link popup selector] [link-popup-selector]
20 |  * [Image selector] [image-selector]
21 |  * [Table selector] [table-selector]
22 |  * [Element attribute selector] [element-attribute-selector]
23 |  * [HTML selector] [html-selector]
24 |  * [Grouped selector] [grouped-selector]
25 | 
26 | ### Link selectors
27 | 
28 | Link selectors extract URLs from links that can be later opened for data
29 | extraction. For example if in a sitemap tree there is a *Link selector* that has
30 | 3 child text selectors then the Web Scraper will extract all urls with the *Link
31 | selector* and then open each link and use those child data extraction selectors
32 | to extract data. Of course a link selector might have *Link selectors* as child
33 | selectors then these child *Link selectors* would be used for further page
34 | navigation. These are currently available *Link selectors*:
35 | 
36 |  * [Link selector] [link-selector]
37 |  * [Link popup selector] [link-popup-selector]
38 | 
39 | ### Element selectors
40 | 
41 | Element selectors are for selecting elements that contain multiple data elements.
42 | For example an element selector might be used to select a list of items in an
43 | e-commerce site. The selector will return each selected element as a parent
44 | element to its child selectors. An element selector's child selectors will
45 | extract data only within the element that the element selector gave them.
46 | These are currently available Element selectors:
47 | 
48 |  * [Element selector] [element-selector]
49 |  * [Element scroll down selector] [element-scroll-selector]
50 |  * [Element click selector] [element-click-selector]
51 | 
52 | ## Selector configuration options
53 | 
54 | Each selector has configuration options. Here you can see the most common ones.
55 | Configuration options that are specific to a selector are described in
56 | selectors documentation.
57 | 
58 |  * selector - CSS selector that selects an element the selector will be working
59 |  on.
60 |  * multiple - should be checked when multiple records (data rows) are going to
61 |  be extracted with this selector. Data extracted from two or more selectors with 
62 |  multiple checked won't be merged into a single record.
63 |  * delay - delay before selector is being used.
64 |  * parent selectors - configure parent selectors for this selector to make the
65 | selector tree.
66 | 
67 | Note! A common mistake when using multiple configuration option is to create
68 | two selectors alongside with multiple checked and expect that the scraper will
69 | join selector values in pairs. For example if you selected pagination links and
70 | navigation links these links couldn't be logically joined in pairs. The correct
71 | way is to select a wrapper element with Element selector and add data selectors
72 | as child selectors to the element selector with multiple option not checked.
73 | 
74 |  [text-selector]: Selectors/Text%20selector.md
75 |  [link-selector]: Selectors/Link%20selector.md
76 |  [link-popup-selector]: Selectors/Link%20popup%20selector.md
77 |  [image-selector]: Selectors/Image%20selector.md
78 |  [element-attribute-selector]: Selectors/Element%20attribute%20selector.md
79 |  [table-selector]: Selectors/Table%20selector.md
80 |  [grouped-selector]: Selectors/Grouped%20selector.md
81 |  [html-selector]: Selectors/HTML%20selector.md
82 |  [element-selector]: Selectors/Element%20selector.md
83 |  [element-click-selector]: Selectors/Element%20click%20selector.md
84 |  [element-scroll-selector]: Selectors/Element%20scroll%20down%20selector.md
85 | 


--------------------------------------------------------------------------------
/docs/Selectors/Element attribute selector.md:
--------------------------------------------------------------------------------
 1 | # Element attribute selector
 2 | Element attribute selector can extract an attribute's value from an HTML element.
 3 | For example you could use this selector to extract the title attribute from
 4 | this link: `<a href="#" title="my title">link</a>`.
 5 | 
 6 | ## Configuration options
 7 |  * selector - [CSS selector] [css-selector] for the element.
 8 |  * multiple - multiple records are being extracted.
 9 |  * attribute name - the attribute that is going to be extracted. For example
10 |  `title`, `data-id`.
11 | 
12 | ## Use cases
13 | See [Text selector] [text-selector] use cases.
14 | 
15 |  [text-selector]: Text%20selector.md
16 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Element click selector.md:
--------------------------------------------------------------------------------
 1 | # Element click selector
 2 | 
 3 | Element click selector works similarly to
 4 | [Element selector] [element-selector]. Its main purpose is also element
 5 | selection that could be given as parent elements to its child selectors. The only
 6 | difference is that *Element click selector* can interact with the web page by
 7 | clicking on buttons to load new elements. For example a page might use
 8 | JavaScript and AJAX for pagination or item loading.
 9 | 
10 | Note! when selecting clickable elements you should select them by moving the
11 | mouse over the element and pressing "S". This kind of selection will avoid
12 | events triggered by the button.
13 | 
14 | ## Configuration options
15 |  * selector - [CSS selector] [css-selector] for the wrapper elements that will
16 |  be used as parent elements for child selectors.
17 |  * click selector - [CSS selector] [css-selector] for the buttons that need to
18 |  be clicked to load more elements.
19 |  * click type - type of how the selector knows when there will be no new
20 |  elements and clicking should stop.
21 |  * click element uniqueness - type of how selector knows which buttons are 
22 |  already clicked.
23 |  * multiple - multiple records are being extracted (almost always should be
24 |  checked). Multiple option for child selectors usually should not be checked.
25 |  * delay - delay before element selection and delay between clicking. This
26 |  should usually be specified because the data won't be loaded immediately from
27 |  the server. More than 2000 ms might be a good choice if you don't want to
28 |  lose data because the server didn't respond fast enough.
29 |  * Discard initial elements - the selector will not return the elements that
30 |  were available before clicking for the first time. This might be useful for
31 |  duplicate removal.
32 | 
33 | ### Click type
34 | #### Click Once
35 | 
36 | Click Once type will click on the buttons only once. If a new button appears
37 | that can be selected it will be also clicked. For example pagination links
38 | might show pages 1 to 5 but pages 6 to 10 would appear some time later. The
39 | selector will also click on those buttons.
40 | 
41 | #### Click More
42 | 
43 | Click More type makes the selector click on given buttons multiple times
44 | until there are no new elements appearing. A new element is considered an
45 | element that has unique text content.
46 | 
47 | ### Click element uniqueness
48 | 
49 | When using *Click Once* only unique buttons will be clicked. When using 
50 | *Click More* this helps to ignore buttons that don't generate more elements.
51 |  
52 |  * Unique Text - buttons with identical text content are considered equal
53 |  * Unique HTML+Text - buttons with identical HTML and text content are 
54 |  considered equal
55 |  * Unique HTML - buttons with identical HTML and stripped text content are 
56 |  considered equal
57 |  * Unique CSS Selector - buttons with identical CSS Selector are considered equal
58 | 
59 | ## Use cases
60 | 
61 | #### Navigate pagination using "Click once" selector type
62 | 
63 | For example there is a site that displays a list of items and there are some
64 | pagination buttons that reload these items dynamically (after clicking a button
65 | the url doesn't change; changes after the hash tag # don't count). Using *Element
66 | click selector* you can select these items and buttons that need to be clicked.
67 | The scraper during scraping phase will click these buttons to extract all
68 | elements. Also you need to add child selectors for the *Element click selector*
69 | that select data within each element. In figure 1 you can see how to configure
70 | the *Element click selector* to extract data from the described site.
71 | 
72 |  ![Fig. 1: Sitemap when using Click once type][image-click-once]
73 | 
74 | #### Load more items in an e-commerce site by clicking "More" button
75 | 
76 | This example is similar to the one above. The only difference is that in this
77 | site items are loaded by clicking a single button multiple times. In this case
78 | the *Element click selector* should be configured to use "Click more" click
79 | type. In figure 2 you can see how to configure the *Element click selector*
80 | to extract data from this site.
81 | 
82 |  ![Fig. 2: Sitemap when using Click more type][image-click-more]
83 | 
84 |  [image-click-more]: ../images/selectors/element-click/click-more.png?raw=true
85 |  [image-click-once]: ../images/selectors/element-click/click-once.png?raw=true
86 |  [element-selector]: Element%20selector.md
87 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Element scroll down selector.md:
--------------------------------------------------------------------------------
 1 | # Element scroll down selector
 2 | 
 3 | This is another Element selector that works similarly to Element selector but
 4 | additionally it scrolls down the page multiple times to find those elements
 5 | which are added when page is scrolled down to the bottom. Use the delay
 6 | attribute to configure waiting interval between scrolling and element search.
 7 | Scrolling is stopped after no new elements are found. If the page can scroll
 8 | infinitely then this selector will be stuck in an infinite loop.
 9 | 
10 | ## Configuration options
11 | 
12 |  * selector - [CSS selector] [css-selector] for the element.
13 |  * multiple - multiple records are being extracted (almost always should be
14 |  checked). Multiple option for child selectors usually should not be checked.
15 |  * delay - delay before element selection and delay between scrolling. This
16 |  should usually be specified because the data won't be loaded immediately from
17 |  the server after scrolling down. More than 2000 ms might be a good choice if
18 |  you don't want to lose data because the server didn't respond fast enough.
19 | 
20 | ## Use cases
21 | See [Element selector] [element-selector] use cases.
22 | 
23 |  [element-selector]: Element%20selector.md
24 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Element selector.md:
--------------------------------------------------------------------------------
 1 | # Element selector
 2 | 
 3 | Element selector is used to select elements that contain multiple data elements.
 4 | For example element selector might be used to select a list of items in an
 5 | e-commerce site. The selector will return each selected element as a parent
 6 | element to its child selectors. The element selector's child selectors will
 7 | extract data only within the element that the element selector gave them.
 8 | 
 9 | Note! If the page dynamically loads new items after scrolling down or clicking
10 | on a button then you should try these selectors:
11 | 
12 |  * [Element scroll down selector] [element-scroll-selector]
13 |  * [Element click selector] [element-click-selector]
14 | 
15 | ## Configuration options
16 |  * selector - [CSS selector] [css-selector] for the wrapper elements that will
17 |  be used as parent elements for child selectors.
18 |  * multiple - multiple records are being extracted (almost always should be
19 |  checked). Multiple option for child selectors usually should not be checked.
20 | 
21 | ## Use cases
22 | 
23 | #### Select multiple e-commerce items from a page
24 | 
25 | For example an e-commerce site has a page with a list of items. With element
26 | selector you can select the elements that wrap these items and then add
27 | multiple child selectors to it to extract data within the items wrapper 
28 | element. Figure 1 shows how an element selector could be used in this
29 | situation.
30 | 
31 | ![Fig. 1: Multiple items selected with element selector] [multiple-elements-with-text-selectors]
32 | 
33 | #### Extract data from tables
34 | 
35 | Similarly to e-commerce item selection you can also select table rows and add
36 | child selectors for data extraction from table cells.
37 | Though [Table selector] [table-selector] might be a much better solution.
38 | 
39 |  [css-selector]: ../CSS%20selector.md
40 |  [element-scroll-selector]: Element%20scroll%20down%20selector.md
41 |  [element-click-selector]: Element%20click%20selector.md
42 |  [table-selector]: Table%20selector.md
43 |  [multiple-elements-with-text-selectors]: ../images/selectors/text/text-selector-multiple-elements-with-text-selectors.png?raw=true


--------------------------------------------------------------------------------
/docs/Selectors/Grouped selector.md:
--------------------------------------------------------------------------------
 1 | # Grouped selector
 2 | 
 3 | Grouped selector can group text data from multiple elements into one record.
 4 | The extracted data will be stored as JSON.
 5 | 
 6 | ## Configuration options
 7 |  * selector - [CSS selector] [css-selector] for the elements whose text will be
 8 |  extracted and stored in JSON format.
 9 |  * attribute name - optionally this selector can extract an attribute of the
10 |  selected element. If specified the extractor will also add this attribute to
11 |  the resulting JSON.
12 | 
13 | ## Use cases
14 | 
15 | #### Extract article references
16 | 
17 | For example you are extracting a news article that might have multiple
18 | reference links. If you are selecting these links with link selector with
19 | multiple checked you would get duplicate articles in the result set where each
20 | record would contain one reference link. Using grouped selector you could
21 | serialize all these reference links into one record. To do that select all
22 | reference links and set attribute name to `href` to also extract links to these
23 | sites.
24 | 
25 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/HTML selector.md:
--------------------------------------------------------------------------------
 1 | # HTML selector
 2 | HTML selector can extract HTML and text within the selected element. Only the
 3 | inner HTML of the element will be extracted.
 4 | 
 5 | ## Configuration options
 6 |  * selector - [CSS selector] [css-selector] for the element whose inner HTML
 7 |  will be extracted.
 8 |  * multiple - multiple records are being extracted.
 9 | 
10 | ## Use cases
11 | See [Text selector] [text-selector] use cases.
12 | 
13 |  [text-selector]: Text%20selector.md
14 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Image selector.md:
--------------------------------------------------------------------------------
 1 | # Image selector
 2 | Image selector can extract `src` attribute (URL) of an image. 
 3 | Optionally you can also store the images. The images will be stored in your
 4 | downloads directory:
 5 | 
 6 | `Downloads/<sitemap-id>/<selector-id>/<image filename.jpg>`
 7 | 
 8 | Note! When selecting CSS selector for image selector all the images within the
 9 | site are moved to the top. If this feature somehow breaks the site's layout please
10 | report it as a bug.
11 | 
12 | ## Configuration options
13 |  * selector - [CSS selector] [css-selector] for the image element.
14 |  * multiple - multiple records are being extracted. Usually should not be
15 |  checked for Image selector.
16 |  * download image - downloads and stores images on the local drive. When CouchDB
17 |  storage back end is used the image is also stored locally.
18 | 
19 | ## Use cases
20 | See [Text selector] [text-selector] use cases.
21 | 
22 |  [text-selector]: Text%20selector.md
23 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Link popup selector.md:
--------------------------------------------------------------------------------
 1 | # Link popup selector
 2 | 
 3 | *Link popup selector* works similarly to [Link selector] [link-selector]. It can
 4 | be used for url extraction and site navigation. The only difference is that
 5 | *Link popup selector* should be used when clicking on a link the site opens a new
 6 | window (popup) instead of loading the URL in the same tab or opening it in a
 7 | new tab. This selector will catch the popup creation event and extract the URL.
 8 | If the site creates a visual popup but not a real window then you should try
 9 | [Element click selector] [element-click-selector].
10 | 
11 | Note! when selecting these link elements you can move the mouse over the 
12 | element and press "S" to select it to prevent it from opening a popup.
13 | 
14 | ## Use cases
15 | See [Link selector] [link-selector] use cases.
16 | 
17 |  [link-selector]: Link%20selector.md
18 |  [element-click-selector]: Element%20click%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Link selector.md:
--------------------------------------------------------------------------------
 1 | # Link selector
 2 | 
 3 | Link selector is used for link selection and website navigation. If you use
 4 | *Link selector* without any child selectors then it will extract the link and
 5 | the href attribute of the link. If you add child selectors to *Link selector*
 6 | then these child selectors will be used in the page that this link was leading
 7 | to. If you are selecting multiple links then check *multiple* property.
 8 | 
 9 | Note! Link selector works only with `<a>` tags with `href` attribute. If the
10 | link selector is not working for you then you can try these workarounds:
11 | 
12 |  1. Check that the link in the url bar changes after clicking an item (changes
13 |  only after hash tag doesn't count). If the link doesn't change then the site
14 |  is probably using ajax for data loading. Instead of using link selector you
15 |  should use [Element click selector] [element-click].
16 |  2. If the site opens a popup then you should use
17 |  [Link popup selector] [link-popup]
18 |  3. The site might be using JavaScript `window.location` to change the URL. Web
19 |  Scraper cannot handle this kind of navigation right now.
20 | 
21 | ## Configuration options
22 | 
23 |  * selector - [CSS selector] [css-selector] for the link element from which the
24 |  link for navigation will be extracted.
25 |  * multiple - multiple records are being extracted. Usually should be checked.
26 | 
27 | ## Use cases
28 | 
29 | **Navigate through multiple levels of navigation**
30 | 
31 | For example an e-commerce site has multi level navigation -
32 | `categories -> subcategories`. To scrape data from all categories and
33 | subcategories you can create two *Link selectors*. One selector would select
34 | category links and the other selector would select subcategory links that are
35 | available in the category pages. The subcategory *Link selector* should be made
36 | as a child of the category *Link selector*. The selectors for data extraction
37 | from subcategory pages should be made child selectors of the subcategory
38 | selector.
39 | 
40 | ![Fig. 1: Multiple link selectors for category navigation][multiple-level-link-selectors]
41 | 
42 | **Handle pagination**
43 | 
44 | For example an e-commerce site has multiple categories. Each category has a
45 | list of items and pagination links. Also some pages are not directly available
46 | from the category but are available from pagination pages (you can see
47 | pagination links 1-5, but not 6-8). You can start by building a sitemap that
48 | visits each category and extracts items from the category page. This sitemap will
49 | extract items only from the first pagination page. To extract items from all of
50 | the pagination links including the ones that are not visible at the beginning
51 | you need to create another *Link selector* that selects the pagination links.
52 | Figure 2 shows how the link selector should be created in the sitemap. When
53 | the scraper opens a category link it will extract items that are available in
54 | the page. After that it will find the pagination links and also visit those. If
55 | the pagination link selector is made a child to itself it will recursively
56 | discover all pagination pages. Figure 3 shows a selector graph where you can
57 | see how pagination links discover more pagination links and more data.
58 | 
59 | ![Fig. 2: Sitemap with Link selector for pagination][pagination-link-selectors]
60 | ![Fig. 3: Selector graph with pagination][pagination-selector-graph]
61 | 
62 |  [multiple-level-link-selectors]: ../images/selectors/link/multiple-level-link-selectors.png?raw=true
63 |  [pagination-link-selectors]: ../images/selectors/link/pagination-link-selectors.png?raw=true
64 |  [pagination-selector-graph]: ../images/selectors/link/pagination-selector-graph.png?raw=true
65 |  [element-click]: Element%20click%20selector.md
66 |  [link-popup]: Link%20popup%20selector.md
67 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Table selector.md:
--------------------------------------------------------------------------------
 1 | # Table selector
 2 | 
 3 | Table selector can extract data from tables. *Table selector* has 3
 4 | configurable CSS selectors. The selector is for table selection. After you have 
 5 | selected the selector the *Table selector* will try to guess selectors
 6 | for header row and data rows. You can click Element preview on those selectors
 7 | to see whether the *Table selector* found table header and data rows correctly.
 8 | The header row selector is used to identify table columns when data is
 9 | extracted from multiple pages. Also you can rename table columns. Figure 1
10 | shows what you should select when extracting data from a table.
11 | 
12 | ![Fig. 1: Selectors for table selector] [table-selector-selectors]
13 | 
14 | ## Configuration options
15 |  * selector - [CSS selector] [css-selector] for the table element.
16 |  * header row selector - [CSS selector] [css-selector] for table header row.
17 |  * data rows selector - [CSS selector] [css-selector] for table data rows.
18 |  * multiple - multiple records are being extracted. Usually should be
19 |  checked for Table selector because you are extracting multiple rows.
20 | 
21 | ## Use cases
22 | See [Text selector] [text-selector] use cases.
23 | 
24 |  [table-selector-selectors]: ../images/selectors/table/selectors.png?raw=true
25 |  [text-selector]: Text%20selector.md
26 |  [css-selector]: ../CSS%20selector.md


--------------------------------------------------------------------------------
/docs/Selectors/Text selector.md:
--------------------------------------------------------------------------------
 1 | # Text selector
 2 | 
 3 | Text selector is used for text selection. The text selector will extract text
 4 | from the selected element and from all its child elements. HTML will be
 5 | stripped and only text will be returned. Selector will ignore text within
 6 | `<script>` and `<style>` tags. New line `<br>` tags will be replaced with
 7 | newline characters. You can additionally apply a regular expression to
 8 | resulting data.
 9 | 
10 | ## Configuration options
11 | 
12 |  * selector - [CSS selector] [css-selector] for the element from which data
13 |  will be extracted.
14 |  * multiple - multiple records are being extracted. Usually should not be
15 |  checked. If you want to use multiple text selectors within one page with
16 |  multiple checked then you might actually need
17 |  [Element selector] [element-selector].
18 |  * regex - regular expression to extract a substring from the result.
19 | 
20 | ### Regex
21 | 
22 | The regular expression attribute can be used to extract a substring of the text
23 | that the selector extracts. When a regular expression is used the whole match
24 | (group 0) will be returned as a result. [www.regexr.com] [regex-site] is a
25 | great site where you can learn about regular expressions and try them out.
26 | 
27 | Here are some examples that you might find useful:
28 | 
29 | | text             	| regex                          	| result     	|
30 | |------------------	|--------------------------------	|------------	|
31 | | price: 14.99$    	| `[0-9]+\.[0-9]+`               	| 14.99      	|
32 | | id: H83JKDX4     	| `[A-Z0-9]{8}`                  	| H83JKDX4   	|
33 | | date: 2014-08-20 	| `[0-9]{4}\-[0-9]{2}\-[0-9]{2}` 	| 2014-08-20 	|
34 | 
35 | ## Use cases
36 | **Extract one record per page with multiple text selectors**
37 | 
38 | For example you are scraping a news site that has one article per page. The page
39 | might contain the article, its title, date published and the author. A
40 | *Link selector* can navigate the scraper to each of these article pages.
41 | Multiple text selectors can extract the title, date, author and article.
42 | *Multiple* option should be left unchecked for text selectors because each page
43 | is extracting only one record.
44 | 
45 | ![Fig. 1: Multiple text selectors per page][text-selector-multiple-single-text-selectors-in-one-page]
46 | 
47 | **Extract multiple items with multiple text selectors per page**
48 | 
49 | E-commerce sites usually have multiple items per page. If you want to scrape
50 | these items you will need an *Element selector* that selects item wrapper
51 | elements and multiple text selectors that select data within each item wrapper
52 | element.
53 | 
54 | ![Fig. 2: Multiple elements with text selectors. Some arrows are skipped.][text-selector-multiple-elements-with-text-selectors]
55 | 
56 | **Extract multiple text records per page**
57 | 
58 | For example you want to extract comments for an article. There are multiple
59 | comments in a single page and you only need the comment text (If you would need
60 | other comment attributes then see the example above). You can use
61 | *Text selector* to extract these comments. The *Text selectors* multiple
62 | attribute should be checked because you will be extracting multiple records.
63 | 
64 | ![Fig. 3: Text selector selects multiple comments][text-selector-multiple-per-page]
65 | 
66 | 
67 |  [regex-site]: http://www.regexr.com/
68 |  [text-selector-multiple-single-text-selectors-in-one-page]: ../images/selectors/text/text-selector-multiple-single-text-selectors-in-one-page.png?raw=true
69 |  [text-selector-multiple-elements-with-text-selectors]: ../images/selectors/text/text-selector-multiple-elements-with-text-selectors.png?raw=true
70 |  [text-selector-multiple-per-page]: ../images/selectors/text/text-selector-multiple-per-page.png?raw=true
71 |  [element-selector]: Element%20selector.md
72 |  [css-selector]: ../CSS%20selector.md
73 | 


--------------------------------------------------------------------------------
/docs/Storage backends.md:
--------------------------------------------------------------------------------
 1 | # Storage backends
 2 | 
 3 | Web scraper can be configured to use either local storage or CouchDB. By
 4 | default all data is stored in the local storage.
 5 | 
 6 | ## Local storage
 7 | 
 8 | Local storage backend uses the browser's built-in database to store data. This data
 9 | is not replicated from one chrome instance to another.
10 | 
11 | ## CouchDB
12 | 
13 | [CouchDB] [couchdb] is a RESTful NoSQL JavaScript database. You can configure
14 | the extension to store sitemaps and scraped data in this database. The data
15 | then could be accessible from all your chrome instances. To do that
16 | you need to configure it in the options page. You can open it by right-clicking
17 | the extension's icon and selecting options. There you can switch between storage
18 | backends. For CouchDB you need to configure the database where sitemaps
19 | will be stored and the CouchDB server where scraped data will be stored.
20 | For example you can configure it like this:
21 | 
22 |  * sitemap db - http://localhost:5984/scraper-sitemaps
23 |  * data db - http://localhost:5984/
24 | 
25 |  [couchdb]: http://couchdb.apache.org/
26 | 


--------------------------------------------------------------------------------
/docs/images/chrome-store-logo-920x680.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/chrome-store-logo-920x680.png


--------------------------------------------------------------------------------
/docs/images/chrome-store-logo-920x680.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/chrome-store-logo-920x680.xcf


--------------------------------------------------------------------------------
/docs/images/chrome-store-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/chrome-store-logo.png


--------------------------------------------------------------------------------
/docs/images/chrome-store-logo.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/chrome-store-logo.xcf


--------------------------------------------------------------------------------
/docs/images/open-web-scraper/open-web-scraper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/open-web-scraper/open-web-scraper.png


--------------------------------------------------------------------------------
/docs/images/scraping-a-site/news-site-selector-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/scraping-a-site/news-site-selector-graph.png


--------------------------------------------------------------------------------
/docs/images/scraping-a-site/news-site-sitemap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/scraping-a-site/news-site-sitemap.png


--------------------------------------------------------------------------------
/docs/images/scraping-a-site/news-site.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/scraping-a-site/news-site.png


--------------------------------------------------------------------------------
/docs/images/selectors/element-click/click-more.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/element-click/click-more.png


--------------------------------------------------------------------------------
/docs/images/selectors/element-click/click-once.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/element-click/click-once.png


--------------------------------------------------------------------------------
/docs/images/selectors/link/multiple-level-link-selectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/link/multiple-level-link-selectors.png


--------------------------------------------------------------------------------
/docs/images/selectors/link/pagination-link-selectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/link/pagination-link-selectors.png


--------------------------------------------------------------------------------
/docs/images/selectors/link/pagination-selector-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/link/pagination-selector-graph.png


--------------------------------------------------------------------------------
/docs/images/selectors/table/selectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/table/selectors.png


--------------------------------------------------------------------------------
/docs/images/selectors/table/table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/table/table.png


--------------------------------------------------------------------------------
/docs/images/selectors/text/text-selector-multiple-elements-with-text-selectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/text/text-selector-multiple-elements-with-text-selectors.png


--------------------------------------------------------------------------------
/docs/images/selectors/text/text-selector-multiple-per-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/text/text-selector-multiple-per-page.png


--------------------------------------------------------------------------------
/docs/images/selectors/text/text-selector-multiple-single-text-selectors-in-one-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/selectors/text/text-selector-multiple-single-text-selectors-in-one-page.png


--------------------------------------------------------------------------------
/docs/images/sitemap-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/docs/images/sitemap-tree.png


--------------------------------------------------------------------------------
/docs/images/store-logo-sources.txt:
--------------------------------------------------------------------------------
1 | http://jsfiddle.net/t8Sgq/
2 | http://jsfiddle.net/qpVkY/
3 | 


--------------------------------------------------------------------------------
/extension/assets/LICENSE-d3-js:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Michael Bostock
 2 | All rights reserved.
 3 | 
 4 | 	Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 	* Redistributions of source code must retain the above copyright notice, this
 8 | list of conditions and the following disclaimer.
 9 | 
10 | 	* Redistributions in binary form must reproduce the above copyright notice,
11 | 	this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 | 
14 | 	* The name Michael Bostock may not be used to endorse or promote products
15 | derived from this software without specific prior written permission.
16 | 
17 | 	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
21 | 	INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 | 	BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | 	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26 | 	EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/extension/assets/LICENSE-icanhaz-js:
--------------------------------------------------------------------------------
 1 | ICanHaz.js is Copyright (c) 2010 Henrik Joreteg and is MIT licensed.
 2 | 
 3 | In my best attempt to comply with instructions I'm including the following license notice from Mustache and Mustache.js:
 4 | ---------------------------------------------------------------------
 5 | Copyright (c) 2009 Chris Wanstrath (Ruby)
 6 | Copyright (c) 2010 Jan Lehnardt (JavaScript)
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining
 9 | a copy of this software and associated documentation files (the
10 | "Software"), to deal in the Software without restriction, including
11 | without limitation the rights to use, copy, modify, merge, publish,
12 | distribute, sublicense, and/or sell copies of the Software, and to
13 | permit persons to whom the Software is furnished to do so, subject to
14 | the following conditions:
15 | 
16 | The above copyright notice and this permission notice shall be
17 | included in all copies or substantial portions of the Software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 | ---------------------------------------------------------------------


--------------------------------------------------------------------------------
/extension/assets/LICENSE-jquery-js:
--------------------------------------------------------------------------------
 1 | Copyright 2013 jQuery Foundation and other contributors
 2 | http://jquery.com/
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining
 5 | a copy of this software and associated documentation files (the
 6 | "Software"), to deal in the Software without restriction, including
 7 | without limitation the rights to use, copy, modify, merge, publish,
 8 | distribute, sublicense, and/or sell copies of the Software, and to
 9 | permit persons to whom the Software is furnished to do so, subject to
10 | the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/extension/assets/LICENSE-sugar-js:
--------------------------------------------------------------------------------
1 | Copyright © 2011 Andrew Plummer
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sub-license, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | The above copyright notice, and every other copyright notice found in this software, and all the attributions in every file, and this permission notice shall be included in all copies or substantial portions of the Software.
5 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/extension/assets/base64.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | /**
 3 |  * @url http://jsperf.com/blob-base64-conversion
 4 |  * @type {{blobToBase64: blobToBase64, base64ToBlob: base64ToBlob}}
 5 |  */
 6 | var Base64 = {
 7 | 
 8 |   blobToBase64: function (blob) {
 9 |     var deferredResponse = jquery.Deferred()
10 |     var reader = new FileReader()
11 |     reader.onload = function () {
12 |       var dataUrl = reader.result
13 |       var base64 = dataUrl.split(',')[1]
14 |       deferredResponse.resolve(base64)
15 |     }
16 |     reader.readAsDataURL(blob)
17 | 
18 |     return deferredResponse.promise()
19 |   },
20 | 
21 |   base64ToBlob: function (base64, mimeType) {
22 |     var deferredResponse = jquery.Deferred()
23 |     var binary = atob(base64)
24 |     var len = binary.length
25 |     var buffer = new ArrayBuffer(len)
26 |     var view = new Uint8Array(buffer)
27 |     for (var i = 0; i < len; i++) {
28 |       view[i] = binary.charCodeAt(i)
29 |     }
30 |     var blob = new Blob([view], {type: mimeType})
31 |     deferredResponse.resolve(blob)
32 | 
33 |     return deferredResponse.promise()
34 |   }
35 | }
36 | 
37 | module.exports = Base64
38 | 


--------------------------------------------------------------------------------
/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.eot


--------------------------------------------------------------------------------
/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.ttf


--------------------------------------------------------------------------------
/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/bootstrap-3.0.0/fonts/glyphicons-halflings-regular.woff


--------------------------------------------------------------------------------
/extension/assets/images/LICENSE:
--------------------------------------------------------------------------------
1 | icons source:
2 | https://www.iconfinder.com/iconsets/free-grey-cloud-icons#readme
3 | https://www.iconfinder.com/icons/129397/spider_web_icon#size=96
4 | 
5 | license:
6 | http://creativecommons.org/licenses/by/3.0/legalcode
7 | 


--------------------------------------------------------------------------------
/extension/assets/images/icon128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/images/icon128.png


--------------------------------------------------------------------------------
/extension/assets/images/icon16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/images/icon16.png


--------------------------------------------------------------------------------
/extension/assets/images/icon19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/images/icon19.png


--------------------------------------------------------------------------------
/extension/assets/images/icon38.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/images/icon38.png


--------------------------------------------------------------------------------
/extension/assets/images/icon48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoblink/web-scraper-chrome-extension/80d925c7a07f024b6be0870aa4a4d9b1a5f302b4/extension/assets/images/icon48.png


--------------------------------------------------------------------------------
/extension/assets/jquery.bootstrapvalidator/bootstrapValidator.css:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * BootstrapValidator (http://bootstrapvalidator.com)
 3 |  * The best jQuery plugin to validate form fields. Designed to use with Bootstrap 3
 4 |  *
 5 |  * @author      http://twitter.com/nghuuphuoc
 6 |  * @copyright   (c) 2013 - 2014 Nguyen Huu Phuoc
 7 |  * @license     MIT
 8 |  */
 9 | 
10 | .bv-form .help-block {
11 |     margin-bottom: 0;
12 | }
13 | .bv-form .tooltip-inner {
14 |     text-align: left;
15 | }
16 | .nav-tabs li.bv-tab-success > a {
17 |     color: #3c763d;
18 | }
19 | .nav-tabs li.bv-tab-error > a {
20 |     color: #a94442;
21 | }
22 | 


--------------------------------------------------------------------------------
/extension/assets/jquery.whencallsequentially.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | /**
 3 |  * @author Martins Balodis
 4 |  *
 5 |  * An alternative version of $.when which can be used to execute asynchronous
 6 |  * calls sequentially one after another.
 7 |  *
 8 |  * @returns jqueryDeferred().promise()
 9 |  */
10 | module.exports = function whenCallSequentially (functionCalls) {
11 |   var deferredResonse = jquery.Deferred()
12 |   var resultData = []
13 | 
14 | 	// nothing to do
15 |   if (functionCalls.length === 0) {
16 |     return deferredResonse.resolve(resultData).promise()
17 |   }
18 | 
19 |   var currentDeferred = functionCalls.shift()()
20 | 	// execute synchronous calls synchronously
21 |   while (currentDeferred.state() === 'resolved') {
22 |     currentDeferred.done(function (data) {
23 |       resultData.push(data)
24 |     })
25 |     if (functionCalls.length === 0) {
26 |       return deferredResonse.resolve(resultData).promise()
27 |     }
28 |     currentDeferred = functionCalls.shift()()
29 |   }
30 | 
31 | 	// handle async calls
32 |   var interval = setInterval(function () {
33 | 		// handle mixed sync calls
34 |     while (currentDeferred.state() === 'resolved') {
35 |       currentDeferred.done(function (data) {
36 |         resultData.push(data)
37 |       })
38 |       if (functionCalls.length === 0) {
39 |         clearInterval(interval)
40 |         deferredResonse.resolve(resultData)
41 |         break
42 |       }
43 |       currentDeferred = functionCalls.shift()()
44 |     }
45 |   }, 10)
46 | 
47 |   return deferredResonse.promise()
48 | }
49 | 


--------------------------------------------------------------------------------
/extension/content_script/contentScraperHeadlessBundler.js:
--------------------------------------------------------------------------------
 1 | const browserify = require('browserify')
 2 | const path = require('path')
 3 | // caching
 4 | let bundle
 5 | module.exports = {getBundle}
 6 | 
 7 | function getBundle () {
 8 |   return new Promise(function (resolve, reject) {
 9 |     if (bundle) {
10 |       return resolve(bundle)
11 |     }
12 |     const content = []
13 |     browserify({
14 |       standalone: 'webScraper',
15 |       entries: [
16 |         path.join(__dirname, './content_scraper_browser.js')
17 |       ]
18 |     }).bundle().on('error', function (err) {
19 |       reject(err)
20 |     }).on('data', function (buffer) {
21 |       content.push(buffer)
22 |     }).on('end', function () {
23 |       const buffer = Buffer.concat(content)
24 |       const result = buffer.toString()
25 |       bundle = result
26 |       resolve(result)
27 |     })
28 |   })
29 | }
30 | 


--------------------------------------------------------------------------------
/extension/content_script/content_scraper.js:
--------------------------------------------------------------------------------
 1 | var DataExtractor = require('./../scripts/DataExtractor')
 2 | var getContentScript = require('./../scripts/getContentScript')
 3 | const debug = require('debug')('web-scraper-headless:content_scraper')
 4 | function extensionListener (request, sender, sendResponse, options) {
 5 |   var $ = options.$
 6 |   var document = options.document
 7 |   var window = options.window
 8 |   debug('chrome.runtime.onMessage', request)
 9 | 
10 |   if (request.extractData) {
11 |     debug('received data extraction request', request)
12 |     var extractor = new DataExtractor(request, {$, window, document})
13 |     var deferredData = extractor.getData()
14 |     deferredData.done(function (data) {
15 |       debug('dataextractor data', data)
16 |       sendResponse(data)
17 |     })
18 |     return true
19 |   } else if (request.previewSelectorData) {
20 |     debug('received data-preview extraction request', request)
21 |     var extractor = new DataExtractor(request, {$, document, window})
22 |     var deferredData = extractor.getSingleSelectorData(request.parentSelectorIds, request.selectorId)
23 |     deferredData.done(function (data) {
24 |       debug('dataextractor data', data)
25 |       sendResponse(data)
26 |     })
27 |     return true
28 |   }
29 |   // Universal ContentScript communication handler
30 |   else if (request.contentScriptCall) {
31 |     var contentScript = getContentScript('ContentScript')
32 | 
33 |     debug('received ContentScript request', request)
34 | 
35 |     var deferredResponse = contentScript[request.fn](request.request, {$, document, window})
36 |     deferredResponse.done(function (response) {
37 |       sendResponse(response)
38 |     })
39 | 
40 |     return true
41 |   }
42 | }
43 | 
44 | module.exports = extensionListener
45 | 


--------------------------------------------------------------------------------
/extension/content_script/content_scraper_browser.js:
--------------------------------------------------------------------------------
1 | const listener = require('./content_scraper')
2 | const $ = require('jquery')
3 | module.exports = function (request, sender, sendResponse) {
4 |   listener(request, sender, sendResponse, {$, window, document})
5 |   // important so that chrome knows the listener is async
6 |   return true
7 | }
8 | 


--------------------------------------------------------------------------------
/extension/content_script/content_script.js:
--------------------------------------------------------------------------------
1 | chrome.runtime.onMessage.addListener(contentScraper)
2 | 


--------------------------------------------------------------------------------
/extension/devtools/devtools_init_page.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <body>
3 | <script type="text/javascript" src="devtools_init_page.js"></script>
4 | </body>
5 | </html>


--------------------------------------------------------------------------------
/extension/devtools/devtools_init_page.js:
--------------------------------------------------------------------------------
1 | console.log('loading devtools')
2 | chrome.devtools.panels.create('Web Scraper Headless', '../assets/images/icon48.png', 'devtools/devtools_scraper_panel.html')
3 | 


--------------------------------------------------------------------------------
/extension/devtools/devtools_scraper_panel.css:
--------------------------------------------------------------------------------
 1 | /*body > form, body > div {*/
 2 |     /*display:none;*/
 3 | /*}*/
 4 | 
 5 | a, tbody tr {
 6 |     cursor: pointer;
 7 | }
 8 | 
 9 | 
10 | .selector-list-tpl, .sitemap-list-tpl {
11 |     display:none
12 | }
13 | 
14 | /**
15 |  * Compact elements
16 |  */
17 | .navbar-nav>li>a {
18 |     padding-top: 3px;
19 |     padding-bottom: 3px;
20 | }
21 | 
22 | .navbar-text {
23 | 	margin-top:4px;
24 | 	margin-bottom:4px;
25 | 	padding-right:3px;
26 | }
27 | 
28 | .navbar {
29 |     min-height:26px;
30 |     margin-bottom: 6px;
31 | }
32 | .table-condensed tbody>tr>td {
33 |     padding:1px 5px;
34 | }
35 | 
36 | body {
37 |     font-size: 12px;
38 | }
39 | 
40 | form .form-control {
41 |     font-size: 12px;
42 |     padding: 3px 12px;
43 |     height: 25px;
44 | }
45 | 
46 | textarea.form-control {
47 |     height: auto;
48 | }
49 | 
50 | form .btn {
51 |     font-size: 12px;
52 |     padding: 3px 12px;
53 | }
54 | 
55 | form .form-group {
56 |     margin-bottom:5px;
57 | }
58 | 
59 | form select[multiple], select[size] {
60 |     height: auto;
61 | }
62 | 
63 | #selector-graph .node circle {
64 | 	cursor: pointer;
65 | 	fill: #fff;
66 | 	stroke: steelblue;
67 | 	stroke-width: 1px;
68 | }
69 | 
70 | #selector-graph .node text {
71 | 	font-size: 11px;
72 | }
73 | 
74 | #selector-graph path.link {
75 | 	fill: none;
76 | 	stroke: #ccc;
77 | 	stroke-width: 1px;
78 | }
79 | 
80 | .data-preview-modal .modal-dialog {
81 | 	width:auto;
82 | }
83 | 
84 | .data-preview-modal .modal-body {
85 | 	overflow-y:scroll;
86 | }
87 | 
88 | .data-preview-modal tbody tr {
89 | 	cursor: initial;
90 | }


--------------------------------------------------------------------------------
/extension/devtools/devtools_scraper_panel.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 | 	<link rel="stylesheet" href="../assets/bootstrap-3.0.0/css/bootstrap.css">
 5 | 	<link rel="stylesheet" href="../assets/jquery.bootstrapvalidator/bootstrapValidator.css">
 6 | 	<link rel="stylesheet" href="devtools_scraper_panel.css">
 7 | 	<script src="../assets/jquery-2.0.3.js"></script>
 8 | 	<script src="../assets/pouchdb-nightly.min.js"></script>
 9 | 	<script src="../assets/bootstrap-3.0.0/js/bootstrap.js"></script>
10 | 	<!--<script src="../assets/d3.v3.js" charset="UTF-8"></script>-->
11 | 	<script src="../assets/ICanHaz.js"></script>
12 | 	<script src="../assets/jquery.bootstrapvalidator/bootstrapValidator.js"></script>
13 | 	<script src="../generated/devtools-scraper.js"></script>
14 | 
15 | </head>
16 | <body></body>
17 | </html>


--------------------------------------------------------------------------------
/extension/devtools/views/DataPreview.html:
--------------------------------------------------------------------------------
 1 | <div class="modal fade data-preview-modal">
 2 | 	<div class="modal-dialog">
 3 | 		<div class="modal-content">
 4 | 			<div class="modal-header">
 5 | 				<button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
 6 | 				<h4 class="modal-title">data Preview</h4>
 7 | 			</div>
 8 | 			<div class="modal-body">
 9 | 				<table class="table">
10 | 				<thead>
11 | 					<tr>
12 | 						{{#columns}}
13 | 						<th>{{.}}</th>
14 | 						{{/columns}}
15 | 					</tr>
16 | 				</thead>
17 | 				<tbody>
18 | 				</tbody>
19 | 				</table>
20 | 			</div>
21 | 		</div>
22 | 	</div>
23 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SelectorEditTableColumn.html:
--------------------------------------------------------------------------------
1 | <tr>
2 | 	<td><input class="column-header" type="hidden" name="column[header][]" value="{{header}}">{{header}}</td>
3 | 	<td><input class="column-name form-control" type="text" name="column[name][]" value="{{name}}"></td>
4 | 	<td><input class="column-extract" type="checkbox" name="column[extract][]" {{#extract}}checked="checked"{{/extract}}></td>
5 | </tr>


--------------------------------------------------------------------------------
/extension/devtools/views/SelectorList.html:
--------------------------------------------------------------------------------
 1 | <div id="selector-tree">
 2 | 	<ol class="breadcrumb">
 3 | 		{{#parentSelectors}}
 4 | 		<li><a>{{id}}</a></li>
 5 | 		{{/parentSelectors}}
 6 | 	</ol>
 7 | 	<table class="table table-bordered table-condensed table-hover">
 8 | 		<thead>
 9 | 		<tr>
10 | 			<th>ID</th>
11 | 			<th>Selector</th>
12 | 			<th style="width:60px">type</th>
13 | 			<th style="width:60px">Multiple</th>
14 | 			<th>Parent selectors</th>
15 | 			<th style="width:295px">Actions</th>
16 | 		</tr>
17 | 		</thead>
18 | 		<tbody></tbody>
19 | 	</table>
20 | 	<button action="add-selector" type="button" class="btn btn-primary btn-xs">Add new selector</button>
21 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SelectorListItem.html:
--------------------------------------------------------------------------------
 1 | <tr>
 2 | 	<td>{{id}}</td>
 3 | 	<td>{{selector}}</td>
 4 | 	<td>{{type}}</td>
 5 | 	<td>{{multiple}}</td>
 6 | 	<td>{{parentSelectors}}</td>
 7 | 	<td>
 8 | 		<button action="preview-selector" type="button" class="btn btn-primary btn-xs">Element preview</button>
 9 | 		<button action="data-preview-selector" type="button" class="btn btn-primary btn-xs">Data preview</button>
10 | 		<button action="edit-selector" type="button" class="btn btn-primary btn-xs">Edit</button>
11 | 		<button action="delete-selector" type="button" class="btn btn-primary btn-xs">Delete</button>
12 | 	</td>
13 | </tr>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapBrowseData.html:
--------------------------------------------------------------------------------
 1 | <div id="sitemap-data">
 2 | 	<table class="table table-bordered table-condensed table-hover">
 3 | 		<thead>
 4 | 		<tr>
 5 | 			{{#columns}}
 6 | 			<th>{{.}}</th>
 7 | 			{{/columns}}
 8 | 		</tr>
 9 | 		</thead>
10 | 		<tbody>
11 | 		</tbody>
12 | 	</table>
13 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapCreate.html:
--------------------------------------------------------------------------------
 1 | <form class="form-horizontal" role="form" id="create-sitemap">
 2 | 	<div class="form-group">
 3 | 		<label for="_id" class="col-lg-1 control-label">Sitemap name</label>
 4 | 
 5 | 		<div class="col-lg-10">
 6 | 			<input type="text" class="form-control" name="_id" id="_id" placeholder="Sitemap name">
 7 | 		</div>
 8 | 	</div>
 9 | 	<div class="form-group start-url-block">
10 | 		<label class="col-lg-1 control-label">Start URL</label>
11 | 		<div class="col-lg-10">
12 | 			<div class="input-group">
13 | 				<input type="text" class="form-control input-start-url" name="startUrl[]" placeholder="URL">
14 | 				<span class="input-group-btn">
15 | 					<button class="btn btn-default remove-start-url" type="button">-</button>
16 | 					<button class="btn btn-default add-extra-start-url" type="button">+</button>
17 | 				</span>
18 | 			</div>
19 | 		</div>
20 | 	</div>
21 | 	<div class="form-group">
22 | 		<div class="col-lg-offset-1 col-lg-10">
23 | 			<button type="submit" class="btn btn-default" id="submit-create-sitemap">Create Sitemap</button>
24 | 		</div>
25 | 	</div>
26 | </form>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapEditMetadata.html:
--------------------------------------------------------------------------------
 1 | <div id="edit-sitemap">
 2 | 	<form class="form-horizontal" role="form" id="edit-sitemap-metadata-form">
 3 | 		<div class="form-group">
 4 | 			<label for="edit_sitemap_id" class="col-lg-1 control-label">Sitemap name</label>
 5 | 
 6 | 			<div class="col-lg-10">
 7 | 				<input type="text" class="form-control" name="_id" id="edit_sitemap_id" placeholder="Sitemap name" value="{{_id}}">
 8 | 			</div>
 9 | 		</div>
10 | 		{{#startUrl.push}}
11 | 			{{#startUrl}}
12 | 				<div class="form-group start-url-block">
13 | 					<label class="col-lg-1 control-label">Start URL</label>
14 | 					<div class="col-lg-10">
15 | 						<div class="input-group">
16 | 							<input type="text" class="form-control input-start-url" name="startUrl[]" placeholder="URL" value="{{.}}">
17 | 						<span class="input-group-btn">
18 | 							<button class="btn btn-default remove-start-url" type="button">-</button>
19 | 							<button class="btn btn-default add-extra-start-url" type="button">+</button>
20 | 						</span>
21 | 						</div>
22 | 					</div>
23 | 				</div>
24 | 			{{/startUrl}}
25 | 		{{/startUrl.push}}
26 | 		{{^startUrl.push}}
27 | 			<div class="form-group start-url-block">
28 | 				<label class="col-lg-1 control-label">Start URL</label>
29 | 				<div class="col-lg-10">
30 | 					<div class="input-group">
31 | 						<input type="text" class="form-control input-start-url" name="startUrl[]" placeholder="URL" value="{{startUrl}}">
32 | 							<span class="input-group-btn">
33 | 								<button class="btn btn-default remove-start-url" type="button">-</button>
34 | 								<button class="btn btn-default add-extra-start-url" type="button">+</button>
35 | 							</span>
36 | 					</div>
37 | 				</div>
38 | 			</div>
39 | 		{{/startUrl.push}}
40 | 		<div class="form-group">
41 | 			<div class="col-lg-offset-1 col-lg-10">
42 | 				<button type="submit" class="btn btn-primary" id="submit-edit-sitemap">Save Sitemap</button>
43 | 			</div>
44 | 		</div>
45 | 	</form>
46 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapExport.html:
--------------------------------------------------------------------------------
1 | <form class="form-horizontal" role="form">
2 | 	<div class="form-group">
3 | 		<div class="col-lg-offset-1 col-lg-10">
4 | 			<textarea rows="7" class="form-control">{{sitemapJSON}}</textarea>
5 | 		</div>
6 | 	</div>
7 | </form>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapExportDataCSV.html:
--------------------------------------------------------------------------------
1 | <p>
2 | 	Export {{_id}} data as CSV. <br /> Waiting for the download button to appear. >
3 | 	<span class="download-button" href="#"><a>Download now!</a></span>
4 | </p>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapHeadlessScrapeConfig.html:
--------------------------------------------------------------------------------
 1 | <div id="scrape-sitemap-config">
 2 | 	<form class="form-horizontal" role="form" id="submit-scrape-sitemap-form">
 3 | 		<div class="form-group">
 4 | 			<label for="requestInterval" class="col-lg-1 control-label">Request interval (ms)</label>
 5 | 			<div class="col-lg-10">
 6 | 				<input type="text" class="form-control" name="requestInterval" id="requestInterval" placeholder="Request interval" value="2000">
 7 | 			</div>
 8 | 		</div>
 9 | 		<div class="form-group">
10 | 			<label for="pageLoadDelay" class="col-lg-1 control-label">Page load delay (ms)</label>
11 | 			<div class="col-lg-10">
12 | 				<input type="text" class="form-control" name="pageLoadDelay" id="pageLoadDelay" placeholder="Page load delay" value="500">
13 | 			</div>
14 | 		</div>
15 | 		<div class="alert alert-success col-lg-10 col-lg-offset-1 hide scraping-in-progress" role="alert">
16 | 			Scraping in progress. Close the popup to stop scraping.<br>
17 | 			When scraping is finished you can browse the data or export it as CSV.
18 | 		</div>
19 | 
20 | 		<div class="form-group">
21 | 			<div class="col-lg-offset-1 col-lg-10">
22 | 				<button type="submit" class="btn btn-primary" id="submit-headless-scrape-sitemap">Start scraping</button>
23 | 			</div>
24 | 		</div>
25 | 	</form>
26 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapImport.html:
--------------------------------------------------------------------------------
 1 | <form class="form-horizontal" role="form">
 2 | 	<div class="form-group">
 3 | 		<label for="sitemapJSON" class="col-lg-1 control-label">Sitemap JSON</label>
 4 | 
 5 | 		<div class="col-lg-10">
 6 | 			<textarea rows="7" class="form-control" name="sitemapJSON" id="sitemapJSON"></textarea>
 7 | 		</div>
 8 | 	</div>
 9 | 	<div class="form-group">
10 | 		<label for="edit_sitemap_id" class="col-lg-1 control-label">Rename Sitemap (optional)</label>
11 | 
12 | 		<div class="col-lg-10">
13 | 			<input type="text" class="form-control" name="_id" id="edit_sitemap_id" placeholder="Sitemap name" value="{{_id}}">
14 | 		</div>
15 | 	</div>
16 | 	<div class="form-group">
17 | 		<div class="col-lg-offset-1 col-lg-10">
18 | 			<button class="btn btn-default" id="submit-import-sitemap">Import Sitemap</button>
19 | 		</div>
20 | 	</div>
21 | </form>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapList.html:
--------------------------------------------------------------------------------
 1 | <div id="sitemaps">
 2 | 	<table class="table table-bordered table-condensed table-hover">
 3 | 		<thead>
 4 | 		<tr>
 5 | 			<th>ID</th>
 6 | 			<th>Start URL</th>
 7 | 			<th style="width:120px">actions</th>
 8 | 		</tr>
 9 | 		</thead>
10 | 		<tbody>
11 | 		</tbody>
12 | 	</table>
13 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapListItem.html:
--------------------------------------------------------------------------------
 1 | <tr>
 2 | 	<td class="id">{{_id}}</td>
 3 | 	<td>
 4 | 		{{#startUrl.push}}
 5 | 			{{#startUrl}}
 6 | 				{{.}},
 7 | 			{{/startUrl}}
 8 | 		{{/startUrl.push}}
 9 | 		{{^startUrl.push}}
10 | 			{{startUrl}}
11 | 		{{/startUrl.push}}
12 | 	</td>
13 | 	<td>
14 | 		<button action="browse-sitemap-data" type="button" class="btn btn-primary btn-xs">Browse</button>
15 | 		<!--<button action="csv-download-sitemap-data" type="button" class="btn btn-primary btn-xs">CSV</button>-->
16 | 		<button action="delete-sitemap" type="button" class="btn btn-primary btn-xs">Delete</button>
17 | 	</td>
18 | </tr>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapScrapeConfig.html:
--------------------------------------------------------------------------------
 1 | <div id="scrape-sitemap-config">
 2 | 	<form class="form-horizontal" role="form" id="submit-scrape-sitemap-form">
 3 | 		<div class="form-group">
 4 | 			<label for="requestInterval" class="col-lg-1 control-label">Request interval (ms)</label>
 5 | 			<div class="col-lg-10">
 6 | 				<input type="text" class="form-control" name="requestInterval" id="requestInterval" placeholder="Request interval" value="2000">
 7 | 			</div>
 8 | 		</div>
 9 | 		<div class="form-group">
10 | 			<label for="pageLoadDelay" class="col-lg-1 control-label">Page load delay (ms)</label>
11 | 			<div class="col-lg-10">
12 | 				<input type="text" class="form-control" name="pageLoadDelay" id="pageLoadDelay" placeholder="Page load delay" value="500">
13 | 			</div>
14 | 		</div>
15 | 		<div class="alert alert-success col-lg-10 col-lg-offset-1 hide scraping-in-progress" role="alert">
16 | 			Scraping in progress. Close the popup to stop scraping.<br>
17 | 			When scraping is finished you can browse the data or export it as CSV.
18 | 		</div>
19 | 
20 | 		<div class="form-group">
21 | 			<div class="col-lg-offset-1 col-lg-10">
22 | 				<button type="submit" class="btn btn-primary" id="submit-scrape-sitemap">Start scraping</button>
23 | 			</div>
24 | 		</div>
25 | 	</form>
26 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapSelectorGraph.html:
--------------------------------------------------------------------------------
1 | <div id="selector-graph"></div>


--------------------------------------------------------------------------------
/extension/devtools/views/SitemapStartUrlField.html:
--------------------------------------------------------------------------------
 1 | <div class="form-group start-url-block">
 2 | 	<label class="col-lg-1 control-label">Start URL</label>
 3 | 
 4 | 	<div class="col-lg-10">
 5 | 		<div class="input-group">
 6 | 			<input type="text" class="form-control input-start-url" name="startUrl[]" placeholder="URL">
 7 | 				<span class="input-group-btn">
 8 | 					<button class="btn btn-default remove-start-url" type="button">-</button>
 9 | 					<button class="btn btn-default add-extra-start-url" type="button">+</button>
10 | 				</span>
11 | 		</div>
12 | 	</div>
13 | </div>


--------------------------------------------------------------------------------
/extension/devtools/views/Viewport.html:
--------------------------------------------------------------------------------
 1 | <!-- Navigation -->
 2 | <nav class="navbar navbar-default" role="navigation">
 3 | 
 4 | 	<div class="collapse navbar-collapse navbar-ex1-collapse">
 5 | 		<ul class="nav navbar-nav">
 6 | 			<li><a id="sitemaps-nav-button">Sitemaps</a></li>
 7 | 			<li>
 8 | 				<a id="sitemap-nav-button" class="dropdown-toggle disabled" data-toggle="dropdown">Sitemap
 9 | 					<span id="navbar-active-sitemap-id"></span>
10 | 					<b class="caret"></b>
11 | 				</a>
12 | 				<ul class="dropdown-menu">
13 | 					<li><a id="sitemap-selector-list-nav-button">Selectors</a></li>
14 | 					<li><a id="sitemap-selector-graph-nav-button">Selector graph</a></li>
15 | 					<li><a id="sitemap-edit-metadata-nav-button">Edit metadata</a></li>
16 | 					<li><a id="sitemap-scrape-nav-button">Scrape</a></li>
17 | 					<li><a id="sitemap-headless-scrape-nav-button">Scrape headless</a></li>
18 | 					<li><a id="sitemap-browse-nav-button">Browse</a></li>
19 | 					<li><a id="sitemap-export-nav-button">Export Sitemap</a></li>
20 | 					<li><a id="sitemap-export-data-csv-nav-button">Export data as CSV</a></li>
21 | 				</ul>
22 | 			</li>
23 | 			<li>
24 | 				<a id="create-sitemap-nav-button" class="dropdown-toggle" data-toggle="dropdown">Create new sitemap <b class="caret"></b></a>
25 | 				<ul class="dropdown-menu">
26 | 					<li><a id="create-sitemap-create-nav-button">Create sitemap</a></li>
27 | 					<li><a id="create-sitemap-import-nav-button">Import sitemap</a></li>
28 | 				</ul>
29 | 			</li>
30 | 		</ul>
31 | 	</div>
32 | 
33 | </nav>
34 | 
35 | <div id="messages"></div>
36 | <div id="viewport"></div>


--------------------------------------------------------------------------------
/extension/generated/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/extension/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"manifest_version": 2,
 3 | 	"version": "0.2.0.9",
 4 | 	"name": "Web Scraper Headless",
 5 | 	"short_name": "Web Scraper Headless",
 6 | 	"description": "Tool for data extraction from websites",
 7 | 	"permissions": ["<all_urls>", "tabs", "notifications", "storage", "unlimitedStorage", "downloads"],
 8 | 	"icons": {
 9 | 		"16": "assets/images/icon16.png",
10 | 		"48": "assets/images/icon48.png",
11 | 		"128": "assets/images/icon128.png"
12 | 	},
13 | 	"browser_action": {
14 | 		"default_icon": {
15 | 			"19": "assets/images/icon19.png",
16 | 			"38": "assets/images/icon38.png"
17 | 		},
18 | 		"default_title": "Web Scraper",
19 | 		"default_popup": "popup.html"
20 | 	},
21 | 	"options_page": "options_page/options.html",
22 | 	"devtools_page": "devtools/devtools_init_page.html",
23 | 	"content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'",
24 | 	"background": {
25 | 		"scripts": [
26 | 			"assets/jquery-2.0.3.js",
27 | 			"assets/pouchdb-nightly.min.js",
28 | 			"generated/background-scraper.js"
29 | 		]
30 | 	},
31 | 	"web_accessible_resources": [
32 | 		"assets/images/icon16.png",
33 | 		"assets/images/icon48.png",
34 | 		"assets/images/icon128.png",
35 | 		"assets/images/icon19.png",
36 | 		"assets/images/icon38.png"
37 | 	],
38 | 	"content_scripts": [
39 | 		{
40 | 			"matches": ["*://*/*"],
41 | 			"js": [
42 | 				"assets/jquery-2.0.3.js",
43 | 				"assets/sugar-1.4.1.js",
44 | 				"generated/content-scraper.js",
45 | 				"content_script/content_script.js"
46 | 			],
47 | 			"css": [
48 | 				"content_script/content_script.css"
49 | 			]
50 | 		}
51 | 	]
52 | }


--------------------------------------------------------------------------------
/extension/options_page/options.html:
--------------------------------------------------------------------------------
 1 | <!--This is the options page in extensions menu !-->
 2 | <!DOCTYPE html>
 3 | <html>
 4 | 	<head>
 5 | 		<title>Web Scraper</title>
 6 | 		<link rel="stylesheet" href="../assets/bootstrap-3.0.0/css/bootstrap.min.css">
 7 | 		<script src="../assets/jquery-2.0.3.js"></script>
 8 | 		<script src="../assets/bootstrap-3.0.0/js/bootstrap.min.js"></script>
 9 | 		<script src="../scripts/Config.js"></script>
10 | 		<script src="options_page.js"></script>
11 | 	</head>
12 | 	<body>
13 | 		<div class="container">
14 | 			<h1>Web Scraper</h1>
15 | 			<p>Options page</p>
16 | 			<br />
17 | 
18 | 
19 | 			<form class="form-horizontal" id="storage_configuration">
20 | 				<fieldset>
21 | 					<legend>Storage settings</legend>
22 |                     <div class="form-group">
23 |                         <label for="storageType" class="col-lg-2 control-label">Storage type</label>
24 |                         <div class="col-lg-10">
25 |                             <select name="storageType" id="storageType" class="form-control">
26 |                                 <option value="local">local storage</option>
27 |                                 <option value="couchdb">CouchDB</option>
28 |                             </select>
29 |                         </div>
30 |                     </div>
31 | 
32 | 					<div class="form-group couchdb">
33 | 						<label for="sitemapDb" class="col-lg-2 control-label">Sitemap db</label>
34 | 
35 | 						<div class="col-lg-10">
36 | 							<input type="text" class="form-control" name="sitemapDb" id="sitemapDb" placeholder="Database URL">
37 | 						</div>
38 | 					</div>
39 | 
40 | 					<div class="form-group couchdb">
41 | 						<label for="dataDb" class="col-lg-2 control-label">Data db</label>
42 | 
43 | 						<div class="col-lg-10">
44 | 							<input type="text" class="form-control" id="dataDb" name="dataDb">
45 | 						</div>
46 | 					</div>
47 | 					<div class="form-group">
48 | 						<div class="col-lg-offset-2 col-lg-10">
49 | 							<button type="submit" class="btn btn-default">Save</button>
50 | 						</div>
51 | 					</div>
52 | 				</fieldset>
53 | 			</form>
54 | 
55 | 		</div>
56 | 
57 | 	</body>
58 | </html>
59 | 


--------------------------------------------------------------------------------
/extension/options_page/options_page.js:
--------------------------------------------------------------------------------
 1 | $(function () {
 2 |   console.log('opening config page')
 3 | 	// popups for Storage setting input fields
 4 |   $('#sitemapDb')
 5 | 		.popover({
 6 |   title: 'Database for sitemap storage',
 7 |   html: true,
 8 |   content: 'CouchDB database url<br /> http://example.com/scraper-sitemaps/',
 9 |   placement: 'bottom'
10 | })
11 | 		.blur(function () {
12 |   $(this).popover('hide')
13 | })
14 | 
15 |   $('#dataDb')
16 | 		.popover({
17 |   title: 'Database for scraped data',
18 |   html: true,
19 |   content: 'CouchDB database url. For each sitemap a new DB will be created.<br />http://example.com/',
20 |   placement: 'bottom'
21 | })
22 | 		.blur(function () {
23 |   $(this).popover('hide')
24 | })
25 | 
26 | 	// switch between configuration types
27 |   $('select[name=storageType]').change(function () {
28 |     var type = $(this).val()
29 | 
30 |     if (type === 'couchdb') {
31 |       $('.form-group.couchdb').show()
32 |     } else {
33 |       $('.form-group.couchdb').hide()
34 |     }
35 |   })
36 | 
37 | 	// Extension configuration
38 |   var config = new Config()
39 | 
40 | 	// load previously synced data
41 |   config.loadConfiguration(function () {
42 |     $('#storageType').val(config.storageType)
43 |     $('#sitemapDb').val(config.sitemapDb)
44 |     $('#dataDb').val(config.dataDb)
45 | 
46 |     $('select[name=storageType]').change()
47 |   })
48 | 
49 | 	// Sync storage settings
50 |   $('form#storage_configuration').submit(function () {
51 |     var sitemapDb = $('#sitemapDb').val()
52 |     var dataDb = $('#dataDb').val()
53 |     var storageType = $('#storageType').val()
54 | 
55 |     var newConfig
56 | 
57 |     if (storageType === 'local') {
58 |       newConfig = {
59 |         storageType: storageType,
60 |         sitemapDb: ' ',
61 |         dataDb: ' '
62 |       }
63 |     }		else {
64 |       newConfig = {
65 |         storageType: storageType,
66 |         sitemapDb: sitemapDb,
67 |         dataDb: dataDb
68 |       }
69 |     }
70 | 
71 |     config.updateConfiguration(newConfig)
72 |     return false
73 |   })
74 | })
75 | 


--------------------------------------------------------------------------------
/extension/popup.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 | 	<style>
 5 | 		body {
 6 | 			width:300px;
 7 | 			font-size:12px;
 8 | 		}
 9 | 	</style>
10 | </head>
11 | <body>
12 | <p>
13 | 	Open Developer tools where you will find Web Scraper tab:
14 | 	<ul>
15 | 		<li>
16 | 			Windows, Linux: Ctrl+Shift+I or F12
17 | 		</li>
18 | 		<li>
19 | 			Mac: Cmd+Opt+I
20 | 		</li>
21 | 		<li>
22 | 			Any OS: open Tools / Developer tools
23 | 		</li>
24 | 	</ul>
25 | </p>
26 | <p>
27 | 	Documentation is available on <a target="_blank" href="http://webscraper.io/">webscraper.io</a>
28 | </p>
29 | </body>
30 | </html>


--------------------------------------------------------------------------------
/extension/scripts/App.js:
--------------------------------------------------------------------------------
 1 | var StoreDevtools = require('./StoreDevtools')
 2 | var SitemapController = require('./Controller')
 3 | 
 4 | $(function () {
 5 | 	// init bootstrap alerts
 6 |   $('.alert').alert()
 7 | 
 8 |   var store = new StoreDevtools({$, document, window})
 9 |   new SitemapController({
10 |     store: store,
11 |     templateDir: 'views/'
12 |   }, {$, document, window})
13 | })
14 | 


--------------------------------------------------------------------------------
/extension/scripts/BackgroundScript.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | const debug = require('debug')('web-scraper-headless:background-script')
 3 | 
 4 | /**
 5 |  * ContentScript that can be called from anywhere within the extension
 6 |  */
 7 | var BackgroundScript = {
 8 | 
 9 |   dummy: function () {
10 |     return jquery.Deferred().resolve('dummy').promise()
11 |   },
12 | 
13 | 	/**
14 | 	 * Returns the id of the tab that is visible to user
15 | 	 * @returns jquery.Deferred() integer
16 | 	 */
17 |   getActiveTabId: function () {
18 |     var deferredResponse = jquery.Deferred()
19 | 
20 |     chrome.tabs.query({
21 |       active: true,
22 |       currentWindow: true
23 |     }, function (tabs) {
24 |       if (tabs.length < 1) {
25 |         debug('There seems to be no active tab in the current window. Let us try only active')
26 |         chrome.tabs.query({
27 |           active: true,
28 |           windowType: 'normal'
29 |         }, function (tabs) {
30 |           if (tabs.length < 1) {
31 |             debug('Could not find tab')
32 |             deferredResponse.reject("couldn't find the active tab")
33 |           } else {
34 |             const tabId = tabs[0].id
35 |             deferredResponse.resolve(tabId)
36 |           }
37 |         })
38 | 				// @TODO must be running within popup. maybe find another active window?
39 |       } else {
40 |         var tabId = tabs[0].id
41 |         deferredResponse.resolve(tabId)
42 |       }
43 |     })
44 |     return deferredResponse.promise()
45 |   },
46 | 
47 | 	/**
48 | 	 * Execute a function within the active tab within content script
49 | 	 * @param request.fn	function to call
50 | 	 * @param request.request	request that will be passed to the function
51 | 	 */
52 |   executeContentScript: function (request) {
53 |     var reqToContentScript = {
54 |       contentScriptCall: true,
55 |       fn: request.fn,
56 |       request: request.request
57 |     }
58 |     var deferredResponse = jquery.Deferred()
59 |     var deferredActiveTabId = this.getActiveTabId()
60 |     deferredActiveTabId.done(function (tabId) {
61 |       chrome.tabs.sendMessage(tabId, reqToContentScript, function (response) {
62 |         deferredResponse.resolve(response)
63 |       })
64 |     })
65 | 
66 |     return deferredResponse
67 |   }
68 | }
69 | 
70 | module.exports = BackgroundScript
71 | 


--------------------------------------------------------------------------------
/extension/scripts/ChromeHeadlessBrowser.js:
--------------------------------------------------------------------------------
 1 | const puppeteer = require('puppeteer')
 2 | const debug = require('debug')('web-scraper-headless:chrome-headless-browser')
 3 | const {ExecutionContext} = require('puppeteer/lib/ExecutionContext')
 4 | const contentSraperBundler = require('../content_script/contentScraperHeadlessBundler')
 5 | const jqueryDeferred = require('jquery-deferred')
 6 | const whenCallSequentially = require('../assets/jquery.whencallsequentially')
 7 | 
 8 | class ChromeHeadlessBrowser {
 9 |   constructor (options) {
10 |     this.pageLoadDelay = options.pageLoadDelay
11 |     // constructors cannot handle asynchronous
12 |     this.browserPromise = puppeteer.launch({
13 |       headless: true
14 |     })
15 |     this.pagePromise = this.browserPromise.then(function (browser) {
16 |       return browser.newPage()
17 |     })
18 |   }
19 |   async loadUrl (url) {
20 |     debug('Loading url', url)
21 |     const page = await this.pagePromise
22 |     await page.goto(url, {waitUntil: 'networkidle2'})
23 |   }
24 |   async close () {
25 |     try {
26 |       const browser = await this.browserPromise
27 |       await browser.close()
28 |     } catch (e) {
29 |       console.error(e)
30 |     }
31 |   }
32 |   saveImages (record, namingFunction) {
33 |     var deferredResponse = jqueryDeferred.Deferred()
34 |     var deferredImageStoreCalls = []
35 |     var prefixLength = '_imageBase64-'.length
36 |     for (var attr in record) {
37 |       if (attr.substr(0, prefixLength) === '_imageBase64-') {
38 |         throw new Error('Downloading images is not yet supported')
39 |       }
40 |     }
41 |     whenCallSequentially(deferredImageStoreCalls).done(function () {
42 |       deferredResponse.resolve()
43 |     })
44 | 
45 |     return deferredResponse.promise()
46 |   }
47 |   async fetchData (url, sitemap, parentSelectorId, callback, scope) {
48 |     try {
49 |       const page = await this.pagePromise
50 |       await this.loadUrl(url)
51 | 
52 |       const mainFrame = page.mainFrame()
53 | 
54 |       // Maybe we don't need a context each time?
55 |       const isolatedWorldInfo = await page._client.send('Page.createIsolatedWorld', {frameId: mainFrame._id, worldName: 'web-scraper-headless'})
56 |       const executionContextId = isolatedWorldInfo.executionContextId
57 |       const JsHandleFactory = page._frameManager.createJSHandle.bind(page._frameManager, executionContextId)
58 | 
59 |       const executionContext = new ExecutionContext(page._client, {id: executionContextId}, JsHandleFactory)
60 | 
61 |       const bundle = await contentSraperBundler.getBundle()
62 |       await executionContext.evaluate(bundle)
63 |       const message = {
64 |         extractData: true,
65 |         sitemap: JSON.parse(JSON.stringify(sitemap)),
66 |         parentSelectorId: parentSelectorId
67 |       }
68 | 
69 |       const data = await executionContext.evaluate(function (message) {
70 |         return new Promise(function (resolve, reject) {
71 |           window.webScraper(message, null, function (data) {
72 |             resolve(data)
73 |           })
74 |         })
75 |       }, message)
76 |       callback.call(scope, null, data)
77 |     } catch (e) {
78 |       return callback(e)
79 |     }
80 |   }
81 | }
82 | 
83 | module.exports = ChromeHeadlessBrowser
84 | 


--------------------------------------------------------------------------------
/extension/scripts/Config.js:
--------------------------------------------------------------------------------
 1 | var Config = function () {
 2 | 
 3 | }
 4 | 
 5 | Config.prototype = {
 6 | 
 7 |   sitemapDb: '<use loadConfiguration()>',
 8 |   dataDb: '<use loadConfiguration()>',
 9 | 
10 |   defaults: {
11 |     storageType: 'local',
12 | 		// this is where sitemap documents are stored
13 |     sitemapDb: 'scraper-sitemaps',
14 | 		// this is where scraped data is stored.
15 | 		// empty for local storage
16 |     dataDb: ''
17 |   },
18 | 
19 | 	/**
20 | 	 * Loads configuration from chrome extension sync storage
21 | 	 */
22 |   loadConfiguration: function (callback) {
23 |     chrome.storage.sync.get(['sitemapDb', 'dataDb', 'storageType'], function (items) {
24 |       this.storageType = items.storageType || this.defaults.storageType
25 |       if (this.storageType === 'local') {
26 |         this.sitemapDb = this.defaults.sitemapDb
27 |         this.dataDb = this.defaults.dataDb
28 |       } else {
29 |         this.sitemapDb = items.sitemapDb || this.defaults.sitemapDb
30 |         this.dataDb = items.dataDb || this.defaults.dataDb
31 |       }
32 | 
33 |       callback()
34 |     }.bind(this))
35 |   },
36 | 
37 | 	/**
38 | 	 * Saves configuration to chrome extension sync storage
39 | 	 * @param {type} items
40 | 	 * @param {type} callback
41 | 	 * @returns {undefined}
42 | 	 */
43 |   updateConfiguration: function (items, callback) {
44 |     chrome.storage.sync.set(items, callback)
45 |   }
46 | }
47 | 
48 | module.exports = Config
49 | 


--------------------------------------------------------------------------------
/extension/scripts/ContentScript.js:
--------------------------------------------------------------------------------
  1 | var ContentSelector = require('./ContentSelector')
  2 | var jquery = require('jquery-deferred')
  3 | const debug = require('debug')('web-scraper-headless:content-script')
  4 | 
  5 | /**
  6 |  * ContentScript that can be called from anywhere within the extension
  7 |  */
  8 | var ContentScript = {
  9 | 
 10 | 	/**
 11 | 	 * Fetch
 12 | 	 * @param request.CSSSelector	css selector as string
 13 | 	 * @returns jquery.Deferred()
 14 | 	 */
 15 |   getHTML: function (request, options) {
 16 |     var $ = options.$
 17 |     var deferredHTML = jquery.Deferred()
 18 |     var html = $(request.CSSSelector).clone().wrap('<p>').parent().html()
 19 |     deferredHTML.resolve(html)
 20 |     debug('Send html', html)
 21 |     return deferredHTML.promise()
 22 |   },
 23 | 
 24 | 	/**
 25 | 	 * Removes current content selector if is in use within the page
 26 | 	 * @returns jquery.Deferred()
 27 | 	 */
 28 |   removeCurrentContentSelector: function () {
 29 |     var deferredResponse = jquery.Deferred()
 30 |     var contentSelector = window.cs
 31 |     if (contentSelector === undefined) {
 32 |       deferredResponse.resolve()
 33 |     } else {
 34 |       contentSelector.removeGUI()
 35 |       window.cs = undefined
 36 |       deferredResponse.resolve()
 37 |     }
 38 | 
 39 |     return deferredResponse.promise()
 40 |   },
 41 | 
 42 | 	/**
 43 | 	 * Select elements within the page
 44 | 	 * @param request.parentCSSSelector
 45 | 	 * @param request.allowedElements
 46 | 	 */
 47 |   selectSelector: function (request, options) {
 48 |     var $ = options.$
 49 |     var deferredResponse = jquery.Deferred()
 50 | 
 51 |     this.removeCurrentContentSelector().done(function () {
 52 |       var contentSelector = new ContentSelector({
 53 |         parentCSSSelector: request.parentCSSSelector,
 54 |         allowedElements: request.allowedElements
 55 |       }, {$, document, window})
 56 |       window.cs = contentSelector
 57 | 
 58 |       var deferredCSSSelector = contentSelector.getCSSSelector()
 59 |       deferredCSSSelector.done(function (response) {
 60 |         this.removeCurrentContentSelector().done(function () {
 61 |           deferredResponse.resolve(response)
 62 |           window.cs = undefined
 63 |         })
 64 |       }.bind(this)).fail(function (message) {
 65 |         deferredResponse.reject(message)
 66 |         window.cs = undefined
 67 |       })
 68 |     }.bind(this))
 69 | 
 70 |     return deferredResponse.promise()
 71 |   },
 72 | 
 73 | 	/**
 74 | 	 * Preview elements
 75 | 	 * @param request.parentCSSSelector
 76 | 	 * @param request.elementCSSSelector
 77 | 	 */
 78 |   previewSelector: function (request, options) {
 79 |     var $ = options.$
 80 |     var deferredResponse = jquery.Deferred()
 81 |     this.removeCurrentContentSelector().done(function () {
 82 |       var contentSelector = new ContentSelector({
 83 |         parentCSSSelector: request.parentCSSSelector
 84 |       }, {$, document, window})
 85 |       window.cs = contentSelector
 86 | 
 87 |       var deferredSelectorPreview = contentSelector.previewSelector(request.elementCSSSelector)
 88 |       deferredSelectorPreview.done(function () {
 89 |         deferredResponse.resolve()
 90 |       }).fail(function (message) {
 91 |         deferredResponse.reject(message)
 92 |         window.cs = undefined
 93 |       })
 94 |     })
 95 |     return deferredResponse
 96 |   }
 97 | }
 98 | 
 99 | module.exports = ContentScript
100 | 


--------------------------------------------------------------------------------
/extension/scripts/ElementQuery.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Element selector. Uses jQuery as base and adds some more features
 3 |  * @param CSSSelector
 4 |  * @param parentElement
 5 |  * @param options
 6 |  */
 7 | var ElementQuery = function (CSSSelector, parentElement, options) {
 8 |   CSSSelector = CSSSelector || ''
 9 |   this.$ = options.$
10 |   this.document = options.document
11 |   this.window = options.window
12 |   if (!this.$) throw new Error('Missing jquery for ElementQuery')
13 |   if (!this.document) throw new Error("Missing document")
14 |   if(!this.window)throw new Error("Missing window")
15 |   var selectedElements = []
16 | 
17 |   var addElement = function (element) {
18 |     if (selectedElements.indexOf(element) === -1) {
19 |       selectedElements.push(element)
20 |     }
21 |   }
22 | 
23 |   var selectorParts = ElementQuery.getSelectorParts(CSSSelector)
24 |   var self = this
25 |   selectorParts.forEach(function (selector) {
26 |     // handle special case when parent is selected
27 |     if (selector === '_parent_') {
28 |       self.$(parentElement).each(function (i, element) {
29 |         addElement(element)
30 |       })
31 |     } else {
32 |       var elements = self.$(selector, self.$(parentElement))
33 |       elements.each(function (i, element) {
34 |         addElement(element)
35 |       })
36 |     }
37 |   })
38 | 
39 |   return selectedElements
40 | }
41 | 
42 | ElementQuery.getSelectorParts = function (CSSSelector) {
43 |   var selectors = CSSSelector.split(/(,|".*?"|'.*?'|\(.*?\))/)
44 | 
45 |   var resultSelectors = []
46 |   var currentSelector = ''
47 |   selectors.forEach(function (selector) {
48 |     if (selector === ',') {
49 |       if (currentSelector.trim().length) {
50 |         resultSelectors.push(currentSelector.trim())
51 |       }
52 |       currentSelector = ''
53 |     }		else {
54 |       currentSelector += selector
55 |     }
56 |   })
57 |   if (currentSelector.trim().length) {
58 |     resultSelectors.push(currentSelector.trim())
59 |   }
60 | 
61 |   return resultSelectors
62 | }
63 | 
64 | module.exports = ElementQuery
65 | 


--------------------------------------------------------------------------------
/extension/scripts/InMemoryStore.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var InMemoryStore = function () {
 3 |   this.data = []
 4 | }
 5 | 
 6 | InMemoryStore.prototype = {
 7 | 
 8 |   writeDocs: function (data, callback) {
 9 |     data.forEach(function (data) {
10 |       this.data.push(data)
11 |     }.bind(this))
12 |     callback()
13 |   },
14 | 
15 |   initSitemapDataDb: function (sitemapId, callback) {
16 |     callback(this)
17 |   }
18 | }
19 | 
20 | module.exports = InMemoryStore
21 | 


--------------------------------------------------------------------------------
/extension/scripts/JSDOMBrowser.js:
--------------------------------------------------------------------------------
 1 | const jsdom = require('jsdom')
 2 | const jQuery = require('jquery')
 3 | var jqueryDeferred = require('jquery-deferred')
 4 | 
 5 | const contentScraper = require('../content_script/content_scraper')
 6 | var whenCallSequentially = require('../assets/jquery.whencallsequentially')
 7 | const debug = require('debug')('web-scraper-headless:jsdom-browser')
 8 | var JSDOMBrowser = function (options) {
 9 |   this.pageLoadDelay = options.pageLoadDelay
10 | }
11 | 
12 | JSDOMBrowser.prototype = {
13 |   loadUrl: function (url, callback) {
14 |     const {JSDOM} = jsdom
15 |     const browser = this
16 |     JSDOM.fromURL(url)
17 |       .then(function (dom) {
18 |         const window = dom.window
19 |         const document = window.document
20 |         const $ = jQuery(dom.window)
21 |         setTimeout(function () {
22 |           callback(null, {$, document, window})
23 |         }, browser.pageLoadDelay)
24 |       }).catch(e => callback(e))
25 |   },
26 |   close: function () {
27 | 
28 |   },
29 |   saveImages: function (record, namingFunction) {
30 |     var deferredResponse = jqueryDeferred.Deferred()
31 |     var deferredImageStoreCalls = []
32 |     var prefixLength = '_imageBase64-'.length
33 |     for (var attr in record) {
34 |       if (attr.substr(0, prefixLength) === '_imageBase64-') {
35 |         throw new Error('Downloading images is not yet supported')
36 |       }
37 |     }
38 |     whenCallSequentially(deferredImageStoreCalls).done(function () {
39 |       deferredResponse.resolve()
40 |     })
41 | 
42 |     return deferredResponse.promise()
43 |   },
44 |   fetchData: function (url, sitemap, parentSelectorId, callback, scope) {
45 |     const browser = this
46 |     debug('Init jsdom browser app')
47 |     browser.loadUrl(url, function (err, options) {
48 |       if (err) {
49 |         return callback(err)
50 |       }
51 |       const {$, document, window} = options
52 | 
53 |       var message = {
54 |         extractData: true,
55 |         sitemap: JSON.parse(JSON.stringify(sitemap)),
56 |         parentSelectorId: parentSelectorId
57 |       }
58 |       function sendResponse (data) {
59 |         callback.call(scope, null, data)
60 |       }
61 |       contentScraper(message, null, sendResponse, {$, document, window})
62 |     })
63 |   }
64 | }
65 | 
66 | module.exports = JSDOMBrowser
67 | 


--------------------------------------------------------------------------------
/extension/scripts/JSDOMBrowserLoader.js:
--------------------------------------------------------------------------------
 1 | const JSDOMBrowser = require('./JSDOMBrowser')
 2 | module.exports = function (self) {
 3 |   var browser =
 4 | 
 5 |   self.onerror = function (err) {
 6 |     self.postMessage({
 7 |       err: new Error(err)
 8 |     })
 9 |     self.close()
10 |   }
11 |   self.addEventListener('message', function (ev) {
12 |     const data = ev.data
13 |     const UUID = data.UUID
14 |     if (data.topic === 'init') {
15 |       browser = new JSDOMBrowser(data.options)
16 |       return self.postMessage({
17 |         UUID
18 |       })
19 |     } else if (data.topic === 'loadUrl') {
20 |       browser.loadUrl(data.url, function (err, {$, document, window}) {
21 |         if (err) {
22 |           return self.postMessage({
23 |             UUID,
24 |             err
25 |           })
26 |         }
27 |         self.postMessage({
28 |           UUID
29 |         })
30 |       })
31 |     } else if (data.topic === 'fetchData') {
32 |       browser.fetchData(data.url, data.sitemap, data.parentSelectorId, function (err, results) {
33 |         if (err) {
34 |           return self.postMessage({
35 |             UUID,
36 |             err
37 |           })
38 |         }
39 |         self.postMessage({
40 |           UUID,
41 |           info: {
42 |             results
43 |           }
44 |         })
45 |       }, null)
46 |     } else {
47 |       self.postMessage({
48 |         err: new Error('Unknown  topic ' + data.topic)
49 |       })
50 |     }
51 |   })
52 | }


--------------------------------------------------------------------------------
/extension/scripts/Job.js:
--------------------------------------------------------------------------------
 1 | const debug = require('debug')('web-scraper-headless:job')
 2 | var Job = function (url, parentSelector, scraper, parentJob, baseData) {
 3 |   if (parentJob !== undefined) {
 4 |     this.url = this.combineUrls(parentJob.url, url)
 5 |   } else {
 6 |     this.url = url
 7 |   }
 8 |   this.parentSelector = parentSelector
 9 |   this.scraper = scraper
10 |   this.dataItems = []
11 |   this.baseData = baseData || {}
12 | }
13 | 
14 | Job.prototype = {
15 | 
16 |   combineUrls: function (parentUrl, childUrl) {
17 |     var urlMatcher = new RegExp('(https?://)?([a-z0-9\\-\\.]+\\.[a-z0-9\\-]+(:\\d+)?|\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(:\\d+)?)?(\\/[^\\?]*\\/|\\/)?([^\\?]*)?(\\?.*)?', 'i')
18 | 
19 |     var parentMatches = parentUrl.match(urlMatcher)
20 |     var childMatches = childUrl.match(urlMatcher)
21 | 
22 | 		// special case for urls like this: ?a=1  or like-this/
23 |     if (childMatches[1] === undefined && childMatches[2] === undefined && childMatches[5] === undefined && childMatches[6] === undefined) {
24 |       var url = parentMatches[1] + parentMatches[2] + parentMatches[5] + parentMatches[6] + childMatches[7]
25 |       return url
26 |     }
27 | 
28 |     if (childMatches[1] === undefined) {
29 |       childMatches[1] = parentMatches[1]
30 |     }
31 |     if (childMatches[2] === undefined) {
32 |       childMatches[2] = parentMatches[2]
33 |     }
34 |     if (childMatches[5] === undefined) {
35 |       if (parentMatches[5] === undefined) {
36 |         childMatches[5] = '/'
37 |       } else {
38 |         childMatches[5] = parentMatches[5]
39 |       }
40 |     }
41 | 
42 |     if (childMatches[6] === undefined) {
43 |       childMatches[6] = ''
44 |     }
45 |     if (childMatches[7] === undefined) {
46 |       childMatches[7] = ''
47 |     }
48 | 
49 |     return childMatches[1] + childMatches[2] + childMatches[5] + childMatches[6] + childMatches[7]
50 |   },
51 | 
52 |   execute: function (browser, callback, scope) {
53 |     var sitemap = this.scraper.sitemap
54 |     var job = this
55 |     debug('starting fetching')
56 |     browser.fetchData(this.url, sitemap, this.parentSelector, function (err, results) {
57 |       if (err) {
58 |         return callback(err)
59 |       }
60 |       debug('finished fetching')
61 | 			// merge data with data from initialization
62 |       for (var i in results) {
63 |         var result = results[i]
64 |         for (var key in this.baseData) {
65 |           if (!(key in result)) {
66 |             result[key] = this.baseData[key]
67 |           }
68 |         }
69 |         this.dataItems.push(result)
70 |       }
71 |       callback(null, job)
72 |     }.bind(this), this)
73 |   },
74 |   getResults: function () {
75 |     return this.dataItems
76 |   }
77 | }
78 | 
79 | module.exports = Job
80 | 


--------------------------------------------------------------------------------
/extension/scripts/Queue.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var Queue = function () {
 3 |   this.jobs = []
 4 |   this.scrapedUrls = {}
 5 | }
 6 | 
 7 | Queue.prototype = {
 8 | 
 9 | 	/**
10 | 	 * Returns false if page is already scraped
11 | 	 * @param job
12 | 	 * @returns {boolean}
13 | 	 */
14 |   add: function (job) {
15 |     if (this.canBeAdded(job)) {
16 |       this.jobs.push(job)
17 |       this._setUrlScraped(job.url)
18 |       return true
19 |     }
20 |     return false
21 |   },
22 | 
23 |   canBeAdded: function (job) {
24 |     if (this.isScraped(job.url)) {
25 |       return false
26 |     }
27 | 
28 | 		// reject documents
29 |     if (job.url.match(/\.(doc|docx|pdf|ppt|pptx|odt)$/i) !== null) {
30 |       return false
31 |     }
32 |     return true
33 |   },
34 | 
35 |   getQueueSize: function () {
36 |     return this.jobs.length
37 |   },
38 | 
39 |   isScraped: function (url) {
40 |     return (this.scrapedUrls[url] !== undefined)
41 |   },
42 | 
43 |   _setUrlScraped: function (url) {
44 |     this.scrapedUrls[url] = true
45 |   },
46 | 
47 |   getNextJob: function () {
48 | 		// @TODO test this
49 |     if (this.getQueueSize() > 0) {
50 |       return this.jobs.pop()
51 |     } else {
52 |       return false
53 |     }
54 |   }
55 | }
56 | 
57 | module.exports = Queue
58 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector.js:
--------------------------------------------------------------------------------
  1 | var selectors = require('./Selectors')
  2 | var ElementQuery = require('./ElementQuery')
  3 | var jquery = require('jquery-deferred')
  4 | const debug = require('debug')('web-scraper-headless:selector')
  5 | 
  6 | var Selector = function (selector, options) {
  7 |   var $ = options.$
  8 |   var document = options.document
  9 |   var window = options.window
 10 |   // We don't want enumerable properties
 11 |   Object.defineProperty(this, '$', {
 12 |     value: $,
 13 |     enumerable: false
 14 |   })
 15 |   Object.defineProperty(this, 'window', {
 16 |     value: window,
 17 |     enumerable: false
 18 |   })
 19 |   Object.defineProperty(this, 'document', {
 20 |     value: document,
 21 |     enumerable: false
 22 |   })
 23 |   if (!this.$) throw new Error('Missing jquery')
 24 |   if (!this.document) throw new Error("Missing document")
 25 |   if(!this.window)throw new Error("Missing window")
 26 | 
 27 |   this.updateData(selector)
 28 |   this.initType()
 29 | }
 30 | 
 31 | Selector.prototype = {
 32 | 
 33 | 	/**
 34 | 	 * Is this selector configured to return multiple items?
 35 | 	 * @returns {boolean}
 36 | 	 */
 37 |   willReturnMultipleRecords: function () {
 38 |     return this.canReturnMultipleRecords() && this.multiple
 39 |   },
 40 | 
 41 | 	/**
 42 | 	 * Update current selector configuration
 43 | 	 * @param data
 44 | 	 */
 45 |   updateData: function (data) {
 46 |     var allowedKeys = ['window', 'document', 'id', 'type', 'selector', 'parentSelectors']
 47 |     debug('data type', data.type)
 48 |     allowedKeys = allowedKeys.concat(selectors[data.type].getFeatures())
 49 |     var key
 50 | 		// update data
 51 |     for (key in data) {
 52 |       if (allowedKeys.indexOf(key) !== -1 || typeof data[key] === 'function') {
 53 |         this[key] = data[key]
 54 |       }
 55 |     }
 56 | 
 57 | 		// remove values that are not needed for this type of selector
 58 |     for (key in this) {
 59 |       if (allowedKeys.indexOf(key) === -1 && typeof this[key] !== 'function') {
 60 |         delete this[key]
 61 |       }
 62 |     }
 63 |   },
 64 | 
 65 | 	/**
 66 | 	 * CSS selector which will be used for element selection
 67 | 	 * @returns {string}
 68 | 	 */
 69 |   getItemCSSSelector: function () {
 70 |     return '*'
 71 |   },
 72 | 
 73 | 	/**
 74 | 	 * override objects methods based on seletor type
 75 | 	 */
 76 |   initType: function () {
 77 |     if (selectors[this.type] === undefined) {
 78 |       throw new Error('Selector type not defined ' + this.type)
 79 |     }
 80 | 
 81 | 		// overrides objects methods
 82 |     for (var i in selectors[this.type]) {
 83 |       this[i] = selectors[this.type][i]
 84 |     }
 85 |   },
 86 | 
 87 | 	/**
 88 | 	 * Check whether a selector is a paren selector of this selector
 89 | 	 * @param selectorId
 90 | 	 * @returns {boolean}
 91 | 	 */
 92 |   hasParentSelector: function (selectorId) {
 93 |     return (this.parentSelectors.indexOf(selectorId) !== -1)
 94 |   },
 95 | 
 96 |   removeParentSelector: function (selectorId) {
 97 |     var index = this.parentSelectors.indexOf(selectorId)
 98 |     if (index !== -1) {
 99 |       this.parentSelectors.splice(index, 1)
100 |     }
101 |   },
102 | 
103 |   renameParentSelector: function (originalId, replacementId) {
104 |     if (this.hasParentSelector(originalId)) {
105 |       var pos = this.parentSelectors.indexOf(originalId)
106 |       this.parentSelectors.splice(pos, 1, replacementId)
107 |     }
108 |   },
109 | 
110 |   getDataElements: function (parentElement) {
111 |     var $ = this.$
112 |     var document = this.document
113 |     var window = this.window
114 |     var elements = ElementQuery(this.selector, parentElement, {$, document, window})
115 |     if (this.multiple) {
116 |       return elements
117 |     } else if (elements.length > 0) {
118 |       return [elements[0]]
119 |     } else {
120 |       return []
121 |     }
122 |   },
123 | 
124 |   getData: function (parentElement) {
125 |     var d = jquery.Deferred()
126 |     var timeout = this.delay || 0
127 | 
128 | 		// this works much faster because whenCallSequentally isn't running next data extraction immediately
129 |     if (timeout === 0) {
130 |       var deferredData = this._getData(parentElement)
131 |       deferredData.done(function (data) {
132 |         d.resolve(data)
133 |       })
134 |     }	else {
135 |       setTimeout(function () {
136 |         var deferredData = this._getData(parentElement)
137 |         deferredData.done(function (data) {
138 |           d.resolve(data)
139 |         })
140 |       }.bind(this), timeout)
141 |     }
142 | 
143 |     return d.promise()
144 |   }
145 | }
146 | 
147 | module.exports = Selector
148 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorElement.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | 
 3 | var SelectorElement = {
 4 | 
 5 |   canReturnMultipleRecords: function () {
 6 |     return true
 7 |   },
 8 | 
 9 |   canHaveChildSelectors: function () {
10 |     return true
11 |   },
12 | 
13 |   canHaveLocalChildSelectors: function () {
14 |     return true
15 |   },
16 | 
17 |   canCreateNewJobs: function () {
18 |     return false
19 |   },
20 |   willReturnElements: function () {
21 |     return true
22 |   },
23 |   _getData: function (parentElement) {
24 |     var dfd = jquery.Deferred()
25 | 
26 |     var elements = this.getDataElements(parentElement)
27 |     dfd.resolve(this.$.makeArray(elements))
28 | 
29 |     return dfd.promise()
30 |   },
31 | 
32 |   getDataColumns: function () {
33 |     return []
34 |   },
35 | 
36 |   getFeatures: function () {
37 |     return ['multiple', 'delay']
38 |   }
39 | }
40 | 
41 | module.exports = SelectorElement
42 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorElementAttribute.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var SelectorElementAttribute = {
 3 |   canReturnMultipleRecords: function () {
 4 |     return true
 5 |   },
 6 | 
 7 |   canHaveChildSelectors: function () {
 8 |     return false
 9 |   },
10 | 
11 |   canHaveLocalChildSelectors: function () {
12 |     return false
13 |   },
14 | 
15 |   canCreateNewJobs: function () {
16 |     return false
17 |   },
18 |   willReturnElements: function () {
19 |     return false
20 |   },
21 |   _getData: function (parentElement) {
22 |     var dfd = jquery.Deferred()
23 |     var self = this
24 |     var elements = this.getDataElements(parentElement)
25 | 
26 |     var result = []
27 |     self.$(elements).each(function (k, element) {
28 |       var data = {}
29 | 
30 |       data[this.id] = self.$(element).attr(this.extractAttribute)
31 |       result.push(data)
32 |     }.bind(this))
33 | 
34 |     if (this.multiple === false && elements.length === 0) {
35 |       var data = {}
36 |       data[this.id + '-src'] = null
37 |       result.push(data)
38 |     }
39 |     dfd.resolve(result)
40 | 
41 |     return dfd.promise()
42 |   },
43 | 
44 |   getDataColumns: function () {
45 |     return [this.id]
46 |   },
47 | 
48 |   getFeatures: function () {
49 |     return ['multiple', 'extractAttribute', 'delay']
50 |   }
51 | }
52 | 
53 | module.exports = SelectorElementAttribute
54 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorElementScroll.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var SelectorElementScroll = {
 3 | 
 4 |   canReturnMultipleRecords: function () {
 5 |     return true
 6 |   },
 7 | 
 8 |   canHaveChildSelectors: function () {
 9 |     return true
10 |   },
11 | 
12 |   canHaveLocalChildSelectors: function () {
13 |     return true
14 |   },
15 | 
16 |   canCreateNewJobs: function () {
17 |     return false
18 |   },
19 |   willReturnElements: function () {
20 |     return true
21 |   },
22 |   scrollToBottom: function () {
23 |     var document = this.document
24 |     window.scrollTo(0, document.body.scrollHeight)
25 |   },
26 |   _getData: function (parentElement) {
27 |     var delay = parseInt(this.delay) || 0
28 |     var deferredResponse = jquery.Deferred()
29 |     var foundElements = []
30 | 
31 | 		// initially scroll down and wait
32 |     this.scrollToBottom()
33 |     var nextElementSelection = (new Date()).getTime() + delay
34 | 
35 | 		// infinitely scroll down and find all items
36 |     var interval = setInterval(function () {
37 |       var now = (new Date()).getTime()
38 | 			// sleep. wait when to extract next elements
39 |       if (now < nextElementSelection) {
40 |         return
41 |       }
42 | 
43 |       var elements = this.getDataElements(parentElement)
44 | 			// no new elements found
45 |       if (elements.length === foundElements.length) {
46 |         clearInterval(interval)
47 |         deferredResponse.resolve(this.$.makeArray(elements))
48 |       } else {
49 | 				// continue scrolling and add delay
50 |         foundElements = elements
51 |         this.scrollToBottom()
52 |         nextElementSelection = now + delay
53 |       }
54 |     }.bind(this), 50)
55 | 
56 |     return deferredResponse.promise()
57 |   },
58 | 
59 |   getDataColumns: function () {
60 |     return []
61 |   },
62 | 
63 |   getFeatures: function () {
64 |     return ['multiple', 'delay']
65 |   }
66 | }
67 | 
68 | module.exports = SelectorElementScroll
69 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorGoogMapID.js:
--------------------------------------------------------------------------------
 1 | const url = require('url')
 2 | const jquery = require('jquery-deferred')
 3 | const debug = require('debug')('web-scraper-headless:selector-goog-map-id')
 4 | 
 5 | var SelectorGoogMapID = {
 6 | 
 7 |   canReturnMultipleRecords: function () {
 8 |     return true
 9 |   },
10 | 
11 |   canHaveChildSelectors: function () {
12 |     return false
13 |   },
14 | 
15 |   canHaveLocalChildSelectors: function () {
16 |     return false
17 |   },
18 | 
19 |   canCreateNewJobs: function () {
20 |     return false
21 |   },
22 |   willReturnElements: function () {
23 |     return false
24 |   },
25 |   getMapID: function ($container) {
26 |     const $ = this.$
27 |     const mapSelector = this.getMapsSelector()
28 |     const mUrl = $($container).find(mapSelector).attr('src')
29 |     if (!mUrl) {
30 |       debug('Goog map url was undefined')
31 |       return ''
32 |     }
33 |     const mQuery = url.parse(mUrl, true).query
34 |     const pb = mQuery ? mQuery.pb : null
35 |     if (!pb) {
36 |       debug('Pb in query was undefined in url', url)
37 |       return ''
38 |     }
39 |     const match = pb.match(/0x[0-9a-f]{15,16}:0x[0-9a-f]{15,16}/)
40 |     if (!match) {
41 |       debug('Could not find fid in pb', pb)
42 |       return ''
43 |     }
44 |     return match[0]
45 |   },
46 |   _getData: function (parentElement) {
47 |     var dfd = jquery.Deferred()
48 |     var $ = this.$
49 | 
50 |     // easier to select divs containing the iframe
51 |     var containers = this.getDataElements(parentElement)
52 |     const result = []
53 |     var selector = this
54 |     $(containers).each(function (k, container) {
55 |       const mapId = selector.getMapID($(container))
56 |       result.push({[selector.id + '_FTID']: mapId})
57 |     })
58 | 
59 |     dfd.resolve(result)
60 |     return dfd.promise()
61 |   },
62 | 
63 |   getDataColumns: function () {
64 |     return [this.id + '_FTID', this.id + '_PID', this.id + '_CID']
65 |   },
66 | 
67 |   getFeatures: function () {
68 |     return ['mapsSelectorFromDiv']
69 |   },
70 | 
71 |   getItemCSSSelector: function () {
72 |     // We get the container
73 |     return '*:not(div.overlay)'
74 |   },
75 | 
76 |   getMapsSelectorFromDivHTML: function (html, options = {}) {
77 |     const $ = options.$ || this.$
78 |     const div = $(html)
79 |     const defaultSelector = 'iframe[src*="google.com/maps/embed"]'
80 |     if (div.find(defaultSelector).length) {
81 |       return defaultSelector
82 |     }
83 |     return ''
84 |   },
85 | 
86 |   getMapsSelector: function () {
87 |     if (this.mapsSelectorFromDiv === undefined) {
88 |       return 'iframe[src*="google.com/maps/embed"]'
89 |     } else {
90 |       return this.mapsSelectorFromDiv
91 |     }
92 |   }
93 | }
94 | 
95 | module.exports = SelectorGoogMapID
96 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorGroup.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var SelectorGroup = {
 3 | 
 4 |   canReturnMultipleRecords: function () {
 5 |     return false
 6 |   },
 7 | 
 8 |   canHaveChildSelectors: function () {
 9 |     return false
10 |   },
11 | 
12 |   canHaveLocalChildSelectors: function () {
13 |     return false
14 |   },
15 | 
16 |   canCreateNewJobs: function () {
17 |     return false
18 |   },
19 |   willReturnElements: function () {
20 |     return false
21 |   },
22 |   _getData: function (parentElement) {
23 |     var dfd = jquery.Deferred()
24 |     var self = this
25 | 		// cannot reuse this.getDataElements because it depends on *multiple* property
26 |     var elements = self.$(this.selector, parentElement)
27 | 
28 |     var records = []
29 |     self.$(elements).each(function (k, element) {
30 |       var data = {}
31 | 
32 |       data[this.id] = self.$(element).text()
33 | 
34 |       if (this.extractAttribute) {
35 |         data[this.id + '-' + this.extractAttribute] = self.$(element).attr(this.extractAttribute)
36 |       }
37 | 
38 |       records.push(data)
39 |     }.bind(this))
40 | 
41 |     var result = {}
42 |     result[this.id] = records
43 | 
44 |     dfd.resolve([result])
45 |     return dfd.promise()
46 |   },
47 | 
48 |   getDataColumns: function () {
49 |     return [this.id]
50 |   },
51 | 
52 |   getFeatures: function () {
53 |     return ['delay', 'extractAttribute']
54 |   }
55 | }
56 | 
57 | module.exports = SelectorGroup
58 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorHTML.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var SelectorHTML = {
 3 | 
 4 |   canReturnMultipleRecords: function () {
 5 |     return true
 6 |   },
 7 | 
 8 |   canHaveChildSelectors: function () {
 9 |     return false
10 |   },
11 | 
12 |   canHaveLocalChildSelectors: function () {
13 |     return false
14 |   },
15 | 
16 |   canCreateNewJobs: function () {
17 |     return false
18 |   },
19 |   willReturnElements: function () {
20 |     return false
21 |   },
22 |   _getData: function (parentElement) {
23 |     var dfd = jquery.Deferred()
24 |     var self = this
25 |     var elements = this.getDataElements(parentElement)
26 | 
27 |     var result = []
28 |     self.$(elements).each(function (k, element) {
29 |       var data = {}
30 |       var html = self.$(element).html()
31 | 
32 |       if (this.regex !== undefined && this.regex.length) {
33 |         var matches = html.match(new RegExp(this.regex))
34 |         if (matches !== null) {
35 |           html = matches[0]
36 |         } else {
37 |           html = null
38 |         }
39 |       }
40 |       data[this.id] = html
41 | 
42 |       result.push(data)
43 |     }.bind(this))
44 | 
45 |     if (this.multiple === false && elements.length === 0) {
46 |       var data = {}
47 |       data[this.id] = null
48 |       result.push(data)
49 |     }
50 | 
51 |     dfd.resolve(result)
52 |     return dfd.promise()
53 |   },
54 | 
55 |   getDataColumns: function () {
56 |     return [this.id]
57 |   },
58 | 
59 |   getFeatures: function () {
60 |     return ['multiple', 'regex', 'delay']
61 |   }
62 | }
63 | 
64 | module.exports = SelectorHTML
65 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorImage.js:
--------------------------------------------------------------------------------
  1 | var jquery = require('jquery-deferred')
  2 | var whenCallSequentially = require('../../assets/jquery.whencallsequentially')
  3 | var Base64 = require('../../assets/base64')
  4 | var SelectorImage = {
  5 |   canReturnMultipleRecords: function () {
  6 |     return true
  7 |   },
  8 | 
  9 |   canHaveChildSelectors: function () {
 10 |     return false
 11 |   },
 12 | 
 13 |   canHaveLocalChildSelectors: function () {
 14 |     return false
 15 |   },
 16 | 
 17 |   canCreateNewJobs: function () {
 18 |     return false
 19 |   },
 20 |   willReturnElements: function () {
 21 |     return false
 22 |   },
 23 |   _getData: function (parentElement) {
 24 |     var dfd = jquery.Deferred()
 25 | 
 26 |     var elements = this.getDataElements(parentElement)
 27 | 
 28 |     var deferredDataCalls = []
 29 |     this.$(elements).each(function (i, element) {
 30 |       deferredDataCalls.push(function () {
 31 |         var deferredData = jquery.Deferred()
 32 | 
 33 |         var data = {}
 34 |         data[this.id + '-src'] = element.src
 35 | 
 36 | 				// download image if required
 37 |         if (!this.downloadImage) {
 38 |           deferredData.resolve(data)
 39 |         } else {
 40 |           var deferredImageBase64 = this.downloadImageBase64(element.src)
 41 | 
 42 |           deferredImageBase64.done(function (imageResponse) {
 43 |             data['_imageBase64-' + this.id] = imageResponse.imageBase64
 44 |             data['_imageMimeType-' + this.id] = imageResponse.mimeType
 45 | 
 46 |             deferredData.resolve(data)
 47 |           }.bind(this)).fail(function () {
 48 | 						// failed to download image continue.
 49 | 						// @TODO handle errror
 50 |             deferredData.resolve(data)
 51 |           })
 52 |         }
 53 | 
 54 |         return deferredData.promise()
 55 |       }.bind(this))
 56 |     }.bind(this))
 57 | 
 58 |     whenCallSequentially(deferredDataCalls).done(function (dataResults) {
 59 |       if (this.multiple === false && elements.length === 0) {
 60 |         var data = {}
 61 |         data[this.id + '-src'] = null
 62 |         dataResults.push(data)
 63 |       }
 64 | 
 65 |       dfd.resolve(dataResults)
 66 |     })
 67 | 
 68 |     return dfd.promise()
 69 |   },
 70 | 
 71 |   downloadFileAsBlob: function (url) {
 72 |     var window = this.window
 73 |     var deferredResponse = jquery.Deferred()
 74 |     var xhr = new window.XMLHttpRequest()
 75 |     xhr.onreadystatechange = function () {
 76 |       if (this.readyState == 4) {
 77 |         if (this.status == 200) {
 78 |           var blob = this.response
 79 |           deferredResponse.resolve(blob)
 80 |         } else {
 81 |           deferredResponse.reject(xhr.statusText)
 82 |         }
 83 |       }
 84 |     }
 85 |     xhr.open('GET', url)
 86 |     xhr.responseType = 'blob'
 87 |     xhr.send()
 88 | 
 89 |     return deferredResponse.promise()
 90 |   },
 91 | 
 92 |   downloadImageBase64: function (url) {
 93 |     var deferredResponse = jquery.Deferred()
 94 |     var deferredDownload = this.downloadFileAsBlob(url)
 95 |     deferredDownload.done(function (blob) {
 96 |       var mimeType = blob.type
 97 |       var deferredBlob = Base64.blobToBase64(blob)
 98 |       deferredBlob.done(function (imageBase64) {
 99 |         deferredResponse.resolve({
100 |           mimeType: mimeType,
101 |           imageBase64: imageBase64
102 |         })
103 |       })
104 |     }).fail(deferredResponse.fail)
105 |     return deferredResponse.promise()
106 |   },
107 | 
108 |   getDataColumns: function () {
109 |     return [this.id + '-src']
110 |   },
111 | 
112 |   getFeatures: function () {
113 |     return ['multiple', 'delay', 'downloadImage']
114 |   },
115 | 
116 |   getItemCSSSelector: function () {
117 |     return 'img'
118 |   }
119 | }
120 | 
121 | module.exports = SelectorImage
122 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorLink.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var whenCallSequentially = require('../../assets/jquery.whencallsequentially')
 3 | 
 4 | var SelectorLink = {
 5 |   canReturnMultipleRecords: function () {
 6 |     return true
 7 |   },
 8 | 
 9 |   canHaveChildSelectors: function () {
10 |     return true
11 |   },
12 | 
13 |   canHaveLocalChildSelectors: function () {
14 |     return false
15 |   },
16 | 
17 |   canCreateNewJobs: function () {
18 |     return true
19 |   },
20 |   willReturnElements: function () {
21 |     return false
22 |   },
23 |   _getData: function (parentElement) {
24 |     var elements = this.getDataElements(parentElement)
25 |     var self = this
26 | 
27 |     var dfd = jquery.Deferred()
28 | 
29 | 		// return empty record if not multiple type and no elements found
30 |     if (this.multiple === false && elements.length === 0) {
31 |       var data = {}
32 |       data[this.id] = null
33 |       dfd.resolve([data])
34 |       return dfd
35 |     }
36 | 
37 | 		// extract links one by one
38 |     var deferredDataExtractionCalls = []
39 |     self.$(elements).each(function (k, element) {
40 |       deferredDataExtractionCalls.push(function (element) {
41 |         var deferredData = jquery.Deferred()
42 | 
43 |         var data = {}
44 |         data[this.id] = self.$(element).text()
45 |         data._followSelectorId = this.id
46 |         data[this.id + '-href'] = element.href
47 |         data._follow = element.href
48 |         deferredData.resolve(data)
49 | 
50 |         return deferredData
51 |       }.bind(this, element))
52 |     }.bind(this))
53 | 
54 |     whenCallSequentially(deferredDataExtractionCalls).done(function (responses) {
55 |       var result = []
56 |       responses.forEach(function (dataResult) {
57 |         result.push(dataResult)
58 |       })
59 |       dfd.resolve(result)
60 |     })
61 | 
62 |     return dfd.promise()
63 |   },
64 | 
65 |   getDataColumns: function () {
66 |     return [this.id, this.id + '-href']
67 |   },
68 | 
69 |   getFeatures: function () {
70 |     return ['multiple', 'delay']
71 |   },
72 | 
73 |   getItemCSSSelector: function () {
74 |     return 'a'
75 |   }
76 | }
77 | 
78 | module.exports = SelectorLink
79 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorPopupLink.js:
--------------------------------------------------------------------------------
  1 | var whenCallSequentially = require('../../assets/jquery.whencallsequentially')
  2 | var jquery = require('jquery-deferred')
  3 | var CssSelector = require('css-selector').CssSelector
  4 | const debug = require('debug')('web-scraper-headless:selector:selector-popup-link')
  5 | var SelectorPopupLink = {
  6 |   canReturnMultipleRecords: function () {
  7 |     return true
  8 |   },
  9 | 
 10 |   canHaveChildSelectors: function () {
 11 |     return true
 12 |   },
 13 | 
 14 |   canHaveLocalChildSelectors: function () {
 15 |     return false
 16 |   },
 17 | 
 18 |   canCreateNewJobs: function () {
 19 |     return true
 20 |   },
 21 |   willReturnElements: function () {
 22 |     return false
 23 |   },
 24 |   _getData: function (parentElement) {
 25 |     var $ = this.$
 26 | var document = this.document
 27 | var window = this.window
 28 |     var elements = this.getDataElements(parentElement)
 29 | 
 30 |     var dfd = jquery.Deferred()
 31 | 
 32 | 		// return empty record if not multiple type and no elements found
 33 |     if (this.multiple === false && elements.length === 0) {
 34 |       var data = {}
 35 |       data[this.id] = null
 36 |       dfd.resolve([data])
 37 |       return dfd
 38 |     }
 39 | 
 40 | 		// extract links one by one
 41 |     var deferredDataExtractionCalls = []
 42 |     $(elements).each(function (k, element) {
 43 |       deferredDataExtractionCalls.push(function (element) {
 44 |         var deferredData = jquery.Deferred()
 45 | 
 46 |         var data = {}
 47 |         data[this.id] = $(element).text()
 48 |         data._followSelectorId = this.id
 49 | 
 50 |         var deferredPopupURL = this.getPopupURL(element)
 51 |         deferredPopupURL.done(function (url) {
 52 |           data[this.id + '-href'] = url
 53 |           data._follow = url
 54 |           deferredData.resolve(data)
 55 |         }.bind(this))
 56 | 
 57 |         return deferredData
 58 |       }.bind(this, element))
 59 |     }.bind(this))
 60 | 
 61 |     whenCallSequentially(deferredDataExtractionCalls).done(function (responses) {
 62 |       var result = []
 63 |       responses.forEach(function (dataResult) {
 64 |         result.push(dataResult)
 65 |       })
 66 |       dfd.resolve(result)
 67 |     })
 68 | 
 69 |     return dfd.promise()
 70 |   },
 71 | 
 72 | 	/**
 73 | 	 * Gets an url from a window.open call by mocking the window.open function
 74 | 	 * @param element
 75 | 	 * @returns $.Deferred()
 76 | 	 */
 77 |   getPopupURL: function (element) {
 78 |     var $ = this.$
 79 |     var document = this.document
 80 |     var window = this.window
 81 |     // override window.open function. we need to execute this in page scope.
 82 | 		// we need to know how to find this element from page scope.
 83 |     var cs = new CssSelector({
 84 |       enableSmartTableSelector: false,
 85 |       parent: document.body,
 86 |       enableResultStripping: false
 87 |     })
 88 |     var cssSelector = cs.getCssSelector([element])
 89 |     debug(cssSelector)
 90 |     debug(document.body.querySelectorAll(cssSelector))
 91 | 		// this function will catch window.open call and place the requested url as the elements data attribute
 92 |     var script = document.createElement('script')
 93 |     script.type = 'text/javascript'
 94 |     debug(cssSelector)
 95 |     debug(document.querySelectorAll(cssSelector))
 96 |     var el = document.querySelectorAll(cssSelector)[0]
 97 | 
 98 |     const open = window.open
 99 |     window.open = function () {
100 |       var url = arguments[0]
101 |       el.dataset.webScraperExtractUrl = url
102 |       window.open = open
103 |     }
104 |     el.click()
105 | 
106 | 		// wait for url to be available
107 |     var deferredURL = jquery.Deferred()
108 |     var timeout = Math.abs(5000 / 30) // 5s timeout to generate an url for popup
109 |     var interval = setInterval(function () {
110 |       var url = $(element).data('web-scraper-extract-url')
111 |       if (url) {
112 |         deferredURL.resolve(url)
113 |         clearInterval(interval)
114 |         script.remove()
115 |       }
116 | 			// timeout popup opening
117 |       if (timeout-- <= 0) {
118 |         clearInterval(interval)
119 |         script.remove()
120 |       }
121 |     }, 30)
122 | 
123 |     return deferredURL.promise()
124 |   },
125 | 
126 |   getDataColumns: function () {
127 |     return [this.id, this.id + '-href']
128 |   },
129 | 
130 |   getFeatures: function () {
131 |     return ['multiple', 'delay']
132 |   },
133 | 
134 |   getItemCSSSelector: function () {
135 |     return '*'
136 |   }
137 | }
138 | 
139 | module.exports = SelectorPopupLink
140 | 


--------------------------------------------------------------------------------
/extension/scripts/Selector/SelectorText.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var SelectorText = {
 3 | 
 4 |   canReturnMultipleRecords: function () {
 5 |     return true
 6 |   },
 7 | 
 8 |   canHaveChildSelectors: function () {
 9 |     return false
10 |   },
11 | 
12 |   canHaveLocalChildSelectors: function () {
13 |     return false
14 |   },
15 | 
16 |   canCreateNewJobs: function () {
17 |     return false
18 |   },
19 |   willReturnElements: function () {
20 |     return false
21 |   },
22 |   _getData: function (parentElement) {
23 |     var $ = this.$
24 | var document = this.document
25 | var window = this.window
26 |     var dfd = jquery.Deferred()
27 | 
28 |     var elements = this.getDataElements(parentElement)
29 | 
30 |     var result = []
31 |     $(elements).each(function (k, element) {
32 |       var data = {}
33 | 
34 | 			// remove script, style tag contents from text results
35 |       var $element_clone = $(element).clone()
36 |       $element_clone.find('script, style').remove()
37 | 			// <br> replace br tags with newlines
38 |       $element_clone.find('br').after('\n')
39 | 
40 |       var text = $element_clone.text()
41 |       if (this.regex !== undefined && this.regex.length) {
42 |         var matches = text.match(new RegExp(this.regex))
43 |         if (matches !== null) {
44 |           text = matches[0]
45 |         } else {
46 |           text = null
47 |         }
48 |       }
49 |       data[this.id] = text
50 | 
51 |       result.push(data)
52 |     }.bind(this))
53 | 
54 |     if (this.multiple === false && elements.length === 0) {
55 |       var data = {}
56 |       data[this.id] = null
57 |       result.push(data)
58 |     }
59 | 
60 |     dfd.resolve(result)
61 |     return dfd.promise()
62 |   },
63 | 
64 |   getDataColumns: function () {
65 |     return [this.id]
66 |   },
67 | 
68 |   getFeatures: function () {
69 |     return ['multiple', 'regex', 'delay']
70 |   }
71 | }
72 | 
73 | module.exports = SelectorText
74 | 


--------------------------------------------------------------------------------
/extension/scripts/Selectors.js:
--------------------------------------------------------------------------------
 1 | var SelectorElement = require('./Selector/SelectorElement')
 2 | var SelectorElementAttribute = require('./Selector/SelectorElementAttribute')
 3 | var SelectorElementClick = require('./Selector/SelectorElementClick')
 4 | var SelectorElementScroll = require('./Selector/SelectorElementScroll')
 5 | var SelectorGroup = require('./Selector/SelectorGroup')
 6 | var SelectorHTML = require('./Selector/SelectorHTML')
 7 | var SelectorImage = require('./Selector/SelectorImage')
 8 | var SelectorLink = require('./Selector/SelectorLink')
 9 | var SelectorPopupLink = require('./Selector/SelectorPopupLink')
10 | var SelectorTable = require('./Selector/SelectorTable')
11 | var SelectorText = require('./Selector/SelectorText')
12 | var SelectorGoogMapID = require('./Selector/SelectorGoogMapID')
13 | module.exports = {
14 |   SelectorElement,
15 |   SelectorElementAttribute,
16 |   SelectorElementClick,
17 |   SelectorElementScroll,
18 |   SelectorGroup,
19 |   SelectorHTML,
20 |   SelectorImage,
21 |   SelectorLink,
22 |   SelectorPopupLink,
23 |   SelectorTable,
24 |   SelectorText,
25 |   SelectorGoogMapID
26 | }
27 | 


--------------------------------------------------------------------------------
/extension/scripts/Store.js:
--------------------------------------------------------------------------------
  1 | var Sitemap = require('./Sitemap')
  2 | const debug = require('debug')('web-scraper-headless:store')
  3 | var Store = function (config, options) {
  4 |   this.config = config
  5 |   this.$ = options.$
  6 | this.document = options.document
  7 | this.window = options.window
  8 |   if (!this.$) throw new Error('jquery required')
  9 | if (!this.document) throw new Error("Missing document")
 10 | if(!this.window)throw new Error("Missing window")
 11 |     // configure couchdb
 12 |   this.sitemapDb = new PouchDB(this.config.sitemapDb)
 13 | }
 14 | var StoreScrapeResultWriter = function (db) {
 15 |   this.db = db
 16 | }
 17 | 
 18 | StoreScrapeResultWriter.prototype = {
 19 |   writeDocs: function (docs, callback) {
 20 |     if (docs.length === 0) {
 21 |       callback()
 22 |     } else {
 23 |       this.db.bulkDocs({docs: docs}, function (err, response) {
 24 |         if (err !== null) {
 25 |           debug('Error while persisting scraped data to db', err)
 26 |         }
 27 |         callback()
 28 |       })
 29 |     }
 30 |   }
 31 | }
 32 | 
 33 | Store.prototype = {
 34 | 
 35 |   sanitizeSitemapDataDbName: function (dbName) {
 36 |     return 'sitemap-data-' + dbName.replace(/[^a-z0-9_\$\(\)\+\-/]/gi, '_')
 37 |   },
 38 |   getSitemapDataDbLocation: function (sitemapId) {
 39 |     var dbName = this.sanitizeSitemapDataDbName(sitemapId)
 40 |     return this.config.dataDb + dbName
 41 |   },
 42 |   getSitemapDataDb: function (sitemapId) {
 43 |     var dbLocation = this.getSitemapDataDbLocation(sitemapId)
 44 |     return new PouchDB(dbLocation)
 45 |   },
 46 | 
 47 | 	/**
 48 | 	 * creates or clears a sitemap db
 49 | 	 * @param {type} sitemapId
 50 | 	 * @returns {undefined}
 51 | 	 */
 52 |   initSitemapDataDb: function (sitemapId, callback) {
 53 |     var dbLocation = this.getSitemapDataDbLocation(sitemapId)
 54 |     var store = this
 55 | 
 56 |     PouchDB.destroy(dbLocation, function () {
 57 |       var db = store.getSitemapDataDb(sitemapId)
 58 |       var dbWriter = new StoreScrapeResultWriter(db)
 59 |       callback(dbWriter)
 60 |     })
 61 |   },
 62 | 
 63 |   createSitemap: function (sitemap, callback) {
 64 |     var sitemapJson = JSON.parse(JSON.stringify(sitemap))
 65 | 
 66 |     if (!sitemap._id) {
 67 |       debug('cannot save sitemap without an id', sitemap)
 68 |     }
 69 | 
 70 |     this.sitemapDb.put(sitemapJson, function (sitemap, err, response) {
 71 |             // @TODO handle err
 72 |       sitemap._rev = response.rev
 73 |       callback(sitemap)
 74 |     }.bind(this, sitemap))
 75 |   }, 
 76 |   saveSitemap: function (sitemap, callback) {
 77 |         // @TODO remove
 78 |     this.createSitemap(sitemap, callback)
 79 |   },
 80 |   deleteSitemap: function (sitemap, callback) {
 81 |     sitemap = JSON.parse(JSON.stringify(sitemap))
 82 | 
 83 |     this.sitemapDb.remove(sitemap, function (err, response) {
 84 |             // @TODO handle err
 85 | 
 86 | 			// delete sitemap data db
 87 |       var dbLocation = this.getSitemapDataDbLocation(sitemap._id)
 88 |       PouchDB.destroy(dbLocation, function () {
 89 |         callback()
 90 |       })
 91 |     }.bind(this))
 92 |   },
 93 |   getAllSitemaps: function (callback) {
 94 |     var $ = this.$
 95 | var document = this.document
 96 | var window = this.window
 97 |     this.sitemapDb.allDocs({include_docs: true}, function (err, response) {
 98 |       var sitemaps = []
 99 |       for (var i in response.rows) {
100 |         var sitemap = response.rows[i].doc
101 |         if (!chrome.extension) {
102 |           sitemap = new Sitemap(sitemap, {$, document, window})
103 |         }
104 | 
105 |         sitemaps.push(sitemap)
106 |       }
107 |       callback(sitemaps)
108 |     })
109 |   },
110 | 
111 |   getSitemapData: function (sitemap, callback) {
112 |     var db = this.getSitemapDataDb(sitemap._id)
113 |     db.allDocs({include_docs: true}, function (err, response) {
114 |       var responseData = []
115 |       for (var i in response.rows) {
116 |         var doc = response.rows[i].doc
117 |         responseData.push(doc)
118 |       }
119 |       callback(responseData)
120 |     })
121 |   },
122 | 	// @TODO make this call lighter
123 |   sitemapExists: function (sitemapId, callback) {
124 |     this.getAllSitemaps(function (sitemaps) {
125 |       var sitemapFound = false
126 |       for (var i in sitemaps) {
127 |         if (sitemaps[i]._id === sitemapId) {
128 |           sitemapFound = true
129 |         }
130 |       }
131 |       callback(sitemapFound)
132 |     })
133 |   }
134 | }
135 | 
136 | module.exports = Store
137 | 


--------------------------------------------------------------------------------
/extension/scripts/StoreDevtools.js:
--------------------------------------------------------------------------------
 1 | var Sitemap = require('./Sitemap')
 2 | 
 3 | /**
 4 |  * From devtools panel there is no possibility to execute XHR requests. So all requests to a remote CouchDb must be
 5 |  * handled through Background page. StoreDevtools is a simply a proxy store
 6 |  * @constructor
 7 |  */
 8 | var StoreDevtools = function (options) {
 9 |   this.$ = options.$
10 | this.document = options.document
11 | this.window = options.window
12 |   if (!this.$) throw new Error('jquery required')
13 | if (!this.document) throw new Error("Missing document")
14 | if(!this.window)throw new Error("Missing window")
15 | }
16 | 
StoreDevtools.prototype = {
  /**
   * Asks the background page to store the sitemap and copies the returned revision
   * onto the sitemap that was passed in.
   * @param {Object} sitemap
   * @param {Function} callback receives the original sitemap with its updated `_rev`
   */
  createSitemap: function (sitemap, callback) {
    const message = {
      createSitemap: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }
    const onStored = (storedSitemap) => {
      sitemap._rev = storedSitemap._rev
      callback(sitemap)
    }

    chrome.runtime.sendMessage(message, onStored)
  },

  /** Same as createSitemap. */
  saveSitemap: function (sitemap, callback) {
    this.createSitemap(sitemap, callback)
  },

  /**
   * Asks the background page to delete the sitemap.
   * @param {Object} sitemap
   * @param {Function} callback called without arguments when the background page answered
   */
  deleteSitemap: function (sitemap, callback) {
    const message = {
      deleteSitemap: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }

    chrome.runtime.sendMessage(message, () => {
      callback()
    })
  },

  /**
   * Fetches every sitemap from the background page and wraps each one in a Sitemap.
   * @param {Function} callback receives the array of Sitemap instances
   */
  getAllSitemaps: function (callback) {
    const context = {
      $: this.$,
      document: this.document,
      window: this.window
    }
    const message = {getAllSitemaps: true}

    chrome.runtime.sendMessage(message, (response) => {
      const sitemaps = []
      for (const key in response) {
        const {$, document, window} = context
        sitemaps.push(new Sitemap(response[key], {$, document, window}))
      }
      callback(sitemaps)
    })
  },

  /**
   * Fetches the scraped data of a sitemap from the background page.
   * @param {Object} sitemap
   * @param {Function} callback receives the response of the background page
   */
  getSitemapData: function (sitemap, callback) {
    const message = {
      getSitemapData: true,
      sitemap: JSON.parse(JSON.stringify(sitemap))
    }

    chrome.runtime.sendMessage(message, (response) => {
      callback(response)
    })
  },

  /**
   * Asks the background page whether a sitemap with this id exists.
   * @param {string} sitemapId
   * @param {Function} callback receives the response of the background page
   */
  sitemapExists: function (sitemapId, callback) {
    const message = {
      sitemapExists: true,
      sitemapId: sitemapId
    }

    chrome.runtime.sendMessage(message, (response) => {
      callback(response)
    })
  }
}
79 | 
80 | module.exports = StoreDevtools
81 | 


--------------------------------------------------------------------------------
/extension/scripts/UniqueElementList.js:
--------------------------------------------------------------------------------
 1 | var CssSelector = require('css-selector').CssSelector
 2 | // TODO get rid of jquery
 3 | 
 4 | /**
 5 |  * Only Elements unique will be added to this array
 6 |  * @constructor
 7 |  */
 8 | function UniqueElementList (clickElementUniquenessType, options) {
 9 |   var $ = options.$
10 |   var window = options.window
11 |   var document = options.document
12 | 
13 |   Object.defineProperty(this, '$', {
14 |     value: $,
15 |     enumerable: false
16 |   })
17 |   Object.defineProperty(this, 'window', {
18 |     value: window,
19 |     enumerable: false
20 |   })
21 |   Object.defineProperty(this, 'document', {
22 |     value: document,
23 |     enumerable: false
24 |   })
25 |   if (!this.$) throw new Error('jquery required')
26 |   if (!this.document) {
27 |     throw new Error("Missing document")
28 |   }
29 |   if(!this.window) throw new Error("Missing window")
30 |     this.clickElementUniquenessType = clickElementUniquenessType
31 |     this.addedElements = {}
32 |   }
33 | 
34 | UniqueElementList.prototype = []
35 | 
/**
 * Appends a deep clone of the element unless an element with the same unique id
 * was added before.
 *
 * @param {Element} element element to add
 * @returns {boolean} true when the element was added, false when it was a duplicate
 */
UniqueElementList.prototype.push = function (element) {
  var $ = this.$
  // computed once: building the id can involve cloning and serialising the element
  var elementUniqueId = this.getElementUniqueId(element)

  // own-property lookup, so ids such as "constructor" or "toString" do not count as added
  if (Object.prototype.hasOwnProperty.call(this.addedElements, elementUniqueId)) {
    return false
  }

  this.addedElements[elementUniqueId] = true
  Array.prototype.push.call(this, $(element).clone(true)[0])
  return true
}
49 | 
/**
 * Computes the id used to decide whether an element was already added. The id depends on
 * `this.clickElementUniquenessType`:
 *  - `uniqueText`: trimmed text of the element
 *  - `uniqueHTMLText`: outer HTML of the element, text included
 *  - `uniqueHTML`: outer HTML of the element with every text node removed
 *  - `uniqueCSSSelector`: CSS selector of the element relative to `body`
 *
 * @param {Element} element
 * @returns {string} unique id of the element
 * @throws {Error} when `clickElementUniquenessType` is none of the values above
 */
UniqueElementList.prototype.getElementUniqueId = function (element) {
  var $ = this.$
  var uniquenessType = this.clickElementUniquenessType

  if (uniquenessType === 'uniqueText') {
    return $(element).text().trim()
  }

  if (uniquenessType === 'uniqueHTMLText') {
    // wrap a clone so that .html() yields the outer HTML of the element
    return $("<div class='-web-scraper-should-not-be-visible'>").append($(element).eq(0).clone()).html()
  }

  if (uniquenessType === 'uniqueHTML') {
    // get element without text
    var $element = $(element).eq(0).clone()

    var removeText = function ($node) {
      $node.contents()
        .filter(function () {
          if (this.nodeType !== 3) {
            // descend into non-text nodes to strip their text as well
            removeText($(this))
          }
          return this.nodeType === 3 // Node.TEXT_NODE
        })
        .remove()
    }
    removeText($element)

    return $("<div class='-web-scraper-should-not-be-visible'>").append($element).html()
  }

  if (uniquenessType === 'uniqueCSSSelector') {
    var cs = new CssSelector({
      enableSmartTableSelector: false,
      parent: $('body')[0],
      enableResultStripping: false
    })
    return cs.getCssSelector([element])
  }

  throw new Error('Invalid clickElementUniquenessType ' + uniquenessType)
}
89 | 
90 | module.exports = UniqueElementList
91 | 
/**
 * Tells whether an element with the same unique id was already added.
 *
 * @param {Element} element
 * @returns {boolean}
 */
UniqueElementList.prototype.isAdded = function (element) {
  var elementUniqueId = this.getElementUniqueId(element)
  // own-property lookup: a plain `in` also matches inherited keys such as "constructor"
  // or "toString", which would wrongly report elements with that id as already added
  return Object.prototype.hasOwnProperty.call(this.addedElements, elementUniqueId)
}
97 | 


--------------------------------------------------------------------------------
/extension/scripts/WebJSDOMBrowser.js:
--------------------------------------------------------------------------------
  1 | // Basically runs JSDOM in a webworker
  2 | const work = require('webworkify')
  3 | const jsdomBrowserLoader = require('./JSDOMBrowserLoader')
  4 | var jqueryDeferred = require('jquery-deferred')
  5 | var whenCallSequentially = require('../assets/jquery.whencallsequentially')
  6 | const debug = require('debug')('web-scraper-headless:web-jsdom-browser')
  7 | const WebJSDOMBrowser = function (options) {
  8 |   this.pageLoadDelay = options.pageLoadDelay
  9 |   const promises = {}
 10 |   this.promises = promises
 11 | 
 12 |   this.worker = work(jsdomBrowserLoader)
 13 | 
 14 |   this.worker.addEventListener('message', function (ev) {
 15 |     const data = ev.data
 16 |     if (!data.UUID) {
 17 |       return console.error(data.err)
 18 |     }
 19 |     if (data.UUID && !promises[data.UUID]) {
 20 |       return console.error('Missing UUID', data.UUID)
 21 |     }
 22 |     if (data.err) {
 23 |       console.error(data.err)
 24 |       promises[data.UUID].reject(new Error(data.err))
 25 |       delete promises[data.UUID]
 26 |       return
 27 |     }
 28 |     promises[data.UUID].resolve(data.info)
 29 |     delete promises[data.UUID]
 30 |   })
 31 |   this.worker.postMessage({
 32 |     topic: 'init',
 33 |     UUID: 'init',
 34 |     options
 35 |   })
 36 |   promises.init = {
 37 |     resolve: function () {
 38 |       debug('successfully created')
 39 |     },
 40 |     reject: function (err) {
 41 |       console.error(err)
 42 |     }
 43 |   }
 44 | }
 45 | 
 46 | WebJSDOMBrowser.prototype = {
 47 |   loadUrl: function (url, callback) {
 48 |     const UUID = parseInt(Math.random() * 1000000).toString()
 49 |     let res, rej
 50 |     const promise = new Promise(function (resolve, reject) {
 51 |       res = resolve
 52 |       rej = reject
 53 |     })
 54 |     this.promises[UUID] = {resolve: res, reject: rej}
 55 |     this.worker.postMessage({
 56 |       topic: 'loadUrl',
 57 |       url,
 58 |       UUID
 59 |     })
 60 |     promise.then(function (info) {
 61 |       callback()
 62 |     }, function (err) {callback(err)})
 63 |   },
 64 |   saveImages: function (record, namingFunction) {
 65 |     var deferredResponse = jqueryDeferred.Deferred()
 66 |     var deferredImageStoreCalls = []
 67 |     var prefixLength = '_imageBase64-'.length
 68 |     for (var attr in record) {
 69 |       if (attr.substr(0, prefixLength) === '_imageBase64-') {
 70 |         throw new Error('Downloading images is not yet supported')
 71 |       }
 72 |     }
 73 |     whenCallSequentially(deferredImageStoreCalls).done(function () {
 74 |       deferredResponse.resolve()
 75 |     })
 76 | 
 77 |     return deferredResponse.promise()
 78 |   },
 79 |   fetchData: function (url, sitemap, parentSelectorId, callback, scope) {
 80 |     const UUID = parseInt(Math.random() * 1000000).toString()
 81 |     let res, rej
 82 |     const promise = new Promise(function (resolve, reject) {
 83 |       res = resolve
 84 |       rej = reject
 85 |     })
 86 |     this.promises[UUID] = {resolve: res, reject: rej}
 87 |     this.worker.postMessage({
 88 |       topic: 'fetchData',
 89 |       url,
 90 |       UUID,
 91 |       sitemap: JSON.parse(JSON.stringify(sitemap)),
 92 |       parentSelectorId
 93 |     })
 94 |     promise.then(function (info) {
 95 |       callback.call(scope, null, info.results)
 96 |     }, function (err) {
 97 |       callback(err)
 98 |     })
 99 |   },
100 |   close: function () {
101 |     debug('closing webjsdom browser')
102 |     if (this.worker) this.worker.terminate()
103 |     this.worker = null
104 |   }
105 | }
106 | 
107 | module.exports = WebJSDOMBrowser
108 | 


--------------------------------------------------------------------------------
/extension/scripts/getBackgroundScript.js:
--------------------------------------------------------------------------------
 1 | var jquery = require('jquery-deferred')
 2 | var BackgroundScript = require('./BackgroundScript')
 3 | /**
 4 |  * @param location	configure from where the content script is being accessed (ContentScript, BackgroundPage, DevTools)
 5 |  * @returns BackgroundScript
 6 |  */
 7 | var getBackgroundScript = function (location) {
 8 |   // Handle calls from different places
 9 |   if (location === 'BackgroundScript') {
10 |     return BackgroundScript
11 |   } else if (location === 'DevTools' || location === 'ContentScript') {
12 |     // if called within background script proxy calls to content script
13 |     var backgroundScript = {}
14 | 
15 |     Object.keys(BackgroundScript).forEach(function (attr) {
16 |       if (typeof BackgroundScript[attr] === 'function') {
17 |         backgroundScript[attr] = function (request) {
18 |           var reqToBackgroundScript = {
19 |             backgroundScriptCall: true,
20 |             fn: attr,
21 |             request: request
22 |           }
23 | 
24 |           var deferredResponse = jquery.Deferred()
25 | 
26 |           chrome.runtime.sendMessage(reqToBackgroundScript, function (response) {
27 |             deferredResponse.resolve(response)
28 |           })
29 | 
30 |           return deferredResponse
31 |         }
32 |       } else {
33 |         backgroundScript[attr] = BackgroundScript[attr]
34 |       }
35 |     })
36 | 
37 |     return backgroundScript
38 |   } else {
39 |     throw new Error('Invalid BackgroundScript initialization - ' + location)
40 |   }
41 | }
42 | 
43 | module.exports = getBackgroundScript
44 | 


--------------------------------------------------------------------------------
/extension/scripts/getContentScript.js:
--------------------------------------------------------------------------------
 1 | var getBackgroundScript = require('./getBackgroundScript')
 2 | var ContentScript = require('./ContentScript')
 3 | /**
 4 |  *
 5 |  * @param location	configure from where the content script is being accessed (ContentScript, BackgroundPage, DevTools)
 6 |  * @param options
 7 |  * @returns ContentScript
 8 |  */
 9 | var getContentScript = function (location) {
10 |   var contentScript
11 | 
12 |   // Handle calls from different places
13 |   if (location === 'ContentScript') {
14 |     contentScript = ContentScript
15 |     contentScript.backgroundScript = getBackgroundScript('ContentScript')
16 |     return contentScript
17 |   } else if (location === 'BackgroundScript' || location === 'DevTools') {
18 |     var backgroundScript = getBackgroundScript(location)
19 | 
20 |     // if called within background script proxy calls to content script
21 |     contentScript = {}
22 |     Object.keys(ContentScript).forEach(function (attr) {
23 |       if (typeof ContentScript[attr] === 'function') {
24 |         contentScript[attr] = function (request) {
25 |           var reqToContentScript = {
26 |             contentScriptCall: true,
27 |             fn: attr,
28 |             request: request
29 |           }
30 | 
31 |           return backgroundScript.executeContentScript(reqToContentScript)
32 |         }
33 |       } else {
34 |         contentScript[attr] = ContentScript[attr]
35 |       }
36 |     })
37 |     contentScript.backgroundScript = backgroundScript
38 |     return contentScript
39 |   } else {
40 |     throw new Error('Invalid ContentScript initialization - ' + location)
41 |   }
42 | }
43 | 
44 | module.exports = getContentScript
45 | 


--------------------------------------------------------------------------------
/gulpfile.js:
--------------------------------------------------------------------------------
  1 | const gulp = require('gulp')
  2 | const browserify = require('browserify')
  3 | const watchify = require('watchify')
  4 | const source = require('vinyl-source-stream')
  5 | const notify = require('gulp-notify')
  6 | const Server = require('karma').Server
  7 | const path = require('path')
  8 | const babelify = require('babelify')
  9 | const mocha = require('gulp-spawn-mocha')
 10 | // We do karma in gulp instead of npm because we need to recompute all the generated bundles that are loaded to the browser
 11 | const runTests = (function () {
 12 |   let builds = 0
 13 |   return function (done = function () {}) {
 14 |     builds++
 15 |     // One build per bundle
 16 |     if (builds % 3 === 0) {
 17 |       runKarma(done)
 18 |       runNodeTests()
 19 |     }
 20 |   }
 21 | })()
 22 | 
 23 | function runKarma (done) {
 24 |   const server = new Server({
 25 |     configFile: path.join(__dirname, 'karma.conf.js'),
 26 |     singleRun: true
 27 |   }, done)
 28 |   server.start()
 29 | }
 30 | 
 31 | function runNodeTests () {
 32 |   return gulp.src([
 33 |     'tests/jsdomSpec.js',
 34 |     'tests/spec/*Spec.js',
 35 |     'tests/spec/Selector/*Spec.js',
 36 |     'tests/spec/jsdom/*Spec.js',
 37 |     'tests/spec/headless/*Spec.js'
 38 |   ])
 39 |     .pipe(mocha({
 40 |       compilers: 'js:babel-register'
 41 |     }).on('error', console.error))
 42 | }
 43 | 
 44 | gulp.task('build:watch', () => generateBuilder(true, true))
 45 | gulp.task('build', () => generateBuilder(false, false))
 46 | 
 47 | gulp.task('default', ['build:watch'])
 48 | 
 49 | function generateBuilder (isWatch, debug) {
 50 |   const wrapper = isWatch ? watchify : (x) => x
 51 |   const bundlerBackground = wrapper(browserify({
 52 |     standalone: 'backgroundScraper',
 53 |     entries: [
 54 |       'extension/background_page/background_script.js'
 55 |     ],
 56 |     debug
 57 |   }))
 58 |   const bundlerScraper = wrapper(browserify({
 59 |     standalone: 'contentScraper',
 60 |     entries: [
 61 |       'extension/content_script/content_scraper_browser.js'
 62 |     ],
 63 |     debug
 64 |   }))
 65 |   const bundlerDevtools = wrapper(browserify({
 66 |     standalone: 'contentScraper',
 67 |     entries: [
 68 |       'extension/scripts/App.js'
 69 |     ],
 70 |     debug
 71 |   }))
 72 | 
 73 |   setBundler(bundlerBackground, 'background-scraper.js')
 74 |   setBundler(bundlerScraper, 'content-scraper.js')
 75 |   setBundler(bundlerDevtools, 'devtools-scraper.js')
 76 |   function gulpBundle (bundler, file) {
 77 |     bundler.bundle()
 78 |       .on('error', function (err) {
 79 |         return notify().write(err)
 80 |       })
 81 |       .pipe(source(file))
 82 |       .pipe(gulp.dest('extension/generated/'))
 83 |       .on('error', function (e) {
 84 |         console.error(e)
 85 |       })
 86 |       .on('end', function () {
 87 |         runTests()
 88 |         console.log('finished bundling')
 89 |         // TODO launch tests
 90 |       })
 91 |   }
 92 | 
 93 |   function setBundler (bundler, file) {
 94 |     bundler
 95 |       .transform(babelify, {})
 96 |       .on('update', function () {
 97 |         gulpBundle(bundler, file)
 98 |       })
 99 |       .on('error', function (err) {
100 |         return notify().write(err)
101 |       })
102 |       .on('log', function (log) {
103 |         console.log(log)
104 |       })
105 |     return gulpBundle(bundler, file)
106 |   }
107 | }
108 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | const Queue = require('./extension/scripts/Queue')
 2 | const Sitemap = require('./extension/scripts/Sitemap')
 3 | const InMemoryStore = require('./extension/scripts/InMemoryStore')
 4 | const Scraper = require('./extension/scripts/Scraper')
 5 | const debug = require('debug')('web-scraper-headless:index')
 6 | const JSDOMBrowser = require('./extension/scripts/JSDOMBrowser')
 7 | const ChromeHeadlessBrowser = require('./extension/scripts/ChromeHeadlessBrowser')
/**
 * Scrapes a site following a sitemap. Thin public wrapper around `scrape`.
 *
 * @param sitemap sitemap definition; it is wrapped in a Sitemap before scraping
 * @param options
 * @param options.browser jsdom|headless; jsdom is used when omitted or unknown
 * @param options.pageLoadDelay passed to the browser, falls back to 2000 (presumably milliseconds)
 * @param options.delay passed to the scraper, falls back to 500 (presumably milliseconds)
 * @return {*} promise resolved with the data collected in the in-memory store, rejected with
 *   the error reported by the scraper
 */
module.exports = function (sitemap, options) {
  return scrape(sitemap, options)
}
20 | 
/**
 * Runs a scraper over the sitemap and collects the results in memory.
 *
 * @param {Object} sitemapInfo sitemap definition
 * @param {Object} [options]
 * @param {string} [options.browser] 'jsdom' or 'headless'; anything else selects jsdom
 * @param {number} [options.pageLoadDelay] passed to the browser; 2000 when not given
 * @param {number} [options.delay] passed to the scraper; 500 when not given
 * @returns {Promise} resolved with `store.data`, rejected with the error reported by the scraper
 */
function scrape (sitemapInfo, options = {}) {
  // Only a missing value falls back to the default, so an explicit 0 is honoured.
  const withDefault = function (value, fallback) {
    return value === undefined || value === null ? fallback : value
  }

  return new Promise(function (resolve, reject) {
    // sitemap is created twice, once in node another in the browser context.
    // In node we don't actually need these variables.
    const fakeWindow = {}
    const fakeDocument = {}
    const fake$ = {}
    const q = new Queue()
    const store = new InMemoryStore()
    const sitemap = new Sitemap(sitemapInfo, {$: fake$, document: fakeDocument, window: fakeWindow})

    let BrowserConstructor
    switch (options.browser) {
      case 'jsdom':
        BrowserConstructor = JSDOMBrowser
        debug('Jsdom browser selected')
        break
      case 'headless':
        BrowserConstructor = ChromeHeadlessBrowser
        debug('Chrome headless browser selected')
        break
      default:
        debug('No browser requested so jsdom was selected as default')
        BrowserConstructor = JSDOMBrowser
    }
    const browser = new BrowserConstructor({
      pageLoadDelay: withDefault(options.pageLoadDelay, 2000)
    })
    const s = new Scraper({
      queue: q,
      sitemap,
      browser,
      store,
      delay: withDefault(options.delay, 500)
    }, {})
    s.run(function (err) {
      if (err) {
        reject(err)
      } else {
        resolve(store.data)
      }
    })
  })
}
65 | 


--------------------------------------------------------------------------------
/karma.conf.js:
--------------------------------------------------------------------------------
 1 | const files = ['tests/browserSpec.js', 'tests/spec/*.js', 'tests/spec/browser/*.js', 'tests/spec/Selector/*Spec.js']
 2 | const _ = require('lodash')
/**
 * Karma configuration: runs the browser specs listed in `files` with mocha, bundled through
 * browserify/babelify, in headless Chrome.
 * @param config karma configuration object, filled through `config.set`
 */
module.exports = function (config) {
  config.set({

    // base path that will be used to resolve all patterns (eg. files, exclude)
    basePath: '',

    // frameworks to use
    // available frameworks: https://npmjs.org/browse/keyword/karma-adapter
    frameworks: ['browserify', 'mocha'],

    // every spec pattern is preprocessed with browserify
    preprocessors: _.mapValues(_.keyBy(files), () => ['browserify']),
    // list of files / patterns to load in the browser
    files: [
      'extension/assets/sugar-1.4.1.js',
      'extension/assets/pouchdb-nightly.min.js',
      'tests/ChromeAPI.js',
      'extension/generated/background-scraper.js', // not very nice, we need to load the background script to listen to the messages
      'extension/generated/content-scraper.js',
      'extension/content_script/content_script.js',
      'docs/images/chrome-store-logo.png',
      ...files
    ],
    // NOTE(review): ChromeOutOfFocus is declared here but `browsers` below only lists ChromeHeadless
    customLaunchers: {
      ChromeOutOfFocus: {
        base: 'Chrome',
        flags: ['--window-size=300,300']
      }
    },
    browserify: {
      debug: true,
      transform: [
        ['babelify', {ignore: /\/node_modules\//}]
      ]
    },

    // list of files to exclude
    exclude: [
    ],
    // test results reporter to use
    // possible values: 'dots', 'progress'
    // available reporters: https://npmjs.org/browse/keyword/karma-reporter
    reporters: ['dots'],

    // web server port
    port: 9876,

    // enable / disable colors in the output (reporters and logs)
    colors: true,

    // level of logging
    // possible values: config.LOG_DISABLE || config.LOG_ERROR || config.LOG_WARN || config.LOG_INFO || config.LOG_DEBUG
    logLevel: config.LOG_INFO,

    // browser console output is forwarded to the terminal, from level 'error'
    browserConsoleLogOptions: {
      terminal: true,
      level: 'error'
    },
    // start these browsers
    // available browser launchers: https://npmjs.org/browse/keyword/karma-launcher
    browsers: ['ChromeHeadless'],

    // Concurrency level
    // how many browsers may be started simultaneously
    concurrency: Infinity,
    browserNoActivityTimeout: 50000000,
    plugins: [
      'karma-mocha',
      'karma-browserify',
      'karma-chrome-launcher'
    ]
  })
}
75 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "web-scraper-headless",
 3 |   "version": "1.0.7",
 4 |   "description": "Web Scraper Headless allows to extract data from web pages using plans (sitemaps) created with the Web Scraper browser extension. Using these sitemaps the  Web Scraper will navigate the site accordingly and extract all data. Scraped  data later can be exported as CSV.",
 5 |   "main": "index.js",
 6 |   "directories": {
 7 |     "doc": "docs",
 8 |     "test": "tests"
 9 |   },
10 |   "watch": {
11 |     "generate": {
12 |       "patterns": [
13 |         "extension"
14 |       ],
15 |       "ignore": "extension/generated",
16 |       "extensions": "js"
17 |     }
18 |   },
19 |   "standard": {
20 |     "ignore": [
21 |       "extension/generated",
22 |       "extension/assets"
23 |     ],
24 |     "globals": [
25 |       "d3",
26 |       "chrome",
27 |       "describe",
28 |       "it",
29 |       "beforeEach",
30 |       "afterEach",
31 |       "after",
32 |       "before"
33 |     ]
34 |   },
35 |   "scripts": {
36 |     "build": "gulp build",
37 |     "test-watch": "gulp"
38 |   },
39 |   "repository": {
40 |     "type": "git",
41 |     "url": "git+https://github.com/geoblink/web-scraper-chrome-extension.git"
42 |   },
43 |   "author": "",
44 |   "license": "LGPL-3.0",
45 |   "bugs": {
46 |     "url": "https://github.com/geoblink/web-scraper-chrome-extension/issues"
47 |   },
48 |   "homepage": "https://github.com/geoblink/web-scraper-chrome-extension#readme",
49 |   "devDependencies": {
50 |     "babel-plugin-meaningful-logs": "^1.0.2",
51 |     "babel-register": "^6.24.1",
52 |     "babelify": "^7.3.0",
53 |     "chai": "^3.5.0",
54 |     "chrome-remote-interface": "^0.18.0",
55 |     "gulp": "^3.9.1",
56 |     "gulp-notify": "^3.0.0",
57 |     "gulp-spawn-mocha": "^3.3.0",
58 |     "istanbul": "^0.4.5",
59 |     "jasmine-node": "^1.14.5",
60 |     "karma": "^1.6.0",
61 |     "karma-browserify": "^5.1.1",
62 |     "karma-chrome-launcher": "^2.0.0",
63 |     "karma-mocha": "^1.3.0",
64 |     "mocha": "^3.2.0",
65 |     "npm-watch": "^0.1.8",
66 |     "sinon": "^7.4.2",
67 |     "standard": "^9.0.2",
68 |     "vinyl-buffer": "^1.0.0",
69 |     "vinyl-source-stream": "^1.1.0",
70 |     "watchify": "^3.9.0",
71 |     "webworkify": "^1.4.0"
72 |   },
73 |   "dependencies": {
74 |     "browserify": "^16.1.0",
75 |     "css-selector": "git://github.com/furstenheim/css-selector.git#b50eb6befc4129ac56e91efba3dd1e233bb67202",
76 |     "debug": "^3.1.0",
77 |     "jquery": "^3.2.1",
78 |     "jquery-deferred": "^0.3.1",
79 |     "jsdom": "^10.1.0",
80 |     "puppeteer": "1.5.0"
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/playgrounds/extension/webpage.css:
--------------------------------------------------------------------------------
/* Scrollable, bordered box around the element with id "webpage". */
#webpage {
	height:400px;
	border-radius: 5px;
	border:3px #ccc solid;
	margin:10px;
	overflow-y:scroll;
}

/* Base font size inside the box. */
#webpage {
	font-size: 14px;
}

/* Vertical padding of the navbar links inside the box. */
#webpage .navbar-nav > li > a {
	padding-top: 15px;
	padding-bottom: 15px;
}
17 | 


--------------------------------------------------------------------------------
/playgrounds/sitemap-tree/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 | 	<link type="text/css" rel="stylesheet" href="style.css" />
 5 | 	<script src="../../extension/assets/d3.v3.js"></script>
 6 | 	<script src="../../extension/scripts/Sitemap.js"></script>
 7 | 	<script src="../../extension/scripts/Selector.js"></script>
 8 | 	<script src="../../extension/scripts/Selector/SelectorElement.js"></script>
 9 | 	<script src="../../extension/scripts/SelectorList.js"></script>
10 | 	<script src="../../extension/scripts/SelectorGraphv2.js"></script>
11 | </head>
12 | <body>
13 | <div id="body"></div>
14 | <script>
	// Loads sitemap.json and draws its selector graph into #body (1200 x 200).
	// NOTE(review): elsewhere in the repository Sitemap is constructed with a second
	// `{$, document, window}` argument - confirm this playground still works with one argument.
	d3.json("sitemap.json", function(json) {
		var sitemap = new Sitemap(json);
		var graph = new SelectorGraphv2(sitemap);
		graph.draw(document.getElementById("body"), 1200, 200);
	});
20 | </script>
21 | </body>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/playgrounds/sitemap-tree/sitemap.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"selectors":[
 3 | 		{
 4 | 			"id": "a",
 5 | 			"type": "SelectorElement",
 6 | 			"parentSelectors": ["_root", "d"]
 7 | 		},
 8 | 		{
 9 | 			"id": "b",
10 | 			"type": "SelectorElement",
11 | 			"parentSelectors": ["a"]
12 | 		},
13 | 		{
14 | 			"id": "c",
15 | 			"type": "SelectorElement",
16 | 			"parentSelectors": ["a"]
17 | 		},
18 | 		{
19 | 			"id": "d",
20 | 			"type": "SelectorElement",
21 | 			"parentSelectors": ["a"]
22 | 		}
23 | 	]
24 | }


--------------------------------------------------------------------------------
/playgrounds/sitemap-tree/style.css:
--------------------------------------------------------------------------------
/* Circles of the tree nodes: white fill, steel blue outline, clickable cursor. */
.node circle {
	cursor: pointer;
	fill: #fff;
	stroke: steelblue;
	stroke-width: 1.5px;
}

/* Labels of the tree nodes. */
.node text {
	font-size: 11px;
}

/* Links between the nodes: unfilled grey paths. */
path.link {
	fill: none;
	stroke: #ccc;
	stroke-width: 1.5px;
}


--------------------------------------------------------------------------------
/tests/FakeStore.js:
--------------------------------------------------------------------------------
 1 | 
 2 | var FakeStore = function () {
 3 |   this.data = []
 4 | }
 5 | 
 6 | FakeStore.prototype = {
 7 | 
 8 |   writeDocs: function (data, callback) {
 9 |     data.forEach(function (data) {
10 |       this.data.push(data)
11 |     }.bind(this))
12 |     callback()
13 |   },
14 | 
15 |   initSitemapDataDb: function (sitemapId, callback) {
16 |     callback(this)
17 |   }
18 | }
19 | 
20 | module.exports = FakeStore
21 | 


--------------------------------------------------------------------------------
/tests/Matchers.js:
--------------------------------------------------------------------------------
 1 | const assert = require('chai').assert
 2 | var getSelectorIds = function (selectors) {
 3 |   var ids = []
 4 |   selectors.forEach(function (selector) {
 5 |     ids.push(selector.id)
 6 |   })
 7 |   return ids
 8 | }
 9 | 
/**
 * Comparator for Array.prototype.sort that orders selectors by their `id`.
 * @returns {number} 0 when the ids are strictly equal, 1 when `a.id` is
 *   greater than `b.id`, and -1 in every other case
 */
var selectorListSorter = function (a, b) {
  const left = a.id
  const right = b.id
  if (left === right) return 0
  return left > right ? 1 : -1
}
19 | 
/**
 * Assertion helpers shared by the specs. Every helper returns a promise that
 * rejects with the assertion error when the check fails.
 *
 * None of the helpers reorder the arrays they are given: comparisons are
 * made on sorted copies.
 */
var selectorMatchers = {
  /**
   * Asserts that the ids of the `actual` selectors equal `expectedIds`,
   * ignoring order.
   * @param {Array} actual - selectors exposing an `id`
   * @param {Array} expectedIds - ids that must be present
   */
  matchSelectors: async function (actual, expectedIds) {
    var sortedExpectedIds = Array.from(expectedIds).sort()
    var sortedActualIds = getSelectorIds(actual).sort()

    assert.deepEqual(sortedActualIds, sortedExpectedIds)
  },
  /**
   * Asserts that both lists have the same length and, once ordered by id,
   * the same id at every position.
   * @param {Array} actual - selectors under test
   * @param {Array} expectedSelectors - selectors to compare against
   */
  matchSelectorList: async function (actual, expectedSelectors) {
    assert.equal(actual.length, expectedSelectors.length)
    var sortedExpected = Array.from(expectedSelectors).sort(selectorListSorter)
    var sortedActual = Array.from(actual).sort(selectorListSorter)

    sortedExpected.forEach(function (expectedSelector, i) {
      assert.equal(sortedActual[i].id, expectedSelector.id)
    })
  },
  // @REFACTOR use match selector list
  /**
   * Asserts that `actual` holds as many trees as `expectedSelectorTrees` and
   * that each tree passes `matchSelectors` against the id list at the same index.
   * @param {Array<Array>} actual - selector trees under test
   * @param {Array<Array>} expectedSelectorTrees - one id list per tree
   */
  matchSelectorTrees: async function (actual, expectedSelectorTrees) {
    assert.equal(actual.length, expectedSelectorTrees.length)

    for (var i = 0; i < expectedSelectorTrees.length; i++) {
      await selectorMatchers.matchSelectors(actual[i], expectedSelectorTrees[i])
    }
  },
  /**
   * Asserts that the thenable `actual` fulfils with a value deeply equal to
   * `expectedData`.
   * @returns the thenable produced by `actual.then(...)`
   */
  deferredToEqual: function (actual, expectedData) {
    return actual.then(function (d) {
      assert.deepEqual(d, expectedData)
    })
  },
  /**
   * Asserts that the thenable `actual` rejects. The rejection reason is not
   * inspected; fulfilment is reported as `Error('Promise not rejected')`.
   */
  deferredToFail: async function (actual) {
    var rejected = false

    try {
      await actual
    } catch (e) {
      // A rejection is the outcome this matcher is waiting for.
      rejected = true
    }

    if (!rejected) {
      throw new Error('Promise not rejected')
    }
  }
}
66 | 
67 | module.exports = selectorMatchers
68 | 


--------------------------------------------------------------------------------
/tests/browserSpec.js:
--------------------------------------------------------------------------------
 1 | const globals = require('./globals')
 2 | const $ = require('jquery')
 3 | const ChromePopupBrowser = require('../extension/scripts/ChromePopupBrowser')
 4 | beforeEach(function () {
 5 |   globals.window = window
 6 |   globals.document = document
 7 |   globals.$ = $
 8 |   globals.Browser = ChromePopupBrowser
 9 |   window.chromeAPI.reset()
10 | 
11 |   window.addEventListener('unhandledrejection', function (err, promise) {
12 |     console.error('Unhandled error', err.reason)
13 |   })
14 | })
15 | 


--------------------------------------------------------------------------------
/tests/globals.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | }
3 | 


--------------------------------------------------------------------------------
/tests/jsdomSpec.js:
--------------------------------------------------------------------------------
 1 | const globals = require('./globals')
 2 | const jsdom = require('jsdom')
 3 | const jQuery = require('jquery')
 4 | const Browser = require('./../extension/scripts/JSDOMBrowser')
 5 | beforeEach(function () {
 6 |   const {JSDOM} = jsdom
 7 |   const dom = new JSDOM()
 8 |   const $ = jQuery(dom.window)
 9 |   const window = dom.window
10 |   const document = window.document
11 |   globals.document = dom.window.document
12 |   globals.window = dom.window
13 |   globals.$ = $
14 |   globals.Browser = Browser
15 |   Browser.prototype.loadUrl = function (url, callback) {
16 |     callback(null, {$, document, window})
17 |   }
18 | })
19 | process.on('unhandledRejection', function (err) {
20 |   console.error(err)
21 | })
22 | 


--------------------------------------------------------------------------------
/tests/spec/ElementQuerySpec.js:
--------------------------------------------------------------------------------
 1 | const ElementQuery = require('../../extension/scripts/ElementQuery')
 2 | const assert = require('chai').assert
 3 | const utils = require('./../utils')
 4 | const globals = require('../globals')
 5 | 
 6 | describe('ElementQuery', function () {
 7 |   var $el
 8 |   let $
 9 |   let document
10 |   let window
11 |   beforeEach(function () {
12 |     $ = globals.$
13 |     document = globals.document
14 |     window = globals.window
15 | 
16 |     document.body.innerHTML = utils.getTestHTML()
17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
18 |     document.body.appendChild($el)
19 |   })
20 | 
21 |   it('should be able to select elements', function () {
22 |     $el.innerHTML = '<a></a><span></span>'
23 | 
24 |     var selectedElements = ElementQuery('a, span', $el, {$, document, window})
25 |     var expectedElements = Array.from($el.querySelectorAll('a, span'))
26 | 
27 |     assert.deepEqual(selectedElements.sort(), expectedElements)
28 |   })
29 | 
30 |   it('should be able to select parent', function () {
31 |     $el.innerHTML = '<a></a><span></span>'
32 | 
33 |     var selectedElements = ElementQuery('a, span, _parent_', $el, {$, document, window})
34 |     var expectedElements = Array.from($el.querySelectorAll('a, span'))
35 |     expectedElements.push($el)
36 | 
37 |     assert.deepEqual(selectedElements.sort(), expectedElements.sort())
38 |   })
39 | 
40 |   it('should should not return duplicates', function () {
41 |     $el.innerHTML = '<a></a><span></span>'
42 | 
43 |     var selectedElements = ElementQuery('*, a, span, _parent_', $el, {$, document, window})
44 |     var expectedElements = Array.from($el.querySelectorAll('a, span'))
45 |     expectedElements.push($el)
46 | 
47 |     assert.deepEqual(selectedElements.length, 3)
48 |     assert.deepEqual(selectedElements.sort(), expectedElements.sort())
49 |   })
50 | 
51 |   it('should be able to select parent when parent there are multiple parents', function () {
52 |     $el.innerHTML = '<span></span><span></span>'
53 | 
54 |     var selectedElements = ElementQuery('_parent_', $el.querySelectorAll('span'), {$, document, window})
55 |     var expectedElements = Array.from($el.querySelectorAll('span'))
56 | 
57 |     assert.deepEqual(selectedElements.length, 2)
58 |     assert.deepEqual(selectedElements.sort(), expectedElements)
59 |   })
60 | 
61 |   it('should be able to select element with a comma ,', function () {
62 |     $el.innerHTML = '<span>,</span>'
63 | 
64 |     var selectedElements = ElementQuery(":contains(',')", $el, {$, document, window})
65 |     var expectedElements = Array.from($el.querySelectorAll('span'))
66 | 
67 |     assert.deepEqual(selectedElements.length, 1)
68 |     assert.deepEqual(selectedElements.sort(), expectedElements.sort())
69 |   })
70 | 
71 |   it('should preserve spaces', function () {
72 |     var parts = ElementQuery.getSelectorParts('div.well li:nth-of-type(2) a')
73 |     assert.deepEqual(parts, ['div.well li:nth-of-type(2) a'])
74 |   })
75 | })
76 | 


--------------------------------------------------------------------------------
/tests/spec/JobSpec.js:
--------------------------------------------------------------------------------
 1 | const Job = require('./../../extension/scripts/Job')
 2 | const assert = require('chai').assert
 3 | 
 4 | describe('Job', function () {
 5 |   it('should be able to create correct url from parent job', function () {
 6 |     var parent = new Job('http://example.com/')
 7 |     var child = new Job('/test/', null, null, parent)
 8 |     assert.equal(child.url, 'http://example.com/test/')
 9 | 
10 |     parent = new Job('http://example.com')
11 |     child = new Job('test/', null, null, parent)
12 |     assert.equal(child.url, 'http://example.com/test/')
13 | 
14 |     parent = new Job('http://example.com/asdasdad')
15 |     child = new Job('tvnet.lv', null, null, parent)
16 |     assert.equal(child.url, 'http://tvnet.lv/')
17 | 
18 |     parent = new Job('http://example.com/asdasdad')
19 |     child = new Job('?test', null, null, parent)
20 |     assert.equal(child.url, 'http://example.com/asdasdad?test')
21 | 
22 |     parent = new Job('http://example.com/1/')
23 |     child = new Job('2/', null, null, parent)
24 |     assert.equal(child.url, 'http://example.com/1/2/')
25 | 
26 |     parent = new Job('http://127.0.0.1/1/')
27 |     child = new Job('2/', null, null, parent)
28 |     assert.equal(child.url, 'http://127.0.0.1/1/2/')
29 | 
30 |     parent = new Job('http://xn--80aaxitdbjk.xn--p1ai/')
31 |     child = new Job('2/', null, null, parent)
32 | 
33 |     assert.equal(child.url, 'http://xn--80aaxitdbjk.xn--p1ai/2/')
34 |   })
35 | 
36 |   it('should be able to create correct url from parent job with slashes after question mark', function () {
37 |     var parent = new Job('http://www.sportstoto.com.my/results_past.asp?date=5/1/1992')
38 |     var child = new Job('popup_past_results.asp?drawNo=418/92', null, null, parent)
39 |     assert.equal(child.url, 'http://www.sportstoto.com.my/popup_past_results.asp?drawNo=418/92')
40 |   })
41 | 
42 |   it('should be able to create correct url with a port number', function () {
43 |     var parent = new Job('http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021090940/http://volitve.gov.si/lv2010/kandidati/seznam_obcin.html')
44 |     var child = new Job('http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021091250/http://volitve.gov.si/lv2010/kandidati/zupani_os_celje.html', null, null, parent)
45 |     assert.equal(child.url, 'http://nukrobi2.nuk.uni-lj.si:8080/wayback/20101021091250/http://volitve.gov.si/lv2010/kandidati/zupani_os_celje.html')
46 | 
47 |     parent = new Job('http://nukrobi2.nuk.uni-lj.si:8080')
48 |     child = new Job('zupani_os_celje.html', null, null, parent)
49 |     assert.equal(child.url, 'http://nukrobi2.nuk.uni-lj.si:8080/zupani_os_celje.html')
50 |   })
51 | 
52 |   it('should not override data with base data if it already exists', function () {
53 |     var browser = {
54 |       fetchData: function (url, sitemap, parentSelector, callback) {
55 |         callback(null, [{a: 1, b: 2}])
56 |       }
57 |     }
58 | 
59 |     var job = new Job(undefined, undefined, {sitemap: undefined}, undefined, {a: 'do not override', c: 3})
60 |     job.execute(browser, function () {})
61 |     var results = job.getResults()
62 |     assert.deepEqual(results, [{a: 1, b: 2, c: 3}])
63 |   })
64 | })
65 | 


--------------------------------------------------------------------------------
/tests/spec/QueueSpec.js:
--------------------------------------------------------------------------------
 1 | const Queue = require('./../../extension/scripts/Queue')
 2 | const Job = require('./../../extension/scripts/Job')
 3 | const assert = require('chai').assert
 4 | 
 5 | describe('Queue', function () {
 6 |   var q
 7 |   var job
 8 | 
 9 |   beforeEach(function () {
10 |     q = new Queue()
11 |     job = new Job('http://test.lv/', {})
12 |   })
13 | 
14 |   it('should be able to add items to queue', function () {
15 |     q.add(job)
16 |     assert.equal(q.getQueueSize(), 1)
17 |     assert.equal(q.jobs[0].url, 'http://test.lv/')
18 |   })
19 | 
20 |   it('should be able to mark urls as scraped', function () {
21 |     q.add(job)
22 |     q.getNextJob()
23 |     assert.equal(q.getQueueSize(), 0)
24 | 
25 | 		// try to add this job again
26 |     q.add(job)
27 |     assert.equal(q.getQueueSize(), 0)
28 |   })
29 | 
30 |   it('should be able to reject documents', function () {
31 |     job = new Job('http://test.lv/test.doc')
32 | 
33 |     var accepted = q.add(job)
34 |     assert.isFalse(accepted)
35 |   })
36 | })
37 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorElementAttributeSpec.js:
--------------------------------------------------------------------------------
  1 | const Selector = require('../../../extension/scripts/Selector')
  2 | const utils = require('./../../utils')
  3 | const assert = require('chai').assert
  4 | const globals = require('../../globals')
  5 | 
  6 | describe('Element Attribute Selector', function () {
  7 |   var $el
  8 |   let $
  9 |   let document
 10 |   let window
 11 | 
 12 |   beforeEach(function () {
 13 |     $ = globals.$
 14 |     document = globals.document
 15 |     window = globals.window
 16 | 
 17 |     document.body.innerHTML = utils.getTestHTML()
 18 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
 19 |     document.body.appendChild($el)
 20 |   })
 21 | 
 22 |   it('should extract image src tag', function (done) {
 23 |     var selector = new Selector({
 24 |       id: 'img',
 25 |       type: 'SelectorElementAttribute',
 26 |       multiple: false,
 27 |       extractAttribute: 'src',
 28 |       selector: 'img'
 29 |     }, { $, document, window })
 30 | 
 31 |     var dataDeferred = selector.getData(document.querySelector('#selector-image-one-image'))
 32 |     dataDeferred.then(function (data) {
 33 |       assert.deepEqual(data, [
 34 |         {
 35 |           'img': 'http://aa/'
 36 |         }
 37 |       ])
 38 |       done()
 39 |     })
 40 |   })
 41 | 
 42 |   it('should extract multiple src tags', function (done) {
 43 |     var selector = new Selector({
 44 |       id: 'img',
 45 |       type: 'SelectorElementAttribute',
 46 |       multiple: true,
 47 |       extractAttribute: 'src',
 48 |       selector: 'img'
 49 |     }, { $, document, window })
 50 | 
 51 |     var dataDeferred = selector.getData(document.querySelector('#selector-image-multiple-images'))
 52 | 
 53 |     dataDeferred.then(function (data) {
 54 |       assert.deepEqual(data, [
 55 |         {
 56 |           'img': 'http://aa/'
 57 |         },
 58 |         {
 59 |           'img': 'http://bb/'
 60 |         }
 61 |       ])
 62 |       done()
 63 |     })
 64 |   })
 65 | 
 66 |   it('should return only one data column', function () {
 67 |     var selector = new Selector({
 68 |       id: 'id',
 69 |       type: 'SelectorElementAttribute',
 70 |       multiple: true,
 71 |       selector: 'img'
 72 |     }, { $, document, window })
 73 | 
 74 |     var columns = selector.getDataColumns()
 75 |     assert.deepEqual(columns, [ 'id' ])
 76 |   })
 77 | 
 78 |   it('should return empty array when no images are found', function (done) {
 79 |     var selector = new Selector({
 80 |       id: 'img',
 81 |       type: 'SelectorElementAttribute',
 82 |       multiple: true,
 83 |       selector: 'img.not-exist',
 84 |       extractAttribute: 'src'
 85 |     }, { $, document, window })
 86 | 
 87 |     var dataDeferred = selector.getData(document.querySelector('#not-exist'))
 88 | 
 89 |     dataDeferred.then(function (data) {
 90 |       assert.deepEqual(data, [])
 91 |       done()
 92 |     })
 93 |   })
 94 | 
 95 |   it('should be able to select data- attributes', function (done) {
 96 |     var html = '<ul><li data-type="dog"></li></ul>'
 97 |     utils.appendHTML($el, html, document)
 98 | 
 99 |     var selector = new Selector({
100 |       id: 'type',
101 |       type: 'SelectorElementAttribute',
102 |       multiple: true,
103 |       selector: 'li',
104 |       extractAttribute: 'data-type'
105 |     }, { $, document, window })
106 | 
107 |     var dataDeferred = selector.getData($el)
108 | 
109 |     dataDeferred.then(function (data) {
110 |       assert.deepEqual(data, [ {
111 |         'type': 'dog'
112 |       } ])
113 |       done()
114 |     })
115 |   })
116 | })
117 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorElementScrollSpec.js:
--------------------------------------------------------------------------------
  1 | var Selector = require('../../../extension/scripts/Selector')
  2 | const utils = require('./../../utils')
  3 | const assert = require('chai').assert
  4 | const globals = require('../../globals')
  5 | 
  6 | describe('Scroll Element Selector', function () {
  7 |   var $el
  8 |   let $
  9 | let document
 10 | let window
 11 |   beforeEach(function () {
 12 |     $ = globals.$
 13 | document = globals.document
 14 | window = globals.window
 15 | 
 16 |     document.body.innerHTML = utils.getTestHTML()
 17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
 18 |     document.body.appendChild($el)
 19 |   })
 20 | 
 21 |   it('should return one element', function (done) {
 22 |     $el.innerHTML = '<div>a</div><div>b</div>'
 23 |     var selector = new Selector({
 24 |       id: 'a',
 25 |       type: 'SelectorElementScroll',
 26 |       multiple: false,
 27 |       selector: 'div'
 28 |     }, {$, document, window})
 29 | 
 30 |     var dataDeferred = selector.getData($el)
 31 |     dataDeferred.then(function (data) {
 32 |       assert.equal(data.length, 1)
 33 |       assert.equal(data[0], $el.querySelectorAll('div')[0])
 34 |       done()
 35 |     })
 36 |   })
 37 | 
 38 |   it('should return multiple elements', function (done) {
 39 |     $el.innerHTML = '<div>a</div><div>b</div>'
 40 |     var selector = new Selector({
 41 |       id: 'a',
 42 |       type: 'SelectorElementScroll',
 43 |       multiple: true,
 44 |       selector: 'div'
 45 |     }, {$, document, window})
 46 | 
 47 |     var dataDeferred = selector.getData($el)
 48 |     dataDeferred.then(function (data) {
 49 |       assert.equal(data.length, 2)
 50 |       assert.deepEqual(data, Array.from($el.querySelectorAll('div')))
 51 |       done()
 52 |     })
 53 |   })
 54 | 
 55 |   it('should get elements when scrolling is not needed', function (done) {
 56 |     $el.innerHTML = '<a>a</a>'
 57 |     var selector = new Selector({
 58 |       id: 'a',
 59 |       type: 'SelectorElementScroll',
 60 |       multiple: true,
 61 |       selector: 'a',
 62 |       delay: 100
 63 |     }, {$, document, window})
 64 |     var dataDeferred = selector.getData($el)
 65 |     dataDeferred.then(function (data) {
 66 |       assert.equal(data.length, 1)
 67 |       assert.equal(data[0], $el.querySelectorAll('a')[0])
 68 |       done()
 69 |     })
 70 |   })
 71 | 
 72 |   it('should get elements which are added a delay', function (done) {
 73 |     $el.innerHTML = '<a>a</a>'
 74 |     // add extra element after a little delay
 75 |     setTimeout(function () {
 76 |       utils.appendHTML($el, '<a>a</a>', document)
 77 |     }, 100)
 78 | 
 79 |     var selector = new Selector({
 80 |       id: 'a',
 81 |       type: 'SelectorElementScroll',
 82 |       multiple: true,
 83 |       selector: 'a',
 84 |       delay: 200
 85 |     }, {$, document, window})
 86 |     var dataDeferred = selector.getData($el)
 87 |     dataDeferred.then(function (data) {
 88 |       assert.equal(data.length, 2)
 89 |       assert.deepEqual(data, Array.from($el.querySelectorAll('a')))
 90 |       done()
 91 |     })
 92 |   })
 93 |   it('should return no data columns', function () {
 94 |     var selector = new Selector({
 95 |       id: 'a',
 96 |       type: 'SelectorElementScroll',
 97 |       multiple: true,
 98 |       selector: 'div'
 99 |     }, {$, document, window})
100 | 
101 |     var columns = selector.getDataColumns()
102 |     assert.deepEqual(columns, [])
103 |   })
104 | })
105 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorElementSpec.js:
--------------------------------------------------------------------------------
 1 | var Selector = require('../../../extension/scripts/Selector')
 2 | const utils = require('./../../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../../globals')
 5 | 
 6 | describe('Element Selector', function () {
 7 |   let $
 8 | let document
 9 | let window
10 |   beforeEach(function () {
11 |     $ = globals.$
12 | document = globals.document
13 | window = globals.window
14 | 
15 |     document.body.innerHTML = utils.getTestHTML()
16 |   })
17 | 
18 |   it('should return one element', function (done) {
19 |     var selector = new Selector({
20 |       id: 'a',
21 |       type: 'SelectorElement',
22 |       multiple: false,
23 |       selector: 'div'
24 |     }, {$, document, window})
25 | 
26 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-element-nodata')[0])
27 |     dataDeferred.then(function (data) {
28 |       assert.equal(data.length, 1)
29 |       assert.equal(data[0], document.querySelectorAll('#selector-element-nodata div')[0])
30 |       done()
31 |     })
32 |   })
33 | 
34 |   it('should return multiple elements', function (done) {
35 |     var selector = new Selector({
36 |       id: 'a',
37 |       type: 'SelectorElement',
38 |       multiple: true,
39 |       selector: 'div'
40 |     }, {$, document, window})
41 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-element-nodata')[0])
42 |     dataDeferred.then(function (data) {
43 |       assert.equal(data.length, 2)
44 |       assert.deepEqual(data, Array.from(document.querySelectorAll('#selector-element-nodata div')))
45 |       done()
46 |     })
47 |   })
48 | 
49 |   it('should return no data columns', function () {
50 |     var selector = new Selector({
51 |       id: 'a',
52 |       type: 'SelectorElement',
53 |       multiple: true,
54 |       selector: 'div'
55 |     }, {$, document, window})
56 | 
57 |     var columns = selector.getDataColumns()
58 |     assert.deepEqual(columns, [])
59 |   })
60 | })
61 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorGoogMapIDSpec.js:
--------------------------------------------------------------------------------
 1 | const Selector = require('../../../extension/scripts/Selector')
 2 | const utils = require('./../../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../../globals')
 5 | describe('Goog Map ID Selector', function () {
 6 |   var $el
 7 |   let $
 8 |   let document
 9 |   let window
10 |   beforeEach(function () {
11 |     $ = globals.$
12 |     document = globals.document
13 |     window = globals.window
14 | 
15 |     document.body.innerHTML = utils.getTestHTML()
16 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
17 |     document.body.appendChild($el)
18 |   })
19 | 
20 |   it('FTID selector', function (done) {
21 |     $el.innerHTML = `
22 |         <iframe src='//20768463p.rfihub.com/ca.html?rb=303415&ca=202438463&_o=30656&_t=207878463&ra=REPLACE_ME_WITH_YOUR_CACHE_BUSTING' style='display:none;padding:0;margin:0' width='0' height='0'>
23 |         </iframe>
24 | 
25 |       <section class="map-stars">
26 |         <div class="overlay" onClick="style.pointerEvents='none'"></div>
27 |         <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2993.949554494766!2d2.161869551228708!3d41.37518070461241!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x12a4a2674531e3bd%3A0xf12f53af6888194e!2sAv.+del+Mada%C5%80lel%2C+110%2C+28025+Madrid!5e0!3m2!1ses!2ses!4v1483432429343" width="600" height="450" frameborder="0" style="border:0" allowfullscreen></iframe>
28 |     </section>
29 | 
30 |       <div></div>      
31 |     `
32 |     var selector = new Selector({
33 |       id: 'a',
34 |       type: 'SelectorGoogMapID',
35 |       selector: 'section',
36 |       mapsSelectorFromDiv: 'iframe[src*="google.com/maps/embed"]'
37 |     }, {$, document, window})
38 | 
39 |     var dataDeferred = selector.getData($el)
40 |     dataDeferred.then(function (data) {
41 |       assert.equal(data[0].a_FTID, '0x12a4a2674531e3bd:0xf12f53af6888194e')
42 |       done()
43 |     })
44 |   })
45 | })
46 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorGroupSpec.js:
--------------------------------------------------------------------------------
 1 | var Selector = require('../../../extension/scripts/Selector')
 2 | const utils = require('./../../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../../globals')
 5 | 
 6 | describe('Group Selector', function () {
 7 |   let $
 8 | let document
 9 | let window
10 |   beforeEach(function () {
11 |     $ = globals.$
12 | document = globals.document
13 | window = globals.window
14 |     document.body.innerHTML = utils.getTestHTML()
15 | 
16 |   })
17 | 
18 |   it('should extract text data', function (done) {
19 |     var selector = new Selector({
20 |       id: 'a',
21 |       type: 'SelectorGroup',
22 |       multiple: false,
23 |       selector: 'div'
24 |     }, {$, document, window})
25 | 
26 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-group-text')[0])
27 |     dataDeferred.then(function (data) {
28 |       assert.equal(data.length, 1)
29 |       var expected = [
30 |         {
31 |           a: [
32 |             {
33 |               a: 'a'
34 |             },
35 |             {
36 |               a: 'b'
37 |             }
38 |           ]
39 |         }
40 |       ]
41 |       assert.deepEqual(data, expected)
42 |       done()
43 |     })
44 |   })
45 | 
46 |   it('should extract link urls', function (done) {
47 |     var selector = new Selector({
48 |       id: 'a',
49 |       type: 'SelectorGroup',
50 |       multiple: false,
51 |       selector: 'a',
52 |       extractAttribute: 'href'
53 |     }, {$, document, window})
54 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-group-url')[0])
55 |     dataDeferred.then(function (data) {
56 |       assert.equal(data.length, 1)
57 |       var expected = [
58 |         {
59 |           a: [
60 |             {
61 |               a: 'a',
62 |               'a-href': 'http://aa/'
63 |             },
64 |             {
65 |               a: 'b',
66 |               'a-href': 'http://bb/'
67 |             }
68 |           ]
69 |         }
70 |       ]
71 |       assert.deepEqual(data, expected)
72 |       done()
73 |     })
74 |   })
75 | 
76 |   it('should return only one data column', function () {
77 |     var selector = new Selector({
78 |       id: 'id',
79 |       type: 'SelectorGroup',
80 |       multiple: true,
81 |       selector: 'div'
82 |     }, {$, document, window})
83 | 
84 |     var columns = selector.getDataColumns()
85 |     assert.deepEqual(columns, ['id'])
86 |   })
87 | })
88 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorHTMLSpec.js:
--------------------------------------------------------------------------------
  1 | var Selector = require('../../../extension/scripts/Selector')
  2 | const utils = require('./../../utils')
  3 | const assert = require('chai').assert
  4 | const globals = require('../../globals')
  5 | 
  6 | describe('HTML Selector', function () {
  7 |   let $
  8 | let document
  9 | let window
 10 |   beforeEach(function () {
 11 |     $ = globals.$
 12 | document = globals.document
 13 | window = globals.window
 14 | 
 15 |     document.body.innerHTML = utils.getTestHTML()
 16 |   })
 17 | 
 18 |   it('should extract single html element', function (done) {
 19 |     var selector = new Selector({
 20 |       id: 'a',
 21 |       type: 'SelectorHTML',
 22 |       multiple: false,
 23 |       selector: 'div'
 24 |     }, {$, document, window})
 25 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0])
 26 |     dataDeferred.then(function (data) {
 27 |       assert.equal(data.length, 1)
 28 |       var expected = [
 29 |         {
 30 |           a: 'aaa<b>bbb</b>ccc'
 31 |         }
 32 |       ]
 33 |       assert.deepEqual(data, expected)
 34 |       done()
 35 |     })
 36 |   })
 37 | 
 38 |   it('should extract multiple html elements', function (done) {
 39 |     var selector = new Selector({
 40 |       id: 'a',
 41 |       type: 'SelectorHTML',
 42 |       multiple: true,
 43 |       selector: 'div'
 44 |     }, {$, document, window})
 45 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-multiple-html')[0])
 46 |     dataDeferred.then(function (data) {
 47 |       assert.equal(data.length, 2)
 48 |       var expected = [
 49 |         {
 50 |           a: 'aaa<b>bbb</b>ccc'
 51 |         },
 52 |         {
 53 |           a: 'ddd<b>eee</b>fff'
 54 |         }
 55 |       ]
 56 |       assert.deepEqual(data, expected)
 57 |       done()
 58 |     })
 59 |   })
 60 | 
 61 |   it('should extract null when there are no elements', function (done) {
 62 |     var selector = new Selector({
 63 |       id: 'a',
 64 |       type: 'SelectorHTML',
 65 |       multiple: false,
 66 |       selector: 'div'
 67 |     }, {$, document, window})
 68 |     console.log(document.querySelectorAll('#selector-html-single-not-exist'))
 69 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-not-exist')[0])
 70 |     dataDeferred.then(function (data) {
 71 |       assert.equal(data.length, 1)
 72 |       var expected = [
 73 |         {
 74 |           a: null
 75 |         }
 76 |       ]
 77 |       assert.deepEqual(data, expected)
 78 |       done()
 79 |     })
 80 |   })
 81 | 
 82 |   it('should extract null when there is no regex match', function (done) {
 83 |     var selector = new Selector({
 84 |       id: 'a',
 85 |       type: 'SelectorHTML',
 86 |       multiple: false,
 87 |       selector: 'div',
 88 |       regex: 'wontmatch'
 89 |     }, {$, document, window})
 90 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0])
 91 |     dataDeferred.then(function (data) {
 92 |       assert.equal(data.length, 1)
 93 |       var expected = [
 94 |         {
 95 |           a: null
 96 |         }
 97 |       ]
 98 |       assert.deepEqual(data, expected)
 99 |       done()
100 |     })
101 |   })
102 | 
103 |   it('should extract html+text using regex', function (done) {
104 |     var selector = new Selector({
105 |       id: 'a',
106 |       type: 'SelectorHTML',
107 |       multiple: false,
108 |       selector: 'div',
109 |       regex: '<b>\\w+'
110 |     }, {$, document, window})
111 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-html-single-html')[0])
112 |     dataDeferred.then(function (data) {
113 |       assert.equal(data.length, 1)
114 |       var expected = [
115 |         {
116 |           a: '<b>bbb'
117 |         }
118 |       ]
119 |       assert.deepEqual(data, expected)
120 |       done()
121 |     })
122 |   })
123 | 
124 |   it('should return only one data column', function () {
125 |     var selector = new Selector({
126 |       id: 'id',
127 |       type: 'SelectorHTML',
128 |       multiple: true,
129 |       selector: 'div'
130 |     }, {$, document, window})
131 | 
132 |     var columns = selector.getDataColumns()
133 |     assert.deepEqual(columns, ['id'])
134 |   })
135 | })
136 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorImageSpec.js:
--------------------------------------------------------------------------------
  1 | const Selector = require('../../../extension/scripts/Selector')
  2 | const SelectorImage = require('../../../extension/scripts/Selector/SelectorImage')
  3 | const utils = require('./../../utils')
  4 | const assert = require('chai').assert
  5 | const globals = require('../../globals')
  6 | 
  7 | describe('Image Selector', function () {
  8 |   let $
  9 | let document
 10 | let window
 11 |   var $el
 12 |   beforeEach(function () {
 13 |     $ = globals.$
 14 |     document = globals.document
 15 |     window = globals.window
 16 | 
 17 |     document.body.innerHTML = utils.getTestHTML()
 18 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
 19 |     document.body.appendChild($el)
 20 |   })
 21 | 
 22 |   it('should extract single image', function (done) {
 23 |     var selector = new Selector({
 24 |       id: 'img',
 25 |       type: 'SelectorImage',
 26 |       multiple: false,
 27 |       selector: 'img'
 28 |     }, {$, document, window})
 29 | 
 30 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-image-one-image')[0])
 31 |     dataDeferred.then(function (data) {
 32 |       assert.equal(data.length, 1)
 33 |       var expected = [
 34 |         {
 35 |           'img-src': 'http://aa/'
 36 |         }
 37 |       ]
 38 |       assert.deepEqual(data, expected)
 39 |       done()
 40 |     })
 41 |   })
 42 | 
 43 |   it('should extract multiple images', function (done) {
 44 |     var selector = new Selector({
 45 |       id: 'img',
 46 |       type: 'SelectorImage',
 47 |       multiple: true,
 48 |       selector: 'img'
 49 |     }, {$, document, window})
 50 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-image-multiple-images')[0])
 51 |     dataDeferred.then(function (data) {
 52 |       assert.equal(data.length, 2)
 53 |       var expected = [
 54 |         {
 55 |           'img-src': 'http://aa/'
 56 |         },
 57 |         {
 58 |           'img-src': 'http://bb/'
 59 |         }
 60 |       ]
 61 |       assert.deepEqual(data, expected)
 62 |       done()
 63 |     })
 64 |   })
 65 | 
 66 |   it('should return only src column', function () {
 67 |     var selector = new Selector({
 68 |       id: 'id',
 69 |       type: 'SelectorImage',
 70 |       multiple: true,
 71 |       selector: 'img'
 72 |     }, {$, document, window})
 73 | 
 74 |     var columns = selector.getDataColumns()
 75 |     assert.deepEqual(columns, ['id-src'])
 76 |   })
 77 | 
 78 |   it('should return empty array when no images are found', function (done) {
 79 |     var selector = new Selector({
 80 |       id: 'img',
 81 |       type: 'SelectorImage',
 82 |       multiple: true,
 83 |       selector: 'img.not-exist'
 84 |     }, {$, document, window})
 85 |     var dataDeferred = selector.getData(document.querySelectorAll('#not-exist')[0])
 86 |     dataDeferred.then(function (data) {
 87 |       assert.equal(data.length, 0)
 88 |       var expected = []
 89 |       assert.deepEqual(data, expected)
 90 |       done()
 91 |     })
 92 |   })
 93 | 
 94 |   // base is not a real url so it does not work from jsdom.
 95 |   it.skip('should be able to download image as base64', function (done) {
 96 |     var selector = new Selector({
 97 |       id: 'img',
 98 |       type: 'SelectorImage'
 99 |     }, {$, document, window})
100 |     var deferredImage = selector.downloadImageBase64('base/docs/images/chrome-store-logo.png')
101 | 
102 |     deferredImage.then(function (imageResponse) {
103 |       assert.isTrue(imageResponse.imageBase64.length > 100)
104 |       done()
105 |     })
106 |   })
107 | 
108 |   it.skip('should be able to get data with image data attached', function (done) {
109 |     $el.innerHTML = '<img src="base/docs/images/chrome-store-logo.png">'
110 | 
111 |     var selector = new Selector({
112 |       id: 'img',
113 |       type: 'SelectorImage',
114 |       multiple: true,
115 |       selector: 'img',
116 |       downloadImage: true
117 |     }, {$, document, window})
118 | 
119 |     var dataDeferred = selector.getData($el)
120 |     dataDeferred.then(function (data) {
121 |       assert.equal(data.length, 1)
122 |       assert.isTrue(!!data[0]['_imageBase64-img'])
123 |       assert.isTrue(!!data[0]['_imageMimeType-img'])
124 |       done()
125 |     })
126 |   })
127 | })
128 | 


--------------------------------------------------------------------------------
/tests/spec/Selector/SelectorLinkSpec.js:
--------------------------------------------------------------------------------
 1 | var Selector = require('../../../extension/scripts/Selector')
 2 | const utils = require('./../../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../../globals')
 5 | 
 6 | describe('Link Selector', function () {
 7 |   var $el
 8 |   let $
 9 | let document
10 | let window
11 |   beforeEach(function () {
12 |     $ = globals.$
13 | document = globals.document
14 | window = globals.window
15 | 
16 |     document.body.innerHTML = utils.getTestHTML()
17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
18 |     document.body.appendChild($el)
19 |   })
20 | 
21 |   it('should extract single link', function (done) {
22 |     var selector = new Selector({
23 |       id: 'a',
24 |       type: 'SelectorLink',
25 |       multiple: false,
26 |       selector: 'a'
27 |     }, {$, document, window})
28 | 
29 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-follow')[0])
30 |     dataDeferred.then(function (data) {
31 |       var expected = [
32 |         {
33 |           a: 'a',
34 |           'a-href': 'http://example.com/a',
35 |           _follow: 'http://example.com/a',
36 |           _followSelectorId: 'a'
37 |         }
38 |       ]
39 |       assert.deepEqual(data, expected)
40 |       done()
41 |     })
42 |   })
43 | 
44 |   it('should extract multiple links', function (done) {
45 |     var selector = new Selector({
46 |       id: 'a',
47 |       type: 'SelectorLink',
48 |       multiple: true,
49 |       selector: 'a'
50 |     }, {$, document, window})
51 |     var dataDeferred = selector.getData(document.querySelectorAll('#selector-follow')[0])
52 |     dataDeferred.then(function (data) {
53 |       var expected = [
54 |         {
55 |           a: 'a',
56 |           'a-href': 'http://example.com/a',
57 |           _follow: 'http://example.com/a',
58 |           _followSelectorId: 'a'
59 |         },
60 |         {
61 |           a: 'b',
62 |           'a-href': 'http://example.com/b',
63 |           _follow: 'http://example.com/b',
64 |           _followSelectorId: 'a'
65 |         }
66 |       ]
67 |       assert.deepEqual(data, expected)
68 |       done()
69 |     })
70 |   })
71 | 
72 |   it('should return data and url columns', function () {
73 |     var selector = new Selector({
74 |       id: 'id',
75 |       type: 'SelectorLink',
76 |       multiple: true,
77 |       selector: 'div'
78 |     }, {$, document, window})
79 | 
80 |     var columns = selector.getDataColumns()
81 |     assert.deepEqual(columns, ['id', 'id-href'])
82 |   })
83 | 
84 |   it('should return empty array when no links are found', function (done) {
85 |     var selector = new Selector({
86 |       id: 'a',
87 |       type: 'SelectorLink',
88 |       multiple: true,
89 |       selector: 'a'
90 |     }, {$, document, window})
91 |     var dataDeferred = selector.getData(document.querySelectorAll('#not-exist')[0])
92 |     dataDeferred.then(function (data) {
93 |       var expected = []
94 |       assert.deepEqual(data, expected)
95 |       done()
96 |     })
97 |   })
98 | })
99 | 


--------------------------------------------------------------------------------
/tests/spec/SelectorSpec.js:
--------------------------------------------------------------------------------
 1 | const Selector = require('./../../extension/scripts/Selector')
 2 | const utils = require('./../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../globals')
 5 | describe('Selector', function () {
 6 |   var $el
 7 |   let $
 8 | let document
 9 | let window
10 | 
11 |   beforeEach(function () {
12 |     $ = globals.$
13 | document = globals.document
14 | window = globals.window
15 | 
16 |     document.body.innerHTML = utils.getTestHTML()
17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'>aaaaaaaaaaaa</div>", document)
18 |     document.body.appendChild($el)
19 |   })
20 | 
21 |   it('should be able to select elements', function () {
22 |     $el.innerHTML = '<a></a>'
23 |     var selector = new Selector({
24 |       selector: 'a',
25 |       type: 'SelectorLink'
26 |     }, {$, document, window})
27 |     var elements = selector.getDataElements($el)
28 | 
29 |     assert.deepEqual(elements, Object.values($el.querySelectorAll('a')))
30 |   })
31 | 
32 |   it('should be able to select parent', function () {
33 |     $el.innerHTML = '<a></a>'
34 |     var selector = new Selector({
35 |       selector: '_parent_',
36 |       type: 'SelectorLink'
37 |     }, {$, document, window})
38 |     var elements = selector.getDataElements($el)
39 | 
40 |     assert.deepEqual(elements, [$el])
41 |   })
42 | 
43 |   it('should be able to select elements with delay', function () {
44 |     var selector = new Selector({
45 |       id: 'a',
46 |       selector: 'a',
47 |       type: 'SelectorText',
48 |       delay: 100
49 |     }, {$, document, window})
50 |     var dataDeferred = selector.getData($el)
51 | 
52 | 		// add data after data extraction called
53 |     $el.innerHTML = '<a>a</a>'
54 | 
55 |     return dataDeferred.then(function (data) {
56 |       assert.deepEqual(data, [
57 |         {
58 |           'a': 'a'
59 |         }
60 |       ])
61 |     })
62 |   })
63 | })
64 | 


--------------------------------------------------------------------------------
/tests/spec/UniqueElementListSpec.js:
--------------------------------------------------------------------------------
 1 | const UniqueElementList = require('../../extension/scripts/UniqueElementList')
 2 | const utils = require('./../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../globals')
 5 | describe('UniqueElementList', function () {
 6 |   var $el
 7 |   let $
 8 | let document
 9 | let window
10 | 
11 |   beforeEach(function () {
12 |     $ = globals.$
13 | document = globals.document
14 | window = globals.window
15 | 
16 |     document.body.innerHTML = utils.getTestHTML()
17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
18 |     document.body.appendChild($el)
19 |   })
20 | 
21 |   it('it should add only unique elements', function () {
22 |     $el.innerHTML = '<a>1</a><a>2</a>'
23 | 
24 |     var list = new UniqueElementList('uniqueText', {$, document, window})
25 |     assert.equal(list.length, 0)
26 | 
27 |     var $a = $el.querySelectorAll('a')
28 |     list.push($a[0])
29 |     assert.equal(list.length, 1)
30 |     list.push($a[0])
31 |     assert.equal(list.length, 1)
32 |     list.push($a[1])
33 |     assert.equal(list.length, 2)
34 |     list.push($a[1])
35 |     assert.equal(list.length, 2)
36 |   })
37 | 
38 |   it('it should add only unique elements when using uniqueHTMLText type', function () {
39 |     $el.innerHTML = "<a id='1'>a</a><a id='2'>a</a>"
40 | 
41 |     var list = new UniqueElementList('uniqueHTMLText', {$, document, window})
42 |     assert.equal(list.length, 0)
43 | 
44 |     var $a = $el.querySelectorAll('a')
45 |     list.push($a[0])
46 |     assert.equal(list.length, 1)
47 |     list.push($a[0])
48 |     assert.equal(list.length, 1)
49 |     list.push($a[1])
50 |     assert.equal(list.length, 2)
51 |     list.push($a[1])
52 |     assert.equal(list.length, 2)
53 |   })
54 | 
55 |   it('it should add only unique elements when using uniqueHTML type', function () {
56 |     $el.innerHTML = "<a class='1'>a<span>a</span></a><a class='2'>a<span>b</span></a><a class='1'>c<span>c</span></a>"
57 | 
58 |     var list = new UniqueElementList('uniqueHTML', {$, document, window})
59 |     assert.equal(list.length, 0)
60 | 
61 |     var $a = $el.querySelectorAll('a')
62 |     list.push($a[0])
63 |     assert.equal(list.length, 1)
64 |     list.push($a[0])
65 |     assert.equal(list.length, 1)
66 |     list.push($a[1])
67 |     assert.equal(list.length, 2)
68 |     list.push($a[1])
69 |     assert.equal(list.length, 2)
70 |     list.push($a[2])
71 |     assert.equal(list.length, 2)
72 |   })
73 | 
74 |   it('it should add only unique elements when using uniqueCSSSelector type', function () {
75 |     $el.innerHTML = '<a></a><a></a>'
76 | 
77 |     var list = new UniqueElementList('uniqueCSSSelector', {$, document, window})
78 |     assert.equal(list.length, 0)
79 | 
80 |     var $a = $el.querySelectorAll('a')
81 |     list.push($a[0])
82 |     assert.equal(list.length, 1)
83 |     list.push($a[0])
84 |     assert.equal(list.length, 1)
85 |     list.push($a[1])
86 |     assert.equal(list.length, 2)
87 |     list.push($a[1])
88 |     assert.equal(list.length, 2)
89 |   })
90 | })
91 | 


--------------------------------------------------------------------------------
/tests/spec/browser/BackgroundScriptSpec.js:
--------------------------------------------------------------------------------
 1 | const getBackgroundScript = require('../../../extension/scripts/getBackgroundScript')
 2 | const getContentScript = require('../../../extension/scripts/getContentScript')
 3 | const selectorMatchers = require('../../Matchers')
 4 | const utils = require('../../utils')
 5 | 
 6 | describe('BackgroundScript', function () {
 7 |   var backgroundScript = getBackgroundScript('BackgroundScript')
 8 |   var $el
 9 | 
10 |   beforeEach(function () {
11 |     document.body.innerHTML = utils.getTestHTML()
12 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
13 |     document.body.appendChild($el)
14 |   })
15 | 
16 |   it('should be able to call BackgroundScript functions from background script', async function () {
17 |     var deferredResponse = backgroundScript.dummy()
18 |     await selectorMatchers.deferredToEqual(deferredResponse, 'dummy')
19 |     await selectorMatchers.deferredToEqual(deferredResponse, 'dummy')
20 |   })
21 | 
22 |   it('should be able to call BackgroundScript from Devtools', async function () {
23 |     var backgroundScript = getBackgroundScript('DevTools')
24 |     var deferredResponse = backgroundScript.dummy()
25 |     await selectorMatchers.deferredToEqual(deferredResponse, 'dummy')
26 |   })
27 | })
28 | 


--------------------------------------------------------------------------------
/tests/spec/browser/ChromePopupBrowserSpec.js:
--------------------------------------------------------------------------------
 1 | const ChromePopupBrowser = require('../../../extension/scripts/ChromePopupBrowser')
 2 | const Sitemap = require('../../../extension/scripts/Sitemap')
 3 | const assert = require('chai').assert
 4 | const utils = require('../../utils')
 5 | const globals = require('../../globals')
 6 | describe('Chrome popup browser', function () {
 7 |   let $
 8 |   let document
 9 |   let window
10 |   beforeEach(function () {
11 |     $ = globals.$
12 |     document = globals.document
13 |     window = globals.window
14 | 
15 |     window.chromeAPI.reset()
16 |     document.body.innerHTML = utils.getTestHTML()
17 |   })
18 | 
19 |   it('should init a popup window', function () {
20 |     var browser = new ChromePopupBrowser({
21 |       pageLoadDelay: 500
22 |     })
23 |     browser._initPopupWindow(function () {
24 |     })
25 |     assert.deepEqual(browser.tab, {id: 0})
26 |   })
27 | 
28 |   it('should load a page', function (done) {
29 |     var browser = new ChromePopupBrowser({
30 |       pageLoadDelay: 500
31 |     })
32 |     browser._initPopupWindow(function () {
33 |     })
34 |     browser.loadUrl('http://example,com/', function () {
35 |       done()
36 |     })
37 |   })
38 | 
39 |   it('should sendMessage to popup contentscript when data extraction is needed', function (done) {
40 |     var sitemap = new Sitemap({
41 |       selectors: [
42 |         {
43 |           id: 'a',
44 |           selector: '#browserTest',
45 |           type: 'SelectorText',
46 |           multiple: false,
47 |           parentSelectors: ['_root']
48 |         }
49 |       ]
50 |     }, {$, document, window})
51 | 
52 |     var browser = new ChromePopupBrowser({
53 |       pageLoadDelay: 500
54 |     })
55 |     browser._initPopupWindow(function () {
56 |     })
57 |     browser.fetchData('http://example,com/', sitemap, '_root', function (err, data) {
58 |       assert.isNull(err)
59 |       assert.deepEqual(data, [
60 |         {
61 |           'a': 'a'
62 |         }
63 |       ])
64 |       done()
65 |     })
66 |   })
67 | })
68 | 


--------------------------------------------------------------------------------
/tests/spec/browser/ScraperSpec.js:
--------------------------------------------------------------------------------
 1 | const Queue = require('./../../../extension/scripts/Queue')
 2 | const assert = require('chai').assert
 3 | 
 4 | const ChromePopupBrowser = require('./../../../extension/scripts/ChromePopupBrowser')
 5 | const Sitemap = require('./../../../extension/scripts/Sitemap')
 6 | const FakeStore = require('./../../FakeStore')
 7 | const Scraper = require('./../../../extension/scripts/Scraper')
 8 | const utils = require('./../../utils')
 9 | const globals = require('../../globals')
10 | 
11 | describe('Scraper', function () {
12 |   var q, store, $el
13 |   let $
14 |   let document
15 |   let window
16 |   let Browser
17 | 
18 |   beforeEach(function () {
19 |     $ = globals.$
20 |     document = globals.document
21 |     window = globals.window
22 |     Browser = globals.Browser
23 | 
24 |     q = new Queue()
25 |     store = new FakeStore()
26 |     document.body.innerHTML = utils.getTestHTML()
27 |   })
28 |   afterEach(function () {
29 |     while (document.body.firstChild) document.body.removeChild(document.body.firstChild)
30 |   })
31 | 
32 |   it('should store images', function (done) {
33 |     var record = {
34 |       '_imageBase64-test': 'test',
35 |       '_imageMimeType-test': 'test',
36 |       'test-src': 'http://images/image.png'
37 |     }
38 | 
39 |     var browser = new Browser({
40 |       pageLoadDelay: 500
41 |     })
42 | 
43 |     var sitemap = new Sitemap({
44 |       id: 'test'
45 |     }, {$, document, window})
46 | 
47 |     var scraper = new Scraper({
48 |       sitemap: sitemap,
49 |       browser: browser
50 |     }, {$, document, window})
51 | 
52 |     var deferredSave = scraper.saveImages(record)
53 |     var downloadAPICalled = false
54 |     chrome.downloads.onChanged.addListener(function () {
55 |       downloadAPICalled = true
56 |     })
57 |     assert.equal(downloadAPICalled, false)
58 | 
59 |     deferredSave.then(function () {
60 |       assert.equal(record['_imageBase64-test'], undefined)
61 |       assert.equal(record['_imageMimeType-test'], undefined)
62 |       assert.equal(downloadAPICalled, true)
63 |       done()
64 |     })
65 |       .then(null, function (e) {
66 |         done(e)
67 |       })
68 |   })
69 | })
70 | 


--------------------------------------------------------------------------------
/tests/spec/browser/Selector/SelectorImageSpec.js:
--------------------------------------------------------------------------------
 1 | const Selector = require('../../../../extension/scripts/Selector')
 2 | const utils = require('./../../../utils')
 3 | const assert = require('chai').assert
 4 | const globals = require('../../../globals')
 5 | 
 6 | describe('Image Selector', function () {
 7 |   let $
 8 |   let document
 9 |   let window
10 |   var $el
11 |   beforeEach(function () {
12 |     $ = globals.$
13 |     document = globals.document
14 |     window = globals.window
15 | 
16 |     document.body.innerHTML = utils.getTestHTML()
17 |     $el = utils.createElementFromHTML("<div id='tests' style='display:none'></div>", document)
18 |     document.body.appendChild($el)
19 |   })
20 | 
21 |   it('should be able to download image as base64', function (done) {
22 |     var selector = new Selector({
23 |       id: 'img',
24 |       type: 'SelectorImage'
25 |     }, {$, document, window})
26 |     var deferredImage = selector.downloadImageBase64('base/docs/images/chrome-store-logo.png')
27 | 
28 |     deferredImage.then(function (imageResponse) {
29 |       assert.isTrue(imageResponse.imageBase64.length > 100)
30 |       done()
31 |     })
32 |   })
33 | 
34 |   it('should be able to get data with image data attached', function (done) {
35 |     $el.innerHTML = '<img src="base/docs/images/chrome-store-logo.png">'
36 | 
37 |     var selector = new Selector({
38 |       id: 'img',
39 |       type: 'SelectorImage',
40 |       multiple: true,
41 |       selector: 'img',
42 |       downloadImage: true
43 |     }, {$, document, window})
44 | 
45 |     var dataDeferred = selector.getData($el)
46 |     dataDeferred.then(function (data) {
47 |       assert.equal(data.length, 1)
48 |       assert.isTrue(!!data[0]['_imageBase64-img'])
49 |       assert.isTrue(!!data[0]['_imageMimeType-img'])
50 |       done()
51 |     })
52 |   })
53 | })
54 | 


--------------------------------------------------------------------------------
/tests/spec/headless/browserSpec.js:
--------------------------------------------------------------------------------
  1 | const ChromeHeadlessBrowser = require('./../../../extension/scripts/ChromeHeadlessBrowser')
  2 | const sinon = require('sinon')
  3 | const assert = require('chai').assert
  4 | const utils = require('./../../utils')
  5 | const Queue = require('./../../../extension/scripts/Queue')
  6 | const Sitemap = require('./../../../extension/scripts/Sitemap')
  7 | const FakeStore = require('./../../FakeStore')
  8 | const Scraper = require('./../../../extension/scripts/Scraper')
  9 | 
 10 | describe('Headless browser', function () {
 11 |   let sandbox
 12 |   beforeEach('Create sandbox', function () {
 13 |     sandbox = sinon.createSandbox()
 14 |   })
 15 |   afterEach('Release sandbox', function () {
 16 |     if (sandbox) sandbox.restore()
 17 |   })
 18 |   it('Scrape', function (done) {
 19 |     sandbox.stub(ChromeHeadlessBrowser.prototype, 'loadUrl').callsFake(async function () {
 20 |       const page = await this.pagePromise
 21 |       const html = utils.getTestHTML()
 22 |       await page.setContent(html)
 23 |     })
 24 | 
 25 |     const fake$ = {}
 26 |     const fakeDocument = {}
 27 |     const fakeWindow = {}
 28 |     const q = new Queue()
 29 |     const store = new FakeStore()
 30 | 
 31 |     const sitemap = new Sitemap({
 32 |       id: 'test',
 33 |       startUrl: 'http://test.lv/',
 34 |       selectors: [
 35 |         {
 36 |           'id': 'link',
 37 |           'selector': '#scraper-test-child-page a',
 38 |           'multiple': true,
 39 |           type: 'SelectorLink',
 40 |           'parentSelectors': ['_root']
 41 |         },
 42 |         {
 43 |           'id': 'b',
 44 |           'selector': '#scraper-test-child-page b',
 45 |           'multiple': false,
 46 |           type: 'SelectorText',
 47 |           'parentSelectors': ['link']
 48 |         }
 49 |       ]
 50 |     }, {$: fake$, document: fakeDocument, window: fakeWindow})
 51 | 
 52 |     var browser = new ChromeHeadlessBrowser({
 53 |       pageLoadDelay: 10
 54 |     })
 55 | 
 56 |     var s = new Scraper({
 57 |       queue: q,
 58 |       sitemap: sitemap,
 59 |       browser: browser,
 60 |       store: store,
 61 |       delay: 0
 62 |     }, {$: fake$, document: fakeDocument, window: fakeWindow})
 63 | 
 64 |     s.run(function () {
 65 |       assert.deepEqual(store.data, [
 66 |         {'link': 'test', 'link-href': 'http://test.lv/1/', 'b': 'b'}
 67 |       ])
 68 |       done()
 69 |     })
 70 |   })
 71 | 
 72 |   it('Scraping is done in a different context', function (done) {
 73 |     sandbox.stub(ChromeHeadlessBrowser.prototype, 'loadUrl').callsFake(async function () {
 74 |       const page = await this.pagePromise
 75 |       const html = utils.getTestHTML()
 76 |       await page.setContent(html)
 77 |       await page.evaluate(function () {
 78 |         const blockedProperties = ['jquery', '$', 'jQuery']
 79 |         try {
 80 |           for (const property of blockedProperties) {
 81 |             Object.defineProperty(window, property, {
 82 |               get () {
 83 |                 throw new Error('Wrong property: ' + property)
 84 |               },
 85 |               set () {
 86 |                 throw new Error('Cannot set: ' + property)
 87 |               }
 88 |             })
 89 |           }
 90 |         } catch (e) {
 91 |           // This is executed once per visited page, so it can give problems
 92 |         }
 93 |       })
 94 |     })
 95 | 
 96 |     const fake$ = {}
 97 |     const fakeDocument = {}
 98 |     const fakeWindow = {}
 99 |     const q = new Queue()
100 |     const store = new FakeStore()
101 | 
102 |     const sitemap = new Sitemap({
103 |       id: 'test',
104 |       startUrl: 'http://test.lv/',
105 |       selectors: [
106 |         {
107 |           'id': 'link',
108 |           'selector': '#scraper-test-child-page a',
109 |           'multiple': true,
110 |           type: 'SelectorLink',
111 |           'parentSelectors': ['_root']
112 |         },
113 |         {
114 |           'id': 'b',
115 |           'selector': '#scraper-test-child-page b',
116 |           'multiple': false,
117 |           type: 'SelectorText',
118 |           'parentSelectors': ['link']
119 |         }
120 |       ]
121 |     }, {$: fake$, document: fakeDocument, window: fakeWindow})
122 | 
123 |     var browser = new ChromeHeadlessBrowser({
124 |       pageLoadDelay: 10
125 |     })
126 | 
127 |     var s = new Scraper({
128 |       queue: q,
129 |       sitemap: sitemap,
130 |       browser: browser,
131 |       store: store,
132 |       delay: 0
133 |     }, {$: fake$, document: fakeDocument, window: fakeWindow})
134 | 
135 |     s.run(function () {
136 |       assert.deepEqual(store.data, [
137 |         {'link': 'test', 'link-href': 'http://test.lv/1/', 'b': 'b'}
138 |       ])
139 |       done()
140 |     })
141 |   })
142 | })
143 | 


--------------------------------------------------------------------------------
/tests/spec/jquery.whencallsequentiallySpec.js:
--------------------------------------------------------------------------------
  1 | var whenCallSequentially = require('../../extension/assets/jquery.whencallsequentially')
  2 | var jquery = require('jquery-deferred')
  3 | const assert = require('chai').assert
  4 | 
  5 | describe('jQuery When call sequentially', function () {
  6 |   var syncCall = function () {
  7 |     return jquery.Deferred().resolve('sync').promise()
  8 |   }
  9 | 
 10 |   var asyncCall = function () {
 11 |     var d = jquery.Deferred()
 12 |     setTimeout(function () {
 13 |       d.resolve('async')
 14 |     }, 0)
 15 |     return d.promise()
 16 |   }
 17 | 
 18 |   beforeEach(function () {
 19 |   })
 20 | 
 21 |   it('should return immediately empty array when no calls passed', function () {
 22 |     var deferred = whenCallSequentially([])
 23 |     assert.equal(deferred.state(), 'resolved')
 24 |     var data
 25 |     deferred.done(function (res) {
 26 |       data = res
 27 |     })
 28 |     assert.deepEqual(data, [])
 29 |   })
 30 | 
 31 |   it('should return immediately with data when synchronous call passed', function () {
 32 |     var deferred = whenCallSequentially([syncCall])
 33 |     assert.deepEqual(deferred.state(), 'resolved')
 34 |     var data
 35 |     deferred.done(function (res) {
 36 |       data = res
 37 |     })
 38 |     assert.deepEqual(data, ['sync'])
 39 |   })
 40 | 
 41 |   it('should return immediately with data when multiple synchronous call passed', function () {
 42 |     var deferred = whenCallSequentially([syncCall, syncCall, syncCall])
 43 |     assert.deepEqual(deferred.state(), 'resolved')
 44 |     var data
 45 |     deferred.done(function (res) {
 46 |       data = res
 47 |     })
 48 |     assert.deepEqual(data, ['sync', 'sync', 'sync'])
 49 |   })
 50 | 
 51 |   it('should execute one async job', function (done) {
 52 |     var deferred = whenCallSequentially([asyncCall])
 53 |     assert.deepEqual(deferred.state(), 'pending')
 54 | 
 55 |     deferred.then(function (data) {
 56 |       assert.deepEqual(data, ['async'])
 57 |       done()
 58 |     })
 59 |   })
 60 | 
 61 |   it('should execute multiple async jobs', function (done) {
 62 |     var deferred = whenCallSequentially([asyncCall, asyncCall, asyncCall])
 63 |     assert.deepEqual(deferred.state(), 'pending')
 64 | 
 65 |     deferred.then(function (res) {
 66 |       assert.deepEqual(res, ['async', 'async', 'async'])
 67 |       done()
 68 |     })
 69 |   })
 70 | 
 71 |   it('should execute multiple sync and async jobs', function () {
 72 |     var deferred = whenCallSequentially([syncCall, syncCall, asyncCall, asyncCall, syncCall, asyncCall])
 73 |     assert.deepEqual(deferred.state(), 'pending')
 74 | 
 75 |     deferred.done(function (data) {
 76 |       assert.deepEqual(data, ['sync', 'sync', 'async', 'async', 'sync', 'async'])
 77 |     })
 78 |   })
 79 | 
 80 |   it('should allow adding jobs to job array from an async job', function () {
 81 |     var jobs = []
 82 |     var asyncMoreCall = function () {
 83 |       var d = jquery.Deferred()
 84 |       setTimeout(function () {
 85 |         d.resolve('asyncmore')
 86 |         jobs.push(asyncCall)
 87 |       }, 0)
 88 |       return d.promise()
 89 |     }
 90 |     jobs.push(asyncMoreCall)
 91 | 
 92 |     var deferred = whenCallSequentially(jobs)
 93 |     assert.deepEqual(deferred.state(), 'pending')
 94 | 
 95 |     deferred.then(function (data) {
 96 |       assert.deepEqual(data, ['asyncmore', 'async'])
 97 |     })
 98 |   })
 99 | 
100 |   it('should allow adding jobs to job array from a sync job', function () {
101 |     var jobs = []
102 |     var syncMoreCall = function () {
103 |       var d = jquery.Deferred()
104 |       jobs.push(syncCall)
105 |       d.resolve('syncmore')
106 |       return d.promise()
107 |     }
108 |     jobs.push(syncMoreCall)
109 | 
110 |     var deferred = whenCallSequentially(jobs)
111 |     deferred.then(function (res) {
112 |       assert.deepEqual(res, ['syncmore', 'sync'])
113 |     })
114 |   })
115 | })
116 | 


--------------------------------------------------------------------------------
/tests/spec/jsdom/browserSpec.js:
--------------------------------------------------------------------------------
 1 | const Browser = require('./../../../extension/scripts/JSDOMBrowser')
 2 | 
 3 | it('Handle error in jsdom', function (done) {
 4 |   Browser.prototype.loadUrl = function (url, callback) {
 5 |     callback(new Error('Fake error'))
 6 |   }
 7 |   const jsdomBrowser = new Browser({})
 8 | 
 9 |   jsdomBrowser.fetchData('a', {}, {}, function (err) {
10 |     if (err) {
11 |       done()
12 |     } else {
13 |       done(new Error('It should have failed'))
14 |     }
15 |   })
16 | })
17 | 


--------------------------------------------------------------------------------