├── .babelrc
├── .eslintrc
├── .gitignore
├── .gitmodules
├── .npmignore
├── LICENSE
├── README.md
├── docs
├── CSS selector.md
├── Installation.md
├── Open Web Scraper.md
├── Scraping a site.md
├── Selectors.md
├── Selectors
│ ├── Element attribute selector.md
│ ├── Element click selector.md
│ ├── Element scroll down selector.md
│ ├── Element selector.md
│ ├── Grouped selector.md
│ ├── HTML selector.md
│ ├── Image selector.md
│ ├── Link popup selector.md
│ ├── Link selector.md
│ ├── Table selector.md
│ └── Text selector.md
├── Storage backends.md
└── images
│ ├── chrome-store-logo-920x680.png
│ ├── chrome-store-logo-920x680.xcf
│ ├── chrome-store-logo.png
│ ├── chrome-store-logo.xcf
│ ├── open-web-scraper
│ └── open-web-scraper.png
│ ├── scraping-a-site
│ ├── news-site-selector-graph.png
│ ├── news-site-sitemap.png
│ └── news-site.png
│ ├── selectors
│ ├── element-click
│ │ ├── click-more.png
│ │ └── click-once.png
│ ├── link
│ │ ├── multiple-level-link-selectors.png
│ │ ├── pagination-link-selectors.png
│ │ └── pagination-selector-graph.png
│ ├── table
│ │ ├── selectors.png
│ │ └── table.png
│ └── text
│ │ ├── text-selector-multiple-elements-with-text-selectors.png
│ │ ├── text-selector-multiple-per-page.png
│ │ └── text-selector-multiple-single-text-selectors-in-one-page.png
│ ├── sitemap-tree.png
│ └── store-logo-sources.txt
├── extension
├── assets
│ ├── ICanHaz.js
│ ├── LICENSE-d3-js
│ ├── LICENSE-icanhaz-js
│ ├── LICENSE-jquery-js
│ ├── LICENSE-pouchdb-js
│ ├── LICENSE-sugar-js
│ ├── base64.js
│ ├── bootstrap-3.0.0
│ │ ├── css
│ │ │ ├── bootstrap-theme.css
│ │ │ ├── bootstrap-theme.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── fonts
│ │ │ ├── glyphicons-halflings-regular.eot
│ │ │ ├── glyphicons-halflings-regular.svg
│ │ │ ├── glyphicons-halflings-regular.ttf
│ │ │ └── glyphicons-halflings-regular.woff
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3.v3.js
│ ├── d3.v3.min.js
│ ├── images
│ │ ├── LICENSE
│ │ ├── icon128.png
│ │ ├── icon16.png
│ │ ├── icon19.png
│ │ ├── icon38.png
│ │ └── icon48.png
│ ├── jquery-2.0.3.js
│ ├── jquery.bootstrapvalidator
│ │ ├── bootstrapValidator.css
│ │ └── bootstrapValidator.js
│ ├── jquery.whencallsequentially.js
│ ├── pouchdb-nightly.min.js
│ └── sugar-1.4.1.js
├── background_page
│ └── background_script.js
├── content_script
│ ├── contentScraperHeadlessBundler.js
│ ├── content_scraper.js
│ ├── content_scraper_browser.js
│ ├── content_script.css
│ └── content_script.js
├── devtools
│ ├── devtools_init_page.html
│ ├── devtools_init_page.js
│ ├── devtools_scraper_panel.css
│ ├── devtools_scraper_panel.html
│ └── views
│ │ ├── DataPreview.html
│ │ ├── SelectorEdit.html
│ │ ├── SelectorEditTableColumn.html
│ │ ├── SelectorList.html
│ │ ├── SelectorListItem.html
│ │ ├── SitemapBrowseData.html
│ │ ├── SitemapCreate.html
│ │ ├── SitemapEditMetadata.html
│ │ ├── SitemapExport.html
│ │ ├── SitemapExportDataCSV.html
│ │ ├── SitemapHeadlessScrapeConfig.html
│ │ ├── SitemapImport.html
│ │ ├── SitemapList.html
│ │ ├── SitemapListItem.html
│ │ ├── SitemapScrapeConfig.html
│ │ ├── SitemapSelectorGraph.html
│ │ ├── SitemapStartUrlField.html
│ │ └── Viewport.html
├── generated
│ └── .gitignore
├── manifest.json
├── options_page
│ ├── options.html
│ └── options_page.js
├── popup.html
└── scripts
│ ├── App.js
│ ├── BackgroundScript.js
│ ├── ChromeHeadlessBrowser.js
│ ├── ChromePopupBrowser.js
│ ├── Config.js
│ ├── ContentScript.js
│ ├── ContentSelector.js
│ ├── Controller.js
│ ├── DataExtractor.js
│ ├── ElementQuery.js
│ ├── InMemoryStore.js
│ ├── JSDOMBrowser.js
│ ├── JSDOMBrowserLoader.js
│ ├── Job.js
│ ├── Queue.js
│ ├── Scraper.js
│ ├── Selector.js
│ ├── Selector
│ ├── SelectorElement.js
│ ├── SelectorElementAttribute.js
│ ├── SelectorElementClick.js
│ ├── SelectorElementScroll.js
│ ├── SelectorGoogMapID.js
│ ├── SelectorGroup.js
│ ├── SelectorHTML.js
│ ├── SelectorImage.js
│ ├── SelectorLink.js
│ ├── SelectorPopupLink.js
│ ├── SelectorTable.js
│ └── SelectorText.js
│ ├── SelectorGraph.js
│ ├── SelectorGraphv2.js
│ ├── SelectorList.js
│ ├── Selectors.js
│ ├── Sitemap.js
│ ├── Store.js
│ ├── StoreDevtools.js
│ ├── UniqueElementList.js
│ ├── WebJSDOMBrowser.js
│ ├── getBackgroundScript.js
│ └── getContentScript.js
├── gulpfile.js
├── index.js
├── karma.conf.js
├── package.json
├── playgrounds
├── extension
│ ├── index.html
│ └── webpage.css
└── sitemap-tree
│ ├── index.html
│ ├── sitemap.json
│ └── style.css
└── tests
├── ChromeAPI.js
├── FakeStore.js
├── Matchers.js
├── browserSpec.js
├── globals.js
├── jsdomSpec.js
├── spec
├── ContentSelectorSpec.js
├── DataExtractSpec.js
├── ElementQuerySpec.js
├── JobSpec.js
├── QueueSpec.js
├── ScraperSpec.js
├── Selector
│ ├── SelectorElementAttributeSpec.js
│ ├── SelectorElementClickSpec.js
│ ├── SelectorElementScrollSpec.js
│ ├── SelectorElementSpec.js
│ ├── SelectorGoogMapIDSpec.js
│ ├── SelectorGroupSpec.js
│ ├── SelectorHTMLSpec.js
│ ├── SelectorImageSpec.js
│ ├── SelectorLinkSpec.js
│ ├── SelectorTableSpec.js
│ └── SelectorTextSpec.js
├── SelectorListSpec.js
├── SelectorSpec.js
├── SitemapSpec.js
├── UniqueElementListSpec.js
├── browser
│ ├── BackgroundScriptSpec.js
│ ├── ChromePopupBrowserSpec.js
│ ├── ContentScriptSpec.js
│ ├── ScraperSpec.js
│ └── Selector
│ │ ├── SelectorImageSpec.js
│ │ └── SelectorPopupLinkSpec.js
├── headless
│ └── browserSpec.js
├── jquery.whencallsequentiallySpec.js
└── jsdom
│ └── browserSpec.js
└── utils.js
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "plugins": ["meaningful-logs"]
3 | }
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | { "env": {
2 | "node": true
3 | },
4 | "globals": {
5 | "d3": true,
6 | "$": true,
7 | "chrome": true,
8 | "jQuery": true,
9 | "describe": true,
10 | "it": true,
11 | "beforeEach": true,
12 | "afterEach": true,
13 | "after": true,
14 | "before": true
15 | },
16 | "extends": ["standard"]}
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | projectFilesBackup
3 | extension.zip
4 | node_modules
5 | npm-debug.log
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "extension/assets/css-selector"]
2 | path = extension/assets/css-selector
3 | url = https://github.com/martinsbalodis/css-selector.git
4 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | .idea
2 | projectFilesBackup
3 | extension.zip
4 | node_modules
5 | npm-debug.log
6 | extension/assets/*
7 | extension/assets/*/
8 | !extension/assets/jquery.whencallsequentially.js
9 | !extension/assets/base64.js
10 | docs/images/*
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Web Scraper
3 | Web Scraper is a chrome browser extension and a library built for data extraction from web
4 | pages. Using this extension you can create a plan (sitemap) how a web site
5 | should be traversed and what should be extracted. Using these sitemaps the
6 | Web Scraper will navigate the site accordingly and extract all data. Scraped
7 | data later can be exported as CSV.
8 |
9 | To use it as an extension install it from [Chrome store][chrome-store]
10 |
11 | To use it as a library do `npm i web-scraper-headless`
12 |
13 | ### Features
14 |
15 | 1. Scrape multiple pages
16 | 2. Sitemaps and scraped data are stored in the browser's local storage or in CouchDB
17 | 3. Multiple data selection types
18 | 4. Extract data from dynamic pages (JavaScript+AJAX)
19 | 5. Browse scraped data
20 | 6. Export scraped data as CSV
21 | 7. Import, Export sitemaps
22 | 8. Depends only on Chrome browser
23 |
24 | ### Help
25 |
26 | Documentation and tutorials are available on [webscraper.io][webscraper.io]
27 |
28 | Ask for help, submit bugs, suggest features on [google groups][google-groups]
29 |
30 | Submit bugs and suggest features on [bug tracker][github-issues]
31 |
32 | #### Headless mode
33 | To use it as a library you need a sitemap, you can write it by hand, but the easiest way is to use the [original extension][extension] to scrape and then click on "export sitemap".
34 |
35 | const webscraper = require('web-scraper-headless')
36 | // visit github and retrieve last commit of all trending repo.
37 | // The sitemap depends on the actual DOM of github, so it might get outdated
38 | const sitemap = {
39 | "startUrl": "https://github.com/trending",
40 | "selectors": [{
41 | "parentSelectors": ["_root"],
42 | "type": "SelectorLink",
43 | "multiple": true,
44 | "id": "link_to_repo",
45 | "selector": "h3 a",
46 | "delay": ""
47 | }, {
48 | "parentSelectors": ["link_to_repo"],
49 | "type": "SelectorText",
50 | "multiple": false,
51 | "id": "latest_commit",
52 | "selector": "a.commit-tease-sha",
53 | "regex": "",
54 | "delay": ""
55 | }],
56 | "_id": "github_trending"
57 | }
58 | const options = {delay: 10, pageLoadDelay: 10, browser: 'headless'} // optional delay, pageLoadDelay and browser
59 | webscraper(sitemap, options)
60 | .then(function (scraped) {
61 | // This is your scraped info
62 | })
63 |
64 | By default webscraper-headless will open [jsdom](https://github.com/jsdom/jsdom) as a browser. This is a purely JS implementation of HTML. As such it has no native dependencies and it is very lightweight. However, it is not capable of executing js which might be a hindrance in some cases. If that is your case, you can use chrome headless as a browser. Note that it will consume far more resources than jsdom and you need to have some native dependencies installed in the server. To use chrome headless do the following:
65 |
66 | const sitemap = // same as previous example
67 | const options = {browser: 'headless'}
68 | webscraper(sitemap, options)
69 | .then(function (scraped) {
70 | // This is your scraped info
71 | })
72 |
73 | #### Bugs
74 | When submitting a bug please attach an exported sitemap if possible.
75 |
76 | ## License
77 | LGPLv3
78 |
79 | ## Changelog
80 |
81 | ### v0.2
82 | * Added Element click selector
83 | * Added Element scroll down selector
84 | * Added Link popup selector
85 | * Improved table selector to work with any html markup
86 | * Added Image download
87 | * Added keyboard shortcuts when selecting elements
88 | * Added configurable delay before using selector
89 | * Added configurable delay between page visiting
90 | * Added multiple start url configuration
91 | * Added form field validation
92 | * Fixed a lot of bugs
93 |
94 | ### v0.1.3
95 | * Added Table selector
96 | * Added HTML selector
97 | * Added HTML attribute selector
98 | * Added data preview
99 | * Added ranged start urls
100 | * Fixed bug which made selector tree not to show on some operating systems
101 |
102 | [chrome-store]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn
103 | [webscraper.io]: http://webscraper.io/
104 | [google-groups]: https://groups.google.com/forum/#!forum/web-scraper
105 | [github-issues]: https://github.com/martinsbalodis/web-scraper-chrome-extension/issues
106 | [extension]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn
107 |
--------------------------------------------------------------------------------
/docs/CSS selector.md:
--------------------------------------------------------------------------------
1 | # CSS selector
2 |
3 | Web Scraper uses css selectors to find HTML elements in web pages and to extract
4 | data from them. When selecting an element the Web Scraper will try to make its
5 | best guess what the CSS selector might be for the selected elements. But you
6 | can also write it yourself and test it by clicking "Element preview". You
7 | can use CSS selectors that are available in CSS versions 1-3 and also pseudo
8 | selectors that are additionally available in jQuery. Here are some
9 | documentation links that might help you:
10 |
11 | * [CSS Selectors][css-selectors-wikipedia]
12 | * [jQuery CSS selectors][css-selectors-jquery]
13 | * [w3schools CSS selector reference][w3schools-css-selector-reference]
14 |
15 | ## Additional Web Scraper selectors
16 | It is possible to add new pseudo CSS selectors to Web Scraper. Right now there
17 | is only one CSS selector added.
18 |
19 | #### Parent selector
20 |
21 | CSS Selector `_parent_` allows a child selector of an
22 | *Element selector* to select the element that was returned by the *Element selector*. For
23 | example this CSS selector could be used in a case where you need to extract an
24 | attribute from the element that the *Element selector* returned.
25 |
26 | [css-selectors-wikipedia]: http://en.wikipedia.org/wiki/Cascading_Style_Sheets#Selector
27 | [css-selectors-jquery]: http://api.jquery.com/category/selectors/
28 | [w3schools-css-selector-reference]: http://www.w3schools.com/cssref/css_selectors.asp
--------------------------------------------------------------------------------
/docs/Installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | You can install the extension from [Chrome store][1]. After installing it you
4 | should restart chrome to make sure the extension is fully loaded. If you don't
5 | want to restart Chrome then use the extension only in tabs that are
6 | created after installing it.
7 |
8 | ## Requirements
9 |
10 | The extension requires Chrome 31+. There are no OS limitations.
11 |
12 | [1]: https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn "Install web scraper from Chrome store"
--------------------------------------------------------------------------------
/docs/Open Web Scraper.md:
--------------------------------------------------------------------------------
1 | # Open Web Scraper
2 |
3 | Web Scraper is integrated into chrome Developer tools. Figure 1 shows how you
4 | can open it. You can also use these shortcuts to open Developer tools. After
5 | opening Developer tools open *Web Scraper* tab.
6 |
7 | Shortcuts:
8 |
9 | * windows, linux: `Ctrl+Shift+I`, `f12`, open `Tools / Developer tools`
10 | * mac `Cmd+Opt+I`, open `Tools / Developer tools`
11 |
12 | ![Fig. 1: Open Web Scraper][open-web-scraper]
13 |
14 | [open-web-scraper]: images/open-web-scraper/open-web-scraper.png?raw=true
--------------------------------------------------------------------------------
/docs/Scraping a site.md:
--------------------------------------------------------------------------------
1 | # Scraping a site
2 |
3 | Open the site that you want to scrape.
4 |
5 | ## Create Sitemap
6 |
7 | The first thing you need to do when creating a *sitemap* is specifying the
8 | start url. This is the url from which the scraping will start. You can also
9 | specify multiple start urls if the scraping should start from multiple places.
10 | For example if you want to scrape multiple search results then you could create
11 | a separate start url for each search result.
12 |
13 | ### Specify multiple urls with ranges
14 |
15 | In cases where a site uses numbering in pages URLs it is much simpler to create
16 | a range start url than creating *Link selectors* that would navigate the site.
17 | To specify a range url replace the numeric part of start url with a range
18 | definition - `[1-100]`. If the site uses zero padding in urls then add zero
19 | padding to the range definition - `[001-100]`. If you want to skip some urls
20 | then you can also specify incremental like this `[0-100:10]`.
21 |
22 | Use range url like this `http://example.com/page/[1-3]` for links like these:
23 |
24 | * `http://example.com/page/1`
25 | * `http://example.com/page/2`
26 | * `http://example.com/page/3`
27 |
28 | Use range url with zero padding like this `http://example.com/page/[001-100]`
29 | for links like these:
30 |
31 | * `http://example.com/page/001`
32 | * `http://example.com/page/002`
33 | * `http://example.com/page/003`
34 |
35 | Use range url with increment like this `http://example.com/page/[0-100:10]` for
36 | links like these:
37 |
38 | * `http://example.com/page/0`
39 | * `http://example.com/page/10`
40 | * `http://example.com/page/20`
41 |
42 | ## Create selectors
43 |
44 | After you have created the *sitemap* you can add selectors to it. In the
45 | *Selectors* panel you can add new selectors, modify them and navigate the
46 | selector tree.
47 | The selectors can be added in a tree type structure. The web scraper will
48 | execute the selectors in the order how they are organized in the tree
49 | structure. For example there is a news site and you want to scrape all articles
50 | whose links are available on the first page. In image 1 you can see this
51 | example site.
52 |
53 | ![Fig. 1: News site][image-news-site]
54 |
55 | To scrape this site you can create a *Link selector* which will extract all
56 | article links in the first page. Then as a child selector you can add a
57 | *Text selector* that will extract articles from the article pages that the
58 | *Link selector* found links to. Image below illustrates how the *sitemap*
59 | should be built for the news site.
60 |
61 | ![Fig. 2: News site sitemap][image-news-site-sitemap]
62 |
63 | Note that when creating selectors use Element preview and Data preview features
64 | to ensure that you have selected the correct elements with the correct data.
65 |
66 | More information about selector tree building is available in selector
67 | documentation. You should at least read about these core selectors:
68 |
69 | * [Text selector][text-selector]
70 | * [Link selector][link-selector]
71 | * [Element selector][element-selector]
72 |
73 | ### Inspect selector tree
74 |
75 | After you have created selectors for the *sitemap* you can inspect the tree
76 | structure of selectors in the Selector graph panel. Image below shows an
77 | example selector graph.
78 |
79 | ![Fig. 3: News site selector graph][image-news-site-selector-graph]
80 |
81 | ## Scrape the site
82 |
83 | After you have created selectors for the *sitemap* you can start scraping. Open
84 | *Scrape* panel and start scraping. A new popup window will open in which the
85 | scraper will load pages and extract data from them. After the scraping is done
86 | the popup window will close and you will be notified with a popup message. You can view
87 | the scraped data by opening *Browse* panel and export it by opening the
88 | *Export data as CSV* panel.
89 |
90 |
91 | [image-news-site]: images/scraping-a-site/news-site.png?raw=true
92 | [image-news-site-sitemap]: images/scraping-a-site/news-site-sitemap.png?raw=true
93 | [image-news-site-selector-graph]: images/scraping-a-site/news-site-selector-graph.png?raw=true
94 | [text-selector]: Selectors/Text%20selector.md
95 | [link-selector]: Selectors/Link%20selector.md
96 | [element-selector]: Selectors/Element%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors.md:
--------------------------------------------------------------------------------
1 | # Selectors
2 |
3 | Web scraper has multiple selectors that can be used for different types of data
4 | extraction and for different interaction with the website. The selectors can
5 | be divided in three groups:
6 |
7 | * Data extraction selectors for data extraction.
8 | * Link selectors for site navigation.
9 | * Element selectors for element selection that separate multiple records
10 |
11 | ### Data extraction selectors
12 |
13 | Data extraction selectors simply return data from the selected element.
14 | For example [Text selector][text-selector] extracts text from
15 | selected element. These selectors can be used as data extraction selectors:
16 |
17 | * [Text selector][text-selector]
18 | * [Link selector][link-selector]
19 | * [Link popup selector][link-popup-selector]
20 | * [Image selector][image-selector]
21 | * [Table selector][table-selector]
22 | * [Element attribute selector][element-attribute-selector]
23 | * [HTML selector][html-selector]
24 | * [Grouped selector][grouped-selector]
25 |
26 | ### Link selectors
27 |
28 | Link selectors extract URLs from links that can be later opened for data
29 | extraction. For example if in a sitemap tree there is a *Link selector* that has
30 | 3 child text selectors then the Web Scraper will extract all urls with the *Link
31 | selector* and then open each link and use those child data extraction selectors
32 | to extract data. Of course a link selector might have *Link selectors* as child
33 | selectors then these child *Link selectors* would be used for further page
34 | navigation. These are currently available *Link selectors*:
35 |
36 | * [Link selector][link-selector]
37 | * [Link popup selector][link-popup-selector]
38 |
39 | ### Element selectors
40 |
41 | Element selectors are for selecting elements that contain multiple data elements.
42 | For example an element selector might be used to select a list of items in an
43 | e-commerce site. The selector will return each selected element as a parent
44 | element to its child selectors. An element selector's child selectors will
45 | extract data only within the element that the element selector gave them.
46 | These are currently available Element selectors:
47 |
48 | * [Element selector][element-selector]
49 | * [Element scroll down selector][element-scroll-selector]
50 | * [Element click selector][element-click-selector]
51 |
52 | ## Selector configuration options
53 |
54 | Each selector has configuration options. Here you can see the most common ones.
55 | Configuration options that are specific to a selector are described in
56 | selectors documentation.
57 |
58 | * selector - CSS selector that selects an element the selector will be working
59 | on.
60 | * multiple - should be checked when multiple records (data rows) are going to
61 | be extracted with this selector. Data extracted from two or more selectors with
62 | multiple checked won't be merged in a single record.
63 | * delay - delay before selector is being used.
64 | * parent selectors - configure parent selectors for this selector to make the
65 | selector tree.
66 |
67 | Note! A common mistake when using the multiple configuration option is to create
68 | two selectors side by side, both with multiple checked, and expect that the scraper will
69 | join selector values in pairs. For example if you selected pagination links and
70 | navigation links these links couldn't be logically joined in pairs. The correct
71 | way is to select a wrapper element with Element selector and add data selectors
72 | as child selectors to the element selector with multiple option not checked.
73 |
74 | [text-selector]: Selectors/Text%20selector.md
75 | [link-selector]: Selectors/Link%20selector.md
76 | [link-popup-selector]: Selectors/Link%20popup%20selector.md
77 | [image-selector]: Selectors/Image%20selector.md
78 | [element-attribute-selector]: Selectors/Element%20attribute%20selector.md
79 | [table-selector]: Selectors/Table%20selector.md
80 | [grouped-selector]: Selectors/Grouped%20selector.md
81 | [html-selector]: Selectors/HTML%20selector.md
82 | [element-selector]: Selectors/Element%20selector.md
83 | [element-click-selector]: Selectors/Element%20click%20selector.md
84 | [element-scroll-selector]: Selectors/Element%20scroll%20down%20selector.md
85 |
--------------------------------------------------------------------------------
/docs/Selectors/Element attribute selector.md:
--------------------------------------------------------------------------------
1 | # Element attribute selector
2 | Element attribute selector can extract an attribute's value from an HTML element.
3 | For example you could use this selector to extract title attribute from
4 | this link: `link`.
5 |
6 | ## Configuration options
7 | * selector - [CSS selector][css-selector] for the element.
8 | * multiple - multiple records are being extracted.
9 | * attribute name - the attribute that is going to be extracted. For example
10 | `title`, `data-id`.
11 |
12 | ## Use cases
13 | See [Text selector][text-selector] use cases.
14 |
15 | [text-selector]: Text%20selector.md
16 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Element click selector.md:
--------------------------------------------------------------------------------
1 | # Element click selector
2 |
3 | Element click selector works similarly to
4 | [Element selector][element-selector]. Its main purpose also is element
5 | selection that could be given as parent elements to its child selectors. The only
6 | difference is that *Element click selector* can interact with the web page by
7 | clicking on buttons to load new elements. For example a page might use
8 | JavaScript and AJAX for pagination or item loading.
9 |
10 | Note! when selecting clickable elements you should select them by moving the
11 | mouse over the element and pressing "S". This kind of selection will avoid
12 | events triggered by the button.
13 |
14 | ## Configuration options
15 | * selector - [CSS selector][css-selector] for the wrapper elements that will
16 | be used as parent elements for child selectors.
17 | * click selector - [CSS selector][css-selector] for the buttons that need to
18 | be clicked to load more elements.
19 | * click type - type of how the selector knows when there will be no new
20 | elements and clicking should stop.
21 | * click element uniqueness - type of how selector knows which buttons are
22 | already clicked.
23 | * multiple - multiple records are being extracted (almost always should be
24 | checked). Multiple option for child selectors usually should not be checked.
25 | * delay - delay before element selection and delay between clicking. This
26 | should usually be specified because the data won't be loaded immediately from
27 | the server. More than 2000 ms might be a good choice if you don't want to
28 | lose data because the server didn't respond fast enough.
29 | * Discard initial elements - the selector will not return the elements that
30 | were available before clicking for the first time. This might be useful for
31 | duplicate removal.
32 |
33 | ### Click type
34 | #### Click Once
35 |
36 | Click Once type will click on the buttons only once. If a new button appears
37 | that can be selected it will be also clicked. For example pagination links
38 | might show pages 1 to 5 but pages 6 to 10 would appear some time later. The
39 | selector will also click on those buttons.
40 |
41 | #### Click More
42 |
43 | Click More type makes the selector click on given buttons multiple times
44 | until there are no new elements appearing. A new element is considered an
45 | element that has unique text content.
46 |
47 | ### Click element uniqueness
48 |
49 | When using *Click Once* only unique buttons will be clicked. When using
50 | *Click More* this helps to ignore buttons that don't generate more elements.
51 |
52 | * Unique Text - buttons with identical text content are considered equal
53 | * Unique HTML+Text - buttons with identical HTML and text content are
54 | considered equal
55 | * Unique HTML - buttons with identical HTML and stripped text content are
56 | considered equal
57 | * Unique CSS Selector - buttons with identical CSS Selector are considered equal
58 |
59 | ## Use cases
60 |
61 | #### Navigate pagination using "Click once" selector type
62 |
63 | For example there is a site that displays a list of items and there are some
64 | pagination buttons that reload these items dynamically (after clicking a button
65 | the url doesn't change; changes after the hash tag # don't count). Using *Element
66 | click selector* you can select these items and buttons that need to be clicked.
67 | The scraper during scraping phase will click these buttons to extract all
68 | elements. Also you need to add child selectors for the *Element click selector*
69 | that select data within each element. In figure 1 you can see how to configure
70 | the *Element click selector* to extract data from the described site.
71 |
72 | ![Fig. 1: Sitemap when using Click once type][image-click-once]
73 |
74 | #### Load more items in an e-commerce site by clicking "More" button
75 |
76 | This example is similar to the one above. The only difference is that in this
77 | site items are loaded by clicking a single button multiple times. In this case
78 | the *Element click selector* should be configured to use "Click more" click
79 | type. In figure 2 you can see how to configure the *Element click selector*
80 | to extract data from this site.
81 |
82 | ![Fig. 2: Sitemap when using Click more type][image-click-more]
83 |
84 | [image-click-more]: ../images/selectors/element-click/click-more.png?raw=true
85 | [image-click-once]: ../images/selectors/element-click/click-once.png?raw=true
86 | [element-selector]: Element%20selector.md
87 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Element scroll down selector.md:
--------------------------------------------------------------------------------
1 | # Element scroll down selector
2 |
3 | This is another Element selector that works similarly to Element selector but
4 | additionally it scrolls down the page multiple times to find those elements
5 | which are added when page is scrolled down to the bottom. Use the delay
6 | attribute to configure waiting interval between scrolling and element search.
7 | Scrolling is stopped after no new elements are found. If the page can scroll
8 | infinitely then this selector will be stuck in an infinite loop.
9 |
10 | ## Configuration options
11 |
12 | * selector - [CSS selector][css-selector] for the element.
13 | * multiple - multiple records are being extracted (almost always should be
14 | checked). Multiple option for child selectors usually should not be checked.
15 | * delay - delay before element selection and delay between scrolling. This
16 | should usually be specified because the data won't be loaded immediately from
17 | the server after scrolling down. More than 2000 ms might be a good choice if
18 | you don't want to lose data because the server didn't respond fast enough.
19 |
20 | ## Use cases
21 | See [Element selector][element-selector] use cases.
22 |
23 | [element-selector]: Element%20selector.md
24 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Element selector.md:
--------------------------------------------------------------------------------
1 | # Element selector
2 |
3 | Element selector is for selecting elements that contain multiple data elements.
4 | For example element selector might be used to select a list of items in an
5 | e-commerce site. The selector will return each selected element as a parent
6 | element to its child selectors. An element selector's child selectors will be
7 | extracting data only within the element that the element selector gave them.
8 |
9 | Note! If the page dynamically loads new items after scrolling down or clicking
10 | on a button then you should try these selectors:
11 |
12 | * [Element scroll down selector][element-scroll-selector]
13 | * [Element click selector][element-click-selector]
14 |
15 | ## Configuration options
16 | * selector - [CSS selector][css-selector] for the wrapper elements that will
17 | be used as parent elements for child selectors.
18 | * multiple - multiple records are being extracted (almost always should be
19 | checked). Multiple option for child selectors usually should not be checked.
20 |
21 | ## Use cases
22 |
23 | #### Select multiple e-commerce items from a page
24 |
25 | For example an e-commerce site has a page with a list of items. With element
26 | selector you can select the elements that wrap these items and then add
27 | multiple child selectors to it to extract data within the item's wrapper
28 | element. Figure 1 shows how an element selector could be used in this
29 | situation.
30 |
31 | ![Fig. 1: Multiple items selected with element selector] [multiple-elements-with-text-selectors]
32 |
33 | #### Extract data from tables
34 |
35 | Similarly to e-commerce item selection you can also select table rows and add
36 | child selectors for data extraction from table cells.
37 | Though [Table selector] [table-selector] might be a much better solution.
38 |
39 | [css-selector]: ../CSS%20selector.md
40 | [element-scroll-selector]: Element%20scroll%20down%20selector.md
41 | [element-click-selector]: Element%20click%20selector.md
42 | [table-selector]: Table%20selector.md
43 | [multiple-elements-with-text-selectors]: ../images/selectors/text/text-selector-multiple-elements-with-text-selectors.png?raw=true
--------------------------------------------------------------------------------
/docs/Selectors/Grouped selector.md:
--------------------------------------------------------------------------------
1 | # Grouped selector
2 |
3 | Grouped selector can group text data from multiple elements into one record.
4 | The extracted data will be stored as JSON.
5 |
6 | ## Configuration options
7 | * selector - [CSS selector] [css-selector] for the elements whose text will be
8 | extracted and stored in JSON format.
9 | * attribute name - optionally this selector can extract an attribute of the
10 | selected element. If specified the extractor will also add this attribute to
11 | the resulting JSON.
12 |
13 | ## Use cases
14 |
15 | #### Extract article references
16 |
17 | For example you are extracting a news article that might have multiple
18 | reference links. If you are selecting these links with link selector with
19 | multiple checked you would get duplicate articles in the result set where each
20 | record would contain one reference link. Using grouped selector you could
21 | serialize all these reference links into one record. To do that select all
22 | reference links and set attribute name to `href` to also extract links to these
23 | sites.
24 |
25 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/HTML selector.md:
--------------------------------------------------------------------------------
1 | # HTML selector
2 | HTML selector can extract HTML and text within the selected element. Only the
3 | inner HTML of the element will be extracted.
4 |
5 | ## Configuration options
6 | * selector - [CSS selector] [css-selector] for the element whose inner HTML
7 | will be extracted.
8 | * multiple - multiple records are being extracted.
9 |
10 | ## Use cases
11 | See [Text selector] [text-selector] use cases.
12 |
13 | [text-selector]: Text%20selector.md
14 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Image selector.md:
--------------------------------------------------------------------------------
1 | # Image selector
2 | Image selector can extract `src` attribute (URL) of an image.
3 | Optionally you can also store the images. The images will be stored in your
4 | downloads directory:
5 |
6 | `Downloads/<sitemap-id>/<selector-id>/<image-name>`
7 |
8 | Note! When selecting CSS selector for image selector all the images within the
9 | site are moved to the top. If this feature somehow breaks the site's layout please
10 | report it as a bug.
11 |
12 | ## Configuration options
13 | * selector - [CSS selector] [css-selector] for the image element.
14 | * multiple - multiple records are being extracted. Usually should not be
15 | checked for Image selector.
16 | * download image - downloads and stores images on local drive. When CouchDB
17 | storage back end is used the image is also stored locally.
18 |
19 | ## Use cases
20 | See [Text selector] [text-selector] use cases.
21 |
22 | [text-selector]: Text%20selector.md
23 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Link popup selector.md:
--------------------------------------------------------------------------------
1 | # Link popup selector
2 |
3 | *Link popup selector* works similarly to [Link selector] [link-selector]. It can
4 | be used for url extraction and site navigation. The only difference is that
5 | *Link popup selector* should be used when clicking on a link makes the site open a new
6 | window (popup) instead of loading the URL in the same tab or opening it in a
7 | new tab. This selector will catch the popup creation event and extract the URL.
8 | If the site creates a visual popup but not a real window then you should try
9 | [Element click selector] [element-click-selector].
10 |
11 | Note! When selecting these link elements you can move the mouse over the
12 | element and press "S" to select it to prevent it from opening a popup.
13 |
14 | ## Use cases
15 | See [Link selector] [link-selector] use cases.
16 |
17 | [link-selector]: Link%20selector.md
18 | [element-click-selector]: Element%20click%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Link selector.md:
--------------------------------------------------------------------------------
1 | # Link selector
2 |
3 | Link selector is used for link selection and website navigation. If you use
4 | *Link selector* without any child selectors then it will extract the link and
5 | the href attribute of the link. If you add child selectors to *Link selector*
6 | then these child selectors will be used in the page that this link was leading
7 | to. If you are selecting multiple links then check *multiple* property.
8 |
9 | Note! Link selector works only with `<a>` tags with an `href` attribute. If the
10 | link selector is not working for you then you can try these workarounds:
11 |
12 | 1. Check that the link in the url bar changes after clicking an item (a change
13 | only after the hash tag doesn't count). If the link doesn't change then the site
14 | is probably using ajax for data loading. Instead of using link selector you
15 | should use [Element click selector] [element-click].
16 | 2. If the site opens a popup then you should use
17 | [Link popup selector] [link-popup].
18 | 3. The site might be using JavaScript `window.location` to change the URL. Web
19 | Scraper cannot handle this kind of navigation right now.
20 |
21 | ## Configuration options
22 |
23 | * selector - [CSS selector] [css-selector] for the link element from which the
24 | link for navigation will be extracted.
25 | * multiple - multiple records are being extracted. Usually should be checked.
26 |
27 | ## Use cases
28 |
29 | **Navigate through multiple levels of navigation**
30 |
31 | For example an e-commerce site has multi level navigation -
32 | `categories -> subcategories`. To scrape data from all categories and
33 | subcategories you can create two *Link selectors*. One selector would select
34 | category links and the other selector would select subcategory links that are
35 | available in the category pages. The subcategory *Link selector* should be made
36 | as a child of the category *Link selector*. The selectors for data extraction
37 | from subcategory pages should be made child selectors of the subcategory
38 | selector.
39 |
40 | ![Fig. 1: Multiple link selectors for category navigation][multiple-level-link-selectors]
41 |
42 | **Handle pagination**
43 |
44 | For example an e-commerce site has multiple categories. Each category has a
45 | list of items and pagination links. Also some pages are not directly available
46 | from the category but are available from pagination pages (you can see
47 | pagination links 1-5, but not 6-8). You can start by building a sitemap that
48 | visits each category and extracts items from the category page. This sitemap will
49 | extract items only from the first pagination page. To extract items from all of
50 | the pagination links including the ones that are not visible at the beginning
51 | you need to create another *Link selector* that selects the pagination links.
52 | Figure 2 shows how the link selector should be created in the sitemap. When
53 | the scraper opens a category link it will extract items that are available in
54 | the page. After that it will find the pagination links and also visit those. If
55 | the pagination link selector is made a child to itself it will recursively
56 | discover all pagination pages. Figure 3 shows a selector graph where you can
57 | see how pagination links discover more pagination links and more data.
58 |
59 | ![Fig. 2: Sitemap with Link selector for pagination][pagination-link-selectors]
60 | ![Fig. 3: Selector graph with pagination][pagination-selector-graph]
61 |
62 | [multiple-level-link-selectors]: ../images/selectors/link/multiple-level-link-selectors.png?raw=true
63 | [pagination-link-selectors]: ../images/selectors/link/pagination-link-selectors.png?raw=true
64 | [pagination-selector-graph]: ../images/selectors/link/pagination-selector-graph.png?raw=true
65 | [element-click]: Element%20click%20selector.md
66 | [link-popup]: Link%20popup%20selector.md
67 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Table selector.md:
--------------------------------------------------------------------------------
1 | # Table selector
2 |
3 | Table selector can extract data from tables. *Table selector* has 3
4 | configurable CSS selectors. The selector is for table selection. After you have
5 | selected the selector the *Table selector* will try to guess selectors
6 | for header row and data rows. You can click Element preview on those selectors
7 | to see whether the *Table selector* found table header and data rows correctly.
8 | The header row selector is used to identify table columns when data is
9 | extracted from multiple pages. Also you can rename table columns. Figure 1
10 | shows what you should select when extracting data from a table.
11 |
12 | ![Fig. 1: Selectors for table selector] [table-selector-selectors]
13 |
14 | ## Configuration options
15 | * selector - [CSS selector] [css-selector] for the table element.
16 | * header row selector - [CSS selector] [css-selector] for table header row.
17 | * data rows selector - [CSS selector] [css-selector] for table data rows.
18 | * multiple - multiple records are being extracted. Usually should be
19 | checked for Table selector because you are extracting multiple rows.
20 |
21 | ## Use cases
22 | See [Text selector] [text-selector] use cases.
23 |
24 | [table-selector-selectors]: ../images/selectors/table/selectors.png?raw=true
25 | [text-selector]: Text%20selector.md
26 | [css-selector]: ../CSS%20selector.md
--------------------------------------------------------------------------------
/docs/Selectors/Text selector.md:
--------------------------------------------------------------------------------
1 | # Text selector
2 |
3 | Text selector is used for text selection. The text selector will extract text
4 | from the selected element and from all its child elements. HTML will be
5 | stripped and only text will be returned. Selector will ignore text within
6 | `
4 |