├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.exs ├── dev.exs ├── prod.exs └── test.exs ├── doc ├── 404.html ├── Scrapex.GenSpider.Response.html ├── Scrapex.GenSpider.html ├── Scrapex.Selector.html ├── Scrapex.html ├── assets │ └── logo.png ├── dist │ ├── app.css │ ├── app.js │ └── sidebar_items.js ├── extra-api-reference.html ├── extra-readme.html ├── fonts │ ├── icomoon.eot │ ├── icomoon.svg │ ├── icomoon.ttf │ └── icomoon.woff └── index.html ├── lib ├── scrapex.ex └── scrapex │ ├── gen_spider.ex │ ├── gen_spider │ ├── README.md │ ├── request.ex │ └── response.ex │ ├── selector.ex │ └── spider │ └── webscraper.ex ├── logo.png ├── mix.exs ├── mix.lock └── test ├── sample_pages ├── e-commerce │ └── static │ │ ├── computers │ │ ├── index.html │ │ ├── index_files │ │ │ ├── cart2.png │ │ │ ├── site.js │ │ │ └── style.css │ │ ├── laptops │ │ │ └── index.html │ │ └── tablets │ │ │ └── index.html │ │ ├── index.html │ │ └── phones │ │ ├── index.html │ │ └── touch │ │ └── index.html └── example.com.html ├── scrapex ├── gen_spider_test.exs ├── selector_test.exs └── spider │ ├── example_test.exs │ ├── webscraper.csv │ └── webscraper_test.exs ├── scrapex_test.exs └── test_helper.exs /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /deps 3 | erl_crash.dump 4 | *.ez 5 | .DS_Store 6 | *.beam -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Son Tran-Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapex 2 | ======= 3 | 4 | An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way. 5 | 6 | ## Features 7 | 8 | ### Fast and powerful 9 | Write the rules to extract the data and let Scrapex do the rest. 10 | 11 | ### Easily extensible 12 | Extensible by design, plug new functionality easily without having to touch the core. 13 | 14 | ### Portable, Elixir 15 | Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices. 
16 | 17 | ## Build your own webcrawlers 18 | 19 | alias Scrapex.GenSpider 20 | defmodule StackOverflowSpider do 21 | use GenSpider 22 | import Scrapex.Selector 23 | 24 | def parse(response, state) do 25 | result = response.body 26 | |> select(".question-summary h3 a") 27 | |> extract("href") 28 | |> Enum.map(fn(href) -> 29 | GenSpider.Response.url_join(response, href) 30 | |> GenSpider.request(&parse_question/1) 31 | |> GenSpider.await 32 | end) 33 | {:ok, result, state} 34 | end 35 | 36 | defp parse_question({:ok, response}) do 37 | html = response.body 38 | [title] = html |> select("h1 a") |> extract() 39 | question = html |> select(".question") 40 | [body] = question |> select(".post-text") |> extract 41 | [votes] = question |> select(".vote-count-post") |> extract 42 | tags = question |> select(".post-tag") |> extract 43 | 44 | %{title: title, body: body, votes: votes, tags: tags} 45 | end 46 | end 47 | urls = ["http://stackoverflow.com/questions?sort=votes"] 48 | opts = [name: :stackoverflow_spider, urls: urls] 49 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts) 50 | questions = GenSpider.export(spider) 51 | #=> "[{} | _]" 52 | 53 | ## TODOS 54 | 55 | - [x] `GenSpider behaviour`. 56 | - [x] Request URL and pass response to `parse/2` callback. 57 | - [x] One time spider 58 | - [x] CSS selector 59 | - [ ] XPath selector 60 | - [x] Yield for requests in `parse/2` 61 | - [x] Follow redirects 62 | - [ ] Set custom request headers 63 | - [ ] Respect robots.txt 64 | - [ ] Resolve DNS once only 65 | - [ ] Domain blacklist 66 | - [ ] Parse response chunk by chunk 67 | - [ ] CLI -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for third- 9 | # party users, it should be done in your mix.exs file. 10 | 11 | # Sample configuration: 12 | # 13 | # config :logger, :console, 14 | # level: :info, 15 | # format: "$date $time [$level] $metadata$message\n", 16 | # metadata: [:user_id] 17 | 18 | # It is also possible to import configuration files, relative to this 19 | # directory. For example, you can emulate configuration per environment 20 | # by uncommenting the line below and defining dev.exs, test.exs and such. 21 | # Configuration from the imported file will override the ones defined 22 | # here (which is why it is important to import them last). 
23 | # 24 | import_config "#{Mix.env}.exs" 25 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Do not print debug messages in production 4 | config :logger, level: :info -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Print only warnings and errors during test 4 | config :logger, level: :warn -------------------------------------------------------------------------------- /doc/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 404 – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

Page not found

65 | 66 |

Sorry, but the page you were trying to get to does not exist. You
67 | may want to try searching this site using the sidebar or using our
68 | API Reference page to find what
69 | you were looking for.

70 | 71 | 83 |
84 |
85 |
86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /doc/Scrapex.GenSpider.Response.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex.GenSpider.Response – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex.GenSpider.Response 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 |
76 |

Utilities for working with the response returned from GenSpider.

77 | 78 |
79 | 80 | 81 | 82 |
83 |

84 | 85 | 86 | 87 | Summary 88 |

89 | 90 |
91 |

92 | Types 93 |

94 |
95 |
96 | t() 97 |
98 | 99 |
100 | 101 |
102 | 103 | 104 | 105 |
106 |

107 | Functions 108 |

109 |
110 | 113 | 114 |

Join a path relative to the response’s URL

115 |
116 | 117 |
118 | 119 |
120 | 121 | 122 | 123 | 124 | 125 | 126 |
127 | 128 | 129 | 130 |
131 |

132 | 133 | 134 | 135 | Types 136 |

137 |
138 |
139 |
t :: %Scrapex.GenSpider.Response{url: binary, body: binary}
140 | 141 |
142 | 143 |
144 |
145 | 146 | 147 | 148 |
149 |

150 | 151 | 152 | 153 | Functions 154 |

155 |
156 |
157 | 158 | 159 | 160 | url_join(response, path) 161 | 162 | 163 | 164 | 165 | 166 |
167 | 168 |
169 |

Specs

170 |
171 | 172 |
url_join(t, binary) :: binary
173 | 174 |
175 |
176 | 177 |
178 |

Join a path relative to the response’s URL.

179 |

Examples

180 |
iex> alias Scrapex.GenSpider.Response
181 | iex> response = %Response{url: "http://www.scrapex.com/subfolder"}
182 | iex> Response.url_join(response, "/subfolder2")
183 | "http://www.scrapex.com/subfolder2"
184 | iex> Response.url_join(response, "subsubfolder")
185 | "http://www.scrapex.com/subfolder/subsubfolder"
186 | 187 |
188 |
189 | 190 |
191 | 192 | 193 | 194 | 195 | 196 | 208 |
209 |
210 |
211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /doc/Scrapex.Selector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex.Selector – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex.Selector 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 |
76 |

Utilities for extracting data from markup language.

77 | 78 |
79 | 80 | 81 | 82 |
83 |

84 | 85 | 86 | 87 | Summary 88 |

89 | 90 |
91 |

92 | Types 93 |

94 |
95 |
96 | attribute() 97 |
98 | 99 |

Attribute of a node

100 |
101 | 102 |
103 |
104 |
105 | children() 106 |
107 | 108 |
109 |
110 |
111 | html_node() 112 |
113 | 114 |
115 |
116 |
117 | html_tree() 118 |
119 | 120 |

A tree of HTML nodes, or a node itself if only one

121 |
122 | 123 |
124 |
125 |
126 | name() 127 |
128 | 129 |

Name of the tag or attribute

130 |
131 | 132 |
133 |
134 |
135 | selector() 136 |
137 | 138 |
139 |
140 |
141 | t() 142 |
143 | 144 |
145 | 146 |
147 | 148 | 149 | 150 |
151 |

152 | Functions 153 |

154 |
155 |
156 | extract(selector) 157 |
158 | 159 |

Extracts content or attribute value for a selection

160 |
161 | 162 |
163 |
164 | 167 | 168 |
169 |
170 | 173 | 174 |

Generates a selection for a particular selector

175 |
176 | 177 |
178 | 179 |
180 | 181 | 182 | 183 | 184 | 185 | 186 |
187 | 188 | 189 | 190 |
191 |

192 | 193 | 194 | 195 | Types 196 |

197 |
198 |
199 |
attribute :: {name, binary}
200 | 201 |

Attribute of a node

202 |
203 | 204 |
205 |
206 |
children :: [html_node]
207 | 208 |
209 |
210 |
html_node :: {name, [attribute], children}
211 | 212 |
213 |
214 |
html_tree :: html_node | [html_node]
215 | 216 |

A tree of HTML nodes, or a node itself if only one

217 |
218 | 219 |
220 |
221 |
name :: binary
222 | 223 |

Name of the tag or attribute

224 |
225 | 226 |
227 |
228 |
selector :: binary
229 | 230 |
231 |
232 |
t :: %Scrapex.Selector{tree: html_tree}
233 | 234 |
235 | 236 |
237 |
238 | 239 | 240 | 241 |
242 |

243 | 244 | 245 | 246 | Functions 247 |

248 |
249 |
250 | 251 | 252 | 253 | extract(selector) 254 | 255 | 256 | 257 | 258 | 259 |
260 | 261 |
262 |

Extracts content or attribute value for a selection.

263 | 264 |
265 |
266 |
267 |
268 | 269 | 270 | 271 | extract(selector, attr) 272 | 273 | 274 | 275 | 276 | 277 |
278 | 279 |
280 |

Specs

281 |
282 | 283 |
extract(t, name) :: [binary]
284 | 285 |
286 |
287 | 288 |
289 | 290 |
291 |
292 |
293 |
294 | 295 | 296 | 297 | select(html, selector) 298 | 299 | 300 | 301 | 302 | 303 |
304 | 305 |
306 |

Specs

307 |
308 | 309 |
select(binary | t, selector) :: t
310 | 311 |
312 |
313 | 314 |
315 |

Generates a selection for a particular selector.

316 |

The return value is a Selector.t

317 | 318 |
319 |
320 | 321 |
322 | 323 | 324 | 325 | 326 | 327 | 339 |
340 |
341 |
342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /doc/Scrapex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 97 |
98 |
99 |
100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /doc/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/assets/logo.png -------------------------------------------------------------------------------- /doc/dist/app.css: -------------------------------------------------------------------------------- 1 | @import url(https://fonts.googleapis.com/css?family=Lato:400,300,700,900|Merriweather:300italic,300,700,700italic|Inconsolata:400,700);.hljs,article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block}img,legend{border:0}.sidebar a,.sidebar-toggle{transition:color .3s ease-in-out}.sidebar .sidebar-search .sidebar-searchInput:focus,.sidebar .sidebar-search .sidebar-searchInput:hover,.sidebar-toggle:active,.sidebar-toggle:focus,.sidebar-toggle:hover,a:active,a:hover{outline:0}.results ul,.sidebar ul{list-style:none}.hljs-comment{color:#8e908c}.css .hljs-class,.css .hljs-id,.css .hljs-pseudo,.hljs-attribute,.hljs-regexp,.hljs-tag,.hljs-variable,.html .hljs-doctype,.ruby .hljs-constant,.xml .hljs-doctype,.xml .hljs-pi,.xml .hljs-tag .hljs-title{color:#c82829}.hljs-built_in,.hljs-constant,.hljs-literal,.hljs-number,.hljs-params,.hljs-pragma,.hljs-preprocessor{color:#f5871f}.css .hljs-rule .hljs-attribute,.ruby .hljs-class .hljs-title{color:#eab700}.hljs-header,.hljs-inheritance,.hljs-name,.hljs-string,.hljs-value,.ruby .hljs-symbol,.xml .hljs-cdata{color:#718c00}.css .hljs-hexcolor,.hljs-title{color:#3e999f}.coffeescript .hljs-title,.hljs-function,.javascript .hljs-title,.perl .hljs-sub,.python .hljs-decorator,.python .hljs-title,.ruby .hljs-function .hljs-title,.ruby .hljs-title .hljs-keyword{color:#4271ae}.hljs-keyword,.javascript .hljs-function{color:#8959a8}.hljs{overflow-x:auto;background:#fff;color:#4d4d4c;padding:.5em;-webkit-text-size-adjust:none}legend,td,th{padding:0}.coffeescript .javascript,.javascript .xml,.tex .hljs-formula,.xml .css,.xml .hljs-cdata,.xml .javascript,.xml .vbscript{opacity:.5}/*! 
normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}audio,canvas,progress,video{display:inline-block;vertical-align:baseline}audio:not([controls]){display:none;height:0}[hidden],template{display:none}a{background-color:transparent}abbr[title]{border-bottom:1px dotted}b,optgroup,strong{font-weight:700}dfn{font-style:italic}h1{font-size:2em;margin:.67em 0}mark{background:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}svg:not(:root){overflow:hidden}figure{margin:1em 40px}hr{box-sizing:content-box;height:0}pre,textarea{overflow:auto}code,kbd,pre,samp{font-family:monospace,monospace;font-size:1em}button,input,optgroup,select,textarea{color:inherit;font:inherit;margin:0}button{overflow:visible}.main,body,html{overflow:hidden}button,select{text-transform:none}button,html input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer}button[disabled],html input[disabled]{cursor:default}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}input{line-height:normal}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=number]::-webkit-inner-spin-button,input[type=number]::-webkit-outer-spin-button{height:auto}.content,.main,.sidebar,body,html{height:100%}input[type=search]{-webkit-appearance:textfield;box-sizing:content-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}fieldset{border:1px solid silver;margin:0 2px;padding:.35em .625em .75em}table{border-collapse:collapse;border-spacing:0}@font-face{font-family:icomoon;src:url(../fonts/icomoon.eot?h5z89e);src:url(../fonts/icomoon.eot?#iefixh5z89e) format('embedded-opentype'),url(../fonts/icomoon.ttf?h5z89e) format('truetype'),url(../fonts/icomoon.woff?h5z89e) format('woff'),url(../fonts/icomoon.svg?h5z89e#icomoon) format('svg');font-weight:400;font-style:normal}.icon-elem,[class*=" icon-"],[class^=icon-]{font-family:icomoon;speak:none;font-style:normal;font-weight:400;font-variant:normal;text-transform:none;line-height:1;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.sidebar,body{font-family:Lato,sans-serif}.icon-link:before{content:"\e005"}.icon-search:before{content:"\e036"}.icon-cross:before{content:"\e117"}.icon-menu:before{content:"\e120"}.icon-angle-right:before{content:"\f105"}.icon-code:before{content:"\f121"}body,html{box-sizing:border-box;width:100%}body{margin:0;font-size:16px;line-height:1.6875em}*,:after,:before{box-sizing:inherit}.main{display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}.sidebar,body.sidebar-closed .sidebar{display:none}.sidebar{-webkit-flex:0 1 300px;-moz-flex:0 1 300px;-ms-flex:0 1 300px;flex:0 1 300px;-ms-flex-positive:0;-ms-flex-negative:1;-ms-flex-preferred-size:300px;-webkit-box-orient:vertical;-moz-box-orient:vertical;-webkit-box-direction:normal;-moz-box-direction:normal;min-height:0;-webkit-flex-direction:column;-moz-flex-direction:column;-ms-flex-direction:column;flex-direction:column;position:absolute;z-index:999}.content{-webkit-flex:1 1 .01%;-moz-flex:1 1 .01%;-ms-flex:1 1 .01%;flex:1 1 .01%;-ms-flex-positive:1;-ms-flex-negative:1;-ms-flex-preferred-size:.01%;overflow-y:auto;-webkit-overflow-scrolling:touch}.content-inner{max-width:949px;margin:0 auto;padding:3px 60px}@media screen and 
(max-width:768px){.content-inner{padding:27px 20px 27px 40px}}body.sidebar-closed .sidebar-toggle{display:block}.sidebar-toggle{position:fixed;z-index:99;left:18px;top:8px;background-color:transparent;border:none;padding:0;font-size:16px}.sidebar-toggle:hover{color:#e1e1e1}@media screen and (min-width:768px){.sidebar-toggle{display:none}}.sidebar{font-size:14px;line-height:18px;background:#373f52;color:#d5dae6;overflow:hidden}.sidebar .sidebar-toggle{display:block;left:275px;color:#e1e1e1}.sidebar .sidebar-toggle:hover{color:#fff}.sidebar ul li{margin:0;padding:0 10px}.sidebar a{color:#d5dae6;text-decoration:none}.sidebar a:hover{color:#fff}.sidebar .sidebar-projectLink{margin:23px 30px 0}.sidebar .sidebar-projectDetails{display:inline-block;text-align:right;vertical-align:top;margin-top:6px}.sidebar .sidebar-projectImage{display:inline-block;max-width:64px;max-height:64px;margin-left:15px;vertical-align:bottom}.sidebar .sidebar-projectName{font-weight:700;font-size:24px;line-height:30px;color:#fff;margin:0;padding:0;max-width:155px}.sidebar .sidebar-projectVersion{margin:0;padding:0;font-weight:300;font-size:16px;line-height:20px;color:#fff}.sidebar .sidebar-listNav{padding:0 30px}.sidebar .sidebar-listNav li,.sidebar .sidebar-listNav li a{text-transform:uppercase;font-weight:300;font-size:13px}.sidebar .sidebar-listNav li{padding-left:17px;border-left:3px solid transparent;transition:all .3s linear;line-height:27px}.sidebar .sidebar-listNav li.selected,.sidebar .sidebar-listNav li.selected a,.sidebar .sidebar-listNav li:hover,.sidebar .sidebar-listNav li:hover a{border-color:#9768d1;color:#fff}.sidebar .sidebar-search{margin:23px 30px 18px;display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}.sidebar .sidebar-search i.icon-search{font-size:14px;color:#d5dae6}.sidebar #full-list li.clicked>a,.sidebar #full-list ul li.active a{color:#fff}.sidebar .sidebar-search .sidebar-searchInput{background-color:transparent;border:none;border-radius:0;border-bottom:1px solid #959595;margin-left:5px}.sidebar #full-list{margin:4px 0 0 30px;padding:0 20px;overflow-y:auto;-webkit-overflow-scrolling:touch;-webkit-flex:1 1 .01%;-moz-flex:1 1 .01%;-ms-flex:1 1 .01%;flex:1 1 .01%;-ms-flex-positive:1;-ms-flex-negative:1;-ms-flex-preferred-size:.01%}.sidebar #full-list ul{margin:0 20px;padding:9px 0 18px}.sidebar #full-list ul li{font-weight:300;line-height:18px}.sidebar #full-list ul li ul{display:none;padding:9px 0}.sidebar #full-list ul li ul li{border-left:1px solid #959595;padding:0 10px}.sidebar #full-list ul li ul li.active:before{font-family:icomoon;speak:none;font-style:normal;font-weight:400;font-variant:normal;text-transform:none;line-height:1;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;content:"\f105";margin-left:-10px;font-size:16px;margin-right:5px}.sidebar #full-list ul li.active{border-left:none}.sidebar #full-list ul li.active ul{display:block}.sidebar #full-list li{padding:0;line-height:27px}.sidebar #full-list li.collapsed ul{display:none}@media screen and (min-width:768px){.sidebar{position:relative;display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}}@media screen and (max-height:500px){.sidebar{overflow-y:auto}.sidebar #full-list{overflow:visible}}.content-inner{font-family:Merriweather,serif;font-size:1em;line-height:1.6875em}.content-inner h1,.content-inner h2,.content-inner h3,.content-inner h4,.content-inner h5,.content-inner h6{font-family:Lato,sans-serif;font-weight:800;line-height:1.5em;word-wrap:break-word}.content-inner 
h1{font-size:2em;margin:1em 0 .5em}.content-inner h1.section-heading{margin:1.5em 0 .5em}.content-inner h1 small{font-weight:300}.content-inner h1 a.view-source{font-size:1.2rem}.content-inner h2{font-size:1.625em;margin:1em 0 .5em;font-weight:400}.content-inner h3{font-size:1.375em;margin:1em 0 .5em;font-weight:600}.content-inner a{color:#000;text-decoration:none;text-shadow:.03em 0 #fff,-.03em 0 #fff,0 .03em #fff,0 -.03em #fff,.06em 0 #fff,-.06em 0 #fff,.09em 0 #fff,-.09em 0 #fff,.12em 0 #fff,-.12em 0 #fff,.15em 0 #fff,-.15em 0 #fff;background-image:linear-gradient(#fff,#fff),linear-gradient(#fff,#fff),linear-gradient(#000,#000);background-size:.05em 1px,.05em 1px,1px 1px;background-repeat:no-repeat,no-repeat,repeat-x;background-position:0 90%,100% 90%,0 90%}.content-inner a:selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner a:-moz-selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner a *,.content-inner a :after,.content-inner a :before,.content-inner a:after,.content-inner a:before{text-shadow:none}.content-inner a:visited{color:#000}.content-inner ul li{line-height:1.5em}.content-inner a.view-source{float:right;color:#959595;background:0 0;border:none;text-shadow:none;transition:color .3s ease-in-out}.content-inner a.view-source:hover{color:#373f52}.content-inner blockquote{font-style:italic;margin:.5em 0;padding:.25em 1.5em;border-left:3px solid #e1e1e1;display:inline-block}.content-inner blockquote :first-child{padding-top:0;margin-top:0}.content-inner blockquote :last-child{padding-bottom:0;margin-bottom:0}.content-inner table{margin:2em 0}.content-inner th{text-align:left;font-family:Lato,sans-serif;text-transform:uppercase;font-weight:600;padding-bottom:.5em}.content-inner tr{border-bottom:1px solid #d5dae6;vertical-align:bottom;height:2.5em}.content-inner .summary .summary-row .summary-signature a,.content-inner .summary h2 a{background:0 0;border:none;text-shadow:none}.content-inner td,.content-inner th{padding-left:1em;line-height:2em}.content-inner h1.section-heading:hover a.hover-link{opacity:1;text-decoration:none}.content-inner h1.section-heading a.hover-link{transition:opacity .3s ease-in-out;display:inline-block;opacity:0;padding:.3em .6em .6em;line-height:1em;margin-left:-2.7em;background:0 0;border:none;text-shadow:none;font-size:16px;vertical-align:middle}.content-inner .summary h2{font-weight:600}.content-inner .summary .summary-row .summary-signature{font-family:Inconsolata,Menlo,Courier,monospace;font-weight:600}.content-inner .summary .summary-row .summary-synopsis{font-family:Merriweather,serif;font-style:italic;padding:0 .5em;margin:0 0 .5em}.content-inner .detail-header,.content-inner code{font-family:Inconsolata,Menlo,Courier,monospace}.content-inner .summary .summary-row .summary-synopsis p{margin:0;padding:0}.content-inner .detail-header{margin:2.5em 0 .5em;padding:.5em 1em;background:#f7f7f7;border-left:3px solid #9768d1;font-size:1em;position:relative}.content-inner .detail-header .signature{font-size:1rem;font-weight:600}.content-inner .detail-header:hover a.detail-link{opacity:1;text-decoration:none}.content-inner .detail-header 
a.detail-link{transition:opacity .3s ease-in-out;position:absolute;top:0;left:0;display:block;opacity:0;padding:.6em;line-height:1.5em;margin-left:-2.5em;background:0 0;border:none;text-shadow:none}.content-inner .specs .specs-list pre code,.content-inner .types .types-list .type-detail pre code{padding:0 .5em;border:none}.content-inner .specs .specs-list{margin:0 0 2em}.content-inner .specs .specs-list pre{margin:.5em 0}.content-inner .types .types-list .type-detail{margin-bottom:2em}.content-inner .types .types-list .type-detail pre{margin:.5em 0}.content-inner .types .types-list .type-detail .typespec-doc{padding:0 1.5em}.content-inner a.no-underline,.content-inner code a{color:#9768d1;text-shadow:none;background-image:none}.content-inner a.no-underline:active,.content-inner a.no-underline:focus,.content-inner a.no-underline:hover,.content-inner a.no-underline:visited,.content-inner code a:active,.content-inner code a:focus,.content-inner code a:hover,.content-inner code a:visited{color:#9768d1}.content-inner code{font-size:15px;font-style:normal;line-height:24px;font-weight:400;background-color:#f7f9fc;border:1px solid #e1e1e1;vertical-align:middle;border-radius:2px;padding:0 .5em}.content-inner pre{margin:1.5em 0}.content-inner pre.spec{margin:0}.content-inner pre.spec code{padding:0}.content-inner pre code.hljs{white-space:inherit;padding:1em 1.5em;background-color:#f7f9fc}.content-inner .footer{margin:4em auto 1em;text-align:center;font-style:italic;font-size:14px;color:#959595}.content-inner .footer .line{display:inline-block}.content-inner .footer a{color:#959595;text-decoration:none;text-shadow:.03em 0 #fff,-.03em 0 #fff,0 .03em #fff,0 -.03em #fff,.06em 0 #fff,-.06em 0 #fff,.09em 0 #fff,-.09em 0 #fff,.12em 0 #fff,-.12em 0 #fff,.15em 0 #fff,-.15em 0 #fff;background-image:linear-gradient(#fff,#fff),linear-gradient(#fff,#fff),linear-gradient(#959595,#959595);background-size:.05em 1px,.05em 1px,1px 1px;background-repeat:no-repeat,no-repeat,repeat-x;background-position:0 90%,100% 90%,0 90%}.content-inner .footer a:selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner .footer a:-moz-selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.results .result-id a,a.close-search{text-shadow:none;background-image:none;transition:color .3s ease-in-out}.content-inner .footer a *,.content-inner .footer a :after,.content-inner .footer a :before,.content-inner .footer a:after,.content-inner .footer a:before{text-shadow:none}.content-inner .footer a:visited{color:#959595}a.close-search{margin-top:-3em;display:block;float:right}a.close-search:active,a.close-search:focus,a.close-search:visited{color:#000}a.close-search:hover{color:#9768d1}.results .result-id{font-size:1.2em}.results .result-id a:active,.results .result-id a:focus,.results .result-id a:visited{color:#000}.results .result-id a:hover{color:#9768d1}.results .result-elem em,.results .result-id em{font-style:normal;color:#9768d1}.results ul{margin:0;padding:0}@media print{#sidebar{display:none}} -------------------------------------------------------------------------------- /doc/dist/sidebar_items.js: 
-------------------------------------------------------------------------------- 1 | sidebarNodes={"exceptions":[],"extras":[{"id":"extra-api-reference","title":"API Reference","headers":[]},{"id":"extra-readme","title":"README","headers":[{"id":" Features","anchor":"Features"},{"id":" Build your own webcrawlers","anchor":"Build-your-own-webcrawlers"},{"id":" TODOS","anchor":"TODOS"}]}],"modules":[{"id":"Scrapex","title":"Scrapex"},{"id":"Scrapex.GenSpider","title":"Scrapex.GenSpider","functions":[{"id":"await/1","anchor":"await/1"},{"id":"export/3","anchor":"export/3"},{"id":"handle_call/3","anchor":"handle_call/3"},{"id":"handle_info/2","anchor":"handle_info/2"},{"id":"request/2","anchor":"request/2"},{"id":"start/3","anchor":"start/3"},{"id":"start_link/3","anchor":"start_link/3"}],"types":[{"id":"format/0","anchor":"t:format/0"},{"id":"option/0","anchor":"t:option/0"},{"id":"options/0","anchor":"t:options/0"},{"id":"response/0","anchor":"t:response/0"},{"id":"spider/0","anchor":"t:spider/0"},{"id":"state/0","anchor":"t:state/0"}]},{"id":"Scrapex.GenSpider.Response","title":"Scrapex.GenSpider.Response","functions":[{"id":"url_join/2","anchor":"url_join/2"}],"types":[{"id":"t/0","anchor":"t:t/0"}]},{"id":"Scrapex.Selector","title":"Scrapex.Selector","functions":[{"id":"extract/1","anchor":"extract/1"},{"id":"extract/2","anchor":"extract/2"},{"id":"select/2","anchor":"select/2"}],"types":[{"id":"attribute/0","anchor":"t:attribute/0"},{"id":"children/0","anchor":"t:children/0"},{"id":"html_node/0","anchor":"t:html_node/0"},{"id":"html_tree/0","anchor":"t:html_tree/0"},{"id":"name/0","anchor":"t:name/0"},{"id":"selector/0","anchor":"t:selector/0"},{"id":"t/0","anchor":"t:t/0"}]}],"protocols":[]} -------------------------------------------------------------------------------- /doc/extra-api-reference.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | API Reference – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 |

API Reference

64 | 65 | 72 | 73 | 74 |
75 |

Modules

76 |
77 |
78 | 79 | 80 |
81 |
82 | 83 | 84 |

A behaviour module for implementing a web data extractor

85 |
86 | 87 |
88 |
89 | 90 | 91 |

Utilities for working with the response returned from GenSpider

92 |
93 | 94 |
95 |
96 | 97 | 98 |

Utilities for extracting data from markup language

99 |
100 | 101 |
102 | 103 |
104 |
105 | 106 | 107 | 108 | 109 | 110 | 122 |
123 |
124 |
125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /doc/extra-readme.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | README – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

Scrapex

65 |

An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.

66 |

Features

Fast and powerful

67 |

Write the rules to extract the data and let Scrapex do the rest.

68 |

Easily extensible

69 |

Extensible by design, plug new functionality easily without having to touch the core.

70 |

Portable, Elixir

71 |

Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices.

72 |

Build your own webcrawlers

alias Scrapex.GenSpider
 73 | defmodule StackOverflowSpider do
 74 |   use GenSpider
 75 |   import Scrapex.Selector
 76 | 
 77 |   def parse(response, state) do
 78 |     result = response.body
 79 |     |> select(".question-summary h3 a")
 80 |     |> extract("href")
 81 |     |> Enum.map(fn(href) ->
 82 |       GenSpider.Response.url_join(response, href)
 83 |       |> GenSpider.request(&parse_question/1)
 84 |       |> GenSpider.await
 85 |     end)
 86 |     {:ok, result, state}
 87 |   end
 88 | 
 89 |   defp parse_question({:ok, response}) do
 90 |     html = response.body
 91 |     [title] = html |> select("h1 a") |> extract()
 92 |     question = html |> select(".question")
 93 |     [body] = question |> select(".post-text") |> extract
 94 |     [votes] = question |> select(".vote-count-post") |> extract
 95 |     tags = question |> select(".post-tag") |> extract
 96 | 
 97 |     %{title: title, body: body, votes: votes, tags: tags}
 98 |   end
 99 | end
100 | urls = ["http://stackoverflow.com/questions?sort=votes"]
101 | opts = [name: :webscrapper, urls: urls]
102 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts)
103 | questions = GenSpider.export(spider)
104 | #=> "[{} | _]"
105 |

TODOS

    106 |
  • [x] GenSpider behaviour. 107 |
  • 108 |
  • [x] Request URL and pass response to parse/2 callback. 109 |
  • 110 |
  • [x] One time spider 111 |
  • 112 |
  • [x] CSS selector 113 |
  • 114 |
  • [ ] XPath selector 115 |
  • 116 |
  • [x] Yield for requests in parse/2 117 |
  • 118 |
  • [ ] Parse response chunk by chunk 119 |
  • 120 |
  • [ ] CLI 121 |
  • 122 |
123 | 124 | 136 |
137 |
138 |
139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /doc/fonts/icomoon.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.eot -------------------------------------------------------------------------------- /doc/fonts/icomoon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Generated by IcoMoon 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /doc/fonts/icomoon.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.ttf -------------------------------------------------------------------------------- /doc/fonts/icomoon.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.woff -------------------------------------------------------------------------------- /doc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Scrapex v0.1.0 – Documentation 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lib/scrapex.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex do 2 | end 3 | -------------------------------------------------------------------------------- /lib/scrapex/gen_spider.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpider do 2 | alias Scrapex.GenSpider 3 | alias GenSpider.Request 4 | 5 | require Logger 6 | @moduledoc ~S""" 7 | A behaviour module for implementing a web data extractor. 8 | 9 | A GenSpider is a process as any other Elixir process and it can be 10 | used to crawl a list of URLs, run callback to parse the response, 11 | and repeat on an interval. 12 | 13 | ## Example 14 | 15 | The GenSpider behaviour abstracts the common data extraction process. 16 | Users are only required to implement the callbacks and functionality 17 | they are interested in. 
18 | 
19 |   Imagine we want a GenSpider that follows the links to the top voted
20 |   questions on StackOverflow and scrapes some data from each page:
21 | 
22 |       iex> alias Scrapex.GenSpider
23 |       iex> defmodule StackOverflowSpider do
24 |       ...>   use GenSpider
25 |       ...>   import Scrapex.Selector
26 |       ...>
27 |       ...>   def parse(response) do
28 |       ...>     result = response.body
29 |       ...>     |> select(".question-summary h3 a")
30 |       ...>     |> extract("href")
31 |       ...>     |> Enum.map(fn(href) ->
32 |       ...>       GenSpider.Response.url_join(response, href)
33 |       ...>       |> GenSpider.request(&parse_question/1)
34 |       ...>       |> GenSpider.await
35 |       ...>     end)
36 |       ...>     {:ok, result}
37 |       ...>   end
38 |       ...>
39 |       ...>   defp parse_question(response) do
40 |       ...>     html = response.body
41 |       ...>     [title] = html |> select("h1 a") |> extract()
42 |       ...>     question = html |> select(".question")
43 |       ...>     [body] = question |> select(".post-text") |> extract
44 |       ...>     [votes] = question |> select(".vote-count-post") |> extract
45 |       ...>     tags = question |> select(".post-tag") |> extract
46 |       ...>
47 |       ...>     %{title: title, body: body, votes: votes, tags: tags}
48 |       ...>   end
49 |       ...> end
50 |       iex> urls = ["http://stackoverflow.com/questions?sort=votes"]
51 |       iex> opts = [name: :webscrapper, urls: urls]
52 |       iex> {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts)
53 |       iex> [top_question|_] = GenSpider.export(spider)
54 |       iex> top_question.title
55 |       "Why is processing a sorted array faster than an unsorted array?"
56 | 
57 | 
58 |   We start our `StackOverflowSpider` by calling `start_link/3`,
59 |   passing the module with the spider implementation and its initial
60 |   argument (a list representing the selectors to follow and grab).
61 |   We also pass an options list to register the spider with a name, a
62 |   list of URLs to start from, and optionally an `:interval` for refetching.
63 | 
64 |   We can get the data from the spider by calling `GenSpider.export/2`
65 |   with the `pid` of the spider and the output format. `GenSpider`
66 |   supports outputting JSON, CSV and XML.
67 | 
68 |   ## Callbacks
69 | 
70 |   There are 6 callbacks required to be implemented in a `GenSpider`.
71 |   By adding `use GenSpider` to your module, all 6 callbacks will be
72 |   automatically defined, leaving it up to you to implement the ones
73 |   you want to customize. The callbacks are:
74 | 
75 |   * `init(args)` - invoked when the spider is started.
76 | 
77 |     It must return:
78 |     - `{:ok, state}`
79 |     - `{:ok, state, delay}`
80 |     - `:ignore`
81 |     - `{:stop, reason}`
82 | 
83 |   * `start_requests(urls, state)` - called by Scrapex when the spider
84 |     is opened for scraping and no particular URLs are specified. If
85 |     particular URLs are specified, the `make_requests_from_url/1`
86 |     is used instead to create the Requests. This method is also
87 |     called only once from Scrapex, so it’s safe to implement it as
88 |     a stream.
89 | 
90 |     The default implementation uses `make_requests_from_url/1` to
91 |     generate Requests for each url in `options.urls`.
92 | 
93 |   * `make_requests_from_url(url)` - returns a Request object (or a
94 |     list of Request objects) to scrape. It is used to construct the
95 |     initial requests in `start_requests/2`, and is typically used to
96 |     convert URLs to requests.
97 | 
98 |     Unless overridden, this method returns Requests with `parse/1` as
99 |     their callback function.
100 | 
101 |   * `parse(response)` - invoked after the spider has successfully
102 |     requested a URL, with the HTML in `response`.
103 | 
104 |     It must return:
105 |     - `{:ok, result}`
106 |     - `{:ignore, reason}`
107 |     - `{:stop, reason}`
108 | 
109 |   * `terminate(reason, state)` - called when the server is about to
110 |     terminate, useful for cleaning up. It must return `:ok`.
111 | 
112 |   * `code_change(old_vsn, state, extra)` - called when the application
113 |     code is being upgraded live (hot code swapping).
114 | 
115 |     It must return:
116 |     - `{:ok, new_state}`
117 |     - `{:error, reason}`
118 | 
119 |   ## Client / Server APIs
120 | 
121 |   Although in the example above we have used `GenSpider.start_link/3`
122 |   and friends to directly start and communicate with the spider, most
123 |   of the time we don't call the `GenSpider` functions directly.
124 |   Instead, we wrap the calls in new functions representing the public
125 |   API of the spider.
126 | 
127 |   Here is a better implementation of our StackOverflowSpider module:
128 | 
129 |       defmodule StackOverflowSpider do
130 |         use GenSpider
131 |         import Scrapex.Selector
132 | 
133 |         # Client
134 |         def start_link(sitemap) do
135 |           urls = ["http://stackoverflow.com/questions?sort=votes"]
136 |           opts = [name: :stackoverflow, urls: urls, interval: 3600]
137 |           GenSpider.start_link(__MODULE__, [], opts)
138 |         end
139 | 
140 |         def json(pid) do
141 |           GenSpider.export(pid, :json)
142 |         end
143 | 
144 |         # Server (callbacks)
145 |         def parse(response) do
146 |           result = response.body
147 |           |> select(".question-summary h3 a")
148 |           |> extract("href")
149 |           |> Enum.map(fn(href) ->
150 |             GenSpider.Response.url_join(response, href)
151 |             |> GenSpider.request(&parse_question/1)
152 |             |> GenSpider.await
153 |           end)
154 |           {:ok, result}
155 |         end
156 | 
157 |         defp parse_question(response) do
158 |           html = response.body
159 |           [title] = html |> select("h1 a") |> extract()
160 |           question = html |> select(".question")
161 |           [body] = question |> select(".post-text") |> extract
162 |           [votes] = question |> select(".vote-count-post") |> extract
163 |           tags = question |> select(".post-tag") |> extract
164 | 
165 |           %{title: title, body: body, votes: votes, tags: tags}
166 |         end
167 |       end
168 | 
169 |   In practice, it is common to have both server and client functions in
170 |   the same module. If the server and/or client implementations are
171 |   growing complex, you may want to have them in different modules.
172 |   """
173 | 
174 |   @typedoc "Options used by the `start*` functions"
175 |   @type options :: [option]
176 | 
177 |   @type url :: binary
178 | 
179 |   @type option :: {:name, GenServer.name} |
180 |                   {:urls, [url]} |
181 |                   {:timeout, timeout} |
182 |                   {:interval, non_neg_integer}
183 | 
184 |   @typedoc "The spider reference"
185 |   @type spider :: pid | GenServer.name | {atom, node}
186 | 
187 |   @typedoc "The internal state of the spider"
188 |   @type state :: any
189 | 
190 |   @typedoc "The list of requests or stream for the spider to enumerate"
191 |   @type requests :: Request.t | [Request.t] | Stream.t
192 | 
193 |   @typedoc "The response from a request to a URL"
194 |   @type response :: binary
195 | 
196 |   @typedoc "Exportable formats"
197 |   @type format :: :html | :json | :csv | :xml
198 | 
199 |   @type t :: %__MODULE__{module: atom,
200 |                          state: any,
201 |                          options: Keyword.t,
202 |                          data: [{url, any}],
203 |                          requests: requests,
204 |                          timer: reference}
205 |   defstruct module: nil, state: nil,
206 |             options: [], data: [], requests: [], timer: nil
207 | 
208 |   # `GenSpider` is based on `GenServer`.
209 |   use GenServer
210 | 
211 |   # Define the callbacks for `GenSpider`
212 |   @callback init(any) ::
213 |     {:ok, state} | {:ok, state, timeout | :hibernate} |
214 |     :ignore | {:stop, reason :: term}
215 | 
216 |   @callback start_requests([url], state) ::
217 |     {:ok, requests, state}
218 | 
219 |   @callback make_requests_from_url(url) :: requests
220 | 
221 |   @callback parse(response) ::
222 |     {:ok, data :: list} | {:ignore, reason :: term} |
223 |     {:stop, reason :: term}
224 | 
225 |   @doc """
226 |   This callback is the same as the `GenServer` equivalent and is used to change
227 |   the state when loading a different version of the callback module.
228 |   """
229 |   @callback code_change(any, any, state) :: {:ok, state}
230 | 
231 |   @doc """
232 |   This callback is the same as the `GenServer` equivalent and is called when the
233 |   process terminates. The first argument is the reason the process is about
234 |   to exit with.
235 |   """
236 |   @callback terminate(any, state) :: any
237 | 
238 |   @doc false
239 |   defmacro __using__(_) do
240 |     quote location: :keep do
241 |       @behaviour GenSpider
242 |       require Logger
243 | 
244 |       @start_urls []
245 | 
246 |       @doc false
247 |       def init(args) do
248 |         {:ok, args}
249 |       end
250 | 
251 |       @doc """
252 |       Default method to generate the first Requests to crawl.
253 | 
254 |       Uses `make_requests_from_url/1` to generate Requests for each
255 |       url in `start_urls`.
256 |       """
257 |       def start_requests(start_urls, state) do
258 |         requests = start_urls
259 |         |> Enum.map(&make_requests_from_url/1)
260 |         {:ok, requests, state}
261 |       end
262 | 
263 |       @doc """
264 |       Default method to generate a Request (or a list of Requests).
265 |       """
266 |       def make_requests_from_url(url) do
267 |         GenSpider.request(url, &parse/1)
268 |       end
269 | 
270 |       @doc false
271 |       def parse(response) do
272 |         {:ok, [response.body]}
273 |       end
274 | 
275 |       @doc false
276 |       def terminate(_reason, _state) do
277 |         :ok
278 |       end
279 | 
280 |       @doc false
281 |       def code_change(_old, state, _extra) do
282 |         {:ok, state}
283 |       end
284 | 
285 |       defoverridable [init: 1,
286 |                       start_requests: 2,
287 |                       make_requests_from_url: 1,
288 |                       parse: 1,
289 |                       terminate: 2, code_change: 3]
290 |     end
291 |   end
292 | 
293 |   @doc """
294 |   Starts a `GenSpider` process linked to the current process.
295 | 
296 |   This is often used to start the `GenSpider` as part of a supervision
297 |   tree.
298 | 
299 |   Once the spider is started, it calls the `init/1` function in the
300 |   given `module` passing the given `args` to initialize it. To ensure
301 |   a synchronized start-up procedure, this function does not return
302 |   until `init/1` has returned.
303 | 
304 |   Note that a `GenSpider` started with `start_link/3` is linked to the
305 |   parent process and will exit in case of crashes. The GenSpider will
306 |   also exit due to the `:normal` reasons in case it is configured to
307 |   trap exits in the `init/1` callback.
308 | 
309 |   ## Options
310 | 
311 |   The `:name` option is used for name registration as described in the
312 |   module documentation. If the `:timeout` option is present, the
313 |   spider is allowed to spend the given milliseconds initializing or
314 |   it will be terminated and the start function will return
315 |   `{:error, :timeout}`.
316 | 
317 |   The `:urls` option defines a list of URLs for the spider to start from.
318 | 
319 |   If the `:interval` option is present, the spider will repeat itself
320 |   after every number of seconds defined by the option. Note that it
321 |   will only repeat if it's not currently running a crawl.
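
  For example, assuming a hypothetical `MySpider` module that calls
  `use GenSpider`, a named spider that starts from one URL and
  re-crawls on an interval could be started with:

      GenSpider.start_link(MySpider, [],
                           name: :my_spider,
                           urls: ["http://www.example.com"],
                           interval: 3600)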
322 | 
323 |   ## Return values
324 | 
325 |   If the spider is successfully created and initialized, the function
326 |   returns `{:ok, pid}`, where pid is the pid of the spider. If there
327 |   already exists a process with the specified spider name, the
328 |   function returns `{:error, {:already_started, pid}}` with the pid of
329 |   that process.
330 | 
331 |   If the `init/1` callback fails with `reason`, the function returns
332 |   `{:error, reason}`. Otherwise, if it returns `{:stop, reason}` or
333 |   `:ignore`, the process is terminated and the function returns
334 |   `{:error, reason}` or `:ignore`, respectively.
335 |   """
336 |   @spec start_link(module, any, options) :: GenServer.on_start
337 |   def start_link(module, args, options \\ [])
338 |   when is_atom(module) and is_list(options)
339 |   do
340 |     do_start(:start_link, module, args, options)
341 |   end
342 | 
343 |   @doc """
344 |   Starts a `GenSpider` without links (outside of a supervision tree).
345 |   See `start_link/3` for more information.
346 |   """
347 |   @spec start(module, any, options) :: GenServer.on_start
348 |   def start(module, args, options \\ [])
349 |   when is_atom(module) and is_list(options)
350 |   do
351 |     do_start(:start, module, args, options)
352 |   end
353 | 
354 |   @doc false
355 |   defp do_start(link, module, args, options) do
356 |     {name, opts} = Keyword.pop(options, :name)
357 |     init_args = {module, args, opts}
358 |     case name do
359 |       nil ->
360 |         apply(GenServer, link, [__MODULE__, init_args])
361 |       atom when is_atom(atom) ->
362 |         apply(GenServer, link, [__MODULE__, init_args, [name: atom]])
363 |       {:global, _} ->
364 |         apply(GenServer, link, [__MODULE__, init_args, [name: name]])
365 |       {:via, _, _} ->
366 |         apply(GenServer, link, [__MODULE__, init_args, [name: name]])
367 |     end
368 |   end
369 | 
370 |   @doc """
371 |   Exports the stored data in a specific format.
372 | 
373 |   This call will block until all data is received.
374 | 
375 |   This is called in the following situations:
376 | 
377 |   - Right after the spider is started.
378 |   - In the middle of a crawl.
379 |   - In between the crawl interval.
380 | 
381 |   For the first two situations, the spider will manually await the
382 |   requests instead of handling the response message in `handle_info/2`.
383 | 
384 |   If one of the `parse/1` callbacks wants to stop the spider, this
385 |   function will still return partial data if any, and then stops the
386 |   spider.
387 | 
388 |   If the third argument is true, the spider will clear any timer in
389 |   place and immediately crawl for new data.
390 |   """
391 |   @spec export(spider, format, boolean) :: any
392 |   def export(spider, format \\ nil, override \\ false) do
393 |     # Await for all the data to be collected first.
394 |     __MODULE__.await(spider)
395 |     GenServer.call(spider, {:export, format, override})
396 |   end
397 | 
398 |   def request(url, callback, from \\ self) do
399 |     Request.async(url, callback, from)
400 |   end
401 | 
402 |   def await(request = %Request{}), do: Request.await(request)
403 |   def await(spider, timeout \\ :infinity) do
404 |     GenServer.call(spider, :await, timeout)
405 |   end
406 | 
407 |   # GenServer callbacks
408 | 
409 |   def init({module, args, opts}) do
410 |     # Set a default timeout, used to stop the spider.
411 |     opts = Keyword.put_new(opts, :timeout, 100)
412 |     spider = %GenSpider{ module: module, options: opts,
413 |                          timer: :erlang.make_ref()}
414 |     urls = opts[:urls] || []
415 | 
416 |     # Set up an empty data set with each URL as a key.
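    # For example, urls = ["http://a.com", "http://b.com"] yields
    # [{"http://a.com", nil}, {"http://b.com", nil}]; each nil is later
    # replaced by that URL's parsed data (via List.keystore/4 in
    # handle_info).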
417 |     data = Enum.map(urls, &({&1, nil}))
418 | 
419 |     case apply(module, :init, [args]) do
420 |       {:ok, state} ->
421 |         # Return 0 timeout to trigger crawl immediately.
422 |         # This works regardless of interval option, since we always
423 |         # have a crawl. A crawl will use interval option to see if it
424 |         # needs to do the next one.
425 |         # send_after(self, 0, :crawl)
426 |         Logger.debug "Starts a spider immediately"
427 |         {:ok, %{spider | state: state, data: data}, 0}
428 |       {:ok, state, delay} ->
429 |         # Delay the crawl by the value specified in return.
430 |         # send_after(self, delay, :crawl)
431 |         Logger.debug "Starts a spider after #{delay} milliseconds"
432 |         {:ok, %{spider | state: state, data: data}, delay}
433 |       :ignore ->
434 |         :ignore
435 |       {:stop, reason} ->
436 |         {:stop, reason}
437 |       other ->
438 |         other
439 |     end
440 |   end
441 | 
442 |   @doc """
443 |   Waits for any remaining request(s) to finish.
444 | 
445 |   For any remaining requests in the state, wait for them to finish.
446 |   This function receives the response instead of `handle_info/2`,
447 |   then calls `handle_info/2` itself so that the request is removed
448 |   from the state and the response is parsed by the callback module.
449 | 
450 |   This function can be called in the middle of a crawl of multiple URLs,
451 |   but since it only awaits the remaining requests, the spider's state
452 |   is still being passed along correctly.
453 |   """
454 |   def handle_call(:await, _from, spider) do
455 |     spider =
456 |       spider.requests
457 |       |> Enum.reduce_while(spider, fn(request, spider) ->
458 |         ref = request.ref
459 |         response = Request.await(request, :infinity)
460 |         case handle_info({ref, response}, spider) do
461 |           {:noreply, spider} ->
462 |             {:cont, spider}
463 |           {:stop, _reason, spider} ->
464 |             {:halt, spider}
465 |         end
466 |       end)
467 |     Logger.debug("Awaited for data")
468 |     {:reply, :ok, spider}
469 |   end
470 | 
471 |   @doc """
472 |   Called to export the data in a specific format.
473 |   """
474 |   def handle_call({:export, nil, true}, from, spider) do
475 |     :erlang.cancel_timer(spider.timer)
476 |     {:noreply, spider} = handle_info(:crawl, spider)
477 |     {:reply, :ok, spider} = handle_call(:await, from, spider)
478 |     handle_call({:export, nil, false}, from, spider)
479 |   end
480 | 
481 |   # Main handler for exporting.
482 |   def handle_call({:export, nil, false}, _from, spider) do
483 |     Logger.debug("Exporting data")
484 | 
485 |     interval = spider.options[:interval]
486 | 
487 |     data =
488 |       spider.data
489 |       |> Enum.filter_map(fn({_,data}) -> data !== nil end,
490 |                          fn({_, data}) -> data end)
491 | 
492 |     is_complete? = length(data) === length(spider.data)
493 |     data = Enum.concat(data)
494 |     case interval !== nil and is_complete? do
495 |       true ->
496 |         {:reply, data, spider}
497 |       false ->
498 |         {:stop, :normal, data, spider}
499 |     end
500 |   end
501 | 
502 |   def handle_call({:export, :json, override?}, from, spider) do
503 |     case handle_call({:export, nil, override?}, from, spider) do
504 |       {:reply, data, spider} ->
505 |         {:reply, Poison.encode!(data), spider}
506 |       {:stop, _, data, spider} ->
507 |         {:stop, :normal, Poison.encode!(data), spider}
508 |     end
509 |   end
510 | 
511 |   def handle_call({:export, encoder, override?}, from, spider)
512 |   when is_function(encoder, 1)
513 |   do
514 |     case handle_call({:export, nil, override?}, from, spider) do
515 |       {:reply, data, spider} ->
516 |         {:reply, encoder.(data), spider}
517 |       {:stop, _, data, spider} ->
518 |         {:stop, :normal, encoder.(data), spider}
519 |     end
520 |   end
521 | 
522 |   def handle_call({:export, _format, true}, _from, spider) do
523 |     {:reply, spider.data, spider}
524 |   end
525 | 
526 |   @doc """
527 |   Called when a timeout occurs, usually to start a crawl.
528 | 
529 |   The `GenSpider` uses the timeout value to trigger a crawl, in which
530 |   it spawns a task for each URL specified in the `opts`.
531 | 
532 |   The results will be handled in a different function.
533 |   """
534 |   def handle_info(:timeout, spider) do
535 |     handle_info(:crawl, spider)
536 |   end
537 | 
538 |   @doc """
539 |   Called from a timer to crawl a list of URLs.
540 | 
541 |   This generates a list of async requests to the URLs. The response
542 |   will be sent back in another message.
543 |   """
544 |   def handle_info(:crawl, spider) do
545 |     options = spider.options
546 |     urls = options[:urls] || []
547 |     Logger.debug("Starts a crawl for #{urls}")
548 | 
549 |     args = [urls, spider.state]
550 |     spider = case call(:start_requests, spider, args) do
551 |       {:ok, requests, state} ->
552 |         # `requests` can also be a `Stream`.
553 |         %{spider | requests: Enum.map(requests, &(&1)),
554 |                    state: state}
555 |     end
556 | 
557 |     {:noreply, spider}
558 |   end
559 | 
560 |   @doc """
561 |   Called when a scrape request is completed.
562 | 
563 |   When a request is completed, i.e. the response has been received
564 |   and parsed, this process receives a message with the result.
565 | 
566 |   If this is for the last request, it sets a new timer if needed.
567 |   """
568 |   def handle_info({ref, {:ok, %Request{}=req}}, spider) do
569 |     data = Request.await(req)
570 |     handle_info({ref, {:ok, data}}, spider)
571 |   end
572 | 
573 |   def handle_info({ref, {:ok, requests = [%Request{} | _]}}, spider) do
574 |     data = Stream.map(requests, &Request.await(&1))
575 |     |> Enum.concat
576 |     handle_info({ref, {:ok, data}}, spider)
577 |   end
578 | 
579 |   def handle_info({ref, {:ok, data}}, spider) do
580 |     # Remove this request from the list.
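    # Single pass over spider.requests: pull out the request whose
    # monitor ref matches `ref`, keeping the remaining requests in
    # their original order.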
581 |     {request, requests} = spider.requests
582 |     |> Enum.reduce({nil, []}, fn(request, {req, requests}) ->
583 |       case request.ref === ref do
584 |         true -> {request, requests}
585 |         false -> {req, requests ++ [request]}
586 |       end
587 |     end)
588 |     spider = %{spider | requests: requests}
589 | 
590 |     url = request.url
591 |     Logger.debug "Got data from #{url}"
592 | 
593 |     new_data = List.keystore(spider.data, url, 0, {url, data})
594 |     interval = spider.options[:interval]
595 |     timer = spider.timer
596 |     spider = %{spider | data: new_data}
597 | 
598 |     case {Enum.empty?(requests), interval} do
599 |       {true, nil} ->
600 |         timeout = spider.options[:timeout]
601 |         # No more requests and no interval, so we stop.
602 |         send_after(self, timeout, {timer, {:stop, :normal}})
603 |         {:noreply, spider}
604 |       {true, _} ->
605 |         # No more requests, but an interval is set, so schedule a new crawl.
606 |         :erlang.cancel_timer(timer)
607 |         timer = send_after(self, interval, :crawl)
608 |         {:noreply, %{spider | timer: timer}}
609 |       {_, _} ->
610 |         {:noreply, spider}
611 |     end
612 |   end
613 | 
614 |   # Return value from the `parse` callback.
615 |   def handle_info({_ref, {:stop, reason}}, spider) do
616 |     Logger.info "Spider is stopped with reason #{reason}"
617 |     {:stop, :normal, spider}
618 |   end
619 | 
620 |   # The URL is a 404, so we remove it from the list to prevent requesting
621 |   # it the next time, and call `handle_info` with empty data so it can
622 |   # continue the loop.
623 |   def handle_info({ref, {:error, {:not_found, url}}}, spider) do
624 |     Logger.error("Failed to request #{url} with a 404 error")
625 | 
626 |     options = spider.options
627 |     urls = Enum.filter(options[:urls], &(&1 !== url))
628 |     options = Keyword.put(options, :urls, urls)
629 |     handle_info({ref, {:ok, []}}, %{spider| options: options})
630 |   end
631 | 
632 |   def handle_info({ref, {:error, reason}}, spider) do
633 |     # Retry with backoff?
634 |     request = spider.requests
635 |     |> Enum.find(%Request{}, &(&1.ref === ref))
636 | 
637 |     Logger.error("Failed to request #{request.url} with reason #{reason}")
638 |     # Return empty data, and let the spider try again next time.
639 |     handle_info({ref, {:ok, []}}, spider)
640 |   end
641 | 
642 |   def handle_info(_info, state) do
643 |     {:noreply, state}
644 |   end
645 | 
646 |   defp send_after(_dest, nil, _message) do
647 |     :erlang.make_ref()
648 |   end
649 |   defp send_after(dest, time, message) do
650 |     :erlang.send_after(time, dest, message)
651 |   end
652 | 
653 |   defp call(method, %GenSpider{}=spider, nil) when is_atom(method) do
654 |     call(method, spider, [spider.state])
655 |   end
656 |   defp call(method, %GenSpider{}=spider, args) when is_atom(method) do
657 |     Kernel.apply(spider.module, method, args)
658 |   end
659 | end
-------------------------------------------------------------------------------- /lib/scrapex/gen_spider/README.md: --------------------------------------------------------------------------------
1 | GenSpider
2 | =========
3 | 
4 | The flow of GenSpider
5 | 
6 | - GenSpider.start_link
7 |   - GenSpider.init
8 |     - Mod.init
9 | 
10 | - GenSpider.handle_info(:timeout): message from start_link
11 | - GenSpider.handle_info(:crawl): start the full crawl
12 |   - Mod.start_requests
13 |     - Mod.make_requests_from_url
14 |       - GenSpider.request(url, &parse/1): called inside GenSpider
15 |         - Request.async
16 |           - Create a task with Request.do_request
17 |             - do_request calls `parse(response)`
18 | # Up to here, everything is asynchronous; results will be handled in handle_info({ref, ...}).
19 | # Receive data after the request has completed and been parsed.
20 | - GenSpider.handle_info({ref, {:ok, data}})
21 |   - remove the request with `ref` from spider.requests
22 |   - store data into spider.data
23 |   - if this was the last request and the interval option is set,
24 |     - send_after(self, interval, :crawl)
-------------------------------------------------------------------------------- /lib/scrapex/gen_spider/request.ex: --------------------------------------------------------------------------------
1 | defmodule Scrapex.GenSpider.Request do
2 |   @moduledoc """
3 |   Conveniences for spawning and awaiting HTTP requests.
4 | 
5 |   Requests are processes meant to perform one particular HTTP request
6 |   to a specific URL throughout their life-cycle, often with little or
7 |   no communication with other processes.
8 | 
9 |   Requests spawned with `async/2` can be awaited on by their caller
10 |   process (and only the caller). They are implemented by spawning a
11 |   `Task` and awaiting on it.
12 |   """
13 |   alias Scrapex.GenSpider.Request
14 |   alias Scrapex.GenSpider.Response
15 |   require Logger
16 | 
17 |   @doc """
18 |   The Request struct.
19 | 
20 |   It contains the following fields:
21 | 
22 |   * `:pid` - the process reference of the request process.
23 | 
24 |   * `:ref` - the request monitor reference.
25 | 
26 |   * `:owner` - the PID of the process that started the request.
27 | 
28 |   * `:url` - the URL to make the request to.
29 |   """
30 |   defstruct pid: nil, ref: nil, owner: nil, url: ""
31 |   @type t :: %__MODULE__{pid: pid, ref: reference, owner: pid, url: binary}
32 | 
33 |   @type url :: binary
34 | 
35 |   @doc """
36 |   Starts an asynchronous request that can be awaited on.
37 | 
38 |   This function spawns a Task that is linked to and monitored by the
39 |   caller process. A `Request` struct is returned as an extended version
40 |   of the `Task` struct.
41 | 
42 |   ## Request's message format
43 | 
44 |   The reply sent by the request will be that of the underlying `Task`,
45 |   i.e., in the format `{ref, msg}`, where `ref` is the monitoring ref
46 |   held by the request, and `msg` is the return value of the callback.
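
  ## Examples

  A minimal sketch of the intended flow, assuming the request succeeds
  (the URL is illustrative):

      request = Request.async("http://example.com", fn(response) ->
        response.body
      end)
      body = Request.await(request)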
47 | """ 48 | @spec async(url, fun, pid) :: t 49 | def async(url, callback, from \\ self) when is_pid(from) do 50 | mfa = {:erlang, :apply, [&request/2, [url, callback]]} 51 | pid = Task.Supervised.spawn_link(from, get_info(from), mfa) 52 | ref = Process.monitor(pid) 53 | send(pid, {from, ref}) 54 | %Request{url: url, pid: pid, ref: ref, owner: from} 55 | end 56 | 57 | @doc """ 58 | Awaits a request response. 59 | 60 | A timeout in milliseconds can be given with default value of `5000`. 61 | In case the request process dies, this function will exit with the 62 | same reason as the request. 63 | """ 64 | @spec await(t, timeout) :: term 65 | def await(%Request{pid: pid, ref: ref, owner: owner}, timeout \\ 5000) do 66 | Task.await(%Task{pid: pid, ref: ref, owner: owner}, timeout) 67 | end 68 | 69 | defp get_info(pid) do 70 | {node(), 71 | case Process.info(pid, :registered_name) do 72 | {:registered_name, []} -> pid 73 | {:registered_name, name} -> name 74 | end} 75 | end 76 | 77 | defp request(url, callback) do 78 | case do_request(url) do 79 | # HTTP Request succeeded, return whatever the callback returns. 80 | {:ok, response} -> 81 | response |> callback.() 82 | # Forward the HTTP error to the `handle_info` 83 | {:error, reason} -> 84 | {:error, reason} 85 | end 86 | end 87 | 88 | defp do_request(url) do 89 | Logger.debug("Do request for #{url}") 90 | hackney = [follow_redirect: true, timeout: 30000, recv_timeout: 15000] 91 | case HTTPoison.get(url, [], [ hackney: hackney ]) do 92 | {:ok, %HTTPoison.Response{status_code: 200, body: body}} -> 93 | {:ok, %Response{url: url, body: body}} 94 | {:ok, %HTTPoison.Response{status_code: 404}} -> 95 | {:error, {:not_found, url}} 96 | {:error, %HTTPoison.Error{reason: reason}} -> 97 | {:error, reason} 98 | end 99 | end 100 | end -------------------------------------------------------------------------------- /lib/scrapex/gen_spider/response.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpider.Response do 2 | @moduledoc """ 3 | Utilities for working response returned from `GenSpider`. 4 | """ 5 | 6 | alias Scrapex.GenSpider.Response 7 | 8 | defstruct url: "", body: "" 9 | @type t :: %__MODULE__{url: binary, body: binary} 10 | 11 | @doc """ 12 | Join a path relative to the response's URL. 13 | 14 | ## Examples 15 | 16 | iex> alias Scrapex.GenSpider.Response 17 | iex> response = %Response{url: "http://www.scrapex.com/subfolder"} 18 | iex> Response.url_join(response, "/subfolder2") 19 | "http://www.scrapex.com/subfolder2" 20 | iex> Response.url_join(response, "subsubfolder") 21 | "http://www.scrapex.com/subfolder/subsubfolder" 22 | """ 23 | @spec url_join(t, binary) :: binary 24 | def url_join(url, path) when is_binary(url) do 25 | url_join(%Response{url: url}, path) 26 | end 27 | 28 | def url_join(%Response{url: url}, "/" <> path) do 29 | uri = URI.parse(url) 30 | "#{uri.scheme}://#{uri.authority}/#{path}" 31 | end 32 | 33 | def url_join(%Response{url: _url}, "http" <> path) do 34 | "http#{path}" 35 | end 36 | 37 | def url_join(%Response{url: url}, path) do 38 | "#{url}/#{path}" 39 | end 40 | end -------------------------------------------------------------------------------- /lib/scrapex/selector.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Selector do 2 | @moduledoc """ 3 | Utilities for extracting data from markup language. 
4 | """ 5 | 6 | use GenServer 7 | alias Scrapex.Selector 8 | 9 | defstruct tree: [] 10 | @type t :: %__MODULE__{tree: html_tree} 11 | 12 | @typedoc "A tree of HTML nodes, or a node itself if only one" 13 | @type html_tree :: html_node | [html_node] 14 | @typedoc "Name of the tag or attribute" 15 | @type name :: binary 16 | @typedoc "Attribute of a node" 17 | @type attribute :: {name, binary} 18 | 19 | @type html_node :: {name, [attribute], children} 20 | @type children :: [html_node] 21 | 22 | @type selector :: binary 23 | 24 | @doc """ 25 | Generates a selection for a particular selector. 26 | 27 | The return value is a Selector.t 28 | """ 29 | @spec select(binary | t, selector) :: t 30 | def select(html, selector) when is_binary(html) do 31 | %Selector{tree: Floki.parse(html)} 32 | |> select(selector) 33 | end 34 | def select(%Selector{tree: tree}, selector) do 35 | %Selector{tree: Floki.find(tree, selector)} 36 | end 37 | 38 | @doc """ 39 | Extracts content or attribute value for a selection. 40 | """ 41 | @spec extract(t, name) :: [binary] 42 | def extract(selector), do: extract(selector, "text") 43 | def extract(selector, ""), do: extract(selector, "text") 44 | def extract(%Selector{tree: tree}, "text") do 45 | Enum.map(tree, fn({_, _, children}) -> 46 | extract_text(children, "") 47 | |> String.split 48 | |> Enum.join(" ") 49 | end) 50 | end 51 | def extract(%Selector{tree: tree}, attr) do 52 | Floki.attribute(tree, attr) 53 | end 54 | 55 | defp extract_text(children), do: extract_text(children, "") 56 | defp extract_text([], result), do: result 57 | defp extract_text([text|rest], result) 58 | when is_binary(text) 59 | do 60 | extract_text(rest, result <> text) 61 | end 62 | defp extract_text([{_, _, children}|rest], result) do 63 | extract_text(rest, result <> extract_text(children)) 64 | end 65 | defp extract_text(_, result), do: result 66 | 67 | defimpl Enumerable, for: __MODULE__ do 68 | alias Scrapex.Selector 69 | 70 | def count(%Selector{tree: tree}), do: length(tree) 71 | def member?(api = %Selector{}, selector) do 72 | Selector.select(api, selector) !== [] 73 | end 74 | 75 | def reduce(_api, {:halt, acc}, _fun), do: {:halted, acc} 76 | def reduce(api, {:suspend, acc}, fun) do 77 | {:suspended, acc, &reduce(api, &1, fun)} 78 | end 79 | def reduce(%Selector{tree: []}, {:cont, acc}, _fun) do 80 | {:done, acc} 81 | end 82 | def reduce(%Selector{tree: [h | t]}, {:cont, acc}, fun) do 83 | new_acc = fun.(%Selector{tree: [h]}, acc) 84 | reduce(%Selector{tree: t}, new_acc, fun) 85 | end 86 | end 87 | end -------------------------------------------------------------------------------- /lib/scrapex/spider/webscraper.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Spider.WebScraper do 2 | @moduledoc ~S""" 3 | A spider using "sitemap" configuration from WebScraper.IO 4 | 5 | WebScraper.IO provides a Chrome extension to visually define scraping 6 | rules. This module provides a spider to use those rules to collect 7 | data. 8 | 9 | ## Examples 10 | 11 | Here is an example of scraping the E-commerce training site at 12 | http://webscraper.io/test-sites/e-commerce/static, following the 13 | instructions in WebScraper's tutorials section. 
14 | 
15 |       iex> sitemap = %{
16 |       ...>   "_id" => "webscrapper",
17 |       ...>   "startUrl" => "http://webscraper.io/test-sites/e-commerce/static",
18 |       ...>   "selectors" => [%{
19 |       ...>     "parentSelectors" => ["_root"],
20 |       ...>     "type" => "SelectorLink",
21 |       ...>     "multiple" => true,
22 |       ...>     "id" => "Category",
23 |       ...>     "selector" => "a.category-link",
24 |       ...>     "delay" => ""
25 |       ...>   }, %{
26 |       ...>     "parentSelectors" => ["Item"],
27 |       ...>     "type" => "SelectorText",
28 |       ...>     "multiple" => false,
29 |       ...>     "id" => "Name",
30 |       ...>     "selector" => "a.title",
31 |       ...>     "regex" => "",
32 |       ...>     "delay" => ""
33 |       ...>   }, %{
34 |       ...>     "parentSelectors" => ["Item"],
35 |       ...>     "type" => "SelectorText",
36 |       ...>     "multiple" => false,
37 |       ...>     "id" => "Price",
38 |       ...>     "selector" => "h4.pull-right",
39 |       ...>     "regex" => "",
40 |       ...>     "delay" => ""
41 |       ...>   }, %{
42 |       ...>     "parentSelectors" => ["Item"],
43 |       ...>     "type" => "SelectorText",
44 |       ...>     "multiple" => false,
45 |       ...>     "id" => "Description",
46 |       ...>     "selector" => "p.description",
47 |       ...>     "regex" => "",
48 |       ...>     "delay" => ""
49 |       ...>   }, %{
50 |       ...>     "parentSelectors" => ["Category"],
51 |       ...>     "type" => "SelectorLink",
52 |       ...>     "multiple" => true,
53 |       ...>     "id" => "SubCategory",
54 |       ...>     "selector" => "a.subcategory-link",
55 |       ...>     "delay" => ""
56 |       ...>   }, %{
57 |       ...>     "parentSelectors" => ["SubCategory"],
58 |       ...>     "type" => "SelectorElement",
59 |       ...>     "multiple" => true,
60 |       ...>     "id" => "Item",
61 |       ...>     "selector" => "div.thumbnail",
62 |       ...>     "delay" => ""
63 |       ...>   }]
64 |       ...> }
65 |       iex> {:ok, spider} = WebScraper.start_link(sitemap)
66 |       iex> data = WebScraper.export(spider)
67 |       [%{
68 |         "Category" => "Computers",
69 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers",
70 |         "Name" => "Iconia B1-730HD",
71 |         "Price" => "$99.99",
72 |         "Description" => "Black, 7\", 1.6GHz Dual-Core, 8GB, Android 4.4",
73 |         "SubCategory" => "Tablets",
74 |         "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets"
75 |       },%{
76 |         "Category" => "Computers",
77 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers",
78 |         "Name" => "Pavilion",
79 |         "Price" => "$609.99",
80 |         "Description" => "15.6\", Core i5-4200U, 6GB, 750GB, Windows 8.1",
81 |         "SubCategory" => "Laptops",
82 |         "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops"
83 |       },%{
84 |         "Category" => "Phones",
85 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones",
86 |         "Name" => "Samsung Galaxy",
87 |         "Price" => "$93.99",
88 |         "Description" => "5 mpx. 
Android 5.0", 89 | "SubCategory" => "Touch", 90 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 91 | },%{ 92 | "Category" => "Phones", 93 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 94 | "Name" => "Sony Xperia", 95 | "Price" => "$118.99", 96 | "Description" => "GPS, waterproof", 97 | "SubCategory" => "Touch", 98 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 99 | },%{ 100 | "Category" => "Computers", 101 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 102 | "Name" => "Memo Pad HD 7", 103 | "Price" => "$101.99", 104 | "Description" => "IPS, Dual-Core 1.2GHz, 8GB, Android 4.3", 105 | "SubCategory" => "Tablets", 106 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 107 | },%{ 108 | "Category" => "Computers", 109 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 110 | "Name" => "Lenovo IdeaTab", 111 | "Price" => "$69.99", 112 | "Description" => "7\" screen, Android", 113 | "SubCategory" => "Tablets", 114 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 115 | },%{ 116 | "Category" => "Phones", 117 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 118 | "Name" => "Ubuntu Edge", 119 | "Price" => "$499.99", 120 | "Description" => "Sapphire glass", 121 | "SubCategory" => "Touch", 122 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 123 | },%{ 124 | "Category" => "Computers", 125 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 126 | "Name" => "Acer Iconia", 127 | "Price" => "$96.99", 128 | "Description" => "7\" screen, Android, 16GB", 129 | "SubCategory" => "Tablets", 130 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 131 | },%{ 132 | "Category" => "Computers", 133 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 134 | "Name" => "Aspire E1-572G", 135 | "Price" => "$581.99", 136 | "Description" => "15.6\", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1", 137 | "SubCategory" => "Laptops", 138 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 139 | },%{ 140 | "Category" => "Phones", 141 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 142 | "Name" => "Nokia X", 143 | "Price" => "$109.99", 144 | "Description" => "Andoid, Jolla dualboot", 145 | "SubCategory" => "Touch", 146 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 147 | },%{ 148 | "Category" => "Phones", 149 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 150 | "Name" => "LG Optimus", 151 | "Price" => "$57.99", 152 | "Description" => "3.2\" screen", 153 | "SubCategory" => "Touch", 154 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 155 | },%{ 156 | "Category" => "Computers", 157 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 158 | "Name" => "IdeaTab A3500L", 159 | "Price" => "$88.99", 160 | "Description" => "Black, 7\" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2", 161 | "SubCategory" => "Tablets", 162 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 163 | },%{ 164 | "Category" => "Computers", 165 | "Category-href" => 
"http://webscraper.io/test-sites/e-commerce/static/computers", 166 | "Name" => "Galaxy Tab 3", 167 | "Price" => "$97.99", 168 | "Description" => "7\", 8GB, Wi-Fi, Android 4.2, White", 169 | "SubCategory" => "Tablets", 170 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 171 | },%{ 172 | "Category" => "Computers", 173 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 174 | "Name" => "HP 350 G1", 175 | "Price" => "$577.99", 176 | "Description" => "15.6\", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows", 177 | "SubCategory" => "Laptops", 178 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 179 | },%{ 180 | "Category" => "Phones", 181 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 182 | "Name" => "Nokia 123", 183 | "Price" => "$24.99", 184 | "Description" => "7 day battery", 185 | "SubCategory" => "Touch", 186 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 187 | },%{ 188 | "Category" => "Computers", 189 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 190 | "Name" => "HP 250 G3", 191 | "Price" => "$520.99", 192 | "Description" => "15.6\", Core i5-4210U, 4GB, 500GB, Windows 8.1", 193 | "SubCategory" => "Laptops", 194 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 195 | },%{ 196 | "Category" => "Computers", 197 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 198 | "Name" => "Aspire E1-510", 199 | "Price" => "$306.99", 200 | "Description" => "15.6\", Pentium N3520 2.16GHz, 4GB, 500GB, Linux", 201 | "SubCategory" => "Laptops", 202 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 203 | },%{ 204 | "Category" => "Computers", 205 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 206 | "Name" => "Packard 255 G2", 207 | "Price" => "$416.99", 208 | "Description" => "15.6\", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1", 209 | "SubCategory" => "Laptops", 210 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 211 | }] 212 | """ 213 | 214 | @type item :: [ property ] 215 | @type property :: { key, value } 216 | @type key :: binary 217 | @type value :: binary 218 | @type rule :: %{key => value} 219 | 220 | alias Scrapex.GenSpider 221 | alias GenSpider.Response 222 | import Scrapex.Selector 223 | use GenSpider 224 | 225 | require Logger 226 | 227 | # Client 228 | def start_link(sitemap = %{"startUrl" => url}) when is_binary(url) do 229 | start_link(%{sitemap | "startUrl" => [url]}) 230 | end 231 | def start_link(sitemap = %{"startUrl" => urls}) when is_list(urls) do 232 | opts = [ 233 | urls: urls, 234 | interval: 3600] 235 | 236 | GenSpider.start_link(__MODULE__, sitemap, opts) 237 | end 238 | 239 | def export(spider, format \\ nil) do 240 | GenSpider.export(spider, format) 241 | end 242 | 243 | # Server (callbacks) 244 | 245 | def init(%{"selectors"=> rules}) do 246 | {:ok, rules} 247 | end 248 | 249 | def start_requests(urls, rules) do 250 | requests = urls 251 | |> Enum.map(fn(url) -> 252 | GenSpider.request(url, &parse(&1, rules)) 253 | end) 254 | {:ok, requests, rules} 255 | end 256 | 257 | def parse(response, rules) do 258 | by_parent = group_by_parents(rules) 259 | 260 | results 261 | = parse_level(response, "_root", by_parent) 262 | # @return: [ item ] 263 | |> 
Enum.map(&Enum.into(&1, %{})) 264 | 265 | {:ok, results} 266 | end 267 | 268 | @spec parse_level(binary, binary, %{key => [rule]}) :: [item] 269 | defp parse_level(response, parent, rule_groups) do 270 | body = response.body 271 | rules = (rule_groups[parent] || []) 272 | 273 | rules 274 | |> Enum.map(fn 275 | (rule = %{"type" => "SelectorGroup"}) -> 276 | # For SelectorGroup, we collect all values into a list. 277 | # Note: This is different from WebScraper.IO extension. 278 | key = rule["id"] 279 | attribute = rule["extractAttribute"] || "text" 280 | values = select(body, rule["selector"]) |> extract(attribute) 281 | [[{key, values}]] 282 | (rule) -> 283 | key = rule["id"] 284 | multiple? = rule["multiple"] 285 | selectors = select(body, rule["selector"]) 286 | selectors = if multiple?, do: selectors, else: Enum.take(selectors, 1) 287 | Logger.debug("Selecting #{rule["selector"]} into:") 288 | selectors 289 | |> Enum.map(fn(selector) -> 290 | [value] = extract(selector, "text") 291 | result = [[{key, value}]] 292 | 293 | Logger.debug("Parse response with #{rule["type"]}: #{rule["selector"]}") 294 | # For each key-value pair, return into a list, with 295 | # new key-value pair(s) if rule's selector is a link. 296 | case {rule["type"], rule_groups[key]} do 297 | {"SelectorText", _} -> 298 | case Regex.compile(rule["regex"] || "") do 299 | {:error, _reason} -> result 300 | {:ok, ~r//} -> result 301 | {:ok, regex} -> 302 | case Regex.run(regex, value) do 303 | [value|_] -> [[{key, value}]] 304 | _ -> [[{key, nil}]] 305 | end 306 | end 307 | {"SelectorLink", nil} -> 308 | # Link with no child rule just returns the text value 309 | result 310 | {"SelectorLink", _} -> 311 | [href] = extract(selector, "href") 312 | url = Response.url_join(response, href) 313 | 314 | request = GenSpider.request(url, fn(response) -> 315 | # Get sub nodes as a tuple list. 316 | parse_level(response, rule["id"], rule_groups) 317 | end) 318 | subvalues = GenSpider.await(request) 319 | # @return [ item ] 320 | combine(result, subvalues) 321 | {"SelectorElement", nil} -> 322 | # Don't return SelectorElement in result 323 | [] 324 | {"SelectorElement", _} -> 325 | # Only use the results scraped from children rules. 
326 | parse_level(%{body: selector}, key, rule_groups) 327 | {"SelectorElementAttribute", _} -> 328 | [value] = extract(selector, rule["extractAttribute"]) 329 | [[{key, value}]] 330 | _ -> 331 | result 332 | end 333 | end) 334 | # @return [ item ] 335 | |> Enum.concat 336 | end) 337 | |> Enum.reduce(&combine/2) 338 | end 339 | 340 | @spec combine([item], [item]) :: [item] 341 | defp combine([], right), do: right 342 | defp combine(left, []), do: left 343 | defp combine(left, right) do 344 | for litem <- left, ritem <- right, do: Enum.concat(litem, ritem) 345 | end 346 | 347 | @spec group_by_parents([rule], binary) :: %{key => [rule]} 348 | defp group_by_parents(selectors, key \\ "parentSelectors") do 349 | Enum.reduce(selectors, %{}, fn(selector, groups) -> 350 | Enum.reduce(selector[key], groups, fn(parent, groups) -> 351 | Dict.update(groups, parent, [selector], &[selector|&1]) 352 | end) 353 | end) 354 | end 355 | end -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/logo.png -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Mixfile do 2 | use Mix.Project 3 | 4 | @version "0.5.2" 5 | 6 | def project do 7 | [app: :scrapex, 8 | version: @version, 9 | name: "Scrapex", 10 | description: """ 11 | An open source and collaborative framework for extracting the data 12 | you need from websites. In a fast, simple, yet extensible way. 13 | """, 14 | source_url: "https://bitbucket.org/inhuman/scrapex", 15 | homepage_url: "https://bitbucket.org/inhuman/scrapex/overview", 16 | elixir: "~> 1.2", 17 | escript: [main_module: Scrapex], 18 | build_embedded: Mix.env == :prod, 19 | start_permanent: Mix.env == :prod, 20 | deps: deps, 21 | package: package, 22 | docs: [source_ref: "v#{@version}", 23 | logo: "logo.png", 24 | extras: ["README.md"]]] 25 | end 26 | 27 | # Configuration for the OTP application 28 | # 29 | # Type `mix help compile.app` for more information 30 | def application do 31 | [applications: [:logger, :httpoison]] 32 | end 33 | 34 | # Dependencies can be Hex packages: 35 | # 36 | # {:mydep, "~> 0.3.0"} 37 | # 38 | # Or git/path repositories: 39 | # 40 | # {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"} 41 | # 42 | # Type `mix help deps` for more examples and options 43 | defp deps do 44 | [ 45 | {:httpoison, "~> 0.7"}, 46 | {:floki, "~> 0.7.0"}, 47 | {:poison, "~> 1.4.0"}, 48 | {:csv, "~> 1.2.1"}, 49 | 50 | # Docs dependencies 51 | {:earmark, "~> 0.1", only: :dev}, 52 | {:ex_doc, "~> 0.10", only: :dev} 53 | ] 54 | end 55 | 56 | defp package do 57 | [contributors: ["Son Tran-Nguyen"], 58 | licenses: ["MIT"], 59 | links: %{bitbucket: "https://bitbucket.org/inhuman/scrapex"}, 60 | files: ~w(lib priv test) ++ 61 | ~w(CHANGELOG.md LICENSE mix.exs package.json README.md)] 62 | end 63 | end -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{"csv": {:hex, :csv, "1.2.1", "9a249e1e9fddb4f34bfc2bcf2bfb43bff3aa62a55f807c72cb2249b1e3914ae9", [:mix], []}, 2 | "earmark": {:hex, :earmark, "0.1.19", "ffec54f520a11b711532c23d8a52b75a74c09697062d10613fa2dbdf8a9db36e", [:mix], []}, 3 | "ex_doc": {:hex, 
:ex_doc, "0.10.0", "f49c237250b829df986486b38f043e6f8e19d19b41101987f7214543f75947ec", [:mix], [{:earmark, "~> 0.1.17 or ~> 0.2", [hex: :earmark, optional: true]}]}, 4 | "floki": {:hex, :floki, "0.7.0", "52eb235995f9040dee7e2d09dd24e675f1ab02311528ff118d76baef94926f71", [:mix], [{:mochiweb, "~> 2.12.2", [hex: :mochiweb, optional: false]}]}, 5 | "hackney": {:hex, :hackney, "1.3.2", "43bd07ab88753f5e136e38fddd2a09124bee25733b03361eeb459d0173fc17ab", [:rebar, :make], [{:ssl_verify_hostname, "~> 1.0.5", [hex: :ssl_verify_hostname, optional: false]}, {:idna, "~> 1.0.2", [hex: :idna, optional: false]}]}, 6 | "httpoison": {:hex, :httpoison, "0.7.4", "053fa5420c9a2f7792ab49c9963ce67ede8b81dd9a1d0a7123cce54028deeb05", [:mix], [{:hackney, "~> 1.3.1", [hex: :hackney, optional: false]}]}, 7 | "idna": {:hex, :idna, "1.0.2", "397e3d001c002319da75759b0a81156bf11849c71d565162436d50020cb7265e", [:make], []}, 8 | "mochiweb": {:hex, :mochiweb, "2.12.2", "80804ad342afa3d7f3524040d4eed66ce74b17a555de454ac85b07c479928e46", [:make, :rebar], []}, 9 | "poison": {:hex, :poison, "1.4.0", "cd5afb9db7f0d19487572fa28185b6d4de647f14235746824e77b3139b79b725", [:mix], []}, 10 | "ssl_verify_hostname": {:hex, :ssl_verify_hostname, "1.0.5", "2e73e068cd6393526f9fa6d399353d7c9477d6886ba005f323b592d389fb47be", [:make], []}} 11 | -------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Scraper 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 21 | 22 | 23 | 24 | 25 | 26 | 63 | 64 |
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers category"
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    ThinkPad X240 | $1311.99 | 12.5", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro 64bit | 8 reviews
    Galaxy Tab | $251.99 | 16GB, White | 8 reviews
    Memo Pad HD 7 | $101.99 | IPS, Dual-Core 1.2GHz, 8GB, Android 4.3 | 8 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/index_files/cart2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/test/sample_pages/e-commerce/static/computers/index_files/cart2.png -------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/laptops/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers / Laptops"
  products (name | price | description | reviews):
    Aspire E1-510 | $306.99 | 15.6", Pentium N3520 2.16GHz, 4GB, 500GB, Linux | 9 reviews
    Packard 255 G2 | $416.99 | 15.6", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1 | 2 reviews
    HP 250 G3 | $520.99 | 15.6", Core i5-4210U, 4GB, 500GB, Windows 8.1 | 8 reviews
    HP 350 G1 | $577.99 | 15.6", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows | 4 reviews
    Aspire E1-572G | $581.99 | 15.6", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1 | 7 reviews
    Pavilion | $609.99 | 15.6", Core i5-4200U, 6GB, 750GB, Windows 8.1 | 3 reviews
  pagination: « 1 2 3 »]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/tablets/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers / Tablets"
  products (name | price | description | reviews):
    Lenovo IdeaTab | $69.99 | 7" screen, Android | 13 reviews
    IdeaTab A3500L | $88.99 | Black, 7" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2 | 15 reviews
    Acer Iconia | $96.99 | 7" screen, Android, 16GB | 2 reviews
    Galaxy Tab 3 | $97.99 | 7", 8GB, Wi-Fi, Android 4.2, White | 7 reviews
    Iconia B1-730HD | $99.99 | Black, 7", 1.6GHz Dual-Core, 8GB, Android 4.4 | 15 reviews
    Memo Pad HD 7 | $101.99 | IPS, Dual-Core 1.2GHz, 8GB, Android 4.3 | 3 reviews
  pagination: « 1 2 3 4 »]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "E-commerce training site"
  intro: "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale."
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    LG Optimus | $57.99 | 3.2" screen | 8 reviews
    iPad Mini Retina | $537.99 | Wi-Fi + Cellular, 32GB, Silver | 13 reviews
    MeMO Pad 7 | $130.99 | White, 7", Atom 1.2GHz, 8GB, Android 4.4 | 15 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/phones/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Phones category"
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    Ubuntu Edge | $499.99 | Sapphire glass | 14 reviews
    Iphone | $899.99 | Black | 7 reviews
    LG Optimus | $57.99 | 3.2" screen | 12 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/phones/touch/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Phones / Touch"
  products (name | price | description | reviews):
    Nokia 123 | $24.99 | 7 day battery | 10 reviews
    LG Optimus | $57.99 | 3.2" screen | 9 reviews
    Samsung Galaxy | $93.99 | 5 mpx. Android 5.0 | 7 reviews
    Nokia X | $109.99 | Andoid, Jolla dualboot | 13 reviews
    Sony Xperia | $118.99 | GPS, waterproof | 8 reviews
    Ubuntu Edge | $499.99 | Sapphire glass | 5 reviews
  pagination: « 1 2 »]
-------------------------------------------------------------------------------- /test/sample_pages/example.com.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Example Domain"
  body: "This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission."
  link: "More information..."]
49 | 50 | -------------------------------------------------------------------------------- /test/scrapex/gen_spider_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpiderTest do 2 | use ExUnit.Case 3 | alias Scrapex.GenSpider 4 | doctest GenSpider 5 | 6 | @example_com "http://localhost:9090/example.com.html" 7 | @ecommerce_site "http://localhost:9090/e-commerce/static/index.html" 8 | @opts [urls: [@example_com]] 9 | 10 | test "a spider is a process" do 11 | defmodule GoodSpider do 12 | use GenSpider 13 | # GenSpider callbacks 14 | def init(args) do 15 | {:ok, args} 16 | end 17 | end 18 | 19 | {:ok, pid} = GenSpider.start_link(GoodSpider, []) 20 | assert is_pid(pid) 21 | 22 | {:ok, pid} = GenSpider.start(GoodSpider, []) 23 | assert is_pid(pid) 24 | end 25 | 26 | test "spider is based on GenServer" do 27 | defmodule EmoSpider do 28 | # GenSpider callbacks 29 | def init(_args) do 30 | :ignore 31 | end 32 | end 33 | 34 | defmodule BadSpider do 35 | # GenSpider callbacks 36 | def init(_args) do 37 | {:stop, :stop} 38 | end 39 | end 40 | 41 | assert :ignore == GenSpider.start(EmoSpider, []) 42 | assert {:error, :stop} == GenSpider.start(BadSpider, []) 43 | end 44 | 45 | test "default spider" do 46 | defmodule DoNothingSpider do 47 | use GenSpider 48 | end 49 | {:ok, pid} = GenSpider.start(DoNothingSpider, []) 50 | assert is_pid(pid) 51 | end 52 | 53 | test "should start the crawling immediately" do 54 | defmodule TestSpider do 55 | use GenSpider 56 | 57 | def init(tester) do 58 | {:ok, tester} 59 | end 60 | 61 | def start_requests(_urls, tester) do 62 | send tester, :start_requests 63 | {:ok, [], tester} 64 | end 65 | 66 | end 67 | 68 | GenSpider.start(TestSpider, self, @opts) 69 | 70 | assert_receive(:start_requests, 500) 71 | end 72 | 73 | test "should get the HTML of the start URL(s)" do 74 | defmodule HTMLSpider do 75 | use GenSpider 76 | 77 | def init(tester) do 78 | {:ok, tester} 79 | end 80 | 81 | def start_requests(urls, tester) do 82 | requests = urls 83 | |> Enum.map(&make_requests_from_url(&1, tester)) 84 | {:ok, requests, tester} 85 | end 86 | 87 | defp make_requests_from_url(url, tester) do 88 | GenSpider.request(url, fn(response) -> 89 | send tester, {:test_result, response.body} 90 | end) 91 | end 92 | 93 | end 94 | GenSpider.start(HTMLSpider, self, @opts) 95 | 96 | assert_receive({:test_result, actual}, 500) 97 | expected = HTTPoison.get!("http://localhost:9090/example.com.html").body 98 | assert actual === expected 99 | end 100 | 101 | test "can export data" do 102 | defmodule FastSpider do 103 | use GenSpider 104 | 105 | def start_requests(urls, tester) do 106 | requests = urls 107 | |> Enum.map(&make_requests_from_url(&1, tester)) 108 | {:ok, requests, tester} 109 | end 110 | 111 | defp make_requests_from_url(url, tester) do 112 | GenSpider.request(url, fn(response) -> 113 | send tester, {:test_result, response.body} 114 | parse(response) 115 | end) 116 | end 117 | 118 | end 119 | {:ok, spider} = GenSpider.start(FastSpider, self, @opts) 120 | 121 | assert_receive({:test_result, _}, 5000) 122 | # Assume that the spider, which requested to the same URL, should 123 | # have finished before our request below. 
124 |     expected = HTTPoison.get!("http://localhost:9090/example.com.html").body
125 |     assert [expected] == GenSpider.export(spider)
126 | 
127 |   end
128 | 
129 |   defmodule Spider do
130 |     use GenSpider
131 | 
132 |     def start_requests(urls, tester) do
133 |       requests = urls
134 |       |> Enum.map(&make_requests_from_url(&1, tester))
135 |       {:ok, requests, tester}
136 |     end
137 | 
138 |     defp make_requests_from_url(url, tester) do
139 |       GenSpider.request(url, fn(response) ->
140 |         data = parse(response)
141 |         send tester, {:test_result, response.body}
142 |         data
143 |       end)
144 |     end
145 | 
146 |     def parse(response) do
147 |       uuid = :crypto.strong_rand_bytes(8) |> Base.encode16
148 |       {:ok, [uuid <> response.body]}
149 |     end
150 | 
151 |   end
152 | 
153 |   test "can run on schedule" do
154 |     opts = [urls: @opts[:urls], interval: 500]
155 |     GenSpider.start(Spider, self, opts)
156 | 
157 |     assert_receive({:test_result, _}, 300)
158 |     # Give time for the spider to crawl.
159 |     :timer.sleep(50)
160 |     assert_receive({:test_result, _}, 500)
161 |   end
162 | 
163 |   test "new data will replace old data" do
164 |     opts = [urls: @opts[:urls], interval: 500]
165 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
166 | 
167 |     assert_receive({:test_result, _old}, 300)
168 |     [old] = GenSpider.export(spider)
169 |     <<old_uuid :: binary-size(16), _ :: binary>> = old
170 |     # Give time for the spider to crawl.
171 |     :timer.sleep(50)
172 |     assert_receive({:test_result, _new}, 500)
173 |     [new] = GenSpider.export(spider)
174 |     <<new_uuid :: binary-size(16), _ :: binary>> = new
175 |     assert new_uuid !== old_uuid
176 |   end
177 | 
178 |   test "multiple URLs should replace old data with merged new data" do
179 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ], interval: 500]
180 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
181 | 
182 |     assert_receive({:test_result, _old}, 1500)
183 |     assert_receive({:test_result, _old}, 1500)
184 | 
185 |     old = GenSpider.export(spider)
186 | 
187 |     assert_receive({:test_result, _new}, 1500)
188 |     assert_receive({:test_result, _new}, 1500)
189 | 
190 |     GenSpider.export(spider)
191 |     |> Enum.with_index
192 |     |> Enum.each(fn({data, index}) ->
193 |       <<old_uuid :: binary-size(16), _ :: binary>> = Enum.at(old, index)
194 |       <<new_uuid :: binary-size(16), _ :: binary>> = data
195 |       assert new_uuid !== old_uuid
196 |     end)
197 |   end
198 | 
199 |   defmodule MapSpider do
200 |     use GenSpider
201 | 
202 |     def start_requests(urls, tester) do
203 |       requests = urls
204 |       |> Enum.map(&make_requests_from_url(&1, tester))
205 |       {:ok, requests, tester}
206 |     end
207 | 
208 |     defp make_requests_from_url(url, tester) do
209 |       spider = self()
210 |       GenSpider.request(url, fn(response) ->
211 |         {:ok, result} = parse(response)
212 |         case tester.(result, spider) do
213 |           {:stop, reason} ->
214 |             {:stop, reason}
215 |           {:test_result, result} ->
216 |             {:ok, result}
217 |         end
218 |       end)
219 |     end
220 | 
221 |     def parse(response) do
222 |       result = [%{"body" => response.body}]
223 |       {:ok, result}
224 |     end
225 | 
226 |   end
227 | 
228 |   test "returned map can be exported to json" do
229 |     tester = self
230 |     callback = fn(result, _) ->
231 |       send(tester, {:test_result, result})
232 |     end
233 |     {:ok, spider} = GenSpider.start(MapSpider, callback, @opts)
234 | 
235 |     assert_receive({:test_result, result}, 300)
236 |     json = GenSpider.export(spider, :json)
237 |     assert is_binary(json)
238 |     assert json == Poison.encode!(result)
239 |   end
240 | 
241 |   test "can export using an encoder" do
242 |     tester = self
243 |     callback = fn(result, _) ->
244 |       send(tester, {:test_result, result})
245 |     end
246 |     {:ok, spider} = GenSpider.start(MapSpider, callback, @opts)
247 | 
248 |     assert_receive({:test_result, result}, 300)
249 |     json = GenSpider.export(spider, &Poison.encode!/1)
250 |     assert is_binary(json)
251 |     assert json == Poison.encode!(result)
252 |   end
253 | 
254 |   test "will await for data to export" do
255 |     tester = self
256 |     callback = fn(result, _) ->
257 |       send(tester, {:test_result, result})
258 |     end
259 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
260 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
261 | 
262 |     # Since we can export immediately after starting the spider, it
263 |     # will need to await the data.
264 |     data = GenSpider.export(spider)
265 | 
266 |     actual =
267 |       opts[:urls]
268 |       |> Enum.map(&(%{"body" => HTTPoison.get!(&1).body}))
269 |     assert actual === data
270 |   end
271 | 
272 |   test "will export partial or no data if spider returns stop" do
273 |     tester = self
274 |     first_response = HTTPoison.get!(@ecommerce_site).body
275 |     callback = fn(result = [%{"body" => response}], _) ->
276 |       case response do
277 |         ^first_response ->
278 |           send tester, {:test_result, result}
279 |         _ ->
280 |           {:stop, :test}
281 |       end
282 |     end
283 | 
284 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
285 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
286 | 
287 |     data = GenSpider.export(spider)
288 |     assert [%{"body" => first_response}] === data
289 |   end
290 | 
291 |   test "stop the spider when the callback returns stop" do
292 |     tester = self
293 |     first_response = HTTPoison.get!(@ecommerce_site).body
294 |     callback = fn(result = [%{"body" => response}], _) ->
295 |       case response do
296 |         ^first_response ->
297 |           send tester, {:test_result, result}
298 |         _ ->
299 |           {:stop, :test}
300 |       end
301 |     end
302 | 
303 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
304 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
305 | 
306 |     _data = GenSpider.export(spider)
307 |     # Let the spider stop.
308 |     :timer.sleep(100)
309 |     refute Process.alive?(spider)
310 |   end
311 | 
312 |   test "can request fresh data regardless of timer" do
313 |     opts = [urls: @opts[:urls], interval: 60000]
314 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
315 |     # First export is always fresh, and same as next export.
316 |     [old] = GenSpider.export(spider)
317 |     assert [old] === GenSpider.export(spider)
318 |     <<old_uuid :: binary-size(16), _ :: binary>> = old
319 | 
320 |     [new] = GenSpider.export(spider, nil, true)
321 |     <<new_uuid :: binary-size(16), _ :: binary>> = new
322 |     assert new_uuid !== old_uuid
323 |   end
324 | 
325 |   test "can request for links during parsing" do
326 |     # Instead of returning the parsed data, the `parse` function
327 |     # can return an async task, which will be awaited and merged
328 |     # into the data.
329 | 
330 |     # Since this test is made without knowledge of the selector engine,
331 |     # we simply request another URL and return that body instead.
332 |     callback = fn(_, _) ->
333 |       # The final callback will send the test result to this test process,
334 |       # but also return that tuple, which is what `GenSpider.await/1`
335 |       # returns.
336 |       # `GenSpider.request/2` returns an asynchronous task.
337 |       request = GenSpider.request(@ecommerce_site, fn
338 |         (response) -> {:test_result, [response.body]}
339 |       end)
340 |       # That task can be awaited.
341 | {:test_result, body} = GenSpider.await(request) 342 | {:test_result, body} 343 | end 344 | 345 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 346 | [data] = GenSpider.export(spider) 347 | assert data === HTTPoison.get!(@ecommerce_site).body 348 | end 349 | 350 | test "parse function can return an async request" do 351 | callback = fn(_what, spider) -> 352 | request = GenSpider.request(@ecommerce_site, fn 353 | (response) -> [response.body] 354 | end, spider) 355 | {:test_result, request} 356 | end 357 | 358 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 359 | [data] = GenSpider.export(spider) 360 | assert data === HTTPoison.get!(@ecommerce_site).body 361 | end 362 | 363 | test "parse function can return multiple async requests" do 364 | # Can be used to follow multiple links on a page. 365 | # Results will be concatenated. 366 | urls = [ @ecommerce_site | @opts[:urls] ] 367 | 368 | callback = fn(_, spider) -> 369 | requests = 370 | urls 371 | |> Enum.map(fn(url) -> 372 | GenSpider.request(url, fn 373 | (response) -> [response.body] 374 | end, spider) 375 | end) 376 | {:test_result, requests} 377 | end 378 | 379 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 380 | data = GenSpider.export(spider) 381 | 382 | actual = 383 | urls 384 | |> Enum.map(&(HTTPoison.get!(&1).body)) 385 | 386 | assert data === actual 387 | end 388 | 389 | test "should follow redirect" do 390 | url = "http://localhost:9090/e-commerce/static" 391 | opts = [urls: [url]] 392 | tester = self 393 | callback = fn(result, _) -> 394 | send(tester, {:test_result, result}) 395 | end 396 | 397 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 398 | [%{"body" => data}] = GenSpider.export(spider) 399 | 400 | assert data === HTTPoison.get!(url <> "/index.html").body 401 | end 402 | 403 | test "should stop after first crawl if no interval set" do 404 | url = "http://localhost:9090/e-commerce/static" 405 | opts = [urls: [url]] 406 | tester = self 407 | callback = fn(result, _) -> 408 | send(tester, {:test_result, result}) 409 | end 410 | 411 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 412 | # The spider will stop after a specific timeout when it's done scraping. 413 | # We call `export` immediately so it will timeout after that. 414 | [%{"body" => data}] = GenSpider.export(spider) 415 | refute Process.alive?(spider) 416 | end 417 | 418 | test "should also stop even when not exporting" do 419 | url = "http://localhost:9090/e-commerce/static" 420 | opts = [urls: [url]] 421 | tester = self 422 | callback = fn(result, _) -> 423 | send(tester, {:test_result, result}) 424 | end 425 | 426 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 427 | # The spider will stop after a specific timeout when it's done scraping. 428 | assert_receive({:test_result, result}, 300) 429 | # Allow some time for the spider to stop. 
:timer.sleep(500)
431 |     refute Process.alive?(spider)
432 |   end
433 | 
434 |   defmodule StreamSpider do
435 |     use GenSpider
436 | 
437 |     def start_requests(_urls, callback) do
438 |       stream = Stream.resource(
439 |         _start = fn() ->
440 |           ["http://localhost:9090/e-commerce/static"]
441 |         end,
442 |         _next = fn
443 |           ([]) -> {:halt, []}
444 |           ([url|urls]) ->
445 |             {[callback.(url)], urls}
446 |         end,
447 |         _after = fn(_) -> end
448 |       )
449 | 
450 |       {:ok, stream, callback}
451 |     end
452 |   end
453 | 
454 |   test "can take a stream instead of requests list" do
455 |     tester = self
456 |     callback = fn(url) ->
457 |       GenSpider.request(url, fn(response) ->
458 |         send(tester, {:ok, [response]})
459 |       end)
460 |     end
461 |     GenSpider.start(StreamSpider, callback, [])
462 |     assert_receive({:ok, _new}, 500)
463 |   end
464 | end
465 | 
-------------------------------------------------------------------------------- /test/scrapex/selector_test.exs: --------------------------------------------------------------------------------
1 | defmodule Scrapex.SelectorTest do
2 |   use ExUnit.Case, async: true
3 |   import Scrapex.Selector
4 | 
5 |   setup_all do
6 |     url = "http://localhost:9090/e-commerce/static/index.html"
7 |     html = HTTPoison.get!(url).body
8 | 
9 |     # No metadata
10 |     {:ok, url: url, body: html}
11 |   end
12 | 
13 |   test "parse CSS selector", context do
14 |     [href] = context.body
15 |     |> select("a.navbar-brand")
16 |     |> extract("href")
17 | 
18 |     assert href === "/"
19 |   end
20 | 
21 |   test "select text content", context do
22 |     [h1] = context.body
23 |     |> select("h1")
24 |     |> extract("text")
25 | 
26 |     assert h1 === "E-commerce training site"
27 |   end
28 | 
29 |   test "default to get content", context do
30 |     [h1] = context.body
31 |     |> select("h1")
32 |     |> extract()
33 | 
34 |     assert h1 === "E-commerce training site"
35 |   end
36 | 
37 |   test "select text content and children content", context do
38 |     link_texts = context.body
39 |     |> select("a.category-link")
40 |     |> extract()
41 | 
42 |     assert link_texts === ["Computers", "Phones"]
43 |   end
44 | 
45 |   test "strip all Unicode whitespaces", context do
46 |     [p] = context.body
47 |     |> select(".jumbotron p")
48 |     |> extract()
49 | 
50 |     assert p === "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale."
51 |   end
52 | 
53 |   # TESTS FOR ENUMERABLE
54 | 
55 |   test "can be enumerable", context do
56 |     selectors = select(context.body, "a.category-link")
57 |     # Of course you can enumerate extracted values
58 |     categories = extract(selectors)
59 |     |> Enum.map(&(&1))
60 | 
61 |     assert categories == Enum.map(selectors, fn(selector) ->
62 |       [value] = extract(selector)
63 |       value
64 |     end)
65 |   end
66 | 
67 |   test "a single selector can still be enumerable", context do
68 |     selectors = select(context.body, "a.category-link")
69 |     # Of course you can enumerate extracted values
70 |     categories = extract(selectors)
71 |     |> Enum.map(&(&1))
72 | 
73 |     selectors = select(context.body, "h1")
74 |     expected = ["E-commerce training site"]
75 | 
76 |     assert expected == Enum.map(selectors, fn(selector) ->
77 |       [value] = extract(selector)
78 |       value
79 |     end)
80 |   end
81 | end
-------------------------------------------------------------------------------- /test/scrapex/spider/example_test.exs: --------------------------------------------------------------------------------
1 | defmodule Scrapex.Spider.ExampleTest do
2 |   use ExUnit.Case
3 | 
4 |   alias Scrapex.GenSpider
5 |   alias Spider.Example
6 |   import Scrapex.Selector
7 | 
8 |   defmodule Example do
9 |     use GenSpider
10 | 
11 |     # Client
12 |     def start_link(parser) do
13 |       opts = [
14 |         urls: ["http://localhost:9090/e-commerce/static/index.html"]]
15 |       GenSpider.start_link(__MODULE__, parser, opts)
16 |     end
17 | 
18 |     def export(spider) do
19 |       GenSpider.export(spider)
20 |     end
21 | 
22 |     # Server (callbacks)
23 | 
24 |     def init(parser) do
25 |       {:ok, parser}
26 |     end
27 | 
28 |     def parse(response, parser) do
29 |       results = parser.(response)
30 |       {:ok, results, parser}
31 |     end
32 |   end
33 | 
34 |   def parse_product(response) do
35 |     response.body
36 |     |> select(".thumbnail")
37 |     |> Enum.map(fn(selector) ->
38 |       [name] = selector |> select(".title") |> extract
39 |       [description] = selector |> select(".description") |> extract
40 |       [price] = selector |> select(".price") |> extract
41 | 
42 |       %{"name" => name, "description" => description, "price" => price}
43 |     end)
44 |   end
45 | 
46 |   test "get data on page" do
47 |     {:ok, spider} = Example.start_link(&parse_product/1)
48 |     results = Example.export(spider)
49 |     assert length(results) === 3
50 |   end
51 | 
52 |   test "can follow links" do
53 |     parser = fn(response) ->
54 |       response.body
55 |       |> select("#side-menu .category-link")
56 |       |> Enum.flat_map(fn(anchor) ->
57 |         [href] = anchor |> extract("href")
58 |         full_url = GenSpider.Response.url_join(response, href) <> "/index.html"
59 |         [category] = anchor |> extract()
60 | 
61 |         GenSpider.request(full_url, fn({:ok, response}) ->
62 |           parse_product(response)
63 |         end)
64 |         |> GenSpider.await()
65 |         |> Enum.map(&Map.put(&1, "category", category))
66 |       end)
67 |     end
68 | 
69 |     {:ok, spider} = Example.start_link(parser)
70 |     results = Example.export(spider)
71 |     assert length(results) === 6
72 |   end
73 | end
-------------------------------------------------------------------------------- /test/scrapex/spider/webscraper.csv: --------------------------------------------------------------------------------
1 | Category,Category-href,Name,Price,Description,SubCategory,SubCategory-href,Page,Page-href
2 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note 10.1","$587.99","10.1"", 32GB, Black","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4"
3 | 
"Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 3","$97.99","7"", 8GB, Wi-Fi, Android 4.2, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 4 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad Yoga","$1223.99","12.5"" Touch, Core i5 4200U, 8GB, 500GB + 16GB SSD Cache, Windows","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 5 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ProBook","$739.99","14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 6 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","iPad Mini Retina","$537.99","Wi-Fi + Cellular, 32GB, Silver","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4" 7 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","Silver","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 8 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab","$251.99","16GB, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 9 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad X230","$1244.99","12.5"", Core i5 2.6GHz, 8GB, 180GB SSD, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 10 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Memo Pad HD 7","$101.99","IPS, Dual-Core 1.2GHz, 8GB, Android 4.3","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 11 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Aspire E1-572G","$581.99","15.6"", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 12 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Asus MeMO Pad","$102.99","7"" screen, Android, 8GB","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 13 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","LG Optimus","$57.99","3.2"" screen","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 14 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Lenovo IdeaTab","$69.99","7"" screen, Android","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 15 | 
"Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Acer Iconia","$96.99","7"" screen, Android, 16GB","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 16 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Aspire E1-510","$306.99","15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 17 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","HP 250 G3","$520.99","15.6"", Core i5-4210U, 4GB, 500GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 18 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A8-50","$121.99","Blue, 8"" IPS, Quad-Core 1.3GHz, 16GB, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 19 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Apple iPad Air","$603.99","Wi-Fi, 64GB, Silver","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4" 20 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Amazon Kindle","$103.99","6"" screen, wifi","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 21 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Samsung Galaxy","$93.99","5 mpx. 
Android 5.0","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 22 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 4","$233.99","LTE (SM-T235), Quad-Core 1.2GHz, 8GB, Black","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 23 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad X240","$1311.99","12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","3","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/3" 24 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Nokia X","$109.99","Andoid, Jolla dualboot","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 25 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Dell XPS 13","$1281.99","13.3"" Touch, Core i5-4210U, 8GB, 128GB SSD, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","3","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/3" 26 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Inspiron 15","$745.99","Moon Silver, 15.6"", Core i7-4510U, 8GB, 1TB, Radeon HD R7 M265 2GB,","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 27 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note","$489.99","12.2"", 32GB, WiFi, Android 4.4, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 28 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Packard 255 G2","$416.99","15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 29 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","HP 350 G1","$577.99","15.6"", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 30 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Pavilion","$609.99","15.6"", Core i5-4200U, 6GB, 750GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 31 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab S5000","$172.99","Silver, 7"" IPS, Quad-Core 1.2Ghz, 16GB, 3G, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 32 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad Yoga","$1033.99","12.5"" Touch, Core i3-4010U, 4GB, 500GB + 16GB SSD 
Cache,","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 33 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Nokia 123","$24.99","7 day battery","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 34 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Ubuntu Edge","$499.99","Sapphire glass","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 35 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note","$399.99","10.1"", 3G, Android 4.0, Garnet Red","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 36 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","MeMO Pad 7","$130.99","White, 7"", Atom 1.2GHz, 8GB, Android 4.4","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 37 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A3500-H","$148.99","Blue, 7"" IPS, Quad-Core 1.3GHz, 8GB, 3G, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 38 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Iconia B1-730HD","$99.99","Black, 7"", 1.6GHz Dual-Core, 8GB, Android 4.4","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 39 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad T540p","$1178.99","15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 40 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 3","$107.99","7"", 8GB, Wi-Fi, Android 4.2, Yellow","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 41 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","Black","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 42 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Sony Xperia","$118.99","GPS, waterproof","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 43 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","White","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 44 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","MeMo PAD FHD 10","$320.99","White, 10.1"" IPS, 1.6GHz, 2GB, 16GB, Android 
4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 45 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A3500L","$88.99","Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 46 | -------------------------------------------------------------------------------- /test/scrapex/spider/webscraper_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Spider.WebScraperTest do 2 | use ExUnit.Case 3 | 4 | alias Scrapex.Spider.WebScraper 5 | 6 | @url "http://localhost:9090/e-commerce/static/index.html" 7 | @page_title "E-commerce training site" 8 | @description "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale." 9 | @categories ["Computers", "Phones"] 10 | @subcategories [["Laptops", "Tablets"], ["Phones"]] 11 | @home [23, 5, 39] 12 | @computers [21, 6, 8] 13 | @phones [32, 39, 11] 14 | 15 | setup_all do 16 | items = 17 | "test/scrapex/spider/webscraper.csv" 18 | |> File.stream! 19 | |> CSV.decode(headers: true) 20 | |> Enum.map(fn(row) -> 21 | row 22 | |> Map.delete("Category-href") 23 | |> Map.delete("SubCategory-href") 24 | |> Map.delete("Page-href") 25 | end) 26 | 27 | {:ok, items: items} 28 | end 29 | 30 | 31 | test "scrape single item" do 32 | selectors = [%{ 33 | "parentSelectors" => ["_root"], 34 | "type" => "SelectorText", 35 | "multiple" => false, 36 | "id" => "Page Title", 37 | "selector" => ".jumbotron h1", 38 | "delay" => "" 39 | }] 40 | 41 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 42 | 43 | {:ok, spider} = WebScraper.start_link(sitemap) 44 | [data] = WebScraper.export(spider) 45 | 46 | expected = %{ 47 | "Page Title" => "E-commerce training site" 48 | } 49 | assert data === expected 50 | end 51 | 52 | test "scrape multiple single items" do 53 | selectors = [%{ 54 | "parentSelectors" => ["_root"], 55 | "type" => "SelectorText", 56 | "multiple" => false, 57 | "id" => "Page Title", 58 | "selector" => ".jumbotron h1", 59 | "delay" => "" 60 | }, %{ 61 | "parentSelectors" => ["_root"], 62 | "type" => "SelectorText", 63 | "multiple" => false, 64 | "id" => "Main Description", 65 | "selector" => ".jumbotron p", 66 | "delay" => "" 67 | }] 68 | 69 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 70 | 71 | {:ok, spider} = WebScraper.start_link(sitemap) 72 | [data] = WebScraper.export(spider) 73 | 74 | expected = %{ 75 | "Page Title" => @page_title, 76 | "Main Description" => @description 77 | } 78 | assert data === expected 79 | end 80 | 81 | test "scrape multiple items" do 82 | selectors = [%{ 83 | "parentSelectors" => ["_root"], 84 | "type" => "SelectorText", 85 | "multiple" => true, 86 | "id" => "Category", 87 | "selector" => "a.category-link", 88 | "delay" => "" 89 | }] 90 | 91 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 92 | 93 | {:ok, spider} = WebScraper.start_link(sitemap) 94 | data = WebScraper.export(spider) 95 | 96 | expected = [%{ 97 | "Category" => "Computers" 98 | }, %{ 99 | "Category" => "Phones" 100 | }] 101 | assert data === expected 102 | end 103 | 104 | test "scrape both single and multiple items" do 105 | selectors = [%{ 106 | "parentSelectors" => ["_root"], 107 | "type" => 
"SelectorText", 108 | "multiple" => true, 109 | "id" => "Category", 110 | "selector" => "a.category-link", 111 | "delay" => "" 112 | }, %{ 113 | "parentSelectors" => ["_root"], 114 | "type" => "SelectorText", 115 | "multiple" => false, 116 | "id" => "Page Title", 117 | "selector" => ".jumbotron h1", 118 | "delay" => "" 119 | }] 120 | 121 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 122 | 123 | {:ok, spider} = WebScraper.start_link(sitemap) 124 | data = WebScraper.export(spider) 125 | 126 | expected = [%{ 127 | "Category" => "Computers", 128 | "Page Title" => "E-commerce training site" 129 | }, %{ 130 | "Category" => "Phones", 131 | "Page Title" => "E-commerce training site" 132 | }] 133 | assert data === expected 134 | end 135 | 136 | test "scrape with empty selector" do 137 | selectors = [%{ 138 | "parentSelectors" => ["_root"], 139 | "type" => "SelectorText", 140 | "multiple" => true, 141 | "id" => "Category", 142 | "selector" => "a.category-link", 143 | "delay" => "" 144 | }, %{ 145 | "parentSelectors" => ["_root"], 146 | "type" => "SelectorText", 147 | "multiple" => false, 148 | "id" => "Page Title", 149 | "selector" => ".jumbotron h2", # Intended typo. 150 | "delay" => "" 151 | }] 152 | 153 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 154 | 155 | {:ok, spider} = WebScraper.start_link(sitemap) 156 | data = WebScraper.export(spider) 157 | 158 | expected = [%{ 159 | "Category" => "Computers" 160 | }, %{ 161 | "Category" => "Phones" 162 | }] 163 | assert data === expected 164 | 165 | selectors = [%{ 166 | "parentSelectors" => ["_root"], 167 | "type" => "SelectorText", 168 | "multiple" => true, 169 | "id" => "Category", 170 | "selector" => "a.category", # Intended typo. 171 | "delay" => "" 172 | }, %{ 173 | "parentSelectors" => ["_root"], 174 | "type" => "SelectorText", 175 | "multiple" => false, 176 | "id" => "Page Title", 177 | "selector" => ".jumbotron h1", 178 | "delay" => "" 179 | }] 180 | 181 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 182 | 183 | {:ok, spider} = WebScraper.start_link(sitemap) 184 | data = WebScraper.export(spider) 185 | 186 | expected = [%{ 187 | "Page Title" => "E-commerce training site" 188 | }] 189 | assert data === expected 190 | end 191 | 192 | test "scrape only one of multiple items" do 193 | selectors = [%{ 194 | "parentSelectors" => ["_root"], 195 | "type" => "SelectorText", 196 | "multiple" => false, # We only want one category among 3 197 | "id" => "Category", 198 | "selector" => "a.category-link", 199 | "delay" => "" 200 | }, %{ 201 | "parentSelectors" => ["_root"], 202 | "type" => "SelectorText", 203 | "multiple" => true, # Even though there is only 1 h1 204 | "id" => "Page Title", 205 | "selector" => ".jumbotron h1", 206 | "delay" => "" 207 | }] 208 | 209 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 210 | 211 | {:ok, spider} = WebScraper.start_link(sitemap) 212 | data = WebScraper.export(spider) 213 | 214 | expected = [%{ 215 | "Category" => "Computers", 216 | "Page Title" => "E-commerce training site" 217 | }] 218 | assert data === expected 219 | 220 | # Test group of multiple items 221 | selectors = [%{ 222 | "parentSelectors" => ["_root"], 223 | "type" => "SelectorText", 224 | "multiple" => true, 225 | "id" => "Category", 226 | "selector" => "a.category-link", 227 | "delay" => "" 228 | }, %{ 229 | "parentSelectors" => ["_root"], 230 | "type" => "SelectorText", 231 | "multiple" => false, 232 | "id" => "Navigation", # We only want one category among 3 233 | "selector" => ".navbar-right a", 
234 | "delay" => "" 235 | }] 236 | 237 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 238 | 239 | {:ok, spider} = WebScraper.start_link(sitemap) 240 | data = WebScraper.export(spider) 241 | 242 | expected = [%{ 243 | "Category" => "Computers", 244 | "Navigation" => "Download" 245 | }, %{ 246 | "Category" => "Phones", 247 | "Navigation" => "Download" 248 | }] 249 | assert data === expected 250 | end 251 | 252 | test "scrape mixed between single and multiple, sorted" do 253 | selectors = [%{ 254 | "parentSelectors" => ["_root"], 255 | "type" => "SelectorText", 256 | "multiple" => true, 257 | "id" => "Category", 258 | "selector" => "a.category-link ", 259 | "delay" => "" 260 | }, %{ 261 | "parentSelectors" => ["_root"], 262 | "type" => "SelectorText", 263 | "multiple" => false, 264 | "id" => "Page Title", 265 | "selector" => ".jumbotron h1", 266 | "delay" => "" 267 | }, %{ 268 | "parentSelectors" => ["_root"], 269 | "type" => "SelectorText", 270 | "multiple" => true, 271 | "id" => "Navigation", 272 | "selector" => ".navbar-right a", 273 | "delay" => "" 274 | }] 275 | 276 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 277 | 278 | {:ok, spider} = WebScraper.start_link(sitemap) 279 | data = WebScraper.export(spider) 280 | 281 | expected = [%{ 282 | "Category" => "Computers", 283 | "Page Title" => "E-commerce training site", 284 | "Navigation" => "Download" 285 | }, %{ 286 | "Category" => "Phones", 287 | "Page Title" => "E-commerce training site", 288 | "Navigation" => "Download" 289 | }, %{ 290 | "Category" => "Computers", 291 | "Page Title" => "E-commerce training site", 292 | "Navigation" => "GitHub" 293 | }, %{ 294 | "Category" => "Phones", 295 | "Page Title" => "E-commerce training site", 296 | "Navigation" => "GitHub" 297 | }, %{ 298 | "Category" => "Computers", 299 | "Page Title" => "E-commerce training site", 300 | "Navigation" => "Donate" 301 | }, %{ 302 | "Category" => "Phones", 303 | "Page Title" => "E-commerce training site", 304 | "Navigation" => "Donate" 305 | }] 306 | assert ScrapexAsserter.array_equals(data, expected) 307 | end 308 | 309 | test "follow nodes under _root first" do 310 | selectors = [%{ 311 | "parentSelectors" => ["_root"], 312 | "type" => "SelectorText", 313 | "multiple" => false, 314 | "id" => "Page Title", 315 | "selector" => ".jumbotron h1", 316 | "delay" => "" 317 | }, %{ 318 | "parentSelectors" => ["Category"], 319 | "type" => "SelectorText", 320 | "multiple" => false, 321 | "id" => "SubCategory", 322 | "selector" => "a.subcategory-link", 323 | "delay" => "" 324 | }, %{ 325 | "parentSelectors" => ["_root"], 326 | "type" => "SelectorLink", 327 | "multiple" => false, 328 | "id" => "Category", 329 | "selector" => "a.category-link", 330 | "delay" => "" 331 | }] 332 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 333 | 334 | {:ok, spider} = WebScraper.start_link(sitemap) 335 | [data] = WebScraper.export(spider) 336 | 337 | expected = %{ 338 | "Category" => "Computers", 339 | "Page Title" => "E-commerce training site", 340 | "SubCategory" => "Laptops" 341 | } 342 | assert data === expected 343 | end 344 | 345 | test "whether to retrieve multiple items from selector" do 346 | selectors = [%{ 347 | "parentSelectors" => ["_root"], 348 | "type" => "SelectorText", 349 | "multiple" => false, 350 | "id" => "Page Title", 351 | "selector" => ".jumbotron h1", 352 | "delay" => "" 353 | }, %{ 354 | "parentSelectors" => ["Category"], 355 | "type" => "SelectorText", 356 | "multiple" => true, 357 | "id" => "SubCategory", 358 | "selector" => 
"a.subcategory-link", 359 | "delay" => "" 360 | }, %{ 361 | "parentSelectors" => ["_root"], 362 | "type" => "SelectorLink", 363 | "multiple" => true, 364 | "id" => "Category", 365 | "selector" => "a.category-link", 366 | "delay" => "" 367 | }] 368 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 369 | 370 | {:ok, spider} = WebScraper.start_link(sitemap) 371 | data = WebScraper.export(spider) 372 | 373 | # There are 2 categories, and 3 subcategories total, so there 374 | # must be 3 results. 375 | 376 | assert length(data) === 3 377 | expected = [%{ 378 | "Category" => "Computers", 379 | "Page Title" => "E-commerce training site", 380 | "SubCategory" => "Laptops" 381 | }, %{ 382 | "Category" => "Computers", 383 | "Page Title" => "E-commerce training site", 384 | "SubCategory" => "Tablets" 385 | }, %{ 386 | "Category" => "Phones", 387 | "Page Title" => "E-commerce training site", 388 | "SubCategory" => "Touch" 389 | }] 390 | 391 | assert data === expected 392 | end 393 | 394 | test "more level of selectors" do 395 | selectors = [%{ 396 | "parentSelectors" => ["_root"], 397 | "type" => "SelectorText", 398 | "multiple" => false, 399 | "id" => "Page Title", 400 | "selector" => ".jumbotron h1", 401 | "delay" => "" 402 | }, %{ 403 | "parentSelectors" => ["Category"], 404 | "type" => "SelectorLink", 405 | "multiple" => true, 406 | "id" => "SubCategory", 407 | "selector" => "a.subcategory-link", 408 | "delay" => "" 409 | }, %{ 410 | "parentSelectors" => ["_root"], 411 | "type" => "SelectorLink", 412 | "multiple" => true, 413 | "id" => "Category", 414 | "selector" => "a.category-link", 415 | "delay" => "" 416 | }, %{ 417 | "parentSelectors" => ["SubCategory"], 418 | "type" => "SelectorText", 419 | "multiple" => true, 420 | "id" => "Name", 421 | "selector" => "a.title", 422 | "delay" => "" 423 | }] 424 | 425 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 426 | 427 | {:ok, spider} = WebScraper.start_link(sitemap) 428 | data = WebScraper.export(spider) 429 | 430 | # There are 2 categories, and 3 subcategories total, each has 6 431 | # items, for a total of 18 items. 
432 | 433 | assert length(data) === 18 434 | end 435 | 436 | test "parse level with no child" do 437 | selectors = [%{ 438 | "parentSelectors" => ["_root"], 439 | "type" => "SelectorText", 440 | "multiple" => false, 441 | "id" => "Page Title", 442 | "selector" => ".jumbotron h1", 443 | "delay" => "" 444 | }, %{ 445 | "parentSelectors" => ["_root"], 446 | "type" => "SelectorLink", 447 | "multiple" => true, 448 | "id" => "Category", 449 | "selector" => "a.category-link", 450 | "delay" => "" 451 | }] 452 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 453 | 454 | {:ok, spider} = WebScraper.start_link(sitemap) 455 | data = WebScraper.export(spider) 456 | 457 | expected = [%{ 458 | "Category" => "Computers", 459 | "Page Title" => "E-commerce training site" 460 | }, %{ 461 | "Category" => "Phones", 462 | "Page Title" => "E-commerce training site" 463 | }] 464 | 465 | assert data === expected 466 | end 467 | 468 | test "don't put element from SelectorElement in result" do 469 | selectors = [%{ 470 | "parentSelectors" => ["_root"], 471 | "type" => "SelectorText", 472 | "multiple" => false, 473 | "id" => "Page Title", 474 | "selector" => ".jumbotron h1", 475 | "delay" => "" 476 | }, %{ 477 | "parentSelectors" => ["Category"], 478 | "type" => "SelectorText", 479 | "multiple" => true, 480 | "id" => "SubCategory", 481 | "selector" => "a.subcategory-link", 482 | "delay" => "" 483 | }, %{ 484 | "parentSelectors" => ["_root"], 485 | "type" => "SelectorLink", 486 | "multiple" => true, 487 | "id" => "Category", 488 | "selector" => "a.category-link", 489 | "delay" => "" 490 | }, %{ 491 | "parentSelectors" => ["Category"], 492 | "type" => "SelectorElement", 493 | "multiple" => true, 494 | "id" => "Item", 495 | "selector" => "div.thumbnail", 496 | "delay" => "" 497 | }] 498 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 499 | 500 | {:ok, spider} = WebScraper.start_link(sitemap) 501 | data = WebScraper.export(spider) 502 | 503 | # There are 2 categories, and 3 subcategories total, so there 504 | # must be 3 results. 
505 | 506 | assert length(data) === 3 507 | expected = [%{ 508 | "Category" => "Computers", 509 | "Page Title" => "E-commerce training site", 510 | "SubCategory" => "Laptops" 511 | }, %{ 512 | "Category" => "Computers", 513 | "Page Title" => "E-commerce training site", 514 | "SubCategory" => "Tablets" 515 | }, %{ 516 | "Category" => "Phones", 517 | "Page Title" => "E-commerce training site", 518 | "SubCategory" => "Touch" 519 | }] 520 | 521 | assert data === expected 522 | end 523 | 524 | test "can group selectors with SelectorElement", context do 525 | selectors = [%{ 526 | "parentSelectors" => ["_root"], 527 | "type" => "SelectorLink", 528 | "multiple" => true, 529 | "id" => "Category", 530 | "selector" => "a.category-link", 531 | "delay" => "" 532 | }, %{ 533 | "parentSelectors" => ["Category"], 534 | "type" => "SelectorElement", 535 | "multiple" => true, 536 | "id" => "Item", 537 | "selector" => "div.thumbnail", 538 | "delay" => "" 539 | }, %{ 540 | "parentSelectors" => ["Item"], 541 | "type" => "SelectorText", 542 | "multiple" => false, 543 | "id" => "Name", 544 | "selector" => "a.title", 545 | "regex" => "", 546 | "delay" => "" 547 | }, %{ 548 | "parentSelectors" => ["Item"], 549 | "type" => "SelectorText", 550 | "multiple" => false, 551 | "id" => "Price", 552 | "selector" => "h4.pull-right", 553 | "regex" => "", 554 | "delay" => "" 555 | }, %{ 556 | "parentSelectors" => ["Item"], 557 | "type" => "SelectorText", 558 | "multiple" => false, 559 | "id" => "Description", 560 | "selector" => "p.description", 561 | "regex" => "", 562 | "delay" => "" 563 | }] 564 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 565 | 566 | {:ok, spider} = WebScraper.start_link(sitemap) 567 | data = WebScraper.export(spider) 568 | 569 | # There are 2 categories. Each category has 570 | # 3 products on display. Total 2 x 3 = 6 items. 571 | 572 | assert length(data) === 6 573 | items = context.items 574 | 575 | Enum.concat([@computers, @phones]) 576 | |> Enum.map(&Enum.at(items, &1)) 577 | |> Enum.map(fn(item) -> 578 | item 579 | |> Map.delete("Page") 580 | |> Map.delete("SubCategory") 581 | end) 582 | |> ScrapexAsserter.array_equals(data) 583 | |> assert 584 | end 585 | end -------------------------------------------------------------------------------- /test/scrapex_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ScrapexTest do 2 | use ExUnit.Case 3 | 4 | test "the truth" do 5 | assert 1 + 1 == 2 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | :application.start :inets 2 | 3 | server_root = '#{Path.absname("test/sample_pages")}' 4 | test_server_config = [ 5 | port: 9090, 6 | server_name: 'localhost', 7 | server_root: server_root, 8 | document_root: server_root, 9 | bind_address: {127, 0, 0, 1}, 10 | directory_index: ['index.htm', 'index.html'] 11 | ] 12 | 13 | {:ok, pid} = :inets.start(:httpd, test_server_config) 14 | 15 | System.at_exit fn(_exit_status) -> 16 | :ok = :inets.stop(:httpd, pid) 17 | end 18 | 19 | ExUnit.start() 20 | 21 | defmodule ScrapexAsserter do 22 | def array_equals(left, right) do 23 | length(left) == length(right) and Enum.all?(left, &Enum.member?(right, &1)) # same members, in any order 24 | end 25 | end --------------------------------------------------------------------------------