├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.exs ├── dev.exs ├── prod.exs └── test.exs ├── doc ├── 404.html ├── Scrapex.GenSpider.Response.html ├── Scrapex.GenSpider.html ├── Scrapex.Selector.html ├── Scrapex.html ├── assets │ └── logo.png ├── dist │ ├── app.css │ ├── app.js │ └── sidebar_items.js ├── extra-api-reference.html ├── extra-readme.html ├── fonts │ ├── icomoon.eot │ ├── icomoon.svg │ ├── icomoon.ttf │ └── icomoon.woff └── index.html ├── lib ├── scrapex.ex └── scrapex │ ├── gen_spider.ex │ ├── gen_spider │ ├── README.md │ ├── request.ex │ └── response.ex │ ├── selector.ex │ └── spider │ └── webscraper.ex ├── logo.png ├── mix.exs ├── mix.lock └── test ├── sample_pages ├── e-commerce │ └── static │ │ ├── computers │ │ ├── index.html │ │ ├── index_files │ │ │ ├── cart2.png │ │ │ ├── site.js │ │ │ └── style.css │ │ ├── laptops │ │ │ └── index.html │ │ └── tablets │ │ │ └── index.html │ │ ├── index.html │ │ └── phones │ │ ├── index.html │ │ └── touch │ │ └── index.html └── example.com.html ├── scrapex ├── gen_spider_test.exs ├── selector_test.exs └── spider │ ├── example_test.exs │ ├── webscraper.csv │ └── webscraper_test.exs ├── scrapex_test.exs └── test_helper.exs /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /deps 3 | erl_crash.dump 4 | *.ez 5 | .DS_Store 6 | *.beam -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Son Tran-Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapex 2 | ======= 3 | 4 | An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way. 5 | 6 | ## Features 7 | 8 | ### Fast and powerful 9 | Write the rules to extract the data and let Scrapex do the rest. 10 | 11 | ### Easily extensible 12 | Extensible by design, plug new functionality easily without having to touch the core. 13 | 14 | ### Portable, Elixir 15 | Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices. 
16 | 17 | ## Build your own webcrawlers 18 | 19 | alias Scrapex.GenSpider 20 | defmodule StackOverflowSpider do 21 | use GenSpider 22 | import Scrapex.Selector 23 | 24 | def parse(response, state) do 25 | result = response.body 26 | |> select(".question-summary h3 a") 27 | |> extract("href") 28 | |> Enum.map(fn(href) -> 29 | GenSpider.Response.url_join(response, href) 30 | |> GenSpider.request(&parse_question/1) 31 | |> GenSpider.await 32 | end) 33 | {:ok, result, state} 34 | end 35 | 36 | defp parse_question({:ok, response}) do 37 | html = response.body 38 | [title] = html |> select("h1 a") |> extract() 39 | question = html |> select(".question") 40 | [body] = question |> select(".post-text") |> extract 41 | [votes] = question |> select(".vote-count-post") |> extract 42 | tags = question |> select(".post-tag") |> extract 43 | 44 | %{title: title, body: body, votes: votes, tags: tags} 45 | end 46 | end 47 | urls = ["http://stackoverflow.com/questions?sort=votes"] 48 | opts = [name: :stackoverflow_spider, urls: urls] 49 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts) 50 | questions = GenSpider.export(spider) 51 | #=> "[{} | _]" 52 | 53 | ## TODOS 54 | 55 | - [x] `GenSpider behaviour`. 56 | - [x] Request URL and pass response to `parse/2` callback. 57 | - [x] One time spider 58 | - [x] CSS selector 59 | - [ ] XPath selector 60 | - [x] Yield for requests in `parse/2` 61 | - [x] Follow redirects 62 | - [ ] Set custom request headers 63 | - [ ] Respect robots.txt 64 | - [ ] Resolve DNS once only 65 | - [ ] Domain blacklist 66 | - [ ] Parse response chunk by chunk 67 | - [ ] CLI -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for third- 9 | # party users, it should be done in your mix.exs file. 10 | 11 | # Sample configuration: 12 | # 13 | # config :logger, :console, 14 | # level: :info, 15 | # format: "$date $time [$level] $metadata$message\n", 16 | # metadata: [:user_id] 17 | 18 | # It is also possible to import configuration files, relative to this 19 | # directory. For example, you can emulate configuration per environment 20 | # by uncommenting the line below and defining dev.exs, test.exs and such. 21 | # Configuration from the imported file will override the ones defined 22 | # here (which is why it is important to import them last). 
23 | # 24 | import_config "#{Mix.env}.exs" 25 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Do not print debug messages in production 4 | config :logger, level: :info -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Print only warnings and errors during test 4 | config :logger, level: :warn -------------------------------------------------------------------------------- /doc/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 404 – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

Page not found

65 | 66 |

Sorry, but the page you were trying to get to does not exist. You
67 | may want to try searching this site using the sidebar or using our
68 | API Reference page to find what
69 | you were looking for.

70 | 71 | 83 |
84 |
85 |
86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /doc/Scrapex.GenSpider.Response.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex.GenSpider.Response – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex.GenSpider.Response 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 |
76 |

Utilities for working with the response returned from GenSpider.

77 | 78 |
79 | 80 | 81 | 82 |
83 |

84 | 85 | 86 | 87 | Summary 88 |

89 | 90 |
91 |

92 | Types 93 |

94 |
95 |
96 | t() 97 |
98 | 99 |
100 | 101 |
102 | 103 | 104 | 105 |
106 |

107 | Functions 108 |

109 |
110 | 113 | 114 |

Join a path relative to the response’s URL

115 |
116 | 117 |
118 | 119 |
120 | 121 | 122 | 123 | 124 | 125 | 126 |
127 | 128 | 129 | 130 |
131 |

132 | 133 | 134 | 135 | Types 136 |

137 |
138 |
139 |
t :: %Scrapex.GenSpider.Response{url: binary, body: binary}
140 | 141 |
142 | 143 |
144 |
145 | 146 | 147 | 148 |
149 |

150 | 151 | 152 | 153 | Functions 154 |

155 |
156 |
157 | 158 | 159 | 160 | url_join(response, path) 161 | 162 | 163 | 164 | 165 | 166 |
167 | 168 |
169 |

Specs

170 |
171 | 172 |
url_join(t, binary) :: binary
173 | 174 |
175 |
176 | 177 |
178 |

Join a path relative to the response’s URL.

179 |

Examples

180 |
iex> alias Scrapex.GenSpider.Response
181 | iex> response = %Response{url: "http://www.scrapex.com/subfolder"}
182 | iex> Response.url_join(response, "/subfolder2")
183 | "http://www.scrapex.com/subfolder2"
184 | iex> Response.url_join(response, "subsubfolder")
185 | "http://www.scrapex.com/subfolder/subsubfolder"
186 | 187 |
188 |
189 | 190 |
191 | 192 | 193 | 194 | 195 | 196 | 208 |
209 |
210 |
211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /doc/Scrapex.Selector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex.Selector – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex.Selector 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 |
76 |

Utilities for extracting data from markup language.

77 | 78 |
79 | 80 | 81 | 82 |
83 |

84 | 85 | 86 | 87 | Summary 88 |

89 | 90 |
91 |

92 | Types 93 |

94 |
95 |
96 | attribute() 97 |
98 | 99 |

Attribute of a node

100 |
101 | 102 |
103 |
104 |
105 | children() 106 |
107 | 108 |
109 |
110 |
111 | html_node() 112 |
113 | 114 |
115 |
116 |
117 | html_tree() 118 |
119 | 120 |

A tree of HTML nodes, or a node itself if only one

121 |
122 | 123 |
124 |
125 |
126 | name() 127 |
128 | 129 |

Name of the tag or attribute

130 |
131 | 132 |
133 |
134 |
135 | selector() 136 |
137 | 138 |
139 |
140 |
141 | t() 142 |
143 | 144 |
145 | 146 |
147 | 148 | 149 | 150 |
151 |

152 | Functions 153 |

154 |
155 |
156 | extract(selector) 157 |
158 | 159 |

Extracts content or attribute value for a selection

160 |
161 | 162 |
163 |
164 | 167 | 168 |
169 |
170 | 173 | 174 |

Generates a selection for a particular selector

175 |
176 | 177 |
178 | 179 |
180 | 181 | 182 | 183 | 184 | 185 | 186 |
187 | 188 | 189 | 190 |
191 |

192 | 193 | 194 | 195 | Types 196 |

197 |
198 |
199 |
attribute :: {name, binary}
200 | 201 |

Attribute of a node

202 |
203 | 204 |
205 |
206 |
children :: [html_node]
207 | 208 |
209 |
210 |
html_node :: {name, [attribute], children}
211 | 212 |
213 |
214 |
html_tree :: html_node | [html_node]
215 | 216 |

A tree of HTML nodes, or a node itself if only one

217 |
218 | 219 |
220 |
221 |
name :: binary
222 | 223 |

Name of the tag or attribute

224 |
225 | 226 |
227 |
228 |
selector :: binary
229 | 230 |
231 |
232 |
t :: %Scrapex.Selector{tree: html_tree}
233 | 234 |
235 | 236 |
237 |
238 | 239 | 240 | 241 |
242 |

243 | 244 | 245 | 246 | Functions 247 |

248 |
249 |
250 | 251 | 252 | 253 | extract(selector) 254 | 255 | 256 | 257 | 258 | 259 |
260 | 261 |
262 |

Extracts content or attribute value for a selection.

263 | 264 |
265 |
266 |
267 |
268 | 269 | 270 | 271 | extract(selector, attr) 272 | 273 | 274 | 275 | 276 | 277 |
278 | 279 |
280 |

Specs

281 |
282 | 283 |
extract(t, name) :: [binary]
284 | 285 |
286 |
287 | 288 |
289 | 290 |
291 |
292 |
293 |
294 | 295 | 296 | 297 | select(html, selector) 298 | 299 | 300 | 301 | 302 | 303 |
304 | 305 |
306 |

Specs

307 |
308 | 309 |
select(binary | t, selector) :: t
310 | 311 |
312 |
313 | 314 |
315 |

Generates a selection for a particular selector.

316 |

The return value is a Selector.t

317 | 318 |
319 |
320 | 321 |
322 | 323 | 324 | 325 | 326 | 327 | 339 |
340 |
341 |
342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /doc/Scrapex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapex – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

65 | Scrapex 66 | 67 | 68 | 69 | 70 | 71 | 72 |

73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 97 |
98 |
99 |
100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /doc/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/assets/logo.png -------------------------------------------------------------------------------- /doc/dist/app.css: -------------------------------------------------------------------------------- 1 | @import url(https://fonts.googleapis.com/css?family=Lato:400,300,700,900|Merriweather:300italic,300,700,700italic|Inconsolata:400,700);.hljs,article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block}img,legend{border:0}.sidebar a,.sidebar-toggle{transition:color .3s ease-in-out}.sidebar .sidebar-search .sidebar-searchInput:focus,.sidebar .sidebar-search .sidebar-searchInput:hover,.sidebar-toggle:active,.sidebar-toggle:focus,.sidebar-toggle:hover,a:active,a:hover{outline:0}.results ul,.sidebar ul{list-style:none}.hljs-comment{color:#8e908c}.css .hljs-class,.css .hljs-id,.css .hljs-pseudo,.hljs-attribute,.hljs-regexp,.hljs-tag,.hljs-variable,.html .hljs-doctype,.ruby .hljs-constant,.xml .hljs-doctype,.xml .hljs-pi,.xml .hljs-tag .hljs-title{color:#c82829}.hljs-built_in,.hljs-constant,.hljs-literal,.hljs-number,.hljs-params,.hljs-pragma,.hljs-preprocessor{color:#f5871f}.css .hljs-rule .hljs-attribute,.ruby .hljs-class .hljs-title{color:#eab700}.hljs-header,.hljs-inheritance,.hljs-name,.hljs-string,.hljs-value,.ruby .hljs-symbol,.xml .hljs-cdata{color:#718c00}.css .hljs-hexcolor,.hljs-title{color:#3e999f}.coffeescript .hljs-title,.hljs-function,.javascript .hljs-title,.perl .hljs-sub,.python .hljs-decorator,.python .hljs-title,.ruby .hljs-function .hljs-title,.ruby .hljs-title .hljs-keyword{color:#4271ae}.hljs-keyword,.javascript .hljs-function{color:#8959a8}.hljs{overflow-x:auto;background:#fff;color:#4d4d4c;padding:.5em;-webkit-text-size-adjust:none}legend,td,th{padding:0}.coffeescript .javascript,.javascript .xml,.tex .hljs-formula,.xml .css,.xml .hljs-cdata,.xml .javascript,.xml .vbscript{opacity:.5}/*! 
normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}audio,canvas,progress,video{display:inline-block;vertical-align:baseline}audio:not([controls]){display:none;height:0}[hidden],template{display:none}a{background-color:transparent}abbr[title]{border-bottom:1px dotted}b,optgroup,strong{font-weight:700}dfn{font-style:italic}h1{font-size:2em;margin:.67em 0}mark{background:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}svg:not(:root){overflow:hidden}figure{margin:1em 40px}hr{box-sizing:content-box;height:0}pre,textarea{overflow:auto}code,kbd,pre,samp{font-family:monospace,monospace;font-size:1em}button,input,optgroup,select,textarea{color:inherit;font:inherit;margin:0}button{overflow:visible}.main,body,html{overflow:hidden}button,select{text-transform:none}button,html input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer}button[disabled],html input[disabled]{cursor:default}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}input{line-height:normal}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=number]::-webkit-inner-spin-button,input[type=number]::-webkit-outer-spin-button{height:auto}.content,.main,.sidebar,body,html{height:100%}input[type=search]{-webkit-appearance:textfield;box-sizing:content-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}fieldset{border:1px solid silver;margin:0 2px;padding:.35em .625em .75em}table{border-collapse:collapse;border-spacing:0}@font-face{font-family:icomoon;src:url(../fonts/icomoon.eot?h5z89e);src:url(../fonts/icomoon.eot?#iefixh5z89e) format('embedded-opentype'),url(../fonts/icomoon.ttf?h5z89e) format('truetype'),url(../fonts/icomoon.woff?h5z89e) format('woff'),url(../fonts/icomoon.svg?h5z89e#icomoon) format('svg');font-weight:400;font-style:normal}.icon-elem,[class*=" icon-"],[class^=icon-]{font-family:icomoon;speak:none;font-style:normal;font-weight:400;font-variant:normal;text-transform:none;line-height:1;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.sidebar,body{font-family:Lato,sans-serif}.icon-link:before{content:"\e005"}.icon-search:before{content:"\e036"}.icon-cross:before{content:"\e117"}.icon-menu:before{content:"\e120"}.icon-angle-right:before{content:"\f105"}.icon-code:before{content:"\f121"}body,html{box-sizing:border-box;width:100%}body{margin:0;font-size:16px;line-height:1.6875em}*,:after,:before{box-sizing:inherit}.main{display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}.sidebar,body.sidebar-closed .sidebar{display:none}.sidebar{-webkit-flex:0 1 300px;-moz-flex:0 1 300px;-ms-flex:0 1 300px;flex:0 1 300px;-ms-flex-positive:0;-ms-flex-negative:1;-ms-flex-preferred-size:300px;-webkit-box-orient:vertical;-moz-box-orient:vertical;-webkit-box-direction:normal;-moz-box-direction:normal;min-height:0;-webkit-flex-direction:column;-moz-flex-direction:column;-ms-flex-direction:column;flex-direction:column;position:absolute;z-index:999}.content{-webkit-flex:1 1 .01%;-moz-flex:1 1 .01%;-ms-flex:1 1 .01%;flex:1 1 .01%;-ms-flex-positive:1;-ms-flex-negative:1;-ms-flex-preferred-size:.01%;overflow-y:auto;-webkit-overflow-scrolling:touch}.content-inner{max-width:949px;margin:0 auto;padding:3px 60px}@media screen and 
(max-width:768px){.content-inner{padding:27px 20px 27px 40px}}body.sidebar-closed .sidebar-toggle{display:block}.sidebar-toggle{position:fixed;z-index:99;left:18px;top:8px;background-color:transparent;border:none;padding:0;font-size:16px}.sidebar-toggle:hover{color:#e1e1e1}@media screen and (min-width:768px){.sidebar-toggle{display:none}}.sidebar{font-size:14px;line-height:18px;background:#373f52;color:#d5dae6;overflow:hidden}.sidebar .sidebar-toggle{display:block;left:275px;color:#e1e1e1}.sidebar .sidebar-toggle:hover{color:#fff}.sidebar ul li{margin:0;padding:0 10px}.sidebar a{color:#d5dae6;text-decoration:none}.sidebar a:hover{color:#fff}.sidebar .sidebar-projectLink{margin:23px 30px 0}.sidebar .sidebar-projectDetails{display:inline-block;text-align:right;vertical-align:top;margin-top:6px}.sidebar .sidebar-projectImage{display:inline-block;max-width:64px;max-height:64px;margin-left:15px;vertical-align:bottom}.sidebar .sidebar-projectName{font-weight:700;font-size:24px;line-height:30px;color:#fff;margin:0;padding:0;max-width:155px}.sidebar .sidebar-projectVersion{margin:0;padding:0;font-weight:300;font-size:16px;line-height:20px;color:#fff}.sidebar .sidebar-listNav{padding:0 30px}.sidebar .sidebar-listNav li,.sidebar .sidebar-listNav li a{text-transform:uppercase;font-weight:300;font-size:13px}.sidebar .sidebar-listNav li{padding-left:17px;border-left:3px solid transparent;transition:all .3s linear;line-height:27px}.sidebar .sidebar-listNav li.selected,.sidebar .sidebar-listNav li.selected a,.sidebar .sidebar-listNav li:hover,.sidebar .sidebar-listNav li:hover a{border-color:#9768d1;color:#fff}.sidebar .sidebar-search{margin:23px 30px 18px;display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}.sidebar .sidebar-search i.icon-search{font-size:14px;color:#d5dae6}.sidebar #full-list li.clicked>a,.sidebar #full-list ul li.active a{color:#fff}.sidebar .sidebar-search .sidebar-searchInput{background-color:transparent;border:none;border-radius:0;border-bottom:1px solid #959595;margin-left:5px}.sidebar #full-list{margin:4px 0 0 30px;padding:0 20px;overflow-y:auto;-webkit-overflow-scrolling:touch;-webkit-flex:1 1 .01%;-moz-flex:1 1 .01%;-ms-flex:1 1 .01%;flex:1 1 .01%;-ms-flex-positive:1;-ms-flex-negative:1;-ms-flex-preferred-size:.01%}.sidebar #full-list ul{margin:0 20px;padding:9px 0 18px}.sidebar #full-list ul li{font-weight:300;line-height:18px}.sidebar #full-list ul li ul{display:none;padding:9px 0}.sidebar #full-list ul li ul li{border-left:1px solid #959595;padding:0 10px}.sidebar #full-list ul li ul li.active:before{font-family:icomoon;speak:none;font-style:normal;font-weight:400;font-variant:normal;text-transform:none;line-height:1;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;content:"\f105";margin-left:-10px;font-size:16px;margin-right:5px}.sidebar #full-list ul li.active{border-left:none}.sidebar #full-list ul li.active ul{display:block}.sidebar #full-list li{padding:0;line-height:27px}.sidebar #full-list li.collapsed ul{display:none}@media screen and (min-width:768px){.sidebar{position:relative;display:-webkit-flex;display:-ms-flexbox;display:-ms-flex;display:flex}}@media screen and (max-height:500px){.sidebar{overflow-y:auto}.sidebar #full-list{overflow:visible}}.content-inner{font-family:Merriweather,serif;font-size:1em;line-height:1.6875em}.content-inner h1,.content-inner h2,.content-inner h3,.content-inner h4,.content-inner h5,.content-inner h6{font-family:Lato,sans-serif;font-weight:800;line-height:1.5em;word-wrap:break-word}.content-inner 
h1{font-size:2em;margin:1em 0 .5em}.content-inner h1.section-heading{margin:1.5em 0 .5em}.content-inner h1 small{font-weight:300}.content-inner h1 a.view-source{font-size:1.2rem}.content-inner h2{font-size:1.625em;margin:1em 0 .5em;font-weight:400}.content-inner h3{font-size:1.375em;margin:1em 0 .5em;font-weight:600}.content-inner a{color:#000;text-decoration:none;text-shadow:.03em 0 #fff,-.03em 0 #fff,0 .03em #fff,0 -.03em #fff,.06em 0 #fff,-.06em 0 #fff,.09em 0 #fff,-.09em 0 #fff,.12em 0 #fff,-.12em 0 #fff,.15em 0 #fff,-.15em 0 #fff;background-image:linear-gradient(#fff,#fff),linear-gradient(#fff,#fff),linear-gradient(#000,#000);background-size:.05em 1px,.05em 1px,1px 1px;background-repeat:no-repeat,no-repeat,repeat-x;background-position:0 90%,100% 90%,0 90%}.content-inner a:selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner a:-moz-selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner a *,.content-inner a :after,.content-inner a :before,.content-inner a:after,.content-inner a:before{text-shadow:none}.content-inner a:visited{color:#000}.content-inner ul li{line-height:1.5em}.content-inner a.view-source{float:right;color:#959595;background:0 0;border:none;text-shadow:none;transition:color .3s ease-in-out}.content-inner a.view-source:hover{color:#373f52}.content-inner blockquote{font-style:italic;margin:.5em 0;padding:.25em 1.5em;border-left:3px solid #e1e1e1;display:inline-block}.content-inner blockquote :first-child{padding-top:0;margin-top:0}.content-inner blockquote :last-child{padding-bottom:0;margin-bottom:0}.content-inner table{margin:2em 0}.content-inner th{text-align:left;font-family:Lato,sans-serif;text-transform:uppercase;font-weight:600;padding-bottom:.5em}.content-inner tr{border-bottom:1px solid #d5dae6;vertical-align:bottom;height:2.5em}.content-inner .summary .summary-row .summary-signature a,.content-inner .summary h2 a{background:0 0;border:none;text-shadow:none}.content-inner td,.content-inner th{padding-left:1em;line-height:2em}.content-inner h1.section-heading:hover a.hover-link{opacity:1;text-decoration:none}.content-inner h1.section-heading a.hover-link{transition:opacity .3s ease-in-out;display:inline-block;opacity:0;padding:.3em .6em .6em;line-height:1em;margin-left:-2.7em;background:0 0;border:none;text-shadow:none;font-size:16px;vertical-align:middle}.content-inner .summary h2{font-weight:600}.content-inner .summary .summary-row .summary-signature{font-family:Inconsolata,Menlo,Courier,monospace;font-weight:600}.content-inner .summary .summary-row .summary-synopsis{font-family:Merriweather,serif;font-style:italic;padding:0 .5em;margin:0 0 .5em}.content-inner .detail-header,.content-inner code{font-family:Inconsolata,Menlo,Courier,monospace}.content-inner .summary .summary-row .summary-synopsis p{margin:0;padding:0}.content-inner .detail-header{margin:2.5em 0 .5em;padding:.5em 1em;background:#f7f7f7;border-left:3px solid #9768d1;font-size:1em;position:relative}.content-inner .detail-header .signature{font-size:1rem;font-weight:600}.content-inner .detail-header:hover a.detail-link{opacity:1;text-decoration:none}.content-inner .detail-header 
a.detail-link{transition:opacity .3s ease-in-out;position:absolute;top:0;left:0;display:block;opacity:0;padding:.6em;line-height:1.5em;margin-left:-2.5em;background:0 0;border:none;text-shadow:none}.content-inner .specs .specs-list pre code,.content-inner .types .types-list .type-detail pre code{padding:0 .5em;border:none}.content-inner .specs .specs-list{margin:0 0 2em}.content-inner .specs .specs-list pre{margin:.5em 0}.content-inner .types .types-list .type-detail{margin-bottom:2em}.content-inner .types .types-list .type-detail pre{margin:.5em 0}.content-inner .types .types-list .type-detail .typespec-doc{padding:0 1.5em}.content-inner a.no-underline,.content-inner code a{color:#9768d1;text-shadow:none;background-image:none}.content-inner a.no-underline:active,.content-inner a.no-underline:focus,.content-inner a.no-underline:hover,.content-inner a.no-underline:visited,.content-inner code a:active,.content-inner code a:focus,.content-inner code a:hover,.content-inner code a:visited{color:#9768d1}.content-inner code{font-size:15px;font-style:normal;line-height:24px;font-weight:400;background-color:#f7f9fc;border:1px solid #e1e1e1;vertical-align:middle;border-radius:2px;padding:0 .5em}.content-inner pre{margin:1.5em 0}.content-inner pre.spec{margin:0}.content-inner pre.spec code{padding:0}.content-inner pre code.hljs{white-space:inherit;padding:1em 1.5em;background-color:#f7f9fc}.content-inner .footer{margin:4em auto 1em;text-align:center;font-style:italic;font-size:14px;color:#959595}.content-inner .footer .line{display:inline-block}.content-inner .footer a{color:#959595;text-decoration:none;text-shadow:.03em 0 #fff,-.03em 0 #fff,0 .03em #fff,0 -.03em #fff,.06em 0 #fff,-.06em 0 #fff,.09em 0 #fff,-.09em 0 #fff,.12em 0 #fff,-.12em 0 #fff,.15em 0 #fff,-.15em 0 #fff;background-image:linear-gradient(#fff,#fff),linear-gradient(#fff,#fff),linear-gradient(#959595,#959595);background-size:.05em 1px,.05em 1px,1px 1px;background-repeat:no-repeat,no-repeat,repeat-x;background-position:0 90%,100% 90%,0 90%}.content-inner .footer a:selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.content-inner .footer a:-moz-selection{text-shadow:.03em 0 #b4d5fe,-.03em 0 #b4d5fe,0 .03em #b4d5fe,0 -.03em #b4d5fe,.06em 0 #b4d5fe,-.06em 0 #b4d5fe,.09em 0 #b4d5fe,-.09em 0 #b4d5fe,.12em 0 #b4d5fe,-.12em 0 #b4d5fe,.15em 0 #b4d5fe,-.15em 0 #b4d5fe;background:#b4d5fe}.results .result-id a,a.close-search{text-shadow:none;background-image:none;transition:color .3s ease-in-out}.content-inner .footer a *,.content-inner .footer a :after,.content-inner .footer a :before,.content-inner .footer a:after,.content-inner .footer a:before{text-shadow:none}.content-inner .footer a:visited{color:#959595}a.close-search{margin-top:-3em;display:block;float:right}a.close-search:active,a.close-search:focus,a.close-search:visited{color:#000}a.close-search:hover{color:#9768d1}.results .result-id{font-size:1.2em}.results .result-id a:active,.results .result-id a:focus,.results .result-id a:visited{color:#000}.results .result-id a:hover{color:#9768d1}.results .result-elem em,.results .result-id em{font-style:normal;color:#9768d1}.results ul{margin:0;padding:0}@media print{#sidebar{display:none}} -------------------------------------------------------------------------------- /doc/dist/sidebar_items.js: 
-------------------------------------------------------------------------------- 1 | sidebarNodes={"exceptions":[],"extras":[{"id":"extra-api-reference","title":"API Reference","headers":[]},{"id":"extra-readme","title":"README","headers":[{"id":" Features","anchor":"Features"},{"id":" Build your own webcrawlers","anchor":"Build-your-own-webcrawlers"},{"id":" TODOS","anchor":"TODOS"}]}],"modules":[{"id":"Scrapex","title":"Scrapex"},{"id":"Scrapex.GenSpider","title":"Scrapex.GenSpider","functions":[{"id":"await/1","anchor":"await/1"},{"id":"export/3","anchor":"export/3"},{"id":"handle_call/3","anchor":"handle_call/3"},{"id":"handle_info/2","anchor":"handle_info/2"},{"id":"request/2","anchor":"request/2"},{"id":"start/3","anchor":"start/3"},{"id":"start_link/3","anchor":"start_link/3"}],"types":[{"id":"format/0","anchor":"t:format/0"},{"id":"option/0","anchor":"t:option/0"},{"id":"options/0","anchor":"t:options/0"},{"id":"response/0","anchor":"t:response/0"},{"id":"spider/0","anchor":"t:spider/0"},{"id":"state/0","anchor":"t:state/0"}]},{"id":"Scrapex.GenSpider.Response","title":"Scrapex.GenSpider.Response","functions":[{"id":"url_join/2","anchor":"url_join/2"}],"types":[{"id":"t/0","anchor":"t:t/0"}]},{"id":"Scrapex.Selector","title":"Scrapex.Selector","functions":[{"id":"extract/1","anchor":"extract/1"},{"id":"extract/2","anchor":"extract/2"},{"id":"select/2","anchor":"select/2"}],"types":[{"id":"attribute/0","anchor":"t:attribute/0"},{"id":"children/0","anchor":"t:children/0"},{"id":"html_node/0","anchor":"t:html_node/0"},{"id":"html_tree/0","anchor":"t:html_tree/0"},{"id":"name/0","anchor":"t:name/0"},{"id":"selector/0","anchor":"t:selector/0"},{"id":"t/0","anchor":"t:t/0"}]}],"protocols":[]} -------------------------------------------------------------------------------- /doc/extra-api-reference.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | API Reference – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 |

API Reference

64 | 65 | 72 | 73 | 74 |
75 |

Modules

76 |
77 |
78 | 79 | 80 |
81 |
82 | 83 | 84 |

A behaviour module for implementing a web data extractor

85 |
86 | 87 |
88 |
89 | 90 | 91 |

Utilities for working with the response returned from GenSpider

92 |
93 | 94 |
95 |
96 | 97 | 98 |

Utilities for extracting data from markup language

99 |
100 | 101 |
102 | 103 |
104 |
105 | 106 | 107 | 108 | 109 | 110 | 122 |
123 |
124 |
125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /doc/extra-readme.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | README – Scrapex v0.1.0 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 19 | 59 | 60 |
61 |
62 | 63 | 64 |

Scrapex

65 |

An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.

66 |

Features

Fast and powerful

67 |

Write the rules to extract the data and let Scrapex do the rest.

68 |

Easily extensible

69 |

Extensible by design, plug new functionality easily without having to touch the core.

70 |

Portable, Elixir

71 |

Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices.

72 |

Build your own webcrawlers

alias Scrapex.GenSpider
 73 | defmodule StackOverflowSpider do
 74 |   use GenSpider
 75 |   import Scrapex.Selector
 76 | 
 77 |   def parse(response, state) do
 78 |     result = response.body
 79 |     |> select(".question-summary h3 a")
 80 |     |> extract("href")
 81 |     |> Enum.map(fn(href) ->
 82 |       GenSpider.Response.url_join(response, href)
 83 |       |> GenSpider.request(&parse_question/1)
 84 |       |> GenSpider.await
 85 |     end)
 86 |     {:ok, result, state}
 87 |   end
 88 | 
 89 |   defp parse_question({:ok, response}) do
 90 |     html = response.body
 91 |     [title] = html |> select("h1 a") |> extract()
 92 |     question = html |> select(".question")
 93 |     [body] = question |> select(".post-text") |> extract
 94 |     [votes] = question |> select(".vote-count-post") |> extract
 95 |     tags = question |> select(".post-tag") |> extract
 96 | 
 97 |     %{title: title, body: body, votes: votes, tags: tags}
 98 |   end
 99 | end
100 | urls = ["http://stackoverflow.com/questions?sort=votes"]
101 | opts = [name: :webscrapper, urls: urls]
102 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts)
103 | questions = GenSpider.export(spider)
104 | #=> "[{} | _]"
105 |

TODOS

    106 |
  • [x] GenSpider behaviour. 107 |
  • 108 |
  • [x] Request URL and pass response to parse/2 callback. 109 |
  • 110 |
  • [x] One time spider 111 |
  • 112 |
  • [x] CSS selector 113 |
  • 114 |
  • [ ] XPath selector 115 |
  • 116 |
  • [x] Yield for requests in parse/2 117 |
  • 118 |
  • [ ] Parse response chunk by chunk 119 |
  • 120 |
  • [ ] CLI 121 |
  • 122 |
123 | 124 | 136 |
137 |
138 |
139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /doc/fonts/icomoon.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.eot -------------------------------------------------------------------------------- /doc/fonts/icomoon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Generated by IcoMoon 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /doc/fonts/icomoon.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.ttf -------------------------------------------------------------------------------- /doc/fonts/icomoon.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/doc/fonts/icomoon.woff -------------------------------------------------------------------------------- /doc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Scrapex v0.1.0 – Documentation 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lib/scrapex.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex do 2 | end 3 | -------------------------------------------------------------------------------- /lib/scrapex/gen_spider.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpider do 2 | alias Scrapex.GenSpider 3 | alias GenSpider.Request 4 | 5 | require Logger 6 | @moduledoc ~S""" 7 | A behaviour module for implementing a web data extractor. 8 | 9 | A GenSpider is a process as any other Elixir process and it can be 10 | used to crawl a list of URLs, run callback to parse the response, 11 | and repeat on an interval. 12 | 13 | ## Example 14 | 15 | The GenSpider behaviour abstracts the common data extraction process. 16 | Users are only required to implement the callbacks and functionality 17 | they are interested in. 
18 | 
19 |   Imagine we want a GenSpider that follows the links to the top voted
20 |   questions on StackOverflow and scrapes some data from each page:
21 | 
22 |       iex> alias Scrapex.GenSpider
23 |       iex> defmodule StackOverflowSpider do
24 |       ...>   use GenSpider
25 |       ...>   import Scrapex.Selector
26 |       ...>
27 |       ...>   def parse(response) do
28 |       ...>     result = response.body
29 |       ...>     |> select(".question-summary h3 a")
30 |       ...>     |> extract("href")
31 |       ...>     |> Enum.map(fn(href) ->
32 |       ...>       GenSpider.Response.url_join(response, href)
33 |       ...>       |> GenSpider.request(&parse_question/1)
34 |       ...>       |> GenSpider.await
35 |       ...>     end)
36 |       ...>     {:ok, result}
37 |       ...>   end
38 |       ...>
39 |       ...>   defp parse_question(response) do
40 |       ...>     html = response.body
41 |       ...>     [title] = html |> select("h1 a") |> extract()
42 |       ...>     question = html |> select(".question")
43 |       ...>     [body] = question |> select(".post-text") |> extract
44 |       ...>     [votes] = question |> select(".vote-count-post") |> extract
45 |       ...>     tags = question |> select(".post-tag") |> extract
46 |       ...>
47 |       ...>     %{title: title, body: body, votes: votes, tags: tags}
48 |       ...>   end
49 |       ...> end
50 |       iex> urls = ["http://stackoverflow.com/questions?sort=votes"]
51 |       iex> opts = [name: :webscrapper, urls: urls]
52 |       iex> {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts)
53 |       iex> [top_question|_] = GenSpider.export(spider)
54 |       iex> top_question.title
55 |       "Why is processing a sorted array faster than an unsorted array?"
56 | 
57 | 
58 |   We start our `StackOverflowSpider` by calling `start_link/3`,
59 |   passing the module with the spider implementation and its initial
60 |   argument (a list representing the selectors to follow and grab).
61 |   We also pass an options list to register the spider with a name, a
62 |   list of URLs to start from, and optionally an `:interval` for refetching.
63 | 
64 |   We can get the data from the spider by calling `GenSpider.export/2`
65 |   with the `pid` of the spider and the output format. `GenSpider`
66 |   supports outputting JSON, CSV and XML.
67 | 
68 |   ## Callbacks
69 | 
70 |   There are 6 callbacks required to be implemented in a `GenSpider`.
71 |   By adding `use GenSpider` to your module, all 6 callbacks will be
72 |   automatically defined, leaving it up to you to implement the ones
73 |   you want to customize. The callbacks are:
74 | 
75 |   * `init(args)` - invoked when the spider is started.
76 | 
77 |     It must return:
78 |     - `{:ok, state}`
79 |     - `{:ok, state, delay}`
80 |     - `:ignore`
81 |     - `{:stop, reason}`
82 | 
83 |   * `start_requests(urls, state)` - called by Scrapex when the spider
84 |     is opened for scraping and no particular URLs are specified. If
85 |     particular URLs are specified, the `make_requests_from_url/1`
86 |     is used instead to create the Requests. This method is also
87 |     called only once from Scrapex, so it’s safe to implement it as
88 |     a stream.
89 | 
90 |     The default implementation uses `make_requests_from_url/1` to
91 |     generate Requests for each url in `options.urls`.
92 | 
93 |   * `make_requests_from_url(url)` - returns a Request object (or a
94 |     list of Request objects) to scrape. It is used to construct the
95 |     initial requests in `start_requests/2`, and is typically used to
96 |     convert URLs to requests.
97 | 
98 |     Unless overridden, this method returns Requests with `parse/1` as
99 |     their callback function.
100 | 
101 |   * `parse(response)` - invoked after the spider has successfully
102 |     requested a URL, with the HTML in `response`.
103 | 
104 |     It must return:
105 |     - `{:ok, result}`
106 |     - `{:ignore, reason}`
107 |     - `{:stop, reason}`
108 | 
109 |   * `terminate(reason, state)` - called when the server is about to
110 |     terminate, useful for cleaning up. It must return `:ok`.
111 | 
112 |   * `code_change(old_vsn, state, extra)` - called when the application
113 |     code is being upgraded live (hot code swapping).
114 | 
115 |     It must return:
116 |     - `{:ok, new_state}`
117 |     - `{:error, reason}`
118 | 
119 |   ## Client / Server APIs
120 | 
121 |   Although in the example above we have used `GenSpider.start_link/3`
122 |   and friends to directly start and communicate with the spider, most
123 |   of the time we don't call the `GenSpider` functions directly.
124 |   Instead, we wrap the calls in new functions representing the public
125 |   API of the spider.
126 | 
127 |   Here is a better implementation of our StackOverflowSpider module:
128 | 
129 |       defmodule StackOverflowSpider do
130 |         use GenSpider
131 |         import Scrapex.Selector
132 | 
133 |         # Client
134 |         def start_link(sitemap) do
135 |           urls = ["http://stackoverflow.com/questions?sort=votes"]
136 |           opts = [name: :stackoverflow, urls: urls, interval: 3600]
137 |           GenSpider.start_link(__MODULE__, [], opts)
138 |         end
139 | 
140 |         def json(pid) do
141 |           GenSpider.export(pid, :json)
142 |         end
143 | 
144 |         # Server (callbacks)
145 |         def parse(response) do
146 |           result = response.body
147 |           |> select(".question-summary h3 a")
148 |           |> extract("href")
149 |           |> Enum.map(fn(href) ->
150 |             GenSpider.Response.url_join(response, href)
151 |             |> GenSpider.request(&parse_question/1)
152 |             |> GenSpider.await
153 |           end)
154 |           {:ok, result}
155 |         end
156 | 
157 |         defp parse_question(response) do
158 |           html = response.body
159 |           [title] = html |> select("h1 a") |> extract()
160 |           question = html |> select(".question")
161 |           [body] = question |> select(".post-text") |> extract
162 |           [votes] = question |> select(".vote-count-post") |> extract
163 |           tags = question |> select(".post-tag") |> extract
164 | 
165 |           %{title: title, body: body, votes: votes, tags: tags}
166 |         end
167 |       end
168 | 
169 |   In practice, it is common to have both server and client functions in
170 |   the same module. If the server and/or client implementations are
171 |   growing complex, you may want to have them in different modules.
172 |   """
173 | 
174 |   @typedoc "Options used by the `start*` functions"
175 |   @type options :: [option]
176 | 
177 |   @type url :: binary
178 | 
179 |   @type option :: {:name, GenServer.name} |
180 |                   {:urls, [url]} |
181 |                   {:timeout, timeout} |
182 |                   {:interval, non_neg_integer}
183 | 
184 |   @typedoc "The spider reference"
185 |   @type spider :: pid | GenServer.name | {atom, node}
186 | 
187 |   @typedoc "The internal state of the spider"
188 |   @type state :: any
189 | 
190 |   @typedoc "The list of requests or stream for the spider to enumerate"
191 |   @type requests :: Request.t | [Request.t] | Stream.t
192 | 
193 |   @typedoc "The response from a request to a URL"
194 |   @type response :: binary
195 | 
196 |   @typedoc "Exportable formats"
197 |   @type format :: :html | :json | :csv | :xml
198 | 
199 |   @type t :: %__MODULE__{module: atom,
200 |                          state: any,
201 |                          options: Keyword.t,
202 |                          data: [{url, any}],
203 |                          requests: requests,
204 |                          timer: reference}
205 |   defstruct module: nil, state: nil,
206 |             options: [], data: [], requests: [], timer: nil
207 | 
208 |   # `GenSpider` is based on `GenServer`.
209 |   use GenServer
210 | 
211 |   # Define the callbacks for `GenSpider`
212 |   @callback init(any) ::
213 |     {:ok, state} | {:ok, state, timeout | :hibernate} |
214 |     :ignore | {:stop, reason :: term}
215 | 
216 |   @callback start_requests([url], state) ::
217 |     {:ok, requests, state}
218 | 
219 |   @callback make_requests_from_url(url) :: requests
220 | 
221 |   @callback parse(response) ::
222 |     {:ok, data :: list} | {:ignore, reason :: term} |
223 |     {:stop, reason :: term}
224 | 
225 |   @doc """
226 |   This callback is the same as the `GenServer` equivalent and is used to change
227 |   the state when loading a different version of the callback module.
228 |   """
229 |   @callback code_change(any, any, state) :: {:ok, state}
230 | 
231 |   @doc """
232 |   This callback is the same as the `GenServer` equivalent and is called when the
233 |   process terminates. The first argument is the reason the process is about
234 |   to exit with.
235 |   """
236 |   @callback terminate(any, state) :: any
237 | 
238 |   @doc false
239 |   defmacro __using__(_) do
240 |     quote location: :keep do
241 |       @behaviour GenSpider
242 |       require Logger
243 | 
244 |       @start_urls []
245 | 
246 |       @doc false
247 |       def init(args) do
248 |         {:ok, args}
249 |       end
250 | 
251 |       @doc """
252 |       Default method to generate the first Requests to crawl.
253 | 
254 |       Uses `make_requests_from_url/1` to generate Requests for each
255 |       url in `start_urls`.
256 |       """
257 |       def start_requests(start_urls, state) do
258 |         requests = start_urls
259 |         |> Enum.map(&make_requests_from_url/1)
260 |         {:ok, requests, state}
261 |       end
262 | 
263 |       @doc """
264 |       Default method to generate a Request (or a list of Requests).
265 |       """
266 |       def make_requests_from_url(url) do
267 |         GenSpider.request(url, &parse/1)
268 |       end
269 | 
270 |       @doc false
271 |       def parse(response) do
272 |         {:ok, [response.body]}
273 |       end
274 | 
275 |       @doc false
276 |       def terminate(_reason, _state) do
277 |         :ok
278 |       end
279 | 
280 |       @doc false
281 |       def code_change(_old, state, _extra) do
282 |         {:ok, state}
283 |       end
284 | 
285 |       defoverridable [init: 1,
286 |                       start_requests: 2,
287 |                       make_requests_from_url: 1,
288 |                       parse: 1,
289 |                       terminate: 2, code_change: 3]
290 |     end
291 |   end
292 | 
293 |   @doc """
294 |   Starts a `GenSpider` process linked to the current process.
295 | 
296 |   This is often used to start the `GenSpider` as part of a supervision
297 |   tree.
298 | 
299 |   Once the spider is started, it calls the `init/1` function in the
300 |   given `module` passing the given `args` to initialize it. To ensure
301 |   a synchronized start-up procedure, this function does not return
302 |   until `init/1` has returned.
303 | 
304 |   Note that a `GenSpider` started with `start_link/3` is linked to the
305 |   parent process and will exit in case of crashes. The GenSpider will
306 |   also exit due to the `:normal` reasons in case it is configured to
307 |   trap exits in the `init/1` callback.
308 | 
309 |   ## Options
310 | 
311 |   The `:name` option is used for name registration as described in the
312 |   module documentation. If the `:timeout` option is present, the
313 |   spider is allowed to spend the given milliseconds initializing or
314 |   it will be terminated and the start function will return
315 |   `{:error, :timeout}`.
316 | 
317 |   The `:urls` option defines a list of URLs for the spider to start from.
318 | 
319 |   If the `:interval` option is present, the spider will repeat itself
320 |   after every number of seconds defined by the option. Note that it
321 |   will only repeat if it's not currently running a crawl.
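
  For example, assuming a hypothetical `MySpider` module that calls
  `use GenSpider`, a named spider that starts from one URL and
  re-crawls on an interval could be started with:

      GenSpider.start_link(MySpider, [],
                           name: :my_spider,
                           urls: ["http://www.example.com"],
                           interval: 3600)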
322 | 
323 |   ## Return values
324 | 
325 |   If the spider is successfully created and initialized, the function
326 |   returns `{:ok, pid}`, where pid is the pid of the spider. If there
327 |   already exists a process with the specified spider name, the
328 |   function returns `{:error, {:already_started, pid}}` with the pid of
329 |   that process.
330 | 
331 |   If the `init/1` callback fails with `reason`, the function returns
332 |   `{:error, reason}`. Otherwise, if it returns `{:stop, reason}` or
333 |   `:ignore`, the process is terminated and the function returns
334 |   `{:error, reason}` or `:ignore`, respectively.
335 |   """
336 |   @spec start_link(module, any, options) :: GenServer.on_start
337 |   def start_link(module, args, options \\ [])
338 |   when is_atom(module) and is_list(options)
339 |   do
340 |     do_start(:start_link, module, args, options)
341 |   end
342 | 
343 |   @doc """
344 |   Starts a `GenSpider` without links (outside of a supervision tree).
345 |   See `start_link/3` for more information.
346 |   """
347 |   @spec start(module, any, options) :: GenServer.on_start
348 |   def start(module, args, options \\ [])
349 |   when is_atom(module) and is_list(options)
350 |   do
351 |     do_start(:start, module, args, options)
352 |   end
353 | 
354 |   @doc false
355 |   defp do_start(link, module, args, options) do
356 |     {name, opts} = Keyword.pop(options, :name)
357 |     init_args = {module, args, opts}
358 |     case name do
359 |       nil ->
360 |         apply(GenServer, link, [__MODULE__, init_args])
361 |       atom when is_atom(atom) ->
362 |         apply(GenServer, link, [__MODULE__, init_args, [name: atom]])
363 |       {:global, _} ->
364 |         apply(GenServer, link, [__MODULE__, init_args, [name: name]])
365 |       {:via, _, _} ->
366 |         apply(GenServer, link, [__MODULE__, init_args, [name: name]])
367 |     end
368 |   end
369 | 
370 |   @doc """
371 |   Exports the stored data in a specific format.
372 | 
373 |   This call will block until all data is received.
374 | 
375 |   This is called in the following situations:
376 | 
377 |   - Right after the spider is started.
378 |   - In the middle of a crawl.
379 |   - In between the crawl interval.
380 | 
381 |   For the first two situations, the spider will manually await the
382 |   requests instead of handling the response message in `handle_info/2`.
383 | 
384 |   If one of the `parse/1` callbacks wants to stop the spider, this
385 |   function will still return partial data if any, and then stops the
386 |   spider.
387 | 
388 |   If the third argument is true, the spider will clear any timer in
389 |   place and immediately crawl for new data.
390 |   """
391 |   @spec export(spider, format, boolean) :: any
392 |   def export(spider, format \\ nil, override \\ false) do
393 |     # Await for all the data to be collected first.
394 |     __MODULE__.await(spider)
395 |     GenServer.call(spider, {:export, format, override})
396 |   end
397 | 
398 |   def request(url, callback, from \\ self) do
399 |     Request.async(url, callback, from)
400 |   end
401 | 
402 |   def await(request = %Request{}), do: Request.await(request)
403 |   def await(spider, timeout \\ :infinity) do
404 |     GenServer.call(spider, :await, timeout)
405 |   end
406 | 
407 |   # GenServer callbacks
408 | 
409 |   def init({module, args, opts}) do
410 |     # Set a default timeout, used to stop the spider.
411 |     opts = Keyword.put_new(opts, :timeout, 100)
412 |     spider = %GenSpider{ module: module, options: opts,
413 |                          timer: :erlang.make_ref()}
414 |     urls = opts[:urls] || []
415 | 
416 |     # Set up an empty data set with each URL as a key.
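    # For example, urls = ["http://a.com", "http://b.com"] yields
    # [{"http://a.com", nil}, {"http://b.com", nil}]; each nil is later
    # replaced by that URL's parsed data (via List.keystore/4 in
    # handle_info).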
417 |     data = Enum.map(urls, &({&1, nil}))
418 | 
419 |     case apply(module, :init, [args]) do
420 |       {:ok, state} ->
421 |         # Return 0 timeout to trigger crawl immediately.
422 |         # This works regardless of interval option, since we always
423 |         # have a crawl. A crawl will use interval option to see if it
424 |         # needs to do the next one.
425 |         # send_after(self, 0, :crawl)
426 |         Logger.debug "Starts a spider immediately"
427 |         {:ok, %{spider | state: state, data: data}, 0}
428 |       {:ok, state, delay} ->
429 |         # Delay the crawl by the value specified in return.
430 |         # send_after(self, delay, :crawl)
431 |         Logger.debug "Starts a spider after #{delay} milliseconds"
432 |         {:ok, %{spider | state: state, data: data}, delay}
433 |       :ignore ->
434 |         :ignore
435 |       {:stop, reason} ->
436 |         {:stop, reason}
437 |       other ->
438 |         other
439 |     end
440 |   end
441 | 
442 |   @doc """
443 |   Waits for any remaining request(s) to finish.
444 | 
445 |   For any remaining requests in the state, wait for them to finish.
446 |   This function receives the response instead of `handle_info/2`,
447 |   then calls `handle_info/2` itself so that the request is removed
448 |   from the state and the response is parsed by the callback module.
449 | 
450 |   This function can be called in the middle of a crawl of multiple URLs,
451 |   but since it only awaits the remaining requests, the spider's state
452 |   is still being passed along correctly.
453 |   """
454 |   def handle_call(:await, _from, spider) do
455 |     spider =
456 |       spider.requests
457 |       |> Enum.reduce_while(spider, fn(request, spider) ->
458 |         ref = request.ref
459 |         response = Request.await(request, :infinity)
460 |         case handle_info({ref, response}, spider) do
461 |           {:noreply, spider} ->
462 |             {:cont, spider}
463 |           {:stop, _reason, spider} ->
464 |             {:halt, spider}
465 |         end
466 |       end)
467 |     Logger.debug("Awaited for data")
468 |     {:reply, :ok, spider}
469 |   end
470 | 
471 |   @doc """
472 |   Called to export the data in a specific format.
473 |   """
474 |   def handle_call({:export, nil, true}, from, spider) do
475 |     :erlang.cancel_timer(spider.timer)
476 |     {:noreply, spider} = handle_info(:crawl, spider)
477 |     {:reply, :ok, spider} = handle_call(:await, from, spider)
478 |     handle_call({:export, nil, false}, from, spider)
479 |   end
480 | 
481 |   # Main handler for exporting.
482 |   def handle_call({:export, nil, false}, _from, spider) do
483 |     Logger.debug("Exporting data")
484 | 
485 |     interval = spider.options[:interval]
486 | 
487 |     data =
488 |       spider.data
489 |       |> Enum.filter_map(fn({_,data}) -> data !== nil end,
490 |                          fn({_, data}) -> data end)
491 | 
492 |     is_complete? = length(data) === length(spider.data)
493 |     data = Enum.concat(data)
494 |     case interval !== nil and is_complete? do
495 |       true ->
496 |         {:reply, data, spider}
497 |       false ->
498 |         {:stop, :normal, data, spider}
499 |     end
500 |   end
501 | 
502 |   def handle_call({:export, :json, override?}, from, spider) do
503 |     case handle_call({:export, nil, override?}, from, spider) do
504 |       {:reply, data, spider} ->
505 |         {:reply, Poison.encode!(data), spider}
506 |       {:stop, _, data, spider} ->
507 |         {:stop, :normal, Poison.encode!(data), spider}
508 |     end
509 |   end
510 | 
511 |   def handle_call({:export, encoder, override?}, from, spider)
512 |   when is_function(encoder, 1)
513 |   do
514 |     case handle_call({:export, nil, override?}, from, spider) do
515 |       {:reply, data, spider} ->
516 |         {:reply, encoder.(data), spider}
517 |       {:stop, _, data, spider} ->
518 |         {:stop, :normal, encoder.(data), spider}
519 |     end
520 |   end
521 | 
522 |   def handle_call({:export, _format, true}, _from, spider) do
523 |     {:reply, spider.data, spider}
524 |   end
525 | 
526 |   @doc """
527 |   Called when a timeout occurs, usually to start a crawl.
528 | 
529 |   The `GenSpider` uses the timeout value to trigger a crawl, in which
530 |   it spawns a task for each URL specified in the `opts`.
531 | 
532 |   The results will be handled in a different function.
533 |   """
534 |   def handle_info(:timeout, spider) do
535 |     handle_info(:crawl, spider)
536 |   end
537 | 
538 |   @doc """
539 |   Called from a timer to crawl a list of URLs.
540 | 
541 |   This generates a list of async requests to the URLs. The response
542 |   will be sent back in another message.
543 |   """
544 |   def handle_info(:crawl, spider) do
545 |     options = spider.options
546 |     urls = options[:urls] || []
547 |     Logger.debug("Starts a crawl for #{urls}")
548 | 
549 |     args = [urls, spider.state]
550 |     spider = case call(:start_requests, spider, args) do
551 |       {:ok, requests, state} ->
552 |         # `requests` can also be a `Stream`.
553 |         %{spider | requests: Enum.map(requests, &(&1)),
554 |                    state: state}
555 |     end
556 | 
557 |     {:noreply, spider}
558 |   end
559 | 
560 |   @doc """
561 |   Called when a scrape request is completed.
562 | 
563 |   When a request is completed, i.e. the response has been received
564 |   and parsed, this process receives a message with the result.
565 | 
566 |   If this is for the last request, it sets a new timer if needed.
567 |   """
568 |   def handle_info({ref, {:ok, %Request{}=req}}, spider) do
569 |     data = Request.await(req)
570 |     handle_info({ref, {:ok, data}}, spider)
571 |   end
572 | 
573 |   def handle_info({ref, {:ok, requests = [%Request{} | _]}}, spider) do
574 |     data = Stream.map(requests, &Request.await(&1))
575 |     |> Enum.concat
576 |     handle_info({ref, {:ok, data}}, spider)
577 |   end
578 | 
579 |   def handle_info({ref, {:ok, data}}, spider) do
580 |     # Remove this request from the list.
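    # Single pass over spider.requests: pull out the request whose
    # monitor ref matches `ref`, keeping the remaining requests in
    # their original order.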
581 |     {request, requests} = spider.requests
582 |     |> Enum.reduce({nil, []}, fn(request, {req, requests}) ->
583 |       case request.ref === ref do
584 |         true -> {request, requests}
585 |         false -> {req, requests ++ [request]}
586 |       end
587 |     end)
588 |     spider = %{spider | requests: requests}
589 | 
590 |     url = request.url
591 |     Logger.debug "Got data from #{url}"
592 | 
593 |     new_data = List.keystore(spider.data, url, 0, {url, data})
594 |     interval = spider.options[:interval]
595 |     timer = spider.timer
596 |     spider = %{spider | data: new_data}
597 | 
598 |     case {Enum.empty?(requests), interval} do
599 |       {true, nil} ->
600 |         timeout = spider.options[:timeout]
601 |         # No more requests and no interval, so we stop.
602 |         send_after(self, timeout, {timer, {:stop, :normal}})
603 |         {:noreply, spider}
604 |       {true, _} ->
605 |         # No more requests, but an interval is set, so schedule a new crawl.
606 |         :erlang.cancel_timer(timer)
607 |         timer = send_after(self, interval, :crawl)
608 |         {:noreply, %{spider | timer: timer}}
609 |       {_, _} ->
610 |         {:noreply, spider}
611 |     end
612 |   end
613 | 
614 |   # Return value from the `parse` callback.
615 |   def handle_info({_ref, {:stop, reason}}, spider) do
616 |     Logger.info "Spider is stopped with reason #{reason}"
617 |     {:stop, :normal, spider}
618 |   end
619 | 
620 |   # The URL is a 404, so we remove it from the list to prevent requesting
621 |   # it the next time, and call `handle_info` with empty data so it can
622 |   # continue the loop.
623 |   def handle_info({ref, {:error, {:not_found, url}}}, spider) do
624 |     Logger.error("Failed to request #{url} with a 404 error")
625 | 
626 |     options = spider.options
627 |     urls = Enum.filter(options[:urls], &(&1 !== url))
628 |     options = Keyword.put(options, :urls, urls)
629 |     handle_info({ref, {:ok, []}}, %{spider| options: options})
630 |   end
631 | 
632 |   def handle_info({ref, {:error, reason}}, spider) do
633 |     # Retry with backoff?
634 |     request = spider.requests
635 |     |> Enum.find(%Request{}, &(&1.ref === ref))
636 | 
637 |     Logger.error("Failed to request #{request.url} with reason #{reason}")
638 |     # Return empty data, and let the spider try again next time.
639 |     handle_info({ref, {:ok, []}}, spider)
640 |   end
641 | 
642 |   def handle_info(_info, state) do
643 |     {:noreply, state}
644 |   end
645 | 
646 |   defp send_after(_dest, nil, _message) do
647 |     :erlang.make_ref()
648 |   end
649 |   defp send_after(dest, time, message) do
650 |     :erlang.send_after(time, dest, message)
651 |   end
652 | 
653 |   defp call(method, %GenSpider{}=spider, nil) when is_atom(method) do
654 |     call(method, spider, [spider.state])
655 |   end
656 |   defp call(method, %GenSpider{}=spider, args) when is_atom(method) do
657 |     Kernel.apply(spider.module, method, args)
658 |   end
659 | end
-------------------------------------------------------------------------------- /lib/scrapex/gen_spider/README.md: --------------------------------------------------------------------------------
1 | GenSpider
2 | =========
3 | 
4 | The flow of GenSpider
5 | 
6 | - GenSpider.start_link
7 |   - GenSpider.init
8 |     - Mod.init
9 | 
10 | - GenSpider.handle_info(:timeout): message from start_link
11 | - GenSpider.handle_info(:crawl): start the full crawl
12 |   - Mod.start_requests
13 |     - Mod.make_requests_from_url
14 |       - GenSpider.request(url, &parse/1): called inside GenSpider
15 |         - Request.async
16 |           - Create a task with Request.do_request
17 |             - do_request calls `parse(response)`
18 | # Up to here, everything is asynchronous; results will be handled in handle_info({ref, ...}).
19 | # Receive data after the request has completed and been parsed.
20 | - GenSpider.handle_info({ref, {:ok, data}})
21 |   - remove the request with `ref` from spider.requests
22 |   - store data into spider.data
23 |   - if this was the last request and the interval option is set,
24 |     - send_after(self, interval, :crawl)
-------------------------------------------------------------------------------- /lib/scrapex/gen_spider/request.ex: --------------------------------------------------------------------------------
1 | defmodule Scrapex.GenSpider.Request do
2 |   @moduledoc """
3 |   Conveniences for spawning and awaiting HTTP requests.
4 | 
5 |   Requests are processes meant to perform one particular HTTP request
6 |   to a specific URL throughout their life-cycle, often with little or
7 |   no communication with other processes.
8 | 
9 |   Requests spawned with `async/2` can be awaited on by their caller
10 |   process (and only the caller). They are implemented by spawning a
11 |   `Task` and awaiting on it.
12 |   """
13 |   alias Scrapex.GenSpider.Request
14 |   alias Scrapex.GenSpider.Response
15 |   require Logger
16 | 
17 |   @doc """
18 |   The Request struct.
19 | 
20 |   It contains the following fields:
21 | 
22 |   * `:pid` - the process reference of the request process.
23 | 
24 |   * `:ref` - the request monitor reference.
25 | 
26 |   * `:owner` - the PID of the process that started the request.
27 | 
28 |   * `:url` - the URL to make the request to.
29 |   """
30 |   defstruct pid: nil, ref: nil, owner: nil, url: ""
31 |   @type t :: %__MODULE__{pid: pid, ref: reference, owner: pid, url: binary}
32 | 
33 |   @type url :: binary
34 | 
35 |   @doc """
36 |   Starts an asynchronous request that can be awaited on.
37 | 
38 |   This function spawns a Task that is linked to and monitored by the
39 |   caller process. A `Request` struct is returned as an extended version
40 |   of the `Task` struct.
41 | 
42 |   ## Request's message format
43 | 
44 |   The reply sent by the request will be that of the underlying `Task`,
45 |   i.e., in the format `{ref, msg}`, where `ref` is the monitoring ref
46 |   held by the request, and `msg` is the return value of the callback.
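
  ## Examples

  A minimal sketch of the intended flow, assuming the request succeeds
  (the URL is illustrative):

      request = Request.async("http://example.com", fn(response) ->
        response.body
      end)
      body = Request.await(request)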
47 | """ 48 | @spec async(url, fun, pid) :: t 49 | def async(url, callback, from \\ self) when is_pid(from) do 50 | mfa = {:erlang, :apply, [&request/2, [url, callback]]} 51 | pid = Task.Supervised.spawn_link(from, get_info(from), mfa) 52 | ref = Process.monitor(pid) 53 | send(pid, {from, ref}) 54 | %Request{url: url, pid: pid, ref: ref, owner: from} 55 | end 56 | 57 | @doc """ 58 | Awaits a request response. 59 | 60 | A timeout in milliseconds can be given with default value of `5000`. 61 | In case the request process dies, this function will exit with the 62 | same reason as the request. 63 | """ 64 | @spec await(t, timeout) :: term 65 | def await(%Request{pid: pid, ref: ref, owner: owner}, timeout \\ 5000) do 66 | Task.await(%Task{pid: pid, ref: ref, owner: owner}, timeout) 67 | end 68 | 69 | defp get_info(pid) do 70 | {node(), 71 | case Process.info(pid, :registered_name) do 72 | {:registered_name, []} -> pid 73 | {:registered_name, name} -> name 74 | end} 75 | end 76 | 77 | defp request(url, callback) do 78 | case do_request(url) do 79 | # HTTP Request succeeded, return whatever the callback returns. 80 | {:ok, response} -> 81 | response |> callback.() 82 | # Forward the HTTP error to the `handle_info` 83 | {:error, reason} -> 84 | {:error, reason} 85 | end 86 | end 87 | 88 | defp do_request(url) do 89 | Logger.debug("Do request for #{url}") 90 | hackney = [follow_redirect: true, timeout: 30000, recv_timeout: 15000] 91 | case HTTPoison.get(url, [], [ hackney: hackney ]) do 92 | {:ok, %HTTPoison.Response{status_code: 200, body: body}} -> 93 | {:ok, %Response{url: url, body: body}} 94 | {:ok, %HTTPoison.Response{status_code: 404}} -> 95 | {:error, {:not_found, url}} 96 | {:error, %HTTPoison.Error{reason: reason}} -> 97 | {:error, reason} 98 | end 99 | end 100 | end -------------------------------------------------------------------------------- /lib/scrapex/gen_spider/response.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpider.Response do 2 | @moduledoc """ 3 | Utilities for working response returned from `GenSpider`. 4 | """ 5 | 6 | alias Scrapex.GenSpider.Response 7 | 8 | defstruct url: "", body: "" 9 | @type t :: %__MODULE__{url: binary, body: binary} 10 | 11 | @doc """ 12 | Join a path relative to the response's URL. 13 | 14 | ## Examples 15 | 16 | iex> alias Scrapex.GenSpider.Response 17 | iex> response = %Response{url: "http://www.scrapex.com/subfolder"} 18 | iex> Response.url_join(response, "/subfolder2") 19 | "http://www.scrapex.com/subfolder2" 20 | iex> Response.url_join(response, "subsubfolder") 21 | "http://www.scrapex.com/subfolder/subsubfolder" 22 | """ 23 | @spec url_join(t, binary) :: binary 24 | def url_join(url, path) when is_binary(url) do 25 | url_join(%Response{url: url}, path) 26 | end 27 | 28 | def url_join(%Response{url: url}, "/" <> path) do 29 | uri = URI.parse(url) 30 | "#{uri.scheme}://#{uri.authority}/#{path}" 31 | end 32 | 33 | def url_join(%Response{url: _url}, "http" <> path) do 34 | "http#{path}" 35 | end 36 | 37 | def url_join(%Response{url: url}, path) do 38 | "#{url}/#{path}" 39 | end 40 | end -------------------------------------------------------------------------------- /lib/scrapex/selector.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Selector do 2 | @moduledoc """ 3 | Utilities for extracting data from markup language. 
4 | """ 5 | 6 | use GenServer 7 | alias Scrapex.Selector 8 | 9 | defstruct tree: [] 10 | @type t :: %__MODULE__{tree: html_tree} 11 | 12 | @typedoc "A tree of HTML nodes, or a node itself if only one" 13 | @type html_tree :: html_node | [html_node] 14 | @typedoc "Name of the tag or attribute" 15 | @type name :: binary 16 | @typedoc "Attribute of a node" 17 | @type attribute :: {name, binary} 18 | 19 | @type html_node :: {name, [attribute], children} 20 | @type children :: [html_node] 21 | 22 | @type selector :: binary 23 | 24 | @doc """ 25 | Generates a selection for a particular selector. 26 | 27 | The return value is a Selector.t 28 | """ 29 | @spec select(binary | t, selector) :: t 30 | def select(html, selector) when is_binary(html) do 31 | %Selector{tree: Floki.parse(html)} 32 | |> select(selector) 33 | end 34 | def select(%Selector{tree: tree}, selector) do 35 | %Selector{tree: Floki.find(tree, selector)} 36 | end 37 | 38 | @doc """ 39 | Extracts content or attribute value for a selection. 40 | """ 41 | @spec extract(t, name) :: [binary] 42 | def extract(selector), do: extract(selector, "text") 43 | def extract(selector, ""), do: extract(selector, "text") 44 | def extract(%Selector{tree: tree}, "text") do 45 | Enum.map(tree, fn({_, _, children}) -> 46 | extract_text(children, "") 47 | |> String.split 48 | |> Enum.join(" ") 49 | end) 50 | end 51 | def extract(%Selector{tree: tree}, attr) do 52 | Floki.attribute(tree, attr) 53 | end 54 | 55 | defp extract_text(children), do: extract_text(children, "") 56 | defp extract_text([], result), do: result 57 | defp extract_text([text|rest], result) 58 | when is_binary(text) 59 | do 60 | extract_text(rest, result <> text) 61 | end 62 | defp extract_text([{_, _, children}|rest], result) do 63 | extract_text(rest, result <> extract_text(children)) 64 | end 65 | defp extract_text(_, result), do: result 66 | 67 | defimpl Enumerable, for: __MODULE__ do 68 | alias Scrapex.Selector 69 | 70 | def count(%Selector{tree: tree}), do: length(tree) 71 | def member?(api = %Selector{}, selector) do 72 | Selector.select(api, selector) !== [] 73 | end 74 | 75 | def reduce(_api, {:halt, acc}, _fun), do: {:halted, acc} 76 | def reduce(api, {:suspend, acc}, fun) do 77 | {:suspended, acc, &reduce(api, &1, fun)} 78 | end 79 | def reduce(%Selector{tree: []}, {:cont, acc}, _fun) do 80 | {:done, acc} 81 | end 82 | def reduce(%Selector{tree: [h | t]}, {:cont, acc}, fun) do 83 | new_acc = fun.(%Selector{tree: [h]}, acc) 84 | reduce(%Selector{tree: t}, new_acc, fun) 85 | end 86 | end 87 | end -------------------------------------------------------------------------------- /lib/scrapex/spider/webscraper.ex: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Spider.WebScraper do 2 | @moduledoc ~S""" 3 | A spider using "sitemap" configuration from WebScraper.IO 4 | 5 | WebScraper.IO provides a Chrome extension to visually define scraping 6 | rules. This module provides a spider to use those rules to collect 7 | data. 8 | 9 | ## Examples 10 | 11 | Here is an example of scraping the E-commerce training site at 12 | http://webscraper.io/test-sites/e-commerce/static, following the 13 | instructions in WebScraper's tutorials section. 
14 | 
15 |       iex> sitemap = %{
16 |       ...>   "_id" => "webscrapper",
17 |       ...>   "startUrl" => "http://webscraper.io/test-sites/e-commerce/static",
18 |       ...>   "selectors" => [%{
19 |       ...>     "parentSelectors" => ["_root"],
20 |       ...>     "type" => "SelectorLink",
21 |       ...>     "multiple" => true,
22 |       ...>     "id" => "Category",
23 |       ...>     "selector" => "a.category-link",
24 |       ...>     "delay" => ""
25 |       ...>   }, %{
26 |       ...>     "parentSelectors" => ["Item"],
27 |       ...>     "type" => "SelectorText",
28 |       ...>     "multiple" => false,
29 |       ...>     "id" => "Name",
30 |       ...>     "selector" => "a.title",
31 |       ...>     "regex" => "",
32 |       ...>     "delay" => ""
33 |       ...>   }, %{
34 |       ...>     "parentSelectors" => ["Item"],
35 |       ...>     "type" => "SelectorText",
36 |       ...>     "multiple" => false,
37 |       ...>     "id" => "Price",
38 |       ...>     "selector" => "h4.pull-right",
39 |       ...>     "regex" => "",
40 |       ...>     "delay" => ""
41 |       ...>   }, %{
42 |       ...>     "parentSelectors" => ["Item"],
43 |       ...>     "type" => "SelectorText",
44 |       ...>     "multiple" => false,
45 |       ...>     "id" => "Description",
46 |       ...>     "selector" => "p.description",
47 |       ...>     "regex" => "",
48 |       ...>     "delay" => ""
49 |       ...>   }, %{
50 |       ...>     "parentSelectors" => ["Category"],
51 |       ...>     "type" => "SelectorLink",
52 |       ...>     "multiple" => true,
53 |       ...>     "id" => "SubCategory",
54 |       ...>     "selector" => "a.subcategory-link",
55 |       ...>     "delay" => ""
56 |       ...>   }, %{
57 |       ...>     "parentSelectors" => ["SubCategory"],
58 |       ...>     "type" => "SelectorElement",
59 |       ...>     "multiple" => true,
60 |       ...>     "id" => "Item",
61 |       ...>     "selector" => "div.thumbnail",
62 |       ...>     "delay" => ""
63 |       ...>   }]
64 |       ...> }
65 |       iex> {:ok, spider} = WebScraper.start_link(sitemap)
66 |       iex> data = WebScraper.export(spider)
67 |       [%{
68 |         "Category" => "Computers",
69 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers",
70 |         "Name" => "Iconia B1-730HD",
71 |         "Price" => "$99.99",
72 |         "Description" => "Black, 7\", 1.6GHz Dual-Core, 8GB, Android 4.4",
73 |         "SubCategory" => "Tablets",
74 |         "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets"
75 |       },%{
76 |         "Category" => "Computers",
77 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers",
78 |         "Name" => "Pavilion",
79 |         "Price" => "$609.99",
80 |         "Description" => "15.6\", Core i5-4200U, 6GB, 750GB, Windows 8.1",
81 |         "SubCategory" => "Laptops",
82 |         "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops"
83 |       },%{
84 |         "Category" => "Phones",
85 |         "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones",
86 |         "Name" => "Samsung Galaxy",
87 |         "Price" => "$93.99",
88 |         "Description" => "5 mpx. 
Android 5.0", 89 | "SubCategory" => "Touch", 90 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 91 | },%{ 92 | "Category" => "Phones", 93 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 94 | "Name" => "Sony Xperia", 95 | "Price" => "$118.99", 96 | "Description" => "GPS, waterproof", 97 | "SubCategory" => "Touch", 98 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 99 | },%{ 100 | "Category" => "Computers", 101 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 102 | "Name" => "Memo Pad HD 7", 103 | "Price" => "$101.99", 104 | "Description" => "IPS, Dual-Core 1.2GHz, 8GB, Android 4.3", 105 | "SubCategory" => "Tablets", 106 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 107 | },%{ 108 | "Category" => "Computers", 109 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 110 | "Name" => "Lenovo IdeaTab", 111 | "Price" => "$69.99", 112 | "Description" => "7\" screen, Android", 113 | "SubCategory" => "Tablets", 114 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 115 | },%{ 116 | "Category" => "Phones", 117 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 118 | "Name" => "Ubuntu Edge", 119 | "Price" => "$499.99", 120 | "Description" => "Sapphire glass", 121 | "SubCategory" => "Touch", 122 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 123 | },%{ 124 | "Category" => "Computers", 125 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 126 | "Name" => "Acer Iconia", 127 | "Price" => "$96.99", 128 | "Description" => "7\" screen, Android, 16GB", 129 | "SubCategory" => "Tablets", 130 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 131 | },%{ 132 | "Category" => "Computers", 133 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 134 | "Name" => "Aspire E1-572G", 135 | "Price" => "$581.99", 136 | "Description" => "15.6\", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1", 137 | "SubCategory" => "Laptops", 138 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 139 | },%{ 140 | "Category" => "Phones", 141 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 142 | "Name" => "Nokia X", 143 | "Price" => "$109.99", 144 | "Description" => "Andoid, Jolla dualboot", 145 | "SubCategory" => "Touch", 146 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 147 | },%{ 148 | "Category" => "Phones", 149 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 150 | "Name" => "LG Optimus", 151 | "Price" => "$57.99", 152 | "Description" => "3.2\" screen", 153 | "SubCategory" => "Touch", 154 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 155 | },%{ 156 | "Category" => "Computers", 157 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 158 | "Name" => "IdeaTab A3500L", 159 | "Price" => "$88.99", 160 | "Description" => "Black, 7\" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2", 161 | "SubCategory" => "Tablets", 162 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 163 | },%{ 164 | "Category" => "Computers", 165 | "Category-href" => 
"http://webscraper.io/test-sites/e-commerce/static/computers", 166 | "Name" => "Galaxy Tab 3", 167 | "Price" => "$97.99", 168 | "Description" => "7\", 8GB, Wi-Fi, Android 4.2, White", 169 | "SubCategory" => "Tablets", 170 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/tablets" 171 | },%{ 172 | "Category" => "Computers", 173 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 174 | "Name" => "HP 350 G1", 175 | "Price" => "$577.99", 176 | "Description" => "15.6\", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows", 177 | "SubCategory" => "Laptops", 178 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 179 | },%{ 180 | "Category" => "Phones", 181 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/phones", 182 | "Name" => "Nokia 123", 183 | "Price" => "$24.99", 184 | "Description" => "7 day battery", 185 | "SubCategory" => "Touch", 186 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/phones/touch" 187 | },%{ 188 | "Category" => "Computers", 189 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 190 | "Name" => "HP 250 G3", 191 | "Price" => "$520.99", 192 | "Description" => "15.6\", Core i5-4210U, 4GB, 500GB, Windows 8.1", 193 | "SubCategory" => "Laptops", 194 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 195 | },%{ 196 | "Category" => "Computers", 197 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 198 | "Name" => "Aspire E1-510", 199 | "Price" => "$306.99", 200 | "Description" => "15.6\", Pentium N3520 2.16GHz, 4GB, 500GB, Linux", 201 | "SubCategory" => "Laptops", 202 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 203 | },%{ 204 | "Category" => "Computers", 205 | "Category-href" => "http://webscraper.io/test-sites/e-commerce/static/computers", 206 | "Name" => "Packard 255 G2", 207 | "Price" => "$416.99", 208 | "Description" => "15.6\", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1", 209 | "SubCategory" => "Laptops", 210 | "SubCategory-href" => "http://webscraper.io/test-sites/e-commerce/static/computers/laptops" 211 | }] 212 | """ 213 | 214 | @type item :: [ property ] 215 | @type property :: { key, value } 216 | @type key :: binary 217 | @type value :: binary 218 | @type rule :: %{key => value} 219 | 220 | alias Scrapex.GenSpider 221 | alias GenSpider.Response 222 | import Scrapex.Selector 223 | use GenSpider 224 | 225 | require Logger 226 | 227 | # Client 228 | def start_link(sitemap = %{"startUrl" => url}) when is_binary(url) do 229 | start_link(%{sitemap | "startUrl" => [url]}) 230 | end 231 | def start_link(sitemap = %{"startUrl" => urls}) when is_list(urls) do 232 | opts = [ 233 | urls: urls, 234 | interval: 3600] 235 | 236 | GenSpider.start_link(__MODULE__, sitemap, opts) 237 | end 238 | 239 | def export(spider, format \\ nil) do 240 | GenSpider.export(spider, format) 241 | end 242 | 243 | # Server (callbacks) 244 | 245 | def init(%{"selectors"=> rules}) do 246 | {:ok, rules} 247 | end 248 | 249 | def start_requests(urls, rules) do 250 | requests = urls 251 | |> Enum.map(fn(url) -> 252 | GenSpider.request(url, &parse(&1, rules)) 253 | end) 254 | {:ok, requests, rules} 255 | end 256 | 257 | def parse(response, rules) do 258 | by_parent = group_by_parents(rules) 259 | 260 | results 261 | = parse_level(response, "_root", by_parent) 262 | # @return: [ item ] 263 | |> 
Enum.map(&Enum.into(&1, %{})) 264 | 265 | {:ok, results} 266 | end 267 | 268 | @spec parse_level(binary, binary, %{key => [rule]}) :: [item] 269 | defp parse_level(response, parent, rule_groups) do 270 | body = response.body 271 | rules = (rule_groups[parent] || []) 272 | 273 | rules 274 | |> Enum.map(fn 275 | (rule = %{"type" => "SelectorGroup"}) -> 276 | # For SelectorGroup, we collect all values into a list. 277 | # Note: This is different from WebScraper.IO extension. 278 | key = rule["id"] 279 | attribute = rule["extractAttribute"] || "text" 280 | values = select(body, rule["selector"]) |> extract(attribute) 281 | [[{key, values}]] 282 | (rule) -> 283 | key = rule["id"] 284 | multiple? = rule["multiple"] 285 | selectors = select(body, rule["selector"]) 286 | selectors = if multiple?, do: selectors, else: Enum.take(selectors, 1) 287 | Logger.debug("Selecting #{rule["selector"]} into:") 288 | selectors 289 | |> Enum.map(fn(selector) -> 290 | [value] = extract(selector, "text") 291 | result = [[{key, value}]] 292 | 293 | Logger.debug("Parse response with #{rule["type"]}: #{rule["selector"]}") 294 | # For each key-value pair, return into a list, with 295 | # new key-value pair(s) if rule's selector is a link. 296 | case {rule["type"], rule_groups[key]} do 297 | {"SelectorText", _} -> 298 | case Regex.compile(rule["regex"] || "") do 299 | {:error, _reason} -> result 300 | {:ok, ~r//} -> result 301 | {:ok, regex} -> 302 | case Regex.run(regex, value) do 303 | [value|_] -> [[{key, value}]] 304 | _ -> [[{key, nil}]] 305 | end 306 | end 307 | {"SelectorLink", nil} -> 308 | # Link with no child rule just returns the text value 309 | result 310 | {"SelectorLink", _} -> 311 | [href] = extract(selector, "href") 312 | url = Response.url_join(response, href) 313 | 314 | request = GenSpider.request(url, fn(response) -> 315 | # Get sub nodes as a tuple list. 316 | parse_level(response, rule["id"], rule_groups) 317 | end) 318 | subvalues = GenSpider.await(request) 319 | # @return [ item ] 320 | combine(result, subvalues) 321 | {"SelectorElement", nil} -> 322 | # Don't return SelectorElement in result 323 | [] 324 | {"SelectorElement", _} -> 325 | # Only use the results scraped from children rules. 
326 | parse_level(%{body: selector}, key, rule_groups) 327 | {"SelectorElementAttribute", _} -> 328 | [value] = extract(selector, rule["extractAttribute"]) 329 | [[{key, value}]] 330 | _ -> 331 | result 332 | end 333 | end) 334 | # @return [ item ] 335 | |> Enum.concat 336 | end) 337 | |> Enum.reduce(&combine/2) 338 | end 339 | 340 | @spec combine([item], [item]) :: [item] 341 | defp combine([], right), do: right 342 | defp combine(left, []), do: left 343 | defp combine(left, right) do 344 | for litem <- left, ritem <- right, do: Enum.concat(litem, ritem) 345 | end 346 | 347 | @spec group_by_parents([rule], binary) :: %{key => [rule]} 348 | defp group_by_parents(selectors, key \\ "parentSelectors") do 349 | Enum.reduce(selectors, %{}, fn(selector, groups) -> 350 | Enum.reduce(selector[key], groups, fn(parent, groups) -> 351 | Dict.update(groups, parent, [selector], &[selector|&1]) 352 | end) 353 | end) 354 | end 355 | end -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/logo.png -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Mixfile do 2 | use Mix.Project 3 | 4 | @version "0.5.2" 5 | 6 | def project do 7 | [app: :scrapex, 8 | version: @version, 9 | name: "Scrapex", 10 | description: """ 11 | An open source and collaborative framework for extracting the data 12 | you need from websites. In a fast, simple, yet extensible way. 13 | """, 14 | source_url: "https://bitbucket.org/inhuman/scrapex", 15 | homepage_url: "https://bitbucket.org/inhuman/scrapex/overview", 16 | elixir: "~> 1.2", 17 | escript: [main_module: Scrapex], 18 | build_embedded: Mix.env == :prod, 19 | start_permanent: Mix.env == :prod, 20 | deps: deps, 21 | package: package, 22 | docs: [source_ref: "v#{@version}", 23 | logo: "logo.png", 24 | extras: ["README.md"]]] 25 | end 26 | 27 | # Configuration for the OTP application 28 | # 29 | # Type `mix help compile.app` for more information 30 | def application do 31 | [applications: [:logger, :httpoison]] 32 | end 33 | 34 | # Dependencies can be Hex packages: 35 | # 36 | # {:mydep, "~> 0.3.0"} 37 | # 38 | # Or git/path repositories: 39 | # 40 | # {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"} 41 | # 42 | # Type `mix help deps` for more examples and options 43 | defp deps do 44 | [ 45 | {:httpoison, "~> 0.7"}, 46 | {:floki, "~> 0.7.0"}, 47 | {:poison, "~> 1.4.0"}, 48 | {:csv, "~> 1.2.1"}, 49 | 50 | # Docs dependencies 51 | {:earmark, "~> 0.1", only: :dev}, 52 | {:ex_doc, "~> 0.10", only: :dev} 53 | ] 54 | end 55 | 56 | defp package do 57 | [contributors: ["Son Tran-Nguyen"], 58 | licenses: ["MIT"], 59 | links: %{bitbucket: "https://bitbucket.org/inhuman/scrapex"}, 60 | files: ~w(lib priv test) ++ 61 | ~w(CHANGELOG.md LICENSE mix.exs package.json README.md)] 62 | end 63 | end -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{"csv": {:hex, :csv, "1.2.1", "9a249e1e9fddb4f34bfc2bcf2bfb43bff3aa62a55f807c72cb2249b1e3914ae9", [:mix], []}, 2 | "earmark": {:hex, :earmark, "0.1.19", "ffec54f520a11b711532c23d8a52b75a74c09697062d10613fa2dbdf8a9db36e", [:mix], []}, 3 | "ex_doc": {:hex, 
:ex_doc, "0.10.0", "f49c237250b829df986486b38f043e6f8e19d19b41101987f7214543f75947ec", [:mix], [{:earmark, "~> 0.1.17 or ~> 0.2", [hex: :earmark, optional: true]}]}, 4 | "floki": {:hex, :floki, "0.7.0", "52eb235995f9040dee7e2d09dd24e675f1ab02311528ff118d76baef94926f71", [:mix], [{:mochiweb, "~> 2.12.2", [hex: :mochiweb, optional: false]}]}, 5 | "hackney": {:hex, :hackney, "1.3.2", "43bd07ab88753f5e136e38fddd2a09124bee25733b03361eeb459d0173fc17ab", [:rebar, :make], [{:ssl_verify_hostname, "~> 1.0.5", [hex: :ssl_verify_hostname, optional: false]}, {:idna, "~> 1.0.2", [hex: :idna, optional: false]}]}, 6 | "httpoison": {:hex, :httpoison, "0.7.4", "053fa5420c9a2f7792ab49c9963ce67ede8b81dd9a1d0a7123cce54028deeb05", [:mix], [{:hackney, "~> 1.3.1", [hex: :hackney, optional: false]}]}, 7 | "idna": {:hex, :idna, "1.0.2", "397e3d001c002319da75759b0a81156bf11849c71d565162436d50020cb7265e", [:make], []}, 8 | "mochiweb": {:hex, :mochiweb, "2.12.2", "80804ad342afa3d7f3524040d4eed66ce74b17a555de454ac85b07c479928e46", [:make, :rebar], []}, 9 | "poison": {:hex, :poison, "1.4.0", "cd5afb9db7f0d19487572fa28185b6d4de647f14235746824e77b3139b79b725", [:mix], []}, 10 | "ssl_verify_hostname": {:hex, :ssl_verify_hostname, "1.0.5", "2e73e068cd6393526f9fa6d399353d7c9477d6886ba005f323b592d389fb47be", [:make], []}} 11 | -------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Scraper 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 21 | 22 | 23 | 24 | 25 | 26 | 63 | 64 |
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers category"
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    ThinkPad X240 | $1311.99 | 12.5", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro 64bit | 8 reviews
    Galaxy Tab | $251.99 | 16GB, White | 8 reviews
    Memo Pad HD 7 | $101.99 | IPS, Dual-Core 1.2GHz, 8GB, Android 4.3 | 8 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/index_files/cart2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sntran/scrapex/0b8e1db6cf24f3d98e644b03479af9a7c304b6a8/test/sample_pages/e-commerce/static/computers/index_files/cart2.png -------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/laptops/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers / Laptops"
  products (name | price | description | reviews):
    Aspire E1-510 | $306.99 | 15.6", Pentium N3520 2.16GHz, 4GB, 500GB, Linux | 9 reviews
    Packard 255 G2 | $416.99 | 15.6", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1 | 2 reviews
    HP 250 G3 | $520.99 | 15.6", Core i5-4210U, 4GB, 500GB, Windows 8.1 | 8 reviews
    HP 350 G1 | $577.99 | 15.6", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows | 4 reviews
    Aspire E1-572G | $581.99 | 15.6", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1 | 7 reviews
    Pavilion | $609.99 | 15.6", Core i5-4200U, 6GB, 750GB, Windows 8.1 | 3 reviews
  pagination: « 1 2 3 »]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/computers/tablets/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Computers / Tablets"
  products (name | price | description | reviews):
    Lenovo IdeaTab | $69.99 | 7" screen, Android | 13 reviews
    IdeaTab A3500L | $88.99 | Black, 7" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2 | 15 reviews
    Acer Iconia | $96.99 | 7" screen, Android, 16GB | 2 reviews
    Galaxy Tab 3 | $97.99 | 7", 8GB, Wi-Fi, Android 4.2, White | 7 reviews
    Iconia B1-730HD | $99.99 | Black, 7", 1.6GHz Dual-Core, 8GB, Android 4.4 | 15 reviews
    Memo Pad HD 7 | $101.99 | IPS, Dual-Core 1.2GHz, 8GB, Android 4.3 | 3 reviews
  pagination: « 1 2 3 4 »]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "E-commerce training site"
  intro: "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale."
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    LG Optimus | $57.99 | 3.2" screen | 8 reviews
    iPad Mini Retina | $537.99 | Wi-Fi + Cellular, 32GB, Silver | 13 reviews
    MeMO Pad 7 | $130.99 | White, 7", Atom 1.2GHz, 8GB, Android 4.4 | 15 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/phones/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Phones category"
  tagline: "Top items being scraped right now"
  products (name | price | description | reviews):
    Ubuntu Edge | $499.99 | Sapphire glass | 14 reviews
    Iphone | $899.99 | Black | 7 reviews
    LG Optimus | $57.99 | 3.2" screen | 12 reviews]
-------------------------------------------------------------------------------- /test/sample_pages/e-commerce/static/phones/touch/index.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Phones / Touch"
  products (name | price | description | reviews):
    Nokia 123 | $24.99 | 7 day battery | 10 reviews
    LG Optimus | $57.99 | 3.2" screen | 9 reviews
    Samsung Galaxy | $93.99 | 5 mpx. Android 5.0 | 7 reviews
    Nokia X | $109.99 | Andoid, Jolla dualboot | 13 reviews
    Sony Xperia | $118.99 | GPS, waterproof | 8 reviews
    Ubuntu Edge | $499.99 | Sapphire glass | 5 reviews
  pagination: « 1 2 »]
-------------------------------------------------------------------------------- /test/sample_pages/example.com.html: --------------------------------------------------------------------------------
[HTML markup lost in extraction; only the text content of this fixture is recoverable:
  heading: "Example Domain"
  body: "This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission."
  link: "More information..."]
49 | 50 | -------------------------------------------------------------------------------- /test/scrapex/gen_spider_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.GenSpiderTest do 2 | use ExUnit.Case 3 | alias Scrapex.GenSpider 4 | doctest GenSpider 5 | 6 | @example_com "http://localhost:9090/example.com.html" 7 | @ecommerce_site "http://localhost:9090/e-commerce/static/index.html" 8 | @opts [urls: [@example_com]] 9 | 10 | test "a spider is a process" do 11 | defmodule GoodSpider do 12 | use GenSpider 13 | # GenSpider callbacks 14 | def init(args) do 15 | {:ok, args} 16 | end 17 | end 18 | 19 | {:ok, pid} = GenSpider.start_link(GoodSpider, []) 20 | assert is_pid(pid) 21 | 22 | {:ok, pid} = GenSpider.start(GoodSpider, []) 23 | assert is_pid(pid) 24 | end 25 | 26 | test "spider is based on GenServer" do 27 | defmodule EmoSpider do 28 | # GenSpider callbacks 29 | def init(_args) do 30 | :ignore 31 | end 32 | end 33 | 34 | defmodule BadSpider do 35 | # GenSpider callbacks 36 | def init(_args) do 37 | {:stop, :stop} 38 | end 39 | end 40 | 41 | assert :ignore == GenSpider.start(EmoSpider, []) 42 | assert {:error, :stop} == GenSpider.start(BadSpider, []) 43 | end 44 | 45 | test "default spider" do 46 | defmodule DoNothingSpider do 47 | use GenSpider 48 | end 49 | {:ok, pid} = GenSpider.start(DoNothingSpider, []) 50 | assert is_pid(pid) 51 | end 52 | 53 | test "should start the crawling immediately" do 54 | defmodule TestSpider do 55 | use GenSpider 56 | 57 | def init(tester) do 58 | {:ok, tester} 59 | end 60 | 61 | def start_requests(_urls, tester) do 62 | send tester, :start_requests 63 | {:ok, [], tester} 64 | end 65 | 66 | end 67 | 68 | GenSpider.start(TestSpider, self, @opts) 69 | 70 | assert_receive(:start_requests, 500) 71 | end 72 | 73 | test "should get the HTML of the start URL(s)" do 74 | defmodule HTMLSpider do 75 | use GenSpider 76 | 77 | def init(tester) do 78 | {:ok, tester} 79 | end 80 | 81 | def start_requests(urls, tester) do 82 | requests = urls 83 | |> Enum.map(&make_requests_from_url(&1, tester)) 84 | {:ok, requests, tester} 85 | end 86 | 87 | defp make_requests_from_url(url, tester) do 88 | GenSpider.request(url, fn(response) -> 89 | send tester, {:test_result, response.body} 90 | end) 91 | end 92 | 93 | end 94 | GenSpider.start(HTMLSpider, self, @opts) 95 | 96 | assert_receive({:test_result, actual}, 500) 97 | expected = HTTPoison.get!("http://localhost:9090/example.com.html").body 98 | assert actual === expected 99 | end 100 | 101 | test "can export data" do 102 | defmodule FastSpider do 103 | use GenSpider 104 | 105 | def start_requests(urls, tester) do 106 | requests = urls 107 | |> Enum.map(&make_requests_from_url(&1, tester)) 108 | {:ok, requests, tester} 109 | end 110 | 111 | defp make_requests_from_url(url, tester) do 112 | GenSpider.request(url, fn(response) -> 113 | send tester, {:test_result, response.body} 114 | parse(response) 115 | end) 116 | end 117 | 118 | end 119 | {:ok, spider} = GenSpider.start(FastSpider, self, @opts) 120 | 121 | assert_receive({:test_result, _}, 5000) 122 | # Assume that the spider, which requested to the same URL, should 123 | # have finished before our request below. 
124 |     expected = HTTPoison.get!("http://localhost:9090/example.com.html").body
125 |     assert [expected] == GenSpider.export(spider)
126 | 
127 |   end
128 | 
129 |   defmodule Spider do
130 |     use GenSpider
131 | 
132 |     def start_requests(urls, tester) do
133 |       requests = urls
134 |       |> Enum.map(&make_requests_from_url(&1, tester))
135 |       {:ok, requests, tester}
136 |     end
137 | 
138 |     defp make_requests_from_url(url, tester) do
139 |       GenSpider.request(url, fn(response) ->
140 |         data = parse(response)
141 |         send tester, {:test_result, response.body}
142 |         data
143 |       end)
144 |     end
145 | 
146 |     def parse(response) do
147 |       uuid = :crypto.strong_rand_bytes(8) |> Base.encode16
148 |       {:ok, [uuid <> response.body]}
149 |     end
150 | 
151 |   end
152 | 
153 |   test "can run on schedule" do
154 |     opts = [urls: @opts[:urls], interval: 500]
155 |     GenSpider.start(Spider, self, opts)
156 | 
157 |     assert_receive({:test_result, _}, 300)
158 |     # Give time for the spider to crawl.
159 |     :timer.sleep(50)
160 |     assert_receive({:test_result, _}, 500)
161 |   end
162 | 
163 |   test "new data will replace old data" do
164 |     opts = [urls: @opts[:urls], interval: 500]
165 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
166 | 
167 |     assert_receive({:test_result, _old}, 300)
168 |     [old] = GenSpider.export(spider)
169 |     <<old_uuid :: binary-size(16), _ :: binary>> = old
170 |     # Give time for the spider to crawl.
171 |     :timer.sleep(50)
172 |     assert_receive({:test_result, _new}, 500)
173 |     [new] = GenSpider.export(spider)
174 |     <<new_uuid :: binary-size(16), _ :: binary>> = new
175 |     assert new_uuid !== old_uuid
176 |   end
177 | 
178 |   test "multiple URLs should replace old data with merged new data" do
179 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ], interval: 500]
180 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
181 | 
182 |     assert_receive({:test_result, _old}, 1500)
183 |     assert_receive({:test_result, _old}, 1500)
184 | 
185 |     old = GenSpider.export(spider)
186 | 
187 |     assert_receive({:test_result, _new}, 1500)
188 |     assert_receive({:test_result, _new}, 1500)
189 | 
190 |     GenSpider.export(spider)
191 |     |> Enum.with_index
192 |     |> Enum.each(fn({data, index}) ->
193 |       <<old_uuid :: binary-size(16), _ :: binary>> = Enum.at(old, index)
194 |       <<new_uuid :: binary-size(16), _ :: binary>> = data
195 |       assert new_uuid !== old_uuid
196 |     end)
197 |   end
198 | 
199 |   defmodule MapSpider do
200 |     use GenSpider
201 | 
202 |     def start_requests(urls, tester) do
203 |       requests = urls
204 |       |> Enum.map(&make_requests_from_url(&1, tester))
205 |       {:ok, requests, tester}
206 |     end
207 | 
208 |     defp make_requests_from_url(url, tester) do
209 |       spider = self()
210 |       GenSpider.request(url, fn(response) ->
211 |         {:ok, result} = parse(response)
212 |         case tester.(result, spider) do
213 |           {:stop, reason} ->
214 |             {:stop, reason}
215 |           {:test_result, result} ->
216 |             {:ok, result}
217 |         end
218 |       end)
219 |     end
220 | 
221 |     def parse(response) do
222 |       result = [%{"body" => response.body}]
223 |       {:ok, result}
224 |     end
225 | 
226 |   end
227 | 
228 |   test "returned map can be exported to json" do
229 |     tester = self
230 |     callback = fn(result, _) ->
231 |       send(tester, {:test_result, result})
232 |     end
233 |     {:ok, spider} = GenSpider.start(MapSpider, callback, @opts)
234 | 
235 |     assert_receive({:test_result, result}, 300)
236 |     json = GenSpider.export(spider, :json)
237 |     assert is_binary(json)
238 |     assert json == Poison.encode!(result)
239 |   end
240 | 
241 |   test "can export using an encoder" do
242 |     tester = self
243 |     callback = fn(result, _) ->
244 |       send(tester, {:test_result, result})
245 |     end
246 |     {:ok, spider} = GenSpider.start(MapSpider, callback, @opts)
247 | 
248 |     assert_receive({:test_result, result}, 300)
249 |     json = GenSpider.export(spider, &Poison.encode!/1)
250 |     assert is_binary(json)
251 |     assert json == Poison.encode!(result)
252 |   end
253 | 
254 |   test "will await for data to export" do
255 |     tester = self
256 |     callback = fn(result, _) ->
257 |       send(tester, {:test_result, result})
258 |     end
259 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
260 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
261 | 
262 |     # Since we can export immediately after starting the spider, it
263 |     # will need to await the data.
264 |     data = GenSpider.export(spider)
265 | 
266 |     actual =
267 |       opts[:urls]
268 |       |> Enum.map(&(%{"body" => HTTPoison.get!(&1).body}))
269 |     assert actual === data
270 |   end
271 | 
272 |   test "will export partial or no data if spider returns stop" do
273 |     tester = self
274 |     first_response = HTTPoison.get!(@ecommerce_site).body
275 |     callback = fn(result = [%{"body" => response}], _) ->
276 |       case response do
277 |         ^first_response ->
278 |           send tester, {:test_result, result}
279 |         _ ->
280 |           {:stop, :test}
281 |       end
282 |     end
283 | 
284 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
285 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
286 | 
287 |     data = GenSpider.export(spider)
288 |     assert [%{"body" => first_response}] === data
289 |   end
290 | 
291 |   test "stop the spider when the callback returns stop" do
292 |     tester = self
293 |     first_response = HTTPoison.get!(@ecommerce_site).body
294 |     callback = fn(result = [%{"body" => response}], _) ->
295 |       case response do
296 |         ^first_response ->
297 |           send tester, {:test_result, result}
298 |         _ ->
299 |           {:stop, :test}
300 |       end
301 |     end
302 | 
303 |     opts = [urls: [ @ecommerce_site | @opts[:urls] ]]
304 |     {:ok, spider} = GenSpider.start(MapSpider, callback, opts)
305 | 
306 |     _data = GenSpider.export(spider)
307 |     # Let the spider stop.
308 |     :timer.sleep(100)
309 |     refute Process.alive?(spider)
310 |   end
311 | 
312 |   test "can request fresh data regardless of timer" do
313 |     opts = [urls: @opts[:urls], interval: 60000]
314 |     {:ok, spider} = GenSpider.start(Spider, self, opts)
315 |     # First export is always fresh, and same as next export.
316 |     [old] = GenSpider.export(spider)
317 |     assert [old] === GenSpider.export(spider)
318 |     <<old_uuid :: binary-size(16), _ :: binary>> = old
319 | 
320 |     [new] = GenSpider.export(spider, nil, true)
321 |     <<new_uuid :: binary-size(16), _ :: binary>> = new
322 |     assert new_uuid !== old_uuid
323 |   end
324 | 
325 |   test "can request for links during parsing" do
326 |     # Instead of returning the parsed data, the `parse` function
327 |     # can return an async task, which will be awaited and merged
328 |     # into the data.
329 | 
330 |     # Since this test is made without knowledge of the selector engine,
331 |     # we simply request another URL and return that body instead.
332 |     callback = fn(_, _) ->
333 |       # The final callback will send the test result to this test process,
334 |       # but also return that tuple, which is what `GenSpider.await/1`
335 |       # returns.
336 |       # `GenSpider.request/2` returns an asynchronous task.
337 |       request = GenSpider.request(@ecommerce_site, fn
338 |         (response) -> {:test_result, [response.body]}
339 |       end)
340 |       # That task can be awaited.
341 | {:test_result, body} = GenSpider.await(request) 342 | {:test_result, body} 343 | end 344 | 345 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 346 | [data] = GenSpider.export(spider) 347 | assert data === HTTPoison.get!(@ecommerce_site).body 348 | end 349 | 350 | test "parse function can return an async request" do 351 | callback = fn(_what, spider) -> 352 | request = GenSpider.request(@ecommerce_site, fn 353 | (response) -> [response.body] 354 | end, spider) 355 | {:test_result, request} 356 | end 357 | 358 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 359 | [data] = GenSpider.export(spider) 360 | assert data === HTTPoison.get!(@ecommerce_site).body 361 | end 362 | 363 | test "parse function can return multiple async requests" do 364 | # Can be used to follow multiple links on a page. 365 | # Results will be concatenated. 366 | urls = [ @ecommerce_site | @opts[:urls] ] 367 | 368 | callback = fn(_, spider) -> 369 | requests = 370 | urls 371 | |> Enum.map(fn(url) -> 372 | GenSpider.request(url, fn 373 | (response) -> [response.body] 374 | end, spider) 375 | end) 376 | {:test_result, requests} 377 | end 378 | 379 | {:ok, spider} = GenSpider.start(MapSpider, callback, @opts) 380 | data = GenSpider.export(spider) 381 | 382 | actual = 383 | urls 384 | |> Enum.map(&(HTTPoison.get!(&1).body)) 385 | 386 | assert data === actual 387 | end 388 | 389 | test "should follow redirect" do 390 | url = "http://localhost:9090/e-commerce/static" 391 | opts = [urls: [url]] 392 | tester = self 393 | callback = fn(result, _) -> 394 | send(tester, {:test_result, result}) 395 | end 396 | 397 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 398 | [%{"body" => data}] = GenSpider.export(spider) 399 | 400 | assert data === HTTPoison.get!(url <> "/index.html").body 401 | end 402 | 403 | test "should stop after first crawl if no interval set" do 404 | url = "http://localhost:9090/e-commerce/static" 405 | opts = [urls: [url]] 406 | tester = self 407 | callback = fn(result, _) -> 408 | send(tester, {:test_result, result}) 409 | end 410 | 411 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 412 | # The spider will stop after a specific timeout when it's done scraping. 413 | # We call `export` immediately so it will timeout after that. 414 | [%{"body" => data}] = GenSpider.export(spider) 415 | refute Process.alive?(spider) 416 | end 417 | 418 | test "should also stop even when not exporting" do 419 | url = "http://localhost:9090/e-commerce/static" 420 | opts = [urls: [url]] 421 | tester = self 422 | callback = fn(result, _) -> 423 | send(tester, {:test_result, result}) 424 | end 425 | 426 | {:ok, spider} = GenSpider.start(MapSpider, callback, opts) 427 | # The spider will stop after a specific timeout when it's done scraping. 428 | assert_receive({:test_result, result}, 300) 429 | # Allow some time for the spider to stop. 
:timer.sleep(500)
431 |     refute Process.alive?(spider)
432 |   end
433 | 
434 |   defmodule StreamSpider do
435 |     use GenSpider
436 | 
437 |     def start_requests(_urls, callback) do
438 |       stream = Stream.resource(
439 |         _start = fn() ->
440 |           ["http://localhost:9090/e-commerce/static"]
441 |         end,
442 |         _next = fn
443 |           ([]) -> {:halt, []}
444 |           ([url|urls]) ->
445 |             {[callback.(url)], urls}
446 |         end,
447 |         _after = fn(_) -> end
448 |       )
449 | 
450 |       {:ok, stream, callback}
451 |     end
452 |   end
453 | 
454 |   test "can take a stream instead of requests list" do
455 |     tester = self
456 |     callback = fn(url) ->
457 |       GenSpider.request(url, fn(response) ->
458 |         send(tester, {:ok, [response]})
459 |       end)
460 |     end
461 |     GenSpider.start(StreamSpider, callback, [])
462 |     assert_receive({:ok, _new}, 500)
463 |   end
464 | end
465 | 
-------------------------------------------------------------------------------- /test/scrapex/selector_test.exs: --------------------------------------------------------------------------------
1 | defmodule Scrapex.SelectorTest do
2 |   use ExUnit.Case, async: true
3 |   import Scrapex.Selector
4 | 
5 |   setup_all do
6 |     url = "http://localhost:9090/e-commerce/static/index.html"
7 |     html = HTTPoison.get!(url).body
8 | 
9 |     # No metadata
10 |     {:ok, url: url, body: html}
11 |   end
12 | 
13 |   test "parse CSS selector", context do
14 |     [href] = context.body
15 |     |> select("a.navbar-brand")
16 |     |> extract("href")
17 | 
18 |     assert href === "/"
19 |   end
20 | 
21 |   test "select text content", context do
22 |     [h1] = context.body
23 |     |> select("h1")
24 |     |> extract("text")
25 | 
26 |     assert h1 === "E-commerce training site"
27 |   end
28 | 
29 |   test "default to get content", context do
30 |     [h1] = context.body
31 |     |> select("h1")
32 |     |> extract()
33 | 
34 |     assert h1 === "E-commerce training site"
35 |   end
36 | 
37 |   test "select text content and children content", context do
38 |     link_texts = context.body
39 |     |> select("a.category-link")
40 |     |> extract()
41 | 
42 |     assert link_texts === ["Computers", "Phones"]
43 |   end
44 | 
45 |   test "strip all Unicode whitespaces", context do
46 |     [p] = context.body
47 |     |> select(".jumbotron p")
48 |     |> extract()
49 | 
50 |     assert p === "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale."
51 |   end
52 | 
53 |   # TESTS FOR ENUMERABLE
54 | 
55 |   test "can be enumerable", context do
56 |     selectors = select(context.body, "a.category-link")
57 |     # Of course you can enumerate extracted values
58 |     categories = extract(selectors)
59 |     |> Enum.map(&(&1))
60 | 
61 |     assert categories == Enum.map(selectors, fn(selector) ->
62 |       [value] = extract(selector)
63 |       value
64 |     end)
65 |   end
66 | 
67 |   test "a single selector can still be enumerable", context do
68 |     selectors = select(context.body, "a.category-link")
69 |     # Of course you can enumerate extracted values
70 |     categories = extract(selectors)
71 |     |> Enum.map(&(&1))
72 | 
73 |     selectors = select(context.body, "h1")
74 |     expected = ["E-commerce training site"]
75 | 
76 |     assert expected == Enum.map(selectors, fn(selector) ->
77 |       [value] = extract(selector)
78 |       value
79 |     end)
80 |   end
81 | end
-------------------------------------------------------------------------------- /test/scrapex/spider/example_test.exs: --------------------------------------------------------------------------------
1 | defmodule Scrapex.Spider.ExampleTest do
2 |   use ExUnit.Case
3 | 
4 |   alias Scrapex.GenSpider
5 |   alias Spider.Example
6 |   import Scrapex.Selector
7 | 
8 |   defmodule Example do
9 |     use GenSpider
10 | 
11 |     # Client
12 |     def start_link(parser) do
13 |       opts = [
14 |         urls: ["http://localhost:9090/e-commerce/static/index.html"]]
15 |       GenSpider.start_link(__MODULE__, parser, opts)
16 |     end
17 | 
18 |     def export(spider) do
19 |       GenSpider.export(spider)
20 |     end
21 | 
22 |     # Server (callbacks)
23 | 
24 |     def init(parser) do
25 |       {:ok, parser}
26 |     end
27 | 
28 |     def parse(response, parser) do
29 |       results = parser.(response)
30 |       {:ok, results, parser}
31 |     end
32 |   end
33 | 
34 |   def parse_product(response) do
35 |     response.body
36 |     |> select(".thumbnail")
37 |     |> Enum.map(fn(selector) ->
38 |       [name] = selector |> select(".title") |> extract
39 |       [description] = selector |> select(".description") |> extract
40 |       [price] = selector |> select(".price") |> extract
41 | 
42 |       %{"name" => name, "description" => description, "price" => price}
43 |     end)
44 |   end
45 | 
46 |   test "get data on page" do
47 |     {:ok, spider} = Example.start_link(&parse_product/1)
48 |     results = Example.export(spider)
49 |     assert length(results) === 3
50 |   end
51 | 
52 |   test "can follow links" do
53 |     parser = fn(response) ->
54 |       response.body
55 |       |> select("#side-menu .category-link")
56 |       |> Enum.flat_map(fn(anchor) ->
57 |         [href] = anchor |> extract("href")
58 |         full_url = GenSpider.Response.url_join(response, href) <> "/index.html"
59 |         [category] = anchor |> extract()
60 | 
61 |         GenSpider.request(full_url, fn({:ok, response}) ->
62 |           parse_product(response)
63 |         end)
64 |         |> GenSpider.await()
65 |         |> Enum.map(&Map.put(&1, "category", category))
66 |       end)
67 |     end
68 | 
69 |     {:ok, spider} = Example.start_link(parser)
70 |     results = Example.export(spider)
71 |     assert length(results) === 6
72 |   end
73 | end
-------------------------------------------------------------------------------- /test/scrapex/spider/webscraper.csv: --------------------------------------------------------------------------------
1 | Category,Category-href,Name,Price,Description,SubCategory,SubCategory-href,Page,Page-href
2 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note 10.1","$587.99","10.1"", 32GB, Black","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4"
3 | 
"Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 3","$97.99","7"", 8GB, Wi-Fi, Android 4.2, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 4 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad Yoga","$1223.99","12.5"" Touch, Core i5 4200U, 8GB, 500GB + 16GB SSD Cache, Windows","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 5 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ProBook","$739.99","14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 6 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","iPad Mini Retina","$537.99","Wi-Fi + Cellular, 32GB, Silver","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4" 7 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","Silver","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 8 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab","$251.99","16GB, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 9 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad X230","$1244.99","12.5"", Core i5 2.6GHz, 8GB, 180GB SSD, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 10 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Memo Pad HD 7","$101.99","IPS, Dual-Core 1.2GHz, 8GB, Android 4.3","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 11 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Aspire E1-572G","$581.99","15.6"", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 12 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Asus MeMO Pad","$102.99","7"" screen, Android, 8GB","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 13 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","LG Optimus","$57.99","3.2"" screen","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 14 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Lenovo IdeaTab","$69.99","7"" screen, Android","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 15 | 
"Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Acer Iconia","$96.99","7"" screen, Android, 16GB","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 16 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Aspire E1-510","$306.99","15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 17 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","HP 250 G3","$520.99","15.6"", Core i5-4210U, 4GB, 500GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 18 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A8-50","$121.99","Blue, 8"" IPS, Quad-Core 1.3GHz, 16GB, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 19 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Apple iPad Air","$603.99","Wi-Fi, 64GB, Silver","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","4","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/4" 20 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Amazon Kindle","$103.99","6"" screen, wifi","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 21 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Samsung Galaxy","$93.99","5 mpx. 
Android 5.0","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 22 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 4","$233.99","LTE (SM-T235), Quad-Core 1.2GHz, 8GB, Black","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 23 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad X240","$1311.99","12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","3","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/3" 24 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Nokia X","$109.99","Andoid, Jolla dualboot","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 25 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Dell XPS 13","$1281.99","13.3"" Touch, Core i5-4210U, 8GB, 128GB SSD, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","3","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/3" 26 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Inspiron 15","$745.99","Moon Silver, 15.6"", Core i7-4510U, 8GB, 1TB, Radeon HD R7 M265 2GB,","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 27 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note","$489.99","12.2"", 32GB, WiFi, Android 4.4, White","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 28 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Packard 255 G2","$416.99","15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 29 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","HP 350 G1","$577.99","15.6"", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 30 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Pavilion","$609.99","15.6"", Core i5-4200U, 6GB, 750GB, Windows 8.1","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","1","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/1" 31 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab S5000","$172.99","Silver, 7"" IPS, Quad-Core 1.2Ghz, 16GB, 3G, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 32 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad Yoga","$1033.99","12.5"" Touch, Core i3-4010U, 4GB, 500GB + 16GB SSD 
Cache,","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 33 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Nokia 123","$24.99","7 day battery","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 34 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Ubuntu Edge","$499.99","Sapphire glass","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 35 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Note","$399.99","10.1"", 3G, Android 4.0, Garnet Red","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 36 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","MeMO Pad 7","$130.99","White, 7"", Atom 1.2GHz, 8GB, Android 4.4","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 37 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A3500-H","$148.99","Blue, 7"" IPS, Quad-Core 1.3GHz, 8GB, 3G, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 38 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Iconia B1-730HD","$99.99","Black, 7"", 1.6GHz Dual-Core, 8GB, Android 4.4","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 39 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","ThinkPad T540p","$1178.99","15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit","Laptops","http://webscraper.io/test-sites/e-commerce/static/computers/laptops","2","http://webscraper.io/test-sites/e-commerce/static/computers/laptops/2" 40 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","Galaxy Tab 3","$107.99","7"", 8GB, Wi-Fi, Android 4.2, Yellow","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","2","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/2" 41 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","Black","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 42 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Sony Xperia","$118.99","GPS, waterproof","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","«","http://webscraper.io/test-sites/e-commerce/static/phones/touch/1" 43 | "Phones","http://webscraper.io/test-sites/e-commerce/static/phones","Iphone","$899.99","White","Touch","http://webscraper.io/test-sites/e-commerce/static/phones/touch","2","http://webscraper.io/test-sites/e-commerce/static/phones/touch/2" 44 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","MeMo PAD FHD 10","$320.99","White, 10.1"" IPS, 1.6GHz, 2GB, 16GB, Android 
4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","3","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/3" 45 | "Computers","http://webscraper.io/test-sites/e-commerce/static/computers","IdeaTab A3500L","$88.99","Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2","Tablets","http://webscraper.io/test-sites/e-commerce/static/computers/tablets","1","http://webscraper.io/test-sites/e-commerce/static/computers/tablets/1" 46 | -------------------------------------------------------------------------------- /test/scrapex/spider/webscraper_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Scrapex.Spider.WebScraperTest do 2 | use ExUnit.Case 3 | 4 | alias Scrapex.Spider.WebScraper 5 | 6 | @url "http://localhost:9090/e-commerce/static/index.html" 7 | @page_title "E-commerce training site" 8 | @description "Welcome to WebScraper e-commerce site. You can use this site for training to learn how to use the Web Scraper. Items listed here are not for sale." 9 | @categories ["Computers", "Phones"] 10 | @subcategories [["Laptops", "Tablets"], ["Phones"]] 11 | @home [23, 5, 39] 12 | @computers [21, 6, 8] 13 | @phones [32, 39, 11] 14 | 15 | setup_all do 16 | items = 17 | "test/scrapex/spider/webscraper.csv" 18 | |> File.stream! 19 | |> CSV.decode(headers: true) 20 | |> Enum.map(fn(row) -> 21 | row 22 | |> Map.delete("Category-href") 23 | |> Map.delete("SubCategory-href") 24 | |> Map.delete("Page-href") 25 | end) 26 | 27 | {:ok, items: items} 28 | end 29 | 30 | 31 | test "scrape single item" do 32 | selectors = [%{ 33 | "parentSelectors" => ["_root"], 34 | "type" => "SelectorText", 35 | "multiple" => false, 36 | "id" => "Page Title", 37 | "selector" => ".jumbotron h1", 38 | "delay" => "" 39 | }] 40 | 41 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 42 | 43 | {:ok, spider} = WebScraper.start_link(sitemap) 44 | [data] = WebScraper.export(spider) 45 | 46 | expected = %{ 47 | "Page Title" => "E-commerce training site" 48 | } 49 | assert data === expected 50 | end 51 | 52 | test "scrape multiple single items" do 53 | selectors = [%{ 54 | "parentSelectors" => ["_root"], 55 | "type" => "SelectorText", 56 | "multiple" => false, 57 | "id" => "Page Title", 58 | "selector" => ".jumbotron h1", 59 | "delay" => "" 60 | }, %{ 61 | "parentSelectors" => ["_root"], 62 | "type" => "SelectorText", 63 | "multiple" => false, 64 | "id" => "Main Description", 65 | "selector" => ".jumbotron p", 66 | "delay" => "" 67 | }] 68 | 69 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 70 | 71 | {:ok, spider} = WebScraper.start_link(sitemap) 72 | [data] = WebScraper.export(spider) 73 | 74 | expected = %{ 75 | "Page Title" => @page_title, 76 | "Main Description" => @description 77 | } 78 | assert data === expected 79 | end 80 | 81 | test "scrape multiple items" do 82 | selectors = [%{ 83 | "parentSelectors" => ["_root"], 84 | "type" => "SelectorText", 85 | "multiple" => true, 86 | "id" => "Category", 87 | "selector" => "a.category-link", 88 | "delay" => "" 89 | }] 90 | 91 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 92 | 93 | {:ok, spider} = WebScraper.start_link(sitemap) 94 | data = WebScraper.export(spider) 95 | 96 | expected = [%{ 97 | "Category" => "Computers" 98 | }, %{ 99 | "Category" => "Phones" 100 | }] 101 | assert data === expected 102 | end 103 | 104 | test "scrape both single and multiple items" do 105 | selectors = [%{ 106 | "parentSelectors" => ["_root"], 107 | "type" => 
"SelectorText", 108 | "multiple" => true, 109 | "id" => "Category", 110 | "selector" => "a.category-link", 111 | "delay" => "" 112 | }, %{ 113 | "parentSelectors" => ["_root"], 114 | "type" => "SelectorText", 115 | "multiple" => false, 116 | "id" => "Page Title", 117 | "selector" => ".jumbotron h1", 118 | "delay" => "" 119 | }] 120 | 121 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 122 | 123 | {:ok, spider} = WebScraper.start_link(sitemap) 124 | data = WebScraper.export(spider) 125 | 126 | expected = [%{ 127 | "Category" => "Computers", 128 | "Page Title" => "E-commerce training site" 129 | }, %{ 130 | "Category" => "Phones", 131 | "Page Title" => "E-commerce training site" 132 | }] 133 | assert data === expected 134 | end 135 | 136 | test "scrape with empty selector" do 137 | selectors = [%{ 138 | "parentSelectors" => ["_root"], 139 | "type" => "SelectorText", 140 | "multiple" => true, 141 | "id" => "Category", 142 | "selector" => "a.category-link", 143 | "delay" => "" 144 | }, %{ 145 | "parentSelectors" => ["_root"], 146 | "type" => "SelectorText", 147 | "multiple" => false, 148 | "id" => "Page Title", 149 | "selector" => ".jumbotron h2", # Intended typo. 150 | "delay" => "" 151 | }] 152 | 153 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 154 | 155 | {:ok, spider} = WebScraper.start_link(sitemap) 156 | data = WebScraper.export(spider) 157 | 158 | expected = [%{ 159 | "Category" => "Computers" 160 | }, %{ 161 | "Category" => "Phones" 162 | }] 163 | assert data === expected 164 | 165 | selectors = [%{ 166 | "parentSelectors" => ["_root"], 167 | "type" => "SelectorText", 168 | "multiple" => true, 169 | "id" => "Category", 170 | "selector" => "a.category", # Intended typo. 171 | "delay" => "" 172 | }, %{ 173 | "parentSelectors" => ["_root"], 174 | "type" => "SelectorText", 175 | "multiple" => false, 176 | "id" => "Page Title", 177 | "selector" => ".jumbotron h1", 178 | "delay" => "" 179 | }] 180 | 181 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 182 | 183 | {:ok, spider} = WebScraper.start_link(sitemap) 184 | data = WebScraper.export(spider) 185 | 186 | expected = [%{ 187 | "Page Title" => "E-commerce training site" 188 | }] 189 | assert data === expected 190 | end 191 | 192 | test "scrape only one of multiple items" do 193 | selectors = [%{ 194 | "parentSelectors" => ["_root"], 195 | "type" => "SelectorText", 196 | "multiple" => false, # We only want one category among 3 197 | "id" => "Category", 198 | "selector" => "a.category-link", 199 | "delay" => "" 200 | }, %{ 201 | "parentSelectors" => ["_root"], 202 | "type" => "SelectorText", 203 | "multiple" => true, # Even though there is only 1 h1 204 | "id" => "Page Title", 205 | "selector" => ".jumbotron h1", 206 | "delay" => "" 207 | }] 208 | 209 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 210 | 211 | {:ok, spider} = WebScraper.start_link(sitemap) 212 | data = WebScraper.export(spider) 213 | 214 | expected = [%{ 215 | "Category" => "Computers", 216 | "Page Title" => "E-commerce training site" 217 | }] 218 | assert data === expected 219 | 220 | # Test group of multiple items 221 | selectors = [%{ 222 | "parentSelectors" => ["_root"], 223 | "type" => "SelectorText", 224 | "multiple" => true, 225 | "id" => "Category", 226 | "selector" => "a.category-link", 227 | "delay" => "" 228 | }, %{ 229 | "parentSelectors" => ["_root"], 230 | "type" => "SelectorText", 231 | "multiple" => false, 232 | "id" => "Navigation", # We only want one category among 3 233 | "selector" => ".navbar-right a", 
234 | "delay" => "" 235 | }] 236 | 237 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 238 | 239 | {:ok, spider} = WebScraper.start_link(sitemap) 240 | data = WebScraper.export(spider) 241 | 242 | expected = [%{ 243 | "Category" => "Computers", 244 | "Navigation" => "Download" 245 | }, %{ 246 | "Category" => "Phones", 247 | "Navigation" => "Download" 248 | }] 249 | assert data === expected 250 | end 251 | 252 | test "scrape mixed between single and multiple, sorted" do 253 | selectors = [%{ 254 | "parentSelectors" => ["_root"], 255 | "type" => "SelectorText", 256 | "multiple" => true, 257 | "id" => "Category", 258 | "selector" => "a.category-link ", 259 | "delay" => "" 260 | }, %{ 261 | "parentSelectors" => ["_root"], 262 | "type" => "SelectorText", 263 | "multiple" => false, 264 | "id" => "Page Title", 265 | "selector" => ".jumbotron h1", 266 | "delay" => "" 267 | }, %{ 268 | "parentSelectors" => ["_root"], 269 | "type" => "SelectorText", 270 | "multiple" => true, 271 | "id" => "Navigation", 272 | "selector" => ".navbar-right a", 273 | "delay" => "" 274 | }] 275 | 276 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 277 | 278 | {:ok, spider} = WebScraper.start_link(sitemap) 279 | data = WebScraper.export(spider) 280 | 281 | expected = [%{ 282 | "Category" => "Computers", 283 | "Page Title" => "E-commerce training site", 284 | "Navigation" => "Download" 285 | }, %{ 286 | "Category" => "Phones", 287 | "Page Title" => "E-commerce training site", 288 | "Navigation" => "Download" 289 | }, %{ 290 | "Category" => "Computers", 291 | "Page Title" => "E-commerce training site", 292 | "Navigation" => "GitHub" 293 | }, %{ 294 | "Category" => "Phones", 295 | "Page Title" => "E-commerce training site", 296 | "Navigation" => "GitHub" 297 | }, %{ 298 | "Category" => "Computers", 299 | "Page Title" => "E-commerce training site", 300 | "Navigation" => "Donate" 301 | }, %{ 302 | "Category" => "Phones", 303 | "Page Title" => "E-commerce training site", 304 | "Navigation" => "Donate" 305 | }] 306 | assert ScrapexAsserter.array_equals(data, expected) 307 | end 308 | 309 | test "follow nodes under _root first" do 310 | selectors = [%{ 311 | "parentSelectors" => ["_root"], 312 | "type" => "SelectorText", 313 | "multiple" => false, 314 | "id" => "Page Title", 315 | "selector" => ".jumbotron h1", 316 | "delay" => "" 317 | }, %{ 318 | "parentSelectors" => ["Category"], 319 | "type" => "SelectorText", 320 | "multiple" => false, 321 | "id" => "SubCategory", 322 | "selector" => "a.subcategory-link", 323 | "delay" => "" 324 | }, %{ 325 | "parentSelectors" => ["_root"], 326 | "type" => "SelectorLink", 327 | "multiple" => false, 328 | "id" => "Category", 329 | "selector" => "a.category-link", 330 | "delay" => "" 331 | }] 332 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 333 | 334 | {:ok, spider} = WebScraper.start_link(sitemap) 335 | [data] = WebScraper.export(spider) 336 | 337 | expected = %{ 338 | "Category" => "Computers", 339 | "Page Title" => "E-commerce training site", 340 | "SubCategory" => "Laptops" 341 | } 342 | assert data === expected 343 | end 344 | 345 | test "whether to retrieve multiple items from selector" do 346 | selectors = [%{ 347 | "parentSelectors" => ["_root"], 348 | "type" => "SelectorText", 349 | "multiple" => false, 350 | "id" => "Page Title", 351 | "selector" => ".jumbotron h1", 352 | "delay" => "" 353 | }, %{ 354 | "parentSelectors" => ["Category"], 355 | "type" => "SelectorText", 356 | "multiple" => true, 357 | "id" => "SubCategory", 358 | "selector" => 
"a.subcategory-link", 359 | "delay" => "" 360 | }, %{ 361 | "parentSelectors" => ["_root"], 362 | "type" => "SelectorLink", 363 | "multiple" => true, 364 | "id" => "Category", 365 | "selector" => "a.category-link", 366 | "delay" => "" 367 | }] 368 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 369 | 370 | {:ok, spider} = WebScraper.start_link(sitemap) 371 | data = WebScraper.export(spider) 372 | 373 | # There are 2 categories, and 3 subcategories total, so there 374 | # must be 3 results. 375 | 376 | assert length(data) === 3 377 | expected = [%{ 378 | "Category" => "Computers", 379 | "Page Title" => "E-commerce training site", 380 | "SubCategory" => "Laptops" 381 | }, %{ 382 | "Category" => "Computers", 383 | "Page Title" => "E-commerce training site", 384 | "SubCategory" => "Tablets" 385 | }, %{ 386 | "Category" => "Phones", 387 | "Page Title" => "E-commerce training site", 388 | "SubCategory" => "Touch" 389 | }] 390 | 391 | assert data === expected 392 | end 393 | 394 | test "more level of selectors" do 395 | selectors = [%{ 396 | "parentSelectors" => ["_root"], 397 | "type" => "SelectorText", 398 | "multiple" => false, 399 | "id" => "Page Title", 400 | "selector" => ".jumbotron h1", 401 | "delay" => "" 402 | }, %{ 403 | "parentSelectors" => ["Category"], 404 | "type" => "SelectorLink", 405 | "multiple" => true, 406 | "id" => "SubCategory", 407 | "selector" => "a.subcategory-link", 408 | "delay" => "" 409 | }, %{ 410 | "parentSelectors" => ["_root"], 411 | "type" => "SelectorLink", 412 | "multiple" => true, 413 | "id" => "Category", 414 | "selector" => "a.category-link", 415 | "delay" => "" 416 | }, %{ 417 | "parentSelectors" => ["SubCategory"], 418 | "type" => "SelectorText", 419 | "multiple" => true, 420 | "id" => "Name", 421 | "selector" => "a.title", 422 | "delay" => "" 423 | }] 424 | 425 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 426 | 427 | {:ok, spider} = WebScraper.start_link(sitemap) 428 | data = WebScraper.export(spider) 429 | 430 | # There are 2 categories, and 3 subcategories total, each has 6 431 | # items, for a total of 18 items. 
432 | 433 | assert length(data) === 18 434 | end 435 | 436 | test "parse level with no child" do 437 | selectors = [%{ 438 | "parentSelectors" => ["_root"], 439 | "type" => "SelectorText", 440 | "multiple" => false, 441 | "id" => "Page Title", 442 | "selector" => ".jumbotron h1", 443 | "delay" => "" 444 | }, %{ 445 | "parentSelectors" => ["_root"], 446 | "type" => "SelectorLink", 447 | "multiple" => true, 448 | "id" => "Category", 449 | "selector" => "a.category-link", 450 | "delay" => "" 451 | }] 452 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 453 | 454 | {:ok, spider} = WebScraper.start_link(sitemap) 455 | data = WebScraper.export(spider) 456 | 457 | expected = [%{ 458 | "Category" => "Computers", 459 | "Page Title" => "E-commerce training site" 460 | }, %{ 461 | "Category" => "Phones", 462 | "Page Title" => "E-commerce training site" 463 | }] 464 | 465 | assert data === expected 466 | end 467 | 468 | test "don't put element from SelectorElement in result" do 469 | selectors = [%{ 470 | "parentSelectors" => ["_root"], 471 | "type" => "SelectorText", 472 | "multiple" => false, 473 | "id" => "Page Title", 474 | "selector" => ".jumbotron h1", 475 | "delay" => "" 476 | }, %{ 477 | "parentSelectors" => ["Category"], 478 | "type" => "SelectorText", 479 | "multiple" => true, 480 | "id" => "SubCategory", 481 | "selector" => "a.subcategory-link", 482 | "delay" => "" 483 | }, %{ 484 | "parentSelectors" => ["_root"], 485 | "type" => "SelectorLink", 486 | "multiple" => true, 487 | "id" => "Category", 488 | "selector" => "a.category-link", 489 | "delay" => "" 490 | }, %{ 491 | "parentSelectors" => ["Category"], 492 | "type" => "SelectorElement", 493 | "multiple" => true, 494 | "id" => "Item", 495 | "selector" => "div.thumbnail", 496 | "delay" => "" 497 | }] 498 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 499 | 500 | {:ok, spider} = WebScraper.start_link(sitemap) 501 | data = WebScraper.export(spider) 502 | 503 | # There are 2 categories, and 3 subcategories total, so there 504 | # must be 3 results. 
505 | 506 | assert length(data) === 3 507 | expected = [%{ 508 | "Category" => "Computers", 509 | "Page Title" => "E-commerce training site", 510 | "SubCategory" => "Laptops" 511 | }, %{ 512 | "Category" => "Computers", 513 | "Page Title" => "E-commerce training site", 514 | "SubCategory" => "Tablets" 515 | }, %{ 516 | "Category" => "Phones", 517 | "Page Title" => "E-commerce training site", 518 | "SubCategory" => "Touch" 519 | }] 520 | 521 | assert data === expected 522 | end 523 | 524 | test "can group selectors with SelectorElement", context do 525 | selectors = [%{ 526 | "parentSelectors" => ["_root"], 527 | "type" => "SelectorLink", 528 | "multiple" => true, 529 | "id" => "Category", 530 | "selector" => "a.category-link", 531 | "delay" => "" 532 | }, %{ 533 | "parentSelectors" => ["Category"], 534 | "type" => "SelectorElement", 535 | "multiple" => true, 536 | "id" => "Item", 537 | "selector" => "div.thumbnail", 538 | "delay" => "" 539 | }, %{ 540 | "parentSelectors" => ["Item"], 541 | "type" => "SelectorText", 542 | "multiple" => false, 543 | "id" => "Name", 544 | "selector" => "a.title", 545 | "regex" => "", 546 | "delay" => "" 547 | }, %{ 548 | "parentSelectors" => ["Item"], 549 | "type" => "SelectorText", 550 | "multiple" => false, 551 | "id" => "Price", 552 | "selector" => "h4.pull-right", 553 | "regex" => "", 554 | "delay" => "" 555 | }, %{ 556 | "parentSelectors" => ["Item"], 557 | "type" => "SelectorText", 558 | "multiple" => false, 559 | "id" => "Description", 560 | "selector" => "p.description", 561 | "regex" => "", 562 | "delay" => "" 563 | }] 564 | sitemap = %{"startUrl" => @url, "selectors" => selectors} 565 | 566 | {:ok, spider} = WebScraper.start_link(sitemap) 567 | data = WebScraper.export(spider) 568 | 569 | # There are 2 categories. Each category has 570 | # 3 products on display. Total 2 x 3 = 6 items. 571 | 572 | assert length(data) === 6 573 | items = context.items 574 | 575 | Enum.concat([@computers, @phones]) 576 | |> Enum.map(&Enum.at(items, &1)) 577 | |> Enum.map(fn(item) -> 578 | item 579 | |> Map.delete("Page") 580 | |> Map.delete("SubCategory") 581 | end) 582 | |> ScrapexAsserter.array_equals(data) 583 | |> assert 584 | end 585 | end -------------------------------------------------------------------------------- /test/scrapex_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ScrapexTest do 2 | use ExUnit.Case 3 | 4 | test "the truth" do 5 | assert 1 + 1 == 2 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | :application.start :inets 2 | 3 | server_root = '#{Path.absname("test/sample_pages")}' 4 | test_server_config = [ 5 | port: 9090, 6 | server_name: 'localhost', 7 | server_root: server_root, 8 | document_root: server_root, 9 | bind_address: {127, 0, 0, 1}, 10 | directory_index: ['index.htm', 'index.html'] 11 | ] 12 | 13 | {:ok, pid} = :inets.start(:httpd, test_server_config) 14 | 15 | System.at_exit fn(_exit_status) -> 16 | :ok = :inets.stop(:httpd, pid) 17 | end 18 | 19 | ExUnit.start() 20 | 21 | defmodule ScrapexAsserter do 22 | def array_equals(left, right) do 23 | length(left) == length(right) and Enum.all?(left, &Enum.member?(right, &1)) # same members, in any order 24 | end 25 | end --------------------------------------------------------------------------------