Page not found
65 | 66 |Sorry, but the page you were trying to get to does not exist. You 67 | may want to try searching this site using the sidebar or using our 68 | API Reference page to find what 69 | you were looking for.
70 | 71 | 83 |├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.exs ├── dev.exs ├── prod.exs └── test.exs ├── doc ├── 404.html ├── Scrapex.GenSpider.Response.html ├── Scrapex.GenSpider.html ├── Scrapex.Selector.html ├── Scrapex.html ├── assets │ └── logo.png ├── dist │ ├── app.css │ ├── app.js │ └── sidebar_items.js ├── extra-api-reference.html ├── extra-readme.html ├── fonts │ ├── icomoon.eot │ ├── icomoon.svg │ ├── icomoon.ttf │ └── icomoon.woff └── index.html ├── lib ├── scrapex.ex └── scrapex │ ├── gen_spider.ex │ ├── gen_spider │ ├── README.md │ ├── request.ex │ └── response.ex │ ├── selector.ex │ └── spider │ └── webscraper.ex ├── logo.png ├── mix.exs ├── mix.lock └── test ├── sample_pages ├── e-commerce │ └── static │ │ ├── computers │ │ ├── index.html │ │ ├── index_files │ │ │ ├── cart2.png │ │ │ ├── site.js │ │ │ └── style.css │ │ ├── laptops │ │ │ └── index.html │ │ └── tablets │ │ │ └── index.html │ │ ├── index.html │ │ └── phones │ │ ├── index.html │ │ └── touch │ │ └── index.html └── example.com.html ├── scrapex ├── gen_spider_test.exs ├── selector_test.exs └── spider │ ├── example_test.exs │ ├── webscraper.csv │ └── webscraper_test.exs ├── scrapex_test.exs └── test_helper.exs /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /deps 3 | erl_crash.dump 4 | *.ez 5 | .DS_Store 6 | *.beam -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Son Tran-Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit 
persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapex 2 | ======= 3 | 4 | An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way. 5 | 6 | ## Features 7 | 8 | ### Fast and powerful 9 | Write the rules to extract the data and let Scrapex do the rest. 10 | 11 | ### Easily extensible 12 | Extensible by design, plug new functionality easily without having to touch the core. 13 | 14 | ### Portable, Elixir 15 | Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices. 
16 | 17 | ## Build your own webcrawlers 18 | 19 | alias Scrapex.GenSpider 20 | defmodule StackOverflowSpider do 21 | use GenSpider 22 | import Scrapex.Selector 23 | 24 | def parse(response, state) do 25 | result = response.body 26 | |> select(".question-summary h3 a") 27 | |> extract("href") 28 | |> Enum.map(fn(href) -> 29 | GenSpider.Response.url_join(response, href) 30 | |> GenSpider.request(&parse_question/1) 31 | |> GenSpider.await 32 | end) 33 | {:ok, result, state} 34 | end 35 | 36 | defp parse_question({:ok, response}) do 37 | html = response.body 38 | [title] = html |> select("h1 a") |> extract() 39 | question = html |> select(".question") 40 | [body] = question |> select(".post-text") |> extract 41 | [votes] = question |> select(".vote-count-post") |> extract 42 | tags = question |> select(".post-tag") |> extract 43 | 44 | %{title: title, body: body, votes: votes, tags: tags} 45 | end 46 | end 47 | urls = ["http://stackoverflow.com/questions?sort=votes"] 48 | opts = [name: :stackoverflow_spider, urls: urls] 49 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts) 50 | questions = GenSpider.export(spider) 51 | #=> "[{} | _]" 52 | 53 | ## TODOS 54 | 55 | - [x] `GenSpider behaviour`. 56 | - [x] Request URL and pass response to `parse/2` callback. 57 | - [x] One time spider 58 | - [x] CSS selector 59 | - [ ] XPath selector 60 | - [x] Yield for requests in `parse/2` 61 | - [x] Follow redirects 62 | - [ ] Set custom request headers 63 | - [ ] Respect robots.txt 64 | - [ ] Resolve DNS once only 65 | - [ ] Domain blacklist 66 | - [ ] Parse response chunk by chunk 67 | - [ ] CLI -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 
3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for third- 9 | # party users, it should be done in your mix.exs file. 10 | 11 | # Sample configuration: 12 | # 13 | # config :logger, :console, 14 | # level: :info, 15 | # format: "$date $time [$level] $metadata$message\n", 16 | # metadata: [:user_id] 17 | 18 | # It is also possible to import configuration files, relative to this 19 | # directory. For example, you can emulate configuration per environment 20 | # by uncommenting the line below and defining dev.exs, test.exs and such. 21 | # Configuration from the imported file will override the ones defined 22 | # here (which is why it is important to import them last). 23 | # 24 | import_config "#{Mix.env}.exs" 25 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Do not print debug messages in production 4 | config :logger, level: :info -------------------------------------------------------------------------------- /config/test.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | # Print only warnings and errors during test 4 | config :logger, level: :warn -------------------------------------------------------------------------------- /doc/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |Sorry, but the page you were trying to get to does not exist. You 67 | may want to try searching this site using the sidebar or using our 68 | API Reference page to find what 69 | you were looking for.
70 | 71 | 83 |Utilities for working with responses returned from GenSpider
.
Join a path relative to the response’s URL
115 |t :: %Scrapex.GenSpider.Response{url: binary, body: binary}
140 |
141 | url_join(t, binary) :: binary
173 |
174 | Join a path relative to the response’s URL.
179 |iex> alias Scrapex.GenSpider.Response
181 | iex> response = %Response{url: "http://www.scrapex.com/subfolder"}
182 | iex> Response.url_join(response, "/subfolder2")
183 | "http://www.scrapex.com/subfolder2"
184 | iex> Response.url_join(response, "subsubfolder")
185 | "http://www.scrapex.com/subfolder/subsubfolder"
186 |
187 | Utilities for extracting data from markup language.
77 | 78 |Attribute of a node
100 |A tree of HTML nodes, or a node itself if only one
121 |Name of the tag or attribute
130 |Extracts content or attribute value for a selection
160 |Generates a selection for a particular selector
175 |Extracts content or attribute value for a selection.
263 | 264 |Generates a selection for a particular selector.
316 |The return value is a Selector.t
317 | 318 |A behaviour module for implementing a web data extractor
85 |Utilities for working with responses returned from GenSpider
Utilities for extracting data from markup language
99 |An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
66 |Write the rules to extract the data and let Scrapex do the rest.
68 |Extensible by design, plug new functionality easily without having to touch the core.
70 |Written in Elixir and runs on Linux, Windows, Mac, BSD, and embedded devices.
72 |alias Scrapex.GenSpider
73 | defmodule StackOverflowSpider do
74 | use GenSpider
75 | import Scrapex.Selector
76 |
77 | def parse(response, state) do
78 | result = response.body
79 | |> select(".question-summary h3 a")
80 | |> extract("href")
81 | |> Enum.map(fn(href) ->
82 | GenSpider.Response.url_join(response, href)
83 | |> GenSpider.request(&parse_question/1)
84 | |> GenSpider.await
85 | end)
86 | {:ok, result, state}
87 | end
88 |
89 | defp parse_question({:ok, response}) do
90 | html = response.body
91 | [title] = html |> select("h1 a") |> extract()
92 | question = html |> select(".question")
93 | [body] = question |> select(".post-text") |> extract
94 | [votes] = question |> select(".vote-count-post") |> extract
95 | tags = question |> select(".post-tag") |> extract
96 |
97 | %{title: title, body: body, votes: votes, tags: tags}
98 | end
99 | end
100 | urls = ["http://stackoverflow.com/questions?sort=votes"]
101 | opts = [name: :stackoverflow_spider, urls: urls]
102 | {:ok, spider} = GenSpider.start_link(StackOverflowSpider, [], opts)
103 | questions = GenSpider.export(spider)
104 | #=> "[{} | _]"
105 | GenSpider behaviour
.
107 | parse/2
callback.
109 | parse/2
117 | 15.6", Core i5-4200U, 4GB, 750GB, Radeon HD8670M 2GB, Windows
166 |15.6", Core i5-4200U, 8GB, 1TB, Radeon R7 M265, Windows 8.1
183 |93 | Welcome to WebScraper e-commerce site. You can use this site for training 94 | to learn how to use the Web Scraper. Items listed here are not for sale. 95 |
96 |This domain is established to be used for illustrative examples in documents. You may use this 46 | domain in examples without prior coordination or asking for permission.
47 | 48 |