├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── project
│   ├── Build.scala
│   ├── build.properties
│   └── plugins.sbt
├── scraper-demo
│   └── src
│       └── main
│           └── scala
│               ├── DevelopApp.scala
│               └── ScrapeApp.scala
├── scraper-server
│   └── src
│       └── main
│           └── scala
│               ├── ClientApp.scala
│               ├── MainApp.scala
│               └── ServerApp.scala
└── scraper
    └── src
        └── main
            └── scala
                └── org
                    └── rovak
                        └── scraper
                            ├── ScrapeManager.scala
                            ├── collectors
                            │   ├── Collector.scala
                            │   └── FileWriterCollector.scala
                            ├── models
                            │   ├── QueryBuilder.scala
                            │   ├── Result.scala
                            │   └── WebPage.scala
                            ├── scrapers
                            │   ├── AkkaScraper.scala
                            │   ├── DefaultScraper.scala
                            │   └── Scraper.scala
                            ├── spiders
                            │   ├── EmailSpider.scala
                            │   ├── SitemapSpider.scala
                            │   └── Spider.scala
                            └── websites
                                └── Google.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # other
5 | .cache
6 | .classpath
7 |
8 | # sbt specific
9 | dist/*
10 | target/
11 | lib_managed/
12 | src_managed/
13 | project/boot/
14 | project/plugins/project/
15 |
16 | # Scala-IDE specific
17 | .scala_dependencies
18 |
19 | .idea
20 | .idea_modules
21 |
22 | .project
23 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.10.2
4 | - 2.10.3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Roy van Kaathoven (www.razko.nl)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Scala Webscraper 0.4.1
2 | ======================
3 |
4 | [![Build Status](https://travis-ci.org/Rovak/ScalaWebscraper.png)](https://travis-ci.org/Rovak/ScalaWebscraper)
5 |
6 | ## Getting started
7 |
8 | The project is built with Scala 2.10.2 and sbt 0.13.0; both can be installed
9 | using this [install script](https://gist.github.com/Rovak/4967148).
10 |
11 | To try the example, navigate to the project folder and run `sbt "project scraper-demo" run`,
12 | which will start the example scraper.
13 |
14 | ## Installation
15 |
16 | If you use sbt, edit `build.sbt` and add the following dependency:
17 |
18 | ```scala
19 | libraryDependencies += "nl.razko" %% "scraper" % "0.4.1"
20 | ```
21 |
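For reference, a minimal sketch of a complete `build.sbt` for a project depending on the scraper might look like this (the project name is a placeholder; the Scala version is the one this repository itself builds with):

```scala
// Minimal build.sbt sketch; "scraper-example" is a placeholder project name
name := "scraper-example"

scalaVersion := "2.10.3"

libraryDependencies += "nl.razko" %% "scraper" % "0.4.1"
```
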
22 | If you want to use bleeding-edge snapshot versions, add the Sonatype snapshots repository to your resolvers:
23 |
24 | ```scala
25 | resolvers += "Sonatype Snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/"
26 |
27 | libraryDependencies += "nl.razko" %% "scraper" % "0.4.1-SNAPSHOT"
28 | ```
29 |
30 | ## DSL
31 |
32 | The webscraper provides a simple DSL for writing scrape rules:
33 |
34 | ```scala
35 | import org.rovak.scraper.ScrapeManager._
36 | import org.jsoup.nodes.Element
37 |
38 | object Google {
39 | val results = "#res li.g h3.r a"
40 | def search(term: String) = {
41 | "http://www.google.com/search?q=" + term.replace(" ", "+")
42 | }
43 | }
44 |
45 | // Open the search results page for the query "php elephant"
46 | scrape from Google.search("php elephant") open { implicit page =>
47 |
48 | // Iterate through every result link
49 | Google.results each { x: Element =>
50 |
51 | val link = x.select("a[href]").attr("abs:href").substring(28)
52 | if (link.isValidURL) {
53 |
54 | // Iterate through every found link in the found page
55 | scrape from link each (x => println("found: " + x))
56 | }
57 | }
58 | }
59 |
60 | ```
61 |
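Results can also be routed into a `Collector`. The sketch below is a minimal example built only from classes shipped in this repository (`FileWriterCollector`, `Href` and the bundled `Google` helper); it writes the URL of every search result to a local file:

```scala
import org.rovak.scraper.ScrapeManager._
import org.rovak.scraper.collectors.FileWriterCollector
import org.rovak.scraper.models.Href
import org.rovak.scraper.websites.Google
import org.jsoup.nodes.Element

// Collector that appends the URL of every collected result to links.txt
implicit val collector = new FileWriterCollector("links.txt")

scrape from Google.search("php elephant") open { implicit page =>
  // Wrap each result link in a Href and hand it to the implicit collector
  Google.results collect { x: Element =>
    Href(x.attr("abs:href"), x.text)
  }
}
```
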
62 | ## Spiders
63 |
64 | A spider is a scraper which recursively loads a page and opens every link it finds. It will
65 | keep scraping until all pages within the allowed domains are visited once.
66 |
67 | The following snippet demonstrates a basic spider which crawls a website and provides
68 | hooks to do something with the data:
69 |
70 | ```scala
71 | new Spider {
72 | startUrls ::= "http://events.stanford.edu/"
73 | allowedDomains ::= "events.stanford.edu"
74 |
75 | onReceivedPage ::= { page: WebPage =>
76 | // Page received
77 | }
78 |
79 | onLinkFound ::= { link: Href =>
80 | println(s"Found link ${link.url} with name ${link.name}")
81 | }
82 | }.start()
83 | ```
84 |
85 | The spider can be extended by mixing in traits. If you want to scrape emails,
86 | add the `EmailSpider` trait, which offers a new `onEmailFound` hook in which emails can be collected.
87 |
88 | ```scala
89 | new Spider with EmailSpider {
90 | startUrls ::= "http://events.stanford.edu/"
91 | allowedDomains ::= "events.stanford.edu"
92 |
93 | onEmailFound ::= { email: String =>
94 | // Email found
95 | }
96 |
97 | onReceivedPage ::= { page: WebPage =>
98 | // Page received
99 | }
100 |
101 | onLinkFound ::= { link: Href =>
102 | println(s"Found link ${link.url} with name ${link.name}")
103 | }
104 | }.start()
105 | ```
106 |
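Besides the `onEmailFound` hook, the `EmailSpider` trait also accumulates every match in its `foundEmails` list, so collected addresses can be inspected after the crawl. A small sketch (pages are downloaded asynchronously, so a real program has to keep running long enough for results to arrive; the fixed sleep below is only a placeholder):

```scala
val spider = new Spider with EmailSpider {
  startUrls ::= "http://events.stanford.edu/"
  allowedDomains ::= "events.stanford.edu"
}
spider.start()

// Crawling happens asynchronously; wait a bit before reading the results
// (a crude placeholder for proper completion handling)
Thread.sleep(30000)
spider.foundEmails.distinct.foreach(println)
```
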
107 | Multiple spider traits can be mixed together:
108 |
109 | ```scala
110 | new Spider with EmailSpider with SitemapSpider {
111 | startUrls ::= "http://events.stanford.edu/"
112 | allowedDomains ::= "events.stanford.edu"
113 | sitemapUrls ::= "http://events.stanford.edu/sitemap.xml"
114 |
115 | onEmailFound ::= { email: String =>
116 | println("Found email: " + email)
117 | }
118 |
119 | onReceivedPage ::= { page: WebPage =>
120 | // Page received
121 | }
122 |
123 | onLinkFound ::= { link: Href =>
124 | println(s"Found link ${link.url} with name ${link.name}")
125 | }
126 | }.start()
127 | ```
128 |
129 | ## Documentation
130 |
131 | - [API](http://ci.razko.nl/job/WebsiteScraper/Documentation/index.html)
132 |
--------------------------------------------------------------------------------
/project/Build.scala:
--------------------------------------------------------------------------------
1 | import sbt._
2 | import sbt.Keys._
3 | import scala.Some
4 |
5 | object Build extends Build {
6 |
7 | val projectVersion = "0.4.1"
8 |
9 | val defaultSettings = Project.defaultSettings ++ Seq(
10 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/",
11 | resolvers += "sonatype-public" at "https://oss.sonatype.org/content/groups/public",
12 | version := projectVersion,
13 | scalaVersion := "2.10.3")
14 |
15 | val publishSettings = Seq(
16 |
17 | /**
18 | * Publish settings
19 | */
20 | publishTo <<= version { v: String =>
21 | val nexus = "https://oss.sonatype.org/"
22 | if (v.trim.endsWith("SNAPSHOT"))
23 | Some("snapshots" at nexus + "content/repositories/snapshots")
24 | else
25 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
26 | },
27 |
28 | organization := "nl.razko",
29 |
30 | publishMavenStyle := true,
31 |
32 | pomIncludeRepository := { _ => false },
33 |
34 | publishArtifact in Test := false,
35 |
36 | pomExtra := (
37 |   <url>https://github.com/Rovak/ScalaWebscraper</url>
38 |   <licenses>
39 |     <license>
40 |       <name>MIT</name>
41 |       <url>http://opensource.org/licenses/MIT</url>
42 |       <distribution>repo</distribution>
43 |     </license>
44 |   </licenses>
45 |   <scm>
46 |     <url>git@github.com:Rovak/ScalaWebscraper.git</url>
47 |     <connection>scm:git:git@github.com:Rovak/ScalaWebscraper.git</connection>
48 |   </scm>
49 |   <developers>
50 |     <developer>
51 |       <id>rovak</id>
52 |       <name>Roy van Kaathoven</name>
53 |       <url>http://rovak.pro</url>
54 |     </developer>
55 |   </developers>
56 | )
57 | )
58 |
59 | lazy val scraper = Project(
60 | id = "scraper",
61 | base = file("scraper"),
62 | settings = defaultSettings ++ publishSettings ++ Seq(
63 | libraryDependencies ++= Seq(
64 | Dependencies.Etc.jsoup,
65 | Dependencies.Akka.actor)
66 | )
67 | )
68 |
69 | lazy val scraperServer = Project(
70 | id = "scraper-server",
71 | base = file("scraper-server"),
72 | dependencies = Seq(scraper),
73 | settings = defaultSettings ++ Seq(
74 | libraryDependencies ++= Dependencies.akka
75 | )
76 | ).aggregate(scraper)
77 |
78 | lazy val scraperDemo = Project(
79 | id = "scraper-demo",
80 | base = file("scraper-demo"),
81 | dependencies = Seq(scraper),
82 | settings = defaultSettings ++ Seq(
83 | libraryDependencies ++= Dependencies.database ++ Seq(Dependencies.Typesafe.config)
84 | )
85 | ).aggregate(scraper)
86 |
87 | object Dependencies {
88 | object Akka {
89 | val actor = "com.typesafe.akka" %% "akka-actor" % "2.2.1"
90 | val remote = "com.typesafe.akka" %% "akka-remote" % "2.2.1"
91 | val kernel = "com.typesafe.akka" %% "akka-kernel" % "2.2.1"
92 | }
93 |
94 | object Typesafe {
95 | val config = "com.typesafe" % "config" % "1.0"
96 | }
97 |
98 | object Db {
99 | val slick = "com.typesafe.slick" %% "slick" % "1.0.0"
100 | val mysql = "mysql" % "mysql-connector-java" % "5.1.13"
101 | }
102 |
103 | object Etc {
104 | val jsoup = "org.jsoup" % "jsoup" % "1.7.2"
105 | val slf4j = "org.slf4j" % "slf4j-nop" % "1.6.4"
106 | }
107 |
108 | val database = Seq(Db.slick, Db.mysql)
109 | val akka = Seq(Akka.actor, Akka.remote, Akka.kernel)
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.0
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/"
2 |
3 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.5.0-SNAPSHOT")
4 |
--------------------------------------------------------------------------------
/scraper-demo/src/main/scala/DevelopApp.scala:
--------------------------------------------------------------------------------
1 | class DevelopApp {
2 |
3 | }
4 |
--------------------------------------------------------------------------------
/scraper-demo/src/main/scala/ScrapeApp.scala:
--------------------------------------------------------------------------------
1 | package scraper.demo
2 |
3 | import org.rovak.scraper.models._
4 | import org.rovak.scraper.spiders._
5 | import org.rovak.scraper.models.Href
6 |
7 | object ScrapeApp extends App {
8 |
9 | new Spider {
10 | startUrls ::= "http://events.stanford.edu/"
11 | allowedDomains ::= "events.stanford.edu"
12 |
13 | onReceivedPage ::= { page: WebPage =>
14 | // Page received
15 | }
16 |
17 | onLinkFound ::= { link: Href =>
18 | println(s"Found link ${link.url} with name ${link.name}")
19 | }
20 | }.start()
21 |
22 | }
--------------------------------------------------------------------------------
/scraper-server/src/main/scala/ClientApp.scala:
--------------------------------------------------------------------------------
1 | package scraper.demo
2 |
3 | import akka.actor._
4 | import akka.routing._
5 | import akka.kernel.Bootable
6 | import com.typesafe.config._
7 |
8 | /**
9 | * Client application
10 | */
11 | class ClientApp(port: Int = 2555) extends Bootable {
12 |
13 | val cfg = ConfigFactory.parseString("""
14 | akka.remote.netty {
15 | hostname = "127.0.0.1"
16 | port = """ + port + """
17 | }
18 | """)
19 | .withFallback(ConfigFactory.load.getConfig("client"))
20 |
21 | val system = ActorSystem("Client", cfg)
22 | //val actor = system.actorOf(Props[scrapers.QueryScraper].withRouter(RoundRobinRouter(nrOfInstances = 15)), "query")
23 |
24 | def startup = {
25 |
26 | }
27 |
28 | def shutdown = {
29 | system.shutdown()
30 | }
31 | }
32 |
33 | /**
34 | * Main
35 | */
36 | object ClientListener {
37 | def main(args: Array[String]) {
38 | new ClientApp
39 | println("Started Application - waiting for instructions")
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/scraper-server/src/main/scala/MainApp.scala:
--------------------------------------------------------------------------------
1 | package scraper.demo
2 |
3 | /**
4 | * Main application which will be used in the packaged version
5 | */
6 | object MainApp {
7 |
8 | var serverType: String = "client"
9 | var port: Int = 2555
10 |
11 | /**
12 | * Run the app and switch modes based on the given console arguments
13 | *
14 | * @param args
15 | * server|client if server or client mode
16 | * port if client then add port number
17 | */
18 | def main(args: Array[String]) {
19 |
20 | if (args.length >= 1) {
21 | serverType = args(0)
22 | }
23 |
24 | if (args.length >= 2 && serverType == "client") {
25 | port = args(1).toInt
26 | }
27 |
28 | serverType.toLowerCase() match {
29 | case "client" => runClient
30 | case "server" => runServer
31 | }
32 | }
33 |
34 | def runServer = {
35 | println("Running as Server")
36 | new ServerApp
37 | }
38 |
39 | def runClient = {
40 | println("Running as Client on port: " + port.toString)
41 | new ClientApp(port)
42 | }
43 |
44 | }
--------------------------------------------------------------------------------
/scraper-server/src/main/scala/ServerApp.scala:
--------------------------------------------------------------------------------
1 | package scraper.demo
2 |
3 | import akka.actor._
4 | import akka.kernel.Bootable
5 | import com.typesafe.config._
6 | import akka.routing.RoundRobinRouter
7 |
8 | /**
9 | * Main server which will send instructions to the clients
10 | */
11 | class ServerApp extends Bootable {
12 | val system = ActorSystem("ServerPool", ConfigFactory.load.getConfig("server"))
13 | val server1 = system.actorFor("akka://Client@127.0.0.1:2554/user/query")
14 | val server2 = system.actorFor("akka://Client@127.0.0.1:2555/user/query")
15 | val routees = List[ActorRef](server1, server2)
16 | val router2 = system.actorOf(Props().withRouter(RoundRobinRouter(routees = routees)))
17 |
18 | def sendMessage = {
19 |
20 | }
21 |
22 | def startup = {
23 |
24 | }
25 |
26 | def shutdown = {
27 | system.shutdown()
28 | }
29 | }
30 |
31 | object ServerBootableApp {
32 | def main(args: Array[String]) {
33 | val app = new ServerApp
34 | println("Started Server Sending messages")
35 | while (true) {
36 | app.sendMessage
37 | Thread.sleep(200)
38 | }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/ScrapeManager.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper
2 |
3 | import akka.actor._
4 | import akka.routing.RoundRobinRouter
5 | import org.rovak.scraper.models.{Result, Href, WebPage, QueryBuilder}
6 | import akka.pattern.ask
7 | import scala.concurrent.duration._
8 | import akka.util.Timeout
9 | import scala.collection.JavaConversions.asScalaBuffer
10 | import org.jsoup.nodes.Element
11 | import java.net._
12 | import org.rovak.scraper.collectors.Collector
13 | import org.rovak.scraper.models.WebPage
14 | import org.rovak.scraper.scrapers.DefaultScraper
15 |
16 | object ScrapeManager {
17 |
18 | implicit def String2Url(url: String) = new URL(url)
19 |
20 | implicit var scraper = new DefaultScraper
21 |
22 | def scrape = new QueryBuilder()
23 |
24 | implicit class StringUtils(query: String) {
25 | def collect(reader: Element => Result)(implicit c: Collector, page: WebPage) = {
26 | page.doc.select(query).map(x => c.collect(reader(x)))
27 | }
28 |
29 | def each[T](reader: Element => T)(implicit page: WebPage): List[T] = {
30 | page.doc.select(query).map(reader).toList
31 | }
32 |
33 | /**
34 | * Check whether the string is a valid URL
35 | * @return true when the string can be parsed as a URL
36 | */
37 | def isValidURL: Boolean = {
38 | try {
39 | new URL(query).toURI
40 | true
41 | }
42 | catch {
43 | case e: MalformedURLException => false
44 | case e: URISyntaxException => false
45 | }
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/collectors/Collector.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.collectors
2 |
3 | import org.rovak.scraper.models.Result
4 |
5 | abstract class Collector {
6 | def collect(result: Result)
7 | }
8 |
9 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/collectors/FileWriterCollector.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.collectors
2 |
3 | import java.io._
4 | import org.rovak.scraper.models.Result
5 |
6 | /**
7 | * Collects results and writes them to a local file
8 | */
9 | class FileWriterCollector(filename: String = "results.txt") extends Collector {
10 |
11 | val writer: Writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "utf-8"))
12 |
13 | def collect(result: Result) = {
14 | writer.write(result.toCSV + "\n")
15 | writer.flush()
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/models/QueryBuilder.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.models
2 |
3 | import scala.concurrent.{ExecutionContext, Await}
4 | import scala.concurrent.duration._
5 | import scala.collection.JavaConversions._
6 | import org.jsoup.select.Elements
7 | import org.jsoup.nodes.Element
8 | import org.rovak.scraper.scrapers.Scraper
9 |
10 | case class FromClass(f: QueryBuilder => String) {
11 | def execute(qb: QueryBuilder) = f(qb)
12 | }
13 |
14 | class QueryBuilder(implicit val scraper: Scraper, var pageUrl: String = "", var query: String = "") extends Serializable with Iterable[Href] {
15 |
16 | import ExecutionContext.Implicits.global
17 |
18 | def iterator = Await.result(links, 5.seconds).iterator
19 |
20 | def from(newUrl: String): QueryBuilder = {
21 | pageUrl = newUrl
22 | this
23 | }
24 |
25 | def from(func: FromClass): QueryBuilder = {
26 | func.execute(this)
27 | this
28 | }
29 |
30 | def select(newQuery: String): QueryBuilder = {
31 | query = newQuery
32 | this
33 | }
34 |
35 | /**
36 | * Read the page and execute the given function once it has been downloaded
37 | * @param webPage function which receives the downloaded page
38 | * @return this QueryBuilder for chaining
39 | */
40 | def open(webPage: WebPage => Unit): QueryBuilder = {
41 | page onSuccess { case (page: WebPage) => webPage(page) }
42 | this
43 | }
44 |
45 | /**
46 | * Download a page
47 | */
48 | protected def page = scraper.downloadPage(pageUrl)
49 |
50 | /**
51 | * Download the page and look for tags with a href attribute
52 | */
53 | def links = {
54 | page map {
55 | case (x: WebPage) => {
56 | x.doc.select(query).map(x =>
57 | new Href {
58 | url = x.select("a[href]").attr("abs:href")
59 | name = x.select("a[href]").text
60 | }).toList
61 | }
62 | }
63 | }
64 |
65 | /**
66 | * Scrape the page and collect the data
67 | *
68 | * @param f a function which will be called for every result
69 | */
70 | def each(f: Element => Unit): QueryBuilder = {
71 | page map {
72 | case (x: WebPage) => x.doc.select(query)
73 | } onSuccess {
74 | case (x: Elements) => x.map(f)
75 | }
76 | this
77 | }
78 | }
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/models/Result.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.models
2 |
3 | trait Result {
4 |
5 | def toCSV: String
6 | }
7 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/models/WebPage.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.models
2 |
3 | import org.jsoup.nodes.Document
4 | import scala.collection.JavaConversions._
5 |
6 | case class WebPage(url: java.net.URL) {
7 |
8 | var content = ""
9 |
10 | var doc: Document = null
11 |
12 | def link = url.toString
13 |
14 | def links = {
15 | doc.select("a").map(x =>
16 | new Href {
17 | url = x.select("a[href]").attr("abs:href")
18 | name = x.select("a[href]").text
19 | }).toList
20 | }
21 |
22 | }
23 |
24 | case class PageNotFound()
25 |
26 | case class Href(var url: String = "", var name: String = "") extends Result {
27 | def toCSV: String = url
28 | }
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/scrapers/AkkaScraper.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.scrapers
2 |
3 | import akka.actor.{Props, ActorSystem, Actor}
4 | import org.jsoup.Jsoup
5 | import org.rovak.scraper.models.{PageNotFound, WebPage}
6 | import java.net.URL
7 | import akka.routing.RoundRobinRouter
8 | import scala.concurrent.ExecutionContext
9 | import org.rovak.scraper.scrapers.AkkaScraperActor.DownloadPage
10 | import akka.pattern.ask
11 | import akka.util.Timeout
12 | import scala.concurrent.duration._
13 |
14 | object AkkaScraperManager {
15 | val system = ActorSystem()
16 | }
17 |
18 | object AkkaScraperActor {
19 | case class DownloadPage(url: String)
20 | }
21 |
22 | class AkkaScraperActor extends Actor {
23 |
24 | import AkkaScraperActor._
25 |
26 | def fetchPage(pageUrl: String) = {
27 | try {
28 | new WebPage(new URL(pageUrl)) {
29 | doc = Jsoup
30 | .connect(pageUrl)
31 | .userAgent("Mozilla")
32 | .followRedirects(true)
33 | .timeout(0)
34 | .get
35 | }
36 | }
37 | catch {
38 | case e: Exception => PageNotFound()
39 | }
40 | }
41 |
42 | def receive = {
43 | case DownloadPage(url) => sender ! fetchPage(url)
44 | }
45 | }
46 |
47 | /**
48 | * Akka scraper
49 | */
50 | class AkkaScraper extends Scraper {
51 |
52 | import ExecutionContext.Implicits.global
53 |
54 | implicit val timeout = Timeout(5.seconds)
55 |
56 | val scrapeActor = AkkaScraperManager.system.actorOf(Props[AkkaScraperActor].withRouter(RoundRobinRouter(nrOfInstances = 15)), "scraper")
57 |
58 | def downloadPage(url: String) = {
59 | (scrapeActor ? DownloadPage(url)) map {
60 | case page: WebPage => page
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/scrapers/DefaultScraper.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.scrapers
2 |
3 | import scala.concurrent.{ExecutionContext, Future}
4 | import org.rovak.scraper.models.{PageNotFound, WebPage}
5 | import java.net.URL
6 | import org.jsoup.Jsoup
7 |
8 | /**
9 | * Default scraper
10 | */
11 | class DefaultScraper extends Scraper {
12 |
13 | import ExecutionContext.Implicits.global
14 |
15 | def downloadPage(pageUrl: String) = Future {
16 | new WebPage(new URL(pageUrl)) {
17 | doc = Jsoup
18 | .connect(pageUrl)
19 | .userAgent("Mozilla")
20 | .followRedirects(true)
21 | .timeout(0)
22 | .get
23 | }
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/scrapers/Scraper.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.scrapers
2 |
3 | import org.rovak.scraper.models.WebPage
4 | import scala.concurrent.Future
5 |
6 | trait Scraper {
7 | def downloadPage(url: String): Future[WebPage]
8 | }
9 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/spiders/EmailSpider.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.spiders
2 |
3 | import org.rovak.scraper.models.WebPage
4 | import java.util.regex.Pattern
5 |
6 | trait EmailSpider {
7 | this: Spider =>
8 |
9 | var foundEmails = List[String]()
10 |
11 | val searchPattern = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
12 |
13 | private val pattern = Pattern.compile(searchPattern)
14 |
15 | var onEmailFound = List[String => Unit]()
16 |
17 | onReceivedPage ::= {
18 | case page: WebPage if page.doc != null =>
19 | try {
20 | val m = pattern.matcher(page.doc.body().text)
21 | while (m.find()) {
22 | onEmailFound.foreach(_(m.group))
23 | foundEmails ::= m.group
24 | }
25 | }
26 | catch {
27 | case e: Exception =>
28 | }
29 | case _ =>
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/spiders/SitemapSpider.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.spiders
2 |
3 | import java.net.URL
4 | import scala.xml.{Elem, XML}
5 |
6 | trait SitemapSpider {
7 | this: Spider =>
8 |
9 | var sitemapUrls = List[String]()
10 |
11 | def openSitemap(url: String): List[String] = {
12 | val xml = XML.load(url)
13 | (xml \\ "loc") map {
14 | case (x: Elem) =>
15 | val location = x.text
16 | if (location.endsWith(".xml")) openSitemap(location)
17 | else List(location)
18 | }.flatMap(x => x).toList
19 | }
20 |
21 | def startUrlsSiteMaps = {
22 | startUrls.map { startUrl: String =>
23 | val start = new URL(startUrl)
24 | s"${start.getProtocol}://${start.getHost}/sitemap.xml"
25 | }
26 | }
27 |
28 | onStart ::= { spider: Spider =>
29 | (sitemapUrls ++ startUrlsSiteMaps).foldLeft(List[String]()) {
30 | (list, sitemap) => list ++ openSitemap(sitemap)
31 | }
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/spiders/Spider.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.spiders
2 |
3 | import org.rovak.scraper.models.{Href, WebPage}
4 | import org.rovak.scraper.scrapers.DefaultScraper
5 | import org.rovak.scraper.ScrapeManager
6 |
7 |
8 | class Spider {
9 |
10 | import ScrapeManager._
11 |
12 | /**
13 | * Scraper which will be used to scrape pages
14 | */
15 | implicit var scraper = new DefaultScraper
16 |
17 | /**
18 | * Allowed domains
19 | */
20 | var allowedDomains = List[String]()
21 |
22 | /**
23 | * Urls from which the crawling will start
24 | */
25 | var startUrls = List[String]()
26 |
27 | /**
28 | * All crawled pages
29 | */
30 | var crawledPages = List[WebPage]()
31 |
32 | /**
33 | * Triggered when a page has been downloaded
34 | */
35 | var onReceivedPage = List[WebPage => Unit]()
36 |
37 | /**
38 | * Triggered just before the spider starts searching
39 | *
40 | * Additional start urls can be returned by the method
41 | */
42 | var onStart = List[Spider => List[String]]()
43 |
44 | /**
45 | * Triggered every time a link is found
46 | */
47 | var onLinkFound = List[Href => Unit]()
48 |
49 | /**
50 | * Before reading a page check if it is allowed
51 | * @param page page which will be scraped
52 | * @return if the page is allowed to be read
53 | */
54 | def beforeReadingPage(page: WebPage) = {
55 | allowedDomains.contains(page.url.getHost) && !crawledPages.contains(page)
56 | }
57 |
58 | /**
59 | * Scrape a single page
60 | *
61 | * @param page page to scrape
62 | */
63 | def scrapePage(page: WebPage): Unit = {
64 | try {
65 | if (beforeReadingPage(page)) {
66 | crawledPages ::= page
67 | scrape from page.link open { page =>
68 | onReceivedPage.foreach(_(page))
69 | page.links.foreach { link =>
70 | onLinkFound.foreach(y => y(link))
71 | scrapePage(WebPage(link.url))
72 | }
73 | }
74 | }
75 | }
76 | catch {
77 | case invalidUrl: java.net.MalformedURLException => println("Invalid URL: " + page.link)
78 | case e: Exception => println("Error while scraping " + page.link + ": " + e.getMessage)
79 | }
80 | }
81 |
82 | /**
83 | * Start running the spider
84 | */
85 | def start() = {
86 | onStart.foldLeft(startUrls) {
87 | case (urls, current) => urls ++ current(this)
88 | } foreach (x => scrapePage(new WebPage(x)))
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/scraper/src/main/scala/org/rovak/scraper/websites/Google.scala:
--------------------------------------------------------------------------------
1 | package org.rovak.scraper.websites
2 |
3 |
4 | object Google {
5 |
6 | val results = "#res li.g h3.r a"
7 |
8 | def search(term: String) = {
9 | "http://www.google.com/search?q=" + term.replace(" ", "+")
10 | }
11 | }
12 |
--------------------------------------------------------------------------------