├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── project ├── Build.scala ├── build.properties └── plugins.sbt ├── scraper-demo └── src │ └── main │ └── scala │ ├── DevelopApp.scala │ └── ScrapeApp.scala ├── scraper-server └── src │ └── main │ └── scala │ ├── ClientApp.scala │ ├── MainApp.scala │ └── ServerApp.scala └── scraper └── src └── main └── scala └── org └── rovak └── scraper ├── ScrapeManager.scala ├── collectors ├── Collector.scala └── FileWriterCollector.scala ├── models ├── QueryBuilder.scala ├── Result.scala └── WebPage.scala ├── scrapers ├── AkkaScraper.scala ├── DefaultScraper.scala └── Scraper.scala ├── spiders ├── EmailSpider.scala ├── SitemapSpider.scala └── Spider.scala └── websites └── Google.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # other 5 | .cache 6 | .classpath 7 | 8 | # sbt specific 9 | dist/* 10 | target/ 11 | lib_managed/ 12 | src_managed/ 13 | project/boot/ 14 | project/plugins/project/ 15 | 16 | # Scala-IDE specific 17 | .scala_dependencies 18 | 19 | .idea 20 | .idea_modules 21 | 22 | .project 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.2 4 | - 2.10.3 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Roy van Kaathoven (www.razko.nl) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scala Webscraper 0.4.1 2 | ====================== 3 | 4 | [![Build Status](https://api.travis-ci.org/Rovak/ScalaWebscraper.png?branch=master)](https://travis-ci.org/Rovak/ScalaWebscraper) 5 | 6 | ## Getting started 7 | 8 | The project is built with Scala 2.10.2 and sbt 0.13.0; both can be installed 9 | using this [install script](https://gist.github.com/Rovak/4967148). 10 | 11 | To try the example, navigate to the project folder and run `sbt "project scraper-demo" run`, 12 | which will start the example scraper. 13 | 14 | ## Installation 15 | 16 | If you use sbt, you only have to edit `build.sbt` and add the following: 17 | 18 | ```scala 19 | libraryDependencies += "nl.razko" %% "scraper" % "0.4.1" 20 | ``` 21 | 22 | If you want to use bleeding-edge snapshot versions, add the Sonatype snapshots repository to the resolvers: 23 | 24 | ```scala 25 | resolvers += "Sonatype Snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 26 | 27 | libraryDependencies += "nl.razko" %% "scraper" % "0.4.1-SNAPSHOT" 28 | ``` 29 | 30 | ## DSL 31 | 32 | The webscraper provides a simple DSL for writing scrape rules: 33 | 34 | ```scala 35 | import org.rovak.scraper.ScrapeManager._ 36 | import org.jsoup.nodes.Element 37 | 38 | object Google { 39 | val results = "#res li.g h3.r a" 40 | def search(term: String) = { 41 | "http://www.google.com/search?q=" + term.replace(" ", "+") 42 | } 43 | } 44 | 45 | // Open the search results page for the query "php elephant" 46 | scrape from Google.search("php elephant") open { implicit page => 47 | 48 | // Iterate through every result link 49 | Google.results each { x: Element => 50 | // Strip the "http://www.google.com/url?q=" redirect prefix (28 characters) 51 | val link = x.select("a[href]").attr("abs:href").substring(28) 52 | if (link.isValidURL) { 53 | 54 | // Iterate through every link found on the opened page 55 | scrape from link each (x => println("found: " + x)) 56 | } 57 | } 58 | } 59 | 60 | ``` 61 | 62 | ## Spiders 63 | 64 | A spider is a scraper which recursively loads pages and follows every link it finds. It will 65 | keep scraping until all pages within the allowed domains have been visited once. 66 | 67 | The following snippet demonstrates a basic spider which crawls a website and provides 68 | hooks to do something with the scraped data: 69 | 70 | ```scala 71 | new Spider { 72 | startUrls ::= "http://events.stanford.edu/" 73 | allowedDomains ::= "events.stanford.edu" 74 | 75 | onReceivedPage ::= { page: WebPage => 76 | // Page received 77 | } 78 | 79 | onLinkFound ::= { link: Href => 80 | println(s"Found link ${link.url} with name ${link.name}") 81 | } 82 | }.start() 83 | ``` 84 | 85 | The spider can be extended by mixing in traits. If you want to scrape e-mail addresses, add the 86 | EmailSpider trait, which offers an additional `onEmailFound` hook in which the found addresses can be collected.
87 | 88 | ```scala 89 | new Spider with EmailSpider { 90 | startUrls ::= "http://events.stanford.edu/" 91 | allowedDomains ::= "events.stanford.edu" 92 | 93 | onEmailFound ::= { email: String => 94 | // Email found 95 | } 96 | 97 | onReceivedPage ::= { page: WebPage => 98 | // Page received 99 | } 100 | 101 | onLinkFound ::= { link: Href => 102 | println(s"Found link ${link.url} with name ${link.name}") 103 | } 104 | }.start() 105 | ``` 106 | 107 | Multiple spider traits can be mixed together: 108 | 109 | ```scala 110 | new Spider with EmailSpider with SitemapSpider { 111 | startUrls ::= "http://events.stanford.edu/" 112 | allowedDomains ::= "events.stanford.edu" 113 | sitemapUrls ::= "http://events.stanford.edu/sitemap.xml" 114 | 115 | onEmailFound ::= { email: String => 116 | println("Found email: " + email) 117 | } 118 | 119 | onReceivedPage ::= { page: WebPage => 120 | // Page received 121 | } 122 | 123 | onLinkFound ::= { link: Href => 124 | println(s"Found link ${link.url} with name ${link.name}") 125 | } 126 | }.start() 127 | ``` 128 | 129 | ## Documentation 130 | 131 | - [API](http://ci.razko.nl/job/WebsiteScraper/Documentation/index.html) 132 | -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | import scala.Some 4 | 5 | object Build extends Build { 6 | 7 | val projectVersion = "0.4.1" 8 | 9 | val defaultSettings = Project.defaultSettings ++ Seq( 10 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/", 11 | resolvers += "sonatype-public" at "https://oss.sonatype.org/content/groups/public", 12 | version := projectVersion, 13 | scalaVersion := "2.10.3") 14 | 15 | val publishSettings = Seq( 16 | 17 | /** 18 | * Publish settings 19 | */ 20 | publishTo <<= version { v: String => 21 | val nexus = "https://oss.sonatype.org/" 22 | if (v.trim.endsWith("SNAPSHOT")) 23 | Some("snapshots" at nexus + "content/repositories/snapshots") 24 | else 25 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 26 | }, 27 | 28 | organization := "nl.razko", 29 | 30 | publishMavenStyle := true, 31 | 32 | pomIncludeRepository := { _ => false }, 33 | 34 | publishArtifact in Test := false, 35 | 36 | pomExtra := ( 37 | <url>https://github.com/Rovak/ScalaWebscraper</url> 38 | <licenses> 39 | <license> 40 | <name>MIT</name> 41 | <url>http://opensource.org/licenses/MIT</url> 42 | <distribution>repo</distribution> 43 | </license> 44 | </licenses> 45 | <scm> 46 | <url>git@github.com:Rovak/ScalaWebscraper.git</url> 47 | <connection>scm:git:git@github.com:Rovak/ScalaWebscraper.git</connection> 48 | </scm> 49 | <developers> 50 | <developer> 51 | <id>rovak</id> 52 | <name>Roy van Kaathoven</name> 53 | <url>http://rovak.pro</url> 54 | </developer> 55 | </developers> 56 | ) 57 | ) 58 | 59 | lazy val scraper = Project( 60 | id = "scraper", 61 | base = file("scraper"), 62 | settings = defaultSettings ++ publishSettings ++ Seq( 63 | libraryDependencies ++= Seq( 64 | Dependencies.Etc.jsoup, 65 | Dependencies.Akka.actor) 66 | ) 67 | ) 68 | 69 | lazy val scraperServer = Project( 70 | id = "scraper-server", 71 | base = file("scraper-server"), 72 | dependencies = Seq(scraper), 73 | settings = defaultSettings ++ Seq( 74 | libraryDependencies ++= Dependencies.akka 75 | ) 76 | ).aggregate(scraper) 77 | 78 | lazy val scraperDemo = Project( 79 | id = "scraper-demo", 80 | base = file("scraper-demo"), 81 | dependencies = Seq(scraper), 82 | settings = defaultSettings ++ Seq( 83 | libraryDependencies ++= Dependencies.database ++ Seq(Dependencies.Typesafe.config) 84 | ) 85 | ).aggregate(scraper) 86 | 87 | object Dependencies { 88 | object Akka { 89 | val actor =
"com.typesafe.akka" %% "akka-actor" % "2.2.1" 90 | val remote = "com.typesafe.akka" %% "akka-remote" % "2.2.1" 91 | val kernel = "com.typesafe.akka" %% "akka-kernel" % "2.2.1" 92 | } 93 | 94 | object Typesafe { 95 | val config = "com.typesafe" % "config" % "1.0" 96 | } 97 | 98 | object Db { 99 | val slick = "com.typesafe.slick" %% "slick" % "1.0.0" 100 | val mysql = "mysql" % "mysql-connector-java" % "5.1.13" 101 | } 102 | 103 | object Etc { 104 | val jsoup = "org.jsoup" % "jsoup" % "1.7.2" 105 | val slf4j = "org.slf4j" % "slf4j-nop" % "1.6.4" 106 | } 107 | 108 | val database = Seq(Db.slick, Db.mysql) 109 | val akka = Seq(Akka.actor, Akka.remote, Akka.kernel) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.0 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 2 | 3 | addSbtPlugin(dependency="com.github.mpeltonen" % "sbt-idea" % "1.5.0-SNAPSHOT") 4 | -------------------------------------------------------------------------------- /scraper-demo/src/main/scala/DevelopApp.scala: -------------------------------------------------------------------------------- 1 | class DevelopApp { 2 | 3 | } 4 | -------------------------------------------------------------------------------- /scraper-demo/src/main/scala/ScrapeApp.scala: -------------------------------------------------------------------------------- 1 | package scraper.demo 2 | 3 | import org.rovak.scraper.models._ 4 | import org.rovak.scraper.spiders._ 5 | import org.rovak.scraper.models.Href 6 | 7 | object ScrapeApp extends App { 8 | 9 | new Spider { 10 | startUrls ::= "http://events.stanford.edu/" 11 | allowedDomains ::= "events.stanford.edu" 12 | 13 | onReceivedPage ::= { page: WebPage => 14 | // Page received 15 | } 16 | 17 | onLinkFound ::= { link: Href => 18 | println(s"Found link ${link.url} with name ${link.name}") 19 | } 20 | }.start() 21 | 22 | } -------------------------------------------------------------------------------- /scraper-server/src/main/scala/ClientApp.scala: -------------------------------------------------------------------------------- 1 | package scraper.demo 2 | 3 | import akka.actor._ 4 | import akka.routing._ 5 | import akka.kernel.Bootable 6 | import com.typesafe.config._ 7 | 8 | /** 9 | * Client application 10 | */ 11 | class ClientApp(port: Int = 2555) extends Bootable { 12 | 13 | var cfg = ConfigFactory.parseString(""" 14 | akka.remote.netty { 15 | hostname = "127.0.0.1" 16 | port = """ + port + """ 17 | } 18 | """) 19 | .withFallback(ConfigFactory.load.getConfig("client")); 20 | 21 | val system = ActorSystem("Client", cfg) 22 | //val actor = system.actorOf(Props[scrapers.QueryScraper].withRouter(RoundRobinRouter(nrOfInstances = 15)), "query") 23 | 24 | def startup = { 25 | 26 | } 27 | 28 | def shutdown = { 29 | system.shutdown() 30 | } 31 | } 32 | 33 | /** 34 | * Main 35 | */ 36 | object ClientListener { 37 | def main(args: Array[String]) { 38 | new ClientApp 39 | println("Started Application - waiting for instructions") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scraper-server/src/main/scala/MainApp.scala: 
-------------------------------------------------------------------------------- 1 | package scraper.demo 2 | 3 | /** 4 | * Main application which will be used in the packaged version 5 | */ 6 | object MainApp { 7 | 8 | var serverType: String = "client" 9 | var port: Int = 2555 10 | 11 | /** 12 | * Run the app and switch modes based on the given console arguments 13 | * 14 | * @param args 15 | * server|client if server or client mode 16 | * port if client then add port number 17 | */ 18 | def main(args: Array[String]) { 19 | 20 | if (args.length >= 1) { 21 | serverType = args(0) 22 | } 23 | 24 | if (args.length >= 2 && serverType == "client") { 25 | port = args(1).toInt 26 | } 27 | 28 | serverType.toLowerCase() match { 29 | case "client" => runClient 30 | case "server" => runServer 31 | } 32 | } 33 | 34 | def runServer = { 35 | println("Running as Server") 36 | new ServerApp 37 | } 38 | 39 | def runClient = { 40 | println("Running as Client on port: " + port.toString) 41 | new ClientApp(port) 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /scraper-server/src/main/scala/ServerApp.scala: -------------------------------------------------------------------------------- 1 | package scraper.demo 2 | 3 | import akka.actor._ 4 | import akka.kernel.Bootable 5 | import com.typesafe.config._ 6 | import akka.routing.RoundRobinRouter 7 | 8 | /** 9 | * Main server which will send instructions to the clients 10 | */ 11 | class ServerApp extends Bootable { 12 | val system = ActorSystem("ServerPool", ConfigFactory.load.getConfig("server")) 13 | val server1 = system.actorFor("akka://Client@127.0.0.1:2554/user/query") 14 | val server2 = system.actorFor("akka://Client@127.0.0.1:2555/user/query") 15 | val routees = List[ActorRef](server1, server2) 16 | val router2 = system.actorOf(Props().withRouter(RoundRobinRouter(routees = routees))) 17 | 18 | def sendMessage = { 19 | 20 | } 21 | 22 | def startup = { 23 | 24 | } 25 | 26 | def shutdown = { 27 | system.shutdown() 28 | } 29 | } 30 | 31 | object ServerBootableApp { 32 | def main(args: Array[String]) { 33 | val app = new ServerApp 34 | println("Started Server Sending messages") 35 | while (true) { 36 | app.sendMessage 37 | Thread.sleep(200) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/ScrapeManager.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper 2 | 3 | import akka.actor._ 4 | import akka.routing.RoundRobinRouter 5 | import org.rovak.scraper.models.{Result, Href, WebPage, QueryBuilder} 6 | import akka.pattern.ask 7 | import scala.concurrent.duration._ 8 | import akka.util.Timeout 9 | import scala.collection.JavaConversions.asScalaBuffer 10 | import org.jsoup.nodes.Element 11 | import java.net._ 12 | import org.rovak.scraper.collectors.Collector 13 | import org.rovak.scraper.models.WebPage 14 | import org.rovak.scraper.scrapers.DefaultScraper 15 | 16 | object ScrapeManager { 17 | 18 | implicit def String2Url(url: String) = new URL(url) 19 | 20 | implicit var scraper = new DefaultScraper 21 | 22 | def scrape = new QueryBuilder() 23 | 24 | implicit class StringUtils(query: String) { 25 | def collect(reader: Element => Result)(implicit c: Collector, page: WebPage) = { 26 | page.doc.select(query).map(x => c.collect(reader(x))) 27 | } 28 | 29 | def each[T](reader: Element => T)(implicit page: WebPage): List[T] = { 30 | 
page.doc.select(query).map(reader).toList 31 | } 32 | 33 | /** 34 | * Validate if the given URL is valid 35 | * @return 36 | */ 37 | def isValidURL: Boolean = { 38 | try { 39 | new URL(query).toURI 40 | true 41 | } 42 | catch { 43 | case e: MalformedURLException => false 44 | case e: URISyntaxException => false 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/collectors/Collector.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.collectors 2 | 3 | import org.rovak.scraper.models.Result 4 | 5 | abstract class Collector { 6 | def collect(result: Result) 7 | } 8 | 9 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/collectors/FileWriterCollector.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.collectors 2 | 3 | import java.io._ 4 | import org.rovak.scraper.models.Result 5 | 6 | /** 7 | * Collects results and writes them to a local file 8 | */ 9 | class FileWriterCollector(filename: String = "results.txt") extends Collector { 10 | 11 | val writer: Writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "utf-8")) 12 | 13 | def collect(result: Result) = { 14 | writer.write(result.toCSV + "\n") 15 | writer.flush() 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/models/QueryBuilder.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.models 2 | 3 | import scala.concurrent.{ExecutionContext, Await} 4 | import scala.concurrent.duration._ 5 | import scala.collection.JavaConversions._ 6 | import org.jsoup.select.Elements 7 | import org.jsoup.nodes.Element 8 | import org.rovak.scraper.scrapers.Scraper 9 | 10 | case class FromClass(f: QueryBuilder => String) { 11 | def execute(qb: QueryBuilder) = f(qb) 12 | } 13 | 14 | class QueryBuilder(implicit val scraper: Scraper, var pageUrl: String = "", var query: String = "") extends Serializable with Iterable[Href] { 15 | 16 | import ExecutionContext.Implicits.global 17 | 18 | def iterator = Await.result(links, 5 second).iterator 19 | 20 | def from(newUrl: String): QueryBuilder = { 21 | pageUrl = newUrl 22 | this 23 | } 24 | 25 | def from(func: FromClass): QueryBuilder = { 26 | func.execute(this) 27 | this 28 | } 29 | 30 | def select(newQuery: String): QueryBuilder = { 31 | query = newQuery 32 | this 33 | } 34 | 35 | /** 36 | * Read the page and execute the method on success 37 | * @param webPage 38 | * @return 39 | */ 40 | def open(webPage: WebPage => Unit): QueryBuilder = { 41 | page onSuccess { case (page: WebPage) => webPage(page) } 42 | this 43 | } 44 | 45 | /** 46 | * Download a page 47 | */ 48 | protected def page = scraper.downloadPage(pageUrl) 49 | 50 | /** 51 | * Download the page and look for tags with a href attribute 52 | */ 53 | def links = { 54 | page map { 55 | case (x: WebPage) => { 56 | x.doc.select(query).map(x => 57 | new Href { 58 | url = x.select("a[href]").attr("abs:href") 59 | name = x.select("a[href]").text 60 | }).toList 61 | } 62 | } 63 | } 64 | 65 | /** 66 | * Scrape the page and collect the data 67 | * 68 | * @param f a function which will be called for every result 69 | */ 70 | def each(f: Element => Unit): QueryBuilder = { 71 | 
page map { 72 | case (x: WebPage) => x.doc.select(query) 73 | } onSuccess { 74 | case (x: Elements) => x.map(f) 75 | } 76 | this 77 | } 78 | } -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/models/Result.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.models 2 | 3 | trait Result { 4 | 5 | def toCSV: String 6 | } 7 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/models/WebPage.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.models 2 | 3 | import org.jsoup.nodes.Document 4 | import scala.collection.JavaConversions._ 5 | 6 | case class WebPage(url: java.net.URL) { 7 | 8 | var content = "" 9 | 10 | var doc: Document = null 11 | 12 | def link = url.toString 13 | 14 | def links = { 15 | doc.select("a").map(x => 16 | new Href { 17 | url = x.select("a[href]").attr("abs:href") 18 | name = x.select("a[href]").text 19 | }).toList 20 | } 21 | 22 | } 23 | 24 | case class PageNotFound() 25 | 26 | case class Href(var url: String = "", var name: String = "") extends Result { 27 | def toCSV: String = url 28 | } -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/scrapers/AkkaScraper.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.scrapers 2 | 3 | import akka.actor.{Props, ActorSystem, Actor} 4 | import org.jsoup.Jsoup 5 | import org.rovak.scraper.models.{PageNotFound, WebPage} 6 | import java.net.URL 7 | import akka.routing.RoundRobinRouter 8 | import scala.concurrent.ExecutionContext 9 | import org.rovak.scraper.scrapers.AkkaScraperActor.DownloadPage 10 | import akka.pattern.ask 11 | import akka.util.Timeout 12 | import scala.concurrent.duration._ 13 | 14 | object AkkaScraperManager { 15 | val system = ActorSystem() 16 | } 17 | 18 | object AkkaScraperActor { 19 | case class DownloadPage(url: String) 20 | } 21 | 22 | class AkkaScraperActor extends Actor { 23 | 24 | import AkkaScraperActor._ 25 | 26 | def fetchPage(pageUrl: String) = { 27 | try { 28 | new WebPage(new URL(pageUrl)) { 29 | doc = Jsoup 30 | .connect(pageUrl) 31 | .userAgent("Mozilla") 32 | .followRedirects(true) 33 | .timeout(0) 34 | .get 35 | } 36 | } 37 | catch { 38 | case e: Exception => PageNotFound() 39 | } 40 | } 41 | 42 | def receive = { 43 | case DownloadPage(url) => sender ! fetchPage(url.toString) 44 | } 45 | } 46 | 47 | /** 48 | * Akka scraper 49 | */ 50 | class AkkaScraper extends Scraper { 51 | 52 | import ExecutionContext.Implicits.global 53 | 54 | implicit val timeout = Timeout(5.seconds) 55 | 56 | val scrapeActor = AkkaScraperManager.system.actorOf(Props[AkkaScraperActor].withRouter(RoundRobinRouter(nrOfInstances = 15)), "scraper") 57 | 58 | def downloadPage(url: String) = { 59 | (scrapeActor ? 
DownloadPage(url)) map { 60 | case page: WebPage => page 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/scrapers/DefaultScraper.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.scrapers 2 | 3 | import scala.concurrent.{ExecutionContext, Future} 4 | import org.rovak.scraper.models.{PageNotFound, WebPage} 5 | import java.net.URL 6 | import org.jsoup.Jsoup 7 | 8 | /** 9 | * Default scraper 10 | */ 11 | class DefaultScraper extends Scraper { 12 | 13 | import ExecutionContext.Implicits.global 14 | 15 | def downloadPage(pageUrl: String) = Future { 16 | new WebPage(new URL(pageUrl)) { 17 | doc = Jsoup 18 | .connect(pageUrl) 19 | .userAgent("Mozilla") 20 | .followRedirects(true) 21 | .timeout(0) 22 | .get 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/scrapers/Scraper.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.scrapers 2 | 3 | import org.rovak.scraper.models.WebPage 4 | import scala.concurrent.Future 5 | 6 | trait Scraper { 7 | def downloadPage(url: String): Future[WebPage] 8 | } 9 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/spiders/EmailSpider.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.spiders 2 | 3 | import org.rovak.scraper.models.WebPage 4 | import java.util.regex.Pattern 5 | 6 | trait EmailSpider { 7 | this: Spider => 8 | 9 | var foundEmails = List[String]() 10 | 11 | val searchPattern = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+" 12 | 13 | private val pattern = Pattern.compile(searchPattern) 14 | 15 | var onEmailFound = List[String => Unit]() 16 | 17 | onReceivedPage ::= { 18 | case page: WebPage if page.doc != null => 19 | try { 20 | val m = pattern.matcher(page.doc.body().text) 21 | while (m.find()) { 22 | onEmailFound.foreach(_(m.group)) 23 | foundEmails ::= m.group 24 | } 25 | } 26 | catch { 27 | case e: Exception => 28 | } 29 | case _ => 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/spiders/SitemapSpider.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.spiders 2 | 3 | import java.net.URL 4 | import scala.xml.{Elem, XML} 5 | 6 | trait SitemapSpider { 7 | this: Spider => 8 | 9 | var sitemapUrls = List[String]() 10 | 11 | def openSitemap(url: String): List[String] = { 12 | val xml = XML.load(url) 13 | (xml \\ "loc") map { 14 | case (x: Elem) => 15 | val location = x.text 16 | if (location.endsWith(".xml")) openSitemap(location) 17 | else List(location) 18 | } flatMap { x: List[String] => x} toList 19 | } 20 | 21 | def startUrlsSiteMaps = { 22 | startUrls.map { startUrl: String => 23 | val start = new URL(startUrl) 24 | s"${start.getProtocol}://${start.getHost}/sitemap.xml" 25 | } 26 | } 27 | 28 | onStart ::= { spider: Spider => 29 | (sitemapUrls ++ startUrlsSiteMaps).foldLeft(List[String]()) { 30 | (list, sitemap) => list ++ openSitemap(sitemap) 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/spiders/Spider.scala: 
-------------------------------------------------------------------------------- 1 | package org.rovak.scraper.spiders 2 | 3 | import org.rovak.scraper.models.{Href, WebPage} 4 | import org.rovak.scraper.scrapers.DefaultScraper 5 | import org.rovak.scraper.ScrapeManager 6 | 7 | 8 | class Spider { 9 | 10 | import ScrapeManager._ 11 | 12 | /** 13 | * Scraper which will be used to scrape pages 14 | */ 15 | implicit var scraper = new DefaultScraper 16 | 17 | /** 18 | * Allowed domains 19 | */ 20 | var allowedDomains = List[String]() 21 | 22 | /** 23 | * Urls from which the crawling will start 24 | */ 25 | var startUrls = List[String]() 26 | 27 | /** 28 | * All crawled pages 29 | */ 30 | var crawledPages = List[WebPage]() 31 | 32 | /** 33 | * Triggered when a page has been downloaded 34 | */ 35 | var onReceivedPage = List[WebPage => Unit]() 36 | 37 | /** 38 | * Triggered just before the spider starts searching 39 | * 40 | * Additional start urls can be returned by the method 41 | */ 42 | var onStart = List[Spider => List[String]]() 43 | 44 | /** 45 | * Triggered every time a link is found 46 | */ 47 | var onLinkFound = List[Href => Unit]() 48 | 49 | /** 50 | * Before reading a page check if it is allowed 51 | * @param page page which will be scraped 52 | * @return if the page is allowed to be read 53 | */ 54 | def beforeReadingPage(page: WebPage) = { 55 | allowedDomains.contains(page.url.getHost) && !crawledPages.contains(page) 56 | } 57 | 58 | /** 59 | * Scrape a single page 60 | * 61 | * @param page page to scrape 62 | */ 63 | def scrapePage(page: WebPage): Unit = { 64 | try { 65 | if (beforeReadingPage(page)) { 66 | crawledPages ::= page 67 | scrape from page.link open { page => 68 | onReceivedPage.foreach(_(page)) 69 | page.links.foreach { link => 70 | onLinkFound.foreach(y => y(link)) 71 | scrapePage(WebPage(link.url)) 72 | } 73 | } 74 | } 75 | } 76 | catch { 77 | case invalidUrl: java.net.MalformedURLException => println("Invalid URL") 78 | case e: Exception => println("Error") 79 | } 80 | } 81 | 82 | /** 83 | * Start running the spider 84 | */ 85 | def start() = { 86 | onStart.foldLeft(startUrls) { 87 | case (urls, current) => urls ++ current(this) 88 | } foreach (x => scrapePage(new WebPage(x))) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /scraper/src/main/scala/org/rovak/scraper/websites/Google.scala: -------------------------------------------------------------------------------- 1 | package org.rovak.scraper.websites 2 | 3 | 4 | object Google { 5 | 6 | val results = "#res li.g h3.r a" 7 | 8 | def search(term: String) = { 9 | "http://www.google.com/search?q=" + term.replace(" ", "+") 10 | } 11 | } 12 | --------------------------------------------------------------------------------
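The `collectors` package (`Collector`, `FileWriterCollector`) appears in the tree above but is not demonstrated in the README. Below is a minimal usage sketch, not part of the repository, showing how the `collect` method from `ScrapeManager` might be combined with a `FileWriterCollector` to write scraped links to disk; the `CollectorDemo` object, the `results.csv` file name and the search term are illustrative assumptions.

```scala
import org.rovak.scraper.ScrapeManager._
import org.rovak.scraper.collectors.FileWriterCollector
import org.rovak.scraper.models.Href
import org.rovak.scraper.websites.Google

// Hypothetical demo object; it only combines pieces defined in the library above
object CollectorDemo extends App {

  // Every collected Result is appended to results.csv via its toCSV method
  implicit val collector = new FileWriterCollector("results.csv")

  // Open the Google results page and collect each result link as an Href
  scrape from Google.search("scala webscraper") open { implicit page =>
    Google.results collect { element =>
      Href(element.attr("abs:href"), element.text)
    }
  }
}
```

Because `collect` takes the `Collector` and the downloaded `WebPage` as implicit parameters, marking `page` implicit in the `open` callback and declaring the collector as an implicit value is enough to wire everything together.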