├── .gitignore ├── young-crawler-searcher ├── public │ ├── stylesheets │ │ └── main.css │ ├── javascripts │ │ └── hello.js │ └── images │ │ └── favicon.png ├── .gitignore ├── libexec │ └── activator-launch-1.3.10.jar ├── project │ ├── build.properties │ └── plugins.sbt ├── build.sbt ├── LICENSE ├── app │ ├── views │ │ ├── index.scala.html │ │ └── main.scala.html │ ├── controllers │ │ ├── HomeController.scala │ │ ├── CountController.scala │ │ └── AsyncController.scala │ ├── services │ │ ├── Counter.scala │ │ └── ApplicationTimer.scala │ ├── Module.scala │ ├── Filters.scala │ └── filters │ │ └── ExampleFilter.scala ├── test │ ├── IntegrationSpec.scala │ └── ApplicationSpec.scala ├── conf │ ├── routes │ ├── logback.xml │ └── application.conf ├── README └── bin │ ├── activator.bat │ └── activator ├── README.md └── young-crawler-core └── src ├── main ├── scala │ └── com │ │ └── young │ │ └── crawler │ │ ├── spider │ │ ├── task │ │ │ ├── IndexTask.scala │ │ │ ├── FetchTask.scala │ │ │ ├── SlaveTask.scala │ │ │ ├── CounterTask.scala │ │ │ ├── ParserTask.scala │ │ │ ├── InjectTask.scala │ │ │ └── support │ │ │ │ └── actor │ │ │ │ ├── IndexActorTask.scala │ │ │ │ ├── FetchActorTask.scala │ │ │ │ ├── ParseActorTask.scala │ │ │ │ ├── InjectActorTask.scala │ │ │ │ └── CounterActorTask.scala │ │ ├── parser │ │ │ ├── Parser.scala │ │ │ └── support │ │ │ │ ├── HtmlParseParser.scala │ │ │ │ └── JsoupParser.scala │ │ ├── fetcher │ │ │ ├── FetcherCache.scala │ │ │ ├── Fetcher.scala │ │ │ └── support │ │ │ │ ├── HttpClientFetcher.scala │ │ │ │ └── HttpWatch.scala │ │ └── indexer │ │ │ ├── Indexer.scala │ │ │ └── support │ │ │ └── ElasticIndexer.scala │ │ ├── entity │ │ ├── InjectEntitys.scala │ │ ├── PageIndexEntity.scala │ │ ├── CounterEntity.scala │ │ └── HttpEntitys.scala │ │ ├── exception │ │ ├── IndexException.scala │ │ ├── ParseException.scala │ │ └── FetchException.scala │ │ ├── utils │ │ ├── JsonUtil.scala │ │ ├── MD5Util.scala │ │ └── IOUtil.scala │ │ ├── cache │ │ ├── 
Cache.scala │ │ └── support │ │ │ ├── MapCache.scala │ │ │ └── RedisCache.scala │ │ ├── config │ │ ├── CrawlerConfig.scala │ │ └── CrawlerConfigContants.scala │ │ └── boot │ │ └── CrawlerBoot.scala ├── java │ ├── Thread1.java │ ├── Runnable1.java │ └── ThreadBoot.java └── resources │ ├── seeds.txt │ └── crawler.properties └── test └── scala └── com └── young └── crawler ├── actor ├── ActorExample.scala └── ActorSelectorExample.scala ├── http └── CrawlerTest.scala ├── cache ├── MapCacheExample.scala └── RedisCacheExample.scala ├── parser └── JsoupExample.scala └── indexer └── Elastic4sExample.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | project 3 | target 4 | -------------------------------------------------------------------------------- /young-crawler-searcher/public/stylesheets/main.css: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # young-crawler 2 | scala结合actor编写的分布式网络爬虫,实现上采用Akka 异步消息处理框架,无阻塞,性能高,网页爬取速度快 3 | # 如何启动 4 | 下载项目后配置crawler.properties,详细配置文件里有注释 5 | -------------------------------------------------------------------------------- /young-crawler-searcher/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | target 3 | /.idea 4 | /.idea_modules 5 | /.classpath 6 | /.project 7 | /.settings 8 | /RUNNING_PID 9 | -------------------------------------------------------------------------------- /young-crawler-searcher/public/javascripts/hello.js: -------------------------------------------------------------------------------- 1 | if (window.console) { 2 | console.log("Welcome to your Play application's JavaScript!"); 3 | } 4 | --------------------------------------------------------------------------------
/young-crawler-searcher/public/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/public/images/favicon.png -------------------------------------------------------------------------------- /young-crawler-searcher/libexec/activator-launch-1.3.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/libexec/activator-launch-1.3.10.jar -------------------------------------------------------------------------------- /young-crawler-searcher/project/build.properties: -------------------------------------------------------------------------------- 1 | #Activator-generated Properties 2 | #Sun Sep 11 15:22:45 CST 2016 3 | template.uuid=b0d11fa6-d1b3-4963-94aa-319a15612bf3 4 | sbt.version=0.13.11 5 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/IndexTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | /** 4 | * Created by dell on 2016/8/29. 5 | */ 6 | trait IndexTask { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/FetchTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | /** 4 | * Created by young.yang on 2016/8/28. 
5 | */ 6 | trait FetchTask { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/SlaveTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | /** 4 | * Created by young.yang on 2016/8/28. 5 | */ 6 | trait SlaveTask { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/CounterTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | /** 4 | * Created by young.yang on 2016/9/3. 5 | */ 6 | trait CounterTask { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/ParserTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | /** 4 | * Created by young.yang on 2016/8/28. 5 | */ 6 | trait ParserTask { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/java/Thread1.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by young.yang on 2016/8/31. 3 | */ 4 | public class Thread1 extends Thread{ 5 | public void run(){ 6 | System.out.println("Thread1 run"); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/java/Runnable1.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by young.yang on 2016/8/31. 
3 | */ 4 | public class Runnable1 implements Runnable { 5 | @Override 6 | public void run() { 7 | System.out.println("Runnable1 run"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/resources/seeds.txt: -------------------------------------------------------------------------------- 1 | http://www.sina.com.cn 2 | http://www.baidu.com 3 | http://www.163.com 4 | http://www.sohu.com 5 | http://www.ifeng.com 6 | http://www.autohome.com.cn/beijing 7 | http://bj.fang.com 8 | http://blog.csdn.net 9 | http://www.gc-zb.com -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/entity/InjectEntitys.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.entity 2 | 3 | /** 4 | * Created by dell on 2016/8/29. 5 | * 初始化种子消息,用来传递给Inject Actor解析种子信息 6 | */ 7 | case class InitSeed(seedPath:String,fileEncode:String="utf-8") 8 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/parser/Parser.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.parser 2 | 3 | import com.young.crawler.entity.{HttpResult, HttpPage} 4 | 5 | /** 6 | * Created by young.yang on 2016/8/28. 7 | * html页面解析接口 8 | */ 9 | trait Parser { 10 | def parse(html:HttpResult):HttpPage 11 | } 12 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/InjectTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task 2 | 3 | import com.young.crawler.entity.Seed 4 | 5 | /** 6 | * Created by young.yang on 2016/8/28. 
7 | */ 8 | trait InjectTask { 9 | def initSeeds(seedPath:String,fileEncode:String="utf-8"): List[Seed] 10 | } 11 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/exception/IndexException.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.exception 2 | 3 | /** 4 | * Created by young.yang on 2016/8/31. 5 | * 自定义索引异常 6 | */ 7 | class IndexException(message:String,e:Throwable) extends Exception(message,e) { 8 | 9 | def this(message:String) = this(message,new Exception(message)) 10 | } 11 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/exception/ParseException.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.exception 2 | 3 | /** 4 | * Created by young.yang on 2016/8/31. 5 | * 自定义解析异常 6 | */ 7 | class ParseException(message:String,e:Throwable) extends Exception(message,e){ 8 | 9 | def this(message:String) = this(message,new Exception(message)) 10 | } 11 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/exception/FetchException.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.exception 2 | 3 | /** 4 | * Created by young.yang on 2016/8/28. 
5 | * 自定义抓取异常 6 | */ 7 | class FetchException(message:String,e:Throwable) extends Exception(message,e){ 8 | 9 | def this(message:String)=this(message,new Exception(message)) 10 | 11 | } 12 | -------------------------------------------------------------------------------- /young-crawler-searcher/build.sbt: -------------------------------------------------------------------------------- 1 | name := """young-crawler-searcher""" 2 | 3 | version := "1.0-SNAPSHOT" 4 | 5 | lazy val root = (project in file(".")).enablePlugins(PlayScala) 6 | 7 | scalaVersion := "2.11.7" 8 | 9 | libraryDependencies ++= Seq( 10 | jdbc, 11 | cache, 12 | ws, 13 | "org.scalatestplus.play" %% "scalatestplus-play" % "1.5.1" % Test 14 | ) 15 | 16 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/actor/ActorExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.actor 2 | 3 | import akka.actor.Actor 4 | import akka.actor.Actor.Receive 5 | 6 | /** 7 | * Created by young.yang on 2016/9/8. 8 | */ 9 | class ActorExample extends Actor{ 10 | override def receive: Receive = { 11 | case line:String=>println("receive a message "+line) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/utils/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.utils 2 | 3 | import org.codehaus.jackson.map.ObjectMapper 4 | 5 | /** 6 | * Created by dell on 2016/8/31. 
7 | */ 8 | private[crawler] object JsonUtil { 9 | 10 | private val mapper = new ObjectMapper 11 | 12 | def toJson(obj:Any):String={ 13 | mapper.writeValueAsString(obj) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/cache/Cache.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.cache 2 | 3 | /** 4 | * Created by dell on 2016/9/2. 5 | * 缓存接口 6 | */ 7 | trait Cache[KEY,VALUE] { 8 | 9 | def contains(key:KEY):Boolean 10 | 11 | def put(key:KEY,value:VALUE) 12 | 13 | def get(key:KEY):Option[VALUE] 14 | 15 | def size():Int 16 | 17 | def keys():scala.collection.Set[KEY] 18 | 19 | } 20 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/java/ThreadBoot.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by young.yang on 2016/8/31. Starts one Thread1 and one Runnable1-backed thread, sleeps 5 seconds, then starts a fresh instance of each. 3 | */ 4 | public class ThreadBoot { 5 | public static void main(String[] args) throws InterruptedException { 6 | Thread thread1 = new Thread1(); 7 | Runnable runnable = new Runnable1(); 8 | Thread thread2 = new Thread(runnable); 9 | thread1.start(); 10 | thread2.start(); 11 | Thread.sleep(5000); 12 | new Thread1().start(); // a Thread object can be started only once; calling thread1.start() again throws IllegalThreadStateException 13 | new Thread(runnable).start(); // the same Runnable can be run again by wrapping it in a new Thread 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/utils/MD5Util.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.utils 2 | 3 | import java.nio.charset.Charset 4 | 5 | import com.google.common.hash.Hashing 6 | 7 | /** 8 | * Created by dell on 2016/8/31.
9 | */ 10 | private[crawler] object MD5Util { 11 | // Returns the hex MD5 digest of `line`. The text is always encoded as UTF-8 so the digest does not depend on the JVM's default charset. 12 | def md5(line: String): String = Hashing.md5().newHasher().putString(line, Charset.forName("UTF-8")).hash().toString 13 | 14 | def main(args: Array[String]) { 15 | println(MD5Util.md5("杨勇")) 16 | println(MD5Util.md5("123")) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/http/CrawlerTest.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.http 2 | 3 | import com.young.crawler.entity.{SeedType, UrlInfo} 4 | import com.young.crawler.spider.fetcher.support.HttpWatch 5 | 6 | /** 7 | * Created by young.yang on 2016/8/28. 8 | */ 9 | object CrawlerTest { 10 | 11 | def main(args: Array[String]) { 12 | val url = "http://www.sina.com.cn" 13 | val result = HttpWatch.get(UrlInfo(url,"",SeedType,0)) 14 | println(result.content) 15 | println(result.status) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/HtmlParseParser.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.parser.support 2 | 3 | import com.young.crawler.entity.{HttpPage, HttpResult} 4 | import com.young.crawler.spider.parser.Parser 5 | 6 | /** 7 | * Created by young.yang on 2016/8/28.
8 | */ 9 | private[crawler] class HtmlParseParser extends Parser { 10 | override def parse(html: HttpResult): HttpPage = { 11 | val page = new HttpPage 12 | page.setContent(html.content) 13 | page.setUrl(html.url) 14 | page 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/cache/MapCacheExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.cache 2 | 3 | import com.young.crawler.cache.support.MapCache 4 | 5 | /** 6 | * Created by dell on 2016/9/2. 7 | */ 8 | object MapCacheExample { 9 | def main(args: Array[String]) { 10 | val cache = new MapCache[String, String] 11 | for(i<-0 to 10){ 12 | cache.put("key_"+i,"value_"+i) 13 | } 14 | println(cache.contains("key_0")) 15 | println(cache.keys()) 16 | println(cache.size()) 17 | println(cache.get("key_12").isEmpty) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/cache/RedisCacheExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.cache 2 | 3 | import com.young.crawler.cache.support.{RedisCache, MapCache} 4 | 5 | /** 6 | * Created by dell on 2016/9/9. 
7 | */ 8 | object RedisCacheExample { 9 | 10 | def main(args: Array[String]) { 11 | val cache = new RedisCache[String, String] 12 | for(i<-0 to 10){ 13 | cache.put("key_"+i,"value_"+i) 14 | } 15 | println(cache.contains("key_0")) 16 | println(cache.size()) 17 | println(cache.get("key_12").isEmpty) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /young-crawler-searcher/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // The Play plugin 2 | addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.5.6") 3 | 4 | // web plugins 5 | 6 | addSbtPlugin("com.typesafe.sbt" % "sbt-coffeescript" % "1.0.0") 7 | 8 | addSbtPlugin("com.typesafe.sbt" % "sbt-less" % "1.1.0") 9 | 10 | addSbtPlugin("com.typesafe.sbt" % "sbt-jshint" % "1.0.3") 11 | 12 | addSbtPlugin("com.typesafe.sbt" % "sbt-rjs" % "1.0.7") 13 | 14 | addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.0") 15 | 16 | addSbtPlugin("com.typesafe.sbt" % "sbt-mocha" % "1.1.0") 17 | 18 | addSbtPlugin("org.irundaia.sbt" % "sbt-sassify" % "1.4.2") 19 | -------------------------------------------------------------------------------- /young-crawler-searcher/LICENSE: -------------------------------------------------------------------------------- 1 | This software is licensed under the Apache 2 license, quoted below. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this project except in compliance with 4 | the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 5 | 6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 7 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific 8 | language governing permissions and limitations under the License. 
-------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/FetcherCache.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.fetcher 2 | 3 | import com.young.crawler.cache.Cache 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig} 5 | 6 | /** 7 | * Created by young.yang on 2016/9/2. 8 | * 网页缓存,用来爬取过程中的去重 9 | */ 10 | private[crawler] object FetcherCache { 11 | //val fetcherCache = new MapCache[String,Byte] 12 | val fetcherCache : Cache[String,Byte] = Class.forName(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_cache_imp)).newInstance().asInstanceOf[(Cache[String,Byte])] 13 | } 14 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/views/index.scala.html: -------------------------------------------------------------------------------- 1 | @* 2 | * This template takes a single argument, a String containing a 3 | * message to display. 4 | *@ 5 | @(message: String) 6 | 7 | @* 8 | * Call the `main` template with two arguments. The first 9 | * argument is a `String` with the title of the page, the second 10 | * argument is an `Html` object containing the body of the page. 11 | *@ 12 | @main("Welcome to Play") { 13 | 14 | @* 15 | * Get an `Html` object by calling the built-in Play welcome 16 | * template and passing a `String` message. 17 | *@ 18 | @play20.welcome(message, style = "Scala") 19 | 20 | } 21 | -------------------------------------------------------------------------------- /young-crawler-searcher/test/IntegrationSpec.scala: -------------------------------------------------------------------------------- 1 | import org.scalatestplus.play._ 2 | import play.api.test._ 3 | import play.api.test.Helpers._ 4 | 5 | /** 6 | * add your integration spec here. 
7 | * An integration test will fire up a whole play application in a real (or headless) browser 8 | */ 9 | class IntegrationSpec extends PlaySpec with OneServerPerTest with OneBrowserPerTest with HtmlUnitFactory { 10 | 11 | "Application" should { 12 | 13 | "work from within a browser" in { 14 | 15 | go to ("http://localhost:" + port) 16 | 17 | pageSource must include ("Your new application is ready.") 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/entity/PageIndexEntity.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.entity 2 | 3 | import scala.beans.BeanProperty 4 | 5 | /** 6 | * Created by dell on 2016/8/31. 7 | * 索引信息 8 | */ 9 | class PageIndexEntity { 10 | @BeanProperty 11 | var url: String = "" 12 | @BeanProperty 13 | var title: String = "" 14 | @BeanProperty 15 | var content: String = "" 16 | @BeanProperty 17 | var publishTime: Long = 0 18 | @BeanProperty 19 | var updateTime: Long = 0 20 | @BeanProperty 21 | var author: String = "" 22 | @BeanProperty 23 | var keywords:String ="" 24 | @BeanProperty 25 | var desc:String = "" 26 | } 27 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/utils/IOUtil.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.utils 2 | 3 | import java.io.{BufferedReader, InputStream, InputStreamReader} 4 | 5 | /** 6 | * Created by young.yang on 2016/8/28. 
7 | */ 8 | private [crawler] object IOUtil { 9 | 10 | def toString(inputStream:InputStream,encode:String):String={ 11 | val bufferReader = new BufferedReader(new InputStreamReader(inputStream,encode)) 12 | val buffer = new StringBuilder(1000) 13 | var line = bufferReader.readLine() 14 | while(line!=null){ 15 | buffer.append(line+"\n") 16 | line = bufferReader.readLine() 17 | } 18 | return buffer.toString() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/actor/ActorSelectorExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.actor 2 | 3 | import akka.actor.{Props, ActorSystem} 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig} 5 | 6 | /** 7 | * Created by young.yang on 2016/9/8. 8 | */ 9 | object ActorSelectorExample { 10 | 11 | def main(args: Array[String]) { 12 | val system = ActorSystem(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName)) 13 | val actor = system.actorOf(Props[ActorExample],"print") 14 | actor!"test" 15 | println(actor) 16 | val actor2 = system.actorSelection("akka://young-crawler/user/print") 17 | actor2!"222" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /young-crawler-searcher/conf/routes: -------------------------------------------------------------------------------- 1 | # Routes 2 | # This file defines all application routes (Higher priority routes first) 3 | # ~~~~ 4 | 5 | # An example controller showing a sample home page 6 | GET / controllers.HomeController.index 7 | # An example controller showing how to use dependency injection 8 | GET /count controllers.CountController.count 9 | # An example controller showing how to write asynchronous code 10 | GET /message controllers.AsyncController.message 11 | 12 | # Map static resources from the /public folder to the 
/assets URL path 13 | GET /assets/*file controllers.Assets.versioned(path="/public", file: Asset) 14 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/controllers/HomeController.scala: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import javax.inject._ 4 | import play.api._ 5 | import play.api.mvc._ 6 | 7 | /** 8 | * This controller creates an `Action` to handle HTTP requests to the 9 | * application's home page. 10 | */ 11 | @Singleton 12 | class HomeController @Inject() extends Controller { 13 | 14 | /** 15 | * Create an Action to render an HTML page with a welcome message. 16 | * The configuration in the `routes` file means that this method 17 | * will be called when the application receives a `GET` request with 18 | * a path of `/`. 19 | */ 20 | def index = Action { 21 | Ok(views.html.index("Your new application is ready.")) 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/indexer/Indexer.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.indexer 2 | 3 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants} 4 | import com.young.crawler.entity.{HttpPage, IndexResult} 5 | 6 | /** 7 | * Created by dell on 2016/8/29. 
8 | * 索引接口 9 | */ 10 | trait Indexer { 11 | 12 | /** 13 | * 文档索引 14 | * @param page 15 | * @return 16 | */ 17 | def index(page: HttpPage): IndexResult 18 | } 19 | 20 | /** 21 | * ES中所有名称和类型 22 | */ 23 | object IndexerConstants { 24 | val indexName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_name) 25 | val indexType = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_type) 26 | } 27 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/cache/support/MapCache.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.cache.support 2 | 3 | import com.young.crawler.cache.Cache 4 | 5 | import scala.collection.immutable.Nil 6 | import scala.collection.mutable 7 | 8 | /** 9 | * Created by dell on 2016/9/2. 10 | * 采用本地Map实现的缓存 11 | */ 12 | private[crawler] class MapCache[KEY,VALUE] extends Cache[KEY,VALUE]{ 13 | 14 | private val map = new mutable.HashMap[KEY,VALUE]() 15 | 16 | override def contains(key: KEY): Boolean = map.contains(key) 17 | 18 | override def get(key: KEY): Option[VALUE] =map.get(key) 19 | 20 | override def put(key:KEY,value:VALUE): Unit = map.put(key,value) 21 | 22 | override def size(): Int = map.size 23 | 24 | override def keys(): scala.collection.Set[KEY] = map.keySet 25 | } 26 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/controllers/CountController.scala: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import javax.inject._ 4 | import play.api._ 5 | import play.api.mvc._ 6 | 7 | import services.Counter 8 | 9 | /** 10 | * This controller demonstrates how to use dependency injection to 11 | * bind a component into a controller class. The class creates an 12 | * `Action` that shows an incrementing count to users. 
The [[Counter]] 13 | * object is injected by the Guice dependency injection system. 14 | */ 15 | @Singleton 16 | class CountController @Inject() (counter: Counter) extends Controller { 17 | 18 | /** 19 | * Create an action that responds with the [[Counter]]'s current 20 | * count. The result is plain text. This `Action` is mapped to 21 | * `GET /count` requests by an entry in the `routes` config file. 22 | */ 23 | def count = Action { Ok(counter.nextCount().toString) } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/entity/CounterEntity.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.entity 2 | 3 | /** 4 | * Created by young.yang on 2016/9/3. 5 | */ 6 | sealed trait Counter 7 | 8 | case class FetchCounter(num: Int) extends Counter 9 | 10 | case class FetchOk(num: Int) extends Counter 11 | 12 | case class FetchError(num: Int) extends Counter 13 | 14 | case class InjectCounter(num: Int) extends Counter 15 | 16 | case class ParseCounter(num: Int) extends Counter 17 | 18 | case class ParseChildUrlCounter(num: Int) extends Counter 19 | 20 | case class IndexCounter(num: Int) extends Counter 21 | 22 | case object PrintCounter extends Counter 23 | 24 | case object GetAllCounter extends Counter 25 | 26 | case class AllCounter(fetchCounter: FetchCounter, fetchOk: FetchOk, fetchError: FetchError, injectCounter: InjectCounter, parseCounter: ParseCounter, parseChildUrlCounter: ParseChildUrlCounter, indexCounter: IndexCounter) extends Counter 27 | 28 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/parser/JsoupExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.parser 2 | 3 | import com.young.crawler.entity.{SeedType, UrlInfo} 4 | import 
com.young.crawler.spider.fetcher.support.HttpClientFetcher 5 | import com.young.crawler.spider.parser.support.JsoupParser 6 | 7 | /** 8 | * Created by dell on 2016/9/1. 9 | */ 10 | object JsoupExample { 11 | 12 | def parserHtml(url:UrlInfo): Unit ={ 13 | val fetcher = new HttpClientFetcher 14 | val parser = new JsoupParser 15 | val page = fetcher.fetchPage(url) 16 | println(page) 17 | val page1 = fetcher.fetchPage(url) 18 | println(page1) 19 | val result = parser.parse(page.get) 20 | println(result.keywords) 21 | println(result.desc) 22 | result.childLink._1.foreach(println _) 23 | } 24 | 25 | def main(args: Array[String]) { 26 | val url = "http://bj.fang.com/" 27 | JsoupExample.parserHtml(UrlInfo(url,"",SeedType,0)) 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/services/Counter.scala: -------------------------------------------------------------------------------- 1 | package services 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | import javax.inject._ 5 | 6 | /** 7 | * This trait demonstrates how to create a component that is injected 8 | * into a controller. The trait represents a counter that returns a 9 | * incremented number each time it is called. 10 | */ 11 | trait Counter { 12 | def nextCount(): Int 13 | } 14 | 15 | /** 16 | * This class is a concrete implementation of the [[Counter]] trait. 17 | * It is configured for Guice dependency injection in the [[Module]] 18 | * class. 19 | * 20 | * This class has a `Singleton` annotation because we need to make 21 | * sure we only use one counter per application. Without this 22 | * annotation we would get a new instance every time a [[Counter]] is 23 | * injected. 
24 | */ 25 | @Singleton 26 | class AtomicCounter extends Counter { 27 | private val atomicCounter = new AtomicInteger() 28 | override def nextCount(): Int = atomicCounter.getAndIncrement() 29 | } 30 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/views/main.scala.html: -------------------------------------------------------------------------------- 1 | @* 2 | * This template is called from the `index` template. This template 3 | * handles the rendering of the page header and body tags. It takes 4 | * two arguments, a `String` for the title of the page and an `Html` 5 | * object to insert into the body of the page. 6 | *@ 7 | @(title: String)(content: Html) 8 | 9 | 10 | 11 | 12 | @* Here's where we render the page title `String`. *@ 13 | @title 14 | 15 | 16 | 17 | 18 | 19 | @* And here's where we render the `Html` object containing 20 | * the page content. *@ 21 | @content 22 | 23 | 24 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/IndexActorTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task.support.actor 2 | 3 | import akka.actor.Actor 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig} 5 | import com.young.crawler.entity.{IndexCounter, HttpPage} 6 | import com.young.crawler.spider.indexer.Indexer 7 | import com.young.crawler.spider.task.IndexTask 8 | 9 | /** 10 | * Created by dell on 2016/8/29. 
11 | * 索引任务 12 | */ 13 | private[crawler] class IndexActorTask(indexer: Indexer) extends Actor with IndexTask { 14 | 15 | private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name)) 16 | 17 | 18 | context.system.actorSelection("") 19 | 20 | override def receive: Receive = { 21 | case page: HttpPage => 22 | indexer.index(page) 23 | countActor ! IndexCounter(1) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfig.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.config 2 | 3 | import java.util.{Locale, ResourceBundle} 4 | 5 | import org.apache.commons.logging.LogFactory 6 | 7 | /** 8 | * Created by young.yang on 2016/9/3. 
9 | */ 10 | private[crawler] object CrawlerConfig { 11 | 12 | private val log = LogFactory.getLog("com.young.crawler.config.CrawlerConfig") 13 | 14 | private val config = ResourceBundle.getBundle("crawler", Locale.getDefault) 15 | 16 | private var init_flag = true 17 | 18 | private def init(): Unit = { 19 | log.info("init crawler config start") 20 | val keys = config.keySet() 21 | val iterator = keys.iterator() 22 | while (iterator.hasNext) { 23 | val key = iterator.next() 24 | log.info("crawler config key = [" + key + "] value = [" + config.getString(key) + "]") 25 | } 26 | log.info("init crawler config end") 27 | init_flag = false 28 | } 29 | 30 | def getConfig = { 31 | if (init_flag) { 32 | init() 33 | } 34 | config 35 | } 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/Fetcher.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.fetcher 2 | 3 | import com.young.crawler.entity.{UrlInfo, HttpResult} 4 | import com.young.crawler.exception.FetchException 5 | import org.apache.http.Header 6 | 7 | /** 8 | * Created by young.yang on 2016/8/28. 
9 | * 爬取接口 10 | */ 11 | trait Fetcher { 12 | 13 | private val CONTENT_TYPE = "Content-Type" 14 | 15 | private val DEFAULT_ENCODE = "utf-8" 16 | 17 | val FETCH_SUCCESS = 200 18 | 19 | val URL_NOT_FOUND = 404 20 | 21 | /** 22 | * 爬取网页入口 23 | */ 24 | @throws[FetchException] 25 | def fetchPage(url:UrlInfo):Option[HttpResult] 26 | 27 | /** 28 | * 根据网页header来探测网页编码 29 | * @param headers 30 | * @return 31 | */ 32 | def getEncode(headers:Array[Header]):String={ 33 | for(header<-headers){ 34 | if(CONTENT_TYPE.equals(header.getName)){ 35 | val temp = header.getValue.split("=") 36 | if(temp.length==2){ 37 | return temp(1) 38 | } 39 | } 40 | } 41 | DEFAULT_ENCODE 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/Module.scala: -------------------------------------------------------------------------------- 1 | import com.google.inject.AbstractModule 2 | import java.time.Clock 3 | 4 | import services.{ApplicationTimer, AtomicCounter, Counter} 5 | 6 | /** 7 | * This class is a Guice module that tells Guice how to bind several 8 | * different types. This Guice module is created when the Play 9 | * application starts. 10 | 11 | * Play will automatically use any class called `Module` that is in 12 | * the root package. You can create modules in other locations by 13 | * adding `play.modules.enabled` settings to the `application.conf` 14 | * configuration file. 15 | */ 16 | class Module extends AbstractModule { 17 | 18 | override def configure() = { 19 | // Use the system clock as the default implementation of Clock 20 | bind(classOf[Clock]).toInstance(Clock.systemDefaultZone) 21 | // Ask Guice to create an instance of ApplicationTimer when the 22 | // application starts. 23 | bind(classOf[ApplicationTimer]).asEagerSingleton() 24 | // Set AtomicCounter as the implementation for Counter. 
25 | bind(classOf[Counter]).to(classOf[AtomicCounter]) 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/Filters.scala: -------------------------------------------------------------------------------- 1 | import javax.inject._ 2 | import play.api._ 3 | import play.api.http.HttpFilters 4 | import play.api.mvc._ 5 | 6 | import filters.ExampleFilter 7 | 8 | /** 9 | * This class configures filters that run on every request. This 10 | * class is queried by Play to get a list of filters. 11 | * 12 | * Play will automatically use filters from any class called 13 | * `Filters` that is placed the root package. You can load filters 14 | * from a different class by adding a `play.http.filters` setting to 15 | * the `application.conf` configuration file. 16 | * 17 | * @param env Basic environment settings for the current application. 18 | * @param exampleFilter A demonstration filter that adds a header to 19 | * each response. 20 | */ 21 | @Singleton 22 | class Filters @Inject() ( 23 | env: Environment, 24 | exampleFilter: ExampleFilter) extends HttpFilters { 25 | 26 | override val filters = { 27 | // Use the example filter if we're running development mode. If 28 | // we're running in production or test mode then don't use any 29 | // filters at all. 
30 | if (env.mode == Mode.Dev) Seq(exampleFilter) else Seq.empty 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /young-crawler-core/src/test/scala/com/young/crawler/indexer/Elastic4sExample.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.indexer 2 | 3 | import java.net.InetAddress 4 | 5 | import com.young.crawler.entity.PageIndexEntity 6 | import com.young.crawler.spider.indexer.IndexerConstants 7 | import com.young.crawler.utils.{JsonUtil, MD5Util} 8 | import org.elasticsearch.client.transport.TransportClient 9 | import org.elasticsearch.common.transport.InetSocketTransportAddress 10 | 11 | /** 12 | * Created by young.yang on 2016/8/30. 13 | */ 14 | object Elastic4sExample { 15 | 16 | val client = TransportClient.builder().build().addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName("115.29.47.216"), 9300)) 17 | 18 | def main(args: Array[String]) { 19 | val page = new PageIndexEntity 20 | page.setAuthor("杨勇") 21 | page.setContent("中华人民共和過") 22 | page.setTitle("测试") 23 | page.setUrl("http://www.baidu.com/1") 24 | page.setPublishTime(System.currentTimeMillis()) 25 | page.setUpdateTime(System.currentTimeMillis()) 26 | client.prepareIndex(IndexerConstants.indexName,IndexerConstants.indexType).setId(MD5Util.md5(page.getUrl)).setSource(JsonUtil.toJson(page)).get() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /young-crawler-searcher/app/filters/ExampleFilter.scala: -------------------------------------------------------------------------------- 1 | package filters 2 | 3 | import akka.stream.Materializer 4 | import javax.inject._ 5 | import play.api.mvc._ 6 | import scala.concurrent.{ExecutionContext, Future} 7 | 8 | /** 9 | * This is a simple filter that adds a header to all requests. It's 10 | * added to the application's list of filters by the 11 | * [[Filters]] class. 
12 | * 13 | * @param mat This object is needed to handle streaming of requests 14 | * and responses. 15 | * @param exec This class is needed to execute code asynchronously. 16 | * It is used below by the `map` method. 17 | */ 18 | @Singleton 19 | class ExampleFilter @Inject()( 20 | implicit override val mat: Materializer, 21 | exec: ExecutionContext) extends Filter { 22 | 23 | override def apply(nextFilter: RequestHeader => Future[Result]) 24 | (requestHeader: RequestHeader): Future[Result] = { 25 | // Run the next filter in the chain. This will call other filters 26 | // and eventually call the action. Take the result and modify it 27 | // by adding a new header. 28 | nextFilter(requestHeader).map { result => 29 | result.withHeaders("X-ExampleFilter" -> "foo") 30 | } 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /young-crawler-searcher/test/ApplicationSpec.scala: -------------------------------------------------------------------------------- 1 | import org.scalatestplus.play._ 2 | import play.api.test._ 3 | import play.api.test.Helpers._ 4 | 5 | /** 6 | * Add your spec here. 7 | * You can mock out a whole application including requests, plugins etc. 8 | * For more information, consult the wiki. 
 */
class ApplicationSpec extends PlaySpec with OneAppPerTest {

  // Routing: a path with no route must answer 404.
  "Routes" should {

    "send 404 on a bad request" in  {
      route(app, FakeRequest(GET, "/boum")).map(status(_)) mustBe Some(NOT_FOUND)
    }

  }

  // GET / must return an HTML page containing the welcome text.
  "HomeController" should {

    "render the index page" in {
      val home = route(app, FakeRequest(GET, "/")).get

      status(home) mustBe OK
      contentType(home) mustBe Some("text/html")
      contentAsString(home) must include ("Your new application is ready.")
    }

  }

  // Three consecutive GET /count requests within one test must yield 0, 1, 2.
  "CountController" should {

    "return an increasing count" in {
      contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "0"
      contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "1"
      contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "2"
    }

  }

}
--------------------------------------------------------------------------------
/young-crawler-searcher/README:
--------------------------------------------------------------------------------
This is your new Play application
=================================

This file will be packaged with your application when using `activator dist`.

There are several demonstration files available in this template.

Controllers
===========

- HomeController.scala:

  Shows how to handle simple HTTP requests.

- AsyncController.scala:

  Shows how to do asynchronous programming when handling a request.

- CountController.scala:

  Shows how to inject a component into a controller and use the component when
  handling requests.

Components
==========

- Module.scala:

  Shows how to use Guice to bind all the components needed by your application.

- Counter.scala:

  An example of a component that contains state, in this case a simple counter.

- ApplicationTimer.scala:

  An example of a component that starts when the application starts and stops
  when the application stops.

Filters
=======

- Filters.scala:

  Creates the list of HTTP filters used by your application.

- ExampleFilter.scala

  A simple filter that adds a header to every response.
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfigContants.scala:
--------------------------------------------------------------------------------
package com.young.crawler.config

/**
 * Created by young.yang on 2016/9/3.
 *
 * Keys of the entries in crawler.properties.
 */
private[crawler] object CrawlerConfigContants {
  val young_crawler_appName = "young.crawler.appName"
  val young_crawler_task_inject_name = "young.crawler.task.inject.name"
  val young_crawler_task_fetch_name = "young.crawler.task.fetch.name"
  val young_crawler_task_parse_name = "young.crawler.task.parse.name"
  val young_crawler_task_index_name = "young.crawler.task.index.name"
  // The "cralwer" spelling matches the key actually present in
  // crawler.properties; both sides must change together.
  val young_crawler_task_count_name = "young.cralwer.task.count.name"
  val young_crawler_task_seed_path = "young.crawler.task.seed.path"
  val young_crawler_task_parallel_int = "young.crawler.task.parallel.int"
  val young_crawler_fetcher_cache_imp = "young.crawler.fetcher.cache.imp"
  val young_crawler_fetcher_timeout = "young.crawler.fetcher.timeout"
  val young_crawler_fetcher_useragent = "young.crawler.fetcher.useragent"
  val young_crawler_indexer_es_host = "young.crawler.indexer.es.host"
  val young_crawler_indexer_es_port = "young.crawler.indexer.es.port"
  val young_crawler_indexer_es_name = "young.crawler.indexer.es.name"
  val young_crawler_indexer_es_type = "young.crawler.indexer.es.type"
  // Same "cralwer" spelling as the matching key in crawler.properties.
  val young_cralwer_fetcher_friendtime = "young.cralwer.fetcher.friendtime"
  val young_crawler_fetcher_deep = "young.crawler.fetcher.deep"

}
--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/AsyncController.scala:
--------------------------------------------------------------------------------
package controllers

import akka.actor.ActorSystem
import javax.inject._
import play.api._
import play.api.mvc._
import scala.concurrent.{ExecutionContext, Future, Promise}
import scala.concurrent.duration._

/**
 * This controller creates an `Action` that demonstrates how to write
 * simple asynchronous code in a controller. It uses a timer to
 * asynchronously delay sending a response for 1 second.
 *
 * @param actorSystem We need the `ActorSystem`'s `Scheduler` to
 * run code after a delay.
 * @param exec We need an `ExecutionContext` to execute our
 * asynchronous code.
 */
@Singleton
class AsyncController @Inject() (actorSystem: ActorSystem)(implicit exec: ExecutionContext) extends Controller {

  /**
   * Create an Action that returns a plain text message after a delay
   * of 1 second.
   *
   * The configuration in the `routes` file means that this method
   * will be called when the application receives a `GET` request with
   * a path of `/message`.
   */
  def message = Action.async {
    getFutureMessage(1.second).map { msg => Ok(msg) }
  }

  /** Returns a future that completes with "Hi!" once `delayTime` has elapsed. */
  private def getFutureMessage(delayTime: FiniteDuration): Future[String] = {
    val promise: Promise[String] = Promise[String]()
    actorSystem.scheduler.scheduleOnce(delayTime) { promise.success("Hi!") }
    promise.future
  }

}
--------------------------------------------------------------------------------
/young-crawler-core/src/main/resources/crawler.properties:
--------------------------------------------------------------------------------
#appName akka ActorSystem name
young.crawler.appName=young-crawler
#inject task actor name
young.crawler.task.inject.name=young-injector
#fetcher task actor name
young.crawler.task.fetch.name=young-fetcher
#parse task actor name
young.crawler.task.parse.name=young-parser
#index task actor name
young.crawler.task.index.name=young-indexr
#counter task actor name
young.cralwer.task.count.name=young-count
# seed config
young.crawler.task.seed.path=classpath:/seeds.txt
#parallelism
young.crawler.task.parallel.int=5
#timeout for fetching a url
young.crawler.fetcher.timeout=5000
#politeness delay between page visits
young.cralwer.fetcher.friendtime=1000
#crawl depth
young.crawler.fetcher.deep=1
#useragent sent when fetching pages
young.crawler.fetcher.useragent=Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2
#elasticsearch host used when indexing pages
young.crawler.indexer.es.host=115.29.47.216
#es port
young.crawler.indexer.es.port=9300
#es index name
young.crawler.indexer.es.name=page
#es index type
young.crawler.indexer.es.type=html
#implementation class used for url de-duplication
young.crawler.fetcher.cache.imp=com.young.crawler.cache.support.RedisCache
#url de-duplication cache time, in seconds
young.crawler.fetcher.cache.timeout=100
#redis config
young.crawler.fetcher.cache.redis.host=115.29.47.216
young.crawler.fetcher.cache.redis.port=6379
young.crawler.fetcher.cache.redis.password=


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/FetchActorTask.scala:
--------------------------------------------------------------------------------
package com.young.crawler.spider.task.support.actor

import akka.actor.{ActorRef, Actor}
import akka.event.Logging
import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
import com.young.crawler.entity.{FetchError, FetchOk, FetchCounter, UrlInfo}
import com.young.crawler.spider.fetcher.Fetcher
import com.young.crawler.spider.task.{FetchTask, ParserTask}

import scala.util.control.NonFatal

/**
 * Created by young.yang on 2016/8/28.
 * Page fetch task, implemented as an Actor.
 *
 * Handles two messages:
 *  - UrlInfo: fetch the page, forward a successful result to `parserTask`,
 *    and report FetchCounter plus FetchOk or FetchError to the counter actor;
 *  - List of child URLs: forward it to the sender of the most recent UrlInfo.
 */
private[crawler] class FetchActorTask(fetcher: Fetcher, parserTask: ActorRef) extends Actor with FetchTask {

  private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))

  private val log = Logging(context.system, this)

  // Sender of the most recent UrlInfo; null until the first fetch request.
  private var injector: ActorRef = null

  override def receive: Receive = {
    // Handle a fetch request.
    case page: UrlInfo =>
      injector = sender()
      // fetchPage is declared to throw FetchException. Letting it escape would
      // restart this actor (dropping `injector`) without recording the failure,
      // so a failed fetch is treated as "nothing fetched" instead.
      val httpResult =
        try fetcher.fetchPage(page)
        catch {
          case NonFatal(e) =>
            log.error(e, "FetcherTask failed to fetch url [" + page.url + "]")
            None
        }
      countActor ! FetchCounter(1)
      if (httpResult.isDefined) {
        parserTask ! httpResult.get
        log.info("FetcherTask send parserTask a httpResult [" + httpResult + "]")
        countActor ! FetchOk(1)
      } else {
        countActor ! FetchError(1)
      }
    // Forward parsed child URLs to the inject task for further crawling.
    case urls: List[_] =>
      if (injector != null) {
        injector ! urls
      } else {
        // No fetch request seen yet, so there is nobody to forward to; the
        // original dereferenced the null reference here.
        log.warning("FetcherTask has no injector yet, dropping [" + urls.size + "] child urls")
      }
  }
}
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/support/RedisCache.scala:
--------------------------------------------------------------------------------
package com.young.crawler.cache.support

import com.young.crawler.cache.Cache
import com.young.crawler.config.CrawlerConfig
import redis.clients.jedis.JedisPool
import redis.clients.jedis.{Jedis, JedisPoolConfig, Protocol}

/**
 * Created by dell on 2016/9/2.
 * Cache backed by Redis.
 *
 * Keys and values are stored through their `toString`; entries expire after
 * the configured number of seconds.
 */
private[crawler] class RedisCache[KEY, VALUE] extends Cache[KEY, VALUE] {

  private val JEDIS_HOST = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.host")

  private val JEDIS_PORT = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.port").toInt

  private val JEDIS_PASS = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.password")

  private val expire = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.timeout").toInt

  // The configured password was read but never used. It is now applied when
  // it is non-empty; with an empty password the pool is built exactly as before.
  private val jedisPool =
    if (JEDIS_PASS == null || JEDIS_PASS.trim.isEmpty) new JedisPool(JEDIS_HOST, JEDIS_PORT)
    else new JedisPool(new JedisPoolConfig, JEDIS_HOST, JEDIS_PORT, Protocol.DEFAULT_TIMEOUT, JEDIS_PASS)

  /**
   * Runs `body` with a pooled connection and always hands the connection
   * back, even when the command throws. The original skipped `close()` on
   * failure, which leaks pool slots.
   */
  private def withJedis[T](body: Jedis => T): T = {
    val jedis = jedisPool.getResource
    try body(jedis) finally jedis.close()
  }

  /** True when Redis holds an entry for `key.toString`. */
  override def contains(key: KEY): Boolean = withJedis(jedis => jedis.exists(key.toString))

  /** Returns the stored string for `key.toString` (cast to VALUE), or None when absent. */
  override def get(key: KEY): Option[VALUE] =
    withJedis(jedis => Option(jedis.get(key.toString).asInstanceOf[VALUE]))

  /** Stores `value.toString` under `key.toString` with the configured expiry. */
  override def put(key: KEY, value: VALUE): Unit = {
    withJedis(jedis => jedis.setex(key.toString, expire, value.toString))
  }

  /** Not tracked for Redis; always 0. */
  override def size(): Int = 0

  /** Not supported for Redis. */
  override def keys(): Set[KEY] = throw new Exception("unsupport operation")
}
--------------------------------------------------------------------------------
/young-crawler-searcher/conf/logback.xml:
--------------------------------------------------------------------------------






    ${application.home:-.}/logs/application.log

      %date [%level] from %logger in %thread - %message%n%xException





      %coloredLevel %logger{15} - %message%n%xException{10}



























--------------------------------------------------------------------------------
/young-crawler-searcher/app/services/ApplicationTimer.scala:
--------------------------------------------------------------------------------
package services

import java.time.{Clock, Instant}
import javax.inject._
import play.api.Logger
import play.api.inject.ApplicationLifecycle
import scala.concurrent.Future

/**
 * This class demonstrates how to run code when the
 * application starts and stops. It starts a timer when the
 * application starts. When the application stops it prints out how
 * long the application was running for.
 *
 * This class is registered for Guice dependency injection in the
 * [[Module]] class. We want the class to start when the application
 * starts, so it is registered as an "eager singleton". See the code
 * in the [[Module]] class to see how this happens.
 *
 * This class needs to run code when the server stops. It uses the
 * application's [[ApplicationLifecycle]] to register a stop hook.
 */
@Singleton
class ApplicationTimer @Inject() (clock: Clock, appLifecycle: ApplicationLifecycle) {

  // This code is called when the application starts.
  private val start: Instant = clock.instant
  Logger.info(s"ApplicationTimer demo: Starting application at $start.")

  // When the application starts, register a stop hook with the
  // ApplicationLifecycle object. The code inside the stop hook will
  // be run when the application stops.
  appLifecycle.addStopHook { () =>
    val stop: Instant = clock.instant
    val runningTime: Long = stop.getEpochSecond - start.getEpochSecond
    // Log the same instant the running time was computed from; the original
    // read the clock a second time here.
    Logger.info(s"ApplicationTimer demo: Stopping application at $stop after ${runningTime}s.")
    Future.successful(())
  }
}
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/HttpEntitys.scala:
--------------------------------------------------------------------------------
package com.young.crawler.entity

import java.net.{DatagramSocket, DatagramPacket}

import scala.beans.BeanProperty

/**
 * Created by young.yang on 2016/8/28.
 * Raw HTTP page returned by a fetch.
 */
case class HttpResult(status:Int,content:String,message:String,url:String,deep:Int){
  override def toString()="status="+status+",context length="+content.length+",url="+url
}


sealed trait UrlType
case object SeedType extends UrlType
case object GenerateType extends UrlType
/**
 * A URL to crawl.
 * @param url url
 * @param parent parent url
 */
case class UrlInfo(url:String,parent:String,urlType: UrlType,deep:Int){
  override def toString()=url+"\n"
}

/**
 * Index result.
 * @param status
 */
case class IndexResult(status:Int)

/**
 * A seed.
 * @param url seed url
 */
case class Seed(url:String){
  override def toString() = url+"\n"
}

/**
 * Information parsed out of an HTTP page.
 */
class HttpPage{
  @BeanProperty
  var url: String = ""
  @BeanProperty
  var title: String = ""
  @BeanProperty
  var html:String = ""
  @BeanProperty
  var content: String = ""
  @BeanProperty
  var publishTime: Long = 0
  @BeanProperty
  var updateTime: Long = 0
  @BeanProperty
  var author: String = ""
  @BeanProperty
  var keywords:String = ""
  @BeanProperty
  var desc:String = ""
  @BeanProperty
  var childLink:(List[UrlInfo],Int) = (List(),0)
  @BeanProperty
  var meta:Map[String,String] = Map()

  override def toString()="url="+url+",context length="+content.length

}

--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/ParseActorTask.scala:
--------------------------------------------------------------------------------
package com.young.crawler.spider.task.support.actor

import akka.actor.{ActorRef, Actor}
import akka.event.Logging
import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
import com.young.crawler.entity.{ParseChildUrlCounter, ParseCounter, HttpPage, HttpResult}
import com.young.crawler.spider.parser.Parser
import com.young.crawler.spider.task.ParserTask

/**
 * Created by young.yang on 2016/8/28.
 * Parse task.
 */
private[crawler] class ParseActorTask(parser: Parser, indexTask: ActorRef) extends Actor with ParserTask {

  private val log = Logging(context.system, this)

  private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))

  private val fetchDeep = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_deep).toInt

  private var fetcher: ActorRef = null

  override def receive: Receive = {
    case httpResult: HttpResult =>
      fetcher = sender()
      val page: HttpPage = parser.parse(httpResult)
      indexTask ! page
      countActor !
ParseCounter(1)
      log.info("ParserTask send IndexerTask a index request -[" + page + "]")
      val childLinks = page.getChildLink
      if(childLinks._2 throw new FetchException("fetch error message error url is " + url, e)
    }
  }
}
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/JsoupParser.scala:
--------------------------------------------------------------------------------
package com.young.crawler.spider.parser.support

import com.young.crawler.entity.{GenerateType, HttpPage, HttpResult, UrlInfo}
import com.young.crawler.spider.parser.Parser
import org.jsoup.Jsoup
import org.jsoup.select.Elements

import scala.collection.mutable.ListBuffer

/**
 * Created by young.yang on 2016/8/31.
 * Parser implemented with Jsoup.
 */
private[crawler] class JsoupParser extends Parser {

  private val KEYWORDS = "keywords"

  private val DESCRIPTION = "description"

  /**
   * Looks up a meta tag by name.
   *
   * @param key  lower-case meta name to look for
   * @param meta the page's meta elements
   * @return the `content` attribute of the first meta element whose `name`
   *         (lower-cased) equals `key`, or "" when there is none
   */
  private def getMeta(key: String, meta: Elements): String = {
    (0 until meta.size()).iterator
      .map(i => meta.get(i))
      .collectFirst { case element if key.equals(element.attr("name").toLowerCase) => element.attr("content") }
      .getOrElse("")
  }

  /**
   * Extracts child URLs from anchor elements.
   *
   * Links are resolved against the document's base URI, so relative links
   * (e.g. "/list/1.html") are followed as well; the original used the raw
   * `href` and therefore dropped every relative link. Only links that end up
   * starting with "http" are kept.
   *
   * @param urls anchor elements of the page
   * @param deep depth of the page the anchors were found on
   * @return the child URLs (each at depth `deep + 1`) and that child depth
   */
  private def parserUrls(urls: Elements, deep: Int): (List[UrlInfo], Int) = {
    val childDeep = deep + 1
    val links = (0 until urls.size()).iterator
      .map(i => urls.get(i).absUrl("href"))
      .filter(_.startsWith("http"))
      .map(url => UrlInfo(url, "", GenerateType, childDeep))
      .toList
    (links, childDeep)
  }

  /**
   * Parses a fetched page.
   *
   * @param html the fetched page
   * @return the parsed page: title, text content, keywords, description and
   *         child links; publish and update time are set to the current time
   */
  override def parse(html: HttpResult): HttpPage = {
    val htmlPage = new HttpPage
    // The page URL is passed as base URI so that absUrl can resolve relative links.
    val baseUri = if (html.url == null) "" else html.url
    val document = Jsoup.parse(html.content, baseUri)
    val meta = document.select("meta")
    htmlPage.setTitle(document.title())
    htmlPage.setContent(document.text())
    // htmlPage.setHtml(html.content)
    htmlPage.setPublishTime(System.currentTimeMillis())
    htmlPage.setUpdateTime(System.currentTimeMillis())
    htmlPage.setUrl(html.url)
    htmlPage.setKeywords(getMeta(KEYWORDS, meta))
    htmlPage.setDesc(getMeta(DESCRIPTION, meta))
    htmlPage.setChildLink(parserUrls(document.body().select("a"), html.deep))
    htmlPage
  }
}
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/InjectActorTask.scala:
--------------------------------------------------------------------------------
package com.young.crawler.spider.task.support.actor

import akka.actor.{ActorRef, Actor, Props}
import akka.event.Logging
import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
import com.young.crawler.entity._
import com.young.crawler.spider.fetcher.support.HttpClientFetcher
import com.young.crawler.spider.indexer.support.ElasticIndexer
import com.young.crawler.spider.parser.support.{JsoupParser, HtmlParseParser}
import com.young.crawler.spider.task.InjectTask

import scala.io.Source

/**
 * Created by dell on 2016/8/29.
16 | * 抓取种子注入任务,将需要抓取的任务注入到该任务中 17 | */ 18 | private[crawler] class InjectActorTask(fetcher: ActorRef) extends Actor with InjectTask { 19 | private val log = Logging(context.system, this) 20 | 21 | private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name)) 22 | 23 | 24 | override def receive: Receive = { 25 | //初始化注入 26 | case init: InitSeed => 27 | val seeds = initSeeds(init.seedPath, init.fileEncode) 28 | log.info("init seeds -" + seeds) 29 | seeds.map(seed => { 30 | fetcher ! UrlInfo(seed.url, null,SeedType,0) 31 | countActor ! InjectCounter(1) 32 | }) 33 | //子url注入 34 | case urls: List[UrlInfo] => 35 | log.info("inject urls -" + urls) 36 | urls.filter(seed => seed.url.startsWith("http")).map(seed => { 37 | fetcher ! seed 38 | countActor ! InjectCounter(1) 39 | } 40 | ) 41 | } 42 | 43 | override def initSeeds(seedPath: String, fileEncode: String = "utf-8"): List[Seed] = { 44 | log.info("seedpath = ["+seedPath+"] encoding = ["+fileEncode+"]") 45 | if (seedPath == null || seedPath.trim.equals("") || seedPath.startsWith("classpath:")) { 46 | val temp = seedPath.split(":") 47 | log.info("classpath seedpath = ["+temp(1)+"]") 48 | Source.fromInputStream(classOf[InjectTask].getResourceAsStream(temp(1))).getLines().map(line => Seed(line)).toList 49 | } else 50 | Source.fromFile(seedPath, fileEncode).getLines().map(line => Seed(line)).toList 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/CounterActorTask.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.task.support.actor 2 | 3 | import akka.actor.Actor 4 | import com.young.crawler.entity._ 5 | import 
com.young.crawler.spider.task.CounterTask

/**
 * Created by young.yang on 2016/9/3.
 * Actor that keeps running totals of the crawler's task counters.
 */
private[crawler] class CounterActorTask extends Actor with CounterTask {

  // Running totals, one per counter message type; every total starts at zero.
  private var fetchCounter = FetchCounter(0)
  private var fetchOk = FetchOk(0)
  private var fetchError = FetchError(0)
  private var injectCounter = InjectCounter(0)
  private var parseCounter = ParseCounter(0)
  private var parseChildUrlCounter = ParseChildUrlCounter(0)
  private var indexCounter = IndexCounter(0)

  /** Renders all totals as a multi-line report, one counter per line, framed by start/end markers. */
  private def printCounter(): String = {
    val report = Seq(
      "task counter details start ------",
      "fetchCounter = [" + fetchCounter.num + "]",
      "fetchOk = [" + fetchOk.num + "]",
      "fetchError = [" + fetchError.num + "]",
      "injectCounter = [" + injectCounter.num + "]",
      "parseCounter = [" + parseCounter.num + "]",
      "parseChildUrlCounter = [" + parseChildUrlCounter.num + "]",
      "indexCounter = [" + indexCounter.num + "]",
      "task counter details end -------"
    )
    report.mkString("\n")
  }

  /** Bundles the current totals into a single AllCounter value. */
  private def getAllCounter(): AllCounter =
    AllCounter(
      fetchCounter,
      fetchOk,
      fetchError,
      injectCounter,
      parseCounter,
      parseChildUrlCounter,
      indexCounter
    )

  /**
   * Counter messages add their `num` to the matching total; `PrintCounter` and `GetAllCounter`
   * reply to the sender with the text report and the AllCounter snapshot respectively.
   */
  override def receive: Receive = {
    case delta: FetchCounter =>
      fetchCounter = FetchCounter(fetchCounter.num + delta.num)
    case delta: FetchOk =>
      fetchOk = FetchOk(delta.num + fetchOk.num)
    case delta: FetchError =>
      fetchError = FetchError(delta.num + fetchError.num)
    case delta: InjectCounter =>
      injectCounter = InjectCounter(delta.num + injectCounter.num)
    case delta: ParseCounter =>
      parseCounter = ParseCounter(delta.num + parseCounter.num)
    case delta: ParseChildUrlCounter =>
      parseChildUrlCounter = ParseChildUrlCounter(delta.num + parseChildUrlCounter.num)
    case delta: IndexCounter =>
      indexCounter = IndexCounter(delta.num + indexCounter.num)
    case PrintCounter =>
      sender() ! printCounter()
    case GetAllCounter =>
      sender() ! getAllCounter()
  }
}
50 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/support/HttpWatch.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.spider.fetcher.support 2 | 3 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig} 4 | import com.young.crawler.entity.{UrlInfo, HttpResult} 5 | import org.apache.commons.io.IOUtils 6 | import org.apache.http.annotation.NotThreadSafe 7 | import org.apache.http.client.config.RequestConfig 8 | import org.apache.http.client.methods.{HttpHead, HttpGet, HttpUriRequest} 9 | import org.apache.http.impl.client.HttpClients 10 | import org.apache.http.Header 11 | 12 | /** 13 | * Created by young.yang on 2016/8/28.
14 | */
// Thin wrapper around an Apache HttpClient instance.
//  - userAgent: User-Agent header sent with every request
//  - timeout:   socket and connect timeout, passed straight to RequestConfig
//  - poolSize:  maximum number of connections, in total and per route
class HttpWatch(userAgent: String = "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2", timeout: Int = 10000, poolSize: Int = 100) {

  private val defaultRequestConfig = RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build()

  private val httpClient = HttpClients.custom().setUserAgent(userAgent).setMaxConnTotal(poolSize)
    .setMaxConnPerRoute(poolSize).setDefaultRequestConfig(defaultRequestConfig).build()

  /** Issues a GET for `url.url` and wraps status, body, reason phrase, url and depth in an HttpResult. */
  private def doGet(url: UrlInfo, encode: String = "utf-8"): HttpResult = {
    val (statusCode, content, message) = sendRequest(new HttpGet(url.url), encode)
    HttpResult(statusCode, content, message, url.url, url.deep)
  }

  /** Issues a HEAD request and returns all response headers. */
  private def doHeader(url: String): Array[Header] = {
    val response = httpClient.execute(new HttpHead(url))
    // the response must be closed so its connection is released
    try response.getAllHeaders
    finally response.close()
  }

  /**
   * Executes the request and returns (status code, body decoded with `encode`, reason phrase).
   * A response without an entity yields an empty body. The response is always closed.
   */
  private def sendRequest(request: HttpUriRequest, encode: String): (Int, String, String) = {
    val response = httpClient.execute(request)
    try {
      val statusLine = response.getStatusLine
      val entity = response.getEntity
      val content =
        if (entity == null) ""
        else {
          val body = entity.getContent
          try IOUtils.toString(body, encode)
          finally body.close()
        }
      (statusLine.getStatusCode, content, statusLine.getReasonPhrase)
    } finally response.close()
  }

  /** Shuts down the underlying client. */
  private def close(): Unit = httpClient.close()
}

@NotThreadSafe
object HttpWatch {
  val WATCH_TYPE_PROTOTYPE = "prototype"
  val WATCH_TYPE_SINGLETON = "singleton"
  // Selects the instance used per call: a fresh one ("prototype") or the shared one (any other value).
  var WATCH_TYPE = WATCH_TYPE_PROTOTYPE
  // Shared instance, created on first use so prototype-only usage never builds an idle client.
  private lazy val httpWatch = newHttpWatch()

  /** Fetches `url` with a GET request; `encode` is the charset used to decode the body. */
  def get(url: UrlInfo, encode: String = "utf-8"): HttpResult = withHttpWatch(_.doGet(url, encode))

  /** Returns the response headers of a HEAD request to `url`. */
  def header(url: String): Array[Header] = withHttpWatch(_.doHeader(url))

  /** Builds an HttpWatch from the configured user agent and timeout. */
  private def newHttpWatch(): HttpWatch =
    new HttpWatch(
      CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_useragent),
      CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_timeout).toInt)

  /**
   * Runs `action` against the instance chosen by WATCH_TYPE. A prototype instance is closed
   * afterwards so its client is not leaked; the shared instance is left open.
   */
  private def withHttpWatch[T](action: HttpWatch => T): T =
    if (WATCH_TYPE_PROTOTYPE.equals(WATCH_TYPE)) {
      val watch = newHttpWatch()
      try action(watch)
      finally watch.close()
    } else
      action(httpWatch)
}
60 | -------------------------------------------------------------------------------- /young-crawler-core/src/main/scala/com/young/crawler/boot/CrawlerBoot.scala: -------------------------------------------------------------------------------- 1 | package com.young.crawler.boot 2 | 3 | import akka.actor.{ActorSystem, Props} 4 | import akka.pattern.ask 5 | import akka.routing.RoundRobinPool 6 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants} 7 | import com.young.crawler.entity.{AllCounter, GetAllCounter, InitSeed, PrintCounter} 8 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher 9 | import com.young.crawler.spider.indexer.support.ElasticIndexer 10 | import com.young.crawler.spider.parser.support.JsoupParser 11 | import com.young.crawler.spider.task.support.actor._ 12 | import org.apache.commons.logging.LogFactory 13 | 14 | import scala.concurrent.Await 15 | import scala.concurrent.duration.Duration 16 | 17 | /** 18 | * Created by dell on 2016/8/29.
19 | * Crawler entry point. 20 | */
object CrawlerBoot {

  private val system = ActorSystem(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName))

  private val log = LogFactory.getLog(CrawlerConfigContants.young_crawler_appName)

  // Used both as the ask timeout and as the maximum wait for the reply.
  private val timeout = Duration(5, "s")

  /** Looks up a single string value in the crawler configuration. */
  private def setting(key: String): String = CrawlerConfig.getConfig.getString(key)

  /** Selects the counter actor under /user, using the configured system and counter actor names. */
  private def counterSelection() =
    system.actorSelection("akka://" + setting(CrawlerConfigContants.young_crawler_appName) + "/user/" + setting(CrawlerConfigContants.young_crawler_task_count_name))

  /**
   * Starts the crawler: creates the indexer, parser, fetcher and injector pools plus the counter
   * actor, then sends the initial seed message to the injector.
   */
  def start(): Unit = {
    val seedMessage = InitSeed(setting(CrawlerConfigContants.young_crawler_task_seed_path))
    // every role is served by a round-robin pool of this many actors
    val poolSize = setting(CrawlerConfigContants.young_crawler_task_parallel_int).toInt

    // creates a round-robin pool for `worker`, named by the configuration value stored under `nameKey`
    def spawnPool(worker: Props, nameKey: String) =
      system.actorOf(RoundRobinPool(poolSize).props(worker), setting(nameKey))

    val indexerActor = spawnPool(Props(new IndexActorTask(new ElasticIndexer)), CrawlerConfigContants.young_crawler_task_index_name)
    log.info("create indexerActor name -[" + indexerActor + "]")

    val parserActor = spawnPool(Props(new ParseActorTask(new JsoupParser, indexerActor)), CrawlerConfigContants.young_crawler_task_parse_name)
    log.info("create parserActor name -[" + parserActor + "]")

    val fetcher = spawnPool(Props(new FetchActorTask(new HttpClientFetcher, parserActor)), CrawlerConfigContants.young_crawler_task_fetch_name)
    log.info("create fetcherActor name -[" + fetcher + "]")

    val injectActor = spawnPool(Props(new InjectActorTask(fetcher)), CrawlerConfigContants.young_crawler_task_inject_name)
    log.info("create injectActor name -[" + injectActor + "]")

    // the counter is a single actor, not a pool
    val countActor = system.actorOf(Props[CounterActorTask], setting(CrawlerConfigContants.young_crawler_task_count_name))
    log.info("create countActor name -[" + countActor + "]")

    injectActor ! seedMessage
  }

  /**
   * Stops the crawler by terminating the actor system.
   */
  def stop(): Unit = {
    system.terminate()
  }

  /** Asks the counter actor for its text report and blocks until the reply arrives or the timeout expires. */
  def printCount(): String = {
    val reply = ask(counterSelection(), PrintCounter)(timeout)
    Await.result(reply, timeout).asInstanceOf[String]
  }

  /** Asks the counter actor for all counters and blocks until the reply arrives or the timeout expires. */
  def getCounter(): AllCounter = {
    val reply = ask(counterSelection(), GetAllCounter)(timeout)
    Await.result(reply, timeout).asInstanceOf[AllCounter]
  }

  def main(args: Array[String]): Unit = {
    CrawlerBoot.start()
    // Thread.sleep(3000)
    // println(CrawlerBoot.printCount())
    // println(CrawlerBoot.getCounter())
    // CrawlerBoot.stop()
  }
}
76 | -------------------------------------------------------------------------------- /young-crawler-searcher/bin/activator.bat: -------------------------------------------------------------------------------- 1 | @REM activator launcher script 2 | @REM 3 | @REM Environment: 4 | @REM In order for Activator to work you must have Java available on the classpath 5 | @REM JAVA_HOME - location of a JDK home dir (optional if java on path) 6 | @REM CFG_OPTS - JVM options (optional) 7 | @REM Configuration: 8 | @REM activatorconfig.txt found in the ACTIVATOR_HOME or ACTIVATOR_HOME/ACTIVATOR_VERSION 9 | @setlocal enabledelayedexpansion 10 | 11 | @echo off 12 | 13 | set "var1=%~1" 14 | if defined var1 ( 15 | if "%var1%"=="help" ( 16 | echo. 17 | echo Usage activator [options] [command] 18 | echo.
echo Commands:
echo ui Start the Activator UI
echo new [name] [template-id] Create a new project with [name] using template [template-id]
echo list-templates Print all available template names
echo help Print this message
echo.
echo Options:
echo -jvm-debug [port] Turn on JVM debugging, open at the given port. Defaults to 9999 if no port given.
echo.
echo Environment variables ^(read from context^):
echo JAVA_OPTS Environment variable, if unset uses ""
echo SBT_OPTS Environment variable, if unset uses ""
echo ACTIVATOR_OPTS Environment variable, if unset uses ""
echo.
echo Please note that in order for Activator to work you must have Java available on the classpath
echo.
goto :end
)
)

@REM determine ACTIVATOR_HOME environment variable
set BIN_DIRECTORY=%~dp0
set BIN_DIRECTORY=%BIN_DIRECTORY:~0,-1%
for %%d in (%BIN_DIRECTORY%) do set ACTIVATOR_HOME=%%~dpd
set ACTIVATOR_HOME=%ACTIVATOR_HOME:~0,-1%

echo ACTIVATOR_HOME=%ACTIVATOR_HOME%

set ERROR_CODE=0
set APP_VERSION=1.3.10
set ACTIVATOR_LAUNCH_JAR=activator-launch-%APP_VERSION%.jar

rem Detect if we were double clicked, although theoretically A user could
rem manually run cmd /c
for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1

rem The closing percent sign is required; without it SBT_HOME becomes the literal text BIN_DIRECTORY
set SBT_HOME=%BIN_DIRECTORY%

rem FIRST we load the config file of extra options.
rem The file is optional, so only read it when it exists.
set FN=%SBT_HOME%\..\conf\sbtconfig.txt
set CFG_OPTS=
if exist "%FN%" (
  FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO (
    set DO_NOT_REUSE_ME=%%i
    rem ZOMG (Part #2) WE use !! here to delay the expansion of
    rem CFG_OPTS, otherwise it remains "" for this loop.
    set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
  )
)

rem FIRST we load a config file of extra options (if there is one)
rem The paths are quoted so that a user profile containing spaces still works.
set "CFG_FILE_HOME=%UserProfile%\.activator\activatorconfig.txt"
set "CFG_FILE_VERSION=%UserProfile%\.activator\%APP_VERSION%\activatorconfig.txt"
if exist "%CFG_FILE_VERSION%" (
  FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_VERSION%") DO (
    set DO_NOT_REUSE_ME=%%i
    rem ZOMG (Part #2) WE use !! here to delay the expansion of
    rem CFG_OPTS, otherwise it remains "" for this loop.
    set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
  )
)
if "%CFG_OPTS%"=="" (
  if exist "%CFG_FILE_HOME%" (
    FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_HOME%") DO (
      set DO_NOT_REUSE_ME=%%i
      rem ZOMG (Part #2) WE use !! here to delay the expansion of
      rem CFG_OPTS, otherwise it remains "" for this loop.
      set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
    )
  )
)

rem We use the value of the JAVACMD environment variable if defined
set _JAVACMD=%JAVACMD%

if "%_JAVACMD%"=="" (
  if not "%JAVA_HOME%"=="" (
    if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe"

    rem if there is a java home set we make sure it is the first picked up when invoking 'java'
    SET "PATH=%JAVA_HOME%\bin;%PATH%"
  )
)

if "%_JAVACMD%"=="" set _JAVACMD=java

rem Detect if this java is ok to use.
for /F %%j in ('"%_JAVACMD%" -version 2^>^&1') do (
  if %%~j==java set JAVAINSTALLED=1
  if %%~j==openjdk set JAVAINSTALLED=1
)

rem Detect the same thing about javac
if "%_JAVACCMD%"=="" (
  if not "%JAVA_HOME%"=="" (
    if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe"
  )
)
if "%_JAVACCMD%"=="" set _JAVACCMD=javac
for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do (
  if %%~j==javac set JAVACINSTALLED=1
)

rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style
set JAVAOK=true
if not defined JAVAINSTALLED set JAVAOK=false
if not defined JAVACINSTALLED set JAVAOK=false

if "%JAVAOK%"=="false" (
  echo.
  echo A Java JDK is not installed or can't be found.
  if not "%JAVA_HOME%"=="" (
    echo JAVA_HOME = "%JAVA_HOME%"
  )
  echo.
  echo Please go to
  echo http://www.oracle.com/technetwork/java/javase/downloads/index.html
  echo and download a valid Java JDK and install before running Activator.
  echo.
  echo If you think this message is in error, please check
  echo your environment variables to see if "java.exe" and "javac.exe" are
  echo available via JAVA_HOME or PATH.
  echo.
  if defined DOUBLECLICKED pause
  exit /B 1
)

rem Check what Java version is being used to determine what memory options to use
for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do (
  set JAVA_VERSION=%%g
)

rem Strips away the " characters
set JAVA_VERSION=%JAVA_VERSION:"=%

rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below
rem PermSize options are chosen only for 1.x versions below 1.8. Checking the minor number alone
rem would also select them for versions such as 11.0.2 or 17, whose major number is not 1.
for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do (
  set MAJOR=%%v
  set MINOR=%%w
  set BUILD=%%x

  set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M
  if "!MAJOR!"=="1" if "!MINOR!" LSS "8" (
    set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M
  )

  set MEM_OPTS=!META_SIZE!
)

rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config.
set _JAVA_OPTS=%JAVA_OPTS%
if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS%

set DEBUG_OPTS=

rem Loop through the arguments, building remaining args in args variable
set args=
:argsloop
if not "%~1"=="" (
  rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them.
  rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack.
  set arg1=%~1
  if "!arg1:~0,2!"=="-D" (
    set "args=%args% "%~1"="%~2""
    shift
    shift
    goto argsloop
  )

  if "%~1"=="-jvm-debug" (
    if not "%~2"=="" (
      rem This piece of magic somehow checks that an argument is a number
      for /F "delims=0123456789" %%i in ("%~2") do (
        set var="%%i"
      )
      if defined var (
        rem Not a number, assume no argument given and default to 9999
        set JPDA_PORT=9999
      ) else (
        rem Port was given, shift arguments
        set JPDA_PORT=%~2
        shift
      )
    ) else (
      set JPDA_PORT=9999
    )
    shift

    set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT!
    goto argsloop
  )
  rem else
  set "args=%args% "%~1""
  shift
  goto argsloop
)

:run

if "!args!"=="" (
  if defined DOUBLECLICKED (
    set CMDS="ui"
  ) else set CMDS=!args!
) else set CMDS=!args!

rem We add a / in front, so we get file:///C: instead of file://C:
rem Java considers the later a UNC path.
rem We also attempt a solid effort at making it URI friendly.
rem We don't even bother with UNC paths.
set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/!
set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20!

rem Checks if the command contains spaces to know if it should be wrapped in quotes or not
set NON_SPACED_CMD=%_JAVACMD: =%
if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%
if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%

if ERRORLEVEL 1 goto error
goto end

:error
set ERROR_CODE=1

:end

rem endlocal discards ERROR_CODE, so it is expanded on the same line, before endlocal runs
@endlocal & exit /B %ERROR_CODE%
249 | -------------------------------------------------------------------------------- /young-crawler-searcher/conf/application.conf: -------------------------------------------------------------------------------- 1 | # This is the main configuration file for the application. 2 | # https://www.playframework.com/documentation/latest/ConfigFile 3 | # ~~~~~ 4 | # Play uses HOCON as its configuration file format. HOCON has a number 5 | # of advantages over other config formats, but there are two things that 6 | # can be used when modifying settings.
7 | # 8 | # You can include other configuration files in this main application.conf file: 9 | #include "extra-config.conf" 10 | # 11 | # You can declare variables and substitute for them: 12 | #mykey = ${some.value} 13 | # 14 | # And if an environment variable exists when there is no other substitution, then 15 | # HOCON will fall back to substituting the environment variable: 16 | #mykey = ${JAVA_HOME} 17 | 18 | ## Akka 19 | # https://www.playframework.com/documentation/latest/ScalaAkka#Configuration 20 | # https://www.playframework.com/documentation/latest/JavaAkka#Configuration 21 | # ~~~~~ 22 | # Play uses Akka internally and exposes Akka Streams and actors in Websockets and 23 | # other streaming HTTP responses. 24 | akka { 25 | # "akka.log-config-on-start" is extraordinarily useful because it logs the complete 26 | # configuration at INFO level, including defaults and overrides, so it's worth 27 | # putting at the very top. 28 | # 29 | # Put the following in your conf/logback.xml file: 30 | # 31 | # <logger name="akka.actor" level="INFO" /> 32 | # 33 | # And then uncomment this line to debug the configuration. 34 | # 35 | #log-config-on-start = true 36 | } 37 | 38 | ## Secret key 39 | # http://www.playframework.com/documentation/latest/ApplicationSecret 40 | # ~~~~~ 41 | # The secret key is used to sign Play's session cookie. 42 | # This must be changed for production, but we don't recommend you change it in this file. 43 | play.crypto.secret = "changeme" 44 | 45 | ## Modules 46 | # https://www.playframework.com/documentation/latest/Modules 47 | # ~~~~~ 48 | # Control which modules are loaded when Play starts. Note that modules are 49 | # the replacement for "GlobalSettings", which are deprecated in 2.5.x. 50 | # Please see https://www.playframework.com/documentation/latest/GlobalSettings 51 | # for more information.
52 | # 53 | # You can also extend Play functionality by using one of the publicly available 54 | # Play modules: https://playframework.com/documentation/latest/ModuleDirectory 55 | play.modules { 56 | # By default, Play will load any class called Module that is defined 57 | # in the root package (the "app" directory), or you can define them 58 | # explicitly below. 59 | # If there are any additional modules that you want to enable, you can list them here. 60 | #enabled += my.application.Module 61 | 62 | # If there are any built-in modules that you want to disable, you can list them here. 63 | #disabled += "" 64 | } 65 | 66 | ## IDE 67 | # https://www.playframework.com/documentation/latest/IDE 68 | # ~~~~~ 69 | # Depending on your IDE, you can add a hyperlink for errors that will jump you 70 | # directly to the code location in the IDE in dev mode. The following line makes 71 | # use of the IntelliJ IDEA REST interface: 72 | #play.editor="http://localhost:63342/api/file/?file=%s&line=%s" 73 | 74 | ## Internationalisation 75 | # https://www.playframework.com/documentation/latest/JavaI18N 76 | # https://www.playframework.com/documentation/latest/ScalaI18N 77 | # ~~~~~ 78 | # Play comes with its own i18n settings, which allow the user's preferred language 79 | # to map through to internal messages, or allow the language to be stored in a cookie. 80 | play.i18n { 81 | # The application languages 82 | langs = [ "en" ] 83 | 84 | # Whether the language cookie should be secure or not 85 | #langCookieSecure = true 86 | 87 | # Whether the HTTP only attribute of the cookie should be set to true 88 | #langCookieHttpOnly = true 89 | } 90 | 91 | ## Play HTTP settings 92 | # ~~~~~ 93 | play.http { 94 | ## Router 95 | # https://www.playframework.com/documentation/latest/JavaRouting 96 | # https://www.playframework.com/documentation/latest/ScalaRouting 97 | # ~~~~~ 98 | # Define the Router object to use for this application.
99 | # This router will be looked up first when the application is starting up, 100 | # so make sure this is the entry point. 101 | # Furthermore, it's assumed your route file is named properly. 102 | # So for an application router like `my.application.Router`, 103 | # you may need to define a router file `conf/my.application.routes`. 104 | # Default to Routes in the root package (aka "apps" folder) (and conf/routes) 105 | #router = my.application.Router 106 | 107 | ## Action Creator 108 | # https://www.playframework.com/documentation/latest/JavaActionCreator 109 | # ~~~~~ 110 | #actionCreator = null 111 | 112 | ## ErrorHandler 113 | # https://www.playframework.com/documentation/latest/JavaRouting 114 | # https://www.playframework.com/documentation/latest/ScalaRouting 115 | # ~~~~~ 116 | # If null, will attempt to load a class called ErrorHandler in the root package, 117 | #errorHandler = null 118 | 119 | ## Filters 120 | # https://www.playframework.com/documentation/latest/ScalaHttpFilters 121 | # https://www.playframework.com/documentation/latest/JavaHttpFilters 122 | # ~~~~~ 123 | # Filters run code on every request. They can be used to perform 124 | # common logic for all your actions, e.g. adding common headers. 125 | # Defaults to "Filters" in the root package (aka "apps" folder) 126 | # Alternatively you can explicitly register a class here. 127 | #filters = my.application.Filters 128 | 129 | ## Session & Flash 130 | # https://www.playframework.com/documentation/latest/JavaSessionFlash 131 | # https://www.playframework.com/documentation/latest/ScalaSessionFlash 132 | # ~~~~~ 133 | session { 134 | # Sets the cookie to be sent only over HTTPS. 135 | #secure = true 136 | 137 | # Sets the cookie to be accessed only by the server. 138 | #httpOnly = true 139 | 140 | # Sets the max-age field of the cookie to 5 minutes. 141 | # NOTE: this only sets when the browser will discard the cookie. 
Play will consider any 142 | # cookie value with a valid signature to be a valid session forever. To implement a server side session timeout, 143 | # you need to put a timestamp in the session and check it at regular intervals to possibly expire it. 144 | #maxAge = 300 145 | 146 | # Sets the domain on the session cookie. 147 | #domain = "example.com" 148 | } 149 | 150 | flash { 151 | # Sets the cookie to be sent only over HTTPS. 152 | #secure = true 153 | 154 | # Sets the cookie to be accessed only by the server. 155 | #httpOnly = true 156 | } 157 | } 158 | 159 | ## Netty Provider 160 | # https://www.playframework.com/documentation/latest/SettingsNetty 161 | # ~~~~~ 162 | play.server.netty { 163 | # Whether the Netty wire should be logged 164 | #log.wire = true 165 | 166 | # If you run Play on Linux, you can use Netty's native socket transport 167 | # for higher performance with less garbage. 168 | #transport = "native" 169 | } 170 | 171 | ## WS (HTTP Client) 172 | # https://www.playframework.com/documentation/latest/ScalaWS#Configuring-WS 173 | # ~~~~~ 174 | # The HTTP client primarily used for REST APIs. The default client can be 175 | # configured directly, but you can also create different client instances 176 | # with customized settings. You must enable this by adding to build.sbt: 177 | # 178 | # libraryDependencies += ws // or javaWs if using java 179 | # 180 | play.ws { 181 | # Sets HTTP requests not to follow 302 requests 182 | #followRedirects = false 183 | 184 | # Sets the maximum number of open HTTP connections for the client. 185 | #ahc.maxConnectionsTotal = 50 186 | 187 | ## WS SSL 188 | # https://www.playframework.com/documentation/latest/WsSSL 189 | # ~~~~~ 190 | ssl { 191 | # Configuring HTTPS with Play WS does not require programming. You can 192 | # set up both trustManager and keyManager for mutual authentication, and 193 | # turn on JSSE debugging in development with a reload. 
194 | #debug.handshake = true 195 | #trustManager = { 196 | # stores = [ 197 | # { type = "JKS", path = "exampletrust.jks" } 198 | # ] 199 | #} 200 | } 201 | } 202 | 203 | ## Cache 204 | # https://www.playframework.com/documentation/latest/JavaCache 205 | # https://www.playframework.com/documentation/latest/ScalaCache 206 | # ~~~~~ 207 | # Play comes with an integrated cache API that can reduce the operational 208 | # overhead of repeated requests. You must enable this by adding to build.sbt: 209 | # 210 | # libraryDependencies += cache 211 | # 212 | play.cache { 213 | # If you want to bind several caches, you can bind them individually 214 | #bindCaches = ["db-cache", "user-cache", "session-cache"] 215 | } 216 | 217 | ## Filters 218 | # https://www.playframework.com/documentation/latest/Filters 219 | # ~~~~~ 220 | # There are a number of built-in filters that can be enabled and configured 221 | # to give Play greater security. You must enable this by adding to build.sbt: 222 | # 223 | # libraryDependencies += filters 224 | # 225 | play.filters { 226 | ## CORS filter configuration 227 | # https://www.playframework.com/documentation/latest/CorsFilter 228 | # ~~~~~ 229 | # CORS is a protocol that allows web applications to make requests from the browser 230 | # across different domains. 231 | # NOTE: You MUST apply the CORS configuration before the CSRF filter, as CSRF has 232 | # dependencies on CORS settings. 233 | cors { 234 | # Filter paths by a whitelist of path prefixes 235 | #pathPrefixes = ["/some/path", ...] 236 | 237 | # The allowed origins. If null, all origins are allowed. 238 | #allowedOrigins = ["http://www.example.com"] 239 | 240 | # The allowed HTTP methods.
If null, all methods are allowed 241 | #allowedHttpMethods = ["GET", "POST"] 242 | } 243 | 244 | ## CSRF Filter 245 | # https://www.playframework.com/documentation/latest/ScalaCsrf#Applying-a-global-CSRF-filter 246 | # https://www.playframework.com/documentation/latest/JavaCsrf#Applying-a-global-CSRF-filter 247 | # ~~~~~ 248 | # Play supports multiple methods for verifying that a request is not a CSRF request. 249 | # The primary mechanism is a CSRF token. This token gets placed either in the query string 250 | # or body of every form submitted, and also gets placed in the users session. 251 | # Play then verifies that both tokens are present and match. 252 | csrf { 253 | # Sets the cookie to be sent only over HTTPS 254 | #cookie.secure = true 255 | 256 | # Defaults to CSRFErrorHandler in the root package. 257 | #errorHandler = MyCSRFErrorHandler 258 | } 259 | 260 | ## Security headers filter configuration 261 | # https://www.playframework.com/documentation/latest/SecurityHeaders 262 | # ~~~~~ 263 | # Defines security headers that prevent XSS attacks. 264 | # If enabled, then all options are set to the below configuration by default: 265 | headers { 266 | # The X-Frame-Options header. If null, the header is not set. 267 | #frameOptions = "DENY" 268 | 269 | # The X-XSS-Protection header. If null, the header is not set. 270 | #xssProtection = "1; mode=block" 271 | 272 | # The X-Content-Type-Options header. If null, the header is not set. 273 | #contentTypeOptions = "nosniff" 274 | 275 | # The X-Permitted-Cross-Domain-Policies header. If null, the header is not set. 276 | #permittedCrossDomainPolicies = "master-only" 277 | 278 | # The Content-Security-Policy header. If null, the header is not set. 
279 | #contentSecurityPolicy = "default-src 'self'" 280 | } 281 | 282 | ## Allowed hosts filter configuration 283 | # https://www.playframework.com/documentation/latest/AllowedHostsFilter 284 | # ~~~~~ 285 | # Play provides a filter that lets you configure which hosts can access your application. 286 | # This is useful to prevent cache poisoning attacks. 287 | hosts { 288 | # Allow requests to example.com, its subdomains, and localhost:9000. 289 | #allowed = [".example.com", "localhost:9000"] 290 | } 291 | } 292 | 293 | ## Evolutions 294 | # https://www.playframework.com/documentation/latest/Evolutions 295 | # ~~~~~ 296 | # Evolutions allows database scripts to be automatically run on startup in dev mode 297 | # for database migrations. You must enable this by adding to build.sbt: 298 | # 299 | # libraryDependencies += evolutions 300 | # 301 | play.evolutions { 302 | # You can disable evolutions for a specific datasource if necessary 303 | #db.default.enabled = false 304 | } 305 | 306 | ## Database Connection Pool 307 | # https://www.playframework.com/documentation/latest/SettingsJDBC 308 | # ~~~~~ 309 | # Play doesn't require a JDBC database to run, but you can easily enable one. 310 | # 311 | # libraryDependencies += jdbc 312 | # 313 | play.db { 314 | # The combination of these two settings results in "db.default" as the 315 | # default JDBC pool: 316 | #config = "db" 317 | #default = "default" 318 | 319 | # Play uses HikariCP as the default connection pool. 
You can override 320 | # settings by changing the prototype: 321 | prototype { 322 | # Sets a fixed JDBC connection pool size of 50 323 | #hikaricp.minimumIdle = 50 324 | #hikaricp.maximumPoolSize = 50 325 | } 326 | } 327 | 328 | ## JDBC Datasource 329 | # https://www.playframework.com/documentation/latest/JavaDatabase 330 | # https://www.playframework.com/documentation/latest/ScalaDatabase 331 | # ~~~~~ 332 | # Once JDBC datasource is set up, you can work with several different 333 | # database options: 334 | # 335 | # Slick (Scala preferred option): https://www.playframework.com/documentation/latest/PlaySlick 336 | # JPA (Java preferred option): https://playframework.com/documentation/latest/JavaJPA 337 | # EBean: https://playframework.com/documentation/latest/JavaEbean 338 | # Anorm: https://www.playframework.com/documentation/latest/ScalaAnorm 339 | # 340 | db { 341 | # You can declare as many datasources as you want. 342 | # By convention, the default datasource is named `default` 343 | 344 | # https://www.playframework.com/documentation/latest/Developing-with-the-H2-Database 345 | #default.driver = org.h2.Driver 346 | #default.url = "jdbc:h2:mem:play" 347 | #default.username = sa 348 | #default.password = "" 349 | 350 | # You can turn on SQL logging for any datasource 351 | # https://www.playframework.com/documentation/latest/Highlights25#Logging-SQL-statements 352 | #default.logSql=true 353 | } 354 | -------------------------------------------------------------------------------- /young-crawler-searcher/bin/activator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### ------------------------------- ### 4 | ### Helper methods for BASH scripts ### 5 | ### ------------------------------- ### 6 | 7 | realpath () { 8 | ( 9 | TARGET_FILE="$1" 10 | FIX_CYGPATH="$2" 11 | 12 | cd "$(dirname "$TARGET_FILE")" 13 | TARGET_FILE=$(basename "$TARGET_FILE") 14 | 15 | COUNT=0 16 | while [ -L "$TARGET_FILE" -a 
$COUNT -lt 100 ] 17 | do 18 | TARGET_FILE=$(readlink "$TARGET_FILE") 19 | cd "$(dirname "$TARGET_FILE")" 20 | TARGET_FILE=$(basename "$TARGET_FILE") 21 | COUNT=$(($COUNT + 1)) 22 | done 23 | 24 | # make sure we grab the actual windows path, instead of cygwin's path. 25 | if [[ "x$FIX_CYGPATH" != "x" ]]; then 26 | echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" 27 | else 28 | echo "$(pwd -P)/$TARGET_FILE" 29 | fi 30 | ) 31 | } 32 | 33 | 34 | # Uses uname to detect if we're in the odd cygwin environment. 35 | is_cygwin() { 36 | local os=$(uname -s) 37 | case "$os" in 38 | CYGWIN*) return 0 ;; 39 | *) return 1 ;; 40 | esac 41 | } 42 | 43 | # TODO - Use nicer bash-isms here. 44 | CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) 45 | 46 | 47 | # This can fix cygwin style /cygdrive paths so we get the 48 | # windows style paths. 49 | cygwinpath() { 50 | local file="$1" 51 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 52 | echo $(cygpath -w $file) 53 | else 54 | echo $file 55 | fi 56 | } 57 | 58 | # Make something URI friendly 59 | make_url() { 60 | url="$1" 61 | local nospaces=${url// /%20} 62 | if is_cygwin; then 63 | echo "/${nospaces//\\//}" 64 | else 65 | echo "$nospaces" 66 | fi 67 | } 68 | 69 | declare -a residual_args 70 | declare -a java_args 71 | declare -a scalac_args 72 | declare -a sbt_commands 73 | declare java_cmd=java 74 | declare java_version 75 | declare -r real_script_path="$(realpath "$0")" 76 | declare -r sbt_home="$(realpath "$(dirname "$(dirname "$real_script_path")")")" 77 | declare -r sbt_bin_dir="$(dirname "$real_script_path")" 78 | declare -r app_version="1.3.10" 79 | 80 | declare -r script_name=activator 81 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 82 | userhome="$HOME" 83 | if is_cygwin; then 84 | # cygwin sets home to something f-d up, set to real windows homedir 85 | userhome="$USERPROFILE" 86 | fi 87 | declare -r activator_user_home_dir="${userhome}/.activator" 88 | 
declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 89 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 90 | 91 | echoerr () { 92 | echo 1>&2 "$@" 93 | } 94 | vlog () { 95 | [[ $verbose || $debug ]] && echoerr "$@" 96 | } 97 | dlog () { 98 | [[ $debug ]] && echoerr "$@" 99 | } 100 | 101 | jar_file () { 102 | echo "$(cygwinpath "${sbt_home}/libexec/activator-launch-${app_version}.jar")" 103 | } 104 | 105 | acquire_sbt_jar () { 106 | sbt_jar="$(jar_file)" 107 | 108 | if [[ ! -f "$sbt_jar" ]]; then 109 | echoerr "Could not find launcher jar: $sbt_jar" 110 | exit 2 111 | fi 112 | } 113 | 114 | execRunner () { 115 | # When verbose or debug is set, print the command line one argument per line, quoting any argument that contains a space 116 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 117 | for arg; do 118 | if printf "%s\n" "$arg" | grep -q ' '; then 119 | printf "\"%s\"\n" "$arg" 120 | else 121 | printf "%s\n" "$arg" 122 | fi 123 | done 124 | echo "" 125 | } 126 | 127 | # This used to be exec, but then we lose the ability to re-hook stty 128 | # for cygwin... Maybe we should flag the feature here... 129 | "$@" 130 | } 131 | 132 | addJava () { # append one argument to java_args (logged via dlog when debug is set) 133 | dlog "[addJava] arg = '$1'" 134 | java_args=( "${java_args[@]}" "$1" ) 135 | } 136 | addSbt () { # append one argument to sbt_commands 137 | dlog "[addSbt] arg = '$1'" 138 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 139 | } 140 | addResidual () { # append one argument to residual_args 141 | dlog "[residual] arg = '$1'" 142 | residual_args=( "${residual_args[@]}" "$1" ) 143 | } 144 | addDebugger () { # add a JDWP agent option (server=y, suspend=n) whose address is $1 145 | addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1" 146 | } 147 | 148 | get_mem_opts () { 149 | # if we detect any of these settings in ${JAVA_OPTS} we need to NOT output our settings. 150 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 
151 | if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then 152 | echo "" 153 | else 154 | # Derive all memory settings from one value in MB ($1, default 1024): the code cache is mem/8 clamped to 128..512, 155 | # and the class metadata limit is twice the code cache, so they need not be set individually. 156 | local mem=${1:-1024} 157 | local codecache=$(( $mem / 8 )) 158 | (( $codecache > 128 )) || codecache=128 159 | (( $codecache < 512 )) || codecache=512 160 | local class_metadata_size=$(( $codecache * 2 )) 161 | local class_metadata_opt=$([[ -z "$java_version" || "$java_version" == 1.[0-7]* ]] && echo "MaxPermSize" || echo "MaxMetaspaceSize") # glob match instead of string "<": that comparison is lexicographic and locale-dependent, so it can misorder versions such as 10.0.1; an empty version still selects MaxPermSize as before 162 | 163 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m -XX:${class_metadata_opt}=${class_metadata_size}m" 164 | fi 165 | } 166 | 167 | require_arg () { # print a message and exit 1 unless $3 is non-empty and does not start with '-' 168 | local type="$1" 169 | local opt="$2" 170 | local arg="$3" 171 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 172 | echo "$opt requires <$type> argument" 173 | exit 1 174 | fi 175 | } 176 | 177 | is_function_defined() { # succeeds when a shell function named $1 is defined 178 | declare -f "$1" > /dev/null 179 | } 180 | 181 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter. ${#residual_args[@]} is the element count; ${#residual_args} would be the length of the first element. 182 | detect_terminal_for_ui() { 183 | [[ ! -t 0 ]] && [[ "${#residual_args[@]}" == "0" ]] && { 184 | addResidual "ui" 185 | } 186 | # SPECIAL TEST FOR MAC 187 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args[@]}" == "0" ]] && { 188 | echo "Detected MAC OSX launched script...." 
189 | echo "Swapping to UI" 190 | addResidual "ui" 191 | } 192 | } 193 | 194 | process_args () { 195 | while [[ $# -gt 0 ]]; do 196 | case "$1" in 197 | -h|-help) usage; exit 1 ;; 198 | -v|-verbose) verbose=1 && shift ;; 199 | -d|-debug) debug=1 && shift ;; 200 | 201 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 202 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 203 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 204 | -batch) exec &1 | awk -F '"' '/version/ {print $2}') 223 | vlog "[process_args] java_version = '$java_version'" 224 | } 225 | 226 | # Detect that we have java installed. 227 | checkJava() { 228 | local required_version="$1" 229 | # Now check to see if it's a good enough version 230 | if [[ "$java_version" == "" ]]; then 231 | echo 232 | echo No java installations was detected. 233 | echo Please go to http://www.java.com/getjava/ and download 234 | echo 235 | exit 1 236 | elif [[ ! "$java_version" > "$required_version" ]]; then 237 | echo 238 | echo The java installation you have is not up to date 239 | echo $script_name requires at least version $required_version+, you have 240 | echo version $java_version 241 | echo 242 | echo Please go to http://www.java.com/getjava/ and download 243 | echo a valid Java Runtime and install before running $script_name. 244 | echo 245 | exit 1 246 | fi 247 | } 248 | 249 | 250 | run() { 251 | # no jar? download it. 252 | [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { 253 | # still no jar? uh-oh. 254 | echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" 255 | exit 1 256 | } 257 | 258 | # process the combined args, then reset "$@" to the residuals 259 | process_args "$@" 260 | detect_terminal_for_ui 261 | set -- "${residual_args[@]}" 262 | argumentCount=$# 263 | 264 | # TODO - java check should be configurable... 
265 | checkJava "1.6" 266 | 267 | #If we're in cygwin, we should use the windows config, and terminal hacks 268 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 269 | stty -icanon min 1 -echo > /dev/null 2>&1 270 | addJava "-Djline.terminal=jline.UnixTerminal" 271 | addJava "-Dsbt.cygwin=true" 272 | fi 273 | 274 | # run sbt 275 | execRunner "$java_cmd" \ 276 | "-Dactivator.home=$(make_url "$sbt_home")" \ 277 | ${SBT_OPTS:-$default_sbt_opts} \ 278 | $(get_mem_opts $sbt_mem) \ 279 | ${JAVA_OPTS} \ 280 | ${java_args[@]} \ 281 | -jar "$sbt_jar" \ 282 | "${sbt_commands[@]}" \ 283 | "${residual_args[@]}" 284 | 285 | exit_code=$? 286 | 287 | # Clean up the terminal from cygwin hacks. 288 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 289 | stty icanon echo > /dev/null 2>&1 290 | fi 291 | exit $exit_code 292 | } 293 | 294 | 295 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 296 | declare -r sbt_opts_file=".sbtopts" 297 | declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts" 298 | declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" 299 | 300 | usage() { 301 | cat < path to global settings/plugins directory (default: ~/.sbt) 316 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 317 | -ivy path to local Ivy repository (default: ~/.ivy2) 318 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 319 | -no-share use all local caches; no sharing 320 | -no-global uses global caches, but does not use global ~/.sbt directory. 321 | -jvm-debug Turn on JVM debugging, open at the given port. 
322 | -batch Disable interactive mode 323 | 324 | # sbt version (default: from project/build.properties if present, else latest release) 325 | -sbt-version use the specified version of sbt 326 | -sbt-jar use the specified jar as the sbt launcher 327 | -sbt-rc use an RC version of sbt 328 | -sbt-snapshot use a snapshot version of sbt 329 | 330 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 331 | -java-home alternate JAVA_HOME 332 | 333 | # jvm options and output control 334 | JAVA_OPTS environment variable, if unset uses "$java_opts" 335 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 336 | ACTIVATOR_OPTS Environment variable, if unset uses "" 337 | .sbtopts if this file exists in the current directory, it is 338 | prepended to the runner args 339 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 340 | -Dkey=val pass -Dkey=val directly to the java runtime 341 | -J-X pass option -X directly to the java runtime 342 | (-J is stripped) 343 | -S-X add -X to sbt's scalacOptions (-S is stripped) 344 | 345 | In the case of duplicated or conflicting options, the order above 346 | shows precedence: JAVA_OPTS lowest, command line options highest. 347 | EOM 348 | } 349 | 350 | 351 | 352 | process_my_args () { 353 | while [[ $# -gt 0 ]]; do 354 | case "$1" in 355 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 356 | -no-share) addJava "$noshare_opts" && shift ;; 357 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 358 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 359 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 360 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 361 | -batch) exec