├── .gitignore
├── young-crawler-searcher
├── public
│ ├── stylesheets
│ │ └── main.css
│ ├── javascripts
│ │ └── hello.js
│ └── images
│ │ └── favicon.png
├── .gitignore
├── libexec
│ └── activator-launch-1.3.10.jar
├── project
│ ├── build.properties
│ └── plugins.sbt
├── build.sbt
├── LICENSE
├── app
│ ├── views
│ │ ├── index.scala.html
│ │ └── main.scala.html
│ ├── controllers
│ │ ├── HomeController.scala
│ │ ├── CountController.scala
│ │ └── AsyncController.scala
│ ├── services
│ │ ├── Counter.scala
│ │ └── ApplicationTimer.scala
│ ├── Module.scala
│ ├── Filters.scala
│ └── filters
│ │ └── ExampleFilter.scala
├── test
│ ├── IntegrationSpec.scala
│ └── ApplicationSpec.scala
├── conf
│ ├── routes
│ ├── logback.xml
│ └── application.conf
├── README
└── bin
│ ├── activator.bat
│ └── activator
├── README.md
└── young-crawler-core
└── src
├── main
├── scala
│ └── com
│ │ └── young
│ │ └── crawler
│ │ ├── spider
│ │ ├── task
│ │ │ ├── IndexTask.scala
│ │ │ ├── FetchTask.scala
│ │ │ ├── SlaveTask.scala
│ │ │ ├── CounterTask.scala
│ │ │ ├── ParserTask.scala
│ │ │ ├── InjectTask.scala
│ │ │ └── support
│ │ │ │ └── actor
│ │ │ │ ├── IndexActorTask.scala
│ │ │ │ ├── FetchActorTask.scala
│ │ │ │ ├── ParseActorTask.scala
│ │ │ │ ├── InjectActorTask.scala
│ │ │ │ └── CounterActorTask.scala
│ │ ├── parser
│ │ │ ├── Parser.scala
│ │ │ └── support
│ │ │ │ ├── HtmlParseParser.scala
│ │ │ │ └── JsoupParser.scala
│ │ ├── fetcher
│ │ │ ├── FetcherCache.scala
│ │ │ ├── Fetcher.scala
│ │ │ └── support
│ │ │ │ ├── HttpClientFetcher.scala
│ │ │ │ └── HttpWatch.scala
│ │ └── indexer
│ │ │ ├── Indexer.scala
│ │ │ └── support
│ │ │ └── ElasticIndexer.scala
│ │ ├── entity
│ │ ├── InjectEntitys.scala
│ │ ├── PageIndexEntity.scala
│ │ ├── CounterEntity.scala
│ │ └── HttpEntitys.scala
│ │ ├── exception
│ │ ├── IndexException.scala
│ │ ├── ParseException.scala
│ │ └── FetchException.scala
│ │ ├── utils
│ │ ├── JsonUtil.scala
│ │ ├── MD5Util.scala
│ │ └── IOUtil.scala
│ │ ├── cache
│ │ ├── Cache.scala
│ │ └── support
│ │ │ ├── MapCache.scala
│ │ │ └── RedisCache.scala
│ │ ├── config
│ │ ├── CrawlerConfig.scala
│ │ └── CrawlerConfigContants.scala
│ │ └── boot
│ │ └── CrawlerBoot.scala
├── java
│ ├── Thread1.java
│ ├── Runnable1.java
│ └── ThreadBoot.java
└── resources
│ ├── seeds.txt
│ └── crawler.properties
└── test
└── scala
└── com
└── young
└── crawler
├── actor
├── ActorExample.scala
└── ActorSelectorExample.scala
├── http
└── CrawlerTest.scala
├── cache
├── MapCacheExample.scala
└── RedisCacheExample.scala
├── parser
└── JsoupExample.scala
└── indexer
└── Elastic4sExample.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | project
3 | target
4 |
--------------------------------------------------------------------------------
/young-crawler-searcher/public/stylesheets/main.css:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # young-crawler
2 | scala结合actor编写的分布式网络爬虫,实现上采用Akka 异步消息处理框架,无阻塞,性能高,网页爬取速度快
3 | # 如何启动
4 | 下载项目后配置crawler.properties,详细配置文件里有注释
5 |
--------------------------------------------------------------------------------
/young-crawler-searcher/.gitignore:
--------------------------------------------------------------------------------
1 | logs
2 | target
3 | /.idea
4 | /.idea_modules
5 | /.classpath
6 | /.project
7 | /.settings
8 | /RUNNING_PID
9 |
--------------------------------------------------------------------------------
/young-crawler-searcher/public/javascripts/hello.js:
--------------------------------------------------------------------------------
1 | if (window.console) {
2 | console.log("Welcome to your Play application's JavaScript!");
3 | }
4 |
--------------------------------------------------------------------------------
/young-crawler-searcher/public/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/public/images/favicon.png
--------------------------------------------------------------------------------
/young-crawler-searcher/libexec/activator-launch-1.3.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/libexec/activator-launch-1.3.10.jar
--------------------------------------------------------------------------------
/young-crawler-searcher/project/build.properties:
--------------------------------------------------------------------------------
1 | #Activator-generated Properties
2 | #Sun Sep 11 15:22:45 CST 2016
3 | template.uuid=b0d11fa6-d1b3-4963-94aa-319a15612bf3
4 | sbt.version=0.13.11
5 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/IndexTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | /**
4 |  * Marker trait for index tasks; it declares no members of its own.
5 |  *
6 |  * Created by dell on 2016/8/29.
7 |  */
8 | trait IndexTask
9 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/FetchTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | /**
4 |  * Marker trait for fetch tasks; it declares no members of its own.
5 |  *
6 |  * Created by young.yang on 2016/8/28.
7 |  */
8 | trait FetchTask
9 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/SlaveTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | /**
4 |  * Marker trait for slave tasks; it declares no members of its own.
5 |  *
6 |  * Created by young.yang on 2016/8/28.
7 |  */
8 | trait SlaveTask
9 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/CounterTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | /**
4 |  * Marker trait for counter tasks; it declares no members of its own.
5 |  *
6 |  * Created by young.yang on 2016/9/3.
7 |  */
8 | trait CounterTask
9 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/ParserTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | /**
4 |  * Marker trait for parser tasks; it declares no members of its own.
5 |  *
6 |  * Created by young.yang on 2016/8/28.
7 |  */
8 | trait ParserTask
9 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/Thread1.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Thread subclass whose {@code run()} prints a fixed marker line to standard output.
3 |  *
4 |  * Created by young.yang on 2016/8/31.
5 |  */
6 | public class Thread1 extends Thread {
7 |     @Override
8 |     public void run() {
9 |         final String marker = "Thread1 run";
10 |         System.out.println(marker);
11 |     }
12 | }
13 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/Runnable1.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Runnable whose {@code run()} prints a fixed marker line to standard output.
3 |  *
4 |  * Created by young.yang on 2016/8/31.
5 |  */
6 | public class Runnable1 implements Runnable {
7 |     @Override
8 |     public void run() {
9 |         final String marker = "Runnable1 run";
10 |         System.out.println(marker);
11 |     }
12 | }
13 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/resources/seeds.txt:
--------------------------------------------------------------------------------
1 | http://www.sina.com.cn
2 | http://www.baidu.com
3 | http://www.163.com
4 | http://www.sohu.com
5 | http://www.ifeng.com
6 | http://www.autohome.com.cn/beijing
7 | http://bj.fang.com
8 | http://blog.csdn.net
9 | http://www.gc-zb.com
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/InjectEntitys.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.entity
2 |
3 | /**
4 |  * Seed-initialisation message, passed to the inject actor so it can parse the seed information.
5 |  *
6 |  * Created by dell on 2016/8/29.
7 |  *
8 |  * @param seedPath   path of the seed file
9 |  * @param fileEncode character encoding of the seed file; defaults to "utf-8"
10 |  */
11 | case class InitSeed(
12 |   seedPath: String,
13 |   fileEncode: String = "utf-8"
14 | )
15 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/Parser.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.parser
2 |
3 | import com.young.crawler.entity.{HttpResult, HttpPage}
4 |
5 | /**
6 |  * HTML page parsing interface.
7 |  *
8 |  * Created by young.yang on 2016/8/28.
9 |  */
10 | trait Parser {
11 | 
12 |   /**
13 |    * Turns a fetched result into a page object.
14 |    *
15 |    * @param html the fetch result to parse
16 |    * @return the parsed page
17 |    */
18 |   def parse(html: HttpResult): HttpPage
19 | }
20 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/InjectTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 |
3 | import com.young.crawler.entity.Seed
4 |
5 | /**
6 |  * Contract for seed-injection tasks.
7 |  *
8 |  * Created by young.yang on 2016/8/28.
9 |  */
10 | trait InjectTask {
11 | 
12 |   /**
13 |    * Produces the seed list for a seed source.
14 |    *
15 |    * @param seedPath   location of the seeds
16 |    * @param fileEncode character encoding to use; defaults to "utf-8"
17 |    * @return the seeds obtained (NOTE(review): implementations are not visible here)
18 |    */
19 |   def initSeeds(seedPath: String, fileEncode: String = "utf-8"): List[Seed]
20 | }
21 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/IndexException.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.exception
2 |
3 | /**
4 |  * Custom exception signalling an indexing failure.
5 |  *
6 |  * Created by young.yang on 2016/8/31.
7 |  *
8 |  * @param message description of the failure
9 |  * @param e       underlying cause, or null when there is none
10 |  */
11 | class IndexException(message: String, e: Throwable) extends Exception(message, e) {
12 | 
13 |   /**
14 |    * Builds the exception from a message alone. No cause is attached: fabricating a
15 |    * placeholder cause would add a misleading "Caused by" entry to stack traces.
16 |    */
17 |   def this(message: String) = this(message, null)
18 | }
19 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/ParseException.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.exception
2 |
3 | /**
4 |  * Custom exception signalling a parsing failure.
5 |  *
6 |  * Created by young.yang on 2016/8/31.
7 |  *
8 |  * @param message description of the failure
9 |  * @param e       underlying cause, or null when there is none
10 |  */
11 | class ParseException(message: String, e: Throwable) extends Exception(message, e) {
12 | 
13 |   /**
14 |    * Builds the exception from a message alone. No cause is attached: fabricating a
15 |    * placeholder cause would add a misleading "Caused by" entry to stack traces.
16 |    */
17 |   def this(message: String) = this(message, null)
18 | }
19 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/FetchException.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.exception
2 |
3 | /**
4 |  * Custom exception signalling a fetch failure.
5 |  *
6 |  * Created by young.yang on 2016/8/28.
7 |  *
8 |  * @param message description of the failure
9 |  * @param e       underlying cause, or null when there is none
10 |  */
11 | class FetchException(message: String, e: Throwable) extends Exception(message, e) {
12 | 
13 |   /**
14 |    * Builds the exception from a message alone. No cause is attached: fabricating a
15 |    * placeholder cause would add a misleading "Caused by" entry to stack traces.
16 |    */
17 |   def this(message: String) = this(message, null)
18 | 
19 | }
20 |
--------------------------------------------------------------------------------
/young-crawler-searcher/build.sbt:
--------------------------------------------------------------------------------
1 | name := """young-crawler-searcher"""
2 |
3 | version := "1.0-SNAPSHOT"
4 |
5 | lazy val root = (project in file(".")).enablePlugins(PlayScala)
6 |
7 | scalaVersion := "2.11.7"
8 |
9 | libraryDependencies ++= Seq(
10 | jdbc,
11 | cache,
12 | ws,
13 | "org.scalatestplus.play" %% "scalatestplus-play" % "1.5.1" % Test
14 | )
15 |
16 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/actor/ActorExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.actor
2 |
3 | import akka.actor.Actor
4 | import akka.actor.Actor.Receive
5 |
6 | /**
7 |  * Example actor that prints every String message it receives.
8 |  *
9 |  * Created by young.yang on 2016/9/8.
10 |  */
11 | class ActorExample extends Actor {
12 | 
13 |   // Only String messages are matched by this partial function.
14 |   override def receive: Receive = {
15 |     case text: String =>
16 |       val report = "receive a message " + text
17 |       println(report)
18 |   }
19 | }
20 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/JsonUtil.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.utils
2 |
3 | import org.codehaus.jackson.map.ObjectMapper
4 |
5 | /**
6 |  * JSON helper built on a single shared Jackson ObjectMapper.
7 |  *
8 |  * Created by dell on 2016/8/31.
9 |  */
10 | private[crawler] object JsonUtil {
11 | 
12 |   // One mapper instance is created with the object and used for every call.
13 |   private val mapper = new ObjectMapper
14 | 
15 |   /**
16 |    * Serialises a value to its JSON text.
17 |    *
18 |    * @param obj the value to serialise
19 |    * @return the string produced by the mapper's writeValueAsString
20 |    */
21 |   def toJson(obj: Any): String = mapper.writeValueAsString(obj)
22 | }
23 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/Cache.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.cache
2 |
3 | /**
4 |  * Cache interface: a key/value store offering membership test, insertion,
5 |  * lookup, size and key enumeration.
6 |  *
7 |  * Created by dell on 2016/9/2.
8 |  *
9 |  * @tparam KEY   type of the cache keys
10 |  * @tparam VALUE type of the cached values
11 |  */
12 | trait Cache[KEY, VALUE] {
13 | 
14 |   /** @return true when an entry exists for `key` */
15 |   def contains(key: KEY): Boolean
16 | 
17 |   /**
18 |    * Stores `value` under `key`.
19 |    *
20 |    * The result type is spelled out as Unit: the bare procedure form is
21 |    * deprecated, and the explicit type matches what implementations declare.
22 |    */
23 |   def put(key: KEY, value: VALUE): Unit
24 | 
25 |   /** @return the value stored under `key`, or None when there is none */
26 |   def get(key: KEY): Option[VALUE]
27 | 
28 |   /** @return the number of entries */
29 |   def size(): Int
30 | 
31 |   /** @return the set of keys currently held */
32 |   def keys(): scala.collection.Set[KEY]
33 | 
34 | }
35 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/ThreadBoot.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Demo entry point: starts a Thread subclass and a Runnable-backed thread, waits
3 |  * five seconds, then starts one more of each.
4 |  *
5 |  * Created by young.yang on 2016/8/31.
6 |  */
7 | public class ThreadBoot {
8 |     /**
9 |      * Runs the demo.
10 |      *
11 |      * @param args unused
12 |      * @throws InterruptedException if the main thread is interrupted while sleeping
13 |      */
14 |     public static void main(String[] args) throws InterruptedException {
15 |         Thread thread1 = new Thread1();
16 |         Runnable runnable = new Runnable1();
17 |         Thread thread2 = new Thread(runnable);
18 |         thread1.start();
19 |         thread2.start();
20 |         Thread.sleep(5000);
21 |         // A Thread can be started only once: calling start() again on thread1 throws
22 |         // IllegalThreadStateException, which aborted main before the last line ran.
23 |         // Start a fresh Thread1 instead.
24 |         new Thread1().start();
25 |         // A Runnable carries no thread state, so the same instance can back a new Thread.
26 |         new Thread(runnable).start();
27 |     }
28 | }
29 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/MD5Util.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.utils
2 |
3 | import java.nio.charset.Charset
4 |
5 | import com.google.common.hash.Hashing
6 |
7 | /**
8 |  * MD5 helper built on Guava's Hashing.
9 |  *
10 |  * Created by dell on 2016/8/31.
11 |  */
12 | private[crawler] object MD5Util {
13 | 
14 |   // Fixed charset for hashing. The JVM default charset differs between machines,
15 |   // so the same non-ASCII text would otherwise hash to different digests.
16 |   private val hashCharset = Charset.forName("UTF-8")
17 | 
18 |   /**
19 |    * Computes the MD5 digest of a string.
20 |    *
21 |    * @param line text to hash; encoded as UTF-8 before hashing
22 |    * @return the digest rendered by the hash code's toString
23 |    */
24 |   def md5(line: String): String =
25 |     Hashing.md5().newHasher().putString(line, hashCharset).hash().toString
26 | 
27 |   /** Manual check: prints the digests of two sample strings. */
28 |   def main(args: Array[String]): Unit = {
29 |     println(MD5Util.md5("杨勇"))
30 |     println(MD5Util.md5("123"))
31 |   }
32 | }
33 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/http/CrawlerTest.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.http
2 |
3 | import com.young.crawler.entity.{SeedType, UrlInfo}
4 | import com.young.crawler.spider.fetcher.support.HttpWatch
5 |
6 | /**
7 |  * Manual check: fetches one URL through HttpWatch and prints the content and status.
8 |  *
9 |  * Created by young.yang on 2016/8/28.
10 |  */
11 | object CrawlerTest {
12 | 
13 |   def main(args: Array[String]): Unit = {
14 |     val seed = UrlInfo("http://www.sina.com.cn", "", SeedType, 0)
15 |     val response = HttpWatch.get(seed)
16 |     println(response.content)
17 |     println(response.status)
18 |   }
19 | }
20 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/HtmlParseParser.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.parser.support
2 |
3 | import com.young.crawler.entity.{HttpPage, HttpResult}
4 | import com.young.crawler.spider.parser.Parser
5 |
6 | /**
7 |  * Parser that copies the content and URL of the fetch result into a new
8 |  * HttpPage without any further processing.
9 |  *
10 |  * Created by young.yang on 2016/8/28.
11 |  */
12 | private[crawler] class HtmlParseParser extends Parser {
13 | 
14 |   /**
15 |    * @param html the fetch result
16 |    * @return a new page carrying the result's content and url
17 |    */
18 |   override def parse(html: HttpResult): HttpPage = {
19 |     val parsed = new HttpPage
20 |     parsed.setContent(html.content)
21 |     parsed.setUrl(html.url)
22 |     parsed
23 |   }
24 | }
25 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/cache/MapCacheExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.cache
2 |
3 | import com.young.crawler.cache.support.MapCache
4 |
5 | /**
6 |  * Manual check of MapCache: stores eleven entries, then prints a membership
7 |  * test, the key set, the size, and whether a missing key yields an empty result.
8 |  *
9 |  * Created by dell on 2016/9/2.
10 |  */
11 | object MapCacheExample {
12 |   def main(args: Array[String]): Unit = {
13 |     val cache = new MapCache[String, String]
14 |     (0 to 10).foreach { i =>
15 |       cache.put("key_" + i, "value_" + i)
16 |     }
17 |     println(cache.contains("key_0"))
18 |     println(cache.keys())
19 |     println(cache.size())
20 |     println(cache.get("key_12").isEmpty)
21 |   }
22 | }
23 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/cache/RedisCacheExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.cache
2 |
3 | import com.young.crawler.cache.support.{RedisCache, MapCache}
4 |
5 | /**
6 |  * Manual check of RedisCache: stores eleven entries, then prints a membership
7 |  * test, the size, and whether a missing key yields an empty result.
8 |  *
9 |  * Created by dell on 2016/9/9.
10 |  */
11 | object RedisCacheExample {
12 | 
13 |   def main(args: Array[String]): Unit = {
14 |     val cache = new RedisCache[String, String]
15 |     (0 to 10).foreach { i =>
16 |       cache.put("key_" + i, "value_" + i)
17 |     }
18 |     println(cache.contains("key_0"))
19 |     println(cache.size())
20 |     println(cache.get("key_12").isEmpty)
21 |   }
22 | }
23 |
--------------------------------------------------------------------------------
/young-crawler-searcher/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | // The Play plugin
2 | addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.5.6")
3 |
4 | // web plugins
5 |
6 | addSbtPlugin("com.typesafe.sbt" % "sbt-coffeescript" % "1.0.0")
7 |
8 | addSbtPlugin("com.typesafe.sbt" % "sbt-less" % "1.1.0")
9 |
10 | addSbtPlugin("com.typesafe.sbt" % "sbt-jshint" % "1.0.3")
11 |
12 | addSbtPlugin("com.typesafe.sbt" % "sbt-rjs" % "1.0.7")
13 |
14 | addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.0")
15 |
16 | addSbtPlugin("com.typesafe.sbt" % "sbt-mocha" % "1.1.0")
17 |
18 | addSbtPlugin("org.irundaia.sbt" % "sbt-sassify" % "1.4.2")
19 |
--------------------------------------------------------------------------------
/young-crawler-searcher/LICENSE:
--------------------------------------------------------------------------------
1 | This software is licensed under the Apache 2 license, quoted below.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this project except in compliance with
4 | the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
5 |
6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
7 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
8 | language governing permissions and limitations under the License.
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/FetcherCache.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.fetcher
2 |
3 | import com.young.crawler.cache.Cache
4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
5 |
6 | /**
7 |  * Page cache used for de-duplication while crawling.
8 |  *
9 |  * Created by young.yang on 2016/9/2.
10 |  */
11 | private[crawler] object FetcherCache {
12 |   // Fixed alternative kept for reference: new MapCache[String,Byte]
13 |   // The implementation class name comes from configuration and is instantiated reflectively.
14 |   val fetcherCache: Cache[String, Byte] = {
15 |     val implClassName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_cache_imp)
16 |     val instance = Class.forName(implClassName).newInstance()
17 |     instance.asInstanceOf[Cache[String, Byte]]
18 |   }
19 | }
20 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/views/index.scala.html:
--------------------------------------------------------------------------------
1 | @*
2 | * This template takes a single argument, a String containing a
3 | * message to display.
4 | *@
5 | @(message: String)
6 |
7 | @*
8 | * Call the `main` template with two arguments. The first
9 | * argument is a `String` with the title of the page, the second
10 | * argument is an `Html` object containing the body of the page.
11 | *@
12 | @main("Welcome to Play") {
13 |
14 | @*
15 | * Get an `Html` object by calling the built-in Play welcome
16 | * template and passing a `String` message.
17 | *@
18 | @play20.welcome(message, style = "Scala")
19 |
20 | }
21 |
--------------------------------------------------------------------------------
/young-crawler-searcher/test/IntegrationSpec.scala:
--------------------------------------------------------------------------------
1 | import org.scalatestplus.play._
2 | import play.api.test._
3 | import play.api.test.Helpers._
4 |
5 | /**
6 |  * Integration spec: opens the application's root URL on the test port in a
7 |  * browser supplied by HtmlUnitFactory and checks the page source for the
8 |  * welcome text. Add further integration specs here.
9 |  */
10 | class IntegrationSpec extends PlaySpec with OneServerPerTest with OneBrowserPerTest with HtmlUnitFactory {
11 | 
12 |   "Application" should {
13 | 
14 |     "work from within a browser" in {
15 | 
16 |       // `port` is the port of the server under test.
17 |       go to ("http://localhost:" + port)
18 | 
19 |       pageSource must include ("Your new application is ready.")
20 |     }
21 |   }
22 | }
23 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/PageIndexEntity.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.entity
2 |
3 | import scala.beans.BeanProperty
4 |
5 | /**
6 |  * Index record for a page. Every field is a mutable bean property, so
7 |  * getters and setters are generated for each of them.
8 |  *
9 |  * Created by dell on 2016/8/31.
10 |  */
11 | class PageIndexEntity {
12 |   @BeanProperty var url: String = ""
13 |   @BeanProperty var title: String = ""
14 |   @BeanProperty var content: String = ""
15 |   @BeanProperty var publishTime: Long = 0
16 |   @BeanProperty var updateTime: Long = 0
17 |   @BeanProperty var author: String = ""
18 |   @BeanProperty var keywords: String = ""
19 |   @BeanProperty var desc: String = ""
20 | }
21 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/IOUtil.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.utils
2 |
3 | import java.io.{BufferedReader, InputStream, InputStreamReader}
4 |
5 | /**
6 |  * Stream helpers.
7 |  *
8 |  * Created by young.yang on 2016/8/28.
9 |  */
10 | private[crawler] object IOUtil {
11 | 
12 |   /**
13 |    * Reads a whole stream as text, line by line.
14 |    *
15 |    * Every line read is followed by "\n" in the result, so line terminators are
16 |    * normalised and non-empty content always ends with a newline. The stream is
17 |    * closed before returning, whether or not reading succeeded.
18 |    *
19 |    * @param inputStream stream to drain; closed by this call
20 |    * @param encode      name of the charset used to decode the bytes
21 |    * @return the decoded text, or "" for an empty stream
22 |    */
23 |   def toString(inputStream: InputStream, encode: String): String = {
24 |     val reader = new BufferedReader(new InputStreamReader(inputStream, encode))
25 |     try {
26 |       val buffer = new StringBuilder(1000)
27 |       var line = reader.readLine()
28 |       while (line != null) {
29 |         buffer.append(line).append('\n')
30 |         line = reader.readLine()
31 |       }
32 |       buffer.toString()
33 |     } finally {
34 |       // Closing the reader also closes the wrapped stream.
35 |       reader.close()
36 |     }
37 |   }
38 | }
39 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/actor/ActorSelectorExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.actor
2 |
3 | import akka.actor.{Props, ActorSystem}
4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
5 |
6 | /**
7 |  * Example: creates an actor named "print", messages it through its ActorRef,
8 |  * then messages it again through an actor selection on its path.
9 |  *
10 |  * Created by young.yang on 2016/9/8.
11 |  */
12 | object ActorSelectorExample {
13 | 
14 |   def main(args: Array[String]): Unit = {
15 |     val appName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName)
16 |     val system = ActorSystem(appName)
17 |     val actor = system.actorOf(Props[ActorExample], "print")
18 |     actor ! "test"
19 |     println(actor)
20 |     // Build the path from the configured system name rather than a hard-coded one,
21 |     // so the selection still resolves when the configured name changes.
22 |     val actor2 = system.actorSelection("akka://" + appName + "/user/print")
23 |     actor2 ! "222"
24 |   }
25 | }
26 |
--------------------------------------------------------------------------------
/young-crawler-searcher/conf/routes:
--------------------------------------------------------------------------------
1 | # Routes
2 | # This file defines all application routes (Higher priority routes first)
3 | # ~~~~
4 |
5 | # An example controller showing a sample home page
6 | GET / controllers.HomeController.index
7 | # An example controller showing how to use dependency injection
8 | GET /count controllers.CountController.count
9 | # An example controller showing how to write asynchronous code
10 | GET /message controllers.AsyncController.message
11 |
12 | # Map static resources from the /public folder to the /assets URL path
13 | GET /assets/*file controllers.Assets.versioned(path="/public", file: Asset)
14 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/HomeController.scala:
--------------------------------------------------------------------------------
1 | package controllers
2 |
3 | import javax.inject._
4 | import play.api._
5 | import play.api.mvc._
6 |
7 | /**
8 |  * Controller that exposes an `Action` serving the application's home page.
9 |  */
10 | @Singleton
11 | class HomeController @Inject() extends Controller {
12 | 
13 |   /**
14 |    * Action rendering an HTML page with a welcome message. Per the
15 |    * `routes` file it handles `GET` requests for the path `/`.
16 |    */
17 |   def index = Action {
18 |     val greeting = "Your new application is ready."
19 |     Ok(views.html.index(greeting))
20 |   }
21 | 
22 | }
23 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/indexer/Indexer.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.indexer
2 |
3 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants}
4 | import com.young.crawler.entity.{HttpPage, IndexResult}
5 |
6 | /**
7 |  * Indexing interface.
8 |  *
9 |  * Created by dell on 2016/8/29.
10 |  */
11 | trait Indexer {
12 |   /**
13 |    * Indexes one document.
14 |    * @param page the page to index
15 |    * @return the outcome as an IndexResult
16 |    */
17 |   def index(page: HttpPage): IndexResult
18 | }
19 |
20 | /**
21 |  * Names and types used in ES (presumably Elasticsearch), read from the crawler
22 |  * configuration.
23 |  */
24 | object IndexerConstants {
25 |   // Index name, looked up under the young_crawler_indexer_es_name config key.
26 |   val indexName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_name)
27 |   // Index type, looked up under the young_crawler_indexer_es_type config key.
28 |   val indexType = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_type)
29 | }
30 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/support/MapCache.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.cache.support
2 |
3 | import com.young.crawler.cache.Cache
4 |
5 | import scala.collection.immutable.Nil
6 | import scala.collection.mutable
7 |
8 | /**
9 |  * Cache implementation backed by a local mutable HashMap.
10 |  *
11 |  * Created by dell on 2016/9/2.
12 |  */
13 | private[crawler] class MapCache[KEY, VALUE] extends Cache[KEY, VALUE] {
14 | 
15 |   // Backing store; every operation delegates to it.
16 |   private val store = mutable.HashMap.empty[KEY, VALUE]
17 | 
18 |   override def contains(key: KEY): Boolean = store.contains(key)
19 | 
20 |   override def get(key: KEY): Option[VALUE] = store.get(key)
21 | 
22 |   // Adds the entry, replacing any value already stored under the key.
23 |   override def put(key: KEY, value: VALUE): Unit = {
24 |     store.update(key, value)
25 |   }
26 | 
27 |   override def size(): Int = store.size
28 | 
29 |   override def keys(): scala.collection.Set[KEY] = store.keySet
30 | }
31 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/CountController.scala:
--------------------------------------------------------------------------------
1 | package controllers
2 |
3 | import javax.inject._
4 | import play.api._
5 | import play.api.mvc._
6 |
7 | import services.Counter
8 |
9 | /**
10 |  * Controller showing dependency injection of a component: the
11 |  * [[Counter]] it uses is supplied by the Guice dependency injection
12 |  * system, and its `Action` shows an incrementing count to users.
13 |  */
14 | @Singleton
15 | class CountController @Inject() (counter: Counter) extends Controller {
16 | 
17 |   /**
18 |    * Action answering with the [[Counter]]'s current count as plain
19 |    * text. The `routes` config file maps `GET /count` to it.
20 |    */
21 |   def count = Action {
22 |     val current = counter.nextCount()
23 |     Ok(current.toString)
24 |   }
25 | 
26 | }
27 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/CounterEntity.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.entity
2 |
3 | /**
4 |  * Counter messages. The hierarchy is sealed, so every variant is declared in this file.
5 |  *
6 |  * Created by young.yang on 2016/9/3.
7 |  */
8 | sealed trait Counter
9 | 
10 | // Variants carrying a number, one per kind of count.
11 | case class FetchCounter(num: Int) extends Counter
12 | case class FetchOk(num: Int) extends Counter
13 | case class FetchError(num: Int) extends Counter
14 | case class InjectCounter(num: Int) extends Counter
15 | case class ParseCounter(num: Int) extends Counter
16 | case class ParseChildUrlCounter(num: Int) extends Counter
17 | case class IndexCounter(num: Int) extends Counter
18 | 
19 | // Variants without a payload.
20 | case object PrintCounter extends Counter
21 | case object GetAllCounter extends Counter
22 | 
23 | // Bundle holding one value of each numbered variant.
24 | case class AllCounter(
25 |   fetchCounter: FetchCounter,
26 |   fetchOk: FetchOk,
27 |   fetchError: FetchError,
28 |   injectCounter: InjectCounter,
29 |   parseCounter: ParseCounter,
30 |   parseChildUrlCounter: ParseChildUrlCounter,
31 |   indexCounter: IndexCounter
32 | ) extends Counter
33 | 
34 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/parser/JsoupExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.parser
2 |
3 | import com.young.crawler.entity.{SeedType, UrlInfo}
4 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
5 | import com.young.crawler.spider.parser.support.JsoupParser
6 |
7 | /**
8 |  * Manual check: fetches a page with HttpClientFetcher and parses it with JsoupParser.
9 |  *
10 |  * Created by dell on 2016/9/1.
11 |  */
12 | object JsoupExample {
13 | 
14 |   /**
15 |    * Fetches `url` twice, printing both results, then parses the first result
16 |    * and prints its keywords, its description and the first element of childLink.
17 |    */
18 |   def parserHtml(url: UrlInfo): Unit = {
19 |     val fetcher = new HttpClientFetcher
20 |     val parser = new JsoupParser
21 |     val firstFetch = fetcher.fetchPage(url)
22 |     println(firstFetch)
23 |     val secondFetch = fetcher.fetchPage(url)
24 |     println(secondFetch)
25 |     val parsed = parser.parse(firstFetch.get)
26 |     println(parsed.keywords)
27 |     println(parsed.desc)
28 |     val links = parsed.childLink._1
29 |     links.foreach(link => println(link))
30 |   }
31 | 
32 |   def main(args: Array[String]): Unit = {
33 |     val target = UrlInfo("http://bj.fang.com/", "", SeedType, 0)
34 |     JsoupExample.parserHtml(target)
35 |   }
36 | }
37 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/services/Counter.scala:
--------------------------------------------------------------------------------
1 | package services
2 |
3 | import java.util.concurrent.atomic.AtomicInteger
4 | import javax.inject._
5 |
6 | /**
7 |  * Component contract used to show injection into a controller:
8 |  * a counter handing out an incremented number on each call.
9 |  * Implementations supply the counting strategy.
10 |  */
11 | trait Counter {
12 |   def nextCount(): Int
13 | }
14 |
15 | /**
16 |  * Concrete [[Counter]] backed by an AtomicInteger. It is configured
17 |  * for Guice dependency injection in the [[Module]] class.
18 |  *
19 |  * The `Singleton` annotation keeps a single counter per
20 |  * application; without it every injection of a [[Counter]] would
21 |  * receive a new instance.
22 |  */
23 | @Singleton
24 | class AtomicCounter extends Counter {
25 |   private val issued = new AtomicInteger()
26 | 
27 |   // Returns the value held before the increment, so the first call yields 0.
28 |   override def nextCount(): Int = {
29 |     val current = issued.getAndIncrement()
30 |     current
31 |   }
32 | }
33 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/views/main.scala.html:
--------------------------------------------------------------------------------
1 | @*
2 | * This template is called from the `index` template. This template
3 | * handles the rendering of the page header and body tags. It takes
4 | * two arguments, a `String` for the title of the page and an `Html`
5 | * object to insert into the body of the page.
6 | *@
7 | @(title: String)(content: Html)
8 |
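9 | <!DOCTYPE html>
10 | <html lang="en">
11 |     <head>
12 |         @* Here's where we render the page title `String`. *@
13 |         <title>@title</title>
14 |         <link rel="stylesheet" media="screen" href="@routes.Assets.versioned("stylesheets/main.css")">
15 |         <link rel="shortcut icon" type="image/png" href="@routes.Assets.versioned("images/favicon.png")">
16 |         <script src="@routes.Assets.versioned("javascripts/hello.js")" type="text/javascript"></script>
17 |     </head>
18 |     <body>
19 |         @* And here's where we render the `Html` object containing
20 |          * the page content. *@
21 |         @content
22 |     </body>
23 | </html>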
24 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/IndexActorTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task.support.actor
2 |
3 | import akka.actor.Actor
4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
5 | import com.young.crawler.entity.{IndexCounter, HttpPage}
6 | import com.young.crawler.spider.indexer.Indexer
7 | import com.young.crawler.spider.task.IndexTask
8 |
9 | /**
10 |  * Created by dell on 2016/8/29.
11 |  * Index task: indexes every HttpPage it receives and reports each one to
12 |  * the counter actor.
13 |  *
14 |  * @param indexer indexer that every received page is handed to
15 |  */
16 | private[crawler] class IndexActorTask(indexer: Indexer) extends Actor with IndexTask {
17 |
18 |   // Counter actor, addressed as akka://<appName>/user/<counter actor name>;
19 |   // both names are read from the crawler configuration.
20 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
21 |
22 |   // The stray `context.system.actorSelection("")` statement that used to
23 |   // follow was removed: its result was discarded, so it had no effect.
24 |
25 |   override def receive: Receive = {
26 |     case page: HttpPage =>
27 |       indexer.index(page)
28 |       countActor ! IndexCounter(1)
29 |   }
30 | }
31 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfig.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.config
2 |
3 | import java.util.{Locale, ResourceBundle}
4 |
5 | import org.apache.commons.logging.LogFactory
6 |
7 | /**
8 |  * Created by young.yang on 2016/9/3.
9 |  * Crawler configuration, read from the `crawler` resource bundle.
10 |  */
11 | private[crawler] object CrawlerConfig {
12 |
13 |   private val log = LogFactory.getLog("com.young.crawler.config.CrawlerConfig")
14 |
15 |   private val config = ResourceBundle.getBundle("crawler", Locale.getDefault)
16 |
17 |   // Log every entry once, as part of object initialization. This replaces
18 |   // the former unsynchronized `init_flag` check inside getConfig, which
19 |   // several callers could pass at the same time and so log repeatedly.
20 |   init()
21 |
22 |   /** Logs every key/value pair of the loaded bundle. */
23 |   private def init(): Unit = {
24 |     log.info("init crawler config start")
25 |     val iterator = config.keySet().iterator()
26 |     while (iterator.hasNext) {
27 |       val key = iterator.next()
28 |       log.info("crawler config key = [" + key + "] value = [" + config.getString(key) + "]")
29 |     }
30 |     log.info("init crawler config end")
31 |   }
32 |
33 |   /** @return the loaded `crawler` resource bundle */
34 |   def getConfig: ResourceBundle = config
35 |
36 | }
37 |
38 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/Fetcher.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.fetcher
2 |
3 | import com.young.crawler.entity.{UrlInfo, HttpResult}
4 | import com.young.crawler.exception.FetchException
5 | import org.apache.http.Header
6 |
7 | /**
8 |  * Created by young.yang on 2016/8/28.
9 |  * Fetching interface.
10 |  */
11 | trait Fetcher {
12 |
13 |   private val CONTENT_TYPE = "Content-Type"
14 |
15 |   private val CHARSET_PREFIX = "charset="
16 |
17 |   private val DEFAULT_ENCODE = "utf-8"
18 |
19 |   val FETCH_SUCCESS = 200
20 |
21 |   val URL_NOT_FOUND = 404
22 |
23 |   /**
24 |    * Entry point for fetching a page.
25 |    *
26 |    * @param url descriptor of the page to fetch
27 |    * @return the fetched result, if any
28 |    */
29 |   @throws[FetchException]
30 |   def fetchPage(url: UrlInfo): Option[HttpResult]
31 |
32 |   /**
33 |    * Detects the page encoding from the response headers.
34 |    *
35 |    * Looks for a `charset=` parameter in the first `Content-Type` header
36 |    * that has one. The header name and the parameter name are matched
37 |    * case-insensitively, the value is split on `;` so that additional
38 |    * parameters do not hide the charset, and surrounding whitespace or
39 |    * double quotes are removed from the charset value.
40 |    *
41 |    * @param headers response headers; may be null or contain null entries
42 |    * @return the declared charset, or "utf-8" when none is declared
43 |    */
44 |   def getEncode(headers: Array[Header]): String = {
45 |     if (headers == null) return DEFAULT_ENCODE
46 |     for (header <- headers if header != null && CONTENT_TYPE.equalsIgnoreCase(header.getName)) {
47 |       val value = header.getValue
48 |       if (value != null) {
49 |         for (param <- value.split(";").map(_.trim)) {
50 |           // regionMatches(ignoreCase = true, ...) avoids locale-dependent lower-casing.
51 |           if (param.regionMatches(true, 0, CHARSET_PREFIX, 0, CHARSET_PREFIX.length)) {
52 |             val charset = param.substring(CHARSET_PREFIX.length).trim.stripPrefix("\"").stripSuffix("\"").trim
53 |             if (charset.nonEmpty) return charset
54 |           }
55 |         }
56 |       }
57 |     }
58 |     DEFAULT_ENCODE
59 |   }
60 | }
61 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/Module.scala:
--------------------------------------------------------------------------------
1 | import com.google.inject.AbstractModule
2 | import java.time.Clock
3 |
4 | import services.{ApplicationTimer, AtomicCounter, Counter}
5 |
6 | /**
7 |  * Guice module describing how the application's injectable types are
8 |  * bound. It is created when the Play application starts.
9 |  *
10 |  * Play automatically picks up any class named `Module` in the root
11 |  * package. Modules in other locations can be registered through
12 |  * `play.modules.enabled` settings in the `application.conf`
13 |  * configuration file.
14 |  */
15 | class Module extends AbstractModule {
16 |
17 |   override def configure(): Unit = {
18 |     // Counter is implemented by AtomicCounter.
19 |     bind(classOf[Counter]).to(classOf[AtomicCounter])
20 |
21 |     // Clock defaults to the system clock.
22 |     bind(classOf[Clock]).toInstance(Clock.systemDefaultZone)
23 |
24 |     // ApplicationTimer is instantiated eagerly, when the application starts.
25 |     bind(classOf[ApplicationTimer]).asEagerSingleton()
26 |   }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/Filters.scala:
--------------------------------------------------------------------------------
1 | import javax.inject._
2 | import play.api._
3 | import play.api.http.HttpFilters
4 | import play.api.mvc._
5 |
6 | import filters.ExampleFilter
7 |
8 | /**
9 |  * Configures the filters that run on every request; Play queries this
10 |  * class to obtain the list of filters.
11 |  *
12 |  * Play automatically uses filters from any class called `Filters`
13 |  * placed in the root package. A different class can be used by adding
14 |  * a `play.http.filters` setting to the `application.conf`
15 |  * configuration file.
16 |  *
17 |  * @param env basic environment settings for the current application.
18 |  * @param exampleFilter demonstration filter that adds a header to
19 |  *                      each response.
20 |  */
21 | @Singleton
22 | class Filters @Inject() (
23 |     env: Environment,
24 |     exampleFilter: ExampleFilter) extends HttpFilters {
25 |
26 |   // Only development mode gets the example filter; production and test
27 |   // mode run without any filters.
28 |   override val filters = env.mode match {
29 |     case Mode.Dev => Seq(exampleFilter)
30 |     case _ => Seq.empty
31 |   }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/indexer/Elastic4sExample.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.indexer
2 |
3 | import java.net.InetAddress
4 |
5 | import com.young.crawler.entity.PageIndexEntity
6 | import com.young.crawler.spider.indexer.IndexerConstants
7 | import com.young.crawler.utils.{JsonUtil, MD5Util}
8 | import org.elasticsearch.client.transport.TransportClient
9 | import org.elasticsearch.common.transport.InetSocketTransportAddress
10 |
11 | /**
12 |  * Created by young.yang on 2016/8/30.
13 |  * Manual example: indexes one hand-built page into Elasticsearch through
14 |  * the transport client.
15 |  */
16 | object Elastic4sExample {
17 |
18 |   // NOTE(review): host and port are hard-coded here although
19 |   // crawler.properties defines young.crawler.indexer.es.host/port.
20 |   val client = TransportClient.builder().build().addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName("115.29.47.216"), 9300))
21 |
22 |   /** Builds a sample page and indexes it under the MD5 of its URL. */
23 |   def main(args: Array[String]): Unit = {
24 |     try {
25 |       val page = new PageIndexEntity
26 |       page.setAuthor("杨勇")
27 |       page.setContent("中华人民共和過")
28 |       page.setTitle("测试")
29 |       page.setUrl("http://www.baidu.com/1")
30 |       page.setPublishTime(System.currentTimeMillis())
31 |       page.setUpdateTime(System.currentTimeMillis())
32 |       client.prepareIndex(IndexerConstants.indexName, IndexerConstants.indexType).setId(MD5Util.md5(page.getUrl)).setSource(JsonUtil.toJson(page)).get()
33 |     } finally {
34 |       // The original never closed the client; close it even when the
35 |       // index request fails.
36 |       client.close()
37 |     }
38 |   }
39 | }
40 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/filters/ExampleFilter.scala:
--------------------------------------------------------------------------------
1 | package filters
2 |
3 | import akka.stream.Materializer
4 | import javax.inject._
5 | import play.api.mvc._
6 | import scala.concurrent.{ExecutionContext, Future}
7 |
8 | /**
9 |  * A simple filter that adds the `X-ExampleFilter` header to every
10 |  * result. The [[Filters]] class adds it to the application's list of
11 |  * filters.
12 |  *
13 |  * @param mat needed to handle streaming of requests and responses.
14 |  * @param exec execution context on which the `map` over the result
15 |  *             future below runs.
16 |  */
17 | @Singleton
18 | class ExampleFilter @Inject()(
19 |     implicit override val mat: Materializer,
20 |     exec: ExecutionContext) extends Filter {
21 |
22 |   override def apply(nextFilter: RequestHeader => Future[Result])
23 |                     (requestHeader: RequestHeader): Future[Result] = {
24 |     // Invoke the rest of the chain first (the remaining filters and,
25 |     // eventually, the action) ...
26 |     val downstream: Future[Result] = nextFilter(requestHeader)
27 |     // ... then decorate whatever result it produces with the extra header.
28 |     downstream.map { response =>
29 |       response.withHeaders("X-ExampleFilter" -> "foo")
30 |     }
31 |   }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/young-crawler-searcher/test/ApplicationSpec.scala:
--------------------------------------------------------------------------------
1 | import org.scalatestplus.play._
2 | import play.api.test._
3 | import play.api.test.Helpers._
4 |
5 | /**
6 | * Functional tests that send fake requests through the application's
7 | * router: an unknown path, the index page and the `/count` endpoint.
8 | * Each test runs against its own application instance (OneAppPerTest).
9 | */
10 | class ApplicationSpec extends PlaySpec with OneAppPerTest {
11 |
12 | "Routes" should {
13 |
14 | "send 404 on a bad request" in {
15 | route(app, FakeRequest(GET, "/boum")).map(status(_)) mustBe Some(NOT_FOUND)
16 | }
17 |
18 | }
19 |
20 | "HomeController" should {
21 |
22 | "render the index page" in {
23 | val home = route(app, FakeRequest(GET, "/")).get
24 |
25 | status(home) mustBe OK
26 | contentType(home) mustBe Some("text/html")
27 | contentAsString(home) must include ("Your new application is ready.")
28 | }
29 |
30 | }
31 |
32 | "CountController" should {
33 |
34 | "return an increasing count" in {
35 | contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "0"
36 | contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "1"
37 | contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "2"
38 | }
39 |
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/young-crawler-searcher/README:
--------------------------------------------------------------------------------
1 | This is your new Play application
2 | =================================
3 |
4 | This file will be packaged with your application when using `activator dist`.
5 |
6 | There are several demonstration files available in this template.
7 |
8 | Controllers
9 | ===========
10 |
11 | - HomeController.scala:
12 |
13 | Shows how to handle simple HTTP requests.
14 |
15 | - AsyncController.scala:
16 |
17 | Shows how to do asynchronous programming when handling a request.
18 |
19 | - CountController.scala:
20 |
21 | Shows how to inject a component into a controller and use the component when
22 | handling requests.
23 |
24 | Components
25 | ==========
26 |
27 | - Module.scala:
28 |
29 | Shows how to use Guice to bind all the components needed by your application.
30 |
31 | - Counter.scala:
32 |
33 | An example of a component that contains state, in this case a simple counter.
34 |
35 | - ApplicationTimer.scala:
36 |
37 | An example of a component that starts when the application starts and stops
38 | when the application stops.
39 |
40 | Filters
41 | =======
42 |
43 | - Filters.scala:
44 |
45 | Creates the list of HTTP filters used by your application.
46 |
47 | - ExampleFilter.scala:
48 |
49 | A simple filter that adds a header to every response.
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfigContants.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.config
2 |
3 | /**
4 | * Created by young.yang on 2016/9/3. Property keys looked up in the crawler configuration; each value must match a key in crawler.properties exactly.
5 | */
6 | private[crawler] object CrawlerConfigContants {
7 | val young_crawler_appName = "young.crawler.appName"
8 | val young_crawler_task_inject_name = "young.crawler.task.inject.name"
9 | val young_crawler_task_fetch_name = "young.crawler.task.fetch.name"
10 | val young_crawler_task_parse_name = "young.crawler.task.parse.name"
11 | val young_crawler_task_index_name = "young.crawler.task.index.name"
12 | val young_crawler_task_count_name = "young.cralwer.task.count.name" // NOTE: "cralwer" is misspelled, but crawler.properties uses the same spelling; change both together or not at all.
13 | val young_crawler_task_seed_path = "young.crawler.task.seed.path"
14 | val young_crawler_task_parallel_int = "young.crawler.task.parallel.int"
15 | val young_crawler_fetcher_cache_imp = "young.crawler.fetcher.cache.imp"
16 | val young_crawler_fetcher_timeout = "young.crawler.fetcher.timeout"
17 | val young_crawler_fetcher_useragent = "young.crawler.fetcher.useragent"
18 | val young_crawler_indexer_es_host = "young.crawler.indexer.es.host"
19 | val young_crawler_indexer_es_port = "young.crawler.indexer.es.port"
20 | val young_crawler_indexer_es_name = "young.crawler.indexer.es.name"
21 | val young_crawler_indexer_es_type = "young.crawler.indexer.es.type"
22 | val young_cralwer_fetcher_friendtime = "young.cralwer.fetcher.friendtime" // NOTE: same "cralwer" misspelling in the key and in crawler.properties; the val name is misspelled as well.
23 | val young_crawler_fetcher_deep = "young.crawler.fetcher.deep"
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/AsyncController.scala:
--------------------------------------------------------------------------------
1 | package controllers
2 |
3 | import akka.actor.ActorSystem
4 | import javax.inject._
5 | import play.api._
6 | import play.api.mvc._
7 | import scala.concurrent.{ExecutionContext, Future, Promise}
8 | import scala.concurrent.duration._
9 |
10 | /**
11 |  * Controller demonstrating simple asynchronous request handling: its
12 |  * `Action` uses a timer to delay sending the response for 1 second.
13 |  *
14 |  *
15 |  * @param actorSystem provides the `Scheduler` used to run code after a
16 |  *                    delay.
17 |  * @param exec `ExecutionContext` on which the asynchronous code is
18 |  *             executed.
19 |  */
20 | @Singleton
21 | class AsyncController @Inject() (actorSystem: ActorSystem)(implicit exec: ExecutionContext) extends Controller {
22 |
23 |   /**
24 |    * Action that answers with a plain text message after a delay of
25 |    * 1 second.
26 |    *
27 |    * The `routes` file is expected to map `GET` requests for the path
28 |    * `/message` to this method.
29 |    */
30 |   def message = Action.async {
31 |     val delayed: Future[String] = getFutureMessage(1.second)
32 |     delayed.map(text => Ok(text))
33 |   }
34 |
35 |   // Completes with "Hi!" once the scheduler fires after `delayTime`.
36 |   private def getFutureMessage(delayTime: FiniteDuration): Future[String] = {
37 |     val pending = Promise[String]()
38 |     actorSystem.scheduler.scheduleOnce(delayTime) { pending.success("Hi!") }
39 |     pending.future
40 |   }
41 | }
42 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/resources/crawler.properties:
--------------------------------------------------------------------------------
1 | #appName akka ActorSystem name
2 | young.crawler.appName=young-crawler
3 | #inject task actor name
4 | young.crawler.task.inject.name=young-injector
5 | #fetcher task actor name
6 | young.crawler.task.fetch.name=young-fetcher
7 | #parse task actor name
8 | young.crawler.task.parse.name=young-parser
9 | #index task actor name
10 | young.crawler.task.index.name=young-indexr
11 | #counter task actor name
12 | young.cralwer.task.count.name=young-count
13 | # seed config
14 | young.crawler.task.seed.path=classpath:/seeds.txt
15 | #并行度
16 | young.crawler.task.parallel.int=5
17 | #爬取url超时时间
18 | young.crawler.fetcher.timeout=5000
19 | #网页友好访问时间
20 | young.cralwer.fetcher.friendtime=1000
21 | #爬虫爬取深度
22 | young.crawler.fetcher.deep=1
23 | #爬取网页使用的useragent
24 | young.crawler.fetcher.useragent=Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2
25 | #索引网页时候elasticsearch host
26 | young.crawler.indexer.es.host=115.29.47.216
27 | #es port
28 | young.crawler.indexer.es.port=9300
29 | #es index name
30 | young.crawler.indexer.es.name=page
31 | #es index type
32 | young.crawler.indexer.es.type=html
33 | #url排重使用的排重实现类
34 | young.crawler.fetcher.cache.imp=com.young.crawler.cache.support.RedisCache
35 | #url排重缓存时间,单位为s
36 | young.crawler.fetcher.cache.timeout=100
37 | #redis config
38 | young.crawler.fetcher.cache.redis.host=115.29.47.216
39 | young.crawler.fetcher.cache.redis.port=6379
40 | young.crawler.fetcher.cache.redis.password=
41 |
42 |
43 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/FetchActorTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task.support.actor
2 |
3 | import akka.actor.{ActorRef, Actor}
4 | import akka.event.Logging
5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
6 | import com.young.crawler.entity.{FetchError, FetchOk, FetchCounter, UrlInfo}
7 | import com.young.crawler.spider.fetcher.Fetcher
8 | import com.young.crawler.spider.task.{FetchTask, ParserTask}
9 |
10 | /**
11 |  * Created by young.yang on 2016/8/28.
12 |  * Page fetching task, implemented as an actor.
13 |  *
14 |  * @param fetcher    fetcher used to download each requested URL
15 |  * @param parserTask actor that receives every successfully fetched page
16 |  */
17 | private[crawler] class FetchActorTask(fetcher: Fetcher, parserTask: ActorRef) extends Actor with FetchTask {
18 |
19 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
20 |
21 |   private val log = Logging(context.system, this)
22 |
23 |   // Sender of the most recent UrlInfo. None until the first fetch request
24 |   // arrives, which is why it is an Option rather than a null ActorRef.
25 |   private var injector: Option[ActorRef] = None
26 |
27 |   /**
28 |    * Runs the fetcher and turns a FetchException into "nothing fetched", so
29 |    * that a failed download is still counted as FetchError.
30 |    */
31 |   private def fetchSafely(page: UrlInfo) =
32 |     try {
33 |       fetcher.fetchPage(page)
34 |     } catch {
35 |       case e: com.young.crawler.exception.FetchException =>
36 |         log.error(e, "FetcherTask failed to fetch url [" + page.url + "]")
37 |         None
38 |     }
39 |
40 |   override def receive: Receive = {
41 |     // Handle a fetch request.
42 |     case page: UrlInfo =>
43 |       injector = Some(sender())
44 |       val httpResult = fetchSafely(page)
45 |       countActor ! FetchCounter(1)
46 |       httpResult match {
47 |         case Some(result) =>
48 |           parserTask ! result
49 |           log.info("FetcherTask send parserTask a httpResult [" + httpResult + "]")
50 |           countActor ! FetchOk(1)
51 |         case None =>
52 |           countActor ! FetchError(1)
53 |       }
54 |     // Forward parsed child urls to the inject task so they get fetched too.
55 |     // The element type is erased at runtime, so match on List[_].
56 |     case urls: List[_] =>
57 |       injector match {
58 |         case Some(target) => target ! urls
59 |         case None => log.warning("FetcherTask dropped child urls: no injector known yet")
60 |       }
61 |   }
62 | }
63 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/support/RedisCache.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.cache.support
2 |
3 | import com.young.crawler.cache.Cache
4 | import com.young.crawler.config.CrawlerConfig
5 | import redis.clients.jedis.JedisPool
6 |
7 | /**
8 |  * Created by dell on 2016/9/2.
9 |  * Cache implementation backed by Redis. Keys and values are stored through
10 |  * their `toString` form and entries expire after the configured timeout.
11 |  */
12 | private[crawler] class RedisCache[KEY, VALUE] extends Cache[KEY, VALUE] {
13 |
14 |   private val JEDIS_HOST = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.host")
15 |
16 |   private val JEDIS_PORT = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.port").toInt
17 |
18 |   // NOTE(review): the password is read but never passed to the pool below.
19 |   private val JEDIS_PASS = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.password")
20 |
21 |   private val expire = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.timeout").toInt
22 |
23 |   private val jedisPool = new JedisPool(JEDIS_HOST, JEDIS_PORT)
24 |
25 |   /**
26 |    * Borrows a connection from the pool, runs `action` with it and always
27 |    * closes the connection afterwards, even when `action` throws.
28 |    */
29 |   private def withJedis[T](action: redis.clients.jedis.Jedis => T): T = {
30 |     val jedis = jedisPool.getResource
31 |     try action(jedis) finally jedis.close()
32 |   }
33 |
34 |   /** @return true when Redis holds an entry for `key.toString` */
35 |   override def contains(key: KEY): Boolean =
36 |     withJedis[Boolean](jedis => jedis.exists(key.toString))
37 |
38 |   /** @return the value stored under `key.toString`, or None when Redis returns null */
39 |   override def get(key: KEY): Option[VALUE] =
40 |     withJedis[Option[VALUE]](jedis => Option(jedis.get(key.toString).asInstanceOf[VALUE]))
41 |
42 |   /** Stores `value.toString` under `key.toString` with the configured expiry. */
43 |   override def put(key: KEY, value: VALUE): Unit =
44 |     withJedis[Unit] { jedis => jedis.setex(key.toString, expire, value.toString) }
45 |
46 |   // Always reports 0; the size is not tracked by this implementation.
47 |   override def size(): Int = 0
48 |
49 |   override def keys(): Set[KEY] = throw new Exception("unsupport operation")
50 | }
51 |
--------------------------------------------------------------------------------
/young-crawler-searcher/conf/logback.xml:
--------------------------------------------------------------------------------
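1 | <!-- https://www.playframework.com/documentation/latest/SettingsLogger -->
2 | <configuration>
3 |
4 |   <conversionRule conversionWord="coloredLevel" converterClass="play.api.libs.logback.ColoredLevel" />
5 |
6 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
7 |     <file>${application.home:-.}/logs/application.log</file>
8 |     <encoder>
9 |       <pattern>%date [%level] from %logger in %thread - %message%n%xException</pattern>
10 |     </encoder>
11 |   </appender>
12 |
13 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
14 |     <encoder>
15 |       <pattern>%coloredLevel %logger{15} - %message%n%xException{10}</pattern>
16 |     </encoder>
17 |   </appender>
18 |
19 |   <appender name="ASYNCFILE" class="ch.qos.logback.classic.AsyncAppender">
20 |     <appender-ref ref="FILE" />
21 |   </appender>
22 |
23 |   <appender name="ASYNCSTDOUT" class="ch.qos.logback.classic.AsyncAppender">
24 |     <appender-ref ref="STDOUT" />
25 |   </appender>
26 |
27 |   <logger name="play" level="INFO" />
28 |   <logger name="application" level="DEBUG" />
29 |
30 |   <!-- Off these ones as they are annoying, and anyway we manage configuration ourselves -->
31 |   <logger name="com.avaje.ebean.config.PropertyMapLoader" level="OFF" />
32 |   <logger name="com.avaje.ebeaninternal.server.core.XmlConfigLoader" level="OFF" />
33 |   <logger name="com.avaje.ebeaninternal.server.lib.BackgroundThread" level="OFF" />
34 |   <logger name="com.gargoylesoftware.htmlunit.javascript" level="OFF" />
35 |
36 |   <root level="WARN">
37 |     <appender-ref ref="ASYNCFILE" />
38 |     <appender-ref ref="ASYNCSTDOUT" />
39 |   </root>
40 |
41 | </configuration>
42 |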
--------------------------------------------------------------------------------
/young-crawler-searcher/app/services/ApplicationTimer.scala:
--------------------------------------------------------------------------------
1 | package services
2 |
3 | import java.time.{Clock, Instant}
4 | import javax.inject._
5 | import play.api.Logger
6 | import play.api.inject.ApplicationLifecycle
7 | import scala.concurrent.Future
8 |
9 | /**
10 |  * Demonstrates how to run code when the application starts and stops:
11 |  * it records the start time on construction and, when the application
12 |  * stops, logs how long the application was running.
13 |  *
14 |  * The class is registered for Guice dependency injection in the
15 |  * [[Module]] class as an "eager singleton", so that it is created when
16 |  * the application starts.
17 |  *
18 |  * @param clock        source of the start and stop instants
19 |  * @param appLifecycle the application's [[ApplicationLifecycle]], on
20 |  *                     which the stop hook below is registered
21 |  *
22 |  */
23 | @Singleton
24 | class ApplicationTimer @Inject() (clock: Clock, appLifecycle: ApplicationLifecycle) {
25 |
26 |   // This code is called when the application starts.
27 |   private val start: Instant = clock.instant
28 |   Logger.info(s"ApplicationTimer demo: Starting application at $start.")
29 |
30 |   // Stop hook: runs when the application stops. The logged stop time is
31 |   // the same instant the running time is computed from (the original
32 |   // read the clock a second time for the log message).
33 |   appLifecycle.addStopHook { () =>
34 |     val stop: Instant = clock.instant
35 |     val runningTime: Long = stop.getEpochSecond - start.getEpochSecond
36 |     Logger.info(s"ApplicationTimer demo: Stopping application at $stop after ${runningTime}s.")
37 |     Future.successful(())
38 |   }
39 | }
40 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/HttpEntitys.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.entity
2 |
3 | import java.net.{DatagramSocket, DatagramPacket}
4 |
5 | import scala.beans.BeanProperty
6 |
7 | /**
8 |  * Created by young.yang on 2016/8/28.
9 |  * Raw HTTP page as returned by a fetch.
10 |  */
11 | case class HttpResult(status: Int, content: String, message: String, url: String, deep: Int) {
12 |   override def toString() = s"status=$status,context length=${content.length},url=$url"
13 | }
14 |
15 |
16 | sealed trait UrlType // Kind of a UrlInfo entry.
17 | case object SeedType extends UrlType // Used by InjectActorTask for URLs read from the seed list.
18 | case object GenerateType extends UrlType // Used by JsoupParser for child links extracted from a parsed page.
19 | /**
20 |  * Descriptor of a URL to crawl.
21 |  * @param url     the URL itself
22 |  * @param parent  the parent URL
23 |  * @param urlType kind of URL (see UrlType)
24 |  * @param deep    depth value carried along with the URL
25 |  */
26 | case class UrlInfo(url: String, parent: String, urlType: UrlType, deep: Int) { override def toString() = s"$url\n" }
27 |
28 | /**
29 | * Result of an indexing operation.
30 | * @param status status value of the operation (its meaning is not visible in this file; TODO confirm)
31 | */
32 | case class IndexResult(status:Int)
33 |
34 | /**
35 |  * A crawl seed.
36 |  * @param url the seed URL
37 |  */
38 | case class Seed(url: String) {
39 |   override def toString() = s"$url\n"
40 | }
41 |
42 | /**
43 |  * Information extracted from a parsed HTTP page. Every field is a
44 |  * mutable bean property with an empty default value.
45 |  */
46 | class HttpPage {
47 |   // URL of the page.
48 |   @BeanProperty var url: String = ""
49 |   // Page title.
50 |   @BeanProperty var title: String = ""
51 |   // Raw HTML of the page.
52 |   @BeanProperty var html: String = ""
53 |   // Text content of the page.
54 |   @BeanProperty var content: String = ""
55 |   // Publish and update timestamps.
56 |   @BeanProperty var publishTime: Long = 0
57 |   @BeanProperty var updateTime: Long = 0
58 |   // Author of the page.
59 |   @BeanProperty var author: String = ""
60 |   // Keywords and description of the page.
61 |   @BeanProperty var keywords: String = ""
62 |   @BeanProperty var desc: String = ""
63 |   // Child links of the page, paired with an Int.
64 |   @BeanProperty var childLink: (List[UrlInfo], Int) = (List(), 0)
65 |   // Additional key/value metadata.
66 |   @BeanProperty var meta: Map[String, String] = Map()
67 |
68 |   // Short form: the URL plus the length of the text content.
69 |   override def toString() = s"url=$url,context length=${content.length}"
70 |
71 | }
72 |
73 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/ParseActorTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task.support.actor
2 |
3 | import akka.actor.{ActorRef, Actor}
4 | import akka.event.Logging
5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
6 | import com.young.crawler.entity.{ParseChildUrlCounter, ParseCounter, HttpPage, HttpResult}
7 | import com.young.crawler.spider.parser.Parser
8 | import com.young.crawler.spider.task.ParserTask
9 |
10 | /**
11 | * Created by young.yang on 2016/8/28.
12 | * 解析任务
13 | */
14 | private[crawler] class ParseActorTask(parser: Parser, indexTask: ActorRef) extends Actor with ParserTask {
15 |
16 | private val log = Logging(context.system, this)
17 |
18 | private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
19 |
20 | private val fetchDeep = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_deep).toInt
21 |
22 | private var fetcher: ActorRef = null
23 |
24 | override def receive: Receive = {
25 | case httpResult: HttpResult =>
26 | fetcher = sender()
27 | val page: HttpPage = parser.parse(httpResult)
28 | indexTask ! page
29 | countActor ! ParseCounter(1)
30 | log.info("ParserTask send IndexerTask a index request -[" + page + "]")
31 | val childLinks = page.getChildLink
32 | if(childLinks._2 throw new FetchException("fetch error message error url is " + url, e)
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/JsoupParser.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.parser.support
2 |
3 | import com.young.crawler.entity.{GenerateType, HttpPage, HttpResult, UrlInfo}
4 | import com.young.crawler.spider.parser.Parser
5 | import org.jsoup.Jsoup
6 | import org.jsoup.select.Elements
7 |
8 | import scala.collection.mutable.ListBuffer
9 |
10 | /**
11 |  * Created by young.yang on 2016/8/31.
12 |  * Parser implementation based on Jsoup.
13 |  */
14 | private[crawler] class JsoupParser extends Parser {
15 |
16 |   private val KEYWORDS = "keywords"
17 |
18 |   private val DESCRIPTION = "description"
19 |
20 |   /**
21 |    * Looks up a `meta` element by its `name` attribute.
22 |    *
23 |    * @param key  lower-case name to look for
24 |    * @param meta the document's `meta` elements
25 |    * @return the `content` attribute of the first match, or "" if none matches
26 |    */
27 |   private def getMeta(key: String, meta: Elements): String = {
28 |     (0 until meta.size()).iterator
29 |       .map(i => meta.get(i))
30 |       .find(element => key.equals(element.attr("name").toLowerCase))
31 |       .map(_.attr("content"))
32 |       .getOrElse("")
33 |   }
34 |
35 |   /**
36 |    * Collects the child links of a page.
37 |    *
38 |    * Each href is resolved with `absUrl`, so relative links become absolute
39 |    * URLs (against the document's base URI) instead of being discarded by
40 |    * the "http" prefix check as they were before.
41 |    *
42 |    * @param urls the anchor elements of the page
43 |    * @param deep depth of the page the anchors were found on
44 |    * @return the http(s) child links, each at depth `deep + 1`, paired with
45 |    *         that depth
46 |    */
47 |   private def parserUrls(urls: Elements, deep: Int): (List[UrlInfo], Int) = {
48 |     val childDeep = deep + 1
49 |     val links = for {
50 |       i <- (0 until urls.size()).toList
51 |       url = urls.get(i).absUrl("href")
52 |       if url.startsWith("http")
53 |     } yield UrlInfo(url, "", GenerateType, childDeep)
54 |     (links, childDeep)
55 |   }
56 |
57 |   /**
58 |    * Parses a fetched page into an HttpPage.
59 |    *
60 |    * @param html the fetched page; its url is used as the base URI for
61 |    *             resolving relative links
62 |    * @return the page with title, text content, timestamps, url, keywords,
63 |    *         description and child links filled in
64 |    */
65 |   override def parse(html: HttpResult): HttpPage = {
66 |     val htmlPage = new HttpPage
67 |     // A null url falls back to an empty base URI.
68 |     val baseUri = if (html.url == null) "" else html.url
69 |     val document = Jsoup.parse(html.content, baseUri)
70 |     val meta = document.select("meta")
71 |     htmlPage.setTitle(document.title())
72 |     htmlPage.setContent(document.text())
73 |     // htmlPage.setHtml(html.content)
74 |     htmlPage.setPublishTime(System.currentTimeMillis())
75 |     htmlPage.setUpdateTime(System.currentTimeMillis())
76 |     htmlPage.setUrl(html.url)
77 |     htmlPage.setKeywords(getMeta(KEYWORDS, meta))
78 |     htmlPage.setDesc(getMeta(DESCRIPTION, meta))
79 |     htmlPage.setChildLink(parserUrls(document.body().select("a"), html.deep))
80 |     htmlPage
81 |   }
82 | }
83 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/InjectActorTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task.support.actor
2 |
3 | import akka.actor.{ActorRef, Actor, Props}
4 | import akka.event.Logging
5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
6 | import com.young.crawler.entity._
7 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
8 | import com.young.crawler.spider.indexer.support.ElasticIndexer
9 | import com.young.crawler.spider.parser.support.{JsoupParser, HtmlParseParser}
10 | import com.young.crawler.spider.task.InjectTask
11 |
12 | import scala.io.Source
13 |
14 | /**
15 |  * Created by dell on 2016/8/29.
16 |  * Seed injection task: URLs that should be crawled are injected through
17 |  * this actor, which forwards them to the fetcher.
18 |  *
19 |  * @param fetcher actor that receives every URL to fetch
20 |  */
21 | private[crawler] class InjectActorTask(fetcher: ActorRef) extends Actor with InjectTask {
22 |   private val log = Logging(context.system, this)
23 |
24 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
25 |
26 |   private val CLASSPATH_PREFIX = "classpath:"
27 |
28 |   override def receive: Receive = {
29 |     // Initial injection from the seed list.
30 |     case init: InitSeed =>
31 |       val seeds = initSeeds(init.seedPath, init.fileEncode)
32 |       log.info("init seeds -" + seeds)
33 |       seeds.foreach { seed =>
34 |         fetcher ! UrlInfo(seed.url, null, SeedType, 0)
35 |         countActor ! InjectCounter(1)
36 |       }
37 |     // Injection of child urls. The element type is erased at runtime, so
38 |     // match on List[_] and keep only the UrlInfo entries.
39 |     case urls: List[_] =>
40 |       log.info("inject urls -" + urls)
41 |       urls.foreach {
42 |         case seed: UrlInfo if seed.url.startsWith("http") =>
43 |           fetcher ! seed
44 |           countActor ! InjectCounter(1)
45 |         case _ =>
46 |       }
47 |   }
48 |
49 |   /**
50 |    * Reads the seed list, one seed per non-blank line.
51 |    *
52 |    * @param seedPath   file path, or `classpath:<resource>` for a classpath
53 |    *                   resource; null or blank falls back to the configured
54 |    *                   seed path (assumed to be the intended default, since
55 |    *                   the original sent these values to the classpath branch)
56 |    * @param fileEncode encoding used to read the seed list
57 |    * @return the seeds, in file order
58 |    */
59 |   override def initSeeds(seedPath: String, fileEncode: String = "utf-8"): List[Seed] = {
60 |     log.info("seedpath = ["+seedPath+"] encoding = ["+fileEncode+"]")
61 |     val path =
62 |       if (seedPath == null || seedPath.trim.isEmpty)
63 |         CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_seed_path)
64 |       else seedPath.trim
65 |     val source =
66 |       if (path.startsWith(CLASSPATH_PREFIX)) {
67 |         // Everything after the prefix is the resource name.
68 |         val resource = path.substring(CLASSPATH_PREFIX.length)
69 |         log.info("classpath seedpath = ["+resource+"]")
70 |         val stream = classOf[InjectTask].getResourceAsStream(resource)
71 |         if (stream == null)
72 |           throw new java.io.FileNotFoundException("seed resource not found on classpath: " + resource)
73 |         Source.fromInputStream(stream, fileEncode)
74 |       } else
75 |         Source.fromFile(path, fileEncode)
76 |     // toList forces the read before the source is closed.
77 |     try source.getLines().map(_.trim).filter(_.nonEmpty).map(line => Seed(line)).toList
78 |     finally source.close()
79 |   }
80 | }
81 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/CounterActorTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task.support.actor
2 |
3 | import akka.actor.Actor
4 | import com.young.crawler.entity._
5 | import com.young.crawler.spider.task.CounterTask
6 |
7 | /**
8 |  * Created by young.yang on 2016/9/3.
9 |  * Keeps a running total for each kind of task counter message.
10 |  *
11 |  * Every counter message adds its num to the matching total; PrintCounter answers the sender
12 |  * with a text report and GetAllCounter answers with an AllCounter holding all totals.
13 |  */
14 | private[crawler] class CounterActorTask extends Actor with CounterTask {
15 |
16 |   // Running totals, all starting at 0.
17 |   private var fetchCounter = FetchCounter(0)
18 |   private var fetchOk = FetchOk(0)
19 |   private var fetchError = FetchError(0)
20 |   private var injectCounter = InjectCounter(0)
21 |   private var parseCounter = ParseCounter(0)
22 |   private var parseChildUrlCounter = ParseChildUrlCounter(0)
23 |   private var indexCounter = IndexCounter(0)
24 |
25 |   /** Builds the report: a start marker, one "name = [total]" line per counter, an end marker. */
26 |   private def printCounter(): String = {
27 |     val totals = Seq(
28 |       "fetchCounter" -> fetchCounter.num,
29 |       "fetchOk" -> fetchOk.num,
30 |       "fetchError" -> fetchError.num,
31 |       "injectCounter" -> injectCounter.num,
32 |       "parseCounter" -> parseCounter.num,
33 |       "parseChildUrlCounter" -> parseChildUrlCounter.num,
34 |       "indexCounter" -> indexCounter.num
35 |     )
36 |     val rows = totals.map { case (label, total) => label + " = [" + total + "]" }
37 |     ("task counter details start ------" +: rows :+ "task counter details end -------").mkString("\n")
38 |   }
39 |
40 |   /** Snapshot of all current totals. */
41 |   private def getAllCounter(): AllCounter =
42 |     AllCounter(fetchCounter, fetchOk, fetchError, injectCounter, parseCounter, parseChildUrlCounter, indexCounter)
43 |
44 |   override def receive: Receive = {
45 |     case delta: FetchCounter => fetchCounter = FetchCounter(delta.num + fetchCounter.num)
46 |     case delta: FetchOk => fetchOk = FetchOk(fetchOk.num + delta.num)
47 |     case delta: FetchError => fetchError = FetchError(fetchError.num + delta.num)
48 |     case delta: InjectCounter => injectCounter = InjectCounter(injectCounter.num + delta.num)
49 |     case delta: ParseCounter => parseCounter = ParseCounter(parseCounter.num + delta.num)
50 |     case delta: ParseChildUrlCounter => parseChildUrlCounter = ParseChildUrlCounter(parseChildUrlCounter.num + delta.num)
51 |     case delta: IndexCounter => indexCounter = IndexCounter(indexCounter.num + delta.num)
52 |     case PrintCounter => sender() ! printCounter()
53 |     case GetAllCounter => sender() ! getAllCounter()
54 |   }
55 | }
56 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/support/HttpWatch.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.fetcher.support
2 |
3 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
4 | import com.young.crawler.entity.{UrlInfo, HttpResult}
5 | import org.apache.commons.io.IOUtils
6 | import org.apache.http.annotation.NotThreadSafe
7 | import org.apache.http.client.config.RequestConfig
8 | import org.apache.http.client.methods.{HttpHead, HttpGet, HttpUriRequest}
9 | import org.apache.http.impl.client.HttpClients
10 | import org.apache.http.Header
11 |
12 | /**
13 |  * Created by young.yang on 2016/8/28.
14 |  * HTTP access helper built on a pooled Apache HttpClient.
15 |  *
16 |  * @param userAgent value of the User-Agent header set on the client
17 |  * @param timeout   socket and connect timeout passed to RequestConfig
18 |  * @param poolSize  maximum number of connections, in total and per route
19 |  */
20 | class HttpWatch(userAgent: String = "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2", timeout: Int = 10000, poolSize: Int = 100) {
21 |
22 |   private val defaultRequestConfig = RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build()
23 |
24 |   private val httpClient = HttpClients.custom().setUserAgent(userAgent).setMaxConnTotal(poolSize)
25 |     .setMaxConnPerRoute(poolSize).setDefaultRequestConfig(defaultRequestConfig).build()
26 |
27 |   /** GETs the page and wraps status code, body, reason phrase, url and crawl depth in an HttpResult. */
28 |   private def doGet(url: UrlInfo, encode: String = "utf-8"): HttpResult = {
29 |     val (statusCode, content, message) = sendRequest(new HttpGet(url.url), encode)
30 |     HttpResult(statusCode, content, message, url.url, url.deep)
31 |   }
32 |
33 |   /** Sends a HEAD request and returns all response headers. */
34 |   private def doHeader(url: String): Array[Header] = {
35 |     val response = httpClient.execute(new HttpHead(url))
36 |     // Closing the response releases its pooled connection.
37 |     try response.getAllHeaders
38 |     finally response.close()
39 |   }
40 |
41 |   /**
42 |    * Executes the request and reads the whole body with the given encoding.
43 |    *
44 |    * @return (status code, body, reason phrase); the body is "" when the response has no entity
45 |    */
46 |   private def sendRequest(request: HttpUriRequest, encode: String): (Int, String, String) = {
47 |     val response = httpClient.execute(request)
48 |     try {
49 |       val statusLine = response.getStatusLine
50 |       val entity = response.getEntity
51 |       // getEntity returns null for responses without a body.
52 |       val content = if (entity == null) "" else IOUtils.toString(entity.getContent, encode)
53 |       (statusLine.getStatusCode, content, statusLine.getReasonPhrase)
54 |     } finally response.close() // always release the pooled connection
55 |   }
56 |
57 |   /** Shuts the underlying client down together with its connection pool. */
58 |   private def close(): Unit = httpClient.close()
59 | }
60 |
61 | /**
62 |  * Static entry points for HTTP access.
63 |  *
64 |  * WATCH_TYPE selects the instance used per call: in prototype mode (the default) a new HttpWatch
65 |  * is created for the call and closed afterwards; otherwise one shared, lazily created instance
66 |  * is reused.
67 |  */
68 | @NotThreadSafe
69 | object HttpWatch {
70 |   val WATCH_TYPE_PROTOTYPE = "prototype"
71 |   val WATCH_TYPE_SINGLETON = "singleton"
72 |   var WATCH_TYPE = WATCH_TYPE_PROTOTYPE
73 |   // Shared instance; lazy so that no client is built unless singleton mode is actually used.
74 |   private lazy val httpWatch = newHttpWatch()
75 |
76 |   def get(url: UrlInfo, encode: String = "utf-8"): HttpResult = withHttpWatch(_.doGet(url, encode))
77 |
78 |   def header(url: String): Array[Header] = withHttpWatch(_.doHeader(url))
79 |
80 |   /** Builds an HttpWatch from the configured user agent and timeout. */
81 |   private def newHttpWatch(): HttpWatch =
82 |     new HttpWatch(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_useragent), CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_timeout).toInt)
83 |
84 |   /** Runs the action on the instance selected by WATCH_TYPE and closes per-call instances. */
85 |   private def withHttpWatch[T](action: HttpWatch => T): T = {
86 |     // The mode is read once, so a concurrent change of WATCH_TYPE cannot close the shared instance.
87 |     val prototype = WATCH_TYPE_PROTOTYPE.equals(WATCH_TYPE)
88 |     val watch = if (prototype) newHttpWatch() else httpWatch
89 |     try action(watch)
90 |     finally if (prototype) watch.close()
91 |   }
92 | }
93 |
--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/boot/CrawlerBoot.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.boot
2 |
3 | import akka.actor.{ActorSystem, Props}
4 | import akka.pattern.ask
5 | import akka.routing.RoundRobinPool
6 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants}
7 | import com.young.crawler.entity.{AllCounter, GetAllCounter, InitSeed, PrintCounter}
8 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
9 | import com.young.crawler.spider.indexer.support.ElasticIndexer
10 | import com.young.crawler.spider.parser.support.JsoupParser
11 | import com.young.crawler.spider.task.support.actor._
12 | import org.apache.commons.logging.LogFactory
13 |
14 | import scala.concurrent.Await
15 | import scala.concurrent.duration.Duration
16 |
17 | /**
18 |  * Created by dell on 2016/8/29.
19 |  * Main entry point of the crawler.
20 |  */
21 | object CrawlerBoot {
22 |
23 |   private val system = ActorSystem(setting(CrawlerConfigContants.young_crawler_appName))
24 |
25 |   private val log = LogFactory.getLog(CrawlerConfigContants.young_crawler_appName)
26 |
27 |   private val timeout = Duration(5, "s")
28 |
29 |   /** Reads one string value from the crawler configuration. */
30 |   private def setting(key: String): String = CrawlerConfig.getConfig.getString(key)
31 |
32 |   /** Selection of the counter actor: akka://<app name>/user/<counter task name>. */
33 |   private def counterSelection =
34 |     system.actorSelection("akka://" + setting(CrawlerConfigContants.young_crawler_appName) + "/user/" + setting(CrawlerConfigContants.young_crawler_task_count_name))
35 |
36 |   /**
37 |    * Starts the crawler: creates the index, parse, fetch, inject and counter actors in that order
38 |    * and sends the InitSeed message to the inject actor.
39 |    */
40 |   def start(): Unit = {
41 |     val initSeeds = InitSeed(setting(CrawlerConfigContants.young_crawler_task_seed_path))
42 |     // the actors of each role can be grouped into a pool of actors that share the work
43 |     val parallel = setting(CrawlerConfigContants.young_crawler_task_parallel_int).toInt
44 |     def pooled(props: Props): Props = RoundRobinPool(parallel).props(props)
45 |
46 |     val indexerActor = system.actorOf(pooled(Props(new IndexActorTask(new ElasticIndexer))), setting(CrawlerConfigContants.young_crawler_task_index_name))
47 |     log.info(s"create indexerActor name -[$indexerActor]")
48 |     val parserActor = system.actorOf(pooled(Props(new ParseActorTask(new JsoupParser, indexerActor))), setting(CrawlerConfigContants.young_crawler_task_parse_name))
49 |     log.info(s"create parserActor name -[$parserActor]")
50 |     val fetcher = system.actorOf(pooled(Props(new FetchActorTask(new HttpClientFetcher, parserActor))), setting(CrawlerConfigContants.young_crawler_task_fetch_name))
51 |     log.info(s"create fetcherActor name -[$fetcher]")
52 |     val injectActor = system.actorOf(pooled(Props(new InjectActorTask(fetcher))), setting(CrawlerConfigContants.young_crawler_task_inject_name))
53 |     log.info(s"create injectActor name -[$injectActor]")
54 |     val countActor = system.actorOf(Props[CounterActorTask], setting(CrawlerConfigContants.young_crawler_task_count_name))
55 |     log.info(s"create countActor name -[$countActor]")
56 |     injectActor ! initSeeds
57 |   }
58 |
59 |   /**
60 |    * Stops the crawler by terminating the actor system.
61 |    */
62 |   def stop(): Unit = {
63 |     system.terminate()
64 |   }
65 |
66 |   /** Asks the counter actor for its text report and blocks until the reply arrives or the timeout expires. */
67 |   def printCount(): String = {
68 |     val reply = ask(counterSelection, PrintCounter)(timeout)
69 |     Await.result(reply, timeout).asInstanceOf[String]
70 |   }
71 |
72 |   /** Asks the counter actor for all totals and blocks until the reply arrives or the timeout expires. */
73 |   def getCounter(): AllCounter = {
74 |     val reply = ask(counterSelection, GetAllCounter)(timeout)
75 |     Await.result(reply, timeout).asInstanceOf[AllCounter]
76 |   }
77 |
78 |   def main(args: Array[String]): Unit = {
79 |     CrawlerBoot.start()
80 |     // Thread.sleep(3000)
81 |     // println(CrawlerBoot.printCount())
82 |     // println(CrawlerBoot.getCounter())
83 |     // CrawlerBoot.stop()
84 |   }
85 | }
86 |
--------------------------------------------------------------------------------
/young-crawler-searcher/bin/activator.bat:
--------------------------------------------------------------------------------
1 | @REM activator launcher script
2 | @REM
3 | @REM Environment:
4 | @REM In order for Activator to work you must have Java available on the PATH
5 | @REM JAVA_HOME - location of a JDK home dir (optional if java on path)
6 | @REM CFG_OPTS - JVM options (optional)
7 | @REM Configuration:
8 | @REM activatorconfig.txt found in the .activator directory of the user profile, or in its version subdirectory
9 | @setlocal enabledelayedexpansion
10 |
11 | @echo off
12 |
13 | rem Exit code reported at :end; set before any "goto :end" so every path has a defined value.
14 | set ERROR_CODE=0
15 |
16 | set "var1=%~1"
17 | if defined var1 (
18 |   if "%var1%"=="help" (
19 |     echo.
20 |     echo Usage activator [options] [command]
21 |     echo.
22 |     echo Commands:
23 |     echo ui Start the Activator UI
24 |     echo new [name] [template-id] Create a new project with [name] using template [template-id]
25 |     echo list-templates Print all available template names
26 |     echo help Print this message
27 |     echo.
28 |     echo Options:
29 |     echo -jvm-debug [port] Turn on JVM debugging, open at the given port. Defaults to 9999 if no port given.
30 |     echo.
31 |     echo Environment variables ^(read from context^):
32 |     echo JAVA_OPTS Environment variable, if unset uses ""
33 |     echo SBT_OPTS Environment variable, if unset uses ""
34 |     echo ACTIVATOR_OPTS Environment variable, if unset uses ""
35 |     echo.
36 |     echo Please note that in order for Activator to work you must have Java available on the classpath
37 |     echo.
38 |     goto :end
39 |   )
40 | )
41 |
42 | @REM determine ACTIVATOR_HOME environment variable
43 | rem BIN_DIRECTORY is the directory of this script without the trailing backslash; ACTIVATOR_HOME is its parent.
44 | set BIN_DIRECTORY=%~dp0
45 | set BIN_DIRECTORY=%BIN_DIRECTORY:~0,-1%
46 | rem The path is quoted so that a directory name containing spaces is treated as a single item.
47 | for %%d in ("%BIN_DIRECTORY%") do set ACTIVATOR_HOME=%%~dpd
48 | set ACTIVATOR_HOME=%ACTIVATOR_HOME:~0,-1%
49 |
50 | echo ACTIVATOR_HOME=%ACTIVATOR_HOME%
51 |
52 | set APP_VERSION=1.3.10
53 | set ACTIVATOR_LAUNCH_JAR=activator-launch-%APP_VERSION%.jar
54 |
55 | rem Detect if we were double clicked, although theoretically A user could
56 | rem manually run cmd /c
57 | for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1
58 |
59 | rem FIRST we load the config file of extra options, if there is one.
60 | set "SBT_HOME=%BIN_DIRECTORY%"
61 | set "FN=%SBT_HOME%\..\conf\sbtconfig.txt"
62 | set CFG_OPTS=
63 | if exist "%FN%" (
64 |   FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO (
65 |     set DO_NOT_REUSE_ME=%%i
66 |     rem ZOMG (Part #2) WE use !! here to delay the expansion of
67 |     rem CFG_OPTS, otherwise it remains "" for this loop.
68 |     set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
69 |   )
70 | )
71 |
72 | rem THEN we load the per-user config file of extra options, if there is one. The file of this
73 | rem Activator version is read first; the generic file is used only when CFG_OPTS is still empty.
74 | set "CFG_FILE_HOME=%UserProfile%\.activator\activatorconfig.txt"
75 | set "CFG_FILE_VERSION=%UserProfile%\.activator\%APP_VERSION%\activatorconfig.txt"
76 | rem The paths are quoted because a user profile directory may contain spaces.
77 | if exist "%CFG_FILE_VERSION%" (
78 |   FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_VERSION%") DO (
79 |     set DO_NOT_REUSE_ME=%%i
80 |     rem ZOMG (Part #2) WE use !! here to delay the expansion of
81 |     rem CFG_OPTS, otherwise it remains "" for this loop.
82 |     set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
83 |   )
84 | )
85 | if "%CFG_OPTS%"=="" (
86 |   if exist "%CFG_FILE_HOME%" (
87 |     FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_HOME%") DO (
88 |       set DO_NOT_REUSE_ME=%%i
89 |       rem ZOMG (Part #2) WE use !! here to delay the expansion of
90 |       rem CFG_OPTS, otherwise it remains "" for this loop.
91 |       set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
92 |     )
93 |   )
94 | )
95 |
96 | rem We use the value of the JAVACMD environment variable if defined
97 | set _JAVACMD=%JAVACMD%
98 |
99 | if "%_JAVACMD%"=="" (
100 |   if not "%JAVA_HOME%"=="" (
101 |     if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe"
102 |
103 |     rem if there is a java home set we make sure it is the first picked up when invoking 'java'
104 |     SET "PATH=%JAVA_HOME%\bin;%PATH%"
105 |   )
106 | )
107 |
108 | if "%_JAVACMD%"=="" set _JAVACMD=java
109 |
110 | rem Detect if this java is ok to use.
111 | for /F %%j in ('"%_JAVACMD%" -version 2^>^&1') do (
112 |   if %%~j==java set JAVAINSTALLED=1
113 |   if %%~j==openjdk set JAVAINSTALLED=1
114 | )
115 |
116 | rem Detect the same thing about javac
117 | if "%_JAVACCMD%"=="" (
118 |   if not "%JAVA_HOME%"=="" (
119 |     if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe"
120 |   )
121 | )
122 | if "%_JAVACCMD%"=="" set _JAVACCMD=javac
123 | for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do (
124 |   if %%~j==javac set JAVACINSTALLED=1
125 | )
126 |
127 | rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style
128 | set JAVAOK=true
129 | if not defined JAVAINSTALLED set JAVAOK=false
130 | if not defined JAVACINSTALLED set JAVAOK=false
131 |
132 | if "%JAVAOK%"=="false" (
133 |   echo.
134 |   echo A Java JDK is not installed or can't be found.
135 |   if not "%JAVA_HOME%"=="" (
136 |     echo JAVA_HOME = "%JAVA_HOME%"
137 |   )
138 |   echo.
139 |   echo Please go to
140 |   echo http://www.oracle.com/technetwork/java/javase/downloads/index.html
141 |   echo and download a valid Java JDK and install before running Activator.
142 |   echo.
143 |   echo If you think this message is in error, please check
144 |   echo your environment variables to see if "java.exe" and "javac.exe" are
145 |   echo available via JAVA_HOME or PATH.
146 |   echo.
147 |   if defined DOUBLECLICKED pause
148 |   exit /B 1
149 | )
150 |
151 | rem Check what Java version is being used to determine what memory options to use
152 | for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do (
153 |   set JAVA_VERSION=%%g
154 | )
155 |
156 | rem Strips away the " characters
157 | set JAVA_VERSION=%JAVA_VERSION:"=%
158 |
159 | rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below
160 | for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do (
161 |   set MAJOR=%%v
162 |   set MINOR=%%w
163 |   set BUILD=%%x
164 |
165 |   set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M
166 |   rem PermSize options apply to 1.x releases before 1.8 only. Newer releases report
167 |   rem versions such as 11.0.2, whose second component is not a 1.x minor version.
168 |   if "!MAJOR!"=="1" (
169 |     if "!MINOR!" LSS "8" (
170 |       set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M
171 |     )
172 |   )
173 |
174 |   set MEM_OPTS=!META_SIZE!
175 | )
176 |
177 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config.
178 | set _JAVA_OPTS=%JAVA_OPTS%
179 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS%
180 |
181 | set DEBUG_OPTS=
182 |
183 | rem Loop through the arguments, building remaining args in args variable
184 | set args=
185 | :argsloop
186 | if not "%~1"=="" (
187 |   rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them.
188 |   rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack.
189 |   set arg1=%~1
190 |   if "!arg1:~0,2!"=="-D" (
191 |     set "args=%args% "%~1"="%~2""
192 |     shift
193 |     shift
194 |     goto argsloop
195 |   )
196 |
197 |   if "%~1"=="-jvm-debug" (
198 |     if not "%~2"=="" (
199 |       rem This piece of magic somehow checks that an argument is a number
200 |       for /F "delims=0123456789" %%i in ("%~2") do (
201 |         set var="%%i"
202 |       )
203 |       if defined var (
204 |         rem Not a number, assume no argument given and default to 9999
205 |         set JPDA_PORT=9999
206 |       ) else (
207 |         rem Port was given, shift arguments
208 |         set JPDA_PORT=%~2
209 |         shift
210 |       )
211 |     ) else (
212 |       set JPDA_PORT=9999
213 |     )
214 |     shift
215 |
216 |     set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT!
217 |     goto argsloop
218 |   )
219 |   rem else
220 |   set "args=%args% "%~1""
221 |   shift
222 |   goto argsloop
223 | )
224 |
225 | :run
226 |
227 | if "!args!"=="" (
228 |   if defined DOUBLECLICKED (
229 |     set CMDS="ui"
230 |   ) else set CMDS=!args!
231 | ) else set CMDS=!args!
232 |
233 | rem We add a / in front, so we get file:///C: instead of file://C:
234 | rem Java considers the later a UNC path.
235 | rem We also attempt a solid effort at making it URI friendly.
236 | rem We don't even bother with UNC paths.
237 | set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/!
238 | set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20!
239 |
240 | rem Checks if the command contains spaces to know if it should be wrapped in quotes or not
241 | set NON_SPACED_CMD=%_JAVACMD: =%
242 | if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%
243 | if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%
244 |
245 | if ERRORLEVEL 1 goto error
246 | goto end
247 |
248 | :error
249 | set ERROR_CODE=1
250 |
251 | :end
252 |
253 | rem endlocal and exit share one line so that ERROR_CODE is expanded before endlocal discards it.
254 | @endlocal & exit /B %ERROR_CODE%
255 |
--------------------------------------------------------------------------------
/young-crawler-searcher/conf/application.conf:
--------------------------------------------------------------------------------
1 | # This is the main configuration file for the application.
2 | # https://www.playframework.com/documentation/latest/ConfigFile
3 | # ~~~~~
4 | # Play uses HOCON as its configuration file format. HOCON has a number
5 | # of advantages over other config formats, but there are two things that
6 | # can be used when modifying settings.
7 | #
8 | # You can include other configuration files in this main application.conf file:
9 | #include "extra-config.conf"
10 | #
11 | # You can declare variables and substitute for them:
12 | #mykey = ${some.value}
13 | #
14 | # And if an environment variable exists when there is no other substitution, then
15 | # HOCON will fall back to substituting the environment variable:
16 | #mykey = ${JAVA_HOME}
17 |
18 | ## Akka
19 | # https://www.playframework.com/documentation/latest/ScalaAkka#Configuration
20 | # https://www.playframework.com/documentation/latest/JavaAkka#Configuration
21 | # ~~~~~
22 | # Play uses Akka internally and exposes Akka Streams and actors in Websockets and
23 | # other streaming HTTP responses.
24 | akka {
25 | # "akka.log-config-on-start" is extraordinarily useful because it logs the complete
26 | # configuration at INFO level, including defaults and overrides, so it's worth
27 | # putting at the very top.
28 | #
29 | # Put the following in your conf/logback.xml file:
30 | #
31 | # <logger name="akka.actor" level="INFO" />
32 | #
33 | # And then uncomment this line to debug the configuration.
34 | #
35 | #log-config-on-start = true
36 | }
37 |
38 | ## Secret key
39 | # http://www.playframework.com/documentation/latest/ApplicationSecret
40 | # ~~~~~
41 | # The secret key is used to sign Play's session cookie.
42 | # This must be changed for production, but we don't recommend you change it in this file.
43 | play.crypto.secret = "changeme"
44 |
45 | ## Modules
46 | # https://www.playframework.com/documentation/latest/Modules
47 | # ~~~~~
48 | # Control which modules are loaded when Play starts. Note that modules are
49 | # the replacement for "GlobalSettings", which are deprecated in 2.5.x.
50 | # Please see https://www.playframework.com/documentation/latest/GlobalSettings
51 | # for more information.
52 | #
53 | # You can also extend Play functionality by using one of the publicly available
54 | # Play modules: https://playframework.com/documentation/latest/ModuleDirectory
55 | play.modules {
56 | # By default, Play will load any class called Module that is defined
57 | # in the root package (the "app" directory), or you can define them
58 | # explicitly below.
59 | # To load additional modules, enable them here.
60 | #enabled += my.application.Module
61 |
62 | # If there are any built-in modules that you want to disable, you can list them here.
63 | #disabled += ""
64 | }
65 |
66 | ## IDE
67 | # https://www.playframework.com/documentation/latest/IDE
68 | # ~~~~~
69 | # Depending on your IDE, you can add a hyperlink for errors that will jump you
70 | # directly to the code location in the IDE in dev mode. The following line makes
71 | # use of the IntelliJ IDEA REST interface:
72 | #play.editor="http://localhost:63342/api/file/?file=%s&line=%s"
73 |
74 | ## Internationalisation
75 | # https://www.playframework.com/documentation/latest/JavaI18N
76 | # https://www.playframework.com/documentation/latest/ScalaI18N
77 | # ~~~~~
78 | # Play comes with its own i18n settings, which allow the user's preferred language
79 | # to map through to internal messages, or allow the language to be stored in a cookie.
80 | play.i18n {
81 | # The application languages
82 | langs = [ "en" ]
83 |
84 | # Whether the language cookie should be secure or not
85 | #langCookieSecure = true
86 |
87 | # Whether the HTTP only attribute of the cookie should be set to true
88 | #langCookieHttpOnly = true
89 | }
90 |
91 | ## Play HTTP settings
92 | # ~~~~~
93 | play.http {
94 | ## Router
95 | # https://www.playframework.com/documentation/latest/JavaRouting
96 | # https://www.playframework.com/documentation/latest/ScalaRouting
97 | # ~~~~~
98 | # Define the Router object to use for this application.
99 | # This router will be looked up first when the application is starting up,
100 | # so make sure this is the entry point.
101 | # Furthermore, it's assumed your route file is named properly.
102 | # So for an application router like `my.application.Router`,
103 | # you may need to define a router file `conf/my.application.routes`.
104 | # Default to Routes in the root package (aka "apps" folder) (and conf/routes)
105 | #router = my.application.Router
106 |
107 | ## Action Creator
108 | # https://www.playframework.com/documentation/latest/JavaActionCreator
109 | # ~~~~~
110 | #actionCreator = null
111 |
112 | ## ErrorHandler
113 | # https://www.playframework.com/documentation/latest/JavaRouting
114 | # https://www.playframework.com/documentation/latest/ScalaRouting
115 | # ~~~~~
116 | # If null, will attempt to load a class called ErrorHandler in the root package,
117 | #errorHandler = null
118 |
119 | ## Filters
120 | # https://www.playframework.com/documentation/latest/ScalaHttpFilters
121 | # https://www.playframework.com/documentation/latest/JavaHttpFilters
122 | # ~~~~~
123 | # Filters run code on every request. They can be used to perform
124 | # common logic for all your actions, e.g. adding common headers.
125 | # Defaults to "Filters" in the root package (aka "apps" folder)
126 | # Alternatively you can explicitly register a class here.
127 | #filters = my.application.Filters
128 |
129 | ## Session & Flash
130 | # https://www.playframework.com/documentation/latest/JavaSessionFlash
131 | # https://www.playframework.com/documentation/latest/ScalaSessionFlash
132 | # ~~~~~
133 | session {
134 | # Sets the cookie to be sent only over HTTPS.
135 | #secure = true
136 |
137 | # Sets the cookie to be accessed only by the server.
138 | #httpOnly = true
139 |
140 | # Sets the max-age field of the cookie to 5 minutes.
141 | # NOTE: this only sets when the browser will discard the cookie. Play will consider any
142 | # cookie value with a valid signature to be a valid session forever. To implement a server side session timeout,
143 | # you need to put a timestamp in the session and check it at regular intervals to possibly expire it.
144 | #maxAge = 300
145 |
146 | # Sets the domain on the session cookie.
147 | #domain = "example.com"
148 | }
149 |
150 | flash {
151 | # Sets the cookie to be sent only over HTTPS.
152 | #secure = true
153 |
154 | # Sets the cookie to be accessed only by the server.
155 | #httpOnly = true
156 | }
157 | }
158 |
159 | ## Netty Provider
160 | # https://www.playframework.com/documentation/latest/SettingsNetty
161 | # ~~~~~
162 | play.server.netty {
163 | # Whether the Netty wire should be logged
164 | #log.wire = true
165 |
166 | # If you run Play on Linux, you can use Netty's native socket transport
167 | # for higher performance with less garbage.
168 | #transport = "native"
169 | }
170 |
171 | ## WS (HTTP Client)
172 | # https://www.playframework.com/documentation/latest/ScalaWS#Configuring-WS
173 | # ~~~~~
174 | # The HTTP client primarily used for REST APIs. The default client can be
175 | # configured directly, but you can also create different client instances
176 | # with customized settings. You must enable this by adding to build.sbt:
177 | #
178 | # libraryDependencies += ws // or javaWs if using java
179 | #
180 | play.ws {
181 | # Sets HTTP requests not to follow 302 requests
182 | #followRedirects = false
183 |
184 | # Sets the maximum number of open HTTP connections for the client.
185 | #ahc.maxConnectionsTotal = 50
186 |
187 | ## WS SSL
188 | # https://www.playframework.com/documentation/latest/WsSSL
189 | # ~~~~~
190 | ssl {
191 | # Configuring HTTPS with Play WS does not require programming. You can
192 | # set up both trustManager and keyManager for mutual authentication, and
193 | # turn on JSSE debugging in development with a reload.
194 | #debug.handshake = true
195 | #trustManager = {
196 | # stores = [
197 | # { type = "JKS", path = "exampletrust.jks" }
198 | # ]
199 | #}
200 | }
201 | }
202 |
203 | ## Cache
204 | # https://www.playframework.com/documentation/latest/JavaCache
205 | # https://www.playframework.com/documentation/latest/ScalaCache
206 | # ~~~~~
207 | # Play comes with an integrated cache API that can reduce the operational
208 | # overhead of repeated requests. You must enable this by adding to build.sbt:
209 | #
210 | # libraryDependencies += cache
211 | #
212 | play.cache {
213 | # If you want to bind several caches, you can bind them individually
214 | #bindCaches = ["db-cache", "user-cache", "session-cache"]
215 | }
216 |
217 | ## Filters
218 | # https://www.playframework.com/documentation/latest/Filters
219 | # ~~~~~
220 | # There are a number of built-in filters that can be enabled and configured
221 | # to give Play greater security. You must enable this by adding to build.sbt:
222 | #
223 | # libraryDependencies += filters
224 | #
225 | play.filters {
226 | ## CORS filter configuration
227 | # https://www.playframework.com/documentation/latest/CorsFilter
228 | # ~~~~~
229 | # CORS is a protocol that allows web applications to make requests from the browser
230 | # across different domains.
231 | # NOTE: You MUST apply the CORS configuration before the CSRF filter, as CSRF has
232 | # dependencies on CORS settings.
233 | cors {
234 | # Filter paths by a whitelist of path prefixes
235 | #pathPrefixes = ["/some/path", ...]
236 |
237 | # The allowed origins. If null, all origins are allowed.
238 | #allowedOrigins = ["http://www.example.com"]
239 |
240 | # The allowed HTTP methods. If null, all methods are allowed
241 | #allowedHttpMethods = ["GET", "POST"]
242 | }
243 |
244 | ## CSRF Filter
245 | # https://www.playframework.com/documentation/latest/ScalaCsrf#Applying-a-global-CSRF-filter
246 | # https://www.playframework.com/documentation/latest/JavaCsrf#Applying-a-global-CSRF-filter
247 | # ~~~~~
248 | # Play supports multiple methods for verifying that a request is not a CSRF request.
249 | # The primary mechanism is a CSRF token. This token gets placed either in the query string
250 | # or body of every form submitted, and also gets placed in the users session.
251 | # Play then verifies that both tokens are present and match.
252 | csrf {
253 | # Sets the cookie to be sent only over HTTPS
254 | #cookie.secure = true
255 |
256 | # Defaults to CSRFErrorHandler in the root package.
257 | #errorHandler = MyCSRFErrorHandler
258 | }
259 |
260 | ## Security headers filter configuration
261 | # https://www.playframework.com/documentation/latest/SecurityHeaders
262 | # ~~~~~
263 | # Defines security headers that prevent XSS attacks.
264 | # If enabled, then all options are set to the below configuration by default:
265 | headers {
266 | # The X-Frame-Options header. If null, the header is not set.
267 | #frameOptions = "DENY"
268 |
269 | # The X-XSS-Protection header. If null, the header is not set.
270 | #xssProtection = "1; mode=block"
271 |
272 | # The X-Content-Type-Options header. If null, the header is not set.
273 | #contentTypeOptions = "nosniff"
274 |
275 | # The X-Permitted-Cross-Domain-Policies header. If null, the header is not set.
276 | #permittedCrossDomainPolicies = "master-only"
277 |
278 | # The Content-Security-Policy header. If null, the header is not set.
279 | #contentSecurityPolicy = "default-src 'self'"
280 | }
281 |
282 | ## Allowed hosts filter configuration
283 | # https://www.playframework.com/documentation/latest/AllowedHostsFilter
284 | # ~~~~~
285 | # Play provides a filter that lets you configure which hosts can access your application.
286 | # This is useful to prevent cache poisoning attacks.
287 | hosts {
288 | # Allow requests to example.com, its subdomains, and localhost:9000.
289 | #allowed = [".example.com", "localhost:9000"]
290 | }
291 | }
292 |
293 | ## Evolutions
294 | # https://www.playframework.com/documentation/latest/Evolutions
295 | # ~~~~~
296 | # Evolutions allows database scripts to be automatically run on startup in dev mode
297 | # for database migrations. You must enable this by adding to build.sbt:
298 | #
299 | # libraryDependencies += evolutions
300 | #
301 | play.evolutions {
302 | # You can disable evolutions for a specific datasource if necessary
303 | #db.default.enabled = false
304 | }
305 |
306 | ## Database Connection Pool
307 | # https://www.playframework.com/documentation/latest/SettingsJDBC
308 | # ~~~~~
309 | # Play doesn't require a JDBC database to run, but you can easily enable one.
310 | #
311 | # libraryDependencies += jdbc
312 | #
313 | play.db {
314 | # The combination of these two settings results in "db.default" as the
315 | # default JDBC pool:
316 | #config = "db"
317 | #default = "default"
318 |
319 | # Play uses HikariCP as the default connection pool. You can override
320 | # settings by changing the prototype:
321 | prototype {
322 | # Sets a fixed JDBC connection pool size of 50
323 | #hikaricp.minimumIdle = 50
324 | #hikaricp.maximumPoolSize = 50
325 | }
326 | }
327 |
328 | ## JDBC Datasource
329 | # https://www.playframework.com/documentation/latest/JavaDatabase
330 | # https://www.playframework.com/documentation/latest/ScalaDatabase
331 | # ~~~~~
332 | # Once JDBC datasource is set up, you can work with several different
333 | # database options:
334 | #
335 | # Slick (Scala preferred option): https://www.playframework.com/documentation/latest/PlaySlick
336 | # JPA (Java preferred option): https://playframework.com/documentation/latest/JavaJPA
337 | # EBean: https://playframework.com/documentation/latest/JavaEbean
338 | # Anorm: https://www.playframework.com/documentation/latest/ScalaAnorm
339 | #
340 | db {
341 | # You can declare as many datasources as you want.
342 | # By convention, the default datasource is named `default`
343 |
344 | # https://www.playframework.com/documentation/latest/Developing-with-the-H2-Database
345 | #default.driver = org.h2.Driver
346 | #default.url = "jdbc:h2:mem:play"
347 | #default.username = sa
348 | #default.password = ""
349 |
350 | # You can turn on SQL logging for any datasource
351 | # https://www.playframework.com/documentation/latest/Highlights25#Logging-SQL-statements
352 | #default.logSql=true
353 | }
354 |
--------------------------------------------------------------------------------
/young-crawler-searcher/bin/activator:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### ------------------------------- ###
4 | ### Helper methods for BASH scripts ###
5 | ### ------------------------------- ###
6 |
# Print the physical (symlink-free) absolute path of a file.
#
# Arguments:
#   $1 - path to resolve; may itself be a symlink (chains of up to 100 links
#        are followed).
#   $2 - optional; when non-empty the result is passed through `cygwinpath`
#        so a Windows-style path is printed.
# Output:  the resolved path on stdout.
# Returns: non-zero (printing nothing) when a directory on the way cannot be
#          entered, instead of reporting a path relative to the wrong place.
#
# Everything runs in a subshell so the caller's working directory and
# variables are left untouched.
realpath () {
  (
    TARGET_FILE="$1"
    FIX_CYGPATH="$2"

    # With CDPATH set, `cd` may jump elsewhere and echo the directory it
    # picked, which would end up in our captured output.
    unset CDPATH

    cd "$(dirname "$TARGET_FILE")" || exit 1
    TARGET_FILE=$(basename "$TARGET_FILE")

    # Follow the symlink chain; the counter guards against link cycles.
    COUNT=0
    while [[ -L "$TARGET_FILE" && $COUNT -lt 100 ]]; do
      TARGET_FILE=$(readlink "$TARGET_FILE")
      cd "$(dirname "$TARGET_FILE")" || exit 1
      TARGET_FILE=$(basename "$TARGET_FILE")
      COUNT=$((COUNT + 1))
    done

    # make sure we grab the actual windows path, instead of cygwin's path.
    if [[ -n "$FIX_CYGPATH" ]]; then
      echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")"
    else
      echo "$(pwd -P)/$TARGET_FILE"
    fi
  )
}
32 |
33 |
# Succeeds (status 0) when `uname -s` reports a kernel name starting with
# "CYGWIN", i.e. when the script runs inside Cygwin; fails (status 1) otherwise.
is_cygwin() {
  [[ "$(uname -s)" == CYGWIN* ]]
}
42 |
# Cache the Cygwin detection once as the literal string "true" or "false".
if is_cygwin; then
  CYGWIN_FLAG=true
else
  CYGWIN_FLAG=false
fi
45 |
46 |
# Convert a cygwin-style /cygdrive path into a Windows-style path.
#
# Arguments:
#   $1 - the path to convert.
# Output:
#   On Cygwin (CYGWIN_FLAG == "true") the result of `cygpath -w`; everywhere
#   else the path exactly as given.
#
# The path is quoted throughout so that spaces, runs of whitespace and glob
# characters in it survive unchanged.
cygwinpath() {
  local file="$1"
  if [[ "$CYGWIN_FLAG" == "true" ]]; then
    cygpath -w "$file"
  else
    printf '%s\n' "$file"
  fi
}
57 |
# Make something URI friendly.
#
# Arguments:
#   $1 - a file system path.
# Output:
#   The path with every space replaced by "%20". On Cygwin, backslashes are
#   additionally turned into forward slashes and a leading "/" is prepended.
#
# Only spaces are escaped; no other characters are percent-encoded.
make_url() {
  # Keep the working copy local so the caller's `url` variable is not clobbered.
  local url="$1"
  local nospaces=${url// /%20}
  if is_cygwin; then
    echo "/${nospaces//\\//}"
  else
    echo "$nospaces"
  fi
}
68 |
# Argument buckets that are filled while the command line is parsed.
declare -a residual_args java_args scalac_args sbt_commands

# Java executable to launch and the version string detected for it.
declare java_cmd=java
declare java_version

# Where this script really lives (symlinks resolved), its directory, and the
# installation root one level above that directory.
declare -r real_script_path="$(realpath "$0")"
declare -r sbt_bin_dir="$(dirname "$real_script_path")"
declare -r sbt_home="$(realpath "$(dirname "$sbt_bin_dir")")"

declare -r app_version="1.3.10"
declare -r script_name=activator

# Options collected from the environment, in this order.
declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" )

# Under Cygwin $HOME is not the Windows home directory, so use %USERPROFILE%.
if is_cygwin; then
  userhome="$USERPROFILE"
else
  userhome="$HOME"
fi

# Per-user configuration locations: a shared file and a per-version file.
declare -r activator_user_home_dir="${userhome}/.activator"
declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt"
declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt"
90 |
# Print all arguments on standard error, exactly as `echo` would print them.
echoerr () {
  echo "$@" >&2
}
# Log the arguments to stderr when $verbose or $debug is non-empty.
# Returns non-zero when neither is set (nothing is printed in that case).
vlog () {
  if [[ -n $verbose || -n $debug ]]; then
    echoerr "$@"
  else
    return 1
  fi
}
# Log the arguments to stderr only when $debug is non-empty.
# Returns non-zero when $debug is unset or empty (nothing is printed).
dlog () {
  if [[ -n $debug ]]; then
    echoerr "$@"
  else
    return 1
  fi
}
100 |
# Print the location of the launcher jar bundled under
# ${sbt_home}/libexec, passed through `cygwinpath`.
jar_file () {
  local launcher="${sbt_home}/libexec/activator-launch-${app_version}.jar"
  local resolved
  resolved="$(cygwinpath "$launcher")"
  echo "$resolved"
}
104 |
# Store the launcher jar location (from `jar_file`) in the global $sbt_jar.
# If no regular file exists there, report it on stderr and exit the script
# with status 2.
acquire_sbt_jar () {
  sbt_jar="$(jar_file)"

  if [[ -f "$sbt_jar" ]]; then
    return 0
  fi

  echoerr "Could not find launcher jar: $sbt_jar"
  exit 2
}
113 |
# Run the command given as arguments and return its exit status.
# When $verbose or $debug is set, the command line is first printed to stdout,
# one argument per line, with arguments containing a space wrapped in quotes.
execRunner () {
  if [[ $verbose || $debug ]]; then
    echo "# Executing command line:"
    local word
    for word in "$@"; do
      case "$word" in
        *" "*) printf "\"%s\"\n" "$word" ;;
        *)     printf "%s\n" "$word" ;;
      esac
    done
    echo ""
  fi

  # This used to be `exec`, but then we lose the ability to re-hook stty
  # for cygwin... Maybe we should flag the feature here...
  "$@"
}
131 |
# Append one argument ($1) to the global java_args array.
addJava () {
  dlog "[addJava] arg = '$1'"
  java_args+=( "$1" )
}
# Append one argument ($1) to the global sbt_commands array.
addSbt () {
  dlog "[addSbt] arg = '$1'"
  sbt_commands+=( "$1" )
}
# Append one argument ($1) to the global residual_args array.
addResidual () {
  dlog "[residual] arg = '$1'"
  residual_args+=( "$1" )
}
# Add the JDWP agent option to the JVM arguments, with $1 as the value of
# its `address=` field (server mode, not suspended at start-up).
addDebugger () {
  local jdwp_opts="transport=dt_socket,server=y,suspend=n,address=$1"
  addJava "-agentlib:jdwp=${jdwp_opts}"
}
147 |
# Print the default JVM memory flags for a heap of $1 megabytes (default 1024).
#
# An empty line is printed instead when ${JAVA_OPTS} already mentions any of
# -Xmx, -Xms, -XX:MaxPermSize, -XX:MaxMetaspaceSize or
# -XX:ReservedCodeCacheSize: mismatched Xms/Xmx values cause errors, so the
# user's own settings win.
#
# Otherwise the code cache is one eighth of the heap, clamped to 128..512 MB,
# and the class-metadata limit is twice the code cache. The metadata flag is
# MaxPermSize when $java_version sorts before "1.8", MaxMetaspaceSize otherwise.
get_mem_opts () {
  case "${JAVA_OPTS}" in
    *-Xmx*|*-Xms*|*-XX:MaxPermSize*|*-XX:MaxMetaspaceSize*|*-XX:ReservedCodeCacheSize*)
      echo ""
      return
      ;;
  esac

  # Move the related memory settings in concert so they need not be tuned
  # individually.
  local heap_mb=${1:-1024}
  local code_cache_mb=$(( $heap_mb / 8 ))
  if (( $code_cache_mb <= 128 )); then
    code_cache_mb=128
  elif (( $code_cache_mb >= 512 )); then
    code_cache_mb=512
  fi
  local metadata_mb=$(( $code_cache_mb * 2 ))

  local metadata_flag
  if [[ "$java_version" < "1.8" ]]; then
    metadata_flag="MaxPermSize"
  else
    metadata_flag="MaxMetaspaceSize"
  fi

  echo "-Xms${heap_mb}m -Xmx${heap_mb}m -XX:ReservedCodeCacheSize=${code_cache_mb}m -XX:${metadata_flag}=${metadata_mb}m"
}
166 |
# Ensure that an option was followed by a value.
#
# Arguments:
#   $1 - description of the expected value, shown as <type> in the message
#   $2 - the option being processed
#   $3 - the value that followed the option
# A value that is empty or begins with "-" is treated as missing: a message is
# printed on stdout and the script exits with status 1.
require_arg () {
  local kind="$1"
  local flag="$2"
  local value="$3"

  if [[ -n "$value" && "${value:0:1}" != "-" ]]; then
    return 0
  fi

  echo "$flag requires <$kind> argument"
  exit 1
}
176 |
# Succeeds when a shell function named $1 exists; prints nothing on stdout.
is_function_defined() {
  typeset -f "$1" 1> /dev/null
}
180 |
# If we're *not* running in a terminal, and we don't have any arguments, then
# we need to add the 'ui' parameter.
#
# Reads the global residual_args array and may append "ui" to it through
# addResidual. The element COUNT of the array is tested (`${#residual_args[@]}`);
# the bare `${#residual_args}` is the string length of element 0 and would
# misreport an argument list whose first entry is an empty string as empty.
detect_terminal_for_ui() {
  if [[ ! -t 0 && ${#residual_args[@]} -eq 0 ]]; then
    addResidual "ui"
  fi

  # SPECIAL TEST FOR MAC: a run on Darwin with $HOME as the working directory
  # and no arguments is treated as a launch from the desktop.
  if [[ "$(uname)" == "Darwin" && "$HOME" == "$PWD" && ${#residual_args[@]} -eq 0 ]]; then
    echo "Detected MAC OSX launched script...."
    echo "Swapping to UI"
    addResidual "ui"
  fi
}
193 |
194 | process_args () {
195 | while [[ $# -gt 0 ]]; do
196 | case "$1" in
197 | -h|-help) usage; exit 1 ;;
198 | -v|-verbose) verbose=1 && shift ;;
199 | -d|-debug) debug=1 && shift ;;
200 |
201 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
202 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
203 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
204 | -batch) exec &1 | awk -F '"' '/version/ {print $2}')
223 | vlog "[process_args] java_version = '$java_version'"
224 | }
225 |
# Succeeds when version string $1 is numerically >= version string $2.
# Versions are split on '.', '_', '-' and '+'; each component is compared as a
# base-10 integer, with missing or non-numeric components counting as 0.
_java_version_at_least() {
  local -a have want
  IFS='._-+' read -r -a have <<< "$1"
  IFS='._-+' read -r -a want <<< "$2"

  local i h w
  for (( i = 0; i < ${#want[@]}; i++ )); do
    h=${have[i]:-0}
    w=${want[i]:-0}
    # Keep only the leading digits (e.g. "ea" in "9-ea" becomes 0).
    h=${h%%[!0-9]*}
    w=${w%%[!0-9]*}
    h=${h:-0}
    w=${w:-0}
    # 10# forces base 10 so components such as "08" are not read as octal.
    if (( 10#$h > 10#$w )); then
      return 0
    elif (( 10#$h < 10#$w )); then
      return 1
    fi
  done
  return 0
}

# Detect that we have java installed, and that it is recent enough.
#
# Arguments:
#   $1 - minimum acceptable version, e.g. "1.6".
# Reads the global $java_version. Exits the script with status 1 (after
# printing an explanation on stdout) when no version was detected or when it
# is older than the minimum.
#
# The comparison is numeric per component. A plain string comparison with
# `[[ a > b ]]` follows the locale's collation order and can rank versions
# such as "10.0.1" below "1.6".
checkJava() {
  local required_version="$1"
  # Now check to see if it's a good enough version
  if [[ "$java_version" == "" ]]; then
    echo
    echo No java installations was detected.
    echo Please go to http://www.java.com/getjava/ and download
    echo
    exit 1
  elif ! _java_version_at_least "$java_version" "$required_version"; then
    echo
    echo The java installation you have is not up to date
    echo $script_name requires at least version $required_version+, you have
    echo version $java_version
    echo
    echo Please go to http://www.java.com/getjava/ and download
    echo a valid Java Runtime and install before running $script_name.
    echo
    exit 1
  fi
}
248 |
249 |
# Entry point: locate the launcher jar, parse the command line, verify the
# Java installation, start the JVM and exit with the JVM's exit status.
#
# Arguments: the script's full command line.
# Sets the globals argumentCount and exit_code along the way.
run() {
  # No jar yet? Try to acquire it, and give up if that fails as well.
  if [[ ! -f "$sbt_jar" ]] && ! acquire_sbt_jar "$sbt_version"; then
    echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
    exit 1
  fi

  # Process the combined args, then reset "$@" to the residuals.
  process_args "$@"
  detect_terminal_for_ui
  set -- "${residual_args[@]}"
  argumentCount=$#

  # TODO - java check should be configurable...
  checkJava "1.6"

  # In cygwin, use the windows config and the terminal hacks.
  case "$CYGWIN_FLAG" in
    true)
      stty -icanon min 1 -echo > /dev/null 2>&1
      addJava "-Djline.terminal=jline.UnixTerminal"
      addJava "-Dsbt.cygwin=true"
      ;;
  esac

  # Run sbt. NOTE: the option groups below are expanded unquoted, so each
  # entry is word-split into separate JVM arguments.
  execRunner "$java_cmd" \
    "-Dactivator.home=$(make_url "$sbt_home")" \
    ${SBT_OPTS:-$default_sbt_opts} \
    $(get_mem_opts $sbt_mem) \
    ${JAVA_OPTS} \
    ${java_args[@]} \
    -jar "$sbt_jar" \
    "${sbt_commands[@]}" \
    "${residual_args[@]}"

  exit_code=$?

  # Clean up the terminal from the cygwin hacks.
  [[ "$CYGWIN_FLAG" != "true" ]] || stty icanon echo > /dev/null 2>&1
  exit $exit_code
}
293 |
294 |
# Names of the option files consulted by the launcher: one in the current
# directory and two under the installation's conf directory.
declare -r sbt_opts_file=".sbtopts"
declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts"
declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt"

# JVM properties that keep the sbt global base, boot directory and Ivy home
# inside the project directory instead of sharing them.
declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot"\
" -Dsbt.boot.directory=project/.boot"\
" -Dsbt.ivy.home=project/.ivy"
299 |
300 | usage() {
301 | cat < path to global settings/plugins directory (default: ~/.sbt)
316 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
317 | -ivy path to local Ivy repository (default: ~/.ivy2)
318 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
319 | -no-share use all local caches; no sharing
320 | -no-global uses global caches, but does not use global ~/.sbt directory.
321 | -jvm-debug Turn on JVM debugging, open at the given port.
322 | -batch Disable interactive mode
323 |
324 | # sbt version (default: from project/build.properties if present, else latest release)
325 | -sbt-version use the specified version of sbt
326 | -sbt-jar use the specified jar as the sbt launcher
327 | -sbt-rc use an RC version of sbt
328 | -sbt-snapshot use a snapshot version of sbt
329 |
330 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
331 | -java-home alternate JAVA_HOME
332 |
333 | # jvm options and output control
334 | JAVA_OPTS environment variable, if unset uses "$java_opts"
335 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
336 | ACTIVATOR_OPTS Environment variable, if unset uses ""
337 | .sbtopts if this file exists in the current directory, it is
338 | prepended to the runner args
339 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args
340 | -Dkey=val pass -Dkey=val directly to the java runtime
341 | -J-X pass option -X directly to the java runtime
342 | (-J is stripped)
343 | -S-X add -X to sbt's scalacOptions (-S is stripped)
344 |
345 | In the case of duplicated or conflicting options, the order above
346 | shows precedence: JAVA_OPTS lowest, command line options highest.
347 | EOM
348 | }
349 |
350 |
351 |
352 | process_my_args () {
353 | while [[ $# -gt 0 ]]; do
354 | case "$1" in
355 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
356 | -no-share) addJava "$noshare_opts" && shift ;;
357 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
358 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
359 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
360 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
361 | -batch) exec