├── .gitignore
├── young-crawler-searcher
    ├── public
    │   ├── stylesheets
    │   │   └── main.css
    │   ├── javascripts
    │   │   └── hello.js
    │   └── images
    │   │   └── favicon.png
    ├── .gitignore
    ├── libexec
    │   └── activator-launch-1.3.10.jar
    ├── project
    │   ├── build.properties
    │   └── plugins.sbt
    ├── build.sbt
    ├── LICENSE
    ├── app
    │   ├── views
    │   │   ├── index.scala.html
    │   │   └── main.scala.html
    │   ├── controllers
    │   │   ├── HomeController.scala
    │   │   ├── CountController.scala
    │   │   └── AsyncController.scala
    │   ├── services
    │   │   ├── Counter.scala
    │   │   └── ApplicationTimer.scala
    │   ├── Module.scala
    │   ├── Filters.scala
    │   └── filters
    │   │   └── ExampleFilter.scala
    ├── test
    │   ├── IntegrationSpec.scala
    │   └── ApplicationSpec.scala
    ├── conf
    │   ├── routes
    │   ├── logback.xml
    │   └── application.conf
    ├── README
    └── bin
    │   ├── activator.bat
    │   └── activator
├── README.md
└── young-crawler-core
    └── src
        ├── main
            ├── scala
            │   └── com
            │   │   └── young
            │   │       └── crawler
            │   │           ├── spider
            │   │               ├── task
            │   │               │   ├── IndexTask.scala
            │   │               │   ├── FetchTask.scala
            │   │               │   ├── SlaveTask.scala
            │   │               │   ├── CounterTask.scala
            │   │               │   ├── ParserTask.scala
            │   │               │   ├── InjectTask.scala
            │   │               │   └── support
            │   │               │   │   └── actor
            │   │               │   │       ├── IndexActorTask.scala
            │   │               │   │       ├── FetchActorTask.scala
            │   │               │   │       ├── ParseActorTask.scala
            │   │               │   │       ├── InjectActorTask.scala
            │   │               │   │       └── CounterActorTask.scala
            │   │               ├── parser
            │   │               │   ├── Parser.scala
            │   │               │   └── support
            │   │               │   │   ├── HtmlParseParser.scala
            │   │               │   │   └── JsoupParser.scala
            │   │               ├── fetcher
            │   │               │   ├── FetcherCache.scala
            │   │               │   ├── Fetcher.scala
            │   │               │   └── support
            │   │               │   │   ├── HttpClientFetcher.scala
            │   │               │   │   └── HttpWatch.scala
            │   │               └── indexer
            │   │               │   ├── Indexer.scala
            │   │               │   └── support
            │   │               │       └── ElasticIndexer.scala
            │   │           ├── entity
            │   │               ├── InjectEntitys.scala
            │   │               ├── PageIndexEntity.scala
            │   │               ├── CounterEntity.scala
            │   │               └── HttpEntitys.scala
            │   │           ├── exception
            │   │               ├── IndexException.scala
            │   │               ├── ParseException.scala
            │   │               └── FetchException.scala
            │   │           ├── utils
            │   │               ├── JsonUtil.scala
            │   │               ├── MD5Util.scala
            │   │               └── IOUtil.scala
            │   │           ├── cache
            │   │               ├── Cache.scala
            │   │               └── support
            │   │               │   ├── MapCache.scala
            │   │               │   └── RedisCache.scala
            │   │           ├── config
            │   │               ├── CrawlerConfig.scala
            │   │               └── CrawlerConfigContants.scala
            │   │           └── boot
            │   │               └── CrawlerBoot.scala
            ├── java
            │   ├── Thread1.java
            │   ├── Runnable1.java
            │   └── ThreadBoot.java
            └── resources
            │   ├── seeds.txt
            │   └── crawler.properties
        └── test
            └── scala
                └── com
                    └── young
                        └── crawler
                            ├── actor
                                ├── ActorExample.scala
                                └── ActorSelectorExample.scala
                            ├── http
                                └── CrawlerTest.scala
                            ├── cache
                                ├── MapCacheExample.scala
                                └── RedisCacheExample.scala
                            ├── parser
                                └── JsoupExample.scala
                            └── indexer
                                └── Elastic4sExample.scala


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | project
3 | target
4 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/public/stylesheets/main.css:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # young-crawler
2 | scala结合actor编写的分布式网络爬虫,实现上采用Akka 异步消息处理框架,无阻塞,性能高,网页爬取速度快
3 | ## 如何启动
4 | 下载项目后配置 crawler.properties,各配置项的详细说明见该文件内的注释
5 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/.gitignore:
--------------------------------------------------------------------------------
1 | logs
2 | target
3 | /.idea
4 | /.idea_modules
5 | /.classpath
6 | /.project
7 | /.settings
8 | /RUNNING_PID
9 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/public/javascripts/hello.js:
--------------------------------------------------------------------------------
1 | if (window.console) { // only log when the host page exposes a console object
2 |   console.log("Welcome to your Play application's JavaScript!");
3 | }
4 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/public/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/public/images/favicon.png


--------------------------------------------------------------------------------
/young-crawler-searcher/libexec/activator-launch-1.3.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangwx1402/young-crawler/HEAD/young-crawler-searcher/libexec/activator-launch-1.3.10.jar


--------------------------------------------------------------------------------
/young-crawler-searcher/project/build.properties:
--------------------------------------------------------------------------------
1 | #Activator-generated Properties
2 | #Sun Sep 11 15:22:45 CST 2016
3 | template.uuid=b0d11fa6-d1b3-4963-94aa-319a15612bf3
4 | sbt.version=0.13.11
5 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/IndexTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 | 
3 | /**
4 |  * Created by dell on 2016/8/29.
5 |  */
6 | trait IndexTask {
7 |   // Declares no members; the trait carries no behaviour of its own.
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/FetchTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 | 
3 | /**
4 |  * Created by young.yang on 2016/8/28.
5 |  */
6 | trait FetchTask {
7 |   // Declares no members; the trait carries no behaviour of its own.
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/SlaveTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 | 
3 | /**
4 |  * Created by young.yang on 2016/8/28.
5 |  */
6 | trait SlaveTask {
7 |   // Declares no members; the trait carries no behaviour of its own.
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/CounterTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 | 
3 | /**
4 |  * Created by young.yang on 2016/9/3.
5 |  */
6 | trait CounterTask {
7 |   // Declares no members; the trait carries no behaviour of its own.
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/ParserTask.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.spider.task
2 | 
3 | /**
4 |  * Created by young.yang on 2016/8/28.
5 |  */
6 | trait ParserTask {
7 |   // Declares no members; the trait carries no behaviour of its own.
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/Thread1.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Created by young.yang on 2016/8/31. Thread subclass whose run() prints "Thread1 run" to standard output.
3 |  */
4 | public class Thread1 extends Thread {
5 |     @Override public void run() { // @Override lets the compiler verify this really replaces Thread.run()
6 |         System.out.println("Thread1 run");
7 |     }
8 | }
9 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/Runnable1.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by young.yang on 2016/8/31. Runnable whose run() prints "Runnable1 run" to standard output.
 3 |  */
 4 | public class Runnable1 implements Runnable {
 5 |     @Override
 6 |     public void run() {
 7 |         System.out.println("Runnable1 run");
 8 |     }
 9 | }
10 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/resources/seeds.txt:
--------------------------------------------------------------------------------
1 | http://www.sina.com.cn
2 | http://www.baidu.com
3 | http://www.163.com
4 | http://www.sohu.com
5 | http://www.ifeng.com
6 | http://www.autohome.com.cn/beijing
7 | http://bj.fang.com
8 | http://blog.csdn.net
9 | http://www.gc-zb.com


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/InjectEntitys.scala:
--------------------------------------------------------------------------------
1 | package com.young.crawler.entity
2 | 
3 | /**
4 |  * Created by dell on 2016/8/29.
5 |  * Seed-initialisation message, passed to the Inject actor so that it can parse the seed information.
6 |  */
7 | case class InitSeed(seedPath:String,fileEncode:String="utf-8")
8 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/Parser.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.parser
 2 | 
 3 | import com.young.crawler.entity.{HttpResult, HttpPage}
 4 | 
 5 | /**
 6 |  * Created by young.yang on 2016/8/28.
 7 |  * Interface for parsing HTML pages: `parse` takes an HttpResult and yields an HttpPage.
 8 |  */
 9 | trait Parser {
10 |   def parse(html:HttpResult):HttpPage
11 | }
12 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/InjectTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task
 2 | 
 3 | import com.young.crawler.entity.Seed
 4 | 
 5 | /**
 6 |  * Created by young.yang on 2016/8/28. Contract for building a List[Seed] from a seed path and a file encoding (default "utf-8").
 7 |  */
 8 | trait InjectTask {
 9 |   def initSeeds(seedPath:String,fileEncode:String="utf-8"): List[Seed]
10 | }
11 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/IndexException.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.exception
 2 | 
 3 | /**
 4 |  * Created by young.yang on 2016/8/31.
 5 |  * Custom exception for indexing failures; `e` is the underlying cause.
 6 |  */
 7 | class IndexException(message:String,e:Throwable) extends Exception(message,e) {
 8 |   /** Message-only variant: no underlying failure exists, so the cause is null rather than a fabricated Exception. */
 9 |   def this(message:String) = this(message,null)
10 | }
11 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/ParseException.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.exception
 2 | 
 3 | /**
 4 |  * Created by young.yang on 2016/8/31.
 5 |  * Custom exception for parsing failures; `e` is the underlying cause.
 6 |  */
 7 | class ParseException(message:String,e:Throwable) extends Exception(message,e){
 8 |   /** Message-only variant: no underlying failure exists, so the cause is null rather than a fabricated Exception. */
 9 |   def this(message:String) = this(message,null)
10 | }
11 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/exception/FetchException.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.exception
 2 | 
 3 | /**
 4 |  * Created by young.yang on 2016/8/28.
 5 |  * Custom exception for fetch failures; `e` is the underlying cause.
 6 |  */
 7 | class FetchException(message:String,e:Throwable) extends Exception(message,e){
 8 |   /** Message-only variant: no underlying failure exists, so the cause is null rather than a fabricated Exception. */
 9 |   def this(message:String)=this(message,null)
10 | 
11 | }
12 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/build.sbt:
--------------------------------------------------------------------------------
 1 | name := """young-crawler-searcher"""
 2 | 
 3 | version := "1.0-SNAPSHOT"
 4 | 
 5 | lazy val root = (project in file(".")).enablePlugins(PlayScala)
 6 | 
 7 | scalaVersion := "2.11.7"
 8 | 
 9 | libraryDependencies ++= Seq(
10 |   jdbc,
11 |   cache,
12 |   ws,
13 |   "org.scalatestplus.play" %% "scalatestplus-play" % "1.5.1" % Test
14 | )
15 | 
16 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/actor/ActorExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.actor
 2 | 
 3 | import akka.actor.Actor
 4 | import akka.actor.Actor.Receive
 5 | 
 6 | /**
 7 |  * Created by young.yang on 2016/9/8.
 8 |  */
 9 | class ActorExample extends Actor {
10 |   // Only String messages are matched; each one is echoed to standard output.
11 |   override def receive: Receive = {
12 |     case text: String => println(s"receive a message $text")
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/JsonUtil.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.utils
 2 | 
 3 | import org.codehaus.jackson.map.ObjectMapper
 4 | 
 5 | /**
 6 |  * Created by dell on 2016/8/31.
 7 |  */
 8 | private[crawler] object JsonUtil {
 9 | 
10 |   // One ObjectMapper instance is created with the object and reused for every call.
11 |   private val objectMapper = new ObjectMapper
12 | 
13 |   /** Serialises `obj` to its JSON text using the shared mapper. */
14 |   def toJson(obj: Any): String = objectMapper.writeValueAsString(obj)
15 | }
16 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/Cache.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.cache
 2 | 
 3 | /**
 4 |  * Created by dell on 2016/9/2.
 5 |  * Cache interface: a keyed store with membership, insert, lookup, size and key enumeration.
 6 |  */
 7 | trait Cache[KEY,VALUE] {
 8 |   /** Whether an entry exists for `key`. */
 9 |   def contains(key:KEY):Boolean
10 |   /** Stores `value` under `key`. The Unit result is now explicit instead of an untyped procedure-style declaration. */
11 |   def put(key:KEY,value:VALUE): Unit
12 |   /** The value stored under `key`, or None when there is none. */
13 |   def get(key:KEY):Option[VALUE]
14 |   /** Number of entries held. */
15 |   def size():Int
16 |   /** The keys currently held. */
17 |   def keys():scala.collection.Set[KEY]
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/java/ThreadBoot.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by young.yang on 2016/8/31. Demo: runs Thread1 and Runnable1 on fresh threads, sleeps five seconds, then runs each once more.
 3 |  */
 4 | public class ThreadBoot {
 5 |     public static void main(String[] args) throws InterruptedException {
 6 |         Thread thread1 = new Thread1();
 7 |         Runnable runnable = new Runnable1();
 8 |         Thread thread2 = new Thread(runnable);
 9 |         thread1.start();
10 |         thread2.start();
11 |         Thread.sleep(5000);
12 |         new Thread1().start(); // a Thread may be started only once; calling start() on thread1 again throws IllegalThreadStateException
13 |         new Thread(runnable).start(); // the same Runnable, by contrast, can be handed to a brand-new Thread
14 |     }
15 | }
16 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/MD5Util.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.utils
 2 | 
 3 | import java.nio.charset.Charset
 4 | 
 5 | import com.google.common.hash.Hashing
 6 | 
 7 | /**
 8 |  * Created by dell on 2016/8/31.
 9 |  */
10 | private[crawler] object MD5Util {
11 |   /** MD5 digest of `line` as text; the input is always hashed as UTF-8 bytes so the result does not vary with the JVM default charset. */
12 |   def md5(line: String): String = Hashing.md5().newHasher().putString(line, Charset.forName("UTF-8")).hash().toString
13 | 
14 |   def main(args: Array[String]): Unit = {
15 |     println(MD5Util.md5("杨勇"))
16 |     println(MD5Util.md5("123"))
17 |   }
18 | }
19 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/http/CrawlerTest.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.http
 2 | 
 3 | import com.young.crawler.entity.{SeedType, UrlInfo}
 4 | import com.young.crawler.spider.fetcher.support.HttpWatch
 5 | 
 6 | /**
 7 |  * Created by young.yang on 2016/8/28.
 8 |  */
 9 | object CrawlerTest {
10 | 
11 |   def main(args: Array[String]): Unit = {
12 |     val seed = UrlInfo("http://www.sina.com.cn","",SeedType,0)
13 |     val response = HttpWatch.get(seed)
14 |     println(response.content)
15 |     println(response.status)
16 |   }
17 | }
18 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/HtmlParseParser.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.parser.support
 2 | 
 3 | import com.young.crawler.entity.{HttpPage, HttpResult}
 4 | import com.young.crawler.spider.parser.Parser
 5 | 
 6 | /**
 7 |  * Created by young.yang on 2016/8/28.
 8 |  */
 9 | private[crawler] class HtmlParseParser extends Parser {
10 |   override def parse(html: HttpResult): HttpPage = {
11 |     val parsed = new HttpPage // the result's content and url are copied over as-is
12 |     parsed.setContent(html.content)
13 |     parsed.setUrl(html.url)
14 |     parsed
15 |   }
16 | }
17 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/cache/MapCacheExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.cache
 2 | 
 3 | import com.young.crawler.cache.support.MapCache
 4 | 
 5 | /**
 6 |  * Created by dell on 2016/9/2.
 7 |  */
 8 | object MapCacheExample {
 9 |   def main(args: Array[String]): Unit = {
10 |     val store = new MapCache[String, String]
11 |     // Fill key_0 .. key_10 with value_0 .. value_10.
12 |     (0 to 10).foreach { n => store.put(s"key_$n", s"value_$n") }
13 | 
14 |     println(store.contains("key_0"))
15 |     println(store.keys())
16 |     println(store.size())
17 |     println(store.get("key_12").isEmpty)
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/cache/RedisCacheExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.cache
 2 | 
 3 | import com.young.crawler.cache.support.{RedisCache, MapCache}
 4 | 
 5 | /**
 6 |  * Created by dell on 2016/9/9.
 7 |  */
 8 | object RedisCacheExample {
 9 | 
10 |   def main(args: Array[String]): Unit = {
11 |     val store = new RedisCache[String, String]
12 |     // Fill key_0 .. key_10 with value_0 .. value_10.
13 |     (0 to 10).foreach { n => store.put(s"key_$n", s"value_$n") }
14 | 
15 |     println(store.contains("key_0"))
16 |     println(store.size())
17 |     println(store.get("key_12").isEmpty)
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | // The Play plugin
 2 | addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.5.6")
 3 | 
 4 | // web plugins
 5 | 
 6 | addSbtPlugin("com.typesafe.sbt" % "sbt-coffeescript" % "1.0.0")
 7 | 
 8 | addSbtPlugin("com.typesafe.sbt" % "sbt-less" % "1.1.0")
 9 | 
10 | addSbtPlugin("com.typesafe.sbt" % "sbt-jshint" % "1.0.3")
11 | 
12 | addSbtPlugin("com.typesafe.sbt" % "sbt-rjs" % "1.0.7")
13 | 
14 | addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.0")
15 | 
16 | addSbtPlugin("com.typesafe.sbt" % "sbt-mocha" % "1.1.0")
17 | 
18 | addSbtPlugin("org.irundaia.sbt" % "sbt-sassify" % "1.4.2")
19 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/LICENSE:
--------------------------------------------------------------------------------
1 | This software is licensed under the Apache 2 license, quoted below.
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this project except in compliance with
4 | the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
5 | 
6 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
7 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
8 | language governing permissions and limitations under the License.


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/FetcherCache.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.fetcher
 2 | 
 3 | import com.young.crawler.cache.Cache
 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 5 | 
 6 | /**
 7 |  * Created by young.yang on 2016/9/2.
 8 |  * Page cache used for de-duplication while crawling.
 9 |  */
10 | private[crawler] object FetcherCache {
11 |   // The implementation class name is read from configuration and instantiated reflectively
12 |   // through its no-argument constructor (for example a MapCache[String,Byte]).
13 |   private val implClassName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_cache_imp)
14 |   val fetcherCache: Cache[String, Byte] = Class.forName(implClassName).newInstance().asInstanceOf[Cache[String, Byte]]
15 | }
16 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/views/index.scala.html:
--------------------------------------------------------------------------------
 1 | @*
 2 |  * This template takes a single argument, a String containing a
 3 |  * message to display.
 4 |  *@
 5 | @(message: String)
 6 | 
 7 | @*
 8 |  * Call the `main` template with two arguments. The first
 9 |  * argument is a `String` with the title of the page, the second
10 |  * argument is an `Html` object containing the body of the page.
11 |  *@
12 | @main("Welcome to Play") {
13 | 
14 |     @*
15 |      * Get an `Html` object by calling the built-in Play welcome
16 |      * template and passing a `String` message.
17 |      *@
18 |     @play20.welcome(message, style = "Scala")
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/test/IntegrationSpec.scala:
--------------------------------------------------------------------------------
 1 | import org.scalatestplus.play._
 2 | import play.api.test._
 3 | import play.api.test.Helpers._
 4 | 
 5 | /**
 6 |  * add your integration spec here.
 7 |  * An integration test will fire up a whole play application in a real (or headless) browser
 8 |  */
 9 | class IntegrationSpec extends PlaySpec with OneServerPerTest with OneBrowserPerTest with HtmlUnitFactory {
10 | 
11 |   "Application" should {
12 | 
13 |     "work from within a browser" in {
14 |       // Open the root URL. `port` is not defined in this file — presumably supplied by a mixed-in trait (verify).
15 |       go to ("http://localhost:" + port)
16 |       // The expected text is the same literal that HomeController.index passes to the index view.
17 |       pageSource must include ("Your new application is ready.")
18 |     }
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/PageIndexEntity.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.entity
 2 | 
 3 | import scala.beans.BeanProperty
 4 | 
 5 | /**
 6 |  * Created by dell on 2016/8/31.
 7 |  * Index information for one page: a mutable bean whose fields are all `var`s annotated with @BeanProperty.
 8 |  */
 9 | class PageIndexEntity {
10 |   @BeanProperty
11 |   var url: String = ""
12 |   @BeanProperty
13 |   var title: String = ""
14 |   @BeanProperty
15 |   var content: String = ""
16 |   @BeanProperty
17 |   var publishTime: Long = 0
18 |   @BeanProperty
19 |   var updateTime: Long = 0
20 |   @BeanProperty
21 |   var author: String = ""
22 |   @BeanProperty
23 |   var keywords:String =""
24 |   @BeanProperty
25 |   var desc:String = ""
26 | }
27 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/utils/IOUtil.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.utils
 2 | 
 3 | import java.io.{BufferedReader, InputStream, InputStreamReader}
 4 | 
 5 | /**
 6 |  * Created by young.yang on 2016/8/28.
 7 |  */
 8 | private [crawler] object IOUtil {
 9 | 
10 |   /**
11 |    * Reads all of `inputStream` as text in the `encode` charset, line by line.
12 |    * Every line read is followed by "\n" in the result, whatever terminator the input used.
13 |    * The reader (and with it the underlying stream) is closed once reading finishes or fails.
14 |    */
15 |   def toString(inputStream:InputStream,encode:String):String={
16 |     val reader = new BufferedReader(new InputStreamReader(inputStream,encode))
17 |     try {
18 |       val text = new StringBuilder(1000)
19 |       var line = reader.readLine()
20 |       while(line!=null){
21 |         text.append(line).append('\n')
22 |         line = reader.readLine()
23 |       }
24 |       text.toString()
25 |     } finally {
26 |       reader.close() // previously never closed, leaking the stream on every call
27 |     }
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/actor/ActorSelectorExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.actor
 2 | 
 3 | import akka.actor.{Props, ActorSystem}
 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 5 | 
 6 | /**
 7 |  * Created by young.yang on 2016/9/8.
 8 |  */
 9 | object ActorSelectorExample {
10 | 
11 |   def main(args: Array[String]): Unit = {
12 |     val system = ActorSystem(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName))
13 |     val actor = system.actorOf(Props[ActorExample],"print")
14 |     actor!"test"
15 |     println(actor)
16 |     val actor2 = system.actorSelection(s"akka://${system.name}/user/print") // the system name comes from config, so the lookup path must not hard-code it
17 |     actor2!"222"
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/conf/routes:
--------------------------------------------------------------------------------
 1 | # Routes
 2 | # This file defines all application routes (Higher priority routes first)
 3 | # ~~~~
 4 | 
 5 | # An example controller showing a sample home page
 6 | GET     /                           controllers.HomeController.index
 7 | # An example controller showing how to use dependency injection
 8 | GET     /count                      controllers.CountController.count
 9 | # An example controller showing how to write asynchronous code
10 | GET     /message                    controllers.AsyncController.message
11 | 
12 | # Map static resources from the /public folder to the /assets URL path
13 | GET     /assets/*file               controllers.Assets.versioned(path="/public", file: Asset)
14 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/HomeController.scala:
--------------------------------------------------------------------------------
 1 | package controllers
 2 | 
 3 | import javax.inject._
 4 | import play.api._
 5 | import play.api.mvc._
 6 | 
 7 | /**
 8 |  * Controller for the application's home page; its single `Action`
 9 |  * answers the HTTP requests routed to it.
10 |  */
11 | @Singleton
12 | class HomeController @Inject() extends Controller {
13 | 
14 |   /**
15 |    * Action behind `GET /` (as wired in the `routes` file).
16 |    * Renders the index view with a fixed welcome message and
17 |    * wraps the rendered HTML in an `Ok` result.
18 |    * The greeting text is also what the integration spec looks for.
19 |    */
20 |   def index = Action {
21 |     val greeting = "Your new application is ready."
22 |     Ok(views.html.index(greeting))
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/indexer/Indexer.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.indexer
 2 | 
 3 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants}
 4 | import com.young.crawler.entity.{HttpPage, IndexResult}
 5 | 
 6 | /**
 7 |  * Created by dell on 2016/8/29.
 8 |  * Indexing interface.
 9 |  */
10 | trait Indexer {
11 | 
12 |   /**
13 |    * Indexes one document.
14 |    * @param page the page to index
15 |    * @return the IndexResult produced for that page
16 |    */
17 |   def index(page: HttpPage): IndexResult
18 | }
19 | 
20 | /**
21 |  * Index name and index type used in ES, both read from the crawler configuration.
22 |  */
23 | object IndexerConstants {
24 |   val indexName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_name)
25 |   val indexType = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_type)
26 | }
27 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/support/MapCache.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.cache.support
 2 | 
 3 | import com.young.crawler.cache.Cache
 4 | 
 5 | import scala.collection.immutable.Nil
 6 | import scala.collection.mutable
 7 | 
 8 | /**
 9 |  * Created by dell on 2016/9/2.
10 |  * Cache implemented on top of a local mutable HashMap.
11 |  */
12 | private[crawler] class MapCache[KEY,VALUE] extends Cache[KEY,VALUE]{
13 | 
14 |   private val entries = mutable.HashMap.empty[KEY,VALUE]
15 | 
16 |   override def contains(key: KEY): Boolean = { entries.contains(key) }
17 | 
18 |   override def get(key: KEY): Option[VALUE] = { entries.get(key) }
19 | 
20 |   override def put(key:KEY,value:VALUE): Unit = { entries.update(key, value) }
21 | 
22 |   override def size(): Int = { entries.size }
23 | 
24 |   override def keys(): scala.collection.Set[KEY] = { entries.keySet }
25 | }
26 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/CountController.scala:
--------------------------------------------------------------------------------
 1 | package controllers
 2 | 
 3 | import javax.inject._
 4 | import play.api._
 5 | import play.api.mvc._
 6 | 
 7 | import services.Counter
 8 | 
 9 | /**
10 |  * Shows dependency injection in a controller: the [[Counter]] passed
11 |  * to the constructor is supplied by the Guice dependency injection
12 |  * system, and the `count` action reports successive values
13 |  * obtained from it.
14 |  */
15 | @Singleton
16 | class CountController @Inject() (counter: Counter) extends Controller {
17 | 
18 |   /**
19 |    * Action behind `GET /count` (see the `routes` config file). Each
20 |    * request asks the [[Counter]] for its next count; the reply is plain text.
21 |    */
22 |   def count = Action {
23 |     Ok(counter.nextCount().toString)
24 |   }
25 | }
26 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/CounterEntity.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.entity
 2 | 
 3 | /**
 4 |  * Created by young.yang on 2016/9/3. Messages sent to the counting actor by the crawler tasks.
 5 |  */
 6 | sealed trait Counter
 7 | // Sent by FetchActorTask after every fetchPage call, whatever its outcome.
 8 | case class FetchCounter(num: Int) extends Counter
 9 | // Sent by FetchActorTask when fetchPage returned a result.
10 | case class FetchOk(num: Int) extends Counter
11 | // Sent by FetchActorTask when fetchPage returned no result.
12 | case class FetchError(num: Int) extends Counter
13 | // NOTE(review): presumably counts injected urls; the sender is not visible here - confirm.
14 | case class InjectCounter(num: Int) extends Counter
15 | // Sent by ParseActorTask after a fetched page has been parsed.
16 | case class ParseCounter(num: Int) extends Counter
17 | // NOTE(review): presumably counts child urls found while parsing - confirm against ParseActorTask.
18 | case class ParseChildUrlCounter(num: Int) extends Counter
19 | // Sent by IndexActorTask after indexer.index(page) has returned.
20 | case class IndexCounter(num: Int) extends Counter
21 | // NOTE(review): looks like a request to print the current totals - confirm in the counter actor.
22 | case object PrintCounter extends Counter
23 | // NOTE(review): looks like a request for a snapshot of all counters - confirm in the counter actor.
24 | case object GetAllCounter extends Counter
25 | // Bundles one value of every counter type; presumably the reply to GetAllCounter - confirm.
26 | case class AllCounter(fetchCounter: FetchCounter, fetchOk: FetchOk, fetchError: FetchError, injectCounter: InjectCounter, parseCounter: ParseCounter, parseChildUrlCounter: ParseChildUrlCounter, indexCounter: IndexCounter) extends Counter
27 | 
28 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/parser/JsoupExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.parser
 2 | 
 3 | import com.young.crawler.entity.{SeedType, UrlInfo}
 4 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
 5 | import com.young.crawler.spider.parser.support.JsoupParser
 6 | 
 7 | /**
 8 |  * Created by dell on 2016/9/1.
 9 |  * Manual smoke test: fetches a page twice and, when the first fetch yields a result, parses and prints it.
10 |  */
11 | object JsoupExample {
12 |   def parserHtml(url: UrlInfo): Unit = {
13 |     val fetcher = new HttpClientFetcher
14 |     val parser = new JsoupParser
15 |     val page = fetcher.fetchPage(url)
16 |     println(page)
17 |     val page1 = fetcher.fetchPage(url)
18 |     println(page1)
19 |     page.foreach { fetched => // fetchPage returns an Option; .get on None would throw NoSuchElementException
20 |       val result = parser.parse(fetched)
21 |       println(result.keywords)
22 |       println(result.desc)
23 |       result.childLink._1.foreach(println _)
24 |     }
25 |   }
26 | 
27 |   def main(args: Array[String]): Unit = {
28 |     JsoupExample.parserHtml(UrlInfo("http://bj.fang.com/", "", SeedType, 0))
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/services/Counter.scala:
--------------------------------------------------------------------------------
 1 | package services
 2 | 
 3 | import java.util.concurrent.atomic.AtomicInteger
 4 | import javax.inject._
 5 | 
 6 | /**
 7 |  * This trait demonstrates how to create a component that is injected
 8 |  * into a controller. The trait represents a counter that returns an
 9 |  * incremented number each time it is called.
10 |  */
11 | trait Counter {
12 |   def nextCount(): Int
13 | }
14 | 
15 | /**
16 |  * Default [[Counter]] implementation, backed by an `AtomicInteger`.
17 |  * The [[Module]] class binds it to [[Counter]] for Guice dependency
18 |  * injection.
19 |  *
20 |  * The `Singleton` annotation keeps a single counter per application;
21 |  * without it a new instance would be created every time a
22 |  * [[Counter]] is injected, and each one would start counting from
23 |  * zero again.
24 |  */
25 | @Singleton
26 | class AtomicCounter extends Counter {
27 |   private val issued = new AtomicInteger(0)
28 |   override def nextCount(): Int = issued.getAndIncrement()
29 | }
30 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/views/main.scala.html:
--------------------------------------------------------------------------------
 1 | @*
 2 |  * This template is called from the `index` template. This template
 3 |  * handles the rendering of the page header and body tags. It takes
 4 |  * two arguments, a `String` for the title of the page and an `Html`
 5 |  * object to insert into the body of the page.
 6 |  *@
 7 | @(title: String)(content: Html)
 8 | 
 9 | <!DOCTYPE html>
10 | <html lang="en">
11 |     <head>
12 |         @* Here's where we render the page title `String`. *@
13 |         <title>@title</title>
14 |         <link rel="stylesheet" media="screen" href="@routes.Assets.versioned("stylesheets/main.css")">
15 |         <link rel="shortcut icon" type="image/png" href="@routes.Assets.versioned("images/favicon.png")">
16 |         <script src="@routes.Assets.versioned("javascripts/hello.js")" type="text/javascript"></script>
17 |     </head>
18 |     <body>
19 |         @* And here's where we render the `Html` object containing
20 |          * the page content. *@
21 |         @content
22 |     </body>
23 | </html>
24 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/IndexActorTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task.support.actor
 2 | 
 3 | import akka.actor.Actor
 4 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 5 | import com.young.crawler.entity.{IndexCounter, HttpPage}
 6 | import com.young.crawler.spider.indexer.Indexer
 7 | import com.young.crawler.spider.task.IndexTask
 8 | 
 9 | /**
10 |  * Created by dell on 2016/8/29.
11 |  * Indexing task: indexes each received page and reports it to the counter actor.
12 |  */
13 | private[crawler] class IndexActorTask(indexer: Indexer) extends Actor with IndexTask {
14 | 
15 |   // Selection of the counter actor at akka://<appName>/user/<counter actor name>; both names come from the crawler config.
16 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
17 | 
18 |   // Index each incoming page, then report one indexed page to the counter actor.
19 |   // Messages of any other type are not handled by this partial function.
20 |   override def receive: Receive = {
21 |     case page: HttpPage =>
22 |       indexer.index(page)
23 |       countActor ! IndexCounter(1)
24 |   }
25 | }
26 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfig.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.config
 2 | 
 3 | import java.util.{Locale, ResourceBundle}
 4 | 
 5 | import org.apache.commons.logging.LogFactory
 6 | 
 7 | /**
 8 |  * Created by young.yang on 2016/9/3.
 9 |  * Access point for the settings in the "crawler" resource bundle.
10 |  */
11 | private[crawler] object CrawlerConfig {
12 | 
13 |   private val log = LogFactory.getLog("com.young.crawler.config.CrawlerConfig")
14 | 
15 |   private val config = ResourceBundle.getBundle("crawler", Locale.getDefault)
16 | 
17 |   // The bundle, handed out only after its contents have been logged once. A lazy
18 |   // val is initialised at most once even when several threads race on the first
19 |   // access, which an unsynchronised boolean flag does not guarantee.
20 |   private lazy val loggedConfig: ResourceBundle = {
21 |     log.info("init crawler config start")
22 |     val iterator = config.keySet().iterator()
23 |     while (iterator.hasNext) {
24 |       val key = iterator.next()
25 |       // Never write credentials (e.g. the redis password) to the log in clear text.
26 |       val shown = if (key.toLowerCase(Locale.ROOT).contains("password")) "******" else config.getString(key)
27 |       log.info("crawler config key = [" + key + "] value = [" + shown + "]")
28 |     }
29 |     log.info("init crawler config end")
30 |     config
31 |   }
32 | 
33 |   /**
34 |    * Returns the crawler settings; the first call also logs every key and its value.
35 |    */
36 |   def getConfig = loggedConfig
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/Fetcher.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.fetcher
 2 | 
 3 | import com.young.crawler.entity.{UrlInfo, HttpResult}
 4 | import com.young.crawler.exception.FetchException
 5 | import org.apache.http.Header
 6 | 
 7 | /**
 8 |  * Created by young.yang on 2016/8/28.
 9 |  * Page-fetching interface.
10 |  */
11 | trait Fetcher {
12 |   private val CONTENT_TYPE = "Content-Type"
13 | 
14 |   private val DEFAULT_ENCODE = "utf-8"
15 | 
16 |   val FETCH_SUCCESS = 200
17 | 
18 |   val URL_NOT_FOUND = 404
19 | 
20 |   /**
21 |    * Entry point for fetching a page.
22 |    */
23 |   @throws[FetchException]
24 |   def fetchPage(url:UrlInfo):Option[HttpResult]
25 | 
26 |   /**
27 |    * Detects the page encoding from the `charset` parameter of the Content-Type header.
28 |    * Header and parameter names are matched case-insensitively; surrounding quotes are stripped.
29 |    * @param headers response headers; null and empty arrays are accepted
30 |    * @return the declared charset, or "utf-8" when no charset parameter is present
31 |    */
32 |   def getEncode(headers:Array[Header]):String={
33 |     val declared = for {
34 |       header <- Option(headers).map(_.toSeq).getOrElse(Seq.empty)
35 |       if header != null && CONTENT_TYPE.equalsIgnoreCase(header.getName) && header.getValue != null
36 |       param <- header.getValue.split(";").map(_.trim).toSeq
37 |       if param.regionMatches(true, 0, "charset=", 0, 8) // other parameters (e.g. boundary=) are not encodings
38 |       charset = param.substring(8).trim.stripPrefix("\"").stripSuffix("\"").trim
39 |       if charset.nonEmpty
40 |     } yield charset
41 |     declared.headOption.getOrElse(DEFAULT_ENCODE)
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/Module.scala:
--------------------------------------------------------------------------------
 1 | import com.google.inject.AbstractModule
 2 | import java.time.Clock
 3 | 
 4 | import services.{ApplicationTimer, AtomicCounter, Counter}
 5 | 
 6 | /**
 7 |  * This class is a Guice module that tells Guice how to bind several
 8 |  * different types. This Guice module is created when the Play
 9 |  * application starts.
10 |  *
11 |  * Play will automatically use any class called `Module` that is in
12 |  * the root package. You can create modules in other locations by
13 |  * adding `play.modules.enabled` settings to the `application.conf`
14 |  * configuration file.
15 |  */
16 | class Module extends AbstractModule {
17 | 
18 |   override def configure() = {
19 |     // Use the system clock as the default implementation of Clock
20 |     bind(classOf[Clock]).toInstance(Clock.systemDefaultZone)
21 |     // Ask Guice to create an instance of ApplicationTimer when the
22 |     // application starts.
23 |     bind(classOf[ApplicationTimer]).asEagerSingleton()
24 |     // Set AtomicCounter as the implementation for Counter.
25 |     bind(classOf[Counter]).to(classOf[AtomicCounter])
26 |   }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/Filters.scala:
--------------------------------------------------------------------------------
 1 | import javax.inject._
 2 | import play.api._
 3 | import play.api.http.HttpFilters
 4 | import play.api.mvc._
 5 | 
 6 | import filters.ExampleFilter
 7 | 
 8 | /**
 9 |  * This class configures filters that run on every request. This
10 |  * class is queried by Play to get a list of filters.
11 |  *
12 |  * Play will automatically use filters from any class called
13 |  * `Filters` that is placed in the root package. You can load filters
14 |  * from a different class by adding a `play.http.filters` setting to
15 |  * the `application.conf` configuration file.
16 |  *
17 |  * @param env Basic environment settings for the current application.
18 |  * @param exampleFilter A demonstration filter that adds a header to
19 |  * each response.
20 |  */
21 | @Singleton
22 | class Filters @Inject() (
23 |   env: Environment,
24 |   exampleFilter: ExampleFilter) extends HttpFilters {
25 | 
26 |   override val filters = {
27 |     // Use the example filter if we're running development mode. If
28 |     // we're running in production or test mode then don't use any
29 |     // filters at all.
30 |     if (env.mode == Mode.Dev) Seq(exampleFilter) else Seq.empty
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/test/scala/com/young/crawler/indexer/Elastic4sExample.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.indexer
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import com.young.crawler.entity.PageIndexEntity
 6 | import com.young.crawler.spider.indexer.IndexerConstants
 7 | import com.young.crawler.utils.{JsonUtil, MD5Util}
 8 | import org.elasticsearch.client.transport.TransportClient
 9 | import org.elasticsearch.common.transport.InetSocketTransportAddress
10 | 
11 | /**
12 |  * Created by young.yang on 2016/8/30.
13 |  * Manual example: indexes one hand-built page into Elasticsearch through a TransportClient.
14 |  */
15 | object Elastic4sExample {
16 |   val client = TransportClient.builder().build().addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName("115.29.47.216"), 9300))
17 |   def main(args: Array[String]): Unit = {
18 |     val page = new PageIndexEntity
19 |     page.setAuthor("杨勇")
20 |     page.setContent("中华人民共和過")
21 |     page.setTitle("测试")
22 |     page.setUrl("http://www.baidu.com/1")
23 |     page.setPublishTime(System.currentTimeMillis())
24 |     page.setUpdateTime(System.currentTimeMillis())
25 |     try client.prepareIndex(IndexerConstants.indexName,IndexerConstants.indexType).setId(MD5Util.md5(page.getUrl)).setSource(JsonUtil.toJson(page)).get()
26 |     finally client.close() // release the client's connections and threads even when indexing fails
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/filters/ExampleFilter.scala:
--------------------------------------------------------------------------------
 1 | package filters
 2 | 
 3 | import akka.stream.Materializer
 4 | import javax.inject._
 5 | import play.api.mvc._
 6 | import scala.concurrent.{ExecutionContext, Future}
 7 | 
 8 | /**
 9 |  * This is a simple filter that adds a header to all responses. It's
10 |  * added to the application's list of filters by the
11 |  * [[Filters]] class.
12 |  *
13 |  * @param mat This object is needed to handle streaming of requests
14 |  * and responses.
15 |  * @param exec This class is needed to execute code asynchronously.
16 |  * It is used below by the `map` method.
17 |  */
18 | @Singleton
19 | class ExampleFilter @Inject()(
20 |     implicit override val mat: Materializer,
21 |     exec: ExecutionContext) extends Filter {
22 | 
23 |   override def apply(nextFilter: RequestHeader => Future[Result])
24 |            (requestHeader: RequestHeader): Future[Result] = {
25 |     // Run the next filter in the chain. This will call other filters
26 |     // and eventually call the action. Take the result and modify it
27 |     // by adding a new header.
28 |     nextFilter(requestHeader).map { result =>
29 |       result.withHeaders("X-ExampleFilter" -> "foo")
30 |     }
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/test/ApplicationSpec.scala:
--------------------------------------------------------------------------------
 1 | import org.scalatestplus.play._
 2 | import play.api.test._
 3 | import play.api.test.Helpers._
 4 | 
 5 | /**
 6 |  * Route-level specs for the application: an unknown path, the index
 7 |  * page and the `/count` endpoint. `OneAppPerTest` is mixed in, and
 8 |  * every spec sends its requests through `route(app, ...)`.
 9 |  */
10 | class ApplicationSpec extends PlaySpec with OneAppPerTest {
11 | 
12 |   "Routes" should {
13 | 
14 |     "send 404 on a bad request" in  {
15 |       route(app, FakeRequest(GET, "/boum")).map(status(_)) mustBe Some(NOT_FOUND)
16 |     }
17 | 
18 |   }
19 | 
20 |   "HomeController" should {
21 | 
22 |     "render the index page" in {
23 |       val home = route(app, FakeRequest(GET, "/")).get
24 | 
25 |       status(home) mustBe OK
26 |       contentType(home) mustBe Some("text/html")
27 |       contentAsString(home) must include ("Your new application is ready.")
28 |     }
29 | 
30 |   }
31 | 
32 |   "CountController" should {
33 | 
34 |     "return an increasing count" in {
35 |       contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "0"
36 |       contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "1"
37 |       contentAsString(route(app, FakeRequest(GET, "/count")).get) mustBe "2"
38 |     }
39 | 
40 |   }
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/README:
--------------------------------------------------------------------------------
 1 | This is your new Play application
 2 | =================================
 3 | 
 4 | This file will be packaged with your application when using `activator dist`.
 5 | 
 6 | There are several demonstration files available in this template.
 7 | 
 8 | Controllers
 9 | ===========
10 | 
11 | - HomeController.scala:
12 | 
13 |   Shows how to handle simple HTTP requests.
14 | 
15 | - AsyncController.scala:
16 | 
17 |   Shows how to do asynchronous programming when handling a request.
18 | 
19 | - CountController.scala:
20 | 
21 |   Shows how to inject a component into a controller and use the component when
22 |   handling requests.
23 | 
24 | Components
25 | ==========
26 | 
27 | - Module.scala:
28 | 
29 |   Shows how to use Guice to bind all the components needed by your application.
30 | 
31 | - Counter.scala:
32 | 
33 |   An example of a component that contains state, in this case a simple counter.
34 | 
35 | - ApplicationTimer.scala:
36 | 
37 |   An example of a component that starts when the application starts and stops
38 |   when the application stops.
39 | 
40 | Filters
41 | =======
42 | 
43 | - Filters.scala:
44 | 
45 |   Creates the list of HTTP filters used by your application.
46 | 
47 | - ExampleFilter.scala:
48 | 
49 |   A simple filter that adds a header to every response.


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/config/CrawlerConfigContants.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.config
 2 | 
 3 | /**
 4 |  * Created by young.yang on 2016/9/3. Names of the keys defined in crawler.properties.
 5 |  */
 6 | private[crawler] object CrawlerConfigContants {
 7 |   val young_crawler_appName = "young.crawler.appName"
 8 |   val young_crawler_task_inject_name = "young.crawler.task.inject.name"
 9 |   val young_crawler_task_fetch_name = "young.crawler.task.fetch.name"
10 |   val young_crawler_task_parse_name = "young.crawler.task.parse.name"
11 |   val young_crawler_task_index_name = "young.crawler.task.index.name"
12 |   val young_crawler_task_count_name = "young.cralwer.task.count.name" // NOTE: "cralwer" is misspelled, but it matches the key in crawler.properties; change both together
13 |   val young_crawler_task_seed_path = "young.crawler.task.seed.path"
14 |   val young_crawler_task_parallel_int = "young.crawler.task.parallel.int"
15 |   val young_crawler_fetcher_cache_imp = "young.crawler.fetcher.cache.imp"
16 |   val young_crawler_fetcher_timeout = "young.crawler.fetcher.timeout"
17 |   val young_crawler_fetcher_useragent = "young.crawler.fetcher.useragent"
18 |   val young_crawler_indexer_es_host = "young.crawler.indexer.es.host"
19 |   val young_crawler_indexer_es_port = "young.crawler.indexer.es.port"
20 |   val young_crawler_indexer_es_name = "young.crawler.indexer.es.name"
21 |   val young_crawler_indexer_es_type = "young.crawler.indexer.es.type"
22 |   val young_cralwer_fetcher_friendtime = "young.cralwer.fetcher.friendtime" // NOTE: same "cralwer" misspelling, in the val name and in the key; the key matches crawler.properties
23 |   val young_crawler_fetcher_deep = "young.crawler.fetcher.deep"
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/controllers/AsyncController.scala:
--------------------------------------------------------------------------------
 1 | package controllers
 2 | 
 3 | import akka.actor.ActorSystem
 4 | import javax.inject._
 5 | import play.api._
 6 | import play.api.mvc._
 7 | import scala.concurrent.{ExecutionContext, Future, Promise}
 8 | import scala.concurrent.duration._
 9 | 
10 | /**
11 |  * This controller creates an `Action` that demonstrates how to write
12 |  * simple asynchronous code in a controller. It uses a timer to
13 |  * asynchronously delay sending a response for 1 second.
14 |  *
15 |  * @param actorSystem We need the `ActorSystem`'s `Scheduler` to
16 |  * run code after a delay.
17 |  * @param exec We need an `ExecutionContext` to execute our
18 |  * asynchronous code.
19 |  */
20 | @Singleton
21 | class AsyncController @Inject() (actorSystem: ActorSystem)(implicit exec: ExecutionContext) extends Controller {
22 | 
23 |   /**
24 |    * Builds an Action that answers with a plain text message once a
25 |    * delay of 1 second has elapsed.
26 |    *
27 |    * Per the `routes` file, this method is invoked when the
28 |    * application receives a `GET` request whose path is
29 |    * `/message`.
30 |    */
31 |   def message = Action.async {
32 |     getFutureMessage(1.second).map(text => Ok(text))
33 |   }
34 |   // Future that is completed with "Hi!" by the actor system's scheduler after delayTime.
35 |   private def getFutureMessage(delayTime: FiniteDuration): Future[String] = {
36 |     val delayed = Promise[String]()
37 |     actorSystem.scheduler.scheduleOnce(delayTime) { delayed.success("Hi!") }
38 |     delayed.future
39 |   }
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/resources/crawler.properties:
--------------------------------------------------------------------------------
 1 | # appName: name of the akka ActorSystem
 2 | young.crawler.appName=young-crawler
 3 | # inject task actor name
 4 | young.crawler.task.inject.name=young-injector
 5 | # fetcher task actor name
 6 | young.crawler.task.fetch.name=young-fetcher
 7 | # parse task actor name
 8 | young.crawler.task.parse.name=young-parser
 9 | # index task actor name
10 | young.crawler.task.index.name=young-indexr
11 | # counter task actor name ("cralwer" is misspelled in this key and in CrawlerConfigContants; rename both together)
12 | young.cralwer.task.count.name=young-count
13 | # seed config
14 | young.crawler.task.seed.path=classpath:/seeds.txt
15 | # degree of parallelism
16 | young.crawler.task.parallel.int=5
17 | # timeout for fetching a url
18 | young.crawler.fetcher.timeout=5000
19 | # politeness interval between page requests ("cralwer" is misspelled in this key and in CrawlerConfigContants; rename both together)
20 | young.cralwer.fetcher.friendtime=1000
21 | # crawl depth
22 | young.crawler.fetcher.deep=1
23 | # user agent sent when fetching pages
24 | young.crawler.fetcher.useragent=Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2
25 | # elasticsearch host used when indexing pages
26 | young.crawler.indexer.es.host=115.29.47.216
27 | # es port
28 | young.crawler.indexer.es.port=9300
29 | # es index name
30 | young.crawler.indexer.es.name=page
31 | # es index type
32 | young.crawler.indexer.es.type=html
33 | # implementation class used for url de-duplication
34 | young.crawler.fetcher.cache.imp=com.young.crawler.cache.support.RedisCache
35 | # cache time for url de-duplication, in seconds
36 | young.crawler.fetcher.cache.timeout=100
37 | # redis config
38 | young.crawler.fetcher.cache.redis.host=115.29.47.216
39 | young.crawler.fetcher.cache.redis.port=6379
40 | young.crawler.fetcher.cache.redis.password=
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/FetchActorTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task.support.actor
 2 | 
 3 | import akka.actor.{ActorRef, Actor}
 4 | import akka.event.Logging
 5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 6 | import com.young.crawler.entity.{FetchError, FetchOk, FetchCounter, UrlInfo}
 7 | import com.young.crawler.spider.fetcher.Fetcher
 8 | import com.young.crawler.spider.task.{FetchTask, ParserTask}
 9 | 
10 | /**
11 |  * Created by young.yang on 2016/8/28.
12 |  * Page-fetching task, implemented as an Actor.
13 |  */
14 | private[crawler] class FetchActorTask(fetcher: Fetcher, parserTask: ActorRef) extends Actor with FetchTask {
15 | 
16 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
17 | 
18 |   private val log = Logging(context.system, this)
19 | 
20 |   private var injector: ActorRef = null // sender of the latest fetch request; null until the first one arrives
21 | 
22 |   override def receive: Receive = {
23 |     // Handle a fetch request.
24 |     case page: UrlInfo =>
25 |       injector = sender()
26 |       val httpResult = fetcher.fetchPage(page)
27 |       countActor ! FetchCounter(1)
28 |       if (httpResult.isDefined) {
29 |         parserTask ! httpResult.get
30 |         log.info("FetcherTask send parserTask a httpResult [" + httpResult + "]")
31 |         countActor ! FetchOk(1)
32 |       } else countActor ! FetchError(1)
33 |     // Forward child urls produced by parsing to the injector so that they are fetched as well.
34 |     // Element types are erased at runtime, so only List itself can be matched here.
35 |     case urls: List[_] if injector != null => injector ! urls
36 |     case _: List[_] => log.warning("FetcherTask dropped child urls because no fetch request has been received yet")
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/cache/support/RedisCache.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.cache.support
 2 | 
 3 | import com.young.crawler.cache.Cache
 4 | import com.young.crawler.config.CrawlerConfig
 5 | import redis.clients.jedis.JedisPool
 6 | 
 7 | /**
 8 |  * Created by dell on 2016/9/2.
 9 |  * Cache implementation backed by Redis.
10 |  */
11 | private[crawler] class RedisCache[KEY, VALUE] extends Cache[KEY, VALUE] {
12 | 
13 |   private val JEDIS_HOST = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.host")
14 | 
15 |   private val JEDIS_PORT = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.port").toInt
16 |   // NOTE(review): read from the config but never handed to the pool below - confirm whether authentication is intended.
17 |   private val JEDIS_PASS = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.redis.password")
18 |   // Expiry applied to every entry written by put; crawler.properties documents the unit as seconds.
19 |   private val expire = CrawlerConfig.getConfig.getString("young.crawler.fetcher.cache.timeout").toInt
20 | 
21 |   private val jedisPool = new JedisPool(JEDIS_HOST, JEDIS_PORT)
22 | 
23 |   /**
24 |    * Borrows a connection from the pool, runs `op` on it and closes it again, even when `op` throws.
25 |    */
26 |   private def withJedis[T](op: redis.clients.jedis.Jedis => T): T = {
27 |     val jedis = jedisPool.getResource
28 |     try op(jedis)
29 |     finally jedis.close()
30 |   }
31 | 
32 |   /** True when Redis holds a key equal to `key.toString`. */
33 |   override def contains(key: KEY): Boolean = withJedis[Boolean](jedis => jedis.exists(key.toString))
34 | 
35 |   /** The string stored under `key.toString`, cast to VALUE; None when Redis returns null. */
36 |   override def get(key: KEY): Option[VALUE] = withJedis[Option[VALUE]](jedis => Option(jedis.get(key.toString).asInstanceOf[VALUE]))
37 | 
38 |   /** Stores `value.toString` under `key.toString` with the configured expiry. */
39 |   override def put(key:KEY,value:VALUE): Unit = withJedis[Unit](jedis => jedis.setex(key.toString,expire, value.toString))
40 | 
41 |   /** Always 0: the number of cached entries is not queried from Redis. */
42 |   override def size(): Int = 0
43 | 
44 |   /** Not supported by this implementation; always throws. */
45 |   override def keys(): Set[KEY] = throw new Exception("unsupport operation")
46 | }
47 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/conf/logback.xml:
--------------------------------------------------------------------------------
 1 | <!-- https://www.playframework.com/documentation/latest/SettingsLogger -->
 2 | <configuration>
 3 | 
 4 |   <conversionRule conversionWord="coloredLevel" converterClass="play.api.libs.logback.ColoredLevel" />
 5 | 
 6 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
 7 |     <file>${application.home:-.}/logs/application.log</file>
 8 |     <encoder>
 9 |       <pattern>%date [%level] from %logger in %thread - %message%n%xException</pattern>
10 |     </encoder>
11 |   </appender>
12 | 
13 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
14 |     <encoder>
15 |       <pattern>%coloredLevel %logger{15} - %message%n%xException{10}</pattern>
16 |     </encoder>
17 |   </appender>
18 | 
19 |   <appender name="ASYNCFILE" class="ch.qos.logback.classic.AsyncAppender">
20 |     <appender-ref ref="FILE" />
21 |   </appender>
22 | 
23 |   <appender name="ASYNCSTDOUT" class="ch.qos.logback.classic.AsyncAppender">
24 |     <appender-ref ref="STDOUT" />
25 |   </appender>
26 | 
27 |   <logger name="play" level="INFO" />
28 |   <logger name="application" level="DEBUG" />
29 | 
30 |   <!-- Turn these loggers off: they are noisy, and we manage configuration ourselves anyway -->
31 |   <logger name="com.avaje.ebean.config.PropertyMapLoader" level="OFF" />
32 |   <logger name="com.avaje.ebeaninternal.server.core.XmlConfigLoader" level="OFF" />
33 |   <logger name="com.avaje.ebeaninternal.server.lib.BackgroundThread" level="OFF" />
34 |   <logger name="com.gargoylesoftware.htmlunit.javascript" level="OFF" />
35 | 
36 |   <root level="WARN">
37 |     <appender-ref ref="ASYNCFILE" />
38 |     <appender-ref ref="ASYNCSTDOUT" />
39 |   </root>
40 | 
41 | </configuration>
42 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/app/services/ApplicationTimer.scala:
--------------------------------------------------------------------------------
 1 | package services
 2 | 
 3 | import java.time.{Clock, Instant}
 4 | import javax.inject._
 5 | import play.api.Logger
 6 | import play.api.inject.ApplicationLifecycle
 7 | import scala.concurrent.Future
 8 | 
 9 | /**
10 |  * This class demonstrates how to run code when the
11 |  * application starts and stops. It starts a timer when the
12 |  * application starts. When the application stops it prints out how
13 |  * long the application was running for.
14 |  *
15 |  * This class is registered for Guice dependency injection in the
16 |  * [[Module]] class. We want the class to start when the application
17 |  * starts, so it is registered as an "eager singleton". See the code
18 |  * in the [[Module]] class to see how this happens.
19 |  *
20 |  * This class needs to run code when the server stops. It uses the
21 |  * application's [[ApplicationLifecycle]] to register a stop hook.
22 |  */
23 | @Singleton
24 | class ApplicationTimer @Inject() (clock: Clock, appLifecycle: ApplicationLifecycle) {
25 | 
26 |   // This code is called when the application starts.
27 |   private val start: Instant = clock.instant
28 |   Logger.info(s"ApplicationTimer demo: Starting application at $start.")
29 | 
30 |   // When the application starts, register a stop hook with the
31 |   // ApplicationLifecycle object. The code inside the stop hook will
32 |   // be run when the application stops.
33 |   appLifecycle.addStopHook { () =>
34 |     val stop: Instant = clock.instant
35 |     val runningTime: Long = stop.getEpochSecond - start.getEpochSecond
36 |     Logger.info(s"ApplicationTimer demo: Stopping application at $stop after ${runningTime}s.") // log the instant the duration was computed from
37 |     Future.successful(())
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/entity/HttpEntitys.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.entity
 2 | 
 3 | import java.net.{DatagramSocket, DatagramPacket}
 4 | 
 5 | import scala.beans.BeanProperty
 6 | 
 7 | /**
 8 |  * Created by young.yang on 2016/8/28.
 9 |  * The raw HTTP page as it came back from a fetch.
10 |  */
11 | case class HttpResult(status:Int,content:String,message:String,url:String,deep:Int){
12 |   override def toString()="status="+status+",context length="+content.length+",url="+url
13 | }
14 | 
15 | 
16 | sealed trait UrlType
17 | case object SeedType extends UrlType
18 | case object GenerateType extends UrlType
19 | /**
20 |  * A url to be crawled.
21 |  * @param url  url
22 |  * @param parent  parent url
23 |  */
24 | case class UrlInfo(url:String,parent:String,urlType: UrlType,deep:Int){
25 |   override def toString()=url+"\n"
26 | }
27 | 
28 | /**
29 |  * Result of an indexing operation.
30 |  * @param status
31 |  */
32 | case class IndexResult(status:Int)
33 | 
34 | /**
35 |  * A seed.
36 |  * @param url  seed url
37 |  */
38 | case class Seed(url:String){
39 |   override def toString() = url+"\n"
40 | }
41 | 
42 | /**
43 |  * Information parsed out of an HTTP page.
44 |  */
45 | class HttpPage{
46 |   @BeanProperty
47 |   var url: String = ""
48 |   @BeanProperty
49 |   var title: String = ""
50 |   @BeanProperty
51 |   var html:String = ""
52 |   @BeanProperty
53 |   var content: String = ""
54 |   @BeanProperty
55 |   var publishTime: Long = 0
56 |   @BeanProperty
57 |   var updateTime: Long = 0
58 |   @BeanProperty
59 |   var author: String = ""
60 |   @BeanProperty
61 |   var keywords:String = ""
62 |   @BeanProperty
63 |   var desc:String = ""
64 |   @BeanProperty
65 |   var childLink:(List[UrlInfo],Int) = (List(),0)
66 |   @BeanProperty
67 |   var meta:Map[String,String] = Map()
68 |   // Prints the url and the length of `content` only, not the page body itself.
69 |   override def toString()="url="+url+",context length="+content.length
70 | 
71 | }
72 | 
73 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/ParseActorTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task.support.actor
 2 | 
 3 | import akka.actor.{ActorRef, Actor}
 4 | import akka.event.Logging
 5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 6 | import com.young.crawler.entity.{ParseChildUrlCounter, ParseCounter, HttpPage, HttpResult}
 7 | import com.young.crawler.spider.parser.Parser
 8 | import com.young.crawler.spider.task.ParserTask
 9 | 
10 | /**
11 |  * Created by young.yang on 2016/8/28.
12 |  * Parse task: parses a fetched page, forwards it to the index task and
13 |  * sends the extracted child links back to whoever sent the page.
14 |  */
15 | private[crawler] class ParseActorTask(parser: Parser, indexTask: ActorRef) extends Actor with ParserTask {
16 | 
17 |   private val log = Logging(context.system, this)
18 | 
19 |   // Selection of the counter actor, addressed as akka://<appName>/user/<counter name>.
20 |   private val countActor = {
21 |     val systemName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName)
22 |     val counterName = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name)
23 |     context.system.actorSelection("akka://" + systemName + "/user/" + counterName)
24 |   }
25 | 
26 |   // Depth limit: child links are only handed back while their depth is below this value.
27 |   private val fetchDeep = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_deep).toInt
28 | 
29 |   override def receive: Receive = {
30 |     case httpResult: HttpResult =>
31 |       // The sender of the page is the one that receives the child links.
32 |       val origin = sender()
33 |       val page: HttpPage = parser.parse(httpResult)
34 |       indexTask ! page
35 |       countActor ! ParseCounter(1)
36 |       log.info("ParserTask send IndexerTask a index request -[" + page + "]")
37 |       val children = page.getChildLink
38 |       val depth = children._2
39 |       if (depth >= fetchDeep) {
40 |         log.info("fetch deep size now  is -["+depth+"] remove urls size -["+children._1.size+"]")
41 |       } else {
42 |         origin ! children._1
43 |         countActor ! ParseChildUrlCounter(children._1.size)
44 |       }
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/indexer/support/ElasticIndexer.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.indexer.support
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 6 | import com.young.crawler.entity.{PageIndexEntity, IndexResult, HttpPage}
 7 | import com.young.crawler.spider.indexer.{IndexerConstants, Indexer}
 8 | import com.young.crawler.utils.{JsonUtil, MD5Util}
 9 | import org.apache.commons.logging.LogFactory
10 | import org.elasticsearch.client.transport.TransportClient
11 | import org.elasticsearch.common.transport.InetSocketTransportAddress
12 | 
13 | /**
14 |  * Created by dell on 2016/8/29.
15 |  * Indexer that writes pages to Elasticsearch.
16 |  */
17 | private[crawler] class ElasticIndexer extends Indexer {
18 | 
19 |   private val log = LogFactory.getLog(classOf[ElasticIndexer])
20 | 
21 |   private val host = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_host)
22 | 
23 |   private val port = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_indexer_es_port).toInt
24 | 
25 |   // Transport client connected to the configured host and port.
26 |   private val client = {
27 |     val transport = TransportClient.builder().build()
28 |     transport.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port))
29 |   }
30 | 
31 |   /**
32 |    * Indexes the information of one web page.
33 |    * The document id is the MD5 of the page url.
34 |    * @param htmlpage the parsed page
35 |    * @return IndexResult(200) once the index request has returned
36 |    */
37 |   override def index(htmlpage: HttpPage): IndexResult = {
38 |     log.info("index page url "+htmlpage.getUrl+" page info -["+htmlpage+"]")
39 |     val entity = new PageIndexEntity
40 |     entity.setAuthor(htmlpage.getAuthor)
41 |     // entity.setContent(htmlpage.getContent)
42 |     entity.setTitle(htmlpage.getTitle)
43 |     entity.setUrl(htmlpage.getUrl)
44 |     entity.setPublishTime(htmlpage.getPublishTime)
45 |     entity.setUpdateTime(htmlpage.getUpdateTime)
46 |     entity.setKeywords(htmlpage.getKeywords)
47 |     entity.setDesc(htmlpage.getDesc)
48 |     val request = client.prepareIndex(IndexerConstants.indexName, IndexerConstants.indexType)
49 |     request.setId(MD5Util.md5(entity.getUrl)).setSource(JsonUtil.toJson(entity)).get()
50 |     IndexResult(200)
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/support/HttpClientFetcher.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.fetcher.support
 2 | 
 3 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants}
 4 | import com.young.crawler.entity.{GenerateType, HttpResult, UrlInfo}
 5 | import com.young.crawler.exception.FetchException
 6 | import com.young.crawler.spider.fetcher.{Fetcher, FetcherCache}
 7 | import com.young.crawler.utils.MD5Util
 8 | import org.apache.commons.logging.LogFactory
 9 | 
10 | /**
11 |  * Created by young.yang on 2016/8/28.
12 |  * Fetcher implemented with HttpClient (through HttpWatch).
13 |  */
14 | private[crawler] class HttpClientFetcher extends Fetcher {
15 | 
16 |   // Pause, in milliseconds, slept before every GET request.
17 |   private val friendtime = CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_cralwer_fetcher_friendtime).toLong
18 | 
19 |   private val log = LogFactory.getLog(classOf[HttpClientFetcher])
20 |   // Switches the shared HttpWatch object to prototype mode whenever a fetcher is created.
21 |   HttpWatch.WATCH_TYPE = HttpWatch.WATCH_TYPE_PROTOTYPE
22 | 
23 |   /**
24 |    * Fetches a single url.
25 |    *
26 |    * @param url the url to fetch
27 |    * @return None when a GenerateType url is already recorded in the fetcher cache,
28 |    *         otherwise the fetched page
29 |    * @throws FetchException when the response status is not FETCH_SUCCESS or the request fails
30 |    */
31 |   @throws[FetchException]
32 |   override def fetchPage(url: UrlInfo): Option[HttpResult] = {
33 |     val md5 = MD5Util.md5(url.url)
34 |     log.info("fetcher cache size -" + FetcherCache.fetcherCache.size())
35 |     // NOTE(review): if fetcherCache is a java.util Hashtable/ConcurrentHashMap, contains() tests
36 |     // values rather than keys and this check never matches - confirm whether containsKey is meant.
37 |     if (url.urlType == GenerateType && FetcherCache.fetcherCache.contains(md5)) {
38 |       log.info("url  -" + url + " is fetched ")
39 |       None
40 |     } else {
41 |       FetcherCache.fetcherCache.put(md5, 1)
42 |       download(url)
43 |     }
44 |   }
45 | 
46 |   /** Runs the header request and the GET, mapping every non-fatal failure to one FetchException. */
47 |   private def download(url: UrlInfo): Option[HttpResult] =
48 |     try {
49 |       val headers = HttpWatch.header(url.url)
50 |       val encode = getEncode(headers)
51 |       log.info("get " + url + " encode =" + encode)
52 |       val start = System.currentTimeMillis()
53 |       Thread.sleep(friendtime)
54 |       val result = HttpWatch.get(url, encode)
55 |       log.info("fetch url " + url + ", cost time -" + (System.currentTimeMillis() - start) + " content length -" + result.content.length)
56 |       if (result.status != FETCH_SUCCESS) {
57 |         throw new FetchException("fetch error code is -" + result.status + ",error url is " + url)
58 |       }
59 |       Option(result)
60 |     } catch {
61 |       // Already the right type: rethrow as is instead of wrapping it a second time.
62 |       case e: FetchException => throw e
63 |       // Keep the interrupt visible to the thread while still honouring the declared exception type.
64 |       case e: InterruptedException =>
65 |         Thread.currentThread().interrupt()
66 |         throw new FetchException("fetch error message error url is " + url, e)
67 |       // Fatal errors (OutOfMemoryError, ...) are deliberately not caught here.
68 |       case scala.util.control.NonFatal(e) =>
69 |         throw new FetchException("fetch error message error url is " + url, e)
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/parser/support/JsoupParser.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.parser.support
 2 | 
 3 | import com.young.crawler.entity.{GenerateType, HttpPage, HttpResult, UrlInfo}
 4 | import com.young.crawler.spider.parser.Parser
 5 | import org.jsoup.Jsoup
 6 | import org.jsoup.select.Elements
 7 | 
 8 | import scala.collection.mutable.ListBuffer
 9 | 
10 | /**
11 |  * Created by young.yang on 2016/8/31.
12 |  * Parser built on Jsoup.
13 |  */
14 | private[crawler] class JsoupParser extends Parser {
15 | 
16 |   private val KEYWORDS = "keywords"
17 | 
18 |   private val DESCRIPTION = "description"
19 | 
20 |   /**
21 |    * Looks up a meta tag by name.
22 |    * @param key  meta name to look for; compared against the lower-cased name attribute
23 |    * @param meta the meta elements of the document
24 |    * @return the content attribute of the first matching element, or "" when none matches
25 |    */
26 |   private def getMeta(key: String, meta: Elements): String =
27 |     (0 until meta.size()).iterator
28 |       .map(index => meta.get(index))
29 |       .find(tag => key.equals(tag.attr("name").toLowerCase))
30 |       .map(tag => tag.attr("content"))
31 |       .getOrElse("")
32 | 
33 |   /**
34 |    * Collects the child urls of a page.
35 |    * Only href values starting with "http" are kept; every child gets depth deep + 1.
36 |    * @return the child urls together with their depth
37 |    */
38 |   private def parserUrls(urls: Elements, deep: Int): (List[UrlInfo], Int) = {
39 |     val childDeep = deep + 1
40 |     val children = (0 until urls.size()).toList
41 |       .map(index => urls.get(index).attr("href"))
42 |       .filter(href => href.startsWith("http"))
43 |       .map(href => UrlInfo(href, "", GenerateType, childDeep))
44 |     (children, childDeep)
45 |   }
46 | 
47 |   /**
48 |    * Parses a fetched page into an HttpPage.
49 |    * Publish and update time are both set to the current time.
50 |    * @param html the fetched page
51 |    * @return the parsed page with title, text, url, keywords, description and child links
52 |    */
53 |   override def parse(html: HttpResult): HttpPage = {
54 |     val document = Jsoup.parse(html.content)
55 |     val metaTags = document.select("meta")
56 |     val parsed = new HttpPage
57 |     parsed.setTitle(document.title())
58 |     parsed.setContent(document.text())
59 |     // parsed.setHtml(html.content)
60 |     parsed.setPublishTime(System.currentTimeMillis())
61 |     parsed.setUpdateTime(System.currentTimeMillis())
62 |     parsed.setUrl(html.url)
63 |     parsed.setKeywords(getMeta(KEYWORDS, metaTags))
64 |     parsed.setDesc(getMeta(DESCRIPTION, metaTags))
65 |     val anchors = document.body().select("a")
66 |     parsed.setChildLink(parserUrls(anchors, html.deep))
67 |     parsed
68 |   }
69 | }
70 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/InjectActorTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task.support.actor
 2 | 
 3 | import akka.actor.{ActorRef, Actor, Props}
 4 | import akka.event.Logging
 5 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 6 | import com.young.crawler.entity._
 7 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
 8 | import com.young.crawler.spider.indexer.support.ElasticIndexer
 9 | import com.young.crawler.spider.parser.support.{JsoupParser, HtmlParseParser}
10 | import com.young.crawler.spider.task.InjectTask
11 | 
12 | import scala.io.Source
13 | 
14 | /**
15 |  * Created by dell on 2016/8/29.
16 |  * Seed injection task: urls that need to be crawled are injected through this task.
17 |  */
18 | private[crawler] class InjectActorTask(fetcher: ActorRef) extends Actor with InjectTask {
19 |   private val log = Logging(context.system, this)
20 | 
21 |   private val countActor = context.system.actorSelection("akka://" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName) + "/user/" + CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_task_count_name))
22 | 
23 |   private val classpathPrefix = "classpath:"
24 | 
25 |   override def receive: Receive = {
26 |     // initial injection from the seed file
27 |     case init: InitSeed =>
28 |       val seeds = initSeeds(init.seedPath, init.fileEncode)
29 |       log.info("init seeds -" + seeds)
30 |       seeds.foreach { seed =>
31 |         fetcher ! UrlInfo(seed.url, null, SeedType, 0)
32 |         countActor ! InjectCounter(1)
33 |       }
34 |     // injection of child urls; the element type is erased at runtime, so it is checked per element
35 |     case urls: List[_] =>
36 |       log.info("inject urls -" + urls)
37 |       urls.foreach {
38 |         case info: UrlInfo if info.url.startsWith("http") =>
39 |           fetcher ! info
40 |           countActor ! InjectCounter(1)
41 |         case _ => // not a crawlable url, skipped
42 |       }
43 |   }
44 | 
45 |   /**
46 |    * Loads the seeds, one per non-blank line.
47 |    *
48 |    * @param seedPath   either "classpath:<resource>" or a file system path
49 |    * @param fileEncode encoding used to read the seed source
50 |    * @return the seeds; empty when seedPath is null or blank
51 |    * @throws java.io.FileNotFoundException when the classpath resource or the file does not exist
52 |    */
53 |   override def initSeeds(seedPath: String, fileEncode: String = "utf-8"): List[Seed] = {
54 |     log.info("seedpath = ["+seedPath+"] encoding = ["+fileEncode+"]")
55 |     if (seedPath == null || seedPath.trim.isEmpty) {
56 |       // Nothing to read; previously this case failed with a NullPointerException / index error.
57 |       log.warning("seed path is empty, no seeds loaded")
58 |       Nil
59 |     } else if (seedPath.startsWith(classpathPrefix)) {
60 |       val resource = seedPath.substring(classpathPrefix.length)
61 |       log.info("classpath seedpath = ["+resource+"]")
62 |       val stream = classOf[InjectTask].getResourceAsStream(resource)
63 |       if (stream == null) {
64 |         throw new java.io.FileNotFoundException("classpath seed file not found -[" + resource + "]")
65 |       }
66 |       readSeeds(Source.fromInputStream(stream, fileEncode))
67 |     } else {
68 |       readSeeds(Source.fromFile(seedPath, fileEncode))
69 |     }
70 |   }
71 | 
72 |   /** Reads every non-blank line as a seed and always closes the source afterwards. */
73 |   private def readSeeds(source: Source): List[Seed] =
74 |     try source.getLines().map(_.trim).filter(_.nonEmpty).map(line => Seed(line)).toList
75 |     finally source.close()
76 | }
77 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/task/support/actor/CounterActorTask.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.task.support.actor
 2 | 
 3 | import akka.actor.Actor
 4 | import com.young.crawler.entity._
 5 | import com.young.crawler.spider.task.CounterTask
 6 | 
 7 | /**
 8 |  * Created by young.yang on 2016/9/3.
 9 |  * Keeps running totals for the crawler tasks.
10 |  */
11 | private[crawler] class CounterActorTask extends Actor with CounterTask {
12 | 
13 |   private var fetchCounter = FetchCounter(0)
14 |   private var fetchOk = FetchOk(0)
15 |   private var fetchError = FetchError(0)
16 |   private var injectCounter = InjectCounter(0)
17 |   private var parseCounter = ParseCounter(0)
18 |   private var parseChildUrlCounter = ParseChildUrlCounter(0)
19 |   private var indexCounter = IndexCounter(0)
20 | 
21 |   /** Renders every counter as a multi-line report. */
22 |   private def printCounter(): String =
23 |     Seq(
24 |       "task counter details start ------",
25 |       "fetchCounter = [" + fetchCounter.num + "]",
26 |       "fetchOk = [" + fetchOk.num + "]",
27 |       "fetchError = [" + fetchError.num + "]",
28 |       "injectCounter = [" + injectCounter.num + "]",
29 |       "parseCounter = [" + parseCounter.num + "]",
30 |       "parseChildUrlCounter = [" + parseChildUrlCounter.num + "]",
31 |       "indexCounter = [" + indexCounter.num + "]",
32 |       "task counter details end -------"
33 |     ).mkString("\n")
34 | 
35 |   /** Bundles the current totals into one AllCounter. */
36 |   private def getAllCounter(): AllCounter =
37 |     AllCounter(fetchCounter, fetchOk, fetchError, injectCounter, parseCounter, parseChildUrlCounter, indexCounter)
38 | 
39 |   // Counter messages are added to the matching total; the two query messages reply to the sender.
40 |   override def receive: Receive = {
41 |     case delta: FetchCounter => fetchCounter = FetchCounter(fetchCounter.num + delta.num)
42 |     case delta: FetchOk => fetchOk = FetchOk(delta.num + fetchOk.num)
43 |     case delta: FetchError => fetchError = FetchError(delta.num + fetchError.num)
44 |     case delta: InjectCounter => injectCounter = InjectCounter(delta.num + injectCounter.num)
45 |     case delta: ParseCounter => parseCounter = ParseCounter(delta.num + parseCounter.num)
46 |     case delta: ParseChildUrlCounter => parseChildUrlCounter = ParseChildUrlCounter(delta.num + parseChildUrlCounter.num)
47 |     case delta: IndexCounter => indexCounter = IndexCounter(delta.num + indexCounter.num)
48 |     case PrintCounter => sender() ! printCounter()
49 |     case GetAllCounter => sender() ! getAllCounter
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/spider/fetcher/support/HttpWatch.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.spider.fetcher.support
 2 | 
 3 | import com.young.crawler.config.{CrawlerConfigContants, CrawlerConfig}
 4 | import com.young.crawler.entity.{UrlInfo, HttpResult}
 5 | import org.apache.commons.io.IOUtils
 6 | import org.apache.http.annotation.NotThreadSafe
 7 | import org.apache.http.client.config.RequestConfig
 8 | import org.apache.http.client.methods.{HttpHead, HttpGet, HttpUriRequest}
 9 | import org.apache.http.impl.client.HttpClients
10 | import org.apache.http.Header
11 | 
12 | /**
13 |  * Created by young.yang on 2016/8/28.
14 |  * Thin wrapper around an Apache HttpClient instance used for HEAD and GET requests.
15 |  *
16 |  * @param userAgent User-Agent header sent with every request
17 |  * @param timeout   socket and connect timeout handed to the request config
18 |  * @param poolSize  maximum connections in total and per route
19 |  */
20 | class HttpWatch(userAgent: String = "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2", timeout: Int = 10000, poolSize: Int = 100) {
21 | 
22 |   private val defaultRequestConfig = RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build()
23 | 
24 |   private val httpClient = HttpClients.custom().setUserAgent(userAgent).setMaxConnTotal(poolSize)
25 |     .setMaxConnPerRoute(poolSize).setDefaultRequestConfig(defaultRequestConfig).build();
26 | 
27 |   /** GETs the url and wraps status, body and reason phrase together with the url and its depth. */
28 |   private def doGet(url: UrlInfo, encode: String = "utf-8"): HttpResult = {
29 |     val get = new HttpGet(url.url)
30 |     val result = sendRequest(get, encode)
31 |     HttpResult(result._1, result._2, result._3, url.url, url.deep)
32 |   }
33 | 
34 |   /** Sends a HEAD request and returns all response headers. */
35 |   private def doHeader(url: String): Array[Header] = {
36 |     val response = httpClient.execute(new HttpHead(url))
37 |     // Close the response so the connection goes back to the pool.
38 |     try response.getAllHeaders
39 |     finally response.close()
40 |   }
41 | 
42 |   /**
43 |    * Executes the request and reads the whole body with the given encoding.
44 |    * @return (status code, body, reason phrase); the body is "" when the response has no entity
45 |    */
46 |   private def sendRequest(request: HttpUriRequest, encode: String): (Int, String, String) = {
47 |     val response = httpClient.execute(request)
48 |     try {
49 |       val statusCode = response.getStatusLine.getStatusCode
50 |       val message = response.getStatusLine.getReasonPhrase
51 |       val entity = response.getEntity
52 |       val stream = if (entity == null) null else entity.getContent
53 |       val content =
54 |         if (stream == null) ""
55 |         else {
56 |           try IOUtils.toString(stream, encode)
57 |           finally stream.close()
58 |         }
59 |       (statusCode, content, message)
60 |     } finally {
61 |       response.close()
62 |     }
63 |   }
64 | 
65 |   /** Closes the underlying client; called by the companion for per-call instances. */
66 |   private def close(): Unit = httpClient.close()
67 | }
68 | @NotThreadSafe
69 | object HttpWatch {
70 |   val WATCH_TYPE_PROTOTYPE = "prototype"
71 |   val WATCH_TYPE_SINGLETON = "singleton"
72 |   // Selects between a fresh HttpWatch per call (prototype) and one shared instance.
73 |   var WATCH_TYPE = WATCH_TYPE_PROTOTYPE
74 |   // Shared instance for the non-prototype mode; only built when first needed.
75 |   private lazy val httpWatch = newHttpWatch()
76 | 
77 |   def get(url: UrlInfo, encode: String = "utf-8"): HttpResult = withHttpWatch(watch => watch.doGet(url, encode))
78 | 
79 |   def header(url: String): Array[Header] = withHttpWatch(watch => watch.doHeader(url))
80 | 
81 |   /** Builds an HttpWatch from the configured user agent and timeout. */
82 |   private def newHttpWatch(): HttpWatch =
83 |     new HttpWatch(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_useragent), CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_fetcher_timeout).toInt)
84 | 
85 |   /**
86 |    * Runs the action against the instance selected by WATCH_TYPE.
87 |    * A per-call (prototype) instance is closed afterwards so its client is not leaked.
88 |    */
89 |   private def withHttpWatch[T](action: HttpWatch => T): T =
90 |     if (WATCH_TYPE_PROTOTYPE.equals(WATCH_TYPE)) {
91 |       val watch = newHttpWatch()
92 |       try action(watch)
93 |       finally watch.close()
94 |     } else {
95 |       action(httpWatch)
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/young-crawler-core/src/main/scala/com/young/crawler/boot/CrawlerBoot.scala:
--------------------------------------------------------------------------------
 1 | package com.young.crawler.boot
 2 | 
 3 | import akka.actor.{ActorSystem, Props}
 4 | import akka.pattern.ask
 5 | import akka.routing.RoundRobinPool
 6 | import com.young.crawler.config.{CrawlerConfig, CrawlerConfigContants}
 7 | import com.young.crawler.entity.{AllCounter, GetAllCounter, InitSeed, PrintCounter}
 8 | import com.young.crawler.spider.fetcher.support.HttpClientFetcher
 9 | import com.young.crawler.spider.indexer.support.ElasticIndexer
10 | import com.young.crawler.spider.parser.support.JsoupParser
11 | import com.young.crawler.spider.task.support.actor._
12 | import org.apache.commons.logging.LogFactory
13 | 
14 | import scala.concurrent.Await
15 | import scala.concurrent.duration.Duration
16 | 
17 | /**
18 |  * Created by dell on 2016/8/29.
19 |  * Entry point of the crawler.
20 |  */
21 | object CrawlerBoot {
22 | 
23 |   private val system = ActorSystem(CrawlerConfig.getConfig.getString(CrawlerConfigContants.young_crawler_appName))
24 | 
25 |   private val log = LogFactory.getLog(CrawlerConfigContants.young_crawler_appName)
26 | 
27 |   private val timeout = Duration(5, "s")
28 | 
29 |   /** Reads one string setting from the crawler configuration. */
30 |   private def setting(key: String): String = CrawlerConfig.getConfig.getString(key)
31 | 
32 |   /** Selection of the counter actor, addressed as akka://<appName>/user/<counter name>. */
33 |   private def counterSelection =
34 |     system.actorSelection("akka://" + setting(CrawlerConfigContants.young_crawler_appName) + "/user/" + setting(CrawlerConfigContants.young_crawler_task_count_name))
35 | 
36 |   /** Asks the counter actor and blocks for the reply, both bounded by the timeout. */
37 |   private def askCounter(message: Any): Any =
38 |     Await.result(ask(counterSelection, message)(timeout), timeout)
39 | 
40 |   /**
41 |    * Starts the crawler: creates the task actors and sends the seeds to the inject actor.
42 |    */
43 |   def start(): Unit = {
44 |     val initSeeds = InitSeed(setting(CrawlerConfigContants.young_crawler_task_seed_path))
45 |     // every role can be served by a group of actors of the configured size
46 |     val parallel = setting(CrawlerConfigContants.young_crawler_task_parallel_int).toInt
47 |     def pooled(props: Props): Props = RoundRobinPool(parallel).props(props)
48 | 
49 |     val indexerActor = system.actorOf(pooled(Props(new IndexActorTask(new ElasticIndexer))), setting(CrawlerConfigContants.young_crawler_task_index_name))
50 |     log.info("create indexerActor name -[" + indexerActor + "]")
51 |     val parserActor = system.actorOf(pooled(Props(new ParseActorTask(new JsoupParser, indexerActor))), setting(CrawlerConfigContants.young_crawler_task_parse_name))
52 |     log.info("create parserActor name -[" + parserActor + "]")
53 |     val fetcher = system.actorOf(pooled(Props(new FetchActorTask(new HttpClientFetcher, parserActor))), setting(CrawlerConfigContants.young_crawler_task_fetch_name))
54 |     log.info("create fetcherActor name -[" + fetcher + "]")
55 |     val injectActor = system.actorOf(pooled(Props(new InjectActorTask(fetcher))), setting(CrawlerConfigContants.young_crawler_task_inject_name))
56 |     log.info("create injectActor name -[" + injectActor + "]")
57 |     val countActor = system.actorOf(Props[CounterActorTask], setting(CrawlerConfigContants.young_crawler_task_count_name))
58 |     log.info("create countActor name -[" + countActor + "]")
59 |     injectActor ! initSeeds
60 |   }
61 | 
62 |   /**
63 |    * Stops the crawler by terminating the actor system.
64 |    */
65 |   def stop(): Unit = {
66 |     system.terminate()
67 |   }
68 | 
69 |   /** Returns the counter report rendered by the counter actor. */
70 |   def printCount(): String = askCounter(PrintCounter).asInstanceOf[String]
71 | 
72 |   /** Returns the current counters as one AllCounter. */
73 |   def getCounter(): AllCounter = askCounter(GetAllCounter).asInstanceOf[AllCounter]
74 | 
75 |   def main(args: Array[String]): Unit = {
76 |     CrawlerBoot.start()
77 |     //   Thread.sleep(3000)
78 |     //    println(CrawlerBoot.printCount())
79 |     //    println(CrawlerBoot.getCounter())
80 |     // CrawlerBoot.stop()
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/bin/activator.bat:
--------------------------------------------------------------------------------
  1 | @REM activator launcher script
  2 | @REM
  3 | @REM Environment:
  4 | @REM In order for Activator to work you must have Java available on the classpath
  5 | @REM JAVA_HOME - location of a JDK home dir (optional if java on path)
  6 | @REM CFG_OPTS  - JVM options (optional)
  7 | @REM Configuration:
  8 | @REM activatorconfig.txt found in the ACTIVATOR_HOME or ACTIVATOR_HOME/ACTIVATOR_VERSION
  9 | @setlocal enabledelayedexpansion
 10 | 
 11 | @echo off
 12 | 
 13 | set "var1=%~1"
 14 | if defined var1 (
 15 |   if "%var1%"=="help" (
 16 |     echo.
 17 |     echo Usage activator [options] [command]
 18 |     echo.
 19 |     echo Commands:
 20 |     echo ui                 Start the Activator UI
 21 |     echo new [name] [template-id]  Create a new project with [name] using template [template-id]
 22 |     echo list-templates     Print all available template names
 23 |     echo help               Print this message
 24 |     echo.
 25 |     echo Options:
 26 |     echo -jvm-debug [port]  Turn on JVM debugging, open at the given port.  Defaults to 9999 if no port given.
 27 |     echo.
 28 |     echo Environment variables ^(read from context^):
 29 |     echo JAVA_OPTS          Environment variable, if unset uses ""
 30 |     echo SBT_OPTS           Environment variable, if unset uses ""
 31 |     echo ACTIVATOR_OPTS     Environment variable, if unset uses ""
 32 |     echo.
 33 |     echo Please note that in order for Activator to work you must have Java available on the classpath
 34 |     echo.
 35 |     goto :end
 36 |   )
 37 | )
 38 | 
 39 | @REM determine ACTIVATOR_HOME environment variable
 40 | set BIN_DIRECTORY=%~dp0
 41 | set BIN_DIRECTORY=%BIN_DIRECTORY:~0,-1%
 42 | for %%d in (%BIN_DIRECTORY%) do set ACTIVATOR_HOME=%%~dpd
 43 | set ACTIVATOR_HOME=%ACTIVATOR_HOME:~0,-1%
 44 | 
 45 | echo ACTIVATOR_HOME=%ACTIVATOR_HOME%
 46 | 
 47 | set ERROR_CODE=0
 48 | set APP_VERSION=1.3.10
 49 | set ACTIVATOR_LAUNCH_JAR=activator-launch-%APP_VERSION%.jar
 50 | 
 51 | rem Detect if we were double clicked, although theoretically A user could
 52 | rem manually run cmd /c
 53 | for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1
 54 | 
 55 | set SBT_HOME=%BIN_DIRECTORY%
 56 | 
 57 | rem Detect if we were double clicked, although theoretically A user could
 58 | rem manually run cmd /c
 59 | for %%x in (%cmdcmdline%) do if %%~x==/c set DOUBLECLICKED=1
 60 | 
 61 | rem FIRST we load the config file of extra options.
 62 | set FN=%SBT_HOME%\..\conf\sbtconfig.txt
 63 | set CFG_OPTS=
 64 | FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO (
 65 |   set DO_NOT_REUSE_ME=%%i
 66 |   rem ZOMG (Part #2) WE use !! here to delay the expansion of
 67 |   rem CFG_OPTS, otherwise it remains "" for this loop.
 68 |   set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
 69 | )
 70 | 
 71 | rem FIRST we load a config file of extra options (if there is one)
 72 | set "CFG_FILE_HOME=%UserProfile%\.activator\activatorconfig.txt"
 73 | set "CFG_FILE_VERSION=%UserProfile%\.activator\%APP_VERSION%\activatorconfig.txt"
 74 | if exist %CFG_FILE_VERSION% (
 75 |   FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_VERSION%") DO (
 76 |     set DO_NOT_REUSE_ME=%%i
 77 |     rem ZOMG (Part #2) WE use !! here to delay the expansion of
 78 |     rem CFG_OPTS, otherwise it remains "" for this loop.
 79 |     set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
 80 |   )
 81 | )
 82 | if "%CFG_OPTS%"=="" (
 83 |   if exist %CFG_FILE_HOME% (
 84 |     FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%CFG_FILE_HOME%") DO (
 85 |       set DO_NOT_REUSE_ME=%%i
 86 |       rem ZOMG (Part #2) WE use !! here to delay the expansion of
 87 |       rem CFG_OPTS, otherwise it remains "" for this loop.
 88 |       set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME!
 89 |     )
 90 |   )
 91 | )
 92 | 
 93 | rem We use the value of the JAVACMD environment variable if defined
 94 | set _JAVACMD=%JAVACMD%
 95 | 
 96 | if "%_JAVACMD%"=="" (
 97 |   if not "%JAVA_HOME%"=="" (
 98 |     if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe"
 99 | 
100 |     rem if there is a java home set we make sure it is the first picked up when invoking 'java'
101 |     SET "PATH=%JAVA_HOME%\bin;%PATH%"
102 |   )
103 | )
104 | 
105 | if "%_JAVACMD%"=="" set _JAVACMD=java
106 | 
107 | rem Detect if this java is ok to use.
108 | for /F %%j in ('"%_JAVACMD%" -version  2^>^&1') do (
109 |   if %%~j==java set JAVAINSTALLED=1
110 |   if %%~j==openjdk set JAVAINSTALLED=1
111 | )
112 | 
113 | rem Detect the same thing about javac
114 | if "%_JAVACCMD%"=="" (
115 |   if not "%JAVA_HOME%"=="" (
116 |     if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe"
117 |   )
118 | )
119 | if "%_JAVACCMD%"=="" set _JAVACCMD=javac
120 | for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do (
121 |   if %%~j==javac set JAVACINSTALLED=1
122 | )
123 | 
124 | rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style
125 | set JAVAOK=true
126 | if not defined JAVAINSTALLED set JAVAOK=false
127 | if not defined JAVACINSTALLED set JAVAOK=false
128 | 
129 | if "%JAVAOK%"=="false" (
130 |   echo.
131 |   echo A Java JDK is not installed or can't be found.
132 |   if not "%JAVA_HOME%"=="" (
133 |     echo JAVA_HOME = "%JAVA_HOME%"
134 |   )
135 |   echo.
136 |   echo Please go to
137 |   echo   http://www.oracle.com/technetwork/java/javase/downloads/index.html
138 |   echo and download a valid Java JDK and install before running Activator.
139 |   echo.
140 |   echo If you think this message is in error, please check
141 |   echo your environment variables to see if "java.exe" and "javac.exe" are
142 |   echo available via JAVA_HOME or PATH.
143 |   echo.
144 |   if defined DOUBLECLICKED pause
145 |   exit /B 1
146 | )
147 | 
148 | rem Check what Java version is being used to determine what memory options to use
149 | for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do (
150 |     set JAVA_VERSION=%%g
151 | )
152 | 
153 | rem Strips away the " characters
154 | set JAVA_VERSION=%JAVA_VERSION:"=%
155 | 
156 | rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below
157 | for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do (
158 |     set MAJOR=%%v
159 |     set MINOR=%%w
160 |     set BUILD=%%x
161 | 
162 |     rem PermGen flags only apply to legacy 1.x releases before 1.8. Java 9 and later report MAJOR as 9 or higher
163 |     set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M
164 |     if "!MAJOR!"=="1" if "!MINOR!" LSS "8" (
165 |       set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M
166 |     )
167 |     set MEM_OPTS=!META_SIZE!
168 |  )
169 | 
170 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config.
171 | set _JAVA_OPTS=%JAVA_OPTS%
172 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS%
173 | 
174 | set DEBUG_OPTS=
175 | 
176 | rem Loop through the arguments, building remaining args in args variable
177 | set args=
178 | :argsloop
179 | if not "%~1"=="" (
180 |   rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them.
181 |   rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack.
182 |   set arg1=%~1
183 |   if "!arg1:~0,2!"=="-D" (
184 |      set "args=%args% "%~1"="%~2""
185 |     shift
186 |     shift
187 |     goto argsloop
188 |   )
189 | 
190 |   if "%~1"=="-jvm-debug" (
191 |     if not "%~2"=="" (
192 |       rem This piece of magic somehow checks that an argument is a number
193 |       for /F "delims=0123456789" %%i in ("%~2") do (
194 |         set var="%%i"
195 |       )
196 |       if defined var (
197 |         rem Not a number, assume no argument given and default to 9999
198 |         set JPDA_PORT=9999
199 |       ) else (
200 |         rem Port was given, shift arguments
201 |         set JPDA_PORT=%~2
202 |         shift
203 |       )
204 |     ) else (
205 |       set JPDA_PORT=9999
206 |     )
207 |     shift
208 | 
209 |     set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT!
210 |     goto argsloop
211 |   )
212 |   rem else
213 |   set "args=%args% "%~1""
214 |   shift
215 |   goto argsloop
216 | )
217 | 
218 | :run
219 | 
220 | if "!args!"=="" (
221 |   if defined DOUBLECLICKED (
222 |     set CMDS="ui"
223 |   ) else set CMDS=!args!
224 | ) else set CMDS=!args!
225 | 
226 | rem We add a / in front, so we get file:///C: instead of file://C:
227 | rem Java considers the later a UNC path.
228 | rem We also attempt a solid effort at making it URI friendly.
229 | rem We don't even bother with UNC paths.
230 | set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/!
231 | set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20!
232 | 
233 | rem Checks if the command contains spaces to know if it should be wrapped in quotes or not
234 | set NON_SPACED_CMD=%_JAVACMD: =%
235 | if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%
236 | if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS%
237 | 
238 | if ERRORLEVEL 1 goto error
239 | goto end
240 | 
241 | :error
242 | set ERROR_CODE=1
243 | 
244 | :end
245 | 
246 | @endlocal
247 | 
248 | exit /B %ERROR_CODE%
249 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/conf/application.conf:
--------------------------------------------------------------------------------
  1 | # This is the main configuration file for the application.
  2 | # https://www.playframework.com/documentation/latest/ConfigFile
  3 | # ~~~~~
  4 | # Play uses HOCON as its configuration file format.  HOCON has a number
  5 | # of advantages over other config formats, but there are two things that
  6 | # can be used when modifying settings.
  7 | #
  8 | # You can include other configuration files in this main application.conf file:
  9 | #include "extra-config.conf"
 10 | #
 11 | # You can declare variables and substitute for them:
 12 | #mykey = ${some.value}
 13 | #
 14 | # And if an environment variable exists when there is no other substitution, then
 15 | # HOCON will fall back to substituting the environment variable:
 16 | #mykey = ${JAVA_HOME}
 17 | 
 18 | ## Akka
 19 | # https://www.playframework.com/documentation/latest/ScalaAkka#Configuration
 20 | # https://www.playframework.com/documentation/latest/JavaAkka#Configuration
 21 | # ~~~~~
 22 | # Play uses Akka internally and exposes Akka Streams and actors in Websockets and
 23 | # other streaming HTTP responses.
 24 | akka {
 25 |   # "akka.log-config-on-start" is extraordinarily useful because it logs the complete
 26 |   # configuration at INFO level, including defaults and overrides, so it's worth
 27 |   # putting at the very top.
 28 |   #
 29 |   # Put the following in your conf/logback.xml file:
 30 |   #
 31 |   # <logger name="akka.actor" level="INFO" />
 32 |   #
 33 |   # And then uncomment this line to debug the configuration.
 34 |   #
 35 |   #log-config-on-start = true
 36 | }
 37 | 
 38 | ## Secret key
 39 | # http://www.playframework.com/documentation/latest/ApplicationSecret
 40 | # ~~~~~
 41 | # The secret key is used to sign Play's session cookie.
 42 | # This must be changed for production, but we don't recommend you change it in this file.
 43 | play.crypto.secret = "changeme"
 44 | 
 45 | ## Modules
 46 | # https://www.playframework.com/documentation/latest/Modules
 47 | # ~~~~~
 48 | # Control which modules are loaded when Play starts.  Note that modules are
 49 | # the replacement for "GlobalSettings", which are deprecated in 2.5.x.
 50 | # Please see https://www.playframework.com/documentation/latest/GlobalSettings
 51 | # for more information.
 52 | #
 53 | # You can also extend Play functionality by using one of the publicly available
 54 | # Play modules: https://playframework.com/documentation/latest/ModuleDirectory
 55 | play.modules {
 56 |   # By default, Play will load any class called Module that is defined
 57 |   # in the root package (the "app" directory), or you can define them
 58 |   # explicitly below.
 59 |   # If there are any additional modules that you want to enable, you can list them here.
 60 |   #enabled += my.application.Module
 61 | 
 62 |   # If there are any built-in modules that you want to disable, you can list them here.
 63 |   #disabled += ""
 64 | }
 65 | 
 66 | ## IDE
 67 | # https://www.playframework.com/documentation/latest/IDE
 68 | # ~~~~~
 69 | # Depending on your IDE, you can add a hyperlink for errors that will jump you
 70 | # directly to the code location in the IDE in dev mode. The following line makes 
 71 | # use of the IntelliJ IDEA REST interface: 
 72 | #play.editor="http://localhost:63342/api/file/?file=%s&line=%s"
 73 | 
 74 | ## Internationalisation
 75 | # https://www.playframework.com/documentation/latest/JavaI18N
 76 | # https://www.playframework.com/documentation/latest/ScalaI18N
 77 | # ~~~~~
 78 | # Play comes with its own i18n settings, which allow the user's preferred language
 79 | # to map through to internal messages, or allow the language to be stored in a cookie.
 80 | play.i18n {
 81 |   # The application languages
 82 |   langs = [ "en" ]
 83 | 
 84 |   # Whether the language cookie should be secure or not
 85 |   #langCookieSecure = true
 86 | 
 87 |   # Whether the HTTP only attribute of the cookie should be set to true
 88 |   #langCookieHttpOnly = true
 89 | }
 90 | 
 91 | ## Play HTTP settings
 92 | # ~~~~~
 93 | play.http {
 94 |   ## Router
 95 |   # https://www.playframework.com/documentation/latest/JavaRouting
 96 |   # https://www.playframework.com/documentation/latest/ScalaRouting
 97 |   # ~~~~~
 98 |   # Define the Router object to use for this application.
 99 |   # This router will be looked up first when the application is starting up,
100 |   # so make sure this is the entry point.
101 |   # Furthermore, it's assumed your route file is named properly.
102 |   # So for an application router like `my.application.Router`,
103 |   # you may need to define a router file `conf/my.application.routes`.
104 |   # Default to Routes in the root package (aka "apps" folder) (and conf/routes)
105 |   #router = my.application.Router
106 | 
107 |   ## Action Creator
108 |   # https://www.playframework.com/documentation/latest/JavaActionCreator
109 |   # ~~~~~
110 |   #actionCreator = null
111 | 
112 |   ## ErrorHandler
113 |   # https://www.playframework.com/documentation/latest/JavaRouting
114 |   # https://www.playframework.com/documentation/latest/ScalaRouting
115 |   # ~~~~~
116 |   # If null, will attempt to load a class called ErrorHandler in the root package,
117 |   #errorHandler = null
118 | 
119 |   ## Filters
120 |   # https://www.playframework.com/documentation/latest/ScalaHttpFilters
121 |   # https://www.playframework.com/documentation/latest/JavaHttpFilters
122 |   # ~~~~~
123 |   # Filters run code on every request. They can be used to perform
124 |   # common logic for all your actions, e.g. adding common headers.
125 |   # Defaults to "Filters" in the root package (aka "apps" folder)
126 |   # Alternatively you can explicitly register a class here.
127 |   #filters = my.application.Filters
128 | 
129 |   ## Session & Flash
130 |   # https://www.playframework.com/documentation/latest/JavaSessionFlash
131 |   # https://www.playframework.com/documentation/latest/ScalaSessionFlash
132 |   # ~~~~~
133 |   session {
134 |     # Sets the cookie to be sent only over HTTPS.
135 |     #secure = true
136 | 
137 |     # Sets the cookie to be accessed only by the server.
138 |     #httpOnly = true
139 | 
140 |     # Sets the max-age field of the cookie to 5 minutes.
141 |     # NOTE: this only sets when the browser will discard the cookie. Play will consider any
142 |     # cookie value with a valid signature to be a valid session forever. To implement a server side session timeout,
143 |     # you need to put a timestamp in the session and check it at regular intervals to possibly expire it.
144 |     #maxAge = 300
145 | 
146 |     # Sets the domain on the session cookie.
147 |     #domain = "example.com"
148 |   }
149 | 
150 |   flash {
151 |     # Sets the cookie to be sent only over HTTPS.
152 |     #secure = true
153 | 
154 |     # Sets the cookie to be accessed only by the server.
155 |     #httpOnly = true
156 |   }
157 | }
158 | 
159 | ## Netty Provider
160 | # https://www.playframework.com/documentation/latest/SettingsNetty
161 | # ~~~~~
162 | play.server.netty {
163 |   # Whether the Netty wire should be logged
164 |   #log.wire = true
165 | 
166 |   # If you run Play on Linux, you can use Netty's native socket transport
167 |   # for higher performance with less garbage.
168 |   #transport = "native"
169 | }
170 | 
171 | ## WS (HTTP Client)
172 | # https://www.playframework.com/documentation/latest/ScalaWS#Configuring-WS
173 | # ~~~~~
174 | # The HTTP client primarily used for REST APIs.  The default client can be
175 | # configured directly, but you can also create different client instances
176 | # with customized settings. You must enable this by adding to build.sbt:
177 | #
178 | # libraryDependencies += ws // or javaWs if using java
179 | #
180 | play.ws {
181 |   # Sets HTTP requests not to follow 302 requests
182 |   #followRedirects = false
183 | 
184 |   # Sets the maximum number of open HTTP connections for the client.
185 |   #ahc.maxConnectionsTotal = 50
186 | 
187 |   ## WS SSL
188 |   # https://www.playframework.com/documentation/latest/WsSSL
189 |   # ~~~~~
190 |   ssl {
191 |     # Configuring HTTPS with Play WS does not require programming.  You can
192 |     # set up both trustManager and keyManager for mutual authentication, and
193 |     # turn on JSSE debugging in development with a reload.
194 |     #debug.handshake = true
195 |     #trustManager = {
196 |     #  stores = [
197 |     #    { type = "JKS", path = "exampletrust.jks" }
198 |     #  ]
199 |     #}
200 |   }
201 | }
202 | 
203 | ## Cache
204 | # https://www.playframework.com/documentation/latest/JavaCache
205 | # https://www.playframework.com/documentation/latest/ScalaCache
206 | # ~~~~~
207 | # Play comes with an integrated cache API that can reduce the operational
208 | # overhead of repeated requests. You must enable this by adding to build.sbt:
209 | #
210 | # libraryDependencies += cache
211 | #
212 | play.cache {
213 |   # If you want to bind several caches, you can bind them individually
214 |   #bindCaches = ["db-cache", "user-cache", "session-cache"]
215 | }
216 | 
217 | ## Filters
218 | # https://www.playframework.com/documentation/latest/Filters
219 | # ~~~~~
220 | # There are a number of built-in filters that can be enabled and configured
221 | # to give Play greater security.  You must enable this by adding to build.sbt:
222 | #
223 | # libraryDependencies += filters
224 | #
225 | play.filters {
226 |   ## CORS filter configuration
227 |   # https://www.playframework.com/documentation/latest/CorsFilter
228 |   # ~~~~~
229 |   # CORS is a protocol that allows web applications to make requests from the browser
230 |   # across different domains.
231 |   # NOTE: You MUST apply the CORS configuration before the CSRF filter, as CSRF has
232 |   # dependencies on CORS settings.
233 |   cors {
234 |     # Filter paths by a whitelist of path prefixes
235 |     #pathPrefixes = ["/some/path", ...]
236 | 
237 |     # The allowed origins. If null, all origins are allowed.
238 |     #allowedOrigins = ["http://www.example.com"]
239 | 
240 |     # The allowed HTTP methods. If null, all methods are allowed
241 |     #allowedHttpMethods = ["GET", "POST"]
242 |   }
243 | 
244 |   ## CSRF Filter
245 |   # https://www.playframework.com/documentation/latest/ScalaCsrf#Applying-a-global-CSRF-filter
246 |   # https://www.playframework.com/documentation/latest/JavaCsrf#Applying-a-global-CSRF-filter
247 |   # ~~~~~
248 |   # Play supports multiple methods for verifying that a request is not a CSRF request.
249 |   # The primary mechanism is a CSRF token. This token gets placed either in the query string
250 |   # or body of every form submitted, and also gets placed in the users session.
251 |   # Play then verifies that both tokens are present and match.
252 |   csrf {
253 |     # Sets the cookie to be sent only over HTTPS
254 |     #cookie.secure = true
255 | 
256 |     # Defaults to CSRFErrorHandler in the root package.
257 |     #errorHandler = MyCSRFErrorHandler
258 |   }
259 | 
260 |   ## Security headers filter configuration
261 |   # https://www.playframework.com/documentation/latest/SecurityHeaders
262 |   # ~~~~~
263 |   # Defines security headers that prevent XSS attacks.
264 |   # If enabled, then all options are set to the below configuration by default:
265 |   headers {
266 |     # The X-Frame-Options header. If null, the header is not set.
267 |     #frameOptions = "DENY"
268 | 
269 |     # The X-XSS-Protection header. If null, the header is not set.
270 |     #xssProtection = "1; mode=block"
271 | 
272 |     # The X-Content-Type-Options header. If null, the header is not set.
273 |     #contentTypeOptions = "nosniff"
274 | 
275 |     # The X-Permitted-Cross-Domain-Policies header. If null, the header is not set.
276 |     #permittedCrossDomainPolicies = "master-only"
277 | 
278 |     # The Content-Security-Policy header. If null, the header is not set.
279 |     #contentSecurityPolicy = "default-src 'self'"
280 |   }
281 | 
282 |   ## Allowed hosts filter configuration
283 |   # https://www.playframework.com/documentation/latest/AllowedHostsFilter
284 |   # ~~~~~
285 |   # Play provides a filter that lets you configure which hosts can access your application.
286 |   # This is useful to prevent cache poisoning attacks.
287 |   hosts {
288 |     # Allow requests to example.com, its subdomains, and localhost:9000.
289 |     #allowed = [".example.com", "localhost:9000"]
290 |   }
291 | }
292 | 
293 | ## Evolutions
294 | # https://www.playframework.com/documentation/latest/Evolutions
295 | # ~~~~~
296 | # Evolutions allows database scripts to be automatically run on startup in dev mode
297 | # for database migrations. You must enable this by adding to build.sbt:
298 | #
299 | # libraryDependencies += evolutions
300 | #
301 | play.evolutions {
302 |   # You can disable evolutions for a specific datasource if necessary
303 |   #db.default.enabled = false
304 | }
305 | 
306 | ## Database Connection Pool
307 | # https://www.playframework.com/documentation/latest/SettingsJDBC
308 | # ~~~~~
309 | # Play doesn't require a JDBC database to run, but you can easily enable one.
310 | #
311 | # libraryDependencies += jdbc
312 | #
313 | play.db {
314 |   # The combination of these two settings results in "db.default" as the
315 |   # default JDBC pool:
316 |   #config = "db"
317 |   #default = "default"
318 | 
319 |   # Play uses HikariCP as the default connection pool.  You can override
320 |   # settings by changing the prototype:
321 |   prototype {
322 |     # Sets a fixed JDBC connection pool size of 50
323 |     #hikaricp.minimumIdle = 50
324 |     #hikaricp.maximumPoolSize = 50
325 |   }
326 | }
327 | 
328 | ## JDBC Datasource
329 | # https://www.playframework.com/documentation/latest/JavaDatabase
330 | # https://www.playframework.com/documentation/latest/ScalaDatabase
331 | # ~~~~~
332 | # Once JDBC datasource is set up, you can work with several different
333 | # database options:
334 | #
335 | # Slick (Scala preferred option): https://www.playframework.com/documentation/latest/PlaySlick
336 | # JPA (Java preferred option): https://playframework.com/documentation/latest/JavaJPA
337 | # EBean: https://playframework.com/documentation/latest/JavaEbean
338 | # Anorm: https://www.playframework.com/documentation/latest/ScalaAnorm
339 | #
340 | db {
341 |   # You can declare as many datasources as you want.
342 |   # By convention, the default datasource is named `default`
343 | 
344 |   # https://www.playframework.com/documentation/latest/Developing-with-the-H2-Database
345 |   #default.driver = org.h2.Driver
346 |   #default.url = "jdbc:h2:mem:play"
347 |   #default.username = sa
348 |   #default.password = ""
349 | 
350 |   # You can turn on SQL logging for any datasource
351 |   # https://www.playframework.com/documentation/latest/Highlights25#Logging-SQL-statements
352 |   #default.logSql=true
353 | }
354 | 


--------------------------------------------------------------------------------
/young-crawler-searcher/bin/activator:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | ###  ------------------------------- ###
  4 | ###  Helper methods for BASH scripts ###
  5 | ###  ------------------------------- ###
  6 | 
  7 | realpath () {
  8 | (
  9 |   TARGET_FILE="$1"
 10 |   FIX_CYGPATH="$2"
 11 | 
 12 |   cd "$(dirname "$TARGET_FILE")"
 13 |   TARGET_FILE=$(basename "$TARGET_FILE")
 14 | 
 15 |   COUNT=0
 16 |   while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
 17 |   do
 18 |       TARGET_FILE=$(readlink "$TARGET_FILE")
 19 |       cd "$(dirname "$TARGET_FILE")"
 20 |       TARGET_FILE=$(basename "$TARGET_FILE")
 21 |       COUNT=$(($COUNT + 1))
 22 |   done
 23 | 
 24 |   # make sure we grab the actual windows path, instead of cygwin's path.
 25 |   if [[ "x$FIX_CYGPATH" != "x" ]]; then
 26 |     echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")"
 27 |   else
 28 |     echo "$(pwd -P)/$TARGET_FILE"
 29 |   fi
 30 | )
 31 | }
 32 | 
 33 | 
 34 | # Uses uname to detect if we're in the odd cygwin environment.
 35 | is_cygwin() {
 36 |   local os=$(uname -s)
 37 |   case "$os" in
 38 |     CYGWIN*) return 0 ;;
 39 |     *)  return 1 ;;
 40 |   esac
 41 | }
 42 | 
 43 | # TODO - Use nicer bash-isms here.
 44 | CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi)
 45 | 
 46 | 
 47 | # Convert a path to its native Windows form when CYGWIN_FLAG is "true";
 48 | # otherwise print the path back unchanged.
 49 | #   $1 - path to convert
 50 | # The path is quoted everywhere so spaces and glob characters survive intact.
 51 | cygwinpath() {
 52 |   local file="$1"
 53 |   if [[ "$CYGWIN_FLAG" == "true" ]]; then
 54 |     cygpath -w "$file"
 55 |   else
 56 |     # printf instead of echo: never interprets the path as an echo option.
 57 |     printf '%s\n' "$file"
 58 |   fi
 59 | }
 57 | 
 58 | # Make a path URI friendly: every space becomes %20.
 59 | # Under Cygwin, backslashes are also turned into forward slashes and a
 60 | # leading "/" is prepended.
 61 | #   $1 - path to encode
 62 | make_url() {
 63 |   url="$1"
 64 |   local encoded=${url// /%20}
 65 |   if ! is_cygwin; then
 66 |     echo "$encoded"
 67 |   else
 68 |     echo "/${encoded//\\//}"
 69 |   fi
 70 | }
 68 | 
 69 | declare -a residual_args
 70 | declare -a java_args
 71 | declare -a scalac_args
 72 | declare -a sbt_commands
 73 | declare java_cmd=java
 74 | declare java_version
 75 | declare -r real_script_path="$(realpath "$0")"
 76 | declare -r sbt_home="$(realpath "$(dirname "$(dirname "$real_script_path")")")"
 77 | declare -r sbt_bin_dir="$(dirname "$real_script_path")"
 78 | declare -r app_version="1.3.10"
 79 | 
 80 | declare -r script_name=activator
 81 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" )
 82 | userhome="$HOME"
 83 | if is_cygwin; then
 84 |   # cygwin sets home to something f-d up, set to real windows homedir
 85 |   userhome="$USERPROFILE"
 86 | fi
 87 | declare -r activator_user_home_dir="${userhome}/.activator"
 88 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt"
 89 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt"
 90 | 
 91 | # Print all arguments on stderr.
 92 | echoerr () {
 93 |   echo "$@" >&2
 94 | }
 95 | # Print on stderr only when $verbose or $debug is non-empty.
 96 | vlog () {
 97 |   [[ -n "$verbose" || -n "$debug" ]] && echoerr "$@"
 98 | }
 99 | # Print on stderr only when $debug is non-empty.
100 | dlog () {
101 |   [[ -n "$debug" ]] && echoerr "$@"
102 | }
100 | 
101 | # Print the path of the launcher jar under ${sbt_home}/libexec, passed through cygwinpath.
102 | jar_file () {
103 |   local launcher="${sbt_home}/libexec/activator-launch-${app_version}.jar"
104 |   echo "$(cygwinpath "$launcher")"
105 | }
104 | 
105 | acquire_sbt_jar () {
106 |   sbt_jar="$(jar_file)"
107 | 
108 |   if [[ ! -f "$sbt_jar" ]]; then
109 |     echoerr "Could not find launcher jar: $sbt_jar"
110 |     exit 2
111 |   fi
112 | }
113 | 
114 | execRunner () {
115 |   # print the arguments one to a line, quoting any containing spaces
116 |   [[ $verbose || $debug ]] && echo "# Executing command line:" && {
117 |     for arg; do
118 |       if printf "%s\n" "$arg" | grep -q ' '; then
119 |         printf "\"%s\"\n" "$arg"
120 |       else
121 |         printf "%s\n" "$arg"
122 |       fi
123 |     done
124 |     echo ""
125 |   }
126 | 
127 |   # THis used to be exec, but we loose the ability to re-hook stty then
128 |   # for cygwin...  Maybe we should flag the feature here...
129 |   "$@"
130 | }
131 | 
132 | # Append one JVM argument to java_args.
133 | addJava () {
134 |   dlog "[addJava] arg = '$1'"
135 |   java_args+=( "$1" )
136 | }
137 | # Append one sbt command to sbt_commands.
138 | addSbt () {
139 |   dlog "[addSbt] arg = '$1'"
140 |   sbt_commands+=( "$1" )
141 | }
142 | # Append one unrecognised argument to residual_args.
143 | addResidual () {
144 |   dlog "[residual] arg = '$1'"
145 |   residual_args+=( "$1" )
146 | }
147 | # Add the JDWP agent option that opens a debug socket on port $1.
148 | addDebugger () {
149 |   addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1"
150 | }
147 | 
148 | # Print the default JVM memory flags, or an empty line when ${JAVA_OPTS}
149 | # already carries memory settings.
150 | #   $1 - heap size in megabytes (default: 1024)
151 | # Reads the globals JAVA_OPTS and java_version.
152 | get_mem_opts () {
153 |   # if we detect any of these settings in ${JAVA_OPTS} we need to NOT output our settings.
154 |   # The reason is the Xms/Xmx, if they don't line up, cause errors.
155 |   if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then
156 |      echo ""
157 |   else
158 |     # a ham-fisted attempt to move some memory settings in concert
159 |     # so they need not be messed around with individually.
160 |     local mem=${1:-1024}
161 |     local codecache=$(( $mem / 8 ))
162 |     # Clamp the code cache to the range 128..512 MB.
163 |     (( $codecache > 128 )) || codecache=128
164 |     (( $codecache < 512 )) || codecache=512
165 |     local class_metadata_size=$(( $codecache * 2 ))
166 | 
167 |     # Extract the major Java version as a number ("1.7.0_80" -> 7, "11.0.2" -> 11)
168 |     # and compare numerically; a plain string "<" depends on the locale's collation.
169 |     local major="${java_version#1.}"
170 |     major="${major%%[!0-9]*}"
171 |     local class_metadata_opt="MaxMetaspaceSize"
172 |     if [[ -n "$major" ]]; then
173 |       (( 10#$major < 8 )) && class_metadata_opt="MaxPermSize"
174 |     elif [[ "$java_version" < "1.8" ]]; then
175 |       # Version could not be parsed (e.g. not detected yet): keep the historical string test.
176 |       class_metadata_opt="MaxPermSize"
177 |     fi
178 | 
179 |     echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m -XX:${class_metadata_opt}=${class_metadata_size}m"
180 |   fi
181 | }
166 | 
167 | require_arg () {
168 |   local type="$1"
169 |   local opt="$2"
170 |   local arg="$3"
171 |   if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
172 |     echo "$opt requires <$type> argument"
173 |     exit 1
174 |   fi
175 | }
176 | 
177 | # Returns 0 when a shell function named $1 exists, non-zero otherwise; prints nothing.
178 | is_function_defined() {
179 |   declare -F "$1" > /dev/null
180 | }
180 | 
181 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter.
182 | # "No arguments" is decided by the number of elements in residual_args
183 | # (${#residual_args[@]}); ${#residual_args} would only be the length of the
184 | # first element and would misfire when that element is an empty string.
185 | detect_terminal_for_ui() {
186 |   [[ ! -t 0 ]] && [[ "${#residual_args[@]}" == "0" ]] && {
187 |     addResidual "ui"
188 |   }
189 |   # SPECIAL TEST FOR MAC
190 |   [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args[@]}" == "0" ]] && {
191 |     echo "Detected MAC OSX launched script...."
192 |     echo "Swapping to UI"
193 |     addResidual "ui"
194 |   }
195 | }
193 | 
194 | process_args () {
195 |   while [[ $# -gt 0 ]]; do
196 |     case "$1" in
197 |        -h|-help) usage; exit 1 ;;
198 |     -v|-verbose) verbose=1 && shift ;;
199 |       -d|-debug) debug=1 && shift ;;
200 | 
201 |            -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
202 |            -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
203 |      -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
204 |          -batch) exec </dev/null && shift ;;
205 | 
206 |        -sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;;
207 |    -sbt-version) require_arg version "$1" "$2" && sbt_version="$2" && shift 2 ;;
208 |      -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;;
209 | 
210 |             -D*) addJava "$1" && shift ;;
211 |             -J*) addJava "${1:2}" && shift ;;
212 |               *) addResidual "$1" && shift ;;
213 |     esac
214 |   done
215 | 
216 |   is_function_defined process_my_args && {
217 |     myargs=("${residual_args[@]}")
218 |     residual_args=()
219 |     process_my_args "${myargs[@]}"
220 |   }
221 | 
222 |   java_version=$("$java_cmd" -Xmx512M -version 2>&1 | awk -F '"' '/version/ {print $2}')
223 |   vlog "[process_args] java_version = '$java_version'"
224 | }
225 | 
226 | # Detect that we have java installed.
227 | checkJava() {
228 |   local required_version="$1"
229 |   # Now check to see if it's a good enough version
230 |   if [[ "$java_version" == "" ]]; then
231 |     echo
232 |     echo No java installations was detected.
233 |     echo Please go to http://www.java.com/getjava/ and download
234 |     echo
235 |     exit 1
236 |   elif [[ ! "$java_version" > "$required_version" ]]; then
237 |     echo
238 |     echo The java installation you have is not up to date
239 |     echo $script_name requires at least version $required_version+, you have
240 |     echo version $java_version
241 |     echo
242 |     echo Please go to http://www.java.com/getjava/ and download
243 |     echo a valid Java Runtime and install before running $script_name.
244 |     echo
245 |     exit 1
246 |   fi
247 | }
248 | 
249 | 
250 | run() {
251 |   # no jar? download it.
252 |   [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || {
253 |     # still no jar? uh-oh.
254 |     echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
255 |     exit 1
256 |   }
257 | 
258 |   # process the combined args, then reset "$@" to the residuals
259 |   process_args "$@"
260 |   detect_terminal_for_ui
261 |   set -- "${residual_args[@]}"
262 |   argumentCount=$#
263 | 
264 |   # TODO - java check should be configurable...
265 |   checkJava "1.6"
266 | 
267 |   #If we're in cygwin, we should use the windows config, and terminal hacks
268 |   if [[ "$CYGWIN_FLAG" == "true" ]]; then
269 |     stty -icanon min 1 -echo > /dev/null 2>&1
270 |     addJava "-Djline.terminal=jline.UnixTerminal"
271 |     addJava "-Dsbt.cygwin=true"
272 |   fi
273 | 
274 |   # run sbt
275 |   execRunner "$java_cmd" \
276 |     "-Dactivator.home=$(make_url "$sbt_home")" \
277 |     ${SBT_OPTS:-$default_sbt_opts} \
278 |     $(get_mem_opts $sbt_mem) \
279 |       ${JAVA_OPTS} \
280 |     ${java_args[@]} \
281 |     -jar "$sbt_jar" \
282 |     "${sbt_commands[@]}" \
283 |     "${residual_args[@]}"
284 | 
285 |   exit_code=$?
286 | 
287 |   # Clean up the terminal from cygwin hacks.
288 |   if [[ "$CYGWIN_FLAG" == "true" ]]; then
289 |     stty icanon echo > /dev/null 2>&1
290 |   fi
291 |   exit $exit_code
292 | }
293 | 
294 | 
295 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
296 | declare -r sbt_opts_file=".sbtopts"
297 | declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts"
298 | declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt"
299 | 
300 | # Print the help text on stdout.
301 | # The here-document is unquoted, so the variables and command substitutions
302 | # inside it (including a call to `java -version`) are evaluated each time
303 | # usage is invoked.  The -mem default falls back to 1024, the same default
304 | # get_mem_opts uses, when sbt_mem has not been set.
305 | usage() {
306 |  cat <<EOM
307 | Usage: $script_name [options]
308 | 
309 |   Command:
310 |   ui                 Start the Activator UI
311 |   new [name] [template-id]  Create a new project with [name] using template [template-id]
312 |   list-templates     Print all available template names
313 | 
314 |   Options:
315 |   -h | -help         print this message
316 |   -v | -verbose      this runner is chattier
317 |   -d | -debug        set sbt log level to debug
318 |   -no-colors         disable ANSI color codes
319 |   -sbt-create        start sbt even if current directory contains no sbt project
320 |   -sbt-dir   <path>  path to global settings/plugins directory (default: ~/.sbt)
321 |   -sbt-boot  <path>  path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
322 |   -ivy       <path>  path to local Ivy repository (default: ~/.ivy2)
323 |   -mem    <integer>  set memory options (default: ${sbt_mem:-1024}, which is $(get_mem_opts $sbt_mem))
324 |   -no-share          use all local caches; no sharing
325 |   -no-global         uses global caches, but does not use global ~/.sbt directory.
326 |   -jvm-debug <port>  Turn on JVM debugging, open at the given port.
327 |   -batch             Disable interactive mode
328 | 
329 |   # sbt version (default: from project/build.properties if present, else latest release)
330 |   -sbt-version  <version>   use the specified version of sbt
331 |   -sbt-jar      <path>      use the specified jar as the sbt launcher
332 |   -sbt-rc                   use an RC version of sbt
333 |   -sbt-snapshot             use a snapshot version of sbt
334 | 
335 |   # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
336 |   -java-home <path>         alternate JAVA_HOME
337 | 
338 |   # jvm options and output control
339 |   JAVA_OPTS          environment variable, if unset uses "$java_opts"
340 |   SBT_OPTS           environment variable, if unset uses "$default_sbt_opts"
341 |   ACTIVATOR_OPTS     Environment variable, if unset uses ""
342 |   .sbtopts           if this file exists in the current directory, it is
343 |                      prepended to the runner args
344 |   /etc/sbt/sbtopts   if this file exists, it is prepended to the runner args
345 |   -Dkey=val          pass -Dkey=val directly to the java runtime
346 |   -J-X               pass option -X directly to the java runtime
347 |                      (-J is stripped)
348 |   -S-X               add -X to sbt's scalacOptions (-S is stripped)
349 | 
350 | In the case of duplicated or conflicting options, the order above
351 | shows precedence: JAVA_OPTS lowest, command line options highest.
352 | EOM
353 | }
349 | 
350 | 
351 | 
352 | process_my_args () {
353 |   while [[ $# -gt 0 ]]; do
354 |     case "$1" in
355 |      -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
356 |       -no-share) addJava "$noshare_opts" && shift ;;
357 |      -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
358 |       -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
359 |        -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
360 |      -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
361 |          -batch) exec </dev/null && shift ;;
362 | 
363 |     -sbt-create) sbt_create=true && shift ;;
364 | 
365 |               *) addResidual "$1" && shift ;;
366 |     esac
367 |   done
368 | 
369 |   # Now, ensure sbt version is used.
370 |   [[ "${sbt_version}XXX" != "XXX" ]] && addJava "-Dsbt.version=$sbt_version"
371 | }
372 | 
373 | # Print the options stored in a config file, one per line.
374 | #   $1 - path of the file to read
375 | # Lines starting with "#" are dropped.  Every other line is passed through
376 | # `eval echo`, so shell expansions written in the file (such as $HOME) are
377 | # resolved.
378 | # NOTE(review): because of the eval, the file content is executed by the
379 | # shell; only point this at files you trust.
380 | loadConfigFile() {
381 |   local line
382 |   # `|| [[ -n "$line" ]]` keeps the final line when the file does not end in
383 |   # a newline; plain `read` returns failure there and the line would be lost.
384 |   sed '/^\#/d' "$1" | while read line || [[ -n "$line" ]]; do
385 |     eval echo $line
386 |   done
387 | }
378 | 
379 | # TODO - Pull in config based on operating system... (MSYS + cygwin should pull in txt file).
380 | # Here we pull in the global settings configuration.
381 | [[ -f "$etc_sbt_opts_file" ]] && set -- $(loadConfigFile "$etc_sbt_opts_file") "$@"
382 | # -- Windows behavior stub'd
383 | # JAVA_OPTS=$(cat "$WDIR/sbtconfig.txt" | sed -e 's/\r//g' -e 's/^#.*$//g' | sed ':a;N;$!ba;s/\n/ /g')
384 | 
385 | 
386 | #  Pull in the project-level config file, if it exists.
387 | [[ -f "$sbt_opts_file" ]] && set -- $(loadConfigFile "$sbt_opts_file") "$@"
388 | 
389 | # if configuration files exist, prepend their contents to the java args so it can be processed by this runner
390 | # a "versioned" config trumps one on the top level
391 | if [[ -f "$java_opts_config_version" ]]; then
392 |   addConfigOpts $(loadConfigFile "$java_opts_config_version")
393 | elif [[ -f "$java_opts_config_home" ]]; then
394 |   addConfigOpts $(loadConfigFile "$java_opts_config_home")
395 | fi
396 | 
397 | run "$@"
398 | 


--------------------------------------------------------------------------------