├── .gitignore ├── README.md ├── app-api └── src │ ├── main │ ├── resources │ │ ├── logback-test.xml │ │ └── reference.conf │ └── scala │ │ └── crawler │ │ └── app │ │ ├── Main.scala │ │ ├── common │ │ ├── BaseRoute.scala │ │ └── JsonSupport.scala │ │ └── routes │ │ ├── ApiRoutes.scala │ │ ├── NewsRoute.scala │ │ └── SiteRoute.scala │ ├── test │ └── scala │ │ ├── demo.sc │ │ ├── saic.sc │ │ └── worksheet.sc │ └── universal │ └── conf │ ├── application-test.conf │ ├── application.conf │ ├── jvmopts │ └── logback.xml ├── module-news ├── docs │ └── 杂记.txt └── src │ ├── main │ └── scala │ │ └── crawler │ │ └── module │ │ └── news │ │ ├── NewsJsonSupport.scala │ │ ├── NewsUtils.scala │ │ ├── commands │ │ └── Commands.scala │ │ ├── crawlers │ │ ├── BaiduNews.scala │ │ ├── CourtNews.scala │ │ ├── HaosouNews.scala │ │ ├── NewsCrawler.scala │ │ ├── SogouNews.scala │ │ └── WechatNews.scala │ │ ├── enums │ │ ├── ItemSource.scala │ │ └── SearchMethod.scala │ │ ├── model │ │ ├── NewsItem.scala │ │ ├── NewsPage.scala │ │ ├── NewsPageItem.scala │ │ └── SearchResult.scala │ │ └── service │ │ ├── NewsDBRepo.scala │ │ ├── NewsMaster.scala │ │ ├── NewsService.scala │ │ └── actors │ │ ├── ItemPageWorker.scala │ │ ├── NewsJob.scala │ │ ├── NewsSourceJob.scala │ │ ├── PersistActor.scala │ │ └── SearchPageWorker.scala │ └── test │ ├── resources │ └── logback.xml │ └── scala │ └── crawler │ └── module │ └── news │ ├── crawlers │ ├── BaiduNewsTest.scala │ ├── CourtNewsTest.scala │ ├── HaosouNewsTest.scala │ └── WechatNewsTest.scala │ └── service │ ├── NewsDBRepoTest.scala │ └── actors │ └── NewsJobMasterTest.scala ├── module-site-search └── src │ ├── main │ └── scala │ │ └── crawler │ │ └── module │ │ └── site │ │ ├── BaiduSite.scala │ │ ├── QueryCond.scala │ │ ├── SearchSyntax.scala │ │ └── model │ │ ├── SearchRequest.scala │ │ ├── SiteItem.scala │ │ └── SiteResult.scala │ └── test │ ├── resources │ └── logback.xml │ └── scala │ └── crawler │ └── module │ └── site │ └── BaiduSiteTest.scala ├── project ├── Build.scala ├── BuildSettings.scala ├── build.properties ├── plugins.sbt └── sbt-launch.jar ├── sbt └── util └── src ├── main ├── java │ └── crawler │ │ └── util │ │ └── news │ │ └── contextextractor │ │ ├── ContentExtractor.java │ │ └── News.java ├── resources │ └── reference.conf └── scala │ └── crawler │ ├── SystemUtils.scala │ └── util │ ├── Crawler.scala │ ├── JsoupImplicits.scala │ ├── Utils.scala │ ├── actors │ └── MetricActor.scala │ ├── http │ ├── HttpClient.scala │ └── TJsonSupport.scala │ ├── persist │ └── CassandraPersists.scala │ └── time │ └── TimeUtils.scala └── test └── scala └── crawler ├── testsuite └── ServiceSpec.scala └── util └── persist └── CassandraPersistsTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | app-api/package/ 2 | logs/ 3 | target/ 4 | .idea 5 | .idea_modules 6 | .classpath 7 | .project 8 | .settings 9 | RUNNING_PID 10 | app.pid 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawler Service 2 | 3 | 爬虫服务 4 | 5 | - Akka Stream & Http 1.0 6 | - Cassandra 2.1 7 | - Json4s 3.3 8 | 9 | ## Install 10 | 11 | ### 安装Cassandra 12 | 13 | [http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/](http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/) 14 | 15 | ### 配置 16 | 17 | 1. `util/src/main/resources/reference.conf`: 默认配置 18 | 2. 
`app/src/main/resources/application.conf`: 产品配置 19 | 20 | 具体使用说明请参考:[https://github.com/typesafehub/config](https://github.com/typesafehub/config)` 21 | 22 | ### 编译 23 | 24 | ``` 25 | ./sbt app/assembly 26 | ``` 27 | 28 | ### 运行 29 | 30 | ``` 31 | java -jar app/target/scala-2.11/crawler-app.jar 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /app-api/src/main/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /app-api/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | http { 3 | server { 4 | backlog = 1024 5 | max-connections = 8192 6 | socket-options { 7 | so-reuse-address = on 8 | } 9 | } 10 | host-connection-pool { 11 | max-connections = 8 12 | } 13 | } 14 | } 15 | 16 | crawler { 17 | api-uri = "http://120.26.93.104" 18 | 19 | akka-system-name = "crawler" 20 | 21 | network { 22 | server = "0.0.0.0" 23 | server = ${crawler.network.server} 24 | port = 33333 25 | } 26 | 27 | cassandra { 28 | nodes = ["192.168.31.242", "192.168.31.243"] 29 | keyspace = "crawler_spider" 30 | } 31 | 32 | http-client { 33 | headers { 34 | chromeMac { 35 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36" 36 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" 37 | Accept-Encoding = "gzip, deflate, sdch" 38 | Accept-Language = "zh-CN,zh;q=0.8,en;q=0.6" 39 | Connection = "keep-alive" 40 | } 41 | 42 | safariMac { 43 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7" 44 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 45 | } 46 | 47 | firefoxMac { 48 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0" 49 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 50 | Accept-Encoding = "gzip, deflate" 51 | Accept-Language = "en-US,en;q=0.5" 52 | Connection = "keep-alive" 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/Main.scala: -------------------------------------------------------------------------------- 1 | package crawler.app 2 | 3 | import java.nio.file.{Files, Paths} 4 | 5 | import akka.http.scaladsl.Http 6 | import com.typesafe.config.ConfigFactory 7 | import com.typesafe.scalalogging.StrictLogging 8 | import crawler.SystemUtils 9 | import crawler.app.routes.ApiRoutes 10 | import crawler.util.Utils 11 | 12 | import scala.util.{Failure, Success} 13 | 14 | /** 15 | * Main 16 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
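 * 
 * Startup flow (as implemented below): write the process id to `app.pid`, load the Typesafe config,
 * then bind the routes from [[crawler.app.routes.ApiRoutes]] on `crawler.network.server`:`crawler.network.port`;
 * a failed bind shuts the actor system down. As an aside, the listen port can be overridden with a
 * system property (standard Typesafe Config behaviour, not something defined in this file), e.g.:
 * {{{
 *   java -Dcrawler.network.port=8080 -jar crawler-app.jar
 * }}}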
17 | */ 18 | object Main extends App with StrictLogging { 19 | 20 | import SystemUtils._ 21 | import system.dispatcher 22 | 23 | Files.write(Paths.get("app.pid"), Utils.getPid.getBytes(Utils.CHARSET)) 24 | 25 | val config = ConfigFactory.load() 26 | 27 | println(config.getString("crawler.network.server") + ":" + config.getInt("crawler.network.port")) 28 | 29 | Http().bindAndHandle(ApiRoutes(), config.getString("crawler.network.server"), config.getInt("crawler.network.port")) 30 | .onComplete { 31 | case Success(binding) => 32 | logger.info(s"binding: $binding") 33 | case Failure(e) => 34 | e.printStackTrace() 35 | SystemUtils.shutdown() 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/common/BaseRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.common 2 | 3 | import akka.http.scaladsl.server.Directives 4 | import com.typesafe.scalalogging.LazyLogging 5 | import crawler.SystemUtils 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 9 | */ 10 | trait BaseRoute extends Directives with JsonSupport with LazyLogging { 11 | implicit def system = SystemUtils.system 12 | 13 | implicit def mat = SystemUtils.materializer 14 | 15 | implicit def dispatcher = system.dispatcher 16 | } 17 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/common/JsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.common 2 | 3 | import akka.http.scaladsl.marshalling._ 4 | import akka.http.scaladsl.model.{HttpCharsets, MediaTypes} 5 | import akka.http.scaladsl.unmarshalling._ 6 | import akka.stream.Materializer 7 | import crawler.module.news.NewsJsonSupport 8 | import crawler.module.site.QueryCond 9 | import crawler.util.http.TJsonSupport 10 | import org.json4s.ext.EnumNameSerializer 11 | import org.json4s.{Formats, Serialization} 12 | 13 | /** 14 | * Json Support 15 | * Created by yangjing on 15-11-6. 
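 * 
 * Wires json4s into Akka HTTP (un)marshalling, so routes mixing in this trait can unmarshal and
 * marshal case classes directly. A minimal sketch of the kind of round trip this enables, taken
 * from how SiteRoute uses it:
 * {{{
 *   entity(as[SearchRequest]) { req => complete(req) }
 * }}}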
16 | */ 17 | trait JsonSupport extends TJsonSupport with NewsJsonSupport { 18 | implicit override val formats: Formats = defaultFormats + 19 | new EnumNameSerializer(QueryCond) 20 | 21 | implicit def json4sUnmarshallerConverter[A: Manifest](serialization: Serialization, formats: Formats)(implicit mat: Materializer): FromEntityUnmarshaller[A] = 22 | json4sUnmarshaller(manifest, serialization, formats, mat) 23 | 24 | implicit def json4sUnmarshaller[A: Manifest](implicit serialization: Serialization, formats: Formats, mat: Materializer): FromEntityUnmarshaller[A] = 25 | Unmarshaller.byteStringUnmarshaller 26 | .forContentTypes(MediaTypes.`application/json`) 27 | .mapWithCharset { (data, charset) => 28 | val input = if (charset == HttpCharsets.`UTF-8`) data.utf8String else data.decodeString(charset.nioCharset.name) 29 | serialization.read(input) 30 | } 31 | 32 | implicit def json4sMarshallerConverter[A <: AnyRef](serialization: Serialization, formats: Formats): ToEntityMarshaller[A] = 33 | json4sMarshaller(serialization, formats) 34 | 35 | implicit def json4sMarshaller[A <: AnyRef](implicit serialization: Serialization, formats: Formats): ToEntityMarshaller[A] = 36 | Marshaller.StringMarshaller.wrap(MediaTypes.`application/json`)(serialization.write[A]) 37 | } 38 | 39 | object JsonSupport extends JsonSupport 40 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/ApiRoutes.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import akka.http.scaladsl.model.HttpResponse 4 | import akka.http.scaladsl.server.Directives 5 | 6 | /** 7 | * ApiRoute 8 | * Created by yangjing on 15-11-3. 9 | */ 10 | object ApiRoutes extends Directives { 11 | 12 | def apply() = 13 | pathPrefix("api") { 14 | path("health_check") { 15 | (get | head) { 16 | complete(HttpResponse()) 17 | } 18 | } ~ 19 | NewsRoute() ~ 20 | SiteRoute() 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/NewsRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import akka.http.scaladsl.marshalling.Marshal 6 | import akka.http.scaladsl.model._ 7 | import com.typesafe.config.ConfigFactory 8 | import crawler.SystemUtils 9 | import crawler.app.common.BaseRoute 10 | import crawler.module.news.crawlers._ 11 | import crawler.module.news.enums.{ItemSource, SearchMethod} 12 | import crawler.module.news.service.NewsService 13 | import crawler.util.Utils 14 | 15 | import scala.concurrent.Future 16 | import scala.concurrent.duration.Duration 17 | import scala.util.Try 18 | 19 | /** 20 | * 新闻路由 21 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
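 * 
 * Mounted at `GET /api/news` (see ApiRoutes). A sample request against a local instance, assuming
 * the default port 33333 from reference.conf and a placeholder company name:
 * {{{
 *   curl 'http://localhost:33333/api/news?company=some-company&source=baidu&method=F&duration=15&version=1'
 * }}}
 * `version=1` (the default) returns the SearchResult list, `version=3` returns only the flattened
 * news items, and `version=2` tries the remote crawler-api first and falls back to a local fetch.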
22 | */ 23 | object NewsRoute extends BaseRoute { 24 | 25 | val config = ConfigFactory.load() 26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient)) 27 | NewsCrawler.registerCrawler(ItemSource.sogou, new SogouNews(SystemUtils.httpClient)) 28 | NewsCrawler.registerCrawler(ItemSource.haosou, new HaosouNews(SystemUtils.httpClient)) 29 | NewsCrawler.registerCrawler(ItemSource.court, new CourtNews(SystemUtils.httpClient)) 30 | // NewsCrawler.registerCrawler(NewsSource.wechat, new WechatNews(httpClient)) 31 | 32 | val newsService = new NewsService() 33 | 34 | def apply() = 35 | pathPrefix("news") { 36 | pathEnd { 37 | get { 38 | parameters( 39 | 'company.as[String], 40 | 'source.as[String] ? "", 41 | 'method.as[String] ? "", 42 | 'duration.as[Int] ? 15, 43 | 'forcedLatest.as[String] ? "", 44 | 'version.as[String] ? "1") { (company, source, method, duration, forcedLatest, version) => 45 | 46 | val future: Future[HttpResponse] = 47 | version match { 48 | case "3" => 49 | fromLocal(company, Seq(ItemSource.baidu) /*NewsSource.withToNames(source)*/ , method, duration, forcedLatest).flatMap(list => 50 | Marshal(list.flatMap(_.news)).to[HttpResponse] 51 | ) 52 | 53 | case "2" => 54 | fromCrawlerApi(company).recoverWith { 55 | case e: Exception => 56 | logger.warn("fromCralwerApi recover with: " + e, e) 57 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list => 58 | Marshal(list.flatMap(_.news)).to[HttpResponse] 59 | ) 60 | } 61 | 62 | case _ => 63 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list => 64 | Marshal(list).to[HttpResponse] 65 | ) 66 | } 67 | complete(future) 68 | } 69 | } 70 | } 71 | } 72 | 73 | private def fromLocal(company: String, sources: Traversable[ItemSource.Value], method: String, duration: Int, forcedLatest: String) = { 74 | val mtd = Try(SearchMethod.withName(method)).getOrElse(SearchMethod.F) 75 | newsService. 76 | fetchNews(company, sources, mtd, Duration(duration, TimeUnit.SECONDS), forcedLatest == "y") 77 | } 78 | 79 | private def fromCrawlerApi(company: String) = 80 | SystemUtils.httpClient.get(config.getString("crawler.api-uri") + "/api/news") 81 | .queryParam("companyName" -> company) 82 | .execute() 83 | .map { resp => 84 | if (resp.getStatusCode != 200) 85 | throw new RuntimeException(s"crawler-api not found company: $company, return: ${resp.getStatusCode}") 86 | 87 | HttpResponse( 88 | StatusCodes.OK, 89 | entity = HttpEntity(ContentType(MediaTypes.`application/json`), resp.getResponseBody(Utils.CHARSET.name())) 90 | ) 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/SiteRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import crawler.SystemUtils 4 | import crawler.module.site.BaiduSite 5 | import crawler.app.common.BaseRoute 6 | import crawler.module.site.model.SearchRequest 7 | 8 | /** 9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 
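 * 
 * Mounted at `POST /api/site/baidu` (see ApiRoutes): the JSON body is unmarshalled into a
 * [[crawler.module.site.model.SearchRequest]] (its fields are defined in module-site-search and
 * not repeated here), handed to a BaiduSite crawler, and the fetched item list is returned as JSON.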
10 | */ 11 | object SiteRoute extends BaseRoute { 12 | 13 | def apply() = 14 | pathPrefix("site") { 15 | path("baidu") { 16 | post { 17 | entity(as[SearchRequest]) { searchRequest => 18 | val baidu = new BaiduSite(SystemUtils.httpClient, searchRequest) 19 | complete(baidu.fetchItemList()) 20 | } 21 | } 22 | } 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /app-api/src/test/scala/demo.sc: -------------------------------------------------------------------------------- 1 | import java.nio.charset.Charset 2 | import java.nio.file.{Paths, Files} 3 | import scala.collection.JavaConverters._ 4 | 5 | import scala.io.Source 6 | 7 | val s = 8 | """crawler-news001 121.199.23.3 9 | |crawler-news002 121.199.4.6 10 | |crawler-news003 121.199.2.152 11 | |crawler-news004 121.199.12.190 12 | |crawler-news005 121.41.53.230 13 | |crawler-news006 121.199.5.96 14 | |crawler-news007 121.199.20.87 15 | |crawler-news008 121.40.93.44 16 | |crawler-news009 121.199.22.228 17 | |crawler-news010 120.26.94.198 18 | |crawler-news011 120.26.94.202 19 | |crawler-news012 120.26.94.146 20 | |crawler-news013 120.26.94.163 21 | |crawler-news014 120.26.94.211 22 | |crawler-news015 120.26.94.117 23 | |crawler-news016 120.26.94.195 24 | |crawler-news017 120.26.94.207 25 | |crawler-news018 120.26.94.185 26 | |crawler-news019 120.26.93.249 27 | |crawler-news020 120.26.94.17 28 | |crawler-news021 120.26.94.5 29 | |crawler-news022 120.26.94.7 30 | |crawler-news023 120.26.93.202 31 | |crawler-news024 120.26.94.188 32 | |crawler-news025 120.26.94.35 33 | |crawler-news026 120.26.94.58 34 | |crawler-news027 120.26.94.120 35 | |crawler-news028 120.26.94.203 36 | |crawler-news029 120.26.94.38 37 | |crawler-news030 120.26.94.150 38 | |crawler-news031 120.26.94.151 39 | |crawler-news032 120.26.94.147 40 | |crawler-news033 120.26.94.28 41 | |crawler-news034 120.26.94.191 42 | |crawler-news035 120.26.94.18 43 | |crawler-news036 120.26.93.254 44 | |crawler-news037 120.26.94.49 45 | |crawler-news038 120.26.94.139 46 | |crawler-news039 120.26.94.2 47 | |crawler-news040 120.26.94.4 48 | |crawler-news041 120.26.94.23 49 | |crawler-news042 120.26.94.29 50 | |crawler-news043 120.26.94.174 51 | |crawler-news044 120.26.94.8 52 | |crawler-news045 120.26.93.240 53 | |crawler-news046 120.26.93.215 54 | |crawler-news047 120.26.94.122 55 | |crawler-news048 120.26.94.12 56 | |crawler-news049 120.26.92.125 57 | |crawler-news050 120.26.92.180 58 | |crawler-news051 120.26.93.219 59 | |crawler-news052 120.26.94.76 60 | |crawler-news053 120.26.93.229 61 | |crawler-news054 120.26.94.22 62 | |crawler-news055 120.26.94.14 63 | |crawler-news056 120.26.94.84 64 | |crawler-news057 120.26.94.27 65 | |crawler-news058 120.26.93.221 66 | |crawler-news059 121.43.60.236""".stripMargin 67 | val lines = Source.fromString(s).getLines().map(_.split(" ")(0)).toStream 68 | 69 | //val ss = Source.fromString(s).getLines().map { v => 70 | // val ip = v.drop(19) 71 | // val hostname = v.take(15) 72 | // Seq(hostname, ip, "1核1G", "/usr/app/python
/home/sc/open-falcon/agent") 73 | // .mkString("| ", " | ", " |") 74 | //}.toStream 75 | // 76 | //val lines = 77 | // Stream( 78 | // Seq("hostname ", "IP", "hardware", "path"), 79 | // Seq("----------------", "--", "--------", "----") 80 | // ).map(_.mkString("| ", " | ", " |")) #::: 81 | // ss 82 | 83 | Files.write(Paths.get("/tmp/crawler-news-hosts.txt"), lines.asJava) -------------------------------------------------------------------------------- /app-api/src/test/scala/saic.sc: -------------------------------------------------------------------------------- 1 | import scala.io.Source 2 | 3 | val s = 4 | """|120.55.182.150
(10.117.12.74) | 1核1G | /usr/app/python |saic | 5 | ||120.26.225.105
(10.117.55.14) | 1核1G | /usr/app/python |saic | 6 | ||121.41.2.74
(10.168.96.82) | 1核1G | /usr/app/python |saic | 7 | ||120.55.113.230
(10.168.152.118) | 1核1G | /usr/app/python |saic | 8 | ||120.55.114.18
(10.168.154.133) | 1核1G | /usr/app/python |saic | 9 | ||120.55.88.109
(10.117.196.51) | 1核1G | /usr/app/python |saic | 10 | ||121.41.2.196
(10.168.91.79) | 1核1G | /usr/app/python |saic | 11 | ||121.41.2.186
(10.168.94.151) | 1核1G | /usr/app/python |saic | 12 | ||120.55.64.125
(10.117.211.194) | 1核1G | /usr/app/python |saic | 13 | ||121.41.2.162
(10.168.93.81) | 1核1G | /usr/app/python |saic | 14 | ||121.41.1.166
(10.168.54.249) | 1核1G | /usr/app/python |saic | 15 | ||120.26.217.236
(10.117.52.105) | 1核1G | /usr/app/python |saic | 16 | ||120.26.92.73
(10.51.8.148) | 1核1G | /usr/app/python |saic | 17 | ||120.55.180.251
(10.117.8.21) | 1核1G | /usr/app/python |saic | 18 | ||120.26.91.2
(10.117.209.143) | 1核1G | /usr/app/python |saic | 19 | ||120.26.223.152
(10.117.51.186) | 1核1G | /usr/app/python |saic | 20 | ||120.26.223.135
(10.117.52.107) | 1核1G | /usr/app/python |saic | 21 | ||120.26.91.8
(10.117.209.141) | 1核1G | /usr/app/python |saic | 22 | ||120.55.112.92
(10.168.152.171) | 1核1G | /usr/app/python |saic | 23 | ||120.55.181.10
(10.117.8.192) | 1核1G | /usr/app/python |saic |""".stripMargin 24 | val lines = Source.fromString(s).getLines().toStream 25 | .map(v => v.take(v.indexOf('<')).replace("|", "")) 26 | 27 | // fabric hosts 28 | lines 29 | .map("xu.du@" + _) 30 | .mkString("[\"", "\",\"", "\"]") 31 | 32 | // hostnames 33 | lines.foreach(println) 34 | 35 | -------------------------------------------------------------------------------- /app-api/src/test/scala/worksheet.sc: -------------------------------------------------------------------------------- 1 | import java.time.LocalDateTime 2 | 3 | import crawler.module.site.BaiduSite 4 | //BaiduSite.dealTime("2015年1月13日") 5 | //BaiduSite.dealTime("2015年1月1日") 6 | //BaiduSite.dealTime("2015年11月13日") 7 | //BaiduSite.dealTime("2015年11月3日") 8 | 9 | "www.runoob.com/kjlsdf/sdf/".take("www.runoob.com/kjlsdf/sdf/".indexOf('/')) 10 | 11 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r 12 | def parseTime(s: String) = s.substring(0, s.indexOf('日')+1) match { 13 | case TIME_PATTERN(year, month, day) => LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0) 14 | case _ => null 15 | } 16 | parseTime("2015年1月13日 - ") 17 | parseTime("2015年1月1日") 18 | parseTime("2015年11月13日") 19 | parseTime("2015年11月3日") 20 | parseTime("2015年11月332日") 21 | parseTime("15年11月332日") 22 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/application-test.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | 12 | http { 13 | server { 14 | backlog = 1024 15 | max-connections = 8192 16 | socket-options { 17 | so-reuse-address = on 18 | } 19 | } 20 | host-connection-pool { 21 | max-connections = 8 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | 12 | http { 13 | server { 14 | backlog = 1024 15 | max-connections = 8192 16 | socket-options { 17 | so-reuse-address = on 18 | } 19 | } 20 | host-connection-pool { 21 | max-connections = 8 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/jvmopts: -------------------------------------------------------------------------------- 1 | -xmx2048m 2 | -xms2048m 3 | -file.encoding=UTF-8 -------------------------------------------------------------------------------- /app-api/src/universal/conf/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | logs/application.log 11 | 12 | 13 | logs/application-log-%d{yyyy-MM-dd}.gz 14 | 15 | 60 16 | 17 | 18 | %date{yyyy-MM-dd HH:mm:ss ZZZZ} [%level] from %logger in %thread - %n%message%n%xException%n 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 
-------------------------------------------------------------------------------- /module-news/docs/杂记.txt: -------------------------------------------------------------------------------- 1 | NewsMaster -> NewsJob* -> NewsSourceJob* -> SearchPageWorker 2 | PersistActor -> ItemPageWorker* 3 | 4 | 5 | NewsJob actor: 收到新闻抓取请求,管理新闻抓取状态,数据存储 6 | - SearchPageWorker actor: 进行新闻搜索页面抓取,并解析 7 | - ItemPageWorker actor: 新闻详情页面抓取,并抽取内容正文 8 | 9 | NewsJob actor: 每收到一次新闻抓取请求就实例化一个actor,在actor中再委派SearchPageWorker进行新闻搜索页抓取。 10 | 新闻搜索页抓取成功后数据回到Job actor,Job actor判断是否需要抓取全文,若是则再委派ItemPageWorker进行全文抓取。 11 | NewsJob将保存一个 timeout 超时值,由实例化时参数传入。超时到则向客户返回Timeout请求。而actor则继续等待子actor, 12 | 如:SearchPageWorker和Seq[ItemPageWorker]执行完(或有错误发生),再停止NewsJob 13 | 在 postStop 回调函数中进行数据持久化工作。 14 | 15 | SearchPageWorker: 根据参数抓取新闻搜索页的新闻列表,并将结果传回给 NewsJob 16 | 17 | ItemPageWorker: 根据url抓取新闻详情页正文内容。每条url生成一个actor。抓取成功一条则回传到 NewsJob 中,由 NewsJob 做进一步处理。 18 | 19 | 20 | 21 | DB存储添加索引和查找功能 22 | 23 | 在case class中加入一个transactionId,记录每一次新闻查询的请求事物。 24 | 25 | actor中尽量不传 ActorRef,而通过ActorPath或其它类似机制来查找actor 26 | 27 | 新闻数据。 28 | 29 | 使用Cassandra存储,2张表: 30 | 31 | create keyspace if not exists crawler_spider with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}; 32 | use crawler_spider; 33 | 34 | create type news_type ( 35 | url Text, 36 | source Text, 37 | title Text, 38 | time Timestamp, 39 | abstract Text 40 | ); 41 | create table search_page ( 42 | key Text, 43 | source Ascii, 44 | time Timestamp, 45 | count Int, 46 | news List>, 47 | primary key (key, source, time) 48 | ); 49 | create table news_page ( 50 | url Text, 51 | title Text, 52 | source Text, 53 | time Timestamp, 54 | abstract Text, 55 | content Text, 56 | primary key (url) 57 | ); 58 | create table page_html ( 59 | url Text, 60 | created_at Timestamp, 61 | src Text, 62 | primary key (url, created_at) 63 | ); 64 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/NewsJsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news 2 | 3 | import crawler.module.news.enums.{SearchMethod, ItemSource} 4 | import crawler.util.http.TJsonSupport 5 | import org.json4s.Formats 6 | import org.json4s.ext.EnumNameSerializer 7 | 8 | /** 9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 10 | */ 11 | trait NewsJsonSupport extends TJsonSupport { 12 | implicit val formats: Formats = defaultFormats + 13 | new EnumNameSerializer(ItemSource) + 14 | new EnumNameSerializer(SearchMethod) 15 | } 16 | 17 | object NewsJsonSupport extends NewsJsonSupport 18 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/NewsUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news 2 | 3 | import java.net.URI 4 | import java.util.concurrent.atomic.AtomicInteger 5 | 6 | /** 7 | * News Utils 8 | * Created by yangjing on 15-11-5. 
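 * 
 * `uriToBaseUri` keeps only the scheme/authority part of a URL, e.g. (illustrative):
 * {{{
 *   NewsUtils.uriToBaseUri("http://news.baidu.com/ns?word=abc")  // "http://news.baidu.com"
 * }}}
 * `getIndent` hands out a process-wide increasing integer, used to build unique actor names.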
9 | */ 10 | object NewsUtils { 11 | private val _nums = new AtomicInteger(0) 12 | 13 | def getIndent = _nums.getAndIncrement() 14 | 15 | def uriToBaseUri(uri: String): String = uriToBaseUri(URI.create(uri)) 16 | 17 | def uriToBaseUri(uri: URI): String = { 18 | val sb = new StringBuffer() 19 | if (uri.getScheme != null) { 20 | sb.append(uri.getScheme) 21 | sb.append(':') 22 | } 23 | if (uri.isOpaque) { 24 | sb.append(uri.getSchemeSpecificPart) 25 | } else { 26 | if (uri.getHost != null) { 27 | sb.append("//") 28 | if (uri.getUserInfo != null) { 29 | sb.append(uri.getUserInfo) 30 | sb.append('@') 31 | } 32 | val needBrackets = ((uri.getHost.indexOf(':') >= 0) 33 | && !uri.getHost.startsWith("[") 34 | && !uri.getHost.endsWith("]")) 35 | if (needBrackets) { 36 | sb.append('[') 37 | } 38 | sb.append(uri.getHost) 39 | if (needBrackets) sb.append(']') 40 | if (uri.getPort != -1) { 41 | sb.append(':') 42 | sb.append(uri.getPort) 43 | } 44 | } else if (uri.getAuthority != null) { 45 | sb.append("//") 46 | sb.append(uri.getAuthority) 47 | } 48 | } 49 | sb.toString 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/commands/Commands.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.commands 2 | 3 | import crawler.module.news.enums.{ItemSource, SearchMethod} 4 | import crawler.module.news.model.{NewsPageItem, SearchResult} 5 | 6 | import scala.concurrent.duration.FiniteDuration 7 | 8 | case class RequestSearchNews(sources: Seq[ItemSource.Value], msg: SearchNews) 9 | 10 | /** 11 | * 新闻源搜索 12 | * 13 | * @param key 关键词 14 | * @param method 搜索方式 15 | * @param duration 持续时间(超时) 16 | */ 17 | case class SearchNews(key: String, 18 | method: SearchMethod.Value, 19 | duration: FiniteDuration) 20 | 21 | /** 22 | * 开始搜索新闻 23 | */ 24 | case object StartSearchNews 25 | 26 | /** 27 | * 抓取搜索页 28 | */ 29 | case object StartFetchSearchPage 30 | 31 | /** 32 | * 搜索超时 33 | */ 34 | case object SearchTimeout 35 | 36 | /** 37 | * 搜索结果 38 | * 39 | * @param news 新闻结果 40 | */ 41 | case class SearchPageResult(news: SearchResult) 42 | 43 | /** 44 | * 搜索失败 45 | * 46 | * @param failure 失败结果 47 | */ 48 | case class SearchPageFailure(failure: Throwable) 49 | 50 | /** 51 | * 开始抓取新闻详情内容 52 | */ 53 | case object StartFetchItemPage 54 | 55 | /** 56 | * 新闻详情 57 | * 58 | * @param result 新闻详情 59 | */ 60 | case class ItemPageResult(result: Either[String, NewsPageItem]) 61 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/BaiduNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | import java.time.LocalDateTime 5 | import java.util.concurrent.TimeUnit 6 | 7 | import akka.util.Timeout 8 | import crawler.SystemUtils 9 | import crawler.module.news.enums.{SearchMethod, ItemSource} 10 | import crawler.module.news.model.{NewsItem, SearchResult} 11 | import crawler.util.http.HttpClient 12 | import crawler.util.news.contextextractor.ContentExtractor 13 | import crawler.util.time.TimeUtils 14 | import org.jsoup.Jsoup 15 | import org.jsoup.nodes.Element 16 | 17 | import scala.collection.JavaConverters._ 18 | import scala.concurrent.duration._ 19 | import scala.concurrent.{Await, ExecutionContext, Future, Promise} 20 | import scala.util.{Failure, Success} 21 | 22 | /** 23 | * 百度新闻爬虫 24 
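 * 
 * Fetches the Baidu News search page for a double-quoted keyword, follows up to PAGE_LIMIT result
 * pages (sleeping 500 ms between requests) and parses each `.result` block into a NewsItem.
 * A minimal usage sketch, assuming an ActorSystem dispatcher is in implicit scope:
 * {{{
 *   val baidu = new BaiduNews(HttpClient())
 *   baidu.fetchItemList("某公司名").foreach(r => println(s"${r.count} -> ${r.news.size}"))
 * }}}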
| * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 25 | */ 26 | class BaiduNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.baidu) { 27 | 28 | import crawler.util.JsoupImplicits._ 29 | 30 | override protected val defaultHeaders: Array[Seq[(String, String)]] = 31 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider")) 32 | 33 | private def parseNewsItem(news: Element): NewsItem = { 34 | val a = news.findByClass("c-title").first().getElementsByTag("a").first() 35 | val summary = news.findByClass("c-summary") 36 | val authorText = news.findByClass("c-author").text() 37 | val source = authorText.split("  ") 38 | val footer = summary.findByClass("c-info").first().text() 39 | NewsItem( 40 | a.text(), 41 | a.attr("href"), 42 | source.headOption.getOrElse(""), 43 | BaiduNews.dealTime(source.lastOption.getOrElse("")), 44 | summary.text().replace(authorText, "").replace(footer, "")) 45 | } 46 | 47 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 48 | val promise = Promise[Seq[NewsItem]]() 49 | 50 | val newsResultsFuture = fetchPage(BaiduNews.BAIDU_NEWS_BASE_URL.format(URLEncoder.encode('"' + key + '"', "UTF-8"))).map { resp => 51 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST) 52 | // logger.debug(doc.body().toString + "\n\n\n") 53 | val now = TimeUtils.now() 54 | if (doc.getElementById("noresult") != null) { 55 | SearchResult(newsSource, key, now, 0, Nil) 56 | } else { 57 | val countText = doc 58 | .getElementById("header_top_bar") 59 | .getElementsByAttributeValue("class", "nums") 60 | .first() 61 | .text() 62 | val count = 63 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt 64 | 65 | val newsDiv = doc.getElementById("content_left") 66 | val pages = doc.select("#page a").asScala 67 | val newsItemFutures = pages.take(BaiduNews.PAGE_LIMIT - 1).map { page => 68 | TimeUnit.MILLISECONDS.sleep(500) 69 | fetchPageLinks(BaiduNews.BAIDU_NEWS_HOST + page.attr("href")) 70 | } 71 | Future.sequence(newsItemFutures).map(_.flatten).onComplete { 72 | case Success(list) => 73 | promise.success(list) 74 | case Failure(e) => 75 | e.printStackTrace() 76 | promise.success(Nil) 77 | } 78 | 79 | SearchResult( 80 | newsSource, 81 | key, 82 | now, 83 | count, 84 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList) 85 | } 86 | } 87 | 88 | for { 89 | newsResult <- newsResultsFuture 90 | newsItems <- promise.future 91 | } yield { 92 | newsResult.copy(news = newsResult.news ++ newsItems) 93 | } 94 | } 95 | 96 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[NewsItem]] = { 97 | fetchPage(url).map { resp => 98 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST) 99 | if (doc.getElementById("noresult") != null) { 100 | Nil 101 | } else { 102 | val newsDiv = doc.getElementById("content_left") 103 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList 104 | } 105 | } 106 | } 107 | } 108 | 109 | object BaiduNews { 110 | val PAGE_LIMIT = 5 111 | val BAIDU_NEWS_HOST = "http://news.baidu.com" 112 | val BAIDU_NEWS_BASE_URL = "http://news.baidu.com/ns?word=%s&tn=news&from=news&cl=2&rn=20&ct=1" 113 | val TIME_PATTERN = """\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}""".r 114 | val FEW_HOURS_PATTERN = """(\d+)小时前""".r 115 | 116 | private def dealFewHours(timeStr: String): String = { 117 | val matcher = FEW_HOURS_PATTERN.pattern.matcher(timeStr) 118 | if (matcher.matches()) matcher.group(1) else "" 119 
| } 120 | 121 | def dealTime(timeStr: String): Option[LocalDateTime] = { 122 | val dt = if (timeStr.length < 2) { 123 | LocalDateTime.now() 124 | } else if (TIME_PATTERN.pattern.matcher(timeStr).matches()) { 125 | val s = timeStr.replaceAll( """年|月""", "-").replace("日", "") 126 | LocalDateTime.parse(s, TimeUtils.formatterDateMinus) 127 | } else if (FEW_HOURS_PATTERN.pattern.matcher(timeStr).matches()) { 128 | val now = LocalDateTime.now() 129 | val hour = dealFewHours(timeStr).toLong 130 | now.minusHours(hour) 131 | } else { 132 | null 133 | } 134 | Option(dt) 135 | } 136 | 137 | //////////////////////////////////////////////////////////////////////////// 138 | // 以下为测试用例 139 | //////////////////////////////////////////////////////////////////////////// 140 | 141 | def run(newsCrawler: NewsCrawler, 142 | name: String, 143 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = { 144 | val newsResult = newsCrawler.fetchItemList(name) 145 | if (SearchMethod.A == method) { 146 | newsResult 147 | } else { 148 | newsResult.flatMap { result => 149 | val seqs = result.news.map { news => 150 | newsCrawler.fetchPage(news.url).map { resp => 151 | (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent) 152 | } 153 | } 154 | val f = Future.sequence(seqs) 155 | f.map { urlContents => 156 | val news = result.news.map { news => 157 | urlContents.find(_._1 == news.url) match { 158 | case Some((_, content)) => 159 | news.copy(content = Option(content)) 160 | case None => 161 | news 162 | } 163 | } 164 | result.copy(news = news) 165 | } 166 | } 167 | } 168 | } 169 | 170 | def main(args: Array[String]): Unit = { 171 | import SystemUtils._ 172 | implicit val timeout = Timeout(10.hours) 173 | import system.dispatcher 174 | 175 | val httpClient = HttpClient() 176 | val baidu = new BaiduNews(httpClient) 177 | val f = run(baidu, "杭州今元标矩科技有限公司", SearchMethod.F) 178 | val result = Await.result(f, timeout.duration) 179 | result.news.foreach(news => println(news.content + "\n\n")) 180 | println(result.count) 181 | 182 | system.shutdown() 183 | httpClient.close() 184 | system.awaitTermination() 185 | // System.exit(0) 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/CourtNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.time.LocalDate 4 | 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.{NewsItem, SearchResult} 7 | import crawler.util.Utils 8 | import crawler.util.http.HttpClient 9 | import crawler.util.time.TimeUtils 10 | import org.jsoup.Jsoup 11 | import org.jsoup.nodes.Element 12 | 13 | import scala.collection.JavaConverters._ 14 | import scala.concurrent.{ExecutionContext, Future} 15 | import scala.util.Random 16 | 17 | /** 18 | * 中国法院网新闻搜索 19 | * Created by yangjing on 15-11-9. 
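 * 
 * Unlike the other crawlers, this one POSTs a form (keyword plus a publish-date window from
 * 2002-01-01 to today) to the chinacourt.org search endpoint and parses the returned
 * `div.search_content dl` entries; the hit count is read from `div.search_br span`.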
20 | */ 21 | class CourtNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.court) { 22 | private def fetchPagePost(url: String, data: Seq[(String, String)]) = { 23 | val headers = defaultHeaders(Random.nextInt(defaultHeaders.length)) 24 | httpClient.post(url).header(headers: _*).addFormParam(data: _*).execute() 25 | } 26 | 27 | private def parseNewsItem(elem: Element) = { 28 | val a = elem.select("dt").select("a").first() 29 | val dds = elem.select("dd") 30 | val item = NewsItem( 31 | a.text(), 32 | CourtNews.SITE_URL + a.attr("href"), 33 | "中国法院网", 34 | Option(TimeUtils.toLocalDateTime(dds.last().text().split("    ").last)), 35 | dds.first().text()) 36 | // println(item) 37 | item 38 | } 39 | 40 | /** 41 | * 抓取搜索页 42 | * 43 | * @param key 搜索关键词 44 | * @return 45 | */ 46 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 47 | fetchPagePost(CourtNews.SEARCH_URL, Seq( 48 | "keyword" -> key, 49 | "button" -> "提交", 50 | "content_time_publish_begin" -> "2002-01-01", 51 | "content_time_publish_end" -> LocalDate.now().toString, 52 | "article_category_id" -> "", 53 | "content_author" -> "" 54 | )).map { resp => 55 | val now = TimeUtils.now() 56 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name), CourtNews.SITE_URL) 57 | val newsDl = doc.select("div.search_content").select("dl") 58 | if (newsDl.isEmpty) { 59 | SearchResult(newsSource, key, now, 0, Nil) 60 | } else { 61 | val newsItems = newsDl.asScala.map(parseNewsItem) 62 | val countText = doc.select("div.search_br").select("span").first().text 63 | val count = 64 | try { 65 | countText.toInt 66 | } catch { 67 | case e: Exception => 68 | logger.warn("count < 1: " + countText) 69 | 0 70 | } 71 | 72 | SearchResult(newsSource, key, now, count, newsItems) 73 | } 74 | } 75 | } 76 | } 77 | 78 | object CourtNews { 79 | val SITE_URL = "http://www.chinacourt.org" 80 | val SEARCH_URL = "http://www.chinacourt.org/article/search.shtml" 81 | } 82 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/HaosouNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | 5 | import crawler.module.news.NewsUtils 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.{NewsItem, SearchResult} 8 | import crawler.util.Utils 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.{ExecutionContext, Future} 16 | 17 | /** 18 | * 360好搜新闻搜索 19 | * Created by yangjing on 15-11-9. 
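 * 
 * Builds the query URL via `HaosouNews.searchUrl(key)`, parses the `ul#news li.res-list` entries
 * into NewsItems and reads the total hit count from `div#page span.nums`, falling back to the
 * number of parsed items when the count cannot be extracted.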
20 | */ 21 | class HaosouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.haosou) { 22 | private def parseItem(elem: Element) = { 23 | val a = elem.select("a") 24 | val newsInfo = elem.select("p.newsinfo") 25 | NewsItem( 26 | a.text(), 27 | a.attr("href"), 28 | newsInfo.select("span.sitename").text(), 29 | Option(TimeUtils.toLocalDateTime(newsInfo.select("span.posttime").attr("title"))), 30 | elem.select("p.content").text()) 31 | } 32 | 33 | /** 34 | * 抓取搜索页 35 | * 36 | * @param key 搜索关键词 37 | * @return 38 | */ 39 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 40 | fetchPage(HaosouNews.searchUrl(key)).map { resp => 41 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name()), NewsUtils.uriToBaseUri(HaosouNews.SEARCH_SITE)) 42 | val now = TimeUtils.now() 43 | val ul = doc.select("ul#news") 44 | if (ul.isEmpty) { 45 | SearchResult(newsSource, key, now, 0, Nil) 46 | } else { 47 | val newsItems = ul.select("li.res-list").asScala.map(parseItem) 48 | val countText = doc.select("div#page").select("span.nums").text() 49 | val count = 50 | try { 51 | """\d+""".r.findAllMatchIn(countText).mkString.toInt 52 | } catch { 53 | case e: Exception => 54 | logger.warn("count < 1") 55 | newsItems.size 56 | } 57 | SearchResult(newsSource, key, now, count, newsItems) 58 | } 59 | } 60 | } 61 | } 62 | 63 | object HaosouNews { 64 | val SEARCH_SITE = "http://news.haosou.com" 65 | 66 | def searchUrl(key: String) = SEARCH_SITE + "/ns?q=%s".format(URLEncoder.encode(key, Utils.CHARSET.name())) 67 | 68 | } 69 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/NewsCrawler.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import crawler.module.news.NewsUtils 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.{NewsPageItem, SearchResult} 7 | import crawler.util.Crawler 8 | import crawler.util.news.contextextractor.ContentExtractor 9 | import org.jsoup.helper.DataUtil 10 | 11 | import scala.concurrent.{ExecutionContext, Future} 12 | 13 | /** 14 | * 新闻爬虫 15 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
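 * 
 * Concrete crawlers are registered once in a process-wide registry and looked up by source,
 * e.g. (as done in NewsRoute):
 * {{{
 *   NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient))
 *   NewsCrawler.getCrawler(ItemSource.baidu)      // Option[NewsCrawler]
 * }}}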
16 | */ 17 | abstract class NewsCrawler(val newsSource: ItemSource.Value) extends Crawler with LazyLogging { 18 | /** 19 | * 抓取搜索页 20 | * 21 | * @param key 搜索关键词 22 | * @return 23 | */ 24 | def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] 25 | 26 | /** 27 | * 抓取新闻详情页 28 | * 29 | * @param url 网页链接 30 | * @return 31 | */ 32 | def fetchNewsItem(url: String)(implicit ec: ExecutionContext): Future[NewsPageItem] = { 33 | fetchPage(url).map { resp => 34 | val in = resp.getResponseBodyAsStream 35 | val doc = DataUtil.load(in, null, NewsUtils.uriToBaseUri(url)) 36 | val src = doc.toString 37 | val news = ContentExtractor.getNewsByDoc(doc) 38 | NewsPageItem(url, src, news.getContent) 39 | } 40 | } 41 | 42 | } 43 | 44 | object NewsCrawler { 45 | private var _newsCrawler = Map.empty[ItemSource.Value, NewsCrawler] 46 | 47 | def registerCrawler(source: ItemSource.Value, newsCrawler: NewsCrawler): Unit = { 48 | _newsCrawler = _newsCrawler + (source -> newsCrawler) 49 | } 50 | 51 | def getCrawler(source: ItemSource.Value): Option[NewsCrawler] = _newsCrawler.get(source) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/SogouNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | 5 | import akka.util.Timeout 6 | import crawler.SystemUtils 7 | import crawler.module.news.enums.{ItemSource, SearchMethod} 8 | import crawler.module.news.model.{NewsItem, SearchResult} 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.{Await, ExecutionContext, Future} 16 | import scala.util.Try 17 | 18 | /** 19 | * 搜狗新闻搜索 20 | * 21 | * @param httpClient 22 | */ 23 | class SogouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.sogou) { 24 | 25 | private def parseItem(elem: Element) = { 26 | val header = elem.select("h3.pt") 27 | val title = header.select("a.pp") 28 | val source = header.select("cite") match { 29 | case s if s.isEmpty => Array("", "") 30 | case s => s.text().split(SogouNews.CITE_SPLIT_CHAR) 31 | } 32 | val summary = elem.select("div.ft").text().replace( """>>\d+?条相同新闻""", "") 33 | 34 | NewsItem( 35 | title.text(), 36 | title.attr("href"), 37 | source(0), 38 | Option(TimeUtils.toLocalDateTime(source.tail.mkString(" "))), 39 | summary) 40 | } 41 | 42 | /** 43 | * 抓取搜索页 44 | * 45 | * @param key 搜索关键词 46 | * @return 47 | */ 48 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 49 | // val doc = fetchDocument(SogouCrawler.searchUrl(URLEncoder.encode(key, "UTF-8"))) 50 | fetchPage(SogouNews.searchUrl(URLEncoder.encode(key, "UTF-8"))).map { resp => 51 | val doc = Jsoup.parse(resp.getResponseBody, "http://news.sogou.com") 52 | val now = TimeUtils.now() 53 | // println(doc) 54 | val results = doc.select("div.results") 55 | if (results.isEmpty) { 56 | SearchResult(newsSource, key, now, 0, Nil) 57 | } else { 58 | val newsList = results.select("div.rb").asScala.map(parseItem) 59 | var count = Try( """\d+""".r.findAllMatchIn(doc.select("#pagebar_container").select("div.num").text()).mkString.toInt).getOrElse(0) 60 | if (count < 1) { 61 | logger.warn("count < 1") 62 | count = newsList.size 63 | } 64 | SearchResult(newsSource, key, now, count, 
newsList) 65 | } 66 | } 67 | } 68 | } 69 | 70 | object SogouNews { 71 | val REGEX = """\d+?条相同新闻""".r 72 | val CITE_SPLIT_CHAR = 160.toChar 73 | 74 | def searchUrl(key: String) = s"http://news.sogou.com/news?query=%22$key%22" 75 | 76 | //////////////////////////////////////////////////////////////////////////// 77 | // 以下为测试用例 78 | //////////////////////////////////////////////////////////////////////////// 79 | 80 | def run(newsCrawler: NewsCrawler, 81 | key: String, 82 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = { 83 | val newsResult = newsCrawler.fetchItemList(key) 84 | if (SearchMethod.A == method) { 85 | newsResult 86 | } else { 87 | newsResult.flatMap { result => 88 | val seqs = result.news.map { news => 89 | // newsCrawler.fetchPage(news.url).map { resp => 90 | // (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent) 91 | // } 92 | newsCrawler.fetchNewsItem(news.url) 93 | } 94 | val f = Future.sequence(seqs) 95 | f.map { pageItems => 96 | val news = result.news.map { news => 97 | pageItems.find(_.url == news.url) match { 98 | case Some(pageItem) => 99 | news.copy(content = Option(pageItem.content)) 100 | case None => 101 | news 102 | } 103 | } 104 | result.copy(news = news) 105 | } 106 | } 107 | } 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/WechatNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | import java.time.Instant 5 | 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.{NewsItem, SearchResult} 8 | import crawler.util.Utils 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.duration._ 16 | import scala.concurrent.{Await, ExecutionContext, Future} 17 | 18 | /** 19 | * 搜狗微信搜索 20 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10. 
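 * 
 * Scrapes Sogou's Weixin search. Result links are 302 redirects, so `find302Location` is used to
 * resolve the final article URL, and a `#seccodeImage` element in the response is treated as a
 * captcha page (an error message is returned instead of items). Registration of this crawler is
 * currently commented out in NewsRoute.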
21 | */ 22 | class WechatNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.wechat) { 23 | private def parseNewsItem(elem: Element) = { 24 | implicit val duration = 1.second 25 | 26 | try { 27 | val title = elem.select("h4") 28 | val footer = elem.select("div.s-p") 29 | val scriptStr = elem.select("script").last().text() 30 | val timeStr = """'(\d+?)'""".r.findFirstMatchIn(scriptStr).map(_.matched.replace("'", "")) 31 | val href = WechatNews.complateWeixinUrl(title.select("a").attr("href").trim) 32 | val url = Option(WechatNews.find302Location(href, requestHeaders())).getOrElse(href) 33 | NewsItem( 34 | title.text().trim, 35 | url, 36 | footer.select("a#weixin_account").attr("title"), 37 | Option(TimeUtils.toLocalDateTime(Instant.ofEpochSecond(timeStr.map(_.toLong).getOrElse(Instant.now().getEpochSecond)))), 38 | elem.select("p").text()) 39 | } catch { 40 | case e: Exception => 41 | logger.error(elem.toString) 42 | throw e 43 | } 44 | } 45 | 46 | /** 47 | * 抓取搜索页 48 | * 49 | * @param key 搜索关键词 50 | * @return 51 | */ 52 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 53 | fetchPage(WechatNews.searchUrl(key)).map { response => 54 | response.getHeaders.entrySet().asScala.foreach { case entry => println(entry.getKey + ": " + entry.getValue.asScala) } 55 | 56 | val now = TimeUtils.now() 57 | val doc = Jsoup.parse(response.getResponseBody(Utils.CHARSET.name()), "http://weixin.sogou.com") 58 | println(doc) 59 | val results = doc.select("div.wx-rb") 60 | if (!doc.select("#seccodeImage").isEmpty) { 61 | SearchResult(newsSource, key, now, 0, Nil, Some(doc.select("div.content-box").select("p.p2").text())) 62 | } else if (results.isEmpty) { 63 | SearchResult(newsSource, key, now, 0, Nil) 64 | } else { 65 | val newsItems = results.asScala.map(parseNewsItem) 66 | val countText = doc.select("resnum#scd_num").text().replace(",", "").trim 67 | val count = 68 | try { 69 | countText.toInt 70 | } catch { 71 | case e: Exception => 72 | logger.warn("count < 1: " + countText, e) 73 | newsItems.size 74 | } 75 | SearchResult(newsSource, key, now, count, newsItems) 76 | } 77 | } 78 | } 79 | 80 | } 81 | 82 | object WechatNews { 83 | final val WEIXIN_SEARCH_PAGE = "http://weixin.sogou.com" 84 | 85 | def complateWeixinUrl(uri: String) = 86 | if (uri.startsWith("/")) WEIXIN_SEARCH_PAGE + uri else uri 87 | 88 | def searchUrl(key: String) = 89 | WEIXIN_SEARCH_PAGE + "/weixin?query=%s&type=2".format(URLEncoder.encode(key, Utils.CHARSET.name())) 90 | 91 | def find302Location(url: String, headers: Seq[(String, String)])(implicit duration: Duration) = { 92 | val client = HttpClient(false) 93 | try { 94 | val resp = Await.result(client.get(url).header(headers: _*).execute(), duration) 95 | resp.getHeader("Location") 96 | } catch { 97 | case e: Exception => 98 | try { 99 | val respose = Await.result(client.get(url).header(headers: _*).execute(), duration + 1.second) 100 | respose.getHeader("Location") 101 | } catch { 102 | case e: Exception => 103 | // do nothing 104 | null 105 | } 106 | } finally { 107 | client.close() 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/enums/ItemSource.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.enums 2 | 3 | /** 4 | * 新闻来源 5 | * Created by yangjing on 15-11-4. 
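 * 
 * `withToNames` turns a comma-separated query parameter into enum values, e.g. (illustrative):
 * {{{
 *   ItemSource.withToNames("baidu,sogou")  // Seq(baidu, sogou)
 *   ItemSource.withToNames("")             // all values
 *   ItemSource.withToNames("baidu,nope")   // unknown names are silently dropped
 * }}}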
6 | */ 7 | object ItemSource extends Enumeration { 8 | val baidu = Value 9 | val sogou = Value 10 | val haosou = Value 11 | val court = Value 12 | val wechat = Value 13 | 14 | def withToNames(source: String): Traversable[Value] = 15 | if (source == null || source.isEmpty) { 16 | ItemSource.values 17 | } else { 18 | source.split(',').toSeq.collect { 19 | case s if ItemSource.values.exists(_.toString == s) => 20 | ItemSource.withName(s) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/enums/SearchMethod.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.enums 2 | 3 | /** 4 | * 查找方式 5 | * Created by yangjing on 15-11-4. 6 | */ 7 | object SearchMethod extends Enumeration { 8 | // 取摘要 9 | val A = Value 10 | 11 | // 取全文 12 | val F = Value 13 | } 14 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import com.datastax.driver.core.{UDTValue, UserType} 6 | import crawler.module.news.NewsJsonSupport._ 7 | import crawler.util.time.TimeUtils 8 | import org.json4s.Extraction 9 | 10 | /** 11 | * 新闻详情 12 | * Created by yangjing on 15-11-3. 13 | */ 14 | case class NewsItem(title: String, 15 | url: String, 16 | // 新闻来源(站点) 17 | source: String, 18 | time: Option[LocalDateTime], 19 | // 摘要 20 | `abstract`: String, 21 | content: Option[String] = None, 22 | values: Seq[String] = Nil, 23 | error: Option[String] = None) { 24 | def jsonPretty = { 25 | val jv = Extraction.decompose(this) 26 | serialization.writePretty(jv) 27 | } 28 | } 29 | 30 | object NewsItem { 31 | def toUDTValue(userType: UserType, ni: NewsItem): UDTValue = { 32 | userType.newValue() 33 | .setString("title", ni.title) 34 | .setString("url", ni.url) 35 | .setString("source", ni.source) 36 | .setTimestamp("time", ni.time.map(TimeUtils.toDate).orNull) 37 | .setString("abstract", ni.`abstract`) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsPage.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | /** 6 | * 新闻页 7 | * Created by yangjing on 15-11-9. 8 | */ 9 | case class NewsPage(url: String, 10 | title: String, 11 | source: String, 12 | time: Option[LocalDateTime], 13 | `abstract`: String, 14 | content: String) 15 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsPageItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | /** 4 | * 新闻页详情 5 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-05. 
6 | * @param url 网页链接 7 | * @param src 网页源码 8 | // * @param title 新闻标题 9 | // * @param time 发布时间 10 | * @param content 新闻内容 11 | */ 12 | case class NewsPageItem(url: String, 13 | src: String, 14 | // title: String, 15 | // time: String, 16 | content: String) 17 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/SearchResult.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import crawler.module.news.enums.ItemSource 6 | 7 | /** 8 | * 搜索结果 9 | * Created by yangjing on 15-11-3. 10 | */ 11 | case class SearchResult(source: ItemSource.Value, 12 | key: String, 13 | time: LocalDateTime, 14 | count: Int, 15 | news: Seq[NewsItem], 16 | error: Option[String] = None) 17 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsDBRepo.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import java.time.LocalDateTime 4 | 5 | import com.datastax.driver.core.{PreparedStatement, Session, UDTValue} 6 | import com.typesafe.scalalogging.LazyLogging 7 | import crawler.SystemUtils 8 | import crawler.module.news.enums.{ItemSource, SearchMethod} 9 | import crawler.module.news.model.{NewsItem, NewsPage, SearchResult} 10 | import crawler.util.persist.CassandraPersists 11 | import crawler.util.time.TimeUtils 12 | 13 | import scala.collection.JavaConverters._ 14 | import scala.collection.mutable 15 | import scala.concurrent.{ExecutionContextExecutor, Future} 16 | 17 | /** 18 | * News DB Service 19 | * Created by yangjing on 15-11-6. 20 | */ 21 | class NewsDBRepo extends LazyLogging { 22 | 23 | val KEYSPACE = SystemUtils.crawlerConfig.getString("cassandra.keyspace") 24 | val cachePrepares = mutable.Map.empty[String, PreparedStatement] 25 | 26 | private def findNews(key: String, 27 | source: ItemSource.Value, 28 | method: SearchMethod.Value, 29 | time: LocalDateTime)( 30 | implicit ec: ExecutionContextExecutor 31 | ): Future[Seq[SearchResult]] = { 32 | 33 | logger.debug(s"key: $key, source: $source, method: $method, time: $time") 34 | 35 | CassandraPersists.using(KEYSPACE) { implicit session => 36 | val stmt = getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? 
AND time > ?") 37 | val futureResultSet = session.executeAsync(stmt.bind(key, source.toString, TimeUtils.toDate(time))) 38 | val list = CassandraPersists.execute(futureResultSet) { rs => 39 | rs.asScala.map { row => 40 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt => 41 | NewsItem( 42 | udt.getString("title"), 43 | udt.getString("url"), 44 | udt.getString("source"), 45 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))), 46 | udt.getString("abstract")) 47 | ) 48 | 49 | val newsItemFuture = Future.sequence(news.map(news => 50 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content))))) 51 | 52 | newsItemFuture.map { newsList => 53 | SearchResult( 54 | ItemSource.withName(row.getString("source")), 55 | row.getString("key"), 56 | TimeUtils.toLocalDateTime(row.getTimestamp("time")), 57 | row.getInt("count"), 58 | newsList) 59 | } 60 | }.toList 61 | } 62 | 63 | list.flatMap(futures => Future.sequence(futures)) 64 | } 65 | } 66 | 67 | def findNews(key: String, 68 | sources: Traversable[ItemSource.Value], 69 | method: SearchMethod.Value, 70 | time: Option[LocalDateTime])( 71 | implicit ec: ExecutionContextExecutor 72 | ): Future[List[SearchResult]] = { 73 | 74 | val futureList = CassandraPersists.using(KEYSPACE) { implicit session => 75 | val pstmt = 76 | if (time.isEmpty) getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ?") 77 | else getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? AND time > ?") 78 | 79 | sources.flatMap { source => 80 | val stmt = 81 | if (time.isEmpty) pstmt.bind(key, source.toString) 82 | else pstmt.bind(key, source.toString, TimeUtils.toDate(time.get)) 83 | 84 | session.execute(stmt).asScala.map { row => 85 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt => 86 | NewsItem( 87 | udt.getString("title"), 88 | udt.getString("url"), 89 | udt.getString("source"), 90 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))), 91 | udt.getString("abstract")) 92 | ) 93 | 94 | val newsItemFuture = Future.sequence(news.map(news => 95 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content))))) 96 | 97 | newsItemFuture.map(list => 98 | SearchResult( 99 | ItemSource.withName(row.getString("source")), 100 | row.getString("key"), 101 | TimeUtils.toLocalDateTime(row.getTimestamp("time")), 102 | row.getInt("count"), 103 | list) 104 | ) 105 | 106 | } 107 | }.toList 108 | 109 | } 110 | 111 | Future.sequence(futureList) 112 | } 113 | 114 | def findOneNewsPageItem(url: String)( 115 | implicit session: Session, ec: ExecutionContextExecutor 116 | ): Future[Option[NewsPage]] = { 117 | 118 | val stmt = getPreparedStatement(session, "SELECT * FROM news_page WHERE url = ?") 119 | CassandraPersists.execute(session.executeAsync(stmt.bind(url))) { rs => 120 | rs.one match { 121 | case null => 122 | None 123 | case row => 124 | Some(NewsPage( 125 | row.getString("url"), 126 | row.getString("title"), 127 | row.getString("source"), 128 | Option(TimeUtils.toLocalDateTime(row.getTimestamp("time"))), 129 | row.getString("abstract"), 130 | row.getString("content")) 131 | ) 132 | } 133 | } 134 | } 135 | 136 | def saveToNewsPage(page: NewsPage): Unit = { 137 | CassandraPersists.using(KEYSPACE) { session => 138 | val stmt = getPreparedStatement(session, 139 | "INSERT INTO news_page(url, title, source, time, abstract, content) VALUES(?, ?, ?, ?, ?, ?)") 140 | session.executeAsync(stmt.bind( 141 | page.url, 142 | page.title, 143 | 
page.source, 144 | page.time.map(TimeUtils.toDate).orNull, 145 | page.`abstract`, 146 | page.content)) 147 | } 148 | } 149 | 150 | def saveToSearchPage(newsResult: SearchResult) = { 151 | // logger.debug(newsResult.news.mkString("\n")) 152 | logger.info(s"key: ${newsResult.key} found news: ${newsResult.count}, saved: ${newsResult.news.size}") 153 | CassandraPersists.using(KEYSPACE) { session => 154 | val newsType = CassandraPersists.userType(KEYSPACE, "news_type") 155 | val stmt = getPreparedStatement(session, "INSERT INTO search_page(key, source, time, count, news) VALUES(?, ?, ?, ?, ?)") 156 | session.executeAsync(stmt.bind( 157 | newsResult.key, 158 | newsResult.source.toString, 159 | TimeUtils.toDate(newsResult.time), 160 | Integer.valueOf(newsResult.count), 161 | newsResult.news.map(n => NewsItem.toUDTValue(newsType, n)).asJava)) 162 | } 163 | } 164 | 165 | private def getPreparedStatement(session: Session, sql: String): PreparedStatement = { 166 | // println("sql: " + sql) 167 | cachePrepares.getOrElse(sql, { 168 | val p = session.prepare(sql) 169 | cachePrepares.put(sql, p) 170 | p 171 | }) 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsMaster.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.NewsUtils 5 | import crawler.module.news.commands.RequestSearchNews 6 | import crawler.module.news.service.actors.{NewsJob, PersistActor} 7 | import crawler.util.actors.MetricActor 8 | 9 | /** 10 | * News Supervisor 11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 12 | */ 13 | class NewsMaster extends MetricActor { 14 | val persistActor = context.actorOf(PersistActor.props, PersistActor.actorName) 15 | 16 | override val metricReceive: Receive = { 17 | case RequestSearchNews(sources, msg) => 18 | val doSender = sender() 19 | val newsJob = context.actorOf(NewsJob.props(sources, doSender), "news-" + NewsUtils.getIndent) 20 | newsJob ! msg 21 | } 22 | } 23 | 24 | object NewsMaster { 25 | val actorName = "news-master" 26 | 27 | def props = Props(new NewsMaster) 28 | } 29 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsService.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import akka.pattern.ask 4 | import crawler.module.news.commands.{RequestSearchNews, SearchNews} 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.module.news.model.{NewsItem, SearchResult} 7 | import crawler.util.time.TimeUtils 8 | 9 | import scala.concurrent.Future 10 | import scala.concurrent.duration._ 11 | 12 | /** 13 | * 新闻服务 14 | * Created by yangjing on 15-11-3. 15 | */ 16 | class NewsService { 17 | 18 | import crawler.SystemUtils._ 19 | import system.dispatcher 20 | 21 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName) 22 | val dbRepo = new NewsDBRepo 23 | 24 | def fetchNewsApi(_key: String, 25 | sources: Traversable[ItemSource.Value], 26 | method: SearchMethod.Value, 27 | duration: FiniteDuration, 28 | forcedLatest: Boolean): Future[Seq[NewsItem]] = { 29 | fetchNews(_key, sources, method, duration, forcedLatest). 
30 | map(_.flatMap(_.news)) 31 | } 32 | 33 | def fetchNews(_key: String, 34 | sources: Traversable[ItemSource.Value], 35 | method: SearchMethod.Value, 36 | duration: FiniteDuration, 37 | forcedLatest: Boolean): Future[Seq[SearchResult]] = { 38 | val key = _key.trim 39 | val future = dbRepo.findNews(key, sources, method, if (forcedLatest) Some(TimeUtils.nowBegin()) else None) 40 | 41 | future.flatMap(results => 42 | if (results.isEmpty) { 43 | val msg = RequestSearchNews(sources.toSeq, SearchNews(key, method, duration)) 44 | // TODO 最长5分钟 45 | newsMaster.ask(msg)(5.minutes).mapTo[Seq[SearchResult]] 46 | } else { 47 | Future.successful(results) 48 | } 49 | ) 50 | } 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/ItemPageWorker.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.commands.{ItemPageResult, StartFetchItemPage} 5 | import crawler.module.news.crawlers.NewsCrawler 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.NewsItem 8 | import crawler.util.actors.MetricActor 9 | 10 | import scala.util.{Failure, Success} 11 | 12 | /** 13 | * 详情页面 14 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 15 | */ 16 | class ItemPageWorker(source: ItemSource.Value, newsItem: NewsItem) extends MetricActor { 17 | 18 | import context.dispatcher 19 | 20 | override val metricReceive: Receive = { 21 | case StartFetchItemPage => 22 | val doSender = sender() 23 | 24 | NewsCrawler.getCrawler(source) match { 25 | case Some(crawler) => 26 | crawler.fetchNewsItem(newsItem.url).onComplete { 27 | case Success(pageItem) => 28 | logger.debug(s"${newsItem.url} context OK") 29 | doSender ! ItemPageResult(Right(pageItem)) 30 | 31 | case Failure(e) => 32 | logger.warn(s"${newsItem.url} context extractor") 33 | e.printStackTrace() 34 | doSender ! ItemPageResult(Left(e.getLocalizedMessage)) 35 | } 36 | 37 | case None => 38 | doSender ! ItemPageResult(Left(s"Crawler $source not exists, ${newsItem.url} needed.")) 39 | } 40 | } 41 | 42 | } 43 | 44 | object ItemPageWorker { 45 | 46 | def props(source: ItemSource.Value, item: NewsItem) = Props(new ItemPageWorker(source, item)) 47 | 48 | } 49 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/NewsJob.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.{ActorRef, PoisonPill, Props} 4 | import crawler.module.news.commands.{SearchNews, StartSearchNews} 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.SearchResult 7 | import crawler.util.actors.MetricActor 8 | 9 | /** 10 | * NewsJob 11 | * 成功返回: Seq[NewsResult] 12 | * Created by yangjing on 15-11-5. 13 | */ 14 | class NewsJob(sources: Seq[ItemSource.Value], reqSender: ActorRef) extends MetricActor { 15 | @volatile var _completeJobs = 0 16 | @volatile var _newsResults = List.empty[SearchResult] 17 | 18 | override val metricReceive: Receive = { 19 | case SearchNews(key, method, duration) => 20 | sources.foreach { source => 21 | val jobName = source.toString 22 | val jobActor = context.actorOf(NewsSourceJob.props(source, method, key, duration, self), jobName) 23 | jobActor ! 
StartSearchNews 24 | } 25 | 26 | case result: SearchResult => 27 | _completeJobs += 1 28 | _newsResults ::= result 29 | if (sources.size == _completeJobs) { 30 | reqSender ! _newsResults 31 | 32 | // TODO 把 NewsJob 内的超时判断上移到 NewsJob ? 33 | self ! PoisonPill 34 | } 35 | 36 | } 37 | } 38 | 39 | object NewsJob { 40 | def props(sources: Seq[ItemSource.Value], reqSender: ActorRef) = Props(new NewsJob(sources, reqSender)) 41 | } 42 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/NewsSourceJob.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.{ActorRef, Cancellable, PoisonPill, Props} 4 | import crawler.module.news.commands._ 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.module.news.model.SearchResult 7 | import crawler.module.news.service.NewsMaster 8 | import crawler.util.actors.MetricActor 9 | import crawler.util.time.TimeUtils 10 | 11 | import scala.concurrent.duration.FiniteDuration 12 | 13 | /** 14 | * 新闻job 15 | * 16 | * @param source 搜索源 17 | * @param method 搜索方式 18 | * @param key 搜索关键词 19 | * @param duration 持续时间,到期后向未获取完新闻数据向客户端返回Timeout。children actor继续业务处理 20 | * @param reqSender 请求actor 21 | */ 22 | class NewsSourceJob(source: ItemSource.Value, 23 | method: SearchMethod.Value, 24 | key: String, 25 | duration: FiniteDuration, 26 | reqSender: ActorRef) extends MetricActor { 27 | 28 | private val persistActor = context.actorSelection(context.system / NewsMaster.actorName / PersistActor.actorName) 29 | @volatile var _newsResult = SearchResult(source, "", TimeUtils.now(), 0, Nil) 30 | @volatile var _isTimeout: Boolean = false 31 | @volatile var _notCompleteItemPageActorNames = Seq.empty[String] 32 | @volatile var _cancelableSchedule: Cancellable = _ 33 | 34 | import context.dispatcher 35 | 36 | override def metricPreStart(): Unit = { 37 | // 定义超时时间 38 | _cancelableSchedule = context.system.scheduler.scheduleOnce(duration, self, SearchTimeout) 39 | } 40 | 41 | override def metricPostStop(): Unit = { 42 | if (!_cancelableSchedule.isCancelled) { 43 | _cancelableSchedule.cancel() 44 | } 45 | 46 | if (null != _newsResult && _newsResult.count > 0) { 47 | persistActor ! _newsResult 48 | } else { 49 | logger.warn(s"${self.path} [$key]未获取到相关数据: ${_newsResult.error}") 50 | } 51 | } 52 | 53 | override val metricReceive: Receive = { 54 | case s@StartSearchNews => 55 | val searchPage = context.actorOf(SearchPageWorker.props(source, key), "page") 56 | searchPage ! StartFetchSearchPage 57 | 58 | case SearchPageResult(newsResult) => 59 | _newsResult = newsResult 60 | method match { 61 | case SearchMethod.F if _newsResult.count > 0 => // 需要抓取详情内容 62 | _notCompleteItemPageActorNames = newsResult.news.zipWithIndex.map { case (item, idx) => 63 | val childName = "item-" + idx 64 | val itemPage = context.actorOf(ItemPageWorker.props(source, item), childName) 65 | itemPage ! StartFetchItemPage 66 | childName 67 | } 68 | 69 | case _ => // SearchMethod.S => // 只抓取摘要 70 | if (!_isTimeout) { 71 | reqSender ! _newsResult 72 | } 73 | self ! PoisonPill 74 | } 75 | 76 | case ItemPageResult(result) => 77 | val doSender = sender() 78 | println(doSender.path) 79 | _notCompleteItemPageActorNames = _notCompleteItemPageActorNames.filterNot(_ == doSender.path.name) 80 | result match { 81 | case Left(errMsg) => 82 | // TODO 解析新闻详情页失败! 
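          // Detail-page extraction failed: only log it. The item keeps its search-page
          // abstract in _newsResult and still counts toward completion, so one bad page
          // does not block the reply to reqSender.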
83 | logger.error(errMsg) 84 | 85 | case Right(pageItem) => 86 | // 更新 result.news 87 | val news = _newsResult.news.map { 88 | case oldItem if oldItem.url == pageItem.url => 89 | oldItem.copy(content = Option(pageItem.content)) 90 | 91 | case oldItem => 92 | oldItem 93 | } 94 | 95 | _newsResult = _newsResult.copy(news = news) 96 | } 97 | 98 | if (_notCompleteItemPageActorNames.isEmpty) { 99 | if (!_isTimeout) { 100 | reqSender ! _newsResult 101 | } 102 | self ! PoisonPill 103 | } 104 | 105 | case SearchTimeout => 106 | _isTimeout = true 107 | 108 | // 此时向调用客户端返回已存在的数据,但实际的新闻抓取流程仍将继续 109 | reqSender ! _newsResult //Left(new AskTimeoutException("搜索超时")) 110 | 111 | case SearchPageFailure(e) => 112 | logger.warn(self.path + " ", e) 113 | if (!_isTimeout) { 114 | reqSender ! SearchResult(source, key, TimeUtils.now(), 0, Nil, Some(e.getLocalizedMessage)) 115 | } 116 | self ! PoisonPill 117 | } 118 | 119 | } 120 | 121 | object NewsSourceJob { 122 | def props(source: ItemSource.Value, 123 | method: SearchMethod.Value, 124 | key: String, 125 | duration: FiniteDuration, 126 | reqSender: ActorRef) = 127 | Props(new NewsSourceJob(source, method, key, duration, reqSender)) 128 | } 129 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/PersistActor.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.model.{NewsPage, SearchResult} 5 | import crawler.module.news.service.NewsDBRepo 6 | import crawler.util.actors.MetricActor 7 | 8 | /** 9 | * 持久化 10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 11 | */ 12 | class PersistActor extends MetricActor { 13 | val dbRepo = new NewsDBRepo 14 | 15 | override val metricReceive: Receive = { 16 | case newsResult: SearchResult => 17 | dbRepo.saveToSearchPage(newsResult) 18 | 19 | newsResult.news.foreach { item => 20 | val page = NewsPage(item.url, item.title, item.source, item.time, item.`abstract`, item.content.getOrElse("")) 21 | dbRepo.saveToNewsPage(page) 22 | } 23 | } 24 | 25 | } 26 | 27 | object PersistActor { 28 | val BATCH_SIZE = 20 29 | val actorName = "persist" 30 | 31 | def props = Props(new PersistActor) 32 | } 33 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/SearchPageWorker.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.commands.{SearchPageFailure, SearchPageResult, StartFetchSearchPage} 5 | import crawler.module.news.crawlers.NewsCrawler 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.util.actors.MetricActor 8 | 9 | import scala.util.{Failure, Success} 10 | 11 | /** 12 | * 搜索页面 13 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 14 | */ 15 | class SearchPageWorker(source: ItemSource.Value, key: String) extends MetricActor { 16 | 17 | import context.dispatcher 18 | 19 | override val metricReceive: Receive = { 20 | case StartFetchSearchPage => 21 | val doSender = sender() 22 | 23 | NewsCrawler.getCrawler(source) match { 24 | case Some(crawler) => 25 | crawler.fetchItemList(key).onComplete { 26 | case Success(result) => 27 | doSender ! SearchPageResult(result) 28 | stop() 29 | 30 | case Failure(e) => 31 | doSender ! 
SearchPageFailure(e) 32 | stop() 33 | } 34 | 35 | case None => 36 | doSender ! SearchPageFailure(new RuntimeException(s"Crawler $source not exists")) 37 | stop() 38 | } 39 | } 40 | 41 | private def stop(): Unit = context.stop(self) 42 | } 43 | 44 | object SearchPageWorker { 45 | 46 | def props(source: ItemSource.Value, name: String) = Props(new SearchPageWorker(source, name)) 47 | 48 | } -------------------------------------------------------------------------------- /module-news/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/BaiduNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03. 12 | */ 13 | class BaiduNewsTest extends ServiceSpec { 14 | 15 | implicit val timeout = Timeout(30.seconds) 16 | 17 | "BaiduNewsTest" should { 18 | 19 | "fetchNewsList" in { 20 | val baidu = new BaiduNews(HttpClient()) 21 | val result = Await.result(baidu.fetchItemList("阿里巴巴"), timeout.duration) 22 | result.news.foreach(println) 23 | println(result.source + " " + result.key) 24 | println(result.news.size) 25 | result.news must not be empty 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/CourtNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | class CourtNewsTest extends ServiceSpec { 11 | 12 | val timeout = Timeout(30.seconds) 13 | 14 | "CourtNewsTest" should { 15 | "fetchNewsList" in { 16 | val court = new CourtNews(HttpClient()) 17 | val result = Await.result(court.fetchItemList("重庆"), timeout.duration) 18 | result.news.foreach(println) 19 | println(result.key) 20 | result.news must not be empty 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/HaosouNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Created by yangjing on 15-11-9. 
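 * Integration test: runs a live Haosou news search via HttpClient, so network access is required.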
12 | */ 13 | class HaosouNewsTest extends ServiceSpec { 14 | 15 | implicit val timeout = Timeout(30.seconds) 16 | 17 | "HaosouCrawlerTest" should { 18 | 19 | "fetchNewsList" in { 20 | val haosou = new HaosouNews(HttpClient()) 21 | val result = Await.result(haosou.fetchItemList("誉存科技"), timeout.duration) 22 | result.news.foreach(println) 23 | println(result.source + " " + result.key) 24 | result.news must not be empty 25 | } 26 | 27 | } 28 | 29 | override implicit def patienceConfig: PatienceConfig = super.patienceConfig 30 | } 31 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/WechatNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Wechat News Test 12 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10. 13 | */ 14 | class WechatNewsTest extends ServiceSpec { 15 | 16 | implicit val timeout = Timeout(30.seconds) 17 | "WechatNewsTest" should { 18 | 19 | "fetchNewsList" in { 20 | val wechat = new WechatNews(HttpClient()) 21 | val f = wechat.fetchItemList("成都念念科技有限公司") 22 | val result = Await.result(f, timeout.duration) 23 | result.news.foreach(println) 24 | println(result.count + " " + result.key) 25 | result.news must not be empty 26 | } 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/service/NewsDBRepoTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.testsuite.ServiceSpec 7 | import crawler.util.time.TimeUtils 8 | 9 | class NewsDBRepoTest extends ServiceSpec { 10 | 11 | "NewsDBRepoTest" should { 12 | val dbRepo = new NewsDBRepo 13 | 14 | "findNews" in { 15 | val result = dbRepo.findNews("阿里巴巴", Seq(ItemSource.baidu), SearchMethod.F, Some(TimeUtils.nowBegin())) 16 | val list = result.futureValue 17 | println(list) 18 | list must not be empty 19 | 20 | TimeUnit.SECONDS.sleep(5) 21 | } 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/service/actors/NewsJobMasterTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import akka.pattern.ask 6 | import akka.util.Timeout 7 | import crawler.SystemUtils 8 | import crawler.module.news.commands.{SearchNews, RequestSearchNews} 9 | import crawler.module.news.crawlers.{BaiduNews, NewsCrawler} 10 | import crawler.module.news.enums.{SearchMethod, ItemSource} 11 | import crawler.module.news.model.SearchResult 12 | import crawler.module.news.service.NewsMaster 13 | import crawler.testsuite.ServiceSpec 14 | 15 | import scala.concurrent.duration._ 16 | 17 | /** 18 | * NewsMasterTest 19 | * Created by yangjing on 15-11-5. 
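 * Sends RequestSearchNews(baidu) through a NewsMaster instance and expects the aggregated Seq[SearchResult] reply.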
20 | */ 21 | class NewsJobMasterTest extends ServiceSpec { 22 | 23 | implicit val timeout = Timeout(60.seconds) 24 | 25 | "NewsMasterTest" should { 26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient)) 27 | 28 | "news-master" in { 29 | val sources = Seq(ItemSource.baidu) 30 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName) 31 | val msg = RequestSearchNews(sources, SearchNews("杭州誉存科技有限公司", SearchMethod.F, 3.seconds)) 32 | 33 | val f = (newsMaster ? msg).mapTo[Seq[SearchResult]] 34 | 35 | f onSuccess { case list => 36 | list.foreach(println) 37 | list.size mustBe 1 38 | } 39 | 40 | f onFailure { case e => 41 | println("Failure: " + e) 42 | } 43 | 44 | TimeUnit.SECONDS.sleep(20) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/BaiduSite.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | import java.net.URLEncoder 4 | import java.time.LocalDateTime 5 | import java.util.concurrent.TimeUnit 6 | 7 | import com.typesafe.scalalogging.LazyLogging 8 | import crawler.module.site.model.{SearchRequest, SiteItem, SiteResult} 9 | import crawler.util.Crawler 10 | import crawler.util.http.HttpClient 11 | import crawler.util.time.TimeUtils 12 | import org.jsoup.Jsoup 13 | import org.jsoup.nodes.Element 14 | 15 | import scala.collection.JavaConverters._ 16 | import scala.concurrent.{ExecutionContext, Future, Promise} 17 | import scala.util.{Failure, Success} 18 | 19 | /** 20 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 21 | */ 22 | class BaiduSite(val httpClient: HttpClient, 23 | searchRequest: SearchRequest) extends Crawler with LazyLogging { 24 | 25 | import BaiduSite._ 26 | 27 | override protected val defaultHeaders: Array[Seq[(String, String)]] = 28 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider")) 29 | 30 | val values = searchRequest.params.map(_.value) 31 | 32 | /** 33 | * 抓取搜索页 34 | * 35 | * @return 36 | */ 37 | def fetchItemList()(implicit ec: ExecutionContext): Future[SiteResult] = { 38 | val promise = Promise[Seq[SiteItem]]() 39 | val key = searchRequest.toParam 40 | 41 | val url = BAIDU_SITE_BASE_URL.format(URLEncoder.encode(key, "UTF-8")) 42 | logger.info(s"key: $key, url: $url") 43 | 44 | val newsResultsFuture = fetchPage(url).flatMap { resp => 45 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BAIDU_SITE_HOST).getElementById("wrapper_wrapper") 46 | val now = TimeUtils.now() 47 | val contentNone = doc.select(".content_none") 48 | 49 | if (!contentNone.isEmpty) { 50 | promise.success(Nil) 51 | Future.successful(SiteResult(ITEM_SOURCE, key, now, 0, Nil)) 52 | } else { 53 | val wrapper = doc 54 | val countText = wrapper 55 | .select(".head_nums_cont_outer.OP_LOG") 56 | .select(".nums") 57 | .text() 58 | val count = 59 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt 60 | 61 | val itemDiv = doc.getElementById("content_left") 62 | val itemResults = itemDiv.select(".result.c-container").asScala 63 | 64 | val pages = doc.select("#page a").asScala 65 | val newsItemFutures = pages.take(PAGE_LIMIT - 1).map { page => 66 | TimeUnit.MILLISECONDS.sleep(100) 67 | fetchPageLinks(BAIDU_SITE_HOST + page.attr("href")) 68 | } 69 | 70 | Future.sequence(newsItemFutures).map(_.flatten).onComplete { 71 | case Success(list) => 72 | promise.success(list) 73 | case Failure(e) => 74 | 
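              // Fetching the follow-up result pages failed: log the error and complete the
              // promise with Nil so the first page's items are still returned.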
e.printStackTrace() 75 | promise.success(Nil) 76 | } 77 | 78 | Future.sequence(itemResults.map(parseSiteItem)) 79 | .map(items => SiteResult(ITEM_SOURCE, key, now, count, items)) 80 | } 81 | } 82 | 83 | for { 84 | newsResult <- newsResultsFuture 85 | newsItems <- promise.future 86 | } yield { 87 | newsResult.copy(items = newsResult.items ++ newsItems) 88 | } 89 | } 90 | 91 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[SiteItem]] = { 92 | fetchPage(url).flatMap { resp => 93 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduSite.BAIDU_SITE_HOST) 94 | if (doc.getElementById("content_none") != null) { 95 | Future.successful(Nil) 96 | } else { 97 | val itemDiv = doc.getElementById("content_left") 98 | val itemResults = itemDiv.select(".result.c-container").asScala 99 | val futures = itemResults.map(parseSiteItem) 100 | Future.sequence(futures) 101 | } 102 | } 103 | } 104 | 105 | def parseSiteItem(elem: Element)(implicit ec: ExecutionContext): Future[SiteItem] = { 106 | val link = elem.select(".t").select("a").first() 107 | val href = link.attr("href") 108 | 109 | extractPageUrl(href).map { url => 110 | val title = link.text() 111 | 112 | val sourceHostDesc = elem.select(".f13 a").first().text() 113 | val source = sourceHostDesc.take(sourceHostDesc.indexOf('/')) 114 | 115 | val abstractElem = elem.select(".c-abstract") 116 | val summary = abstractElem.asScala.filterNot(e => e.attr("class").contains("newTimeFactor_before_abs")).map(_.text()).mkString 117 | val time = BaiduSite.dealTime(abstractElem.select(".newTimeFactor_before_abs").text()) 118 | 119 | SiteItem(title, url, source, time, summary, values) 120 | } 121 | } 122 | 123 | def extractPageUrl(href: String): Future[String] = { 124 | implicit val ec = ExecutionContext.Implicits.global 125 | 126 | if (searchRequest.followUrl) { 127 | HttpClient.find302Location(httpClient, href, requestHeaders()).map(v => if (v == null) href else v) 128 | } else { 129 | Future.successful(href) 130 | } 131 | } 132 | 133 | } 134 | 135 | object BaiduSite { 136 | // 抓取前5页 137 | val PAGE_LIMIT = 5 138 | 139 | val BAIDU_SITE_BASE_URL = "https://www.baidu.com/s?wd=%s&rsv_spt=1&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_n=2&rsv_sug3=1" 140 | 141 | val BAIDU_SITE_HOST = "https://www.baidu.com" 142 | 143 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r 144 | 145 | val ITEM_SOURCE = "baiduSite" 146 | 147 | def dealTime(timeStr: String): Option[LocalDateTime] = timeStr.substring(0, timeStr.indexOf('日') + 1) match { 148 | case TIME_PATTERN(year, month, day) => Some(LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0)) 149 | case _ => None 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/QueryCond.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | /** 4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 5 | */ 6 | object QueryCond extends Enumeration { 7 | val - = Value("-") 8 | val + = Value("+") 9 | } 10 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/SearchSyntax.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | /** 4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-20. 
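 * Search-operator names (intitle / insite / inurl) used by SearchParam when building a query string.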
5 | */ 6 | object SearchSyntax { 7 | final val Intitle = "intitle" 8 | final val Insite = "insite" 9 | final val Inurl = "inurl" 10 | } 11 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SearchRequest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import crawler.module.site.QueryCond 4 | 5 | /** 6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 7 | */ 8 | case class SearchRequest(params: Seq[SearchParam], followUrl: Boolean = true) { 9 | 10 | def toParam = params.map(_.toParam).mkString(" ") 11 | 12 | } 13 | 14 | case class SearchParam(value: String, 15 | syntax: Option[String] = None, 16 | cond: Option[QueryCond.Value] = None, 17 | filetypeDoc: Seq[String] = Nil, 18 | strict: Boolean = true) { 19 | 20 | def toParam = 21 | syntax.map(v => if (strict) s"""$v:"$value"""" else s"$v:$value") orElse 22 | cond.map(v => v + value) getOrElse 23 | value 24 | 25 | } 26 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SiteItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import org.json4s.Extraction 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 9 | */ 10 | case class SiteItem(title: String, 11 | url: String, 12 | // 新闻来源(站点) 13 | source: String, 14 | time: Option[LocalDateTime], 15 | // 摘要 16 | `abstract`: String, 17 | values: Seq[String] = Nil) { 18 | 19 | def jsonPretty = { 20 | import crawler.util.http.TJsonSupport._ 21 | val jv = Extraction.decompose(this) 22 | serialization.writePretty(jv) 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SiteResult.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | /** 6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 7 | */ 8 | case class SiteResult(source: String, 9 | key: String, 10 | time: LocalDateTime, 11 | count: Int, 12 | items: Seq[SiteItem], 13 | error: Option[String] = None) 14 | -------------------------------------------------------------------------------- /module-site-search/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /module-site-search/src/test/scala/crawler/module/site/BaiduSiteTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | import akka.util.Timeout 4 | import crawler.module.site.model.{SearchParam, SearchRequest} 5 | import crawler.testsuite.ServiceSpec 6 | import crawler.util.http.HttpClient 7 | 8 | import scala.concurrent.Await 9 | import scala.concurrent.duration._ 10 | 11 | /** 12 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 
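 * Live test: builds a SearchRequest with an intitle parameter and fetches result pages from Baidu.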
13 | */ 14 | class BaiduSiteTest extends ServiceSpec { 15 | 16 | implicit val timeout = Timeout(30.seconds) 17 | 18 | "BaiduSiteTest" should { 19 | 20 | "fetchItemList" in { 21 | val requestParams = SearchRequest( 22 | SearchParam("晋渝地产", Some(SearchSyntax.Intitle)) :: 23 | // SearchParam("阿里巴巴kakakakaak", Some(SearchSyntax.Intitle)) :: 24 | // SearchParam("失信", syntax = Some(SearchSyntax.Intitle), strict = false) :: 25 | Nil 26 | ) 27 | val baidu = new BaiduSite(HttpClient(), requestParams) 28 | 29 | val key = requestParams.toParam 30 | val f = baidu.fetchItemList() 31 | val result = Await.result(f, timeout.duration) 32 | result.items.foreach(v => println(v.jsonPretty)) 33 | println(result.items.size) 34 | result.items must not be empty 35 | } 36 | 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import _root_.sbt.Keys._ 2 | import _root_.sbt._ 3 | import com.typesafe.sbt.SbtNativePackager.{Linux, Debian} 4 | import com.typesafe.sbt.packager.Keys._ 5 | import com.typesafe.sbt.packager.archetypes.JavaServerAppPackaging 6 | import com.typesafe.sbt.packager.universal.UniversalPlugin.autoImport._ 7 | import sbtassembly.AssemblyKeys._ 8 | import sbtassembly.{MergeStrategy, PathList} 9 | 10 | object Build extends Build { 11 | 12 | import BuildSettings._ 13 | 14 | val DependsConfigure = "test->test;compile->compile" 15 | 16 | override lazy val settings = super.settings :+ { 17 | shellPrompt := (s => Project.extract(s).currentProject.id + " > ") 18 | } 19 | 20 | lazy val root = Project("crawler-high-search", file(".")) 21 | .aggregate( 22 | appApi, 23 | crawlerSiteSearch, 24 | moduleSiteSearch, moduleNews, 25 | util) 26 | 27 | /////////////////////////////////////////////////////////////// 28 | // projects 29 | /////////////////////////////////////////////////////////////// 30 | lazy val packageDebianProd = taskKey[File]("creates deb-prod package") 31 | lazy val appApi = Project("app-api", file("app-api")) 32 | .enablePlugins(JavaServerAppPackaging) 33 | .dependsOn(moduleSiteSearch % DependsConfigure, moduleNews % DependsConfigure, util % DependsConfigure) 34 | .settings(basicSettings: _*) 35 | .settings( 36 | description := "app-api", 37 | 38 | packageDescription := "一个高级异步多线程实时爬虫API", 39 | mainClass in Compile := Some("crawler.app.Main"), 40 | maintainer in Linux := "Jing Yang ", 41 | packageSummary in Linux := "Crawler High Search API", 42 | daemonUser in Linux := "nobody", 43 | bashScriptConfigLocation := Some("${app_home}/../conf/jvmopts"), 44 | bashScriptExtraDefines += """addJava "-Dlogback.configurationFile=${app_home}/../conf/logback.xml"""", 45 | 46 | // |; bashScriptExtraDefines := Seq("addJava \"-Dconfig.file=${app_home}/../conf/application.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\"") 47 | addCommandAlias("packageProd", 48 | """; clean 49 | |; bashScriptExtraDefines += "addJava \"-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\"" 50 | |; packageDebianProd 51 | """.stripMargin), 52 | packageDebianProd := { 53 | bashScriptExtraDefines += """addJava "-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml"""" 54 | val output = baseDirectory.value / "package" / "deb-prod.deb" 55 | val debianFile = (packageBin in Debian).value 56 | IO.move(debianFile, output) 57 | output 58 | }, 
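    // packageDebianProd (above) packages the app with sbt-native-packager and moves the
    // resulting .deb to <baseDirectory>/package/deb-prod.deb; the commented block below is
    // the earlier sbt-assembly fat-jar configuration.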
59 | 60 | // assemblyJarName in assembly := "crawler-app.jar", 61 | // mappings in Universal <<= (mappings in Universal, assembly in Compile) map { (mappings, fatJar) => 62 | // val filtered = mappings filter { case (file, name) => !name.endsWith(".jar") } 63 | // filtered :+ (fatJar -> ("lib/" + fatJar.getName)) 64 | // }, 65 | // test in assembly := {}, 66 | // assemblyMergeStrategy in assembly := { 67 | // case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.discard 68 | // case x => 69 | // val oldStrategy = (assemblyMergeStrategy in assembly).value 70 | // oldStrategy(x) 71 | // }, 72 | 73 | libraryDependencies ++= Seq( 74 | _akkaHttp) 75 | ) 76 | 77 | lazy val crawlerSiteSearch = Project("crawler-site-search", file("crawler-site-search")) 78 | .dependsOn(moduleSiteSearch % DependsConfigure, util % DependsConfigure) 79 | .settings(basicSettings: _*) 80 | .settings( 81 | description := "crawler-site-search", 82 | libraryDependencies ++= Seq( 83 | _activemqSTOMP, 84 | _cassandraDriverCore, 85 | _mongoScala) 86 | ) 87 | 88 | lazy val moduleSiteSearch = Project("module-site-search", file("module-site-search")) 89 | .dependsOn(util % DependsConfigure) 90 | .settings(basicSettings: _*) 91 | .settings( 92 | description := "module-site-search" 93 | ) 94 | 95 | lazy val moduleNews = Project("module-news", file("module-news")) 96 | .dependsOn(util % DependsConfigure) 97 | .settings(basicSettings: _*) 98 | .settings( 99 | description := "module-news", 100 | libraryDependencies ++= Seq( 101 | _cassandraDriverCore, 102 | _akkaActor) 103 | ) 104 | 105 | lazy val util = Project("util", file("util")) 106 | .settings(basicSettings: _*) 107 | .settings( 108 | description := "util", 109 | libraryDependencies ++= Seq( 110 | _activemqSTOMP % "provided", 111 | _cassandraDriverCore % "provided", 112 | _mongoScala % "provided", 113 | _akkaHttp % "provided", 114 | _akkaStream, 115 | _json4sJackson, 116 | _json4sExt, 117 | _scalaLogging, 118 | _asyncHttpClient, 119 | _jsoup, 120 | _akkaActor, 121 | _akkaSlf4j, 122 | _logbackClassic) 123 | ) 124 | 125 | } 126 | -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | object BuildSettings { 5 | 6 | lazy val basicSettings = Seq( 7 | version := "0.0.1", 8 | homepage := Some(new URL("https://github.com/yangbajing/crawler-service")), 9 | organization := "cn.socialcredits.crawler", 10 | organizationHomepage := Some(new URL("https://github.com/yangbajing/crawler-service")), 11 | startYear := Some(2015), 12 | scalaVersion := "2.11.7", 13 | scalacOptions := Seq( 14 | "-encoding", "utf8", 15 | "-unchecked", 16 | "-feature", 17 | "-deprecation" 18 | ), 19 | javacOptions := Seq( 20 | "-encoding", "utf8", 21 | "-Xlint:unchecked", 22 | "-Xlint:deprecation" 23 | ), 24 | resolvers ++= Seq( 25 | "snapshots" at "http://oss.sonatype.org/content/repositories/snapshots", 26 | "releases" at "http://oss.sonatype.org/content/repositories/releases", 27 | "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/", 28 | "Typesafe Snapshots" at "http://repo.typesafe.com/typesafe/snapshots/"), 29 | libraryDependencies ++= Seq( 30 | _scalaReflect, 31 | _scalatest 32 | ), 33 | offline := true, 34 | fork := true 35 | ) 36 | 37 | lazy val noPublishing = Seq( 38 | publish :=(), 39 | publishLocal :=() 40 | ) 41 | 42 | val verAkka = "2.3.14" 43 | val _akkaActor = "com.typesafe.akka" 
%% "akka-actor" % verAkka 44 | val _akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % verAkka 45 | 46 | lazy val _scalaReflect = "org.scala-lang" % "scala-reflect" % "2.11.7" 47 | 48 | val verAkkaHttp = "2.0.2" 49 | lazy val _akkaStream = ("com.typesafe.akka" %% "akka-stream-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor") 50 | lazy val _akkaHttp = ("com.typesafe.akka" %% "akka-http-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor") 51 | 52 | lazy val _scalatest = "org.scalatest" %% "scalatest" % "2.2.5" % "test" 53 | 54 | lazy val _scalaLogging = ("com.typesafe.scala-logging" %% "scala-logging" % "3.1.0").exclude("org.scala-lang", "scala-reflect").exclude("org.slf4j", "slf4j-api") 55 | 56 | lazy val _mongoScala = ("org.mongodb.scala" %% "mongo-scala-driver" % "1.1.0").exclude("com.typesafe.akka", "akka-actor") 57 | 58 | lazy val varJson4s = "3.3.0" 59 | lazy val _json4sJackson = "org.json4s" %% "json4s-jackson" % varJson4s 60 | lazy val _json4sExt = "org.json4s" %% "json4s-ext" % varJson4s 61 | 62 | lazy val _jsoup = "org.jsoup" % "jsoup" % "1.8.3" 63 | 64 | lazy val _asyncHttpClient = ("com.ning" % "async-http-client" % "1.9.31").exclude("io.netty", "netty") 65 | 66 | lazy val _logbackClassic = "ch.qos.logback" % "logback-classic" % "1.1.3" 67 | 68 | lazy val _cassandraDriverCore = "com.datastax.cassandra" % "cassandra-driver-core" % "2.2.0-rc3" 69 | 70 | lazy val _activemqSTOMP = "org.apache.activemq" % "activemq-stomp" % "5.13.0" 71 | 72 | } 73 | 74 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.9 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | 3 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.7.5") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.0") 6 | 7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.0.6") 8 | -------------------------------------------------------------------------------- /project/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangjing/crawler-service/462c198f0ea22cc673d59a2e725628712f96f79b/project/sbt-launch.jar -------------------------------------------------------------------------------- /sbt: -------------------------------------------------------------------------------- 1 | SCRIPT_DIR=`dirname $0` 2 | java -Xmx1024M -Dsbt.override.build.repos=true -Dfile.encoding=UTF-8 -XX:+CMSClassUnloadingEnabled -jar "$SCRIPT_DIR/project/sbt-launch.jar" $@ 3 | -------------------------------------------------------------------------------- /util/src/main/java/crawler/util/news/contextextractor/ContentExtractor.java: -------------------------------------------------------------------------------- 1 | package crawler.util.news.contextextractor; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.nodes.Node; 7 | import org.jsoup.nodes.TextNode; 8 | import org.jsoup.select.Elements; 9 | import org.jsoup.select.NodeVisitor; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import java.util.ArrayList; 14 | import java.util.HashMap; 15 | import 
java.util.Map; 16 | import java.util.concurrent.atomic.AtomicInteger; 17 | import java.util.regex.Matcher; 18 | import java.util.regex.Pattern; 19 | 20 | /** 21 | * Created by yangjing on 15-11-3. 22 | */ 23 | public class ContentExtractor { 24 | 25 | public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class); 26 | 27 | protected Document doc; 28 | 29 | ContentExtractor(Document doc) { 30 | this.doc = doc; 31 | } 32 | 33 | protected HashMap infoMap = new HashMap(); 34 | 35 | class CountInfo { 36 | 37 | int textCount = 0; 38 | int linkTextCount = 0; 39 | int tagCount = 0; 40 | int linkTagCount = 0; 41 | double density = 0; 42 | double densitySum = 0; 43 | double score = 0; 44 | int pCount = 0; 45 | ArrayList leafList = new ArrayList(); 46 | 47 | } 48 | 49 | protected void clean() { 50 | doc.select("script,noscript,style,iframe,br").remove(); 51 | } 52 | 53 | protected CountInfo computeInfo(Node node) { 54 | 55 | if (node instanceof Element) { 56 | Element tag = (Element) node; 57 | 58 | CountInfo countInfo = new CountInfo(); 59 | for (Node childNode : tag.childNodes()) { 60 | CountInfo childCountInfo = computeInfo(childNode); 61 | countInfo.textCount += childCountInfo.textCount; 62 | countInfo.linkTextCount += childCountInfo.linkTextCount; 63 | countInfo.tagCount += childCountInfo.tagCount; 64 | countInfo.linkTagCount += childCountInfo.linkTagCount; 65 | countInfo.leafList.addAll(childCountInfo.leafList); 66 | countInfo.densitySum += childCountInfo.density; 67 | countInfo.pCount += childCountInfo.pCount; 68 | } 69 | countInfo.tagCount++; 70 | String tagName = tag.tagName(); 71 | if (tagName.equals("a")) { 72 | countInfo.linkTextCount = countInfo.textCount; 73 | countInfo.linkTagCount++; 74 | } else if (tagName.equals("p")) { 75 | countInfo.pCount++; 76 | } 77 | 78 | int pureLen = countInfo.textCount - countInfo.linkTextCount; 79 | int len = countInfo.tagCount - countInfo.linkTagCount; 80 | if (pureLen == 0 || len == 0) { 81 | countInfo.density = 0; 82 | } else { 83 | countInfo.density = (pureLen + 0.0) / len; 84 | } 85 | 86 | infoMap.put(tag, countInfo); 87 | 88 | return countInfo; 89 | } else if (node instanceof TextNode) { 90 | TextNode tn = (TextNode) node; 91 | CountInfo countInfo = new CountInfo(); 92 | String text = tn.text(); 93 | int len = text.length(); 94 | countInfo.textCount = len; 95 | countInfo.leafList.add(len); 96 | return countInfo; 97 | } else { 98 | return new CountInfo(); 99 | } 100 | } 101 | 102 | protected double computeScore(Element tag) { 103 | CountInfo countInfo = infoMap.get(tag); 104 | double var = Math.sqrt(computeVar(countInfo.leafList) + 1); 105 | double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2); 106 | return score; 107 | } 108 | 109 | protected double computeVar(ArrayList data) { 110 | if (data.size() == 0) { 111 | return 0; 112 | } 113 | if (data.size() == 1) { 114 | return data.get(0) / 2; 115 | } 116 | double sum = 0; 117 | for (Integer i : data) { 118 | sum += i; 119 | } 120 | double ave = sum / data.size(); 121 | sum = 0; 122 | for (Integer i : data) { 123 | sum += (i - ave) * (i - ave); 124 | } 125 | sum = sum / data.size(); 126 | return sum; 127 | } 128 | 129 | public Element getContentElement() throws Exception { 130 | clean(); 131 | computeInfo(doc.body()); 132 | double maxScore = 0; 133 | Element content = null; 134 | for (Map.Entry entry : infoMap.entrySet()) { 135 | Element tag = entry.getKey(); 136 | if 
(tag.tagName().equals("a") || tag == doc.body()) { 137 | continue; 138 | } 139 | double score = computeScore(tag); 140 | if (score > maxScore) { 141 | maxScore = score; 142 | content = tag; 143 | } 144 | } 145 | if (content == null) { 146 | throw new Exception("extraction failed"); 147 | } 148 | return content; 149 | } 150 | 151 | public News getNews() throws Exception { 152 | News news = new News(); 153 | Element contentElement; 154 | try { 155 | contentElement = getContentElement(); 156 | news.setContentElement(contentElement); 157 | } catch (Exception ex) { 158 | // LOG.info("news content extraction failed,extraction abort", ex); 159 | throw new Exception(ex); 160 | } 161 | 162 | if (doc.baseUri() != null) { 163 | news.setUrl(doc.baseUri()); 164 | } 165 | 166 | // try { 167 | // news.setTime(getTime(contentElement)); 168 | // } catch (Exception ex) { 169 | //// LOG.info("news title extraction failed", ex); 170 | // } 171 | 172 | // try { 173 | // news.setTitle(getTitle(contentElement)); 174 | // } catch (Exception ex) { 175 | // LOG.info("title extraction failed", ex); 176 | // } 177 | return news; 178 | } 179 | 180 | protected String getTime(Element contentElement) throws Exception { 181 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"; 182 | Pattern pattern = Pattern.compile(regex); 183 | Element current = contentElement; 184 | for (int i = 0; i < 2; i++) { 185 | if (current != null && current != doc.body()) { 186 | Element parent = current.parent(); 187 | if (parent != null) { 188 | current = parent; 189 | } 190 | } 191 | } 192 | for (int i = 0; i < 6; i++) { 193 | if (current == null) { 194 | break; 195 | } 196 | String currentHtml = current.outerHtml(); 197 | Matcher matcher = pattern.matcher(currentHtml); 198 | if (matcher.find()) { 199 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6); 200 | } 201 | if (current != doc.body()) { 202 | current = current.parent(); 203 | } 204 | } 205 | 206 | try { 207 | return getDate(contentElement); 208 | } catch (Exception ex) { 209 | ex.printStackTrace(); 210 | // throw new Exception("time not found"); 211 | return ""; 212 | } 213 | 214 | } 215 | 216 | protected String getDate(Element contentElement) throws Exception { 217 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"; 218 | Pattern pattern = Pattern.compile(regex); 219 | Element current = contentElement; 220 | for (int i = 0; i < 2; i++) { 221 | if (current != null && current != doc.body()) { 222 | Element parent = current.parent(); 223 | if (parent != null) { 224 | current = parent; 225 | } 226 | } 227 | } 228 | for (int i = 0; i < 6; i++) { 229 | if (current == null) { 230 | break; 231 | } 232 | String currentHtml = current.outerHtml(); 233 | Matcher matcher = pattern.matcher(currentHtml); 234 | if (matcher.find()) { 235 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3); 236 | } 237 | if (current != doc.body()) { 238 | current = current.parent(); 239 | } 240 | } 241 | throw new Exception("date not found"); 242 | } 243 | 244 | protected double strSim(String a, String b) { 245 | int len1 = a.length(); 246 | int len2 = b.length(); 247 | if (len1 == 0 || len2 == 0) { 248 | return 0; 249 | } 250 | double ratio; 251 | if (len1 > len2) { 252 | ratio = (len1 + 0.0) / len2; 253 | } else { 254 | ratio = (len2 + 0.0) / len1; 255 | } 
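        // Strings whose lengths differ by a factor of 3 or more are treated as dissimilar;
        // otherwise similarity is the LCS length normalized by the longer string.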
256 | if (ratio >= 3) { 257 | return 0; 258 | } 259 | return (lcs(a, b) + 0.0) / Math.max(len1, len2); 260 | } 261 | 262 | protected String getTitle(final Element contentElement) throws Exception { 263 | final ArrayList titleList = new ArrayList(); 264 | final ArrayList titleSim = new ArrayList(); 265 | final AtomicInteger contentIndex = new AtomicInteger(); 266 | final String metaTitle = doc.title().trim(); 267 | if (!metaTitle.isEmpty()) { 268 | doc.body().traverse(new NodeVisitor() { 269 | @Override 270 | public void head(Node node, int i) { 271 | if (node instanceof Element) { 272 | Element tag = (Element) node; 273 | if (tag == contentElement) { 274 | contentIndex.set(titleList.size()); 275 | return; 276 | } 277 | String tagName = tag.tagName(); 278 | if (Pattern.matches("h[1-6]", tagName)) { 279 | String title = tag.text().trim(); 280 | double sim = strSim(title, metaTitle); 281 | titleSim.add(sim); 282 | titleList.add(tag); 283 | } 284 | } 285 | } 286 | 287 | @Override 288 | public void tail(Node node, int i) { 289 | } 290 | }); 291 | int index = contentIndex.get(); 292 | if (index > 0) { 293 | double maxScore = 0; 294 | int maxIndex = -1; 295 | for (int i = 0; i < index; i++) { 296 | double score = (i + 1) * titleSim.get(i); 297 | if (score > maxScore) { 298 | maxScore = score; 299 | maxIndex = i; 300 | } 301 | } 302 | if (maxIndex != -1) { 303 | return titleList.get(maxIndex).text(); 304 | } 305 | } 306 | } 307 | 308 | Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]"); 309 | if (titles.size() > 0) { 310 | String title = titles.first().text(); 311 | if (title.length() > 5 && title.length() < 40) { 312 | return titles.first().text(); 313 | } 314 | } 315 | try { 316 | return getTitleByEditDistance(contentElement); 317 | } catch (Exception ex) { 318 | throw new Exception("title not found"); 319 | } 320 | 321 | } 322 | 323 | protected String getTitleByEditDistance(Element contentElement) throws Exception { 324 | final String metaTitle = doc.title(); 325 | 326 | final ArrayList max = new ArrayList(); 327 | max.add(0.0); 328 | final StringBuilder sb = new StringBuilder(); 329 | doc.body().traverse(new NodeVisitor() { 330 | 331 | public void head(Node node, int i) { 332 | 333 | if (node instanceof TextNode) { 334 | TextNode tn = (TextNode) node; 335 | String text = tn.text().trim(); 336 | double sim = strSim(text, metaTitle); 337 | if (sim > 0) { 338 | if (sim > max.get(0)) { 339 | max.set(0, sim); 340 | sb.setLength(0); 341 | sb.append(text); 342 | } 343 | } 344 | 345 | } 346 | } 347 | 348 | public void tail(Node node, int i) { 349 | } 350 | }); 351 | if (sb.length() > 0) { 352 | return sb.toString(); 353 | } 354 | throw new Exception(); 355 | 356 | } 357 | 358 | protected int lcs(String x, String y) { 359 | 360 | int M = x.length(); 361 | int N = y.length(); 362 | if (M == 0 || N == 0) { 363 | return 0; 364 | } 365 | int[][] opt = new int[M + 1][N + 1]; 366 | 367 | for (int i = M - 1; i >= 0; i--) { 368 | for (int j = N - 1; j >= 0; j--) { 369 | if (x.charAt(i) == y.charAt(j)) { 370 | opt[i][j] = opt[i + 1][j + 1] + 1; 371 | } else { 372 | opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]); 373 | } 374 | } 375 | } 376 | 377 | return opt[0][0]; 378 | 379 | } 380 | 381 | protected int editDistance(String word1, String word2) { 382 | int len1 = word1.length(); 383 | int len2 = word2.length(); 384 | 385 | int[][] dp = new int[len1 + 1][len2 + 1]; 386 | 387 | for (int i = 0; i <= len1; i++) { 388 | dp[i][0] = i; 389 | } 390 | 391 | for (int j = 
0; j <= len2; j++) { 392 | dp[0][j] = j; 393 | } 394 | 395 | for (int i = 0; i < len1; i++) { 396 | char c1 = word1.charAt(i); 397 | for (int j = 0; j < len2; j++) { 398 | char c2 = word2.charAt(j); 399 | 400 | if (c1 == c2) { 401 | dp[i + 1][j + 1] = dp[i][j]; 402 | } else { 403 | int replace = dp[i][j] + 1; 404 | int insert = dp[i][j + 1] + 1; 405 | int delete = dp[i + 1][j] + 1; 406 | 407 | int min = replace > insert ? insert : replace; 408 | min = delete > min ? min : delete; 409 | dp[i + 1][j + 1] = min; 410 | } 411 | } 412 | } 413 | 414 | return dp[len1][len2]; 415 | } 416 | 417 | /*输入Jsoup的Document,获取正文所在Element*/ 418 | public static Element getContentElementByDoc(Document doc) throws Exception { 419 | ContentExtractor ce = new ContentExtractor(doc); 420 | return ce.getContentElement(); 421 | } 422 | 423 | /*输入HTML,获取正文所在Element*/ 424 | public static Element getContentElementByHtml(String html) throws Exception { 425 | Document doc = Jsoup.parse(html); 426 | return getContentElementByDoc(doc); 427 | } 428 | 429 | /*输入HTML和URL,获取正文所在Element*/ 430 | public static Element getContentElementByHtml(String html, String url) throws Exception { 431 | Document doc = Jsoup.parse(html, url); 432 | return getContentElementByDoc(doc); 433 | } 434 | 435 | /*输入URL,获取正文所在Element*/ 436 | // public static Element getContentElementByUrl(String url) throws Exception { 437 | // HttpRequest request = new HttpRequest(url); 438 | // String html = request.getResponse().getHtmlByCharsetDetect(); 439 | // return getContentElementByHtml(html, url); 440 | // } 441 | 442 | /*输入Jsoup的Document,获取正文文本*/ 443 | public static String getContentByDoc(Document doc) throws Exception { 444 | ContentExtractor ce = new ContentExtractor(doc); 445 | return ce.getContentElement().text(); 446 | } 447 | 448 | /*输入HTML,获取正文文本*/ 449 | public static String getContentByHtml(String html) throws Exception { 450 | Document doc = Jsoup.parse(html); 451 | return getContentElementByDoc(doc).text(); 452 | } 453 | 454 | /*输入HTML和URL,获取正文文本*/ 455 | public static String getContentByHtml(String html, String url) throws Exception { 456 | Document doc = Jsoup.parse(html, url); 457 | return getContentElementByDoc(doc).text(); 458 | } 459 | 460 | /*输入URL,获取正文文本*/ 461 | // public static String getContentByUrl(String url) throws Exception { 462 | // HttpRequest request = new HttpRequest(url); 463 | // String html = request.getResponse().getHtmlByCharsetDetect(); 464 | // return getContentByHtml(html, url); 465 | // } 466 | 467 | /*输入Jsoup的Document,获取结构化新闻信息*/ 468 | public static News getNewsByDoc(Document doc) throws Exception { 469 | ContentExtractor ce = new ContentExtractor(doc); 470 | return ce.getNews(); 471 | } 472 | 473 | /*输入HTML,获取结构化新闻信息*/ 474 | public static News getNewsByHtml(String html) throws Exception { 475 | Document doc = Jsoup.parse(html); 476 | return getNewsByDoc(doc); 477 | } 478 | 479 | /*输入HTML和URL,获取结构化新闻信息*/ 480 | public static News getNewsByHtml(String html, String url) throws Exception { 481 | Document doc = Jsoup.parse(html, url); 482 | return getNewsByDoc(doc); 483 | } 484 | 485 | /*输入URL,获取结构化新闻信息*/ 486 | // public static News getNewsByUrl(String url) throws Exception { 487 | // HttpRequest request = new HttpRequest(url); 488 | // String html = request.getResponse().getHtmlByCharsetDetect(); 489 | // return getNewsByHtml(html, url); 490 | // } 491 | 492 | public static void main(String[] args) throws Exception { 493 | 494 | // News news = ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html"); 495 | 
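        // Offline usage sketch (hypothetical HTML input; getNewsByHtml is defined above):
        //   News n = ContentExtractor.getNewsByHtml("<html>...</html>", "http://example.com/a");
        //   System.out.println(n.getContent());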
// System.out.println(news.getUrl()); 496 | // System.out.println(news.getTitle()); 497 | // System.out.println(news.getTime()); 498 | // System.out.println(news.getContent()); 499 | //System.out.println(news.getContentElement()); 500 | 501 | //System.out.println(news); 502 | } 503 | 504 | } -------------------------------------------------------------------------------- /util/src/main/java/crawler/util/news/contextextractor/News.java: -------------------------------------------------------------------------------- 1 | package crawler.util.news.contextextractor; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * Created by yangjing on 15-11-3. 7 | */ 8 | public class News { 9 | 10 | protected String url = null; 11 | // protected String title = null; 12 | protected String content = null; 13 | // protected String time = null; 14 | 15 | protected Element contentElement = null; 16 | 17 | public String getUrl() { 18 | return url; 19 | } 20 | 21 | public void setUrl(String url) { 22 | this.url = url; 23 | } 24 | 25 | // public String getTitle() { 26 | // return title; 27 | // } 28 | // 29 | // public void setTitle(String title) { 30 | // this.title = title; 31 | // } 32 | 33 | public String getContent() { 34 | if (content == null) { 35 | if (contentElement != null) { 36 | content = contentElement.text(); 37 | } 38 | } 39 | return content; 40 | } 41 | 42 | 43 | public void setContent(String content) { 44 | this.content = content; 45 | } 46 | 47 | // public String getTime() { 48 | // return time; 49 | // } 50 | // 51 | // public void setTime(String time) { 52 | // this.time = time; 53 | // } 54 | 55 | @Override 56 | public String toString() { 57 | return "URL:\n" + url + /*"\nTITLE:\n" + title + "\nTIME:\n" + time +*/ "\nCONTENT:\n" + getContent() + "\nCONTENT(SOURCE):\n" + contentElement; 58 | } 59 | 60 | public Element getContentElement() { 61 | return contentElement; 62 | } 63 | 64 | public void setContentElement(Element contentElement) { 65 | this.contentElement = contentElement; 66 | } 67 | 68 | 69 | } -------------------------------------------------------------------------------- /util/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/SystemUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler 2 | 3 | import java.util.concurrent.TimeoutException 4 | 5 | import akka.actor.ActorSystem 6 | import akka.stream.ActorMaterializer 7 | import com.ning.http.client.AsyncHttpClientConfig 8 | import com.typesafe.config.ConfigFactory 9 | import com.typesafe.scalalogging.StrictLogging 10 | import crawler.util.http.HttpClient 11 | 12 | import scala.concurrent.duration._ 13 | 14 | /** 15 | * System Utils 16 | * Created by yangjing on 15-11-5. 
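 * Holds the shared ActorSystem, ActorMaterializer and a tuned AsyncHttpClient, plus a
 * shutdown() helper that closes the HTTP client and terminates the actor system.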
17 | */ 18 | object SystemUtils extends StrictLogging { 19 | val crawlerConfig = ConfigFactory.load().getConfig("crawler") 20 | 21 | implicit val system = ActorSystem(crawlerConfig.getString("akka-system-name")) 22 | implicit val materializer = ActorMaterializer() 23 | 24 | val httpClient = { 25 | crawlerConfig.getConfig("http-client") 26 | val builder = new AsyncHttpClientConfig.Builder() 27 | builder.setMaxConnections(8192) 28 | builder.setMaxConnectionsPerHost(4) 29 | builder.setConnectTimeout(10 * 1000) 30 | builder.setPooledConnectionIdleTimeout(40 * 1000) 31 | builder.setRequestTimeout(90 * 1000) 32 | builder.setAllowPoolingConnections(true) 33 | builder.setFollowRedirect(true) 34 | HttpClient(builder.build(), Nil) 35 | } 36 | 37 | def shutdown(): Unit = { 38 | httpClient.close() 39 | system.shutdown() 40 | try { 41 | system.awaitTermination(5.seconds) 42 | System.exit(0) 43 | } catch { 44 | case e: TimeoutException => 45 | logger.error(e.getLocalizedMessage, e) 46 | System.exit(3) 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/Crawler.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import crawler.util.http.HttpClient 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 9 | */ 10 | trait Crawler { 11 | val httpClient: HttpClient 12 | 13 | protected def defaultHeaders = Array( 14 | Seq( 15 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36", 16 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 17 | "Accept-Encoding" -> "gzip, deflate, sdch", 18 | "Accept-Language" -> "zh-CN,zh;q=0.8,en;q=0.6", 19 | "Connection" -> "keep-alive" 20 | ), 21 | Seq( 22 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7", 23 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 24 | ), 25 | Seq( 26 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0", 27 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 28 | "Accept-Encoding" -> "gzip, deflate", 29 | "Accept-Language" -> "en-US,en;q=0.5", 30 | "Connection" -> "keep-alive" 31 | ) 32 | ) 33 | 34 | def requestHeaders() = defaultHeaders(Random.nextInt(defaultHeaders.length)) 35 | 36 | def fetchPage(url: String) = { 37 | httpClient.get(url).setFollowRedirects(true).header(requestHeaders(): _*).execute() 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/JsoupImplicits.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import org.jsoup.nodes.Element 4 | import org.jsoup.select.Elements 5 | 6 | /** 7 | * Jsoup 相关辅助方法 8 | * Created by yangjing on 15-11-3. 
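 * A small usage sketch (the HTML snippet and class names are illustrative placeholders):
 * {{{
 *   import crawler.util.JsoupImplicits._
 *   import org.jsoup.Jsoup
 *
 *   val doc    = Jsoup.parse("""<div class="news"><a class="title">t1</a><a class="title">t2</a></div>""")
 *   val titles = doc.findByClass("news").findByClass("title")  // chained lookups over Element and Elements
 * }}}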
9 | */ 10 | object JsoupImplicits { 11 | 12 | implicit class JsoupElementFindByClassname(element: Element) { 13 | def findByClass(cn: String): Elements = { 14 | element.getElementsByClass(cn) 15 | } 16 | } 17 | 18 | implicit class JsoupElementsFindByClassname(elements: Elements) { 19 | def findByClass(cn: String): Elements = { 20 | val list = new java.util.LinkedList[Element]() 21 | val iter = elements.iterator() 22 | while (iter.hasNext) { 23 | val elements = iter.next().getElementsByClass(cn) 24 | list.addAll(elements) 25 | } 26 | new Elements(list) 27 | } 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import java.lang.management.ManagementFactory 4 | import java.nio.charset.Charset 5 | 6 | import crawler.util.time.TimeUtils 7 | 8 | /** 9 | * Utils 10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03. 11 | */ 12 | object Utils { 13 | val CHARSET = Charset.forName("UTF-8") 14 | 15 | def getPid = { 16 | val runtime = ManagementFactory.getRuntimeMXBean 17 | runtime.getName.split('@')(0) 18 | } 19 | 20 | def lastYearPeriods(): Seq[Int] = { 21 | val now = TimeUtils.now() 22 | val (curMonth, curYear, preYear) = (now.getMonthValue, now.getYear * 100, now.getYear * 100 - 100) 23 | (curMonth + 1 to 12).map(preYear + _) ++ (1 to curMonth).map(curYear + _) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/actors/MetricActor.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.actors 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | 5 | import akka.actor.Actor 6 | import com.typesafe.scalalogging.LazyLogging 7 | 8 | /** 9 | * Metric Actor 10 | * Created by yangjing on 15-11-4. 
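 * A minimal sketch of a concrete subclass (EchoActor is a hypothetical example, not part of this module):
 * {{{
 *   class EchoActor extends MetricActor {
 *     override val metricReceive: Receive = {
 *       case msg: String => sender() ! msg
 *     }
 *   }
 * }}}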
11 | */ 12 | trait MetricActor extends Actor with LazyLogging { 13 | final override def preStart(): Unit = { 14 | logger.trace(s"${self.path} preStart") 15 | MetricActor.incrementActorSize() 16 | metricPreStart() 17 | } 18 | 19 | final override def postStop(): Unit = { 20 | metricPostStop() 21 | MetricActor.decrementActorSize() 22 | logger.trace(s"${self.path} postStop") 23 | } 24 | 25 | final override def receive: Receive = { 26 | case s => 27 | if (metricReceive.isDefinedAt(s)) { 28 | logger.trace(s"${self.path} receive message: $s") 29 | metricReceive(s) 30 | } else { 31 | logger.warn(s"${self.path} receive message: $s") 32 | unhandled(s) 33 | } 34 | } 35 | 36 | def metricPreStart(): Unit = () 37 | 38 | def metricPostStop(): Unit = () 39 | 40 | val metricReceive: Receive 41 | 42 | } 43 | 44 | object MetricActor { 45 | private val _currentActiveActors = new AtomicInteger(0) 46 | 47 | def incrementActorSize() = _currentActiveActors.incrementAndGet() 48 | 49 | def decrementActorSize() = _currentActiveActors.decrementAndGet() 50 | 51 | def currentActorSize() = _currentActiveActors.get() 52 | } 53 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/http/HttpClient.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.http 2 | 3 | import com.ning.http.client._ 4 | import com.ning.http.client.cookie.Cookie 5 | import com.ning.http.client.multipart.Part 6 | import com.typesafe.config.Config 7 | 8 | import scala.concurrent.{ExecutionContext, Future, Promise} 9 | import scala.util.{Failure, Success} 10 | 11 | class HttpClientBuilder(builder: AsyncHttpClient#BoundRequestBuilder) { 12 | 13 | def queryParam(params: (String, String)*) = { 14 | params.foreach { case (name, value) => builder.addQueryParam(name, value) } 15 | this 16 | } 17 | 18 | def header(headers: (String, String)*) = { 19 | headers.foreach { case (name, value) => builder.addHeader(name, value) } 20 | this 21 | } 22 | 23 | def cookie(cookie: Cookie) = { 24 | builder.addCookie(cookie) 25 | this 26 | } 27 | 28 | def part(part: Part) = { 29 | builder.addBodyPart(part) 30 | this 31 | } 32 | 33 | def addFormParam(params: (String, String)*) = { 34 | params.foreach { case (key, value) => builder.addFormParam(key, value) } 35 | this 36 | } 37 | 38 | def setFollowRedirects(followRedirects: Boolean) = { 39 | builder.setFollowRedirects(followRedirects) 40 | this 41 | } 42 | 43 | def execute(): Future[Response] = { 44 | val promise = Promise[Response]() 45 | try { 46 | builder.execute(new AsyncCompletionHandler[Unit] { 47 | override def onCompleted(response: Response): Unit = { 48 | // println(response.getStatusCode + ": " + response.getStatusText) 49 | promise.success(response) 50 | } 51 | 52 | override def onThrowable(t: Throwable): Unit = { 53 | promise.failure(t) 54 | } 55 | }) 56 | } catch { 57 | case e: Throwable => 58 | promise.failure(e) 59 | } 60 | promise.future 61 | } 62 | 63 | } 64 | 65 | /** 66 | * HttpClient 67 | * Created by yangjing on 15-11-3. 
68 | */ 69 | class HttpClient private(config: AsyncHttpClientConfig, 70 | defaultHeaders: Iterable[(String, String)]) { 71 | 72 | private val client = new AsyncHttpClient(config) 73 | 74 | def close() = client.close() 75 | 76 | def get(url: String) = new HttpClientBuilder(client.prepareGet(url)) 77 | 78 | def post(url: String) = new HttpClientBuilder(client.preparePost(url)) 79 | 80 | def delete(url: String) = new HttpClientBuilder(client.prepareDelete(url)) 81 | 82 | def put(url: String) = new HttpClientBuilder(client.preparePut(url)) 83 | } 84 | 85 | object HttpClient { 86 | def apply(): HttpClient = apply(Nil) 87 | 88 | def apply(config: Config): HttpClient = { 89 | // TODO 解析config to AsyncHttpClientConfig 90 | 91 | apply(Nil) 92 | } 93 | 94 | def apply(defaultHeaders: Iterable[(String, String)]): HttpClient = 95 | apply(new AsyncHttpClientConfig.Builder().build, defaultHeaders) 96 | 97 | def apply(config: AsyncHttpClientConfig, defaultHeaders: Iterable[(String, String)]): HttpClient = 98 | new HttpClient(config, defaultHeaders) 99 | 100 | def apply(allowRedirect: Boolean): HttpClient = { 101 | val builder = new AsyncHttpClientConfig.Builder() 102 | builder.setFollowRedirect(false) 103 | apply(builder.build(), Nil) 104 | } 105 | 106 | def find302Location(client: HttpClient, url: String, headers: Seq[(String, String)])(implicit ec: ExecutionContext) = { 107 | val promise = Promise[String]() 108 | 109 | def findLocation() = client.get(url).header(headers: _*).setFollowRedirects(false).execute().map(_.getHeader("Location")) 110 | 111 | findLocation().onComplete { 112 | case Success(location) => promise.success(location) 113 | case Failure(e) => 114 | findLocation().onComplete { 115 | case Success(location) => promise.success(location) 116 | case Failure(t) => promise.failure(t) 117 | } 118 | } 119 | 120 | promise.future 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/http/TJsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.http 2 | 3 | import java.time.LocalDateTime 4 | 5 | import akka.http.scaladsl.marshalling._ 6 | import akka.http.scaladsl.model.{ContentType, ContentTypes, HttpCharsets, MediaTypes} 7 | import akka.http.scaladsl.unmarshalling._ 8 | import akka.stream.Materializer 9 | import crawler.util.time.TimeUtils 10 | import org.json4s._ 11 | import org.json4s.jackson.Serialization 12 | 13 | /** 14 | * Akka Http Json Supoort 15 | * Created by yangjing on 15-11-5. 
16 | */ 17 | trait TJsonSupport { 18 | def defaultFormats: Formats = DefaultFormats + new LocalDateTimeSerializer() 19 | 20 | implicit val serialization = Serialization 21 | implicit val formats: Formats 22 | 23 | } 24 | 25 | object TJsonSupport extends TJsonSupport { 26 | override implicit val formats: Formats = defaultFormats 27 | } 28 | 29 | class LocalDateTimeSerializer extends CustomSerializer[LocalDateTime](format => 30 | ( { 31 | case JString(s) => LocalDateTime.parse(s, TimeUtils.formatterDateTime) 32 | case JNull => null 33 | }, { 34 | case d: LocalDateTime => JString(TimeUtils.formatterDateTime.format(d)) 35 | }) 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/persist/CassandraPersists.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.persist 2 | 3 | import com.datastax.driver.core._ 4 | import com.google.common.util.concurrent.{FutureCallback, Futures} 5 | import com.typesafe.scalalogging.LazyLogging 6 | import crawler.SystemUtils 7 | 8 | import scala.collection.JavaConverters._ 9 | import scala.concurrent.{ExecutionContextExecutor, Future, Promise} 10 | import scala.util.Try 11 | 12 | /** 13 | * CassandraPersists 14 | * Created by yangjing on 15-11-6. 15 | */ 16 | abstract class CassandraPersists(nodes: Seq[String]) { 17 | val cluster = { 18 | Cluster.builder().addContactPoints(nodes: _*) 19 | } 20 | } 21 | 22 | object CassandraPersists extends LazyLogging { 23 | 24 | val cluster = { 25 | val nodes = SystemUtils.crawlerConfig.getStringList("cassandra.nodes").asScala 26 | logger.info("cassandra.nodes: " + nodes) 27 | Cluster.builder().addContactPoints(nodes: _*).build() 28 | } 29 | 30 | def userType(keyspace: String, userType: String): UserType = 31 | cluster.getMetadata.getKeyspace(keyspace).getUserType(userType) 32 | 33 | def using[R](keyspace: String)(func: Session => R): R = { 34 | val session = cluster.connect(keyspace) 35 | try { 36 | func(session) 37 | } finally { 38 | session.closeAsync() 39 | } 40 | } 41 | 42 | def execute[R](resultSetFuture: ResultSetFuture)(func: ResultSet => R)(implicit ec: ExecutionContextExecutor): Future[R] = { 43 | val promise = Promise[R]() 44 | Futures.addCallback( 45 | resultSetFuture, 46 | new FutureCallback[ResultSet] { 47 | override def onFailure(t: Throwable): Unit = { 48 | promise.failure(t) 49 | } 50 | 51 | override def onSuccess(rs: ResultSet): Unit = { 52 | promise.complete(Try(func(rs))) 53 | } 54 | }, 55 | ec) 56 | promise.future 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/time/TimeUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.time 2 | 3 | import java.time._ 4 | import java.time.format.DateTimeFormatter 5 | import java.util.Date 6 | 7 | /** 8 | * DateTimeUtils 9 | * Created by yangjing on 15-11-6. 
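 * A usage sketch of the length-based parsing in toLocalDateTime:
 * {{{
 *   TimeUtils.toLocalDateTime("2015-11-06 09:30")     // 16 chars, parsed with formatterDateMinus
 *   TimeUtils.toLocalDateTime("2015-11-06 09:30:00")  // 19 chars, parsed with formatterDateTime
 *   TimeUtils.toDate(TimeUtils.now())
 * }}}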
10 | */ 11 | object TimeUtils { 12 | val ZONE_OFFSET = ZoneOffset.ofHours(8) 13 | val formatterDate = DateTimeFormatter.ofPattern("yyyy-MM-dd") 14 | val formatterDateTime = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 15 | val formatterDateMinus = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm") 16 | val formatterMinus = DateTimeFormatter.ofPattern("HH:mm") 17 | 18 | def toLocalDateTime(instant: Instant): LocalDateTime = LocalDateTime.ofInstant(instant, ZONE_OFFSET) 19 | 20 | def toLocalDateTime(s: String): LocalDateTime = { 21 | s.length match { 22 | case 5 => 23 | LocalDateTime.parse(s, formatterMinus) 24 | case 16 => 25 | LocalDateTime.parse(s, formatterDateMinus) 26 | case 19 => 27 | LocalDateTime.parse(s, formatterDateTime) 28 | case _ => 29 | LocalDateTime.parse(s) 30 | } 31 | } 32 | 33 | def toLocalDateTime(date: Date): LocalDateTime = 34 | LocalDateTime.ofInstant(Instant.ofEpochMilli(date.getTime), ZONE_OFFSET) 35 | 36 | def toDate(ldt: LocalDateTime): Date = 37 | new Date(ldt.toInstant(ZONE_OFFSET).toEpochMilli) 38 | 39 | def now() = LocalDateTime.now() 40 | 41 | /** 42 | * @return 一天的开始: 43 | */ 44 | def nowBegin(): LocalDateTime = LocalDate.now().atTime(0, 0, 0, 0) 45 | 46 | /** 47 | * @return 一天的结尾: 48 | */ 49 | def nowEnd(): LocalDateTime = LocalTime.of(23, 59, 59, 999999999).atDate(LocalDate.now()) 50 | } 51 | -------------------------------------------------------------------------------- /util/src/test/scala/crawler/testsuite/ServiceSpec.scala: -------------------------------------------------------------------------------- 1 | package crawler.testsuite 2 | 3 | import crawler.SystemUtils 4 | import org.scalatest._ 5 | import org.scalatest.concurrent.ScalaFutures 6 | import org.scalatest.time.{Seconds, Span} 7 | 8 | /** 9 | * Created by yangjing on 15-11-4. 10 | */ 11 | abstract class ServiceSpec 12 | extends WordSpec 13 | with BeforeAndAfterAll 14 | with MustMatchers 15 | with OptionValues 16 | with EitherValues 17 | with ScalaFutures { 18 | 19 | implicit def system = SystemUtils.system 20 | implicit def materializer = SystemUtils.materializer 21 | implicit def dispatcher = system.dispatcher 22 | implicit val defaultPatience = PatienceConfig(Span(30, Seconds)) 23 | 24 | override protected def afterAll(): Unit = { 25 | SystemUtils.shutdown() 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /util/src/test/scala/crawler/util/persist/CassandraPersistsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.persist 2 | 3 | import java.util.Date 4 | 5 | import crawler.SystemUtils 6 | import org.scalatest.WordSpec 7 | 8 | /** 9 | * Created by yangjing on 15-11-6. 
10 | */ 11 | class CassandraPersistsTest extends WordSpec { 12 | 13 | "CassandraPersistsTest" should { 14 | 15 | "save" in { 16 | val keyspace = SystemUtils.crawlerConfig.getString("cassandra.keyspace") 17 | CassandraPersists.using(keyspace) { session => 18 | val newsItem = Map( 19 | "url" -> "http://hostname/news/1.html", 20 | "source" -> "网易新闻", 21 | "title" -> "标题", 22 | "time" -> new Date(), 23 | "abstract" -> "新闻摘要") 24 | val bstmt = session.prepare("INSERT INTO search_page(source, key, count, news) VALUES(?, ?, ?, ?);") 25 | 26 | val newsTypeUDT = session.getCluster.getMetadata.getKeyspace(keyspace).getUserType("news_type") 27 | val nit = newsTypeUDT.newValue() 28 | newsItem.foreach { 29 | case ("time", value: Date) => nit.setTimestamp("time", value) 30 | case (key, value: String) => nit.setString(key, value) 31 | } 32 | 33 | val result = session.execute(bstmt.bind( 34 | "网易新闻", 35 | "杭州誉存科技有限公司", 36 | Integer.valueOf(2), 37 | java.util.Arrays.asList(nit) 38 | )) 39 | println(result) 40 | 41 | } 42 | } 43 | 44 | } 45 | } 46 | --------------------------------------------------------------------------------
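As a closing illustration, a sketch of how the utilities above can be combined to fetch a page and extract its main content (the URL is a placeholder and the wiring is an assumption, not code from this repository):

```
import crawler.SystemUtils
import crawler.util.news.contextextractor.ContentExtractor

implicit val ec = SystemUtils.system.dispatcher

val url = "http://www.example.com/news/1.html"
// Fetch the page through the shared AsyncHttpClient, then run the content extractor over the HTML.
SystemUtils.httpClient.get(url).execute().map { resp =>
  val news = ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8"), url)
  println(news.getContent)
}
```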