├── .gitignore
├── README.md
├── app-api
│   └── src
│       ├── main
│       │   ├── resources
│       │   │   ├── logback-test.xml
│       │   │   └── reference.conf
│       │   └── scala
│       │       └── crawler
│       │           └── app
│       │               ├── Main.scala
│       │               ├── common
│       │               │   ├── BaseRoute.scala
│       │               │   └── JsonSupport.scala
│       │               └── routes
│       │                   ├── ApiRoutes.scala
│       │                   ├── NewsRoute.scala
│       │                   └── SiteRoute.scala
│       ├── test
│       │   └── scala
│       │       ├── demo.sc
│       │       ├── saic.sc
│       │       └── worksheet.sc
│       └── universal
│           └── conf
│               ├── application-test.conf
│               ├── application.conf
│               ├── jvmopts
│               └── logback.xml
├── module-news
│   ├── docs
│   │   └── 杂记.txt
│   └── src
│       ├── main
│       │   └── scala
│       │       └── crawler
│       │           └── module
│       │               └── news
│       │                   ├── NewsJsonSupport.scala
│       │                   ├── NewsUtils.scala
│       │                   ├── commands
│       │                   │   └── Commands.scala
│       │                   ├── crawlers
│       │                   │   ├── BaiduNews.scala
│       │                   │   ├── CourtNews.scala
│       │                   │   ├── HaosouNews.scala
│       │                   │   ├── NewsCrawler.scala
│       │                   │   ├── SogouNews.scala
│       │                   │   └── WechatNews.scala
│       │                   ├── enums
│       │                   │   ├── ItemSource.scala
│       │                   │   └── SearchMethod.scala
│       │                   ├── model
│       │                   │   ├── NewsItem.scala
│       │                   │   ├── NewsPage.scala
│       │                   │   ├── NewsPageItem.scala
│       │                   │   └── SearchResult.scala
│       │                   └── service
│       │                       ├── NewsDBRepo.scala
│       │                       ├── NewsMaster.scala
│       │                       ├── NewsService.scala
│       │                       └── actors
│       │                           ├── ItemPageWorker.scala
│       │                           ├── NewsJob.scala
│       │                           ├── NewsSourceJob.scala
│       │                           ├── PersistActor.scala
│       │                           └── SearchPageWorker.scala
│       └── test
│           ├── resources
│           │   └── logback.xml
│           └── scala
│               └── crawler
│                   └── module
│                       └── news
│                           ├── crawlers
│                           │   ├── BaiduNewsTest.scala
│                           │   ├── CourtNewsTest.scala
│                           │   ├── HaosouNewsTest.scala
│                           │   └── WechatNewsTest.scala
│                           └── service
│                               ├── NewsDBRepoTest.scala
│                               └── actors
│                                   └── NewsJobMasterTest.scala
├── module-site-search
│   └── src
│       ├── main
│       │   └── scala
│       │       └── crawler
│       │           └── module
│       │               └── site
│       │                   ├── BaiduSite.scala
│       │                   ├── QueryCond.scala
│       │                   ├── SearchSyntax.scala
│       │                   └── model
│       │                       ├── SearchRequest.scala
│       │                       ├── SiteItem.scala
│       │                       └── SiteResult.scala
│       └── test
│           ├── resources
│           │   └── logback.xml
│           └── scala
│               └── crawler
│                   └── module
│                       └── site
│                           └── BaiduSiteTest.scala
├── project
│   ├── Build.scala
│   ├── BuildSettings.scala
│   ├── build.properties
│   ├── plugins.sbt
│   └── sbt-launch.jar
├── sbt
└── util
    └── src
        ├── main
        │   ├── java
        │   │   └── crawler
        │   │       └── util
        │   │           └── news
        │   │               └── contextextractor
        │   │                   ├── ContentExtractor.java
        │   │                   └── News.java
        │   ├── resources
        │   │   └── reference.conf
        │   └── scala
        │       └── crawler
        │           ├── SystemUtils.scala
        │           └── util
        │               ├── Crawler.scala
        │               ├── JsoupImplicits.scala
        │               ├── Utils.scala
        │               ├── actors
        │               │   └── MetricActor.scala
        │               ├── http
        │               │   ├── HttpClient.scala
        │               │   └── TJsonSupport.scala
        │               ├── persist
        │               │   └── CassandraPersists.scala
        │               └── time
        │                   └── TimeUtils.scala
        └── test
            └── scala
                └── crawler
                    ├── testsuite
                    │   └── ServiceSpec.scala
                    └── util
                        └── persist
                            └── CassandraPersistsTest.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | app-api/package/
2 | logs/
3 | target/
4 | .idea
5 | .idea_modules
6 | .classpath
7 | .project
8 | .settings
9 | RUNNING_PID
10 | app.pid
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Crawler Service
2 |
3 | A web crawler service.
4 |
5 | - Akka Stream & Http 1.0
6 | - Cassandra 2.1
7 | - Json4s 3.3
8 |
9 | ## Install
10 |
11 | ### Install Cassandra
12 |
13 | [http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/](http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/)
14 |
15 | ### Configuration
16 |
17 | 1. `util/src/main/resources/reference.conf`: default configuration
18 | 2. `app/src/main/resources/application.conf`: production configuration
19 |
20 | For details on how the configuration library works, see [https://github.com/typesafehub/config](https://github.com/typesafehub/config)
21 |
22 | ### Build
23 |
24 | ```
25 | ./sbt app/assembly
26 | ```
27 |
28 | ### Run
29 |
30 | ```
31 | java -jar app/target/scala-2.11/crawler-app.jar
32 | ```
33 |
34 |
--------------------------------------------------------------------------------
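The configuration layering above follows the standard Typesafe Config model: every module ships defaults in a `reference.conf` on the classpath, and the packaged `application.conf` (under `app-api/src/universal/conf/` in this tree) overrides them per deployment. A minimal, purely illustrative override sketch; the values below are placeholders, not the project's real settings:

```
crawler {
  network {
    port = 8080            # overrides crawler.network.port from reference.conf
  }
  cassandra {
    nodes = ["10.0.0.1"]   # point at the deployment's Cassandra nodes
  }
}
```

Any key that is not overridden keeps the default shipped in the modules' `reference.conf` files.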
/app-api/src/main/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 |         <encoder>
5 |             <pattern>%date - [%level] - from %logger in %thread %n%message%n%xException%n</pattern>
6 |         </encoder>
7 |     </appender>
8 |
9 |     <root level="INFO">
10 |         <appender-ref ref="STDOUT"/>
11 |     </root>
12 |
13 | </configuration>
14 |
--------------------------------------------------------------------------------
/app-api/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | http {
3 | server {
4 | backlog = 1024
5 | max-connections = 8192
6 | socket-options {
7 | so-reuse-address = on
8 | }
9 | }
10 | host-connection-pool {
11 | max-connections = 8
12 | }
13 | }
14 | }
15 |
16 | crawler {
17 | api-uri = "http://120.26.93.104"
18 |
19 | akka-system-name = "crawler"
20 |
21 | network {
22 | server = "0.0.0.0"
23 | server = ${crawler.network.server}
24 | port = 33333
25 | }
26 |
27 | cassandra {
28 | nodes = ["192.168.31.242", "192.168.31.243"]
29 | keyspace = "crawler_spider"
30 | }
31 |
32 | http-client {
33 | headers {
34 | chromeMac {
35 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
36 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
37 | Accept-Encoding = "gzip, deflate, sdch"
38 | Accept-Language = "zh-CN,zh;q=0.8,en;q=0.6"
39 | Connection = "keep-alive"
40 | }
41 |
42 | safariMac {
43 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7"
44 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
45 | }
46 |
47 | firefoxMac {
48 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0"
49 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
50 | Accept-Encoding = "gzip, deflate"
51 | Accept-Language = "en-US,en;q=0.5"
52 | Connection = "keep-alive"
53 | }
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/Main.scala:
--------------------------------------------------------------------------------
1 | package crawler.app
2 |
3 | import java.nio.file.{Files, Paths}
4 |
5 | import akka.http.scaladsl.Http
6 | import com.typesafe.config.ConfigFactory
7 | import com.typesafe.scalalogging.StrictLogging
8 | import crawler.SystemUtils
9 | import crawler.app.routes.ApiRoutes
10 | import crawler.util.Utils
11 |
12 | import scala.util.{Failure, Success}
13 |
14 | /**
15 | * Main
16 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03.
17 | */
18 | object Main extends App with StrictLogging {
19 |
20 | import SystemUtils._
21 | import system.dispatcher
22 |
23 | Files.write(Paths.get("app.pid"), Utils.getPid.getBytes(Utils.CHARSET))
24 |
25 | val config = ConfigFactory.load()
26 |
27 | println(config.getString("crawler.network.server") + ":" + config.getInt("crawler.network.port"))
28 |
29 | Http().bindAndHandle(ApiRoutes(), config.getString("crawler.network.server"), config.getInt("crawler.network.port"))
30 | .onComplete {
31 | case Success(binding) =>
32 | logger.info(s"binding: $binding")
33 | case Failure(e) =>
34 | e.printStackTrace()
35 | SystemUtils.shutdown()
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/common/BaseRoute.scala:
--------------------------------------------------------------------------------
1 | package crawler.app.common
2 |
3 | import akka.http.scaladsl.server.Directives
4 | import com.typesafe.scalalogging.LazyLogging
5 | import crawler.SystemUtils
6 |
7 | /**
8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
9 | */
10 | trait BaseRoute extends Directives with JsonSupport with LazyLogging {
11 | implicit def system = SystemUtils.system
12 |
13 | implicit def mat = SystemUtils.materializer
14 |
15 | implicit def dispatcher = system.dispatcher
16 | }
17 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/common/JsonSupport.scala:
--------------------------------------------------------------------------------
1 | package crawler.app.common
2 |
3 | import akka.http.scaladsl.marshalling._
4 | import akka.http.scaladsl.model.{HttpCharsets, MediaTypes}
5 | import akka.http.scaladsl.unmarshalling._
6 | import akka.stream.Materializer
7 | import crawler.module.news.NewsJsonSupport
8 | import crawler.module.site.QueryCond
9 | import crawler.util.http.TJsonSupport
10 | import org.json4s.ext.EnumNameSerializer
11 | import org.json4s.{Formats, Serialization}
12 |
13 | /**
14 | * Json Support
15 | * Created by yangjing on 15-11-6.
16 | */
17 | trait JsonSupport extends TJsonSupport with NewsJsonSupport {
18 | implicit override val formats: Formats = defaultFormats +
19 | new EnumNameSerializer(QueryCond)
20 |
21 | implicit def json4sUnmarshallerConverter[A: Manifest](serialization: Serialization, formats: Formats)(implicit mat: Materializer): FromEntityUnmarshaller[A] =
22 | json4sUnmarshaller(manifest, serialization, formats, mat)
23 |
24 | implicit def json4sUnmarshaller[A: Manifest](implicit serialization: Serialization, formats: Formats, mat: Materializer): FromEntityUnmarshaller[A] =
25 | Unmarshaller.byteStringUnmarshaller
26 | .forContentTypes(MediaTypes.`application/json`)
27 | .mapWithCharset { (data, charset) =>
28 | val input = if (charset == HttpCharsets.`UTF-8`) data.utf8String else data.decodeString(charset.nioCharset.name)
29 | serialization.read(input)
30 | }
31 |
32 | implicit def json4sMarshallerConverter[A <: AnyRef](serialization: Serialization, formats: Formats): ToEntityMarshaller[A] =
33 | json4sMarshaller(serialization, formats)
34 |
35 | implicit def json4sMarshaller[A <: AnyRef](implicit serialization: Serialization, formats: Formats): ToEntityMarshaller[A] =
36 | Marshaller.StringMarshaller.wrap(MediaTypes.`application/json`)(serialization.write[A])
37 | }
38 |
39 | object JsonSupport extends JsonSupport
40 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/routes/ApiRoutes.scala:
--------------------------------------------------------------------------------
1 | package crawler.app.routes
2 |
3 | import akka.http.scaladsl.model.HttpResponse
4 | import akka.http.scaladsl.server.Directives
5 |
6 | /**
7 | * ApiRoute
8 | * Created by yangjing on 15-11-3.
9 | */
10 | object ApiRoutes extends Directives {
11 |
12 | def apply() =
13 | pathPrefix("api") {
14 | path("health_check") {
15 | (get | head) {
16 | complete(HttpResponse())
17 | }
18 | } ~
19 | NewsRoute() ~
20 | SiteRoute()
21 | }
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/routes/NewsRoute.scala:
--------------------------------------------------------------------------------
1 | package crawler.app.routes
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import akka.http.scaladsl.marshalling.Marshal
6 | import akka.http.scaladsl.model._
7 | import com.typesafe.config.ConfigFactory
8 | import crawler.SystemUtils
9 | import crawler.app.common.BaseRoute
10 | import crawler.module.news.crawlers._
11 | import crawler.module.news.enums.{ItemSource, SearchMethod}
12 | import crawler.module.news.service.NewsService
13 | import crawler.util.Utils
14 |
15 | import scala.concurrent.Future
16 | import scala.concurrent.duration.Duration
17 | import scala.util.Try
18 |
19 | /**
20 | * News routes
21 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03.
22 | */
23 | object NewsRoute extends BaseRoute {
24 |
25 | val config = ConfigFactory.load()
26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient))
27 | NewsCrawler.registerCrawler(ItemSource.sogou, new SogouNews(SystemUtils.httpClient))
28 | NewsCrawler.registerCrawler(ItemSource.haosou, new HaosouNews(SystemUtils.httpClient))
29 | NewsCrawler.registerCrawler(ItemSource.court, new CourtNews(SystemUtils.httpClient))
30 | // NewsCrawler.registerCrawler(NewsSource.wechat, new WechatNews(httpClient))
31 |
32 | val newsService = new NewsService()
33 |
34 | def apply() =
35 | pathPrefix("news") {
36 | pathEnd {
37 | get {
38 | parameters(
39 | 'company.as[String],
40 | 'source.as[String] ? "",
41 | 'method.as[String] ? "",
42 | 'duration.as[Int] ? 15,
43 | 'forcedLatest.as[String] ? "",
44 | 'version.as[String] ? "1") { (company, source, method, duration, forcedLatest, version) =>
45 |
46 | val future: Future[HttpResponse] =
47 | version match {
48 | case "3" =>
49 | fromLocal(company, Seq(ItemSource.baidu) /*NewsSource.withToNames(source)*/ , method, duration, forcedLatest).flatMap(list =>
50 | Marshal(list.flatMap(_.news)).to[HttpResponse]
51 | )
52 |
53 | case "2" =>
54 | fromCrawlerApi(company).recoverWith {
55 | case e: Exception =>
56 | logger.warn("fromCralwerApi recover with: " + e, e)
57 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list =>
58 | Marshal(list.flatMap(_.news)).to[HttpResponse]
59 | )
60 | }
61 |
62 | case _ =>
63 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list =>
64 | Marshal(list).to[HttpResponse]
65 | )
66 | }
67 | complete(future)
68 | }
69 | }
70 | }
71 | }
72 |
73 | private def fromLocal(company: String, sources: Traversable[ItemSource.Value], method: String, duration: Int, forcedLatest: String) = {
74 | val mtd = Try(SearchMethod.withName(method)).getOrElse(SearchMethod.F)
75 | newsService.
76 | fetchNews(company, sources, mtd, Duration(duration, TimeUnit.SECONDS), forcedLatest == "y")
77 | }
78 |
79 | private def fromCrawlerApi(company: String) =
80 | SystemUtils.httpClient.get(config.getString("crawler.api-uri") + "/api/news")
81 | .queryParam("companyName" -> company)
82 | .execute()
83 | .map { resp =>
84 | if (resp.getStatusCode != 200)
85 | throw new RuntimeException(s"crawler-api not found company: $company, return: ${resp.getStatusCode}")
86 |
87 | HttpResponse(
88 | StatusCodes.OK,
89 | entity = HttpEntity(ContentType(MediaTypes.`application/json`), resp.getResponseBody(Utils.CHARSET.name()))
90 | )
91 | }
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/app-api/src/main/scala/crawler/app/routes/SiteRoute.scala:
--------------------------------------------------------------------------------
1 | package crawler.app.routes
2 |
3 | import crawler.SystemUtils
4 | import crawler.module.site.BaiduSite
5 | import crawler.app.common.BaseRoute
6 | import crawler.module.site.model.SearchRequest
7 |
8 | /**
9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
10 | */
11 | object SiteRoute extends BaseRoute {
12 |
13 | def apply() =
14 | pathPrefix("site") {
15 | path("baidu") {
16 | post {
17 | entity(as[SearchRequest]) { searchRequest =>
18 | val baidu = new BaiduSite(SystemUtils.httpClient, searchRequest)
19 | complete(baidu.fetchItemList())
20 | }
21 | }
22 | }
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/app-api/src/test/scala/demo.sc:
--------------------------------------------------------------------------------
1 | import java.nio.charset.Charset
2 | import java.nio.file.{Paths, Files}
3 | import scala.collection.JavaConverters._
4 |
5 | import scala.io.Source
6 |
7 | val s =
8 | """crawler-news001 121.199.23.3
9 | |crawler-news002 121.199.4.6
10 | |crawler-news003 121.199.2.152
11 | |crawler-news004 121.199.12.190
12 | |crawler-news005 121.41.53.230
13 | |crawler-news006 121.199.5.96
14 | |crawler-news007 121.199.20.87
15 | |crawler-news008 121.40.93.44
16 | |crawler-news009 121.199.22.228
17 | |crawler-news010 120.26.94.198
18 | |crawler-news011 120.26.94.202
19 | |crawler-news012 120.26.94.146
20 | |crawler-news013 120.26.94.163
21 | |crawler-news014 120.26.94.211
22 | |crawler-news015 120.26.94.117
23 | |crawler-news016 120.26.94.195
24 | |crawler-news017 120.26.94.207
25 | |crawler-news018 120.26.94.185
26 | |crawler-news019 120.26.93.249
27 | |crawler-news020 120.26.94.17
28 | |crawler-news021 120.26.94.5
29 | |crawler-news022 120.26.94.7
30 | |crawler-news023 120.26.93.202
31 | |crawler-news024 120.26.94.188
32 | |crawler-news025 120.26.94.35
33 | |crawler-news026 120.26.94.58
34 | |crawler-news027 120.26.94.120
35 | |crawler-news028 120.26.94.203
36 | |crawler-news029 120.26.94.38
37 | |crawler-news030 120.26.94.150
38 | |crawler-news031 120.26.94.151
39 | |crawler-news032 120.26.94.147
40 | |crawler-news033 120.26.94.28
41 | |crawler-news034 120.26.94.191
42 | |crawler-news035 120.26.94.18
43 | |crawler-news036 120.26.93.254
44 | |crawler-news037 120.26.94.49
45 | |crawler-news038 120.26.94.139
46 | |crawler-news039 120.26.94.2
47 | |crawler-news040 120.26.94.4
48 | |crawler-news041 120.26.94.23
49 | |crawler-news042 120.26.94.29
50 | |crawler-news043 120.26.94.174
51 | |crawler-news044 120.26.94.8
52 | |crawler-news045 120.26.93.240
53 | |crawler-news046 120.26.93.215
54 | |crawler-news047 120.26.94.122
55 | |crawler-news048 120.26.94.12
56 | |crawler-news049 120.26.92.125
57 | |crawler-news050 120.26.92.180
58 | |crawler-news051 120.26.93.219
59 | |crawler-news052 120.26.94.76
60 | |crawler-news053 120.26.93.229
61 | |crawler-news054 120.26.94.22
62 | |crawler-news055 120.26.94.14
63 | |crawler-news056 120.26.94.84
64 | |crawler-news057 120.26.94.27
65 | |crawler-news058 120.26.93.221
66 | |crawler-news059 121.43.60.236""".stripMargin
67 | val lines = Source.fromString(s).getLines().map(_.split(" ")(0)).toStream
68 |
69 | //val ss = Source.fromString(s).getLines().map { v =>
70 | // val ip = v.drop(19)
71 | // val hostname = v.take(15)
72 | // Seq(hostname, ip, "1核1G", "/usr/app/python<br>/home/sc/open-falcon/agent")
73 | // .mkString("| ", " | ", " |")
74 | //}.toStream
75 | //
76 | //val lines =
77 | // Stream(
78 | // Seq("hostname ", "IP", "hardware", "path"),
79 | // Seq("----------------", "--", "--------", "----")
80 | // ).map(_.mkString("| ", " | ", " |")) #:::
81 | // ss
82 |
83 | Files.write(Paths.get("/tmp/crawler-news-hosts.txt"), lines.asJava)
--------------------------------------------------------------------------------
/app-api/src/test/scala/saic.sc:
--------------------------------------------------------------------------------
1 | import scala.io.Source
2 |
3 | val s =
4 | """|120.55.182.150
(10.117.12.74) | 1核1G | /usr/app/python |saic |
5 | ||120.26.225.105
(10.117.55.14) | 1核1G | /usr/app/python |saic |
6 | ||121.41.2.74
(10.168.96.82) | 1核1G | /usr/app/python |saic |
7 | ||120.55.113.230
(10.168.152.118) | 1核1G | /usr/app/python |saic |
8 | ||120.55.114.18
(10.168.154.133) | 1核1G | /usr/app/python |saic |
9 | ||120.55.88.109
(10.117.196.51) | 1核1G | /usr/app/python |saic |
10 | ||121.41.2.196
(10.168.91.79) | 1核1G | /usr/app/python |saic |
11 | ||121.41.2.186
(10.168.94.151) | 1核1G | /usr/app/python |saic |
12 | ||120.55.64.125
(10.117.211.194) | 1核1G | /usr/app/python |saic |
13 | ||121.41.2.162
(10.168.93.81) | 1核1G | /usr/app/python |saic |
14 | ||121.41.1.166
(10.168.54.249) | 1核1G | /usr/app/python |saic |
15 | ||120.26.217.236
(10.117.52.105) | 1核1G | /usr/app/python |saic |
16 | ||120.26.92.73
(10.51.8.148) | 1核1G | /usr/app/python |saic |
17 | ||120.55.180.251
(10.117.8.21) | 1核1G | /usr/app/python |saic |
18 | ||120.26.91.2
(10.117.209.143) | 1核1G | /usr/app/python |saic |
19 | ||120.26.223.152
(10.117.51.186) | 1核1G | /usr/app/python |saic |
20 | ||120.26.223.135
(10.117.52.107) | 1核1G | /usr/app/python |saic |
21 | ||120.26.91.8
(10.117.209.141) | 1核1G | /usr/app/python |saic |
22 | ||120.55.112.92
(10.168.152.171) | 1核1G | /usr/app/python |saic |
23 | ||120.55.181.10
(10.117.8.192) | 1核1G | /usr/app/python |saic |""".stripMargin
24 | val lines = Source.fromString(s).getLines().toStream
25 | .map(v => v.take(v.indexOf('<')).replace("|", ""))
26 |
27 | // fabric hosts
28 | lines
29 | .map("xu.du@" + _)
30 | .mkString("[\"", "\",\"", "\"]")
31 |
32 | // hostnames
33 | lines.foreach(println)
34 |
35 |
--------------------------------------------------------------------------------
/app-api/src/test/scala/worksheet.sc:
--------------------------------------------------------------------------------
1 | import java.time.LocalDateTime
2 |
3 | import crawler.module.site.BaiduSite
4 | //BaiduSite.dealTime("2015年1月13日")
5 | //BaiduSite.dealTime("2015年1月1日")
6 | //BaiduSite.dealTime("2015年11月13日")
7 | //BaiduSite.dealTime("2015年11月3日")
8 |
9 | "www.runoob.com/kjlsdf/sdf/".take("www.runoob.com/kjlsdf/sdf/".indexOf('/'))
10 |
11 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r
12 | def parseTime(s: String) = s.substring(0, s.indexOf('日')+1) match {
13 | case TIME_PATTERN(year, month, day) => LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0)
14 | case _ => null
15 | }
16 | parseTime("2015年1月13日 - ")
17 | parseTime("2015年1月1日")
18 | parseTime("2015年11月13日")
19 | parseTime("2015年11月3日")
20 | parseTime("2015年11月332日")
21 | parseTime("15年11月332日")
22 |
--------------------------------------------------------------------------------
/app-api/src/universal/conf/application-test.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | loggers = ["akka.event.slf4j.Slf4jLogger"]
3 | loglevel = INFO
4 | log-dead-letters = off
5 | log-dead-letters-during-shutdown = off
6 | fork-join-executor {
7 | parallelism-factor = 3.0
8 | parallelism-min = 16
9 | parallelism-max = 64
10 | }
11 |
12 | http {
13 | server {
14 | backlog = 1024
15 | max-connections = 8192
16 | socket-options {
17 | so-reuse-address = on
18 | }
19 | }
20 | host-connection-pool {
21 | max-connections = 8
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/app-api/src/universal/conf/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | loggers = ["akka.event.slf4j.Slf4jLogger"]
3 | loglevel = INFO
4 | log-dead-letters = off
5 | log-dead-letters-during-shutdown = off
6 | fork-join-executor {
7 | parallelism-factor = 3.0
8 | parallelism-min = 16
9 | parallelism-max = 64
10 | }
11 |
12 | http {
13 | server {
14 | backlog = 1024
15 | max-connections = 8192
16 | socket-options {
17 | so-reuse-address = on
18 | }
19 | }
20 | host-connection-pool {
21 | max-connections = 8
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/app-api/src/universal/conf/jvmopts:
--------------------------------------------------------------------------------
1 | -Xmx2048m
2 | -Xms2048m
3 | -Dfile.encoding=UTF-8
--------------------------------------------------------------------------------
/app-api/src/universal/conf/logback.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 |         <encoder>
5 |             <pattern>%date - [%level] - from %logger in %thread %n%message%n%xException%n</pattern>
6 |         </encoder>
7 |     </appender>
8 |
9 |     <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
10 |         <file>logs/application.log</file>
11 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
12 |             <fileNamePattern>logs/application-log-%d{yyyy-MM-dd}.gz</fileNamePattern>
13 |             <maxHistory>60</maxHistory>
14 |         </rollingPolicy>
15 |         <encoder>
16 |             <pattern>%date{yyyy-MM-dd HH:mm:ss ZZZZ} [%level] from %logger in %thread - %n%message%n%xException%n</pattern>
17 |         </encoder>
18 |     </appender>
19 |
20 |     <root level="INFO">
21 |         <appender-ref ref="STDOUT"/>
22 |         <appender-ref ref="FILE"/>
23 |     </root>
24 |
25 | </configuration>
26 |
--------------------------------------------------------------------------------
/module-news/docs/杂记.txt:
--------------------------------------------------------------------------------
1 | NewsMaster -> NewsJob* -> NewsSourceJob* -> SearchPageWorker
2 | PersistActor -> ItemPageWorker*
3 |
4 |
5 | NewsJob actor: receives a news-crawl request, manages the crawl state and the data persistence
6 | - SearchPageWorker actor: fetches the news search page and parses it
7 | - ItemPageWorker actor: fetches a news detail page and extracts the article body
8 |
9 | NewsJob actor: one actor is instantiated for every news-crawl request; it delegates the search-page fetch to a SearchPageWorker.
10 | Once the search page has been fetched, the data comes back to the Job actor, which decides whether the full text is needed; if so, it delegates ItemPageWorkers to fetch the full articles.
11 | NewsJob keeps a timeout value that is passed in when it is instantiated. When the timeout fires, a Timeout response is returned to the client, while the actor keeps waiting for its child actors,
12 | e.g. the SearchPageWorker and the Seq[ItemPageWorker], to finish (or fail) before NewsJob is stopped.
13 | The data persistence work is done in the postStop callback.
14 |
15 | SearchPageWorker: fetches the news list from the search page according to the request parameters and sends the result back to NewsJob.
16 |
17 | ItemPageWorker: fetches the article body of a news detail page by URL. One actor is created per URL. Every successfully fetched page is sent back to NewsJob, which does the further processing.
18 |
19 |
20 |
21 | Add indexing and lookup support to the DB storage.
22 |
23 | Add a transactionId to the case class to record each news-query request (transaction).
24 |
25 | Avoid passing ActorRef between actors where possible; look actors up via ActorPath or a similar mechanism instead.
26 |
27 | News data.
28 |
29 | Stored in Cassandra, using the following tables:
30 |
31 | create keyspace if not exists crawler_spider with replication = {'class': 'SimpleStrategy', 'replication_factor': 2};
32 | use crawler_spider;
33 |
34 | create type news_type (
35 | url Text,
36 | source Text,
37 | title Text,
38 | time Timestamp,
39 | abstract Text
40 | );
41 | create table search_page (
42 | key Text,
43 | source Ascii,
44 | time Timestamp,
45 | count Int,
46 | news List<frozen<news_type>>,
47 | primary key (key, source, time)
48 | );
49 | create table news_page (
50 | url Text,
51 | title Text,
52 | source Text,
53 | time Timestamp,
54 | abstract Text,
55 | content Text,
56 | primary key (url)
57 | );
58 | create table page_html (
59 | url Text,
60 | created_at Timestamp,
61 | src Text,
62 | primary key (url, created_at)
63 | );
64 |
--------------------------------------------------------------------------------
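The flow described in these notes maps onto the messages defined below in `commands/Commands.scala` (StartSearchNews, SearchPageResult, ItemPageResult, SearchTimeout). The real `NewsJob.scala` under `service/actors` is not included in this dump, so the following is only an assumed, stripped-down sketch of that flow; the class name, constructor parameters and bodies are illustrative, not the project's implementation:

```
import akka.actor.{Actor, ActorRef}
import crawler.module.news.commands._
import crawler.module.news.model.SearchResult

import scala.concurrent.duration.FiniteDuration

// Illustrative only: a minimal NewsJob-like actor following the notes above.
class NewsJobSketch(requester: ActorRef, timeout: FiniteDuration) extends Actor {

  import context.dispatcher

  // When the timeout fires the client gets whatever has been collected so far,
  // while this actor keeps waiting for its children before it is stopped.
  context.system.scheduler.scheduleOnce(timeout, self, SearchTimeout)

  private var result: Option[SearchResult] = None

  override def receive: Receive = {
    case StartSearchNews =>
      // delegate the search-page fetch to a child SearchPageWorker (StartFetchSearchPage)

    case SearchPageResult(searchResult) =>
      result = Some(searchResult)
      // for full-text requests, spawn one ItemPageWorker per news URL (StartFetchItemPage)

    case ItemPageResult(Right(pageItem)) =>
      // merge the extracted article body into the matching NewsItem
      result = result.map { r =>
        r.copy(news = r.news.map { n =>
          if (n.url == pageItem.url) n.copy(content = Some(pageItem.content)) else n
        })
      }

    case ItemPageResult(Left(_)) =>
      // a failed detail-page fetch is simply ignored in this sketch

    case SearchTimeout =>
      result.foreach(requester ! _)
  }

  override def postStop(): Unit = {
    // per the notes, persistence of the collected result happens here
  }
}
```

The actual implementation presumably also involves `NewsSourceJob`, `SearchPageWorker`, `ItemPageWorker` and `PersistActor` (all listed in the tree above), with persistence going through `NewsDBRepo`.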
/module-news/src/main/scala/crawler/module/news/NewsJsonSupport.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news
2 |
3 | import crawler.module.news.enums.{SearchMethod, ItemSource}
4 | import crawler.util.http.TJsonSupport
5 | import org.json4s.Formats
6 | import org.json4s.ext.EnumNameSerializer
7 |
8 | /**
9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22.
10 | */
11 | trait NewsJsonSupport extends TJsonSupport {
12 | implicit val formats: Formats = defaultFormats +
13 | new EnumNameSerializer(ItemSource) +
14 | new EnumNameSerializer(SearchMethod)
15 | }
16 |
17 | object NewsJsonSupport extends NewsJsonSupport
18 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/NewsUtils.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news
2 |
3 | import java.net.URI
4 | import java.util.concurrent.atomic.AtomicInteger
5 |
6 | /**
7 | * News Utils
8 | * Created by yangjing on 15-11-5.
9 | */
10 | object NewsUtils {
11 | private val _nums = new AtomicInteger(0)
12 |
13 | def getIndent = _nums.getAndIncrement()
14 |
15 | def uriToBaseUri(uri: String): String = uriToBaseUri(URI.create(uri))
16 |
17 | def uriToBaseUri(uri: URI): String = {
18 | val sb = new StringBuffer()
19 | if (uri.getScheme != null) {
20 | sb.append(uri.getScheme)
21 | sb.append(':')
22 | }
23 | if (uri.isOpaque) {
24 | sb.append(uri.getSchemeSpecificPart)
25 | } else {
26 | if (uri.getHost != null) {
27 | sb.append("//")
28 | if (uri.getUserInfo != null) {
29 | sb.append(uri.getUserInfo)
30 | sb.append('@')
31 | }
32 | val needBrackets = ((uri.getHost.indexOf(':') >= 0)
33 | && !uri.getHost.startsWith("[")
34 | && !uri.getHost.endsWith("]"))
35 | if (needBrackets) {
36 | sb.append('[')
37 | }
38 | sb.append(uri.getHost)
39 | if (needBrackets) sb.append(']')
40 | if (uri.getPort != -1) {
41 | sb.append(':')
42 | sb.append(uri.getPort)
43 | }
44 | } else if (uri.getAuthority != null) {
45 | sb.append("//")
46 | sb.append(uri.getAuthority)
47 | }
48 | }
49 | sb.toString
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/commands/Commands.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.commands
2 |
3 | import crawler.module.news.enums.{ItemSource, SearchMethod}
4 | import crawler.module.news.model.{NewsPageItem, SearchResult}
5 |
6 | import scala.concurrent.duration.FiniteDuration
7 |
8 | case class RequestSearchNews(sources: Seq[ItemSource.Value], msg: SearchNews)
9 |
10 | /**
11 | * Search request against a news source
12 | *
13 | * @param key search keyword
14 | * @param method search method
15 | * @param duration how long to wait (timeout)
16 | */
17 | case class SearchNews(key: String,
18 | method: SearchMethod.Value,
19 | duration: FiniteDuration)
20 |
21 | /**
22 | * Start searching for news
23 | */
24 | case object StartSearchNews
25 |
26 | /**
27 | * Fetch the search page
28 | */
29 | case object StartFetchSearchPage
30 |
31 | /**
32 | * Search timed out
33 | */
34 | case object SearchTimeout
35 |
36 | /**
37 | * Search page result
38 | *
39 | * @param news the search result
40 | */
41 | case class SearchPageResult(news: SearchResult)
42 |
43 | /**
44 | * Search failed
45 | *
46 | * @param failure the failure cause
47 | */
48 | case class SearchPageFailure(failure: Throwable)
49 |
50 | /**
51 | * Start fetching the news detail page content
52 | */
53 | case object StartFetchItemPage
54 |
55 | /**
56 | * News detail page result
57 | *
58 | * @param result the news detail, or an error message
59 | */
60 | case class ItemPageResult(result: Either[String, NewsPageItem])
61 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/crawlers/BaiduNews.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import java.net.URLEncoder
4 | import java.time.LocalDateTime
5 | import java.util.concurrent.TimeUnit
6 |
7 | import akka.util.Timeout
8 | import crawler.SystemUtils
9 | import crawler.module.news.enums.{SearchMethod, ItemSource}
10 | import crawler.module.news.model.{NewsItem, SearchResult}
11 | import crawler.util.http.HttpClient
12 | import crawler.util.news.contextextractor.ContentExtractor
13 | import crawler.util.time.TimeUtils
14 | import org.jsoup.Jsoup
15 | import org.jsoup.nodes.Element
16 |
17 | import scala.collection.JavaConverters._
18 | import scala.concurrent.duration._
19 | import scala.concurrent.{Await, ExecutionContext, Future, Promise}
20 | import scala.util.{Failure, Success}
21 |
22 | /**
23 | * Baidu News crawler
24 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03.
25 | */
26 | class BaiduNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.baidu) {
27 |
28 | import crawler.util.JsoupImplicits._
29 |
30 | override protected val defaultHeaders: Array[Seq[(String, String)]] =
31 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider"))
32 |
33 | private def parseNewsItem(news: Element): NewsItem = {
34 | val a = news.findByClass("c-title").first().getElementsByTag("a").first()
35 | val summary = news.findByClass("c-summary")
36 | val authorText = news.findByClass("c-author").text()
37 | val source = authorText.split(" ")
38 | val footer = summary.findByClass("c-info").first().text()
39 | NewsItem(
40 | a.text(),
41 | a.attr("href"),
42 | source.headOption.getOrElse(""),
43 | BaiduNews.dealTime(source.lastOption.getOrElse("")),
44 | summary.text().replace(authorText, "").replace(footer, ""))
45 | }
46 |
47 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = {
48 | val promise = Promise[Seq[NewsItem]]()
49 |
50 | val newsResultsFuture = fetchPage(BaiduNews.BAIDU_NEWS_BASE_URL.format(URLEncoder.encode('"' + key + '"', "UTF-8"))).map { resp =>
51 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST)
52 | // logger.debug(doc.body().toString + "\n\n\n")
53 | val now = TimeUtils.now()
54 | if (doc.getElementById("noresult") != null) {
55 | promise.success(Nil); SearchResult(newsSource, key, now, 0, Nil) // complete the promise here too, otherwise the for-comprehension below never finishes
56 | } else {
57 | val countText = doc
58 | .getElementById("header_top_bar")
59 | .getElementsByAttributeValue("class", "nums")
60 | .first()
61 | .text()
62 | val count =
63 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt
64 |
65 | val newsDiv = doc.getElementById("content_left")
66 | val pages = doc.select("#page a").asScala
67 | val newsItemFutures = pages.take(BaiduNews.PAGE_LIMIT - 1).map { page =>
68 | TimeUnit.MILLISECONDS.sleep(500)
69 | fetchPageLinks(BaiduNews.BAIDU_NEWS_HOST + page.attr("href"))
70 | }
71 | Future.sequence(newsItemFutures).map(_.flatten).onComplete {
72 | case Success(list) =>
73 | promise.success(list)
74 | case Failure(e) =>
75 | e.printStackTrace()
76 | promise.success(Nil)
77 | }
78 |
79 | SearchResult(
80 | newsSource,
81 | key,
82 | now,
83 | count,
84 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList)
85 | }
86 | }
87 |
88 | for {
89 | newsResult <- newsResultsFuture
90 | newsItems <- promise.future
91 | } yield {
92 | newsResult.copy(news = newsResult.news ++ newsItems)
93 | }
94 | }
95 |
96 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[NewsItem]] = {
97 | fetchPage(url).map { resp =>
98 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST)
99 | if (doc.getElementById("noresult") != null) {
100 | Nil
101 | } else {
102 | val newsDiv = doc.getElementById("content_left")
103 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList
104 | }
105 | }
106 | }
107 | }
108 |
109 | object BaiduNews {
110 | val PAGE_LIMIT = 5
111 | val BAIDU_NEWS_HOST = "http://news.baidu.com"
112 | val BAIDU_NEWS_BASE_URL = "http://news.baidu.com/ns?word=%s&tn=news&from=news&cl=2&rn=20&ct=1"
113 | val TIME_PATTERN = """\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}""".r
114 | val FEW_HOURS_PATTERN = """(\d+)小时前""".r
115 |
116 | private def dealFewHours(timeStr: String): String = {
117 | val matcher = FEW_HOURS_PATTERN.pattern.matcher(timeStr)
118 | if (matcher.matches()) matcher.group(1) else ""
119 | }
120 |
121 | def dealTime(timeStr: String): Option[LocalDateTime] = {
122 | val dt = if (timeStr.length < 2) {
123 | LocalDateTime.now()
124 | } else if (TIME_PATTERN.pattern.matcher(timeStr).matches()) {
125 | val s = timeStr.replaceAll( """年|月""", "-").replace("日", "")
126 | LocalDateTime.parse(s, TimeUtils.formatterDateMinus)
127 | } else if (FEW_HOURS_PATTERN.pattern.matcher(timeStr).matches()) {
128 | val now = LocalDateTime.now()
129 | val hour = dealFewHours(timeStr).toLong
130 | now.minusHours(hour)
131 | } else {
132 | null
133 | }
134 | Option(dt)
135 | }
136 |
137 | ////////////////////////////////////////////////////////////////////////////
138 | // Manual test harness below
139 | ////////////////////////////////////////////////////////////////////////////
140 |
141 | def run(newsCrawler: NewsCrawler,
142 | name: String,
143 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = {
144 | val newsResult = newsCrawler.fetchItemList(name)
145 | if (SearchMethod.A == method) {
146 | newsResult
147 | } else {
148 | newsResult.flatMap { result =>
149 | val seqs = result.news.map { news =>
150 | newsCrawler.fetchPage(news.url).map { resp =>
151 | (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent)
152 | }
153 | }
154 | val f = Future.sequence(seqs)
155 | f.map { urlContents =>
156 | val news = result.news.map { news =>
157 | urlContents.find(_._1 == news.url) match {
158 | case Some((_, content)) =>
159 | news.copy(content = Option(content))
160 | case None =>
161 | news
162 | }
163 | }
164 | result.copy(news = news)
165 | }
166 | }
167 | }
168 | }
169 |
170 | def main(args: Array[String]): Unit = {
171 | import SystemUtils._
172 | implicit val timeout = Timeout(10.hours)
173 | import system.dispatcher
174 |
175 | val httpClient = HttpClient()
176 | val baidu = new BaiduNews(httpClient)
177 | val f = run(baidu, "杭州今元标矩科技有限公司", SearchMethod.F)
178 | val result = Await.result(f, timeout.duration)
179 | result.news.foreach(news => println(news.content + "\n\n"))
180 | println(result.count)
181 |
182 | system.shutdown()
183 | httpClient.close()
184 | system.awaitTermination()
185 | // System.exit(0)
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/crawlers/CourtNews.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import java.time.LocalDate
4 |
5 | import crawler.module.news.enums.ItemSource
6 | import crawler.module.news.model.{NewsItem, SearchResult}
7 | import crawler.util.Utils
8 | import crawler.util.http.HttpClient
9 | import crawler.util.time.TimeUtils
10 | import org.jsoup.Jsoup
11 | import org.jsoup.nodes.Element
12 |
13 | import scala.collection.JavaConverters._
14 | import scala.concurrent.{ExecutionContext, Future}
15 | import scala.util.Random
16 |
17 | /**
18 | * chinacourt.org (China Court Network) news search
19 | * Created by yangjing on 15-11-9.
20 | */
21 | class CourtNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.court) {
22 | private def fetchPagePost(url: String, data: Seq[(String, String)]) = {
23 | val headers = defaultHeaders(Random.nextInt(defaultHeaders.length))
24 | httpClient.post(url).header(headers: _*).addFormParam(data: _*).execute()
25 | }
26 |
27 | private def parseNewsItem(elem: Element) = {
28 | val a = elem.select("dt").select("a").first()
29 | val dds = elem.select("dd")
30 | val item = NewsItem(
31 | a.text(),
32 | CourtNews.SITE_URL + a.attr("href"),
33 | "中国法院网",
34 | Option(TimeUtils.toLocalDateTime(dds.last().text().split(" ").last)),
35 | dds.first().text())
36 | // println(item)
37 | item
38 | }
39 |
40 | /**
41 | * Fetch the search page
42 | *
43 | * @param key search keyword
44 | * @return
45 | */
46 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = {
47 | fetchPagePost(CourtNews.SEARCH_URL, Seq(
48 | "keyword" -> key,
49 | "button" -> "提交",
50 | "content_time_publish_begin" -> "2002-01-01",
51 | "content_time_publish_end" -> LocalDate.now().toString,
52 | "article_category_id" -> "",
53 | "content_author" -> ""
54 | )).map { resp =>
55 | val now = TimeUtils.now()
56 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name), CourtNews.SITE_URL)
57 | val newsDl = doc.select("div.search_content").select("dl")
58 | if (newsDl.isEmpty) {
59 | SearchResult(newsSource, key, now, 0, Nil)
60 | } else {
61 | val newsItems = newsDl.asScala.map(parseNewsItem)
62 | val countText = doc.select("div.search_br").select("span").first().text
63 | val count =
64 | try {
65 | countText.toInt
66 | } catch {
67 | case e: Exception =>
68 | logger.warn("count < 1: " + countText)
69 | 0
70 | }
71 |
72 | SearchResult(newsSource, key, now, count, newsItems)
73 | }
74 | }
75 | }
76 | }
77 |
78 | object CourtNews {
79 | val SITE_URL = "http://www.chinacourt.org"
80 | val SEARCH_URL = "http://www.chinacourt.org/article/search.shtml"
81 | }
82 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/crawlers/HaosouNews.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import java.net.URLEncoder
4 |
5 | import crawler.module.news.NewsUtils
6 | import crawler.module.news.enums.ItemSource
7 | import crawler.module.news.model.{NewsItem, SearchResult}
8 | import crawler.util.Utils
9 | import crawler.util.http.HttpClient
10 | import crawler.util.time.TimeUtils
11 | import org.jsoup.Jsoup
12 | import org.jsoup.nodes.Element
13 |
14 | import scala.collection.JavaConverters._
15 | import scala.concurrent.{ExecutionContext, Future}
16 |
17 | /**
18 | * 360 Haosou news search
19 | * Created by yangjing on 15-11-9.
20 | */
21 | class HaosouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.haosou) {
22 | private def parseItem(elem: Element) = {
23 | val a = elem.select("a")
24 | val newsInfo = elem.select("p.newsinfo")
25 | NewsItem(
26 | a.text(),
27 | a.attr("href"),
28 | newsInfo.select("span.sitename").text(),
29 | Option(TimeUtils.toLocalDateTime(newsInfo.select("span.posttime").attr("title"))),
30 | elem.select("p.content").text())
31 | }
32 |
33 | /**
34 | * Fetch the search page
35 | *
36 | * @param key search keyword
37 | * @return
38 | */
39 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = {
40 | fetchPage(HaosouNews.searchUrl(key)).map { resp =>
41 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name()), NewsUtils.uriToBaseUri(HaosouNews.SEARCH_SITE))
42 | val now = TimeUtils.now()
43 | val ul = doc.select("ul#news")
44 | if (ul.isEmpty) {
45 | SearchResult(newsSource, key, now, 0, Nil)
46 | } else {
47 | val newsItems = ul.select("li.res-list").asScala.map(parseItem)
48 | val countText = doc.select("div#page").select("span.nums").text()
49 | val count =
50 | try {
51 | """\d+""".r.findAllMatchIn(countText).mkString.toInt
52 | } catch {
53 | case e: Exception =>
54 | logger.warn("count < 1")
55 | newsItems.size
56 | }
57 | SearchResult(newsSource, key, now, count, newsItems)
58 | }
59 | }
60 | }
61 | }
62 |
63 | object HaosouNews {
64 | val SEARCH_SITE = "http://news.haosou.com"
65 |
66 | def searchUrl(key: String) = SEARCH_SITE + "/ns?q=%s".format(URLEncoder.encode(key, Utils.CHARSET.name()))
67 |
68 | }
69 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/crawlers/NewsCrawler.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import com.typesafe.scalalogging.LazyLogging
4 | import crawler.module.news.NewsUtils
5 | import crawler.module.news.enums.ItemSource
6 | import crawler.module.news.model.{NewsPageItem, SearchResult}
7 | import crawler.util.Crawler
8 | import crawler.util.news.contextextractor.ContentExtractor
9 | import org.jsoup.helper.DataUtil
10 |
11 | import scala.concurrent.{ExecutionContext, Future}
12 |
13 | /**
14 | * News crawler base class
15 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03.
16 | */
17 | abstract class NewsCrawler(val newsSource: ItemSource.Value) extends Crawler with LazyLogging {
18 | /**
19 | * Fetch the search page
20 | *
21 | * @param key search keyword
22 | * @return
23 | */
24 | def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult]
25 |
26 | /**
27 | * Fetch a news detail page
28 | *
29 | * @param url page URL
30 | * @return
31 | */
32 | def fetchNewsItem(url: String)(implicit ec: ExecutionContext): Future[NewsPageItem] = {
33 | fetchPage(url).map { resp =>
34 | val in = resp.getResponseBodyAsStream
35 | val doc = DataUtil.load(in, null, NewsUtils.uriToBaseUri(url))
36 | val src = doc.toString
37 | val news = ContentExtractor.getNewsByDoc(doc)
38 | NewsPageItem(url, src, news.getContent)
39 | }
40 | }
41 |
42 | }
43 |
44 | object NewsCrawler {
45 | private var _newsCrawler = Map.empty[ItemSource.Value, NewsCrawler]
46 |
47 | def registerCrawler(source: ItemSource.Value, newsCrawler: NewsCrawler): Unit = {
48 | _newsCrawler = _newsCrawler + (source -> newsCrawler)
49 | }
50 |
51 | def getCrawler(source: ItemSource.Value): Option[NewsCrawler] = _newsCrawler.get(source)
52 |
53 | }
54 |
--------------------------------------------------------------------------------
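For orientation, a small assumed usage sketch of the registry above, mirroring what `NewsRoute` and `BaiduNews.main` already do; the object name, search keyword and timeout are placeholders:

```
import scala.concurrent.Await
import scala.concurrent.duration._

import crawler.SystemUtils
import crawler.module.news.crawlers.{BaiduNews, NewsCrawler}
import crawler.module.news.enums.ItemSource

object NewsCrawlerUsage extends App {

  import SystemUtils._
  import system.dispatcher // ExecutionContext required by fetchItemList

  // register a crawler once at startup (NewsRoute does the same)
  NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient))

  // look the crawler up by source and fetch the search result
  val result = NewsCrawler.getCrawler(ItemSource.baidu) match {
    case Some(crawler) => Await.result(crawler.fetchItemList("some company name"), 30.seconds)
    case None          => sys.error(s"no crawler registered for ${ItemSource.baidu}")
  }

  println(s"count=${result.count}, items fetched=${result.news.size}")
}
```

Crawlers are registered once at start-up and then looked up by `ItemSource` wherever a search has to be served.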
/module-news/src/main/scala/crawler/module/news/crawlers/SogouNews.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import java.net.URLEncoder
4 |
5 | import akka.util.Timeout
6 | import crawler.SystemUtils
7 | import crawler.module.news.enums.{ItemSource, SearchMethod}
8 | import crawler.module.news.model.{NewsItem, SearchResult}
9 | import crawler.util.http.HttpClient
10 | import crawler.util.time.TimeUtils
11 | import org.jsoup.Jsoup
12 | import org.jsoup.nodes.Element
13 |
14 | import scala.collection.JavaConverters._
15 | import scala.concurrent.{Await, ExecutionContext, Future}
16 | import scala.util.Try
17 |
18 | /**
19 | * Sogou news search
20 | *
21 | * @param httpClient
22 | */
23 | class SogouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.sogou) {
24 |
25 | private def parseItem(elem: Element) = {
26 | val header = elem.select("h3.pt")
27 | val title = header.select("a.pp")
28 | val source = header.select("cite") match {
29 | case s if s.isEmpty => Array("", "")
30 | case s => s.text().split(SogouNews.CITE_SPLIT_CHAR)
31 | }
32 | val summary = elem.select("div.ft").text().replace( """>>\d+?条相同新闻""", "")
33 |
34 | NewsItem(
35 | title.text(),
36 | title.attr("href"),
37 | source(0),
38 | Option(TimeUtils.toLocalDateTime(source.tail.mkString(" "))),
39 | summary)
40 | }
41 |
42 | /**
43 | * Fetch the search page
44 | *
45 | * @param key search keyword
46 | * @return
47 | */
48 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = {
49 | // val doc = fetchDocument(SogouCrawler.searchUrl(URLEncoder.encode(key, "UTF-8")))
50 | fetchPage(SogouNews.searchUrl(URLEncoder.encode(key, "UTF-8"))).map { resp =>
51 | val doc = Jsoup.parse(resp.getResponseBody, "http://news.sogou.com")
52 | val now = TimeUtils.now()
53 | // println(doc)
54 | val results = doc.select("div.results")
55 | if (results.isEmpty) {
56 | SearchResult(newsSource, key, now, 0, Nil)
57 | } else {
58 | val newsList = results.select("div.rb").asScala.map(parseItem)
59 | var count = Try( """\d+""".r.findAllMatchIn(doc.select("#pagebar_container").select("div.num").text()).mkString.toInt).getOrElse(0)
60 | if (count < 1) {
61 | logger.warn("count < 1")
62 | count = newsList.size
63 | }
64 | SearchResult(newsSource, key, now, count, newsList)
65 | }
66 | }
67 | }
68 | }
69 |
70 | object SogouNews {
71 | val REGEX = """\d+?条相同新闻""".r
72 | val CITE_SPLIT_CHAR = 160.toChar
73 |
74 | def searchUrl(key: String) = s"http://news.sogou.com/news?query=%22$key%22"
75 |
76 | ////////////////////////////////////////////////////////////////////////////
77 | // Manual test harness below
78 | ////////////////////////////////////////////////////////////////////////////
79 |
80 | def run(newsCrawler: NewsCrawler,
81 | key: String,
82 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = {
83 | val newsResult = newsCrawler.fetchItemList(key)
84 | if (SearchMethod.A == method) {
85 | newsResult
86 | } else {
87 | newsResult.flatMap { result =>
88 | val seqs = result.news.map { news =>
89 | // newsCrawler.fetchPage(news.url).map { resp =>
90 | // (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent)
91 | // }
92 | newsCrawler.fetchNewsItem(news.url)
93 | }
94 | val f = Future.sequence(seqs)
95 | f.map { pageItems =>
96 | val news = result.news.map { news =>
97 | pageItems.find(_.url == news.url) match {
98 | case Some(pageItem) =>
99 | news.copy(content = Option(pageItem.content))
100 | case None =>
101 | news
102 | }
103 | }
104 | result.copy(news = news)
105 | }
106 | }
107 | }
108 | }
109 |
110 | }
111 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/crawlers/WechatNews.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import java.net.URLEncoder
4 | import java.time.Instant
5 |
6 | import crawler.module.news.enums.ItemSource
7 | import crawler.module.news.model.{NewsItem, SearchResult}
8 | import crawler.util.Utils
9 | import crawler.util.http.HttpClient
10 | import crawler.util.time.TimeUtils
11 | import org.jsoup.Jsoup
12 | import org.jsoup.nodes.Element
13 |
14 | import scala.collection.JavaConverters._
15 | import scala.concurrent.duration._
16 | import scala.concurrent.{Await, ExecutionContext, Future}
17 |
18 | /**
19 | * Sogou Weixin (WeChat) article search
20 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10.
21 | */
22 | class WechatNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.wechat) {
23 | private def parseNewsItem(elem: Element) = {
24 | implicit val duration = 1.second
25 |
26 | try {
27 | val title = elem.select("h4")
28 | val footer = elem.select("div.s-p")
29 | val scriptStr = elem.select("script").last().text()
30 | val timeStr = """'(\d+?)'""".r.findFirstMatchIn(scriptStr).map(_.matched.replace("'", ""))
31 | val href = WechatNews.complateWeixinUrl(title.select("a").attr("href").trim)
32 | val url = Option(WechatNews.find302Location(href, requestHeaders())).getOrElse(href)
33 | NewsItem(
34 | title.text().trim,
35 | url,
36 | footer.select("a#weixin_account").attr("title"),
37 | Option(TimeUtils.toLocalDateTime(Instant.ofEpochSecond(timeStr.map(_.toLong).getOrElse(Instant.now().getEpochSecond)))),
38 | elem.select("p").text())
39 | } catch {
40 | case e: Exception =>
41 | logger.error(elem.toString)
42 | throw e
43 | }
44 | }
45 |
46 | /**
47 | * Fetch the search page
48 | *
49 | * @param key search keyword
50 | * @return
51 | */
52 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = {
53 | fetchPage(WechatNews.searchUrl(key)).map { response =>
54 | response.getHeaders.entrySet().asScala.foreach { case entry => println(entry.getKey + ": " + entry.getValue.asScala) }
55 |
56 | val now = TimeUtils.now()
57 | val doc = Jsoup.parse(response.getResponseBody(Utils.CHARSET.name()), "http://weixin.sogou.com")
58 | println(doc)
59 | val results = doc.select("div.wx-rb")
60 | if (!doc.select("#seccodeImage").isEmpty) {
61 | SearchResult(newsSource, key, now, 0, Nil, Some(doc.select("div.content-box").select("p.p2").text()))
62 | } else if (results.isEmpty) {
63 | SearchResult(newsSource, key, now, 0, Nil)
64 | } else {
65 | val newsItems = results.asScala.map(parseNewsItem)
66 | val countText = doc.select("resnum#scd_num").text().replace(",", "").trim
67 | val count =
68 | try {
69 | countText.toInt
70 | } catch {
71 | case e: Exception =>
72 | logger.warn("count < 1: " + countText, e)
73 | newsItems.size
74 | }
75 | SearchResult(newsSource, key, now, count, newsItems)
76 | }
77 | }
78 | }
79 |
80 | }
81 |
82 | object WechatNews {
83 | final val WEIXIN_SEARCH_PAGE = "http://weixin.sogou.com"
84 |
85 | def complateWeixinUrl(uri: String) =
86 | if (uri.startsWith("/")) WEIXIN_SEARCH_PAGE + uri else uri
87 |
88 | def searchUrl(key: String) =
89 | WEIXIN_SEARCH_PAGE + "/weixin?query=%s&type=2".format(URLEncoder.encode(key, Utils.CHARSET.name()))
90 |
91 | def find302Location(url: String, headers: Seq[(String, String)])(implicit duration: Duration) = {
92 | val client = HttpClient(false)
93 | try {
94 | val resp = Await.result(client.get(url).header(headers: _*).execute(), duration)
95 | resp.getHeader("Location")
96 | } catch {
97 | case e: Exception =>
98 | try {
99 | val response = Await.result(client.get(url).header(headers: _*).execute(), duration + 1.second)
100 | response.getHeader("Location")
101 | } catch {
102 | case e: Exception =>
103 | // do nothing
104 | null
105 | }
106 | } finally {
107 | client.close()
108 | }
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/enums/ItemSource.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.enums
2 |
3 | /**
4 | * News sources
5 | * Created by yangjing on 15-11-4.
6 | */
7 | object ItemSource extends Enumeration {
8 | val baidu = Value
9 | val sogou = Value
10 | val haosou = Value
11 | val court = Value
12 | val wechat = Value
13 |
14 | def withToNames(source: String): Traversable[Value] =
15 | if (source == null || source.isEmpty) {
16 | ItemSource.values
17 | } else {
18 | source.split(',').toSeq.collect {
19 | case s if ItemSource.values.exists(_.toString == s) =>
20 | ItemSource.withName(s)
21 | }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/enums/SearchMethod.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.enums
2 |
3 | /**
4 | * Search method
5 | * Created by yangjing on 15-11-4.
6 | */
7 | object SearchMethod extends Enumeration {
8 | // abstract only
9 | val A = Value
10 |
11 | // full text
12 | val F = Value
13 | }
14 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/model/NewsItem.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.model
2 |
3 | import java.time.LocalDateTime
4 |
5 | import com.datastax.driver.core.{UDTValue, UserType}
6 | import crawler.module.news.NewsJsonSupport._
7 | import crawler.util.time.TimeUtils
8 | import org.json4s.Extraction
9 |
10 | /**
11 | * News item
12 | * Created by yangjing on 15-11-3.
13 | */
14 | case class NewsItem(title: String,
15 | url: String,
16 | // news source (site)
17 | source: String,
18 | time: Option[LocalDateTime],
19 | // summary
20 | `abstract`: String,
21 | content: Option[String] = None,
22 | values: Seq[String] = Nil,
23 | error: Option[String] = None) {
24 | def jsonPretty = {
25 | val jv = Extraction.decompose(this)
26 | serialization.writePretty(jv)
27 | }
28 | }
29 |
30 | object NewsItem {
31 | def toUDTValue(userType: UserType, ni: NewsItem): UDTValue = {
32 | userType.newValue()
33 | .setString("title", ni.title)
34 | .setString("url", ni.url)
35 | .setString("source", ni.source)
36 | .setTimestamp("time", ni.time.map(TimeUtils.toDate).orNull)
37 | .setString("abstract", ni.`abstract`)
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/model/NewsPage.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.model
2 |
3 | import java.time.LocalDateTime
4 |
5 | /**
6 | * News page
7 | * Created by yangjing on 15-11-9.
8 | */
9 | case class NewsPage(url: String,
10 | title: String,
11 | source: String,
12 | time: Option[LocalDateTime],
13 | `abstract`: String,
14 | content: String)
15 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/model/NewsPageItem.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.model
2 |
3 | /**
4 | * News detail page content
5 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-05.
6 | * @param url page URL
7 | * @param src page HTML source
8 | // * @param title news title
9 | // * @param time publish time
10 | * @param content extracted article body
11 | */
12 | case class NewsPageItem(url: String,
13 | src: String,
14 | // title: String,
15 | // time: String,
16 | content: String)
17 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/model/SearchResult.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.model
2 |
3 | import java.time.LocalDateTime
4 |
5 | import crawler.module.news.enums.ItemSource
6 |
7 | /**
8 | * Search result
9 | * Created by yangjing on 15-11-3.
10 | */
11 | case class SearchResult(source: ItemSource.Value,
12 | key: String,
13 | time: LocalDateTime,
14 | count: Int,
15 | news: Seq[NewsItem],
16 | error: Option[String] = None)
17 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/NewsDBRepo.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service
2 |
3 | import java.time.LocalDateTime
4 |
5 | import com.datastax.driver.core.{PreparedStatement, Session, UDTValue}
6 | import com.typesafe.scalalogging.LazyLogging
7 | import crawler.SystemUtils
8 | import crawler.module.news.enums.{ItemSource, SearchMethod}
9 | import crawler.module.news.model.{NewsItem, NewsPage, SearchResult}
10 | import crawler.util.persist.CassandraPersists
11 | import crawler.util.time.TimeUtils
12 |
13 | import scala.collection.JavaConverters._
14 | import scala.collection.mutable
15 | import scala.concurrent.{ExecutionContextExecutor, Future}
16 |
17 | /**
18 | * News DB Service
19 | * Created by yangjing on 15-11-6.
20 | */
21 | class NewsDBRepo extends LazyLogging {
22 |
23 | val KEYSPACE = SystemUtils.crawlerConfig.getString("cassandra.keyspace")
24 | val cachePrepares = mutable.Map.empty[String, PreparedStatement]
25 |
26 | private def findNews(key: String,
27 | source: ItemSource.Value,
28 | method: SearchMethod.Value,
29 | time: LocalDateTime)(
30 | implicit ec: ExecutionContextExecutor
31 | ): Future[Seq[SearchResult]] = {
32 |
33 | logger.debug(s"key: $key, source: $source, method: $method, time: $time")
34 |
35 | CassandraPersists.using(KEYSPACE) { implicit session =>
36 | val stmt = getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? AND time > ?")
37 | val futureResultSet = session.executeAsync(stmt.bind(key, source.toString, TimeUtils.toDate(time)))
38 | val list = CassandraPersists.execute(futureResultSet) { rs =>
39 | rs.asScala.map { row =>
40 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt =>
41 | NewsItem(
42 | udt.getString("title"),
43 | udt.getString("url"),
44 | udt.getString("source"),
45 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))),
46 | udt.getString("abstract"))
47 | )
48 |
49 | val newsItemFuture = Future.sequence(news.map(news =>
50 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content)))))
51 |
52 | newsItemFuture.map { newsList =>
53 | SearchResult(
54 | ItemSource.withName(row.getString("source")),
55 | row.getString("key"),
56 | TimeUtils.toLocalDateTime(row.getTimestamp("time")),
57 | row.getInt("count"),
58 | newsList)
59 | }
60 | }.toList
61 | }
62 |
63 | list.flatMap(futures => Future.sequence(futures))
64 | }
65 | }
66 |
67 | def findNews(key: String,
68 | sources: Traversable[ItemSource.Value],
69 | method: SearchMethod.Value,
70 | time: Option[LocalDateTime])(
71 | implicit ec: ExecutionContextExecutor
72 | ): Future[List[SearchResult]] = {
73 |
74 | val futureList = CassandraPersists.using(KEYSPACE) { implicit session =>
75 | val pstmt =
76 | if (time.isEmpty) getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ?")
77 | else getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? AND time > ?")
78 |
79 | sources.flatMap { source =>
80 | val stmt =
81 | if (time.isEmpty) pstmt.bind(key, source.toString)
82 | else pstmt.bind(key, source.toString, TimeUtils.toDate(time.get))
83 |
84 | session.execute(stmt).asScala.map { row =>
85 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt =>
86 | NewsItem(
87 | udt.getString("title"),
88 | udt.getString("url"),
89 | udt.getString("source"),
90 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))),
91 | udt.getString("abstract"))
92 | )
93 |
94 | val newsItemFuture = Future.sequence(news.map(news =>
95 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content)))))
96 |
97 | newsItemFuture.map(list =>
98 | SearchResult(
99 | ItemSource.withName(row.getString("source")),
100 | row.getString("key"),
101 | TimeUtils.toLocalDateTime(row.getTimestamp("time")),
102 | row.getInt("count"),
103 | list)
104 | )
105 |
106 | }
107 | }.toList
108 |
109 | }
110 |
111 | Future.sequence(futureList)
112 | }
113 |
114 | def findOneNewsPageItem(url: String)(
115 | implicit session: Session, ec: ExecutionContextExecutor
116 | ): Future[Option[NewsPage]] = {
117 |
118 | val stmt = getPreparedStatement(session, "SELECT * FROM news_page WHERE url = ?")
119 | CassandraPersists.execute(session.executeAsync(stmt.bind(url))) { rs =>
120 | rs.one match {
121 | case null =>
122 | None
123 | case row =>
124 | Some(NewsPage(
125 | row.getString("url"),
126 | row.getString("title"),
127 | row.getString("source"),
128 | Option(TimeUtils.toLocalDateTime(row.getTimestamp("time"))),
129 | row.getString("abstract"),
130 | row.getString("content"))
131 | )
132 | }
133 | }
134 | }
135 |
136 | def saveToNewsPage(page: NewsPage): Unit = {
137 | CassandraPersists.using(KEYSPACE) { session =>
138 | val stmt = getPreparedStatement(session,
139 | "INSERT INTO news_page(url, title, source, time, abstract, content) VALUES(?, ?, ?, ?, ?, ?)")
140 | session.executeAsync(stmt.bind(
141 | page.url,
142 | page.title,
143 | page.source,
144 | page.time.map(TimeUtils.toDate).orNull,
145 | page.`abstract`,
146 | page.content))
147 | }
148 | }
149 |
150 | def saveToSearchPage(newsResult: SearchResult) = {
151 | // logger.debug(newsResult.news.mkString("\n"))
152 | logger.info(s"key: ${newsResult.key} found news: ${newsResult.count}, saved: ${newsResult.news.size}")
153 | CassandraPersists.using(KEYSPACE) { session =>
154 | val newsType = CassandraPersists.userType(KEYSPACE, "news_type")
155 | val stmt = getPreparedStatement(session, "INSERT INTO search_page(key, source, time, count, news) VALUES(?, ?, ?, ?, ?)")
156 | session.executeAsync(stmt.bind(
157 | newsResult.key,
158 | newsResult.source.toString,
159 | TimeUtils.toDate(newsResult.time),
160 | Integer.valueOf(newsResult.count),
161 | newsResult.news.map(n => NewsItem.toUDTValue(newsType, n)).asJava))
162 | }
163 | }
164 |
165 | private def getPreparedStatement(session: Session, sql: String): PreparedStatement = {
166 | // println("sql: " + sql)
167 | cachePrepares.getOrElse(sql, {
168 | val p = session.prepare(sql)
169 | cachePrepares.put(sql, p)
170 | p
171 | })
172 | }
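// getPreparedStatement (above) memoizes prepared statements in `cachePrepares`, keyed by the CQL
// string, so each distinct statement is prepared against Cassandra only once per repo instance.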
173 |
174 | }
175 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/NewsMaster.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service
2 |
3 | import akka.actor.Props
4 | import crawler.module.news.NewsUtils
5 | import crawler.module.news.commands.RequestSearchNews
6 | import crawler.module.news.service.actors.{NewsJob, PersistActor}
7 | import crawler.util.actors.MetricActor
8 |
9 | /**
10 | * News Supervisor
11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06.
12 | */
13 | class NewsMaster extends MetricActor {
14 | val persistActor = context.actorOf(PersistActor.props, PersistActor.actorName)
15 |
16 | override val metricReceive: Receive = {
17 | case RequestSearchNews(sources, msg) =>
18 | val doSender = sender()
19 | val newsJob = context.actorOf(NewsJob.props(sources, doSender), "news-" + NewsUtils.getIndent)
20 | newsJob ! msg
21 | }
22 | }
23 |
24 | object NewsMaster {
25 | val actorName = "news-master"
26 |
27 | def props = Props(new NewsMaster)
28 | }
29 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/NewsService.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service
2 |
3 | import akka.pattern.ask
4 | import crawler.module.news.commands.{RequestSearchNews, SearchNews}
5 | import crawler.module.news.enums.{ItemSource, SearchMethod}
6 | import crawler.module.news.model.{NewsItem, SearchResult}
7 | import crawler.util.time.TimeUtils
8 |
9 | import scala.concurrent.Future
10 | import scala.concurrent.duration._
11 |
12 | /**
13 | * News service
14 | * Created by yangjing on 15-11-3.
15 | */
16 | class NewsService {
17 |
18 | import crawler.SystemUtils._
19 | import system.dispatcher
20 |
21 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName)
22 | val dbRepo = new NewsDBRepo
23 |
24 | def fetchNewsApi(_key: String,
25 | sources: Traversable[ItemSource.Value],
26 | method: SearchMethod.Value,
27 | duration: FiniteDuration,
28 | forcedLatest: Boolean): Future[Seq[NewsItem]] = {
29 | fetchNews(_key, sources, method, duration, forcedLatest).
30 | map(_.flatMap(_.news))
31 | }
32 |
33 | def fetchNews(_key: String,
34 | sources: Traversable[ItemSource.Value],
35 | method: SearchMethod.Value,
36 | duration: FiniteDuration,
37 | forcedLatest: Boolean): Future[Seq[SearchResult]] = {
38 | val key = _key.trim
39 | val future = dbRepo.findNews(key, sources, method, if (forcedLatest) Some(TimeUtils.nowBegin()) else None)
40 |
41 | future.flatMap(results =>
42 | if (results.isEmpty) {
43 | val msg = RequestSearchNews(sources.toSeq, SearchNews(key, method, duration))
44 | // TODO cap the ask timeout at 5 minutes
45 | newsMaster.ask(msg)(5.minutes).mapTo[Seq[SearchResult]]
46 | } else {
47 | Future.successful(results)
48 | }
49 | )
50 | }
51 |
52 | }
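// Usage sketch (hypothetical caller; assumes the relevant NewsCrawler implementations have been
// registered via NewsCrawler.registerCrawler and an implicit ExecutionContext is in scope):
//
//   val newsService = new NewsService
//   newsService
//     .fetchNewsApi("阿里巴巴", Seq(ItemSource.baidu), SearchMethod.F, 30.seconds, forcedLatest = false)
//     .foreach(items => items.foreach(println))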
53 |
54 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/actors/ItemPageWorker.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import akka.actor.Props
4 | import crawler.module.news.commands.{ItemPageResult, StartFetchItemPage}
5 | import crawler.module.news.crawlers.NewsCrawler
6 | import crawler.module.news.enums.ItemSource
7 | import crawler.module.news.model.NewsItem
8 | import crawler.util.actors.MetricActor
9 |
10 | import scala.util.{Failure, Success}
11 |
12 | /**
13 | * Detail page worker
14 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06.
15 | */
16 | class ItemPageWorker(source: ItemSource.Value, newsItem: NewsItem) extends MetricActor {
17 |
18 | import context.dispatcher
19 |
20 | override val metricReceive: Receive = {
21 | case StartFetchItemPage =>
22 | val doSender = sender()
23 |
24 | NewsCrawler.getCrawler(source) match {
25 | case Some(crawler) =>
26 | crawler.fetchNewsItem(newsItem.url).onComplete {
27 | case Success(pageItem) =>
28 | logger.debug(s"${newsItem.url} content extracted OK")
29 | doSender ! ItemPageResult(Right(pageItem))
30 |
31 | case Failure(e) =>
32 | logger.warn(s"${newsItem.url} content extraction failed")
33 | e.printStackTrace()
34 | doSender ! ItemPageResult(Left(e.getLocalizedMessage))
35 | }
36 |
37 | case None =>
38 | doSender ! ItemPageResult(Left(s"Crawler $source does not exist, cannot fetch ${newsItem.url}."))
39 | }
40 | }
41 |
42 | }
43 |
44 | object ItemPageWorker {
45 |
46 | def props(source: ItemSource.Value, item: NewsItem) = Props(new ItemPageWorker(source, item))
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/actors/NewsJob.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import akka.actor.{ActorRef, PoisonPill, Props}
4 | import crawler.module.news.commands.{SearchNews, StartSearchNews}
5 | import crawler.module.news.enums.ItemSource
6 | import crawler.module.news.model.SearchResult
7 | import crawler.util.actors.MetricActor
8 |
9 | /**
10 | * NewsJob
11 | * On success replies with: Seq[SearchResult]
12 | * Created by yangjing on 15-11-5.
13 | */
14 | class NewsJob(sources: Seq[ItemSource.Value], reqSender: ActorRef) extends MetricActor {
15 | @volatile var _completeJobs = 0
16 | @volatile var _newsResults = List.empty[SearchResult]
17 |
18 | override val metricReceive: Receive = {
19 | case SearchNews(key, method, duration) =>
20 | sources.foreach { source =>
21 | val jobName = source.toString
22 | val jobActor = context.actorOf(NewsSourceJob.props(source, method, key, duration, self), jobName)
23 | jobActor ! StartSearchNews
24 | }
25 |
26 | case result: SearchResult =>
27 | _completeJobs += 1
28 | _newsResults ::= result
29 | if (sources.size == _completeJobs) {
30 | reqSender ! _newsResults
31 |
32 | // TODO move the timeout handling from NewsSourceJob up into NewsJob?
33 | self ! PoisonPill
34 | }
35 |
36 | }
37 | }
38 |
39 | object NewsJob {
40 | def props(sources: Seq[ItemSource.Value], reqSender: ActorRef) = Props(new NewsJob(sources, reqSender))
41 | }
42 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/actors/NewsSourceJob.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import akka.actor.{ActorRef, Cancellable, PoisonPill, Props}
4 | import crawler.module.news.commands._
5 | import crawler.module.news.enums.{ItemSource, SearchMethod}
6 | import crawler.module.news.model.SearchResult
7 | import crawler.module.news.service.NewsMaster
8 | import crawler.util.actors.MetricActor
9 | import crawler.util.time.TimeUtils
10 |
11 | import scala.concurrent.duration.FiniteDuration
12 |
13 | /**
14 | * News source job
15 | *
16 | * @param source search source
17 | * @param method search method
18 | * @param key search keyword
19 | * @param duration time limit; on expiry the client receives whatever news data has been fetched so far, while the children actors continue processing
20 | * @param reqSender requesting actor
21 | */
22 | class NewsSourceJob(source: ItemSource.Value,
23 | method: SearchMethod.Value,
24 | key: String,
25 | duration: FiniteDuration,
26 | reqSender: ActorRef) extends MetricActor {
27 |
28 | private val persistActor = context.actorSelection(context.system / NewsMaster.actorName / PersistActor.actorName)
29 | @volatile var _newsResult = SearchResult(source, "", TimeUtils.now(), 0, Nil)
30 | @volatile var _isTimeout: Boolean = false
31 | @volatile var _notCompleteItemPageActorNames = Seq.empty[String]
32 | @volatile var _cancelableSchedule: Cancellable = _
33 |
34 | import context.dispatcher
35 |
36 | override def metricPreStart(): Unit = {
37 | // schedule the timeout
38 | _cancelableSchedule = context.system.scheduler.scheduleOnce(duration, self, SearchTimeout)
39 | }
40 |
41 | override def metricPostStop(): Unit = {
42 | if (!_cancelableSchedule.isCancelled) {
43 | _cancelableSchedule.cancel()
44 | }
45 |
46 | if (null != _newsResult && _newsResult.count > 0) {
47 | persistActor ! _newsResult
48 | } else {
49 | logger.warn(s"${self.path} [$key] no news data was fetched: ${_newsResult.error}")
50 | }
51 | }
52 |
53 | override val metricReceive: Receive = {
54 | case s@StartSearchNews =>
55 | val searchPage = context.actorOf(SearchPageWorker.props(source, key), "page")
56 | searchPage ! StartFetchSearchPage
57 |
58 | case SearchPageResult(newsResult) =>
59 | _newsResult = newsResult
60 | method match {
61 | case SearchMethod.F if _newsResult.count > 0 => // need to fetch the full detail pages
62 | _notCompleteItemPageActorNames = newsResult.news.zipWithIndex.map { case (item, idx) =>
63 | val childName = "item-" + idx
64 | val itemPage = context.actorOf(ItemPageWorker.props(source, item), childName)
65 | itemPage ! StartFetchItemPage
66 | childName
67 | }
68 |
69 | case _ => // SearchMethod.S => // only fetch summaries
70 | if (!_isTimeout) {
71 | reqSender ! _newsResult
72 | }
73 | self ! PoisonPill
74 | }
75 |
76 | case ItemPageResult(result) =>
77 | val doSender = sender()
78 | println(doSender.path)
79 | _notCompleteItemPageActorNames = _notCompleteItemPageActorNames.filterNot(_ == doSender.path.name)
80 | result match {
81 | case Left(errMsg) =>
82 | // TODO failed to parse the news detail page!
83 | logger.error(errMsg)
84 |
85 | case Right(pageItem) =>
86 | // update result.news
87 | val news = _newsResult.news.map {
88 | case oldItem if oldItem.url == pageItem.url =>
89 | oldItem.copy(content = Option(pageItem.content))
90 |
91 | case oldItem =>
92 | oldItem
93 | }
94 |
95 | _newsResult = _newsResult.copy(news = news)
96 | }
97 |
98 | if (_notCompleteItemPageActorNames.isEmpty) {
99 | if (!_isTimeout) {
100 | reqSender ! _newsResult
101 | }
102 | self ! PoisonPill
103 | }
104 |
105 | case SearchTimeout =>
106 | _isTimeout = true
107 |
108 | // return the data collected so far to the caller; the actual crawl continues in the background
109 | reqSender ! _newsResult //Left(new AskTimeoutException("search timed out"))
110 |
111 | case SearchPageFailure(e) =>
112 | logger.warn(self.path + " ", e)
113 | if (!_isTimeout) {
114 | reqSender ! SearchResult(source, key, TimeUtils.now(), 0, Nil, Some(e.getLocalizedMessage))
115 | }
116 | self ! PoisonPill
117 | }
118 |
119 | }
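// Message flow of a NewsSourceJob, as implemented above:
//   StartSearchNews   -> spawn a SearchPageWorker and request the search result page
//   SearchPageResult  -> for SearchMethod.F spawn one ItemPageWorker per news item, otherwise reply and stop
//   ItemPageResult    -> merge the fetched content into _newsResult; reply and stop once all workers reported
//   SearchTimeout     -> reply with whatever has been collected so far; child workers keep running
//   SearchPageFailure -> reply with an empty SearchResult carrying the error message and stop
// On stop, metricPostStop hands the result to the PersistActor whenever any news was found (count > 0).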
120 |
121 | object NewsSourceJob {
122 | def props(source: ItemSource.Value,
123 | method: SearchMethod.Value,
124 | key: String,
125 | duration: FiniteDuration,
126 | reqSender: ActorRef) =
127 | Props(new NewsSourceJob(source, method, key, duration, reqSender))
128 | }
129 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/actors/PersistActor.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import akka.actor.Props
4 | import crawler.module.news.model.{NewsPage, SearchResult}
5 | import crawler.module.news.service.NewsDBRepo
6 | import crawler.util.actors.MetricActor
7 |
8 | /**
9 | * Persistence actor
10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06.
11 | */
12 | class PersistActor extends MetricActor {
13 | val dbRepo = new NewsDBRepo
14 |
15 | override val metricReceive: Receive = {
16 | case newsResult: SearchResult =>
17 | dbRepo.saveToSearchPage(newsResult)
18 |
19 | newsResult.news.foreach { item =>
20 | val page = NewsPage(item.url, item.title, item.source, item.time, item.`abstract`, item.content.getOrElse(""))
21 | dbRepo.saveToNewsPage(page)
22 | }
23 | }
24 |
25 | }
26 |
27 | object PersistActor {
28 | val BATCH_SIZE = 20
29 | val actorName = "persist"
30 |
31 | def props = Props(new PersistActor)
32 | }
33 |
--------------------------------------------------------------------------------
/module-news/src/main/scala/crawler/module/news/service/actors/SearchPageWorker.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import akka.actor.Props
4 | import crawler.module.news.commands.{SearchPageFailure, SearchPageResult, StartFetchSearchPage}
5 | import crawler.module.news.crawlers.NewsCrawler
6 | import crawler.module.news.enums.ItemSource
7 | import crawler.util.actors.MetricActor
8 |
9 | import scala.util.{Failure, Success}
10 |
11 | /**
12 | * Search page worker
13 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06.
14 | */
15 | class SearchPageWorker(source: ItemSource.Value, key: String) extends MetricActor {
16 |
17 | import context.dispatcher
18 |
19 | override val metricReceive: Receive = {
20 | case StartFetchSearchPage =>
21 | val doSender = sender()
22 |
23 | NewsCrawler.getCrawler(source) match {
24 | case Some(crawler) =>
25 | crawler.fetchItemList(key).onComplete {
26 | case Success(result) =>
27 | doSender ! SearchPageResult(result)
28 | stop()
29 |
30 | case Failure(e) =>
31 | doSender ! SearchPageFailure(e)
32 | stop()
33 | }
34 |
35 | case None =>
36 | doSender ! SearchPageFailure(new RuntimeException(s"Crawler $source does not exist"))
37 | stop()
38 | }
39 | }
40 |
41 | private def stop(): Unit = context.stop(self)
42 | }
43 |
44 | object SearchPageWorker {
45 |
46 | def props(source: ItemSource.Value, name: String) = Props(new SearchPageWorker(source, name))
47 |
48 | }
--------------------------------------------------------------------------------
/module-news/src/test/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |
4 | <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
5 | <encoder><pattern>%date - [%level] - from %logger in %thread %n%message%n%xException%n</pattern></encoder>
6 | </appender>
7 |
8 | <root level="DEBUG">
9 | <appender-ref ref="CONSOLE"/>
10 | </root>
11 |
12 | </configuration>
13 |
14 |
15 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/crawlers/BaiduNewsTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import akka.util.Timeout
4 | import crawler.testsuite.ServiceSpec
5 | import crawler.util.http.HttpClient
6 |
7 | import scala.concurrent.Await
8 | import scala.concurrent.duration._
9 |
10 | /**
11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03.
12 | */
13 | class BaiduNewsTest extends ServiceSpec {
14 |
15 | implicit val timeout = Timeout(30.seconds)
16 |
17 | "BaiduNewsTest" should {
18 |
19 | "fetchNewsList" in {
20 | val baidu = new BaiduNews(HttpClient())
21 | val result = Await.result(baidu.fetchItemList("阿里巴巴"), timeout.duration)
22 | result.news.foreach(println)
23 | println(result.source + " " + result.key)
24 | println(result.news.size)
25 | result.news must not be empty
26 | }
27 |
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/crawlers/CourtNewsTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import akka.util.Timeout
4 | import crawler.testsuite.ServiceSpec
5 | import crawler.util.http.HttpClient
6 |
7 | import scala.concurrent.Await
8 | import scala.concurrent.duration._
9 |
10 | class CourtNewsTest extends ServiceSpec {
11 |
12 | val timeout = Timeout(30.seconds)
13 |
14 | "CourtNewsTest" should {
15 | "fetchNewsList" in {
16 | val court = new CourtNews(HttpClient())
17 | val result = Await.result(court.fetchItemList("重庆"), timeout.duration)
18 | result.news.foreach(println)
19 | println(result.key)
20 | result.news must not be empty
21 | }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/crawlers/HaosouNewsTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import akka.util.Timeout
4 | import crawler.testsuite.ServiceSpec
5 | import crawler.util.http.HttpClient
6 |
7 | import scala.concurrent.Await
8 | import scala.concurrent.duration._
9 |
10 | /**
11 | * Created by yangjing on 15-11-9.
12 | */
13 | class HaosouNewsTest extends ServiceSpec {
14 |
15 | implicit val timeout = Timeout(30.seconds)
16 |
17 | "HaosouCrawlerTest" should {
18 |
19 | "fetchNewsList" in {
20 | val haosou = new HaosouNews(HttpClient())
21 | val result = Await.result(haosou.fetchItemList("誉存科技"), timeout.duration)
22 | result.news.foreach(println)
23 | println(result.source + " " + result.key)
24 | result.news must not be empty
25 | }
26 |
27 | }
28 |
29 | override implicit def patienceConfig: PatienceConfig = super.patienceConfig
30 | }
31 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/crawlers/WechatNewsTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.crawlers
2 |
3 | import akka.util.Timeout
4 | import crawler.testsuite.ServiceSpec
5 | import crawler.util.http.HttpClient
6 |
7 | import scala.concurrent.Await
8 | import scala.concurrent.duration._
9 |
10 | /**
11 | * Wechat News Test
12 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10.
13 | */
14 | class WechatNewsTest extends ServiceSpec {
15 |
16 | implicit val timeout = Timeout(30.seconds)
17 | "WechatNewsTest" should {
18 |
19 | "fetchNewsList" in {
20 | val wechat = new WechatNews(HttpClient())
21 | val f = wechat.fetchItemList("成都念念科技有限公司")
22 | val result = Await.result(f, timeout.duration)
23 | result.news.foreach(println)
24 | println(result.count + " " + result.key)
25 | result.news must not be empty
26 | }
27 |
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/service/NewsDBRepoTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import crawler.module.news.enums.{ItemSource, SearchMethod}
6 | import crawler.testsuite.ServiceSpec
7 | import crawler.util.time.TimeUtils
8 |
9 | class NewsDBRepoTest extends ServiceSpec {
10 |
11 | "NewsDBRepoTest" should {
12 | val dbRepo = new NewsDBRepo
13 |
14 | "findNews" in {
15 | val result = dbRepo.findNews("阿里巴巴", Seq(ItemSource.baidu), SearchMethod.F, Some(TimeUtils.nowBegin()))
16 | val list = result.futureValue
17 | println(list)
18 | list must not be empty
19 |
20 | TimeUnit.SECONDS.sleep(5)
21 | }
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/module-news/src/test/scala/crawler/module/news/service/actors/NewsJobMasterTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.news.service.actors
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import akka.pattern.ask
6 | import akka.util.Timeout
7 | import crawler.SystemUtils
8 | import crawler.module.news.commands.{SearchNews, RequestSearchNews}
9 | import crawler.module.news.crawlers.{BaiduNews, NewsCrawler}
10 | import crawler.module.news.enums.{SearchMethod, ItemSource}
11 | import crawler.module.news.model.SearchResult
12 | import crawler.module.news.service.NewsMaster
13 | import crawler.testsuite.ServiceSpec
14 |
15 | import scala.concurrent.duration._
16 |
17 | /**
18 | * NewsMasterTest
19 | * Created by yangjing on 15-11-5.
20 | */
21 | class NewsJobMasterTest extends ServiceSpec {
22 |
23 | implicit val timeout = Timeout(60.seconds)
24 |
25 | "NewsMasterTest" should {
26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient))
27 |
28 | "news-master" in {
29 | val sources = Seq(ItemSource.baidu)
30 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName)
31 | val msg = RequestSearchNews(sources, SearchNews("杭州誉存科技有限公司", SearchMethod.F, 3.seconds))
32 |
33 | val f = (newsMaster ? msg).mapTo[Seq[SearchResult]]
34 |
35 | f onSuccess { case list =>
36 | list.foreach(println)
37 | list.size mustBe 1
38 | }
39 |
40 | f onFailure { case e =>
41 | println("Failure: " + e)
42 | }
43 |
44 | TimeUnit.SECONDS.sleep(20)
45 | }
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/BaiduSite.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site
2 |
3 | import java.net.URLEncoder
4 | import java.time.LocalDateTime
5 | import java.util.concurrent.TimeUnit
6 |
7 | import com.typesafe.scalalogging.LazyLogging
8 | import crawler.module.site.model.{SearchRequest, SiteItem, SiteResult}
9 | import crawler.util.Crawler
10 | import crawler.util.http.HttpClient
11 | import crawler.util.time.TimeUtils
12 | import org.jsoup.Jsoup
13 | import org.jsoup.nodes.Element
14 |
15 | import scala.collection.JavaConverters._
16 | import scala.concurrent.{ExecutionContext, Future, Promise}
17 | import scala.util.{Failure, Success}
18 |
19 | /**
20 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
21 | */
22 | class BaiduSite(val httpClient: HttpClient,
23 | searchRequest: SearchRequest) extends Crawler with LazyLogging {
24 |
25 | import BaiduSite._
26 |
27 | override protected val defaultHeaders: Array[Seq[(String, String)]] =
28 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider"))
29 |
30 | val values = searchRequest.params.map(_.value)
31 |
32 | /**
33 | * Fetch the search result pages
34 | *
35 | * @return
36 | */
37 | def fetchItemList()(implicit ec: ExecutionContext): Future[SiteResult] = {
38 | val promise = Promise[Seq[SiteItem]]()
39 | val key = searchRequest.toParam
40 |
41 | val url = BAIDU_SITE_BASE_URL.format(URLEncoder.encode(key, "UTF-8"))
42 | logger.info(s"key: $key, url: $url")
43 |
44 | val newsResultsFuture = fetchPage(url).flatMap { resp =>
45 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BAIDU_SITE_HOST).getElementById("wrapper_wrapper")
46 | val now = TimeUtils.now()
47 | val contentNone = doc.select(".content_none")
48 |
49 | if (!contentNone.isEmpty) {
50 | promise.success(Nil)
51 | Future.successful(SiteResult(ITEM_SOURCE, key, now, 0, Nil))
52 | } else {
53 | val wrapper = doc
54 | val countText = wrapper
55 | .select(".head_nums_cont_outer.OP_LOG")
56 | .select(".nums")
57 | .text()
58 | val count =
59 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt
60 |
61 | val itemDiv = doc.getElementById("content_left")
62 | val itemResults = itemDiv.select(".result.c-container").asScala
63 |
64 | val pages = doc.select("#page a").asScala
65 | val newsItemFutures = pages.take(PAGE_LIMIT - 1).map { page =>
66 | TimeUnit.MILLISECONDS.sleep(100)
67 | fetchPageLinks(BAIDU_SITE_HOST + page.attr("href"))
68 | }
69 |
70 | Future.sequence(newsItemFutures).map(_.flatten).onComplete {
71 | case Success(list) =>
72 | promise.success(list)
73 | case Failure(e) =>
74 | e.printStackTrace()
75 | promise.success(Nil)
76 | }
77 |
78 | Future.sequence(itemResults.map(parseSiteItem))
79 | .map(items => SiteResult(ITEM_SOURCE, key, now, count, items))
80 | }
81 | }
82 |
83 | for {
84 | newsResult <- newsResultsFuture
85 | newsItems <- promise.future
86 | } yield {
87 | newsResult.copy(items = newsResult.items ++ newsItems)
88 | }
89 | }
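// fetchItemList (above) works in two passes: the first result page is parsed for the total count and
// its items, while the pagination links ("#page a", at most PAGE_LIMIT - 1 of them) are fetched
// concurrently and collected through `promise`; both item lists are merged in the final for-comprehension.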
90 |
91 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[SiteItem]] = {
92 | fetchPage(url).flatMap { resp =>
93 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduSite.BAIDU_SITE_HOST)
94 | if (doc.getElementById("content_none") != null) {
95 | Future.successful(Nil)
96 | } else {
97 | val itemDiv = doc.getElementById("content_left")
98 | val itemResults = itemDiv.select(".result.c-container").asScala
99 | val futures = itemResults.map(parseSiteItem)
100 | Future.sequence(futures)
101 | }
102 | }
103 | }
104 |
105 | def parseSiteItem(elem: Element)(implicit ec: ExecutionContext): Future[SiteItem] = {
106 | val link = elem.select(".t").select("a").first()
107 | val href = link.attr("href")
108 |
109 | extractPageUrl(href).map { url =>
110 | val title = link.text()
111 |
112 | val sourceHostDesc = elem.select(".f13 a").first().text()
113 | val source = sourceHostDesc.take(sourceHostDesc.indexOf('/'))
114 |
115 | val abstractElem = elem.select(".c-abstract")
116 | val summary = abstractElem.asScala.filterNot(e => e.attr("class").contains("newTimeFactor_before_abs")).map(_.text()).mkString
117 | val time = BaiduSite.dealTime(abstractElem.select(".newTimeFactor_before_abs").text())
118 |
119 | SiteItem(title, url, source, time, summary, values)
120 | }
121 | }
122 |
123 | def extractPageUrl(href: String): Future[String] = {
124 | implicit val ec = ExecutionContext.Implicits.global
125 |
126 | if (searchRequest.followUrl) {
127 | HttpClient.find302Location(httpClient, href, requestHeaders()).map(v => if (v == null) href else v)
128 | } else {
129 | Future.successful(href)
130 | }
131 | }
132 |
133 | }
134 |
135 | object BaiduSite {
136 | // fetch only the first 5 result pages
137 | val PAGE_LIMIT = 5
138 |
139 | val BAIDU_SITE_BASE_URL = "https://www.baidu.com/s?wd=%s&rsv_spt=1&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_n=2&rsv_sug3=1"
140 |
141 | val BAIDU_SITE_HOST = "https://www.baidu.com"
142 |
143 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r
144 |
145 | val ITEM_SOURCE = "baiduSite"
146 |
147 | def dealTime(timeStr: String): Option[LocalDateTime] = timeStr.substring(0, timeStr.indexOf('日') + 1) match {
148 | case TIME_PATTERN(year, month, day) => Some(LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0))
149 | case _ => None
150 | }
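// Expected behaviour (illustrative values):
//   dealTime("2016年1月18日 ...")  // Some(LocalDateTime.of(2016, 1, 18, 0, 0))
//   any string without a leading "yyyy年M月d日" date falls through to None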
151 |
152 | }
153 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/QueryCond.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site
2 |
3 | /**
4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
5 | */
6 | object QueryCond extends Enumeration {
7 | val - = Value("-")
8 | val + = Value("+")
9 | }
10 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/SearchSyntax.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site
2 |
3 | /**
4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-20.
5 | */
6 | object SearchSyntax {
7 | final val Intitle = "intitle"
8 | final val Insite = "insite"
9 | final val Inurl = "inurl"
10 | }
11 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/model/SearchRequest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site.model
2 |
3 | import crawler.module.site.QueryCond
4 |
5 | /**
6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
7 | */
8 | case class SearchRequest(params: Seq[SearchParam], followUrl: Boolean = true) {
9 |
10 | def toParam = params.map(_.toParam).mkString(" ")
11 |
12 | }
13 |
14 | case class SearchParam(value: String,
15 | syntax: Option[String] = None,
16 | cond: Option[QueryCond.Value] = None,
17 | filetypeDoc: Seq[String] = Nil,
18 | strict: Boolean = true) {
19 |
20 | def toParam =
21 | syntax.map(v => if (strict) s"""$v:"$value"""" else s"$v:$value") orElse
22 | cond.map(v => v + value) getOrElse
23 | value
24 |
25 | }
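// Rendering examples (illustrative, following the rules in toParam above):
//   SearchParam("晋渝地产", Some(SearchSyntax.Intitle)).toParam                // intitle:"晋渝地产"
//   SearchParam("失信", Some(SearchSyntax.Intitle), strict = false).toParam    // intitle:失信
//   SearchRequest(params).toParam joins each rendered param with a single space.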
26 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/model/SiteItem.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site.model
2 |
3 | import java.time.LocalDateTime
4 |
5 | import org.json4s.Extraction
6 |
7 | /**
8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22.
9 | */
10 | case class SiteItem(title: String,
11 | url: String,
12 | // source site
13 | source: String,
14 | time: Option[LocalDateTime],
15 | // summary
16 | `abstract`: String,
17 | values: Seq[String] = Nil) {
18 |
19 | def jsonPretty = {
20 | import crawler.util.http.TJsonSupport._
21 | val jv = Extraction.decompose(this)
22 | serialization.writePretty(jv)
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/module-site-search/src/main/scala/crawler/module/site/model/SiteResult.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site.model
2 |
3 | import java.time.LocalDateTime
4 |
5 | /**
6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22.
7 | */
8 | case class SiteResult(source: String,
9 | key: String,
10 | time: LocalDateTime,
11 | count: Int,
12 | items: Seq[SiteItem],
13 | error: Option[String] = None)
14 |
--------------------------------------------------------------------------------
/module-site-search/src/test/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |
4 | <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
5 | <encoder><pattern>%date - [%level] - from %logger in %thread %n%message%n%xException%n</pattern></encoder>
6 | </appender>
7 |
8 | <root level="DEBUG">
9 | <appender-ref ref="CONSOLE"/>
10 | </root>
11 |
12 | </configuration>
13 |
14 |
15 |
--------------------------------------------------------------------------------
/module-site-search/src/test/scala/crawler/module/site/BaiduSiteTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.module.site
2 |
3 | import akka.util.Timeout
4 | import crawler.module.site.model.{SearchParam, SearchRequest}
5 | import crawler.testsuite.ServiceSpec
6 | import crawler.util.http.HttpClient
7 |
8 | import scala.concurrent.Await
9 | import scala.concurrent.duration._
10 |
11 | /**
12 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
13 | */
14 | class BaiduSiteTest extends ServiceSpec {
15 |
16 | implicit val timeout = Timeout(30.seconds)
17 |
18 | "BaiduSiteTest" should {
19 |
20 | "fetchItemList" in {
21 | val requestParams = SearchRequest(
22 | SearchParam("晋渝地产", Some(SearchSyntax.Intitle)) ::
23 | // SearchParam("阿里巴巴kakakakaak", Some(SearchSyntax.Intitle)) ::
24 | // SearchParam("失信", syntax = Some(SearchSyntax.Intitle), strict = false) ::
25 | Nil
26 | )
27 | val baidu = new BaiduSite(HttpClient(), requestParams)
28 |
29 | val key = requestParams.toParam
30 | val f = baidu.fetchItemList()
31 | val result = Await.result(f, timeout.duration)
32 | result.items.foreach(v => println(v.jsonPretty))
33 | println(result.items.size)
34 | result.items must not be empty
35 | }
36 |
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/project/Build.scala:
--------------------------------------------------------------------------------
1 | import _root_.sbt.Keys._
2 | import _root_.sbt._
3 | import com.typesafe.sbt.SbtNativePackager.{Linux, Debian}
4 | import com.typesafe.sbt.packager.Keys._
5 | import com.typesafe.sbt.packager.archetypes.JavaServerAppPackaging
6 | import com.typesafe.sbt.packager.universal.UniversalPlugin.autoImport._
7 | import sbtassembly.AssemblyKeys._
8 | import sbtassembly.{MergeStrategy, PathList}
9 |
10 | object Build extends Build {
11 |
12 | import BuildSettings._
13 |
14 | val DependsConfigure = "test->test;compile->compile"
15 |
16 | override lazy val settings = super.settings :+ {
17 | shellPrompt := (s => Project.extract(s).currentProject.id + " > ")
18 | }
19 |
20 | lazy val root = Project("crawler-high-search", file("."))
21 | .aggregate(
22 | appApi,
23 | crawlerSiteSearch,
24 | moduleSiteSearch, moduleNews,
25 | util)
26 |
27 | ///////////////////////////////////////////////////////////////
28 | // projects
29 | ///////////////////////////////////////////////////////////////
30 | lazy val packageDebianProd = taskKey[File]("creates deb-prod package")
31 | lazy val appApi = Project("app-api", file("app-api"))
32 | .enablePlugins(JavaServerAppPackaging)
33 | .dependsOn(moduleSiteSearch % DependsConfigure, moduleNews % DependsConfigure, util % DependsConfigure)
34 | .settings(basicSettings: _*)
35 | .settings(
36 | description := "app-api",
37 |
38 | packageDescription := "An advanced asynchronous, multi-threaded real-time crawler API",
39 | mainClass in Compile := Some("crawler.app.Main"),
40 | maintainer in Linux := "Jing Yang <yangbajing@gmail.com>",
41 | packageSummary in Linux := "Crawler High Search API",
42 | daemonUser in Linux := "nobody",
43 | bashScriptConfigLocation := Some("${app_home}/../conf/jvmopts"),
44 | bashScriptExtraDefines += """addJava "-Dlogback.configurationFile=${app_home}/../conf/logback.xml"""",
45 |
46 | // |; bashScriptExtraDefines := Seq("addJava \"-Dconfig.file=${app_home}/../conf/application.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\"")
47 | addCommandAlias("packageProd",
48 | """; clean
49 | |; bashScriptExtraDefines += "addJava \"-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\""
50 | |; packageDebianProd
51 | """.stripMargin),
52 | packageDebianProd := {
53 | bashScriptExtraDefines += """addJava "-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml""""
54 | val output = baseDirectory.value / "package" / "deb-prod.deb"
55 | val debianFile = (packageBin in Debian).value
56 | IO.move(debianFile, output)
57 | output
58 | },
59 |
60 | // assemblyJarName in assembly := "crawler-app.jar",
61 | // mappings in Universal <<= (mappings in Universal, assembly in Compile) map { (mappings, fatJar) =>
62 | // val filtered = mappings filter { case (file, name) => !name.endsWith(".jar") }
63 | // filtered :+ (fatJar -> ("lib/" + fatJar.getName))
64 | // },
65 | // test in assembly := {},
66 | // assemblyMergeStrategy in assembly := {
67 | // case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.discard
68 | // case x =>
69 | // val oldStrategy = (assemblyMergeStrategy in assembly).value
70 | // oldStrategy(x)
71 | // },
72 |
73 | libraryDependencies ++= Seq(
74 | _akkaHttp)
75 | )
76 |
77 | lazy val crawlerSiteSearch = Project("crawler-site-search", file("crawler-site-search"))
78 | .dependsOn(moduleSiteSearch % DependsConfigure, util % DependsConfigure)
79 | .settings(basicSettings: _*)
80 | .settings(
81 | description := "crawler-site-search",
82 | libraryDependencies ++= Seq(
83 | _activemqSTOMP,
84 | _cassandraDriverCore,
85 | _mongoScala)
86 | )
87 |
88 | lazy val moduleSiteSearch = Project("module-site-search", file("module-site-search"))
89 | .dependsOn(util % DependsConfigure)
90 | .settings(basicSettings: _*)
91 | .settings(
92 | description := "module-site-search"
93 | )
94 |
95 | lazy val moduleNews = Project("module-news", file("module-news"))
96 | .dependsOn(util % DependsConfigure)
97 | .settings(basicSettings: _*)
98 | .settings(
99 | description := "module-news",
100 | libraryDependencies ++= Seq(
101 | _cassandraDriverCore,
102 | _akkaActor)
103 | )
104 |
105 | lazy val util = Project("util", file("util"))
106 | .settings(basicSettings: _*)
107 | .settings(
108 | description := "util",
109 | libraryDependencies ++= Seq(
110 | _activemqSTOMP % "provided",
111 | _cassandraDriverCore % "provided",
112 | _mongoScala % "provided",
113 | _akkaHttp % "provided",
114 | _akkaStream,
115 | _json4sJackson,
116 | _json4sExt,
117 | _scalaLogging,
118 | _asyncHttpClient,
119 | _jsoup,
120 | _akkaActor,
121 | _akkaSlf4j,
122 | _logbackClassic)
123 | )
124 |
125 | }
126 |
--------------------------------------------------------------------------------
/project/BuildSettings.scala:
--------------------------------------------------------------------------------
1 | import sbt.Keys._
2 | import sbt._
3 |
4 | object BuildSettings {
5 |
6 | lazy val basicSettings = Seq(
7 | version := "0.0.1",
8 | homepage := Some(new URL("https://github.com/yangbajing/crawler-service")),
9 | organization := "cn.socialcredits.crawler",
10 | organizationHomepage := Some(new URL("https://github.com/yangbajing/crawler-service")),
11 | startYear := Some(2015),
12 | scalaVersion := "2.11.7",
13 | scalacOptions := Seq(
14 | "-encoding", "utf8",
15 | "-unchecked",
16 | "-feature",
17 | "-deprecation"
18 | ),
19 | javacOptions := Seq(
20 | "-encoding", "utf8",
21 | "-Xlint:unchecked",
22 | "-Xlint:deprecation"
23 | ),
24 | resolvers ++= Seq(
25 | "snapshots" at "http://oss.sonatype.org/content/repositories/snapshots",
26 | "releases" at "http://oss.sonatype.org/content/repositories/releases",
27 | "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/",
28 | "Typesafe Snapshots" at "http://repo.typesafe.com/typesafe/snapshots/"),
29 | libraryDependencies ++= Seq(
30 | _scalaReflect,
31 | _scalatest
32 | ),
33 | offline := true,
34 | fork := true
35 | )
36 |
37 | lazy val noPublishing = Seq(
38 | publish :=(),
39 | publishLocal :=()
40 | )
41 |
42 | val verAkka = "2.3.14"
43 | val _akkaActor = "com.typesafe.akka" %% "akka-actor" % verAkka
44 | val _akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % verAkka
45 |
46 | lazy val _scalaReflect = "org.scala-lang" % "scala-reflect" % "2.11.7"
47 |
48 | val verAkkaHttp = "2.0.2"
49 | lazy val _akkaStream = ("com.typesafe.akka" %% "akka-stream-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor")
50 | lazy val _akkaHttp = ("com.typesafe.akka" %% "akka-http-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor")
51 |
52 | lazy val _scalatest = "org.scalatest" %% "scalatest" % "2.2.5" % "test"
53 |
54 | lazy val _scalaLogging = ("com.typesafe.scala-logging" %% "scala-logging" % "3.1.0").exclude("org.scala-lang", "scala-reflect").exclude("org.slf4j", "slf4j-api")
55 |
56 | lazy val _mongoScala = ("org.mongodb.scala" %% "mongo-scala-driver" % "1.1.0").exclude("com.typesafe.akka", "akka-actor")
57 |
58 | lazy val varJson4s = "3.3.0"
59 | lazy val _json4sJackson = "org.json4s" %% "json4s-jackson" % varJson4s
60 | lazy val _json4sExt = "org.json4s" %% "json4s-ext" % varJson4s
61 |
62 | lazy val _jsoup = "org.jsoup" % "jsoup" % "1.8.3"
63 |
64 | lazy val _asyncHttpClient = ("com.ning" % "async-http-client" % "1.9.31").exclude("io.netty", "netty")
65 |
66 | lazy val _logbackClassic = "ch.qos.logback" % "logback-classic" % "1.1.3"
67 |
68 | lazy val _cassandraDriverCore = "com.datastax.cassandra" % "cassandra-driver-core" % "2.2.0-rc3"
69 |
70 | lazy val _activemqSTOMP = "org.apache.activemq" % "activemq-stomp" % "5.13.0"
71 |
72 | }
73 |
74 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.9
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0")
2 |
3 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.7.5")
4 |
5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.0")
6 |
7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.0.6")
8 |
--------------------------------------------------------------------------------
/project/sbt-launch.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangjing/crawler-service/462c198f0ea22cc673d59a2e725628712f96f79b/project/sbt-launch.jar
--------------------------------------------------------------------------------
/sbt:
--------------------------------------------------------------------------------
1 | SCRIPT_DIR=$(dirname "$0")
2 | java -Xmx1024M -Dsbt.override.build.repos=true -Dfile.encoding=UTF-8 -XX:+CMSClassUnloadingEnabled -jar "$SCRIPT_DIR/project/sbt-launch.jar" "$@"
3 |
--------------------------------------------------------------------------------
/util/src/main/java/crawler/util/news/contextextractor/ContentExtractor.java:
--------------------------------------------------------------------------------
1 | package crawler.util.news.contextextractor;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.nodes.Node;
7 | import org.jsoup.nodes.TextNode;
8 | import org.jsoup.select.Elements;
9 | import org.jsoup.select.NodeVisitor;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.util.ArrayList;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 | import java.util.concurrent.atomic.AtomicInteger;
17 | import java.util.regex.Matcher;
18 | import java.util.regex.Pattern;
19 |
20 | /**
21 | * Created by yangjing on 15-11-3.
22 | */
23 | public class ContentExtractor {
24 |
25 | public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);
26 |
27 | protected Document doc;
28 |
29 | ContentExtractor(Document doc) {
30 | this.doc = doc;
31 | }
32 |
33 | protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();
34 |
35 | class CountInfo {
36 |
37 | int textCount = 0;
38 | int linkTextCount = 0;
39 | int tagCount = 0;
40 | int linkTagCount = 0;
41 | double density = 0;
42 | double densitySum = 0;
43 | double score = 0;
44 | int pCount = 0;
45 | ArrayList<Integer> leafList = new ArrayList<Integer>();
46 |
47 | }
48 |
49 | protected void clean() {
50 | doc.select("script,noscript,style,iframe,br").remove();
51 | }
52 |
53 | protected CountInfo computeInfo(Node node) {
54 |
55 | if (node instanceof Element) {
56 | Element tag = (Element) node;
57 |
58 | CountInfo countInfo = new CountInfo();
59 | for (Node childNode : tag.childNodes()) {
60 | CountInfo childCountInfo = computeInfo(childNode);
61 | countInfo.textCount += childCountInfo.textCount;
62 | countInfo.linkTextCount += childCountInfo.linkTextCount;
63 | countInfo.tagCount += childCountInfo.tagCount;
64 | countInfo.linkTagCount += childCountInfo.linkTagCount;
65 | countInfo.leafList.addAll(childCountInfo.leafList);
66 | countInfo.densitySum += childCountInfo.density;
67 | countInfo.pCount += childCountInfo.pCount;
68 | }
69 | countInfo.tagCount++;
70 | String tagName = tag.tagName();
71 | if (tagName.equals("a")) {
72 | countInfo.linkTextCount = countInfo.textCount;
73 | countInfo.linkTagCount++;
74 | } else if (tagName.equals("p")) {
75 | countInfo.pCount++;
76 | }
77 |
78 | int pureLen = countInfo.textCount - countInfo.linkTextCount;
79 | int len = countInfo.tagCount - countInfo.linkTagCount;
80 | if (pureLen == 0 || len == 0) {
81 | countInfo.density = 0;
82 | } else {
83 | countInfo.density = (pureLen + 0.0) / len;
84 | }
85 |
86 | infoMap.put(tag, countInfo);
87 |
88 | return countInfo;
89 | } else if (node instanceof TextNode) {
90 | TextNode tn = (TextNode) node;
91 | CountInfo countInfo = new CountInfo();
92 | String text = tn.text();
93 | int len = text.length();
94 | countInfo.textCount = len;
95 | countInfo.leafList.add(len);
96 | return countInfo;
97 | } else {
98 | return new CountInfo();
99 | }
100 | }
101 |
102 | protected double computeScore(Element tag) {
103 | CountInfo countInfo = infoMap.get(tag);
104 | double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
105 | double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
106 | return score;
107 | }
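/*
 * The score combines four signals for the candidate element: the variance of its text-leaf lengths,
 * the accumulated text density of its subtree, the amount of non-anchor text, and the number of
 * p tags, so elements holding mostly plain paragraph text score highest.
 */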
108 |
109 | protected double computeVar(ArrayList<Integer> data) {
110 | if (data.size() == 0) {
111 | return 0;
112 | }
113 | if (data.size() == 1) {
114 | return data.get(0) / 2;
115 | }
116 | double sum = 0;
117 | for (Integer i : data) {
118 | sum += i;
119 | }
120 | double ave = sum / data.size();
121 | sum = 0;
122 | for (Integer i : data) {
123 | sum += (i - ave) * (i - ave);
124 | }
125 | sum = sum / data.size();
126 | return sum;
127 | }
128 |
129 | public Element getContentElement() throws Exception {
130 | clean();
131 | computeInfo(doc.body());
132 | double maxScore = 0;
133 | Element content = null;
134 | for (Map.Entry<Element, CountInfo> entry : infoMap.entrySet()) {
135 | Element tag = entry.getKey();
136 | if (tag.tagName().equals("a") || tag == doc.body()) {
137 | continue;
138 | }
139 | double score = computeScore(tag);
140 | if (score > maxScore) {
141 | maxScore = score;
142 | content = tag;
143 | }
144 | }
145 | if (content == null) {
146 | throw new Exception("extraction failed");
147 | }
148 | return content;
149 | }
150 |
151 | public News getNews() throws Exception {
152 | News news = new News();
153 | Element contentElement;
154 | try {
155 | contentElement = getContentElement();
156 | news.setContentElement(contentElement);
157 | } catch (Exception ex) {
158 | // LOG.info("news content extraction failed,extraction abort", ex);
159 | throw new Exception(ex);
160 | }
161 |
162 | if (doc.baseUri() != null) {
163 | news.setUrl(doc.baseUri());
164 | }
165 |
166 | // try {
167 | // news.setTime(getTime(contentElement));
168 | // } catch (Exception ex) {
169 | //// LOG.info("news title extraction failed", ex);
170 | // }
171 |
172 | // try {
173 | // news.setTitle(getTitle(contentElement));
174 | // } catch (Exception ex) {
175 | // LOG.info("title extraction failed", ex);
176 | // }
177 | return news;
178 | }
179 |
180 | protected String getTime(Element contentElement) throws Exception {
181 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
182 | Pattern pattern = Pattern.compile(regex);
183 | Element current = contentElement;
184 | for (int i = 0; i < 2; i++) {
185 | if (current != null && current != doc.body()) {
186 | Element parent = current.parent();
187 | if (parent != null) {
188 | current = parent;
189 | }
190 | }
191 | }
192 | for (int i = 0; i < 6; i++) {
193 | if (current == null) {
194 | break;
195 | }
196 | String currentHtml = current.outerHtml();
197 | Matcher matcher = pattern.matcher(currentHtml);
198 | if (matcher.find()) {
199 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
200 | }
201 | if (current != doc.body()) {
202 | current = current.parent();
203 | }
204 | }
205 |
206 | try {
207 | return getDate(contentElement);
208 | } catch (Exception ex) {
209 | ex.printStackTrace();
210 | // throw new Exception("time not found");
211 | return "";
212 | }
213 |
214 | }
215 |
216 | protected String getDate(Element contentElement) throws Exception {
217 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})";
218 | Pattern pattern = Pattern.compile(regex);
219 | Element current = contentElement;
220 | for (int i = 0; i < 2; i++) {
221 | if (current != null && current != doc.body()) {
222 | Element parent = current.parent();
223 | if (parent != null) {
224 | current = parent;
225 | }
226 | }
227 | }
228 | for (int i = 0; i < 6; i++) {
229 | if (current == null) {
230 | break;
231 | }
232 | String currentHtml = current.outerHtml();
233 | Matcher matcher = pattern.matcher(currentHtml);
234 | if (matcher.find()) {
235 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3);
236 | }
237 | if (current != doc.body()) {
238 | current = current.parent();
239 | }
240 | }
241 | throw new Exception("date not found");
242 | }
243 |
244 | protected double strSim(String a, String b) {
245 | int len1 = a.length();
246 | int len2 = b.length();
247 | if (len1 == 0 || len2 == 0) {
248 | return 0;
249 | }
250 | double ratio;
251 | if (len1 > len2) {
252 | ratio = (len1 + 0.0) / len2;
253 | } else {
254 | ratio = (len2 + 0.0) / len1;
255 | }
256 | if (ratio >= 3) {
257 | return 0;
258 | }
259 | return (lcs(a, b) + 0.0) / Math.max(len1, len2);
260 | }
261 |
262 | protected String getTitle(final Element contentElement) throws Exception {
263 | final ArrayList<Element> titleList = new ArrayList<Element>();
264 | final ArrayList<Double> titleSim = new ArrayList<Double>();
265 | final AtomicInteger contentIndex = new AtomicInteger();
266 | final String metaTitle = doc.title().trim();
267 | if (!metaTitle.isEmpty()) {
268 | doc.body().traverse(new NodeVisitor() {
269 | @Override
270 | public void head(Node node, int i) {
271 | if (node instanceof Element) {
272 | Element tag = (Element) node;
273 | if (tag == contentElement) {
274 | contentIndex.set(titleList.size());
275 | return;
276 | }
277 | String tagName = tag.tagName();
278 | if (Pattern.matches("h[1-6]", tagName)) {
279 | String title = tag.text().trim();
280 | double sim = strSim(title, metaTitle);
281 | titleSim.add(sim);
282 | titleList.add(tag);
283 | }
284 | }
285 | }
286 |
287 | @Override
288 | public void tail(Node node, int i) {
289 | }
290 | });
291 | int index = contentIndex.get();
292 | if (index > 0) {
293 | double maxScore = 0;
294 | int maxIndex = -1;
295 | for (int i = 0; i < index; i++) {
296 | double score = (i + 1) * titleSim.get(i);
297 | if (score > maxScore) {
298 | maxScore = score;
299 | maxIndex = i;
300 | }
301 | }
302 | if (maxIndex != -1) {
303 | return titleList.get(maxIndex).text();
304 | }
305 | }
306 | }
307 |
308 | Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
309 | if (titles.size() > 0) {
310 | String title = titles.first().text();
311 | if (title.length() > 5 && title.length() < 40) {
312 | return titles.first().text();
313 | }
314 | }
315 | try {
316 | return getTitleByEditDistance(contentElement);
317 | } catch (Exception ex) {
318 | throw new Exception("title not found");
319 | }
320 |
321 | }
322 |
323 | protected String getTitleByEditDistance(Element contentElement) throws Exception {
324 | final String metaTitle = doc.title();
325 |
326 | final ArrayList<Double> max = new ArrayList<Double>();
327 | max.add(0.0);
328 | final StringBuilder sb = new StringBuilder();
329 | doc.body().traverse(new NodeVisitor() {
330 |
331 | public void head(Node node, int i) {
332 |
333 | if (node instanceof TextNode) {
334 | TextNode tn = (TextNode) node;
335 | String text = tn.text().trim();
336 | double sim = strSim(text, metaTitle);
337 | if (sim > 0) {
338 | if (sim > max.get(0)) {
339 | max.set(0, sim);
340 | sb.setLength(0);
341 | sb.append(text);
342 | }
343 | }
344 |
345 | }
346 | }
347 |
348 | public void tail(Node node, int i) {
349 | }
350 | });
351 | if (sb.length() > 0) {
352 | return sb.toString();
353 | }
354 | throw new Exception();
355 |
356 | }
357 |
358 | protected int lcs(String x, String y) {
359 |
360 | int M = x.length();
361 | int N = y.length();
362 | if (M == 0 || N == 0) {
363 | return 0;
364 | }
365 | int[][] opt = new int[M + 1][N + 1];
366 |
367 | for (int i = M - 1; i >= 0; i--) {
368 | for (int j = N - 1; j >= 0; j--) {
369 | if (x.charAt(i) == y.charAt(j)) {
370 | opt[i][j] = opt[i + 1][j + 1] + 1;
371 | } else {
372 | opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]);
373 | }
374 | }
375 | }
376 |
377 | return opt[0][0];
378 |
379 | }
380 |
381 | protected int editDistance(String word1, String word2) {
382 | int len1 = word1.length();
383 | int len2 = word2.length();
384 |
385 | int[][] dp = new int[len1 + 1][len2 + 1];
386 |
387 | for (int i = 0; i <= len1; i++) {
388 | dp[i][0] = i;
389 | }
390 |
391 | for (int j = 0; j <= len2; j++) {
392 | dp[0][j] = j;
393 | }
394 |
395 | for (int i = 0; i < len1; i++) {
396 | char c1 = word1.charAt(i);
397 | for (int j = 0; j < len2; j++) {
398 | char c2 = word2.charAt(j);
399 |
400 | if (c1 == c2) {
401 | dp[i + 1][j + 1] = dp[i][j];
402 | } else {
403 | int replace = dp[i][j] + 1;
404 | int insert = dp[i][j + 1] + 1;
405 | int delete = dp[i + 1][j] + 1;
406 |
407 | int min = replace > insert ? insert : replace;
408 | min = delete > min ? min : delete;
409 | dp[i + 1][j + 1] = min;
410 | }
411 | }
412 | }
413 |
414 | return dp[len1][len2];
415 | }
416 |
417 | /* Given a Jsoup Document, return the Element containing the main content */
418 | public static Element getContentElementByDoc(Document doc) throws Exception {
419 | ContentExtractor ce = new ContentExtractor(doc);
420 | return ce.getContentElement();
421 | }
422 |
423 | /* Given an HTML string, return the Element containing the main content */
424 | public static Element getContentElementByHtml(String html) throws Exception {
425 | Document doc = Jsoup.parse(html);
426 | return getContentElementByDoc(doc);
427 | }
428 |
429 | /* Given an HTML string and its URL, return the Element containing the main content */
430 | public static Element getContentElementByHtml(String html, String url) throws Exception {
431 | Document doc = Jsoup.parse(html, url);
432 | return getContentElementByDoc(doc);
433 | }
434 |
435 | /* Given a URL, return the Element containing the main content */
436 | // public static Element getContentElementByUrl(String url) throws Exception {
437 | // HttpRequest request = new HttpRequest(url);
438 | // String html = request.getResponse().getHtmlByCharsetDetect();
439 | // return getContentElementByHtml(html, url);
440 | // }
441 |
442 | /* Given a Jsoup Document, return the main content as plain text. */
443 | public static String getContentByDoc(Document doc) throws Exception {
444 | ContentExtractor ce = new ContentExtractor(doc);
445 | return ce.getContentElement().text();
446 | }
447 |
448 | /* Given raw HTML, return the main content as plain text. */
449 | public static String getContentByHtml(String html) throws Exception {
450 | Document doc = Jsoup.parse(html);
451 | return getContentElementByDoc(doc).text();
452 | }
453 |
454 | /* Given raw HTML and its URL, return the main content as plain text. */
455 | public static String getContentByHtml(String html, String url) throws Exception {
456 | Document doc = Jsoup.parse(html, url);
457 | return getContentElementByDoc(doc).text();
458 | }
459 |
460 | /* Given a URL, return the main content as plain text. */
461 | // public static String getContentByUrl(String url) throws Exception {
462 | // HttpRequest request = new HttpRequest(url);
463 | // String html = request.getResponse().getHtmlByCharsetDetect();
464 | // return getContentByHtml(html, url);
465 | // }
466 |
467 | /* Given a Jsoup Document, return structured news information. */
468 | public static News getNewsByDoc(Document doc) throws Exception {
469 | ContentExtractor ce = new ContentExtractor(doc);
470 | return ce.getNews();
471 | }
472 |
474 | /* Given raw HTML, return structured news information. */
474 | public static News getNewsByHtml(String html) throws Exception {
475 | Document doc = Jsoup.parse(html);
476 | return getNewsByDoc(doc);
477 | }
478 |
479 | /* Given raw HTML and its URL, return structured news information. */
480 | public static News getNewsByHtml(String html, String url) throws Exception {
481 | Document doc = Jsoup.parse(html, url);
482 | return getNewsByDoc(doc);
483 | }
484 |
485 | /* Given a URL, return structured news information. */
486 | // public static News getNewsByUrl(String url) throws Exception {
487 | // HttpRequest request = new HttpRequest(url);
488 | // String html = request.getResponse().getHtmlByCharsetDetect();
489 | // return getNewsByHtml(html, url);
490 | // }
491 |
492 | public static void main(String[] args) throws Exception {
493 |
494 | // News news = ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html");
495 | // System.out.println(news.getUrl());
496 | // System.out.println(news.getTitle());
497 | // System.out.println(news.getTime());
498 | // System.out.println(news.getContent());
499 | //System.out.println(news.getContentElement());
500 |
501 | //System.out.println(news);
502 | }
503 |
504 | }
--------------------------------------------------------------------------------
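The static helpers above are the intended entry points of the extractor. Below is a minimal usage sketch in Scala (not part of the repository); the HTML snippet and URL are invented for illustration, and extraction may throw on pages that are too small to score:

    import crawler.util.news.contextextractor.ContentExtractor

    object ContentExtractorExample extends App {
      // Invented sample page; any real news article HTML would do.
      val html =
        """<html><head><title>Example headline - Example Site</title></head>
          |<body><h1 class="title">Example headline</h1>
          |<div id="content"><p>First paragraph of the article body with enough text.</p>
          |<p>Second paragraph so the density-based scoring has something to work with.</p></div>
          |</body></html>""".stripMargin

      // Parses with Jsoup and runs the content/title extraction; throws if nothing qualifies.
      val news = ContentExtractor.getNewsByHtml(html, "http://example.com/news/1.html")
      println(news.getUrl)
      println(news.getContent)
    }
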
/util/src/main/java/crawler/util/news/contextextractor/News.java:
--------------------------------------------------------------------------------
1 | package crawler.util.news.contextextractor;
2 |
3 | import org.jsoup.nodes.Element;
4 |
5 | /**
6 | * Created by yangjing on 15-11-3.
7 | */
8 | public class News {
9 |
10 | protected String url = null;
11 | // protected String title = null;
12 | protected String content = null;
13 | // protected String time = null;
14 |
15 | protected Element contentElement = null;
16 |
17 | public String getUrl() {
18 | return url;
19 | }
20 |
21 | public void setUrl(String url) {
22 | this.url = url;
23 | }
24 |
25 | // public String getTitle() {
26 | // return title;
27 | // }
28 | //
29 | // public void setTitle(String title) {
30 | // this.title = title;
31 | // }
32 |
33 | public String getContent() {
34 | if (content == null) {
35 | if (contentElement != null) {
36 | content = contentElement.text();
37 | }
38 | }
39 | return content;
40 | }
41 |
42 |
43 | public void setContent(String content) {
44 | this.content = content;
45 | }
46 |
47 | // public String getTime() {
48 | // return time;
49 | // }
50 | //
51 | // public void setTime(String time) {
52 | // this.time = time;
53 | // }
54 |
55 | @Override
56 | public String toString() {
57 | return "URL:\n" + url + /*"\nTITLE:\n" + title + "\nTIME:\n" + time +*/ "\nCONTENT:\n" + getContent() + "\nCONTENT(SOURCE):\n" + contentElement;
58 | }
59 |
60 | public Element getContentElement() {
61 | return contentElement;
62 | }
63 |
64 | public void setContentElement(Element contentElement) {
65 | this.contentElement = contentElement;
66 | }
67 |
68 |
69 | }
--------------------------------------------------------------------------------
/util/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | loggers = ["akka.event.slf4j.Slf4jLogger"]
3 | loglevel = INFO
4 | log-dead-letters = off
5 | log-dead-letters-during-shutdown = off
6 | actor.default-dispatcher.fork-join-executor {
7 | parallelism-factor = 3.0
8 | parallelism-min = 16
9 | parallelism-max = 64
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
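These values are only defaults. A sketch of how they are picked up at runtime (`ConfigExample` is illustrative, not part of the repository): `ConfigFactory.load()` merges every `reference.conf` on the classpath with `application.conf`, so the configuration files under `app-api` can override any key shown above.

    import com.typesafe.config.ConfigFactory

    object ConfigExample extends App {
      val config = ConfigFactory.load()
      println(config.getString("akka.loglevel"))     // "INFO" unless application.conf overrides it
      println(config.getStringList("akka.loggers"))  // [akka.event.slf4j.Slf4jLogger]
    }
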
/util/src/main/scala/crawler/SystemUtils.scala:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import java.util.concurrent.TimeoutException
4 |
5 | import akka.actor.ActorSystem
6 | import akka.stream.ActorMaterializer
7 | import com.ning.http.client.AsyncHttpClientConfig
8 | import com.typesafe.config.ConfigFactory
9 | import com.typesafe.scalalogging.StrictLogging
10 | import crawler.util.http.HttpClient
11 |
12 | import scala.concurrent.duration._
13 |
14 | /**
15 | * System Utils
16 | * Created by yangjing on 15-11-5.
17 | */
18 | object SystemUtils extends StrictLogging {
19 | val crawlerConfig = ConfigFactory.load().getConfig("crawler")
20 |
21 | implicit val system = ActorSystem(crawlerConfig.getString("akka-system-name"))
22 | implicit val materializer = ActorMaterializer()
23 |
24 | val httpClient = {
25 | crawlerConfig.getConfig("http-client") // TODO: not yet used to configure the builder below
26 | val builder = new AsyncHttpClientConfig.Builder()
27 | builder.setMaxConnections(8192)
28 | builder.setMaxConnectionsPerHost(4)
29 | builder.setConnectTimeout(10 * 1000)
30 | builder.setPooledConnectionIdleTimeout(40 * 1000)
31 | builder.setRequestTimeout(90 * 1000)
32 | builder.setAllowPoolingConnections(true)
33 | builder.setFollowRedirect(true)
34 | HttpClient(builder.build(), Nil)
35 | }
36 |
37 | def shutdown(): Unit = {
38 | httpClient.close()
39 | system.shutdown()
40 | try {
41 | system.awaitTermination(5.seconds)
42 | System.exit(0)
43 | } catch {
44 | case e: TimeoutException =>
45 | logger.error(e.getLocalizedMessage, e)
46 | System.exit(3)
47 | }
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
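A short sketch of how the shared system is meant to be used (illustrative only): the implicit `ActorSystem`, `ActorMaterializer` and the tuned `httpClient` are created once here and imported everywhere else, and `shutdown()` terminates the JVM.

    import crawler.SystemUtils

    object SystemUtilsExample extends App {
      implicit val system = SystemUtils.system            // named by crawler.akka-system-name
      implicit val materializer = SystemUtils.materializer

      println(system.name)

      // Closes the shared AsyncHttpClient, stops the actor system and calls System.exit.
      SystemUtils.shutdown()
    }
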
/util/src/main/scala/crawler/util/Crawler.scala:
--------------------------------------------------------------------------------
1 | package crawler.util
2 |
3 | import crawler.util.http.HttpClient
4 |
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18.
9 | */
10 | trait Crawler {
11 | val httpClient: HttpClient
12 |
13 | protected def defaultHeaders = Array(
14 | Seq(
15 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36",
16 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
17 | "Accept-Encoding" -> "gzip, deflate, sdch",
18 | "Accept-Language" -> "zh-CN,zh;q=0.8,en;q=0.6",
19 | "Connection" -> "keep-alive"
20 | ),
21 | Seq(
22 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7",
23 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
24 | ),
25 | Seq(
26 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0",
27 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
28 | "Accept-Encoding" -> "gzip, deflate",
29 | "Accept-Language" -> "en-US,en;q=0.5",
30 | "Connection" -> "keep-alive"
31 | )
32 | )
33 |
34 | def requestHeaders() = defaultHeaders(Random.nextInt(defaultHeaders.length))
35 |
36 | def fetchPage(url: String) = {
37 | httpClient.get(url).setFollowRedirects(true).header(requestHeaders(): _*).execute()
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
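A minimal implementation sketch of the trait (hypothetical, not one of the repository's crawlers): only `httpClient` has to be supplied, and `fetchPage` already rotates the request headers defined above.

    import scala.concurrent.Await
    import scala.concurrent.duration._
    import crawler.SystemUtils
    import crawler.util.Crawler
    import crawler.util.http.HttpClient

    object ExampleCrawler extends Crawler {
      override val httpClient: HttpClient = SystemUtils.httpClient
    }

    object ExampleCrawlerApp extends App {
      // fetchPage follows redirects and picks one of the three header sets at random.
      val response = Await.result(ExampleCrawler.fetchPage("http://www.baidu.com"), 30.seconds)
      println(response.getStatusCode)
      SystemUtils.shutdown()
    }
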
/util/src/main/scala/crawler/util/JsoupImplicits.scala:
--------------------------------------------------------------------------------
1 | package crawler.util
2 |
3 | import org.jsoup.nodes.Element
4 | import org.jsoup.select.Elements
5 |
6 | /**
7 | * Jsoup helper methods
8 | * Created by yangjing on 15-11-3.
9 | */
10 | object JsoupImplicits {
11 |
12 | implicit class JsoupElementFindByClassname(element: Element) {
13 | def findByClass(cn: String): Elements = {
14 | element.getElementsByClass(cn)
15 | }
16 | }
17 |
18 | implicit class JsoupElementsFindByClassname(elements: Elements) {
19 | def findByClass(cn: String): Elements = {
20 | val list = new java.util.LinkedList[Element]()
21 | val iter = elements.iterator()
22 | while (iter.hasNext) {
23 | val els = iter.next().getElementsByClass(cn)
24 | list.addAll(els)
25 | }
26 | new Elements(list)
27 | }
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
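A usage sketch of the two enrichments (illustrative; the HTML is invented): `findByClass` on an `Element` is a thin alias for `getElementsByClass`, while on an `Elements` it searches every element and merges the matches.

    import scala.collection.JavaConverters._
    import org.jsoup.Jsoup
    import crawler.util.JsoupImplicits._

    object JsoupImplicitsExample extends App {
      val doc = Jsoup.parse(
        """<div class="item"><span class="title">A</span></div>
          |<div class="item"><span class="title">B</span></div>""".stripMargin)

      val items = doc.body().findByClass("item")      // Element => Elements
      val titles = items.findByClass("title")         // Elements => Elements (merged)
      titles.asScala.foreach(e => println(e.text()))  // A, B
    }
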
/util/src/main/scala/crawler/util/Utils.scala:
--------------------------------------------------------------------------------
1 | package crawler.util
2 |
3 | import java.lang.management.ManagementFactory
4 | import java.nio.charset.Charset
5 |
6 | import crawler.util.time.TimeUtils
7 |
8 | /**
9 | * Utils
10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03.
11 | */
12 | object Utils {
13 | val CHARSET = Charset.forName("UTF-8")
14 |
15 | def getPid = {
16 | val runtime = ManagementFactory.getRuntimeMXBean
17 | runtime.getName.split('@')(0)
18 | }
19 |
20 | def lastYearPeriods(): Seq[Int] = {
21 | val now = TimeUtils.now()
22 | val (curMonth, curYear, preYear) = (now.getMonthValue, now.getYear * 100, now.getYear * 100 - 100)
23 | (curMonth + 1 to 12).map(preYear + _) ++ (1 to curMonth).map(curYear + _)
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
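A worked example of `lastYearPeriods()` (assuming, for illustration, that `TimeUtils.now()` returned a date in March 2016): the result is the trailing twelve months encoded as `yyyyMM` integers.

    import crawler.util.Utils

    object UtilsExample extends App {
      // Assuming TimeUtils.now() == 2016-03-15:
      //   curMonth = 3, curYear = 201600, preYear = 201500
      //   (4 to 12).map(201500 + _) ++ (1 to 3).map(201600 + _)
      //   => 201504, 201505, ..., 201512, 201601, 201602, 201603
      println(Utils.lastYearPeriods())
      println(Utils.getPid) // JVM process id, e.g. "12345"
    }
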
/util/src/main/scala/crawler/util/actors/MetricActor.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.actors
2 |
3 | import java.util.concurrent.atomic.AtomicInteger
4 |
5 | import akka.actor.Actor
6 | import com.typesafe.scalalogging.LazyLogging
7 |
8 | /**
9 | * Metric Actor
10 | * Created by yangjing on 15-11-4.
11 | */
12 | trait MetricActor extends Actor with LazyLogging {
13 | final override def preStart(): Unit = {
14 | logger.trace(s"${self.path} preStart")
15 | MetricActor.incrementActorSize()
16 | metricPreStart()
17 | }
18 |
19 | final override def postStop(): Unit = {
20 | metricPostStop()
21 | MetricActor.decrementActorSize()
22 | logger.trace(s"${self.path} postStop")
23 | }
24 |
25 | final override def receive: Receive = {
26 | case s =>
27 | if (metricReceive.isDefinedAt(s)) {
28 | logger.trace(s"${self.path} receive message: $s")
29 | metricReceive(s)
30 | } else {
31 | logger.warn(s"${self.path} received unhandled message: $s")
32 | unhandled(s)
33 | }
34 | }
35 |
36 | def metricPreStart(): Unit = ()
37 |
38 | def metricPostStop(): Unit = ()
39 |
40 | val metricReceive: Receive
41 |
42 | }
43 |
44 | object MetricActor {
45 | private val _currentActiveActors = new AtomicInteger(0)
46 |
47 | def incrementActorSize() = _currentActiveActors.incrementAndGet()
48 |
49 | def decrementActorSize() = _currentActiveActors.decrementAndGet()
50 |
51 | def currentActorSize() = _currentActiveActors.get()
52 | }
53 |
--------------------------------------------------------------------------------
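A hypothetical concrete actor showing the contract (not part of the repository): implementations provide `metricReceive` instead of `receive`, and the trait does the preStart/postStop bookkeeping and logs anything unhandled.

    import akka.actor.{ActorSystem, Props}
    import crawler.util.actors.MetricActor

    class EchoActor extends MetricActor {
      override val metricReceive: Receive = {
        case msg: String => logger.info(s"handled: $msg")
      }
    }

    object EchoActorExample extends App {
      val system = ActorSystem("metric-example")
      val echo = system.actorOf(Props[EchoActor], "echo")
      echo ! "hello" // matched by metricReceive
      echo ! 42      // not matched: logged as a warning and passed to unhandled()
      Thread.sleep(1000)
      println(MetricActor.currentActorSize()) // 1: incremented in preStart, decremented in postStop
      system.shutdown()
    }
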
/util/src/main/scala/crawler/util/http/HttpClient.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.http
2 |
3 | import com.ning.http.client._
4 | import com.ning.http.client.cookie.Cookie
5 | import com.ning.http.client.multipart.Part
6 | import com.typesafe.config.Config
7 |
8 | import scala.concurrent.{ExecutionContext, Future, Promise}
9 | import scala.util.{Failure, Success}
10 |
11 | class HttpClientBuilder(builder: AsyncHttpClient#BoundRequestBuilder) {
12 |
13 | def queryParam(params: (String, String)*) = {
14 | params.foreach { case (name, value) => builder.addQueryParam(name, value) }
15 | this
16 | }
17 |
18 | def header(headers: (String, String)*) = {
19 | headers.foreach { case (name, value) => builder.addHeader(name, value) }
20 | this
21 | }
22 |
23 | def cookie(cookie: Cookie) = {
24 | builder.addCookie(cookie)
25 | this
26 | }
27 |
28 | def part(part: Part) = {
29 | builder.addBodyPart(part)
30 | this
31 | }
32 |
33 | def addFormParam(params: (String, String)*) = {
34 | params.foreach { case (key, value) => builder.addFormParam(key, value) }
35 | this
36 | }
37 |
38 | def setFollowRedirects(followRedirects: Boolean) = {
39 | builder.setFollowRedirects(followRedirects)
40 | this
41 | }
42 |
43 | def execute(): Future[Response] = {
44 | val promise = Promise[Response]()
45 | try {
46 | builder.execute(new AsyncCompletionHandler[Unit] {
47 | override def onCompleted(response: Response): Unit = {
48 | // println(response.getStatusCode + ": " + response.getStatusText)
49 | promise.success(response)
50 | }
51 |
52 | override def onThrowable(t: Throwable): Unit = {
53 | promise.failure(t)
54 | }
55 | })
56 | } catch {
57 | case e: Throwable =>
58 | promise.failure(e)
59 | }
60 | promise.future
61 | }
62 |
63 | }
64 |
65 | /**
66 | * HttpClient
67 | * Created by yangjing on 15-11-3.
68 | */
69 | class HttpClient private(config: AsyncHttpClientConfig,
70 | defaultHeaders: Iterable[(String, String)]) {
71 |
72 | private val client = new AsyncHttpClient(config)
73 |
74 | def close() = client.close()
75 |
76 | def get(url: String) = new HttpClientBuilder(client.prepareGet(url))
77 |
78 | def post(url: String) = new HttpClientBuilder(client.preparePost(url))
79 |
80 | def delete(url: String) = new HttpClientBuilder(client.prepareDelete(url))
81 |
82 | def put(url: String) = new HttpClientBuilder(client.preparePut(url))
83 | }
84 |
85 | object HttpClient {
86 | def apply(): HttpClient = apply(Nil)
87 |
88 | def apply(config: Config): HttpClient = {
89 | // TODO: parse the Config into an AsyncHttpClientConfig
90 |
91 | apply(Nil)
92 | }
93 |
94 | def apply(defaultHeaders: Iterable[(String, String)]): HttpClient =
95 | apply(new AsyncHttpClientConfig.Builder().build, defaultHeaders)
96 |
97 | def apply(config: AsyncHttpClientConfig, defaultHeaders: Iterable[(String, String)]): HttpClient =
98 | new HttpClient(config, defaultHeaders)
99 |
100 | def apply(allowRedirect: Boolean): HttpClient = {
101 | val builder = new AsyncHttpClientConfig.Builder()
102 | builder.setFollowRedirect(allowRedirect)
103 | apply(builder.build(), Nil)
104 | }
105 |
106 | def find302Location(client: HttpClient, url: String, headers: Seq[(String, String)])(implicit ec: ExecutionContext) = {
107 | val promise = Promise[String]()
108 |
109 | def findLocation() = client.get(url).header(headers: _*).setFollowRedirects(false).execute().map(_.getHeader("Location"))
110 |
111 | findLocation().onComplete {
112 | case Success(location) => promise.success(location)
113 | case Failure(e) =>
114 | findLocation().onComplete {
115 | case Success(location) => promise.success(location)
116 | case Failure(t) => promise.failure(t)
117 | }
118 | }
119 |
120 | promise.future
121 | }
122 |
123 | }
--------------------------------------------------------------------------------
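A usage sketch (illustrative): `HttpClientBuilder` simply forwards to AsyncHttpClient's `BoundRequestBuilder`, and `execute()` adapts the completion callback into a Scala `Future[Response]`.

    import scala.concurrent.Await
    import scala.concurrent.duration._
    import crawler.util.http.HttpClient

    object HttpClientExample extends App {
      val client = HttpClient() // default AsyncHttpClientConfig, no default headers

      val future = client.get("http://www.baidu.com/s")
        .queryParam("wd" -> "crawler")
        .header("User-Agent" -> "Mozilla/5.0")
        .setFollowRedirects(true)
        .execute()

      println(Await.result(future, 30.seconds).getStatusCode)
      client.close()
    }
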
/util/src/main/scala/crawler/util/http/TJsonSupport.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.http
2 |
3 | import java.time.LocalDateTime
4 |
5 | import akka.http.scaladsl.marshalling._
6 | import akka.http.scaladsl.model.{ContentType, ContentTypes, HttpCharsets, MediaTypes}
7 | import akka.http.scaladsl.unmarshalling._
8 | import akka.stream.Materializer
9 | import crawler.util.time.TimeUtils
10 | import org.json4s._
11 | import org.json4s.jackson.Serialization
12 |
13 | /**
14 | * Akka Http Json Support
15 | * Created by yangjing on 15-11-5.
16 | */
17 | trait TJsonSupport {
18 | def defaultFormats: Formats = DefaultFormats + new LocalDateTimeSerializer()
19 |
20 | implicit val serialization = Serialization
21 | implicit val formats: Formats
22 |
23 | }
24 |
25 | object TJsonSupport extends TJsonSupport {
26 | override implicit val formats: Formats = defaultFormats
27 | }
28 |
29 | class LocalDateTimeSerializer extends CustomSerializer[LocalDateTime](format =>
30 | ( {
31 | case JString(s) => LocalDateTime.parse(s, TimeUtils.formatterDateTime)
32 | case JNull => null
33 | }, {
34 | case d: LocalDateTime => JString(TimeUtils.formatterDateTime.format(d))
35 | })
36 | )
37 |
38 |
--------------------------------------------------------------------------------
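A round-trip sketch with the `LocalDateTimeSerializer` (the `Article` case class is invented for this example): dates are written and read using `TimeUtils.formatterDateTime`, i.e. `yyyy-MM-dd HH:mm:ss`.

    import java.time.LocalDateTime
    import org.json4s.jackson.Serialization
    import crawler.util.http.TJsonSupport

    case class Article(title: String, publishedAt: LocalDateTime)

    object TJsonSupportExample extends App {
      implicit val formats = TJsonSupport.formats // DefaultFormats + LocalDateTimeSerializer

      val json = Serialization.write(Article("example", LocalDateTime.of(2016, 1, 18, 10, 30, 0)))
      println(json) // {"title":"example","publishedAt":"2016-01-18 10:30:00"}

      val article = Serialization.read[Article](json)
      println(article.publishedAt) // 2016-01-18T10:30
    }
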
/util/src/main/scala/crawler/util/persist/CassandraPersists.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.persist
2 |
3 | import com.datastax.driver.core._
4 | import com.google.common.util.concurrent.{FutureCallback, Futures}
5 | import com.typesafe.scalalogging.LazyLogging
6 | import crawler.SystemUtils
7 |
8 | import scala.collection.JavaConverters._
9 | import scala.concurrent.{ExecutionContextExecutor, Future, Promise}
10 | import scala.util.Try
11 |
12 | /**
13 | * CassandraPersists
14 | * Created by yangjing on 15-11-6.
15 | */
16 | abstract class CassandraPersists(nodes: Seq[String]) {
17 | val cluster = {
18 | Cluster.builder().addContactPoints(nodes: _*).build()
19 | }
20 | }
21 |
22 | object CassandraPersists extends LazyLogging {
23 |
24 | val cluster = {
25 | val nodes = SystemUtils.crawlerConfig.getStringList("cassandra.nodes").asScala
26 | logger.info("cassandra.nodes: " + nodes)
27 | Cluster.builder().addContactPoints(nodes: _*).build()
28 | }
29 |
30 | def userType(keyspace: String, userType: String): UserType =
31 | cluster.getMetadata.getKeyspace(keyspace).getUserType(userType)
32 |
33 | def using[R](keyspace: String)(func: Session => R): R = {
34 | val session = cluster.connect(keyspace)
35 | try {
36 | func(session)
37 | } finally {
38 | session.closeAsync()
39 | }
40 | }
41 |
42 | def execute[R](resultSetFuture: ResultSetFuture)(func: ResultSet => R)(implicit ec: ExecutionContextExecutor): Future[R] = {
43 | val promise = Promise[R]()
44 | Futures.addCallback(
45 | resultSetFuture,
46 | new FutureCallback[ResultSet] {
47 | override def onFailure(t: Throwable): Unit = {
48 | promise.failure(t)
49 | }
50 |
51 | override def onSuccess(rs: ResultSet): Unit = {
52 | promise.complete(Try(func(rs)))
53 | }
54 | },
55 | ec)
56 | promise.future
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
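A usage sketch (illustrative; the `news_item` table is hypothetical): `using` borrows a session for the configured keyspace, and `execute` adapts the driver's `ResultSetFuture` into a Scala `Future`.

    import scala.collection.JavaConverters._
    import scala.concurrent.Await
    import scala.concurrent.duration._
    import crawler.SystemUtils
    import crawler.util.persist.CassandraPersists

    object CassandraPersistsExample extends App {
      implicit val ec = SystemUtils.system.dispatcher
      val keyspace = SystemUtils.crawlerConfig.getString("cassandra.keyspace")

      val titles = CassandraPersists.using(keyspace) { session =>
        val rsFuture = session.executeAsync("SELECT url, title FROM news_item LIMIT 10")
        CassandraPersists.execute(rsFuture) { rs =>
          rs.all().asScala.map(row => row.getString("url") -> row.getString("title"))
        }
      }

      println(Await.result(titles, 30.seconds))
      SystemUtils.shutdown()
    }
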
/util/src/main/scala/crawler/util/time/TimeUtils.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.time
2 |
3 | import java.time._
4 | import java.time.format.DateTimeFormatter
5 | import java.util.Date
6 |
7 | /**
8 | * DateTimeUtils
9 | * Created by yangjing on 15-11-6.
10 | */
11 | object TimeUtils {
12 | val ZONE_OFFSET = ZoneOffset.ofHours(8)
13 | val formatterDate = DateTimeFormatter.ofPattern("yyyy-MM-dd")
14 | val formatterDateTime = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
15 | val formatterDateMinus = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm")
16 | val formatterMinus = DateTimeFormatter.ofPattern("HH:mm")
17 |
18 | def toLocalDateTime(instant: Instant): LocalDateTime = LocalDateTime.ofInstant(instant, ZONE_OFFSET)
19 |
20 | def toLocalDateTime(s: String): LocalDateTime = {
21 | s.length match {
22 | case 5 =>
23 | LocalDateTime.parse(s, formatterMinus)
24 | case 16 =>
25 | LocalDateTime.parse(s, formatterDateMinus)
26 | case 19 =>
27 | LocalDateTime.parse(s, formatterDateTime)
28 | case _ =>
29 | LocalDateTime.parse(s)
30 | }
31 | }
32 |
33 | def toLocalDateTime(date: Date): LocalDateTime =
34 | LocalDateTime.ofInstant(Instant.ofEpochMilli(date.getTime), ZONE_OFFSET)
35 |
36 | def toDate(ldt: LocalDateTime): Date =
37 | new Date(ldt.toInstant(ZONE_OFFSET).toEpochMilli)
38 |
39 | def now() = LocalDateTime.now()
40 |
41 | /**
42 | * @return the start of today (00:00:00.000)
43 | */
44 | def nowBegin(): LocalDateTime = LocalDate.now().atTime(0, 0, 0, 0)
45 |
46 | /**
47 | * @return the end of today (23:59:59.999999999)
48 | */
49 | def nowEnd(): LocalDateTime = LocalTime.of(23, 59, 59, 999999999).atDate(LocalDate.now())
50 | }
51 |
--------------------------------------------------------------------------------
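A short sketch of the length-based dispatch in `toLocalDateTime(s: String)` (illustrative values): 16 characters use `yyyy-MM-dd HH:mm`, 19 use `yyyy-MM-dd HH:mm:ss`, and anything else falls back to ISO-8601 parsing.

    import crawler.util.time.TimeUtils

    object TimeUtilsExample extends App {
      println(TimeUtils.toLocalDateTime("2016-01-18 10:30"))        // length 16 -> 2016-01-18T10:30
      println(TimeUtils.toLocalDateTime("2016-01-18 10:30:00"))     // length 19 -> 2016-01-18T10:30
      println(TimeUtils.toLocalDateTime("2016-01-18T10:30:00.123")) // other     -> ISO-8601 fallback
      println(TimeUtils.nowBegin()) // today at 00:00
      println(TimeUtils.nowEnd())   // today at 23:59:59.999999999
    }
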
/util/src/test/scala/crawler/testsuite/ServiceSpec.scala:
--------------------------------------------------------------------------------
1 | package crawler.testsuite
2 |
3 | import crawler.SystemUtils
4 | import org.scalatest._
5 | import org.scalatest.concurrent.ScalaFutures
6 | import org.scalatest.time.{Seconds, Span}
7 |
8 | /**
9 | * Created by yangjing on 15-11-4.
10 | */
11 | abstract class ServiceSpec
12 | extends WordSpec
13 | with BeforeAndAfterAll
14 | with MustMatchers
15 | with OptionValues
16 | with EitherValues
17 | with ScalaFutures {
18 |
19 | implicit def system = SystemUtils.system
20 | implicit def materializer = SystemUtils.materializer
21 | implicit def dispatcher = system.dispatcher
22 | implicit val defaultPatience = PatienceConfig(Span(30, Seconds))
23 |
24 | override protected def afterAll(): Unit = {
25 | SystemUtils.shutdown()
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
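A hypothetical spec showing how the base class is meant to be extended: `whenReady` from ScalaFutures works out of the box with the 30-second patience, and `afterAll` shuts the shared system down.

    import scala.concurrent.Future
    import crawler.testsuite.ServiceSpec

    class ExampleSpec extends ServiceSpec {
      "ExampleSpec" should {
        "complete a future" in {
          whenReady(Future.successful(42)) { result =>
            result mustBe 42
          }
        }
      }
    }
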
/util/src/test/scala/crawler/util/persist/CassandraPersistsTest.scala:
--------------------------------------------------------------------------------
1 | package crawler.util.persist
2 |
3 | import java.util.Date
4 |
5 | import crawler.SystemUtils
6 | import org.scalatest.WordSpec
7 |
8 | /**
9 | * Created by yangjing on 15-11-6.
10 | */
11 | class CassandraPersistsTest extends WordSpec {
12 |
13 | "CassandraPersistsTest" should {
14 |
15 | "save" in {
16 | val keyspace = SystemUtils.crawlerConfig.getString("cassandra.keyspace")
17 | CassandraPersists.using(keyspace) { session =>
18 | val newsItem = Map(
19 | "url" -> "http://hostname/news/1.html",
20 | "source" -> "网易新闻",
21 | "title" -> "标题",
22 | "time" -> new Date(),
23 | "abstract" -> "新闻摘要")
24 | val bstmt = session.prepare("INSERT INTO search_page(source, key, count, news) VALUES(?, ?, ?, ?);")
25 |
26 | val newsTypeUDT = session.getCluster.getMetadata.getKeyspace(keyspace).getUserType("news_type")
27 | val nit = newsTypeUDT.newValue()
28 | newsItem.foreach {
29 | case ("time", value: Date) => nit.setTimestamp("time", value)
30 | case (key, value: String) => nit.setString(key, value)
31 | }
32 |
33 | val result = session.execute(bstmt.bind(
34 | "网易新闻",
35 | "杭州誉存科技有限公司",
36 | Integer.valueOf(2),
37 | java.util.Arrays.asList(nit)
38 | ))
39 | println(result)
40 |
41 | }
42 | }
43 |
44 | }
45 | }
46 |
--------------------------------------------------------------------------------