├── .gitignore ├── README.md ├── app-api └── src │ ├── main │ ├── resources │ │ ├── logback-test.xml │ │ └── reference.conf │ └── scala │ │ └── crawler │ │ └── app │ │ ├── Main.scala │ │ ├── common │ │ ├── BaseRoute.scala │ │ └── JsonSupport.scala │ │ └── routes │ │ ├── ApiRoutes.scala │ │ ├── NewsRoute.scala │ │ └── SiteRoute.scala │ ├── test │ └── scala │ │ ├── demo.sc │ │ ├── saic.sc │ │ └── worksheet.sc │ └── universal │ └── conf │ ├── application-test.conf │ ├── application.conf │ ├── jvmopts │ └── logback.xml ├── module-news ├── docs │ └── 杂记.txt └── src │ ├── main │ └── scala │ │ └── crawler │ │ └── module │ │ └── news │ │ ├── NewsJsonSupport.scala │ │ ├── NewsUtils.scala │ │ ├── commands │ │ └── Commands.scala │ │ ├── crawlers │ │ ├── BaiduNews.scala │ │ ├── CourtNews.scala │ │ ├── HaosouNews.scala │ │ ├── NewsCrawler.scala │ │ ├── SogouNews.scala │ │ └── WechatNews.scala │ │ ├── enums │ │ ├── ItemSource.scala │ │ └── SearchMethod.scala │ │ ├── model │ │ ├── NewsItem.scala │ │ ├── NewsPage.scala │ │ ├── NewsPageItem.scala │ │ └── SearchResult.scala │ │ └── service │ │ ├── NewsDBRepo.scala │ │ ├── NewsMaster.scala │ │ ├── NewsService.scala │ │ └── actors │ │ ├── ItemPageWorker.scala │ │ ├── NewsJob.scala │ │ ├── NewsSourceJob.scala │ │ ├── PersistActor.scala │ │ └── SearchPageWorker.scala │ └── test │ ├── resources │ └── logback.xml │ └── scala │ └── crawler │ └── module │ └── news │ ├── crawlers │ ├── BaiduNewsTest.scala │ ├── CourtNewsTest.scala │ ├── HaosouNewsTest.scala │ └── WechatNewsTest.scala │ └── service │ ├── NewsDBRepoTest.scala │ └── actors │ └── NewsJobMasterTest.scala ├── module-site-search └── src │ ├── main │ └── scala │ │ └── crawler │ │ └── module │ │ └── site │ │ ├── BaiduSite.scala │ │ ├── QueryCond.scala │ │ ├── SearchSyntax.scala │ │ └── model │ │ ├── SearchRequest.scala │ │ ├── SiteItem.scala │ │ └── SiteResult.scala │ └── test │ ├── resources │ └── logback.xml │ └── scala │ └── crawler │ └── module │ └── site │ └── BaiduSiteTest.scala ├── project ├── Build.scala ├── BuildSettings.scala ├── build.properties ├── plugins.sbt └── sbt-launch.jar ├── sbt └── util └── src ├── main ├── java │ └── crawler │ │ └── util │ │ └── news │ │ └── contextextractor │ │ ├── ContentExtractor.java │ │ └── News.java ├── resources │ └── reference.conf └── scala │ └── crawler │ ├── SystemUtils.scala │ └── util │ ├── Crawler.scala │ ├── JsoupImplicits.scala │ ├── Utils.scala │ ├── actors │ └── MetricActor.scala │ ├── http │ ├── HttpClient.scala │ └── TJsonSupport.scala │ ├── persist │ └── CassandraPersists.scala │ └── time │ └── TimeUtils.scala └── test └── scala └── crawler ├── testsuite └── ServiceSpec.scala └── util └── persist └── CassandraPersistsTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | app-api/package/ 2 | logs/ 3 | target/ 4 | .idea 5 | .idea_modules 6 | .classpath 7 | .project 8 | .settings 9 | RUNNING_PID 10 | app.pid 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawler Service 2 | 3 | 爬虫服务 4 | 5 | - Akka Stream & Http 1.0 6 | - Cassandra 2.1 7 | - Json4s 3.3 8 | 9 | ## Install 10 | 11 | ### 安装Cassandra 12 | 13 | [http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/](http://www.yangbajing.me/2015/10/22/canssandra%E5%BC%80%E5%A7%8B/) 14 | 15 | ### 配置 16 | 17 | 1. `util/src/main/resources/reference.conf`: 默认配置 18 | 2. 
`app/src/main/resources/application.conf`: 产品配置 19 | 20 | 具体使用说明请参考:[https://github.com/typesafehub/config](https://github.com/typesafehub/config)` 21 | 22 | ### 编译 23 | 24 | ``` 25 | ./sbt app/assembly 26 | ``` 27 | 28 | ### 运行 29 | 30 | ``` 31 | java -jar app/target/scala-2.11/crawler-app.jar 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /app-api/src/main/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /app-api/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | http { 3 | server { 4 | backlog = 1024 5 | max-connections = 8192 6 | socket-options { 7 | so-reuse-address = on 8 | } 9 | } 10 | host-connection-pool { 11 | max-connections = 8 12 | } 13 | } 14 | } 15 | 16 | crawler { 17 | api-uri = "http://120.26.93.104" 18 | 19 | akka-system-name = "crawler" 20 | 21 | network { 22 | server = "0.0.0.0" 23 | server = ${crawler.network.server} 24 | port = 33333 25 | } 26 | 27 | cassandra { 28 | nodes = ["192.168.31.242", "192.168.31.243"] 29 | keyspace = "crawler_spider" 30 | } 31 | 32 | http-client { 33 | headers { 34 | chromeMac { 35 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36" 36 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" 37 | Accept-Encoding = "gzip, deflate, sdch" 38 | Accept-Language = "zh-CN,zh;q=0.8,en;q=0.6" 39 | Connection = "keep-alive" 40 | } 41 | 42 | safariMac { 43 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7" 44 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 45 | } 46 | 47 | firefoxMac { 48 | User-Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0" 49 | Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 50 | Accept-Encoding = "gzip, deflate" 51 | Accept-Language = "en-US,en;q=0.5" 52 | Connection = "keep-alive" 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/Main.scala: -------------------------------------------------------------------------------- 1 | package crawler.app 2 | 3 | import java.nio.file.{Files, Paths} 4 | 5 | import akka.http.scaladsl.Http 6 | import com.typesafe.config.ConfigFactory 7 | import com.typesafe.scalalogging.StrictLogging 8 | import crawler.SystemUtils 9 | import crawler.app.routes.ApiRoutes 10 | import crawler.util.Utils 11 | 12 | import scala.util.{Failure, Success} 13 | 14 | /** 15 | * Main 16 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
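 * 
 * Startup flow (as implemented below): write the process id to `app.pid`, load the Typesafe config,
 * then bind the routes from [[crawler.app.routes.ApiRoutes]] on `crawler.network.server`:`crawler.network.port`;
 * a failed bind shuts the actor system down. As an aside, the listen port can be overridden with a
 * system property (standard Typesafe Config behaviour, not something defined in this file), e.g.:
 * {{{
 *   java -Dcrawler.network.port=8080 -jar crawler-app.jar
 * }}}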
17 | */ 18 | object Main extends App with StrictLogging { 19 | 20 | import SystemUtils._ 21 | import system.dispatcher 22 | 23 | Files.write(Paths.get("app.pid"), Utils.getPid.getBytes(Utils.CHARSET)) 24 | 25 | val config = ConfigFactory.load() 26 | 27 | println(config.getString("crawler.network.server") + ":" + config.getInt("crawler.network.port")) 28 | 29 | Http().bindAndHandle(ApiRoutes(), config.getString("crawler.network.server"), config.getInt("crawler.network.port")) 30 | .onComplete { 31 | case Success(binding) => 32 | logger.info(s"binding: $binding") 33 | case Failure(e) => 34 | e.printStackTrace() 35 | SystemUtils.shutdown() 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/common/BaseRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.common 2 | 3 | import akka.http.scaladsl.server.Directives 4 | import com.typesafe.scalalogging.LazyLogging 5 | import crawler.SystemUtils 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 9 | */ 10 | trait BaseRoute extends Directives with JsonSupport with LazyLogging { 11 | implicit def system = SystemUtils.system 12 | 13 | implicit def mat = SystemUtils.materializer 14 | 15 | implicit def dispatcher = system.dispatcher 16 | } 17 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/common/JsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.common 2 | 3 | import akka.http.scaladsl.marshalling._ 4 | import akka.http.scaladsl.model.{HttpCharsets, MediaTypes} 5 | import akka.http.scaladsl.unmarshalling._ 6 | import akka.stream.Materializer 7 | import crawler.module.news.NewsJsonSupport 8 | import crawler.module.site.QueryCond 9 | import crawler.util.http.TJsonSupport 10 | import org.json4s.ext.EnumNameSerializer 11 | import org.json4s.{Formats, Serialization} 12 | 13 | /** 14 | * Json Support 15 | * Created by yangjing on 15-11-6. 
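 * 
 * Wires json4s into Akka HTTP (un)marshalling, so routes mixing in this trait can unmarshal and
 * marshal case classes directly. A minimal sketch of the kind of round trip this enables, taken
 * from how SiteRoute uses it:
 * {{{
 *   entity(as[SearchRequest]) { req => complete(req) }
 * }}}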
16 | */ 17 | trait JsonSupport extends TJsonSupport with NewsJsonSupport { 18 | implicit override val formats: Formats = defaultFormats + 19 | new EnumNameSerializer(QueryCond) 20 | 21 | implicit def json4sUnmarshallerConverter[A: Manifest](serialization: Serialization, formats: Formats)(implicit mat: Materializer): FromEntityUnmarshaller[A] = 22 | json4sUnmarshaller(manifest, serialization, formats, mat) 23 | 24 | implicit def json4sUnmarshaller[A: Manifest](implicit serialization: Serialization, formats: Formats, mat: Materializer): FromEntityUnmarshaller[A] = 25 | Unmarshaller.byteStringUnmarshaller 26 | .forContentTypes(MediaTypes.`application/json`) 27 | .mapWithCharset { (data, charset) => 28 | val input = if (charset == HttpCharsets.`UTF-8`) data.utf8String else data.decodeString(charset.nioCharset.name) 29 | serialization.read(input) 30 | } 31 | 32 | implicit def json4sMarshallerConverter[A <: AnyRef](serialization: Serialization, formats: Formats): ToEntityMarshaller[A] = 33 | json4sMarshaller(serialization, formats) 34 | 35 | implicit def json4sMarshaller[A <: AnyRef](implicit serialization: Serialization, formats: Formats): ToEntityMarshaller[A] = 36 | Marshaller.StringMarshaller.wrap(MediaTypes.`application/json`)(serialization.write[A]) 37 | } 38 | 39 | object JsonSupport extends JsonSupport 40 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/ApiRoutes.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import akka.http.scaladsl.model.HttpResponse 4 | import akka.http.scaladsl.server.Directives 5 | 6 | /** 7 | * ApiRoute 8 | * Created by yangjing on 15-11-3. 9 | */ 10 | object ApiRoutes extends Directives { 11 | 12 | def apply() = 13 | pathPrefix("api") { 14 | path("health_check") { 15 | (get | head) { 16 | complete(HttpResponse()) 17 | } 18 | } ~ 19 | NewsRoute() ~ 20 | SiteRoute() 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/NewsRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import akka.http.scaladsl.marshalling.Marshal 6 | import akka.http.scaladsl.model._ 7 | import com.typesafe.config.ConfigFactory 8 | import crawler.SystemUtils 9 | import crawler.app.common.BaseRoute 10 | import crawler.module.news.crawlers._ 11 | import crawler.module.news.enums.{ItemSource, SearchMethod} 12 | import crawler.module.news.service.NewsService 13 | import crawler.util.Utils 14 | 15 | import scala.concurrent.Future 16 | import scala.concurrent.duration.Duration 17 | import scala.util.Try 18 | 19 | /** 20 | * 新闻路由 21 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
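 * 
 * Mounted at `GET /api/news` (see ApiRoutes). A sample request against a local instance, assuming
 * the default port 33333 from reference.conf and a placeholder company name:
 * {{{
 *   curl 'http://localhost:33333/api/news?company=some-company&source=baidu&method=F&duration=15&version=1'
 * }}}
 * `version=1` (the default) returns the SearchResult list, `version=3` returns only the flattened
 * news items, and `version=2` tries the remote crawler-api first and falls back to a local fetch.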
22 | */ 23 | object NewsRoute extends BaseRoute { 24 | 25 | val config = ConfigFactory.load() 26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient)) 27 | NewsCrawler.registerCrawler(ItemSource.sogou, new SogouNews(SystemUtils.httpClient)) 28 | NewsCrawler.registerCrawler(ItemSource.haosou, new HaosouNews(SystemUtils.httpClient)) 29 | NewsCrawler.registerCrawler(ItemSource.court, new CourtNews(SystemUtils.httpClient)) 30 | // NewsCrawler.registerCrawler(NewsSource.wechat, new WechatNews(httpClient)) 31 | 32 | val newsService = new NewsService() 33 | 34 | def apply() = 35 | pathPrefix("news") { 36 | pathEnd { 37 | get { 38 | parameters( 39 | 'company.as[String], 40 | 'source.as[String] ? "", 41 | 'method.as[String] ? "", 42 | 'duration.as[Int] ? 15, 43 | 'forcedLatest.as[String] ? "", 44 | 'version.as[String] ? "1") { (company, source, method, duration, forcedLatest, version) => 45 | 46 | val future: Future[HttpResponse] = 47 | version match { 48 | case "3" => 49 | fromLocal(company, Seq(ItemSource.baidu) /*NewsSource.withToNames(source)*/ , method, duration, forcedLatest).flatMap(list => 50 | Marshal(list.flatMap(_.news)).to[HttpResponse] 51 | ) 52 | 53 | case "2" => 54 | fromCrawlerApi(company).recoverWith { 55 | case e: Exception => 56 | logger.warn("fromCralwerApi recover with: " + e, e) 57 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list => 58 | Marshal(list.flatMap(_.news)).to[HttpResponse] 59 | ) 60 | } 61 | 62 | case _ => 63 | fromLocal(company, Seq(ItemSource.baidu), method, duration, forcedLatest).flatMap(list => 64 | Marshal(list).to[HttpResponse] 65 | ) 66 | } 67 | complete(future) 68 | } 69 | } 70 | } 71 | } 72 | 73 | private def fromLocal(company: String, sources: Traversable[ItemSource.Value], method: String, duration: Int, forcedLatest: String) = { 74 | val mtd = Try(SearchMethod.withName(method)).getOrElse(SearchMethod.F) 75 | newsService. 76 | fetchNews(company, sources, mtd, Duration(duration, TimeUnit.SECONDS), forcedLatest == "y") 77 | } 78 | 79 | private def fromCrawlerApi(company: String) = 80 | SystemUtils.httpClient.get(config.getString("crawler.api-uri") + "/api/news") 81 | .queryParam("companyName" -> company) 82 | .execute() 83 | .map { resp => 84 | if (resp.getStatusCode != 200) 85 | throw new RuntimeException(s"crawler-api not found company: $company, return: ${resp.getStatusCode}") 86 | 87 | HttpResponse( 88 | StatusCodes.OK, 89 | entity = HttpEntity(ContentType(MediaTypes.`application/json`), resp.getResponseBody(Utils.CHARSET.name())) 90 | ) 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /app-api/src/main/scala/crawler/app/routes/SiteRoute.scala: -------------------------------------------------------------------------------- 1 | package crawler.app.routes 2 | 3 | import crawler.SystemUtils 4 | import crawler.module.site.BaiduSite 5 | import crawler.app.common.BaseRoute 6 | import crawler.module.site.model.SearchRequest 7 | 8 | /** 9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 
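 * 
 * Mounted at `POST /api/site/baidu` (see ApiRoutes): the JSON body is unmarshalled into a
 * [[crawler.module.site.model.SearchRequest]] (its fields are defined in module-site-search and
 * not repeated here), handed to a BaiduSite crawler, and the fetched item list is returned as JSON.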
10 | */ 11 | object SiteRoute extends BaseRoute { 12 | 13 | def apply() = 14 | pathPrefix("site") { 15 | path("baidu") { 16 | post { 17 | entity(as[SearchRequest]) { searchRequest => 18 | val baidu = new BaiduSite(SystemUtils.httpClient, searchRequest) 19 | complete(baidu.fetchItemList()) 20 | } 21 | } 22 | } 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /app-api/src/test/scala/demo.sc: -------------------------------------------------------------------------------- 1 | import java.nio.charset.Charset 2 | import java.nio.file.{Paths, Files} 3 | import scala.collection.JavaConverters._ 4 | 5 | import scala.io.Source 6 | 7 | val s = 8 | """crawler-news001 121.199.23.3 9 | |crawler-news002 121.199.4.6 10 | |crawler-news003 121.199.2.152 11 | |crawler-news004 121.199.12.190 12 | |crawler-news005 121.41.53.230 13 | |crawler-news006 121.199.5.96 14 | |crawler-news007 121.199.20.87 15 | |crawler-news008 121.40.93.44 16 | |crawler-news009 121.199.22.228 17 | |crawler-news010 120.26.94.198 18 | |crawler-news011 120.26.94.202 19 | |crawler-news012 120.26.94.146 20 | |crawler-news013 120.26.94.163 21 | |crawler-news014 120.26.94.211 22 | |crawler-news015 120.26.94.117 23 | |crawler-news016 120.26.94.195 24 | |crawler-news017 120.26.94.207 25 | |crawler-news018 120.26.94.185 26 | |crawler-news019 120.26.93.249 27 | |crawler-news020 120.26.94.17 28 | |crawler-news021 120.26.94.5 29 | |crawler-news022 120.26.94.7 30 | |crawler-news023 120.26.93.202 31 | |crawler-news024 120.26.94.188 32 | |crawler-news025 120.26.94.35 33 | |crawler-news026 120.26.94.58 34 | |crawler-news027 120.26.94.120 35 | |crawler-news028 120.26.94.203 36 | |crawler-news029 120.26.94.38 37 | |crawler-news030 120.26.94.150 38 | |crawler-news031 120.26.94.151 39 | |crawler-news032 120.26.94.147 40 | |crawler-news033 120.26.94.28 41 | |crawler-news034 120.26.94.191 42 | |crawler-news035 120.26.94.18 43 | |crawler-news036 120.26.93.254 44 | |crawler-news037 120.26.94.49 45 | |crawler-news038 120.26.94.139 46 | |crawler-news039 120.26.94.2 47 | |crawler-news040 120.26.94.4 48 | |crawler-news041 120.26.94.23 49 | |crawler-news042 120.26.94.29 50 | |crawler-news043 120.26.94.174 51 | |crawler-news044 120.26.94.8 52 | |crawler-news045 120.26.93.240 53 | |crawler-news046 120.26.93.215 54 | |crawler-news047 120.26.94.122 55 | |crawler-news048 120.26.94.12 56 | |crawler-news049 120.26.92.125 57 | |crawler-news050 120.26.92.180 58 | |crawler-news051 120.26.93.219 59 | |crawler-news052 120.26.94.76 60 | |crawler-news053 120.26.93.229 61 | |crawler-news054 120.26.94.22 62 | |crawler-news055 120.26.94.14 63 | |crawler-news056 120.26.94.84 64 | |crawler-news057 120.26.94.27 65 | |crawler-news058 120.26.93.221 66 | |crawler-news059 121.43.60.236""".stripMargin 67 | val lines = Source.fromString(s).getLines().map(_.split(" ")(0)).toStream 68 | 69 | //val ss = Source.fromString(s).getLines().map { v => 70 | // val ip = v.drop(19) 71 | // val hostname = v.take(15) 72 | // Seq(hostname, ip, "1核1G", "/usr/app/python
/home/sc/open-falcon/agent") 73 | // .mkString("| ", " | ", " |") 74 | //}.toStream 75 | // 76 | //val lines = 77 | // Stream( 78 | // Seq("hostname ", "IP", "hardware", "path"), 79 | // Seq("----------------", "--", "--------", "----") 80 | // ).map(_.mkString("| ", " | ", " |")) #::: 81 | // ss 82 | 83 | Files.write(Paths.get("/tmp/crawler-news-hosts.txt"), lines.asJava) -------------------------------------------------------------------------------- /app-api/src/test/scala/saic.sc: -------------------------------------------------------------------------------- 1 | import scala.io.Source 2 | 3 | val s = 4 | """|120.55.182.150
(10.117.12.74) | 1核1G | /usr/app/python |saic | 5 | ||120.26.225.105
(10.117.55.14) | 1核1G | /usr/app/python |saic | 6 | ||121.41.2.74
(10.168.96.82) | 1核1G | /usr/app/python |saic | 7 | ||120.55.113.230
(10.168.152.118) | 1核1G | /usr/app/python |saic | 8 | ||120.55.114.18
(10.168.154.133) | 1核1G | /usr/app/python |saic | 9 | ||120.55.88.109
(10.117.196.51) | 1核1G | /usr/app/python |saic | 10 | ||121.41.2.196
(10.168.91.79) | 1核1G | /usr/app/python |saic | 11 | ||121.41.2.186
(10.168.94.151) | 1核1G | /usr/app/python |saic | 12 | ||120.55.64.125
(10.117.211.194) | 1核1G | /usr/app/python |saic | 13 | ||121.41.2.162
(10.168.93.81) | 1核1G | /usr/app/python |saic | 14 | ||121.41.1.166
(10.168.54.249) | 1核1G | /usr/app/python |saic | 15 | ||120.26.217.236
(10.117.52.105) | 1核1G | /usr/app/python |saic | 16 | ||120.26.92.73
(10.51.8.148) | 1核1G | /usr/app/python |saic | 17 | ||120.55.180.251
(10.117.8.21) | 1核1G | /usr/app/python |saic | 18 | ||120.26.91.2
(10.117.209.143) | 1核1G | /usr/app/python |saic | 19 | ||120.26.223.152
(10.117.51.186) | 1核1G | /usr/app/python |saic | 20 | ||120.26.223.135
(10.117.52.107) | 1核1G | /usr/app/python |saic | 21 | ||120.26.91.8
(10.117.209.141) | 1核1G | /usr/app/python |saic | 22 | ||120.55.112.92
(10.168.152.171) | 1核1G | /usr/app/python |saic | 23 | ||120.55.181.10
(10.117.8.192) | 1核1G | /usr/app/python |saic |""".stripMargin 24 | val lines = Source.fromString(s).getLines().toStream 25 | .map(v => v.take(v.indexOf('<')).replace("|", "")) 26 | 27 | // fabric hosts 28 | lines 29 | .map("xu.du@" + _) 30 | .mkString("[\"", "\",\"", "\"]") 31 | 32 | // hostnames 33 | lines.foreach(println) 34 | 35 | -------------------------------------------------------------------------------- /app-api/src/test/scala/worksheet.sc: -------------------------------------------------------------------------------- 1 | import java.time.LocalDateTime 2 | 3 | import crawler.module.site.BaiduSite 4 | //BaiduSite.dealTime("2015年1月13日") 5 | //BaiduSite.dealTime("2015年1月1日") 6 | //BaiduSite.dealTime("2015年11月13日") 7 | //BaiduSite.dealTime("2015年11月3日") 8 | 9 | "www.runoob.com/kjlsdf/sdf/".take("www.runoob.com/kjlsdf/sdf/".indexOf('/')) 10 | 11 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r 12 | def parseTime(s: String) = s.substring(0, s.indexOf('日')+1) match { 13 | case TIME_PATTERN(year, month, day) => LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0) 14 | case _ => null 15 | } 16 | parseTime("2015年1月13日 - ") 17 | parseTime("2015年1月1日") 18 | parseTime("2015年11月13日") 19 | parseTime("2015年11月3日") 20 | parseTime("2015年11月332日") 21 | parseTime("15年11月332日") 22 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/application-test.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | 12 | http { 13 | server { 14 | backlog = 1024 15 | max-connections = 8192 16 | socket-options { 17 | so-reuse-address = on 18 | } 19 | } 20 | host-connection-pool { 21 | max-connections = 8 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | 12 | http { 13 | server { 14 | backlog = 1024 15 | max-connections = 8192 16 | socket-options { 17 | so-reuse-address = on 18 | } 19 | } 20 | host-connection-pool { 21 | max-connections = 8 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /app-api/src/universal/conf/jvmopts: -------------------------------------------------------------------------------- 1 | -xmx2048m 2 | -xms2048m 3 | -file.encoding=UTF-8 -------------------------------------------------------------------------------- /app-api/src/universal/conf/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | logs/application.log 11 | 12 | 13 | logs/application-log-%d{yyyy-MM-dd}.gz 14 | 15 | 60 16 | 17 | 18 | %date{yyyy-MM-dd HH:mm:ss ZZZZ} [%level] from %logger in %thread - %n%message%n%xException%n 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 
-------------------------------------------------------------------------------- /module-news/docs/杂记.txt: -------------------------------------------------------------------------------- 1 | NewsMaster -> NewsJob* -> NewsSourceJob* -> SearchPageWorker 2 | PersistActor -> ItemPageWorker* 3 | 4 | 5 | NewsJob actor: 收到新闻抓取请求,管理新闻抓取状态,数据存储 6 | - SearchPageWorker actor: 进行新闻搜索页面抓取,并解析 7 | - ItemPageWorker actor: 新闻详情页面抓取,并抽取内容正文 8 | 9 | NewsJob actor: 每收到一次新闻抓取请求就实例化一个actor,在actor中再委派SearchPageWorker进行新闻搜索页抓取。 10 | 新闻搜索页抓取成功后数据回到Job actor,Job actor判断是否需要抓取全文,若是则再委派ItemPageWorker进行全文抓取。 11 | NewsJob将保存一个 timeout 超时值,由实例化时参数传入。超时到则向客户返回Timeout请求。而actor则继续等待子actor, 12 | 如:SearchPageWorker和Seq[ItemPageWorker]执行完(或有错误发生),再停止NewsJob 13 | 在 postStop 回调函数中进行数据持久化工作。 14 | 15 | SearchPageWorker: 根据参数抓取新闻搜索页的新闻列表,并将结果传回给 NewsJob 16 | 17 | ItemPageWorker: 根据url抓取新闻详情页正文内容。每条url生成一个actor。抓取成功一条则回传到 NewsJob 中,由 NewsJob 做进一步处理。 18 | 19 | 20 | 21 | DB存储添加索引和查找功能 22 | 23 | 在case class中加入一个transactionId,记录每一次新闻查询的请求事物。 24 | 25 | actor中尽量不传 ActorRef,而通过ActorPath或其它类似机制来查找actor 26 | 27 | 新闻数据。 28 | 29 | 使用Cassandra存储,2张表: 30 | 31 | create keyspace if not exists crawler_spider with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}; 32 | use crawler_spider; 33 | 34 | create type news_type ( 35 | url Text, 36 | source Text, 37 | title Text, 38 | time Timestamp, 39 | abstract Text 40 | ); 41 | create table search_page ( 42 | key Text, 43 | source Ascii, 44 | time Timestamp, 45 | count Int, 46 | news List>, 47 | primary key (key, source, time) 48 | ); 49 | create table news_page ( 50 | url Text, 51 | title Text, 52 | source Text, 53 | time Timestamp, 54 | abstract Text, 55 | content Text, 56 | primary key (url) 57 | ); 58 | create table page_html ( 59 | url Text, 60 | created_at Timestamp, 61 | src Text, 62 | primary key (url, created_at) 63 | ); 64 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/NewsJsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news 2 | 3 | import crawler.module.news.enums.{SearchMethod, ItemSource} 4 | import crawler.util.http.TJsonSupport 5 | import org.json4s.Formats 6 | import org.json4s.ext.EnumNameSerializer 7 | 8 | /** 9 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 10 | */ 11 | trait NewsJsonSupport extends TJsonSupport { 12 | implicit val formats: Formats = defaultFormats + 13 | new EnumNameSerializer(ItemSource) + 14 | new EnumNameSerializer(SearchMethod) 15 | } 16 | 17 | object NewsJsonSupport extends NewsJsonSupport 18 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/NewsUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news 2 | 3 | import java.net.URI 4 | import java.util.concurrent.atomic.AtomicInteger 5 | 6 | /** 7 | * News Utils 8 | * Created by yangjing on 15-11-5. 
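 * 
 * `uriToBaseUri` keeps only the scheme/authority part of a URL, e.g. (illustrative):
 * {{{
 *   NewsUtils.uriToBaseUri("http://news.baidu.com/ns?word=abc")  // "http://news.baidu.com"
 * }}}
 * `getIndent` hands out a process-wide increasing integer, used to build unique actor names.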
9 | */ 10 | object NewsUtils { 11 | private val _nums = new AtomicInteger(0) 12 | 13 | def getIndent = _nums.getAndIncrement() 14 | 15 | def uriToBaseUri(uri: String): String = uriToBaseUri(URI.create(uri)) 16 | 17 | def uriToBaseUri(uri: URI): String = { 18 | val sb = new StringBuffer() 19 | if (uri.getScheme != null) { 20 | sb.append(uri.getScheme) 21 | sb.append(':') 22 | } 23 | if (uri.isOpaque) { 24 | sb.append(uri.getSchemeSpecificPart) 25 | } else { 26 | if (uri.getHost != null) { 27 | sb.append("//") 28 | if (uri.getUserInfo != null) { 29 | sb.append(uri.getUserInfo) 30 | sb.append('@') 31 | } 32 | val needBrackets = ((uri.getHost.indexOf(':') >= 0) 33 | && !uri.getHost.startsWith("[") 34 | && !uri.getHost.endsWith("]")) 35 | if (needBrackets) { 36 | sb.append('[') 37 | } 38 | sb.append(uri.getHost) 39 | if (needBrackets) sb.append(']') 40 | if (uri.getPort != -1) { 41 | sb.append(':') 42 | sb.append(uri.getPort) 43 | } 44 | } else if (uri.getAuthority != null) { 45 | sb.append("//") 46 | sb.append(uri.getAuthority) 47 | } 48 | } 49 | sb.toString 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/commands/Commands.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.commands 2 | 3 | import crawler.module.news.enums.{ItemSource, SearchMethod} 4 | import crawler.module.news.model.{NewsPageItem, SearchResult} 5 | 6 | import scala.concurrent.duration.FiniteDuration 7 | 8 | case class RequestSearchNews(sources: Seq[ItemSource.Value], msg: SearchNews) 9 | 10 | /** 11 | * 新闻源搜索 12 | * 13 | * @param key 关键词 14 | * @param method 搜索方式 15 | * @param duration 持续时间(超时) 16 | */ 17 | case class SearchNews(key: String, 18 | method: SearchMethod.Value, 19 | duration: FiniteDuration) 20 | 21 | /** 22 | * 开始搜索新闻 23 | */ 24 | case object StartSearchNews 25 | 26 | /** 27 | * 抓取搜索页 28 | */ 29 | case object StartFetchSearchPage 30 | 31 | /** 32 | * 搜索超时 33 | */ 34 | case object SearchTimeout 35 | 36 | /** 37 | * 搜索结果 38 | * 39 | * @param news 新闻结果 40 | */ 41 | case class SearchPageResult(news: SearchResult) 42 | 43 | /** 44 | * 搜索失败 45 | * 46 | * @param failure 失败结果 47 | */ 48 | case class SearchPageFailure(failure: Throwable) 49 | 50 | /** 51 | * 开始抓取新闻详情内容 52 | */ 53 | case object StartFetchItemPage 54 | 55 | /** 56 | * 新闻详情 57 | * 58 | * @param result 新闻详情 59 | */ 60 | case class ItemPageResult(result: Either[String, NewsPageItem]) 61 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/BaiduNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | import java.time.LocalDateTime 5 | import java.util.concurrent.TimeUnit 6 | 7 | import akka.util.Timeout 8 | import crawler.SystemUtils 9 | import crawler.module.news.enums.{SearchMethod, ItemSource} 10 | import crawler.module.news.model.{NewsItem, SearchResult} 11 | import crawler.util.http.HttpClient 12 | import crawler.util.news.contextextractor.ContentExtractor 13 | import crawler.util.time.TimeUtils 14 | import org.jsoup.Jsoup 15 | import org.jsoup.nodes.Element 16 | 17 | import scala.collection.JavaConverters._ 18 | import scala.concurrent.duration._ 19 | import scala.concurrent.{Await, ExecutionContext, Future, Promise} 20 | import scala.util.{Failure, Success} 21 | 22 | /** 23 | * 百度新闻爬虫 24 
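 * 
 * Fetches the Baidu News search page for a double-quoted keyword, follows up to PAGE_LIMIT result
 * pages (sleeping 500 ms between requests) and parses each `.result` block into a NewsItem.
 * A minimal usage sketch, assuming an ActorSystem dispatcher is in implicit scope:
 * {{{
 *   val baidu = new BaiduNews(HttpClient())
 *   baidu.fetchItemList("某公司名").foreach(r => println(s"${r.count} -> ${r.news.size}"))
 * }}}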
| * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 25 | */ 26 | class BaiduNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.baidu) { 27 | 28 | import crawler.util.JsoupImplicits._ 29 | 30 | override protected val defaultHeaders: Array[Seq[(String, String)]] = 31 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider")) 32 | 33 | private def parseNewsItem(news: Element): NewsItem = { 34 | val a = news.findByClass("c-title").first().getElementsByTag("a").first() 35 | val summary = news.findByClass("c-summary") 36 | val authorText = news.findByClass("c-author").text() 37 | val source = authorText.split("  ") 38 | val footer = summary.findByClass("c-info").first().text() 39 | NewsItem( 40 | a.text(), 41 | a.attr("href"), 42 | source.headOption.getOrElse(""), 43 | BaiduNews.dealTime(source.lastOption.getOrElse("")), 44 | summary.text().replace(authorText, "").replace(footer, "")) 45 | } 46 | 47 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 48 | val promise = Promise[Seq[NewsItem]]() 49 | 50 | val newsResultsFuture = fetchPage(BaiduNews.BAIDU_NEWS_BASE_URL.format(URLEncoder.encode('"' + key + '"', "UTF-8"))).map { resp => 51 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST) 52 | // logger.debug(doc.body().toString + "\n\n\n") 53 | val now = TimeUtils.now() 54 | if (doc.getElementById("noresult") != null) { 55 | SearchResult(newsSource, key, now, 0, Nil) 56 | } else { 57 | val countText = doc 58 | .getElementById("header_top_bar") 59 | .getElementsByAttributeValue("class", "nums") 60 | .first() 61 | .text() 62 | val count = 63 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt 64 | 65 | val newsDiv = doc.getElementById("content_left") 66 | val pages = doc.select("#page a").asScala 67 | val newsItemFutures = pages.take(BaiduNews.PAGE_LIMIT - 1).map { page => 68 | TimeUnit.MILLISECONDS.sleep(500) 69 | fetchPageLinks(BaiduNews.BAIDU_NEWS_HOST + page.attr("href")) 70 | } 71 | Future.sequence(newsItemFutures).map(_.flatten).onComplete { 72 | case Success(list) => 73 | promise.success(list) 74 | case Failure(e) => 75 | e.printStackTrace() 76 | promise.success(Nil) 77 | } 78 | 79 | SearchResult( 80 | newsSource, 81 | key, 82 | now, 83 | count, 84 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList) 85 | } 86 | } 87 | 88 | for { 89 | newsResult <- newsResultsFuture 90 | newsItems <- promise.future 91 | } yield { 92 | newsResult.copy(news = newsResult.news ++ newsItems) 93 | } 94 | } 95 | 96 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[NewsItem]] = { 97 | fetchPage(url).map { resp => 98 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduNews.BAIDU_NEWS_HOST) 99 | if (doc.getElementById("noresult") != null) { 100 | Nil 101 | } else { 102 | val newsDiv = doc.getElementById("content_left") 103 | newsDiv.findByClass("result").asScala.map(parseNewsItem).toList 104 | } 105 | } 106 | } 107 | } 108 | 109 | object BaiduNews { 110 | val PAGE_LIMIT = 5 111 | val BAIDU_NEWS_HOST = "http://news.baidu.com" 112 | val BAIDU_NEWS_BASE_URL = "http://news.baidu.com/ns?word=%s&tn=news&from=news&cl=2&rn=20&ct=1" 113 | val TIME_PATTERN = """\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}""".r 114 | val FEW_HOURS_PATTERN = """(\d+)小时前""".r 115 | 116 | private def dealFewHours(timeStr: String): String = { 117 | val matcher = FEW_HOURS_PATTERN.pattern.matcher(timeStr) 118 | if (matcher.matches()) matcher.group(1) else "" 119 
| } 120 | 121 | def dealTime(timeStr: String): Option[LocalDateTime] = { 122 | val dt = if (timeStr.length < 2) { 123 | LocalDateTime.now() 124 | } else if (TIME_PATTERN.pattern.matcher(timeStr).matches()) { 125 | val s = timeStr.replaceAll( """年|月""", "-").replace("日", "") 126 | LocalDateTime.parse(s, TimeUtils.formatterDateMinus) 127 | } else if (FEW_HOURS_PATTERN.pattern.matcher(timeStr).matches()) { 128 | val now = LocalDateTime.now() 129 | val hour = dealFewHours(timeStr).toLong 130 | now.minusHours(hour) 131 | } else { 132 | null 133 | } 134 | Option(dt) 135 | } 136 | 137 | //////////////////////////////////////////////////////////////////////////// 138 | // 以下为测试用例 139 | //////////////////////////////////////////////////////////////////////////// 140 | 141 | def run(newsCrawler: NewsCrawler, 142 | name: String, 143 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = { 144 | val newsResult = newsCrawler.fetchItemList(name) 145 | if (SearchMethod.A == method) { 146 | newsResult 147 | } else { 148 | newsResult.flatMap { result => 149 | val seqs = result.news.map { news => 150 | newsCrawler.fetchPage(news.url).map { resp => 151 | (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent) 152 | } 153 | } 154 | val f = Future.sequence(seqs) 155 | f.map { urlContents => 156 | val news = result.news.map { news => 157 | urlContents.find(_._1 == news.url) match { 158 | case Some((_, content)) => 159 | news.copy(content = Option(content)) 160 | case None => 161 | news 162 | } 163 | } 164 | result.copy(news = news) 165 | } 166 | } 167 | } 168 | } 169 | 170 | def main(args: Array[String]): Unit = { 171 | import SystemUtils._ 172 | implicit val timeout = Timeout(10.hours) 173 | import system.dispatcher 174 | 175 | val httpClient = HttpClient() 176 | val baidu = new BaiduNews(httpClient) 177 | val f = run(baidu, "杭州今元标矩科技有限公司", SearchMethod.F) 178 | val result = Await.result(f, timeout.duration) 179 | result.news.foreach(news => println(news.content + "\n\n")) 180 | println(result.count) 181 | 182 | system.shutdown() 183 | httpClient.close() 184 | system.awaitTermination() 185 | // System.exit(0) 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/CourtNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.time.LocalDate 4 | 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.{NewsItem, SearchResult} 7 | import crawler.util.Utils 8 | import crawler.util.http.HttpClient 9 | import crawler.util.time.TimeUtils 10 | import org.jsoup.Jsoup 11 | import org.jsoup.nodes.Element 12 | 13 | import scala.collection.JavaConverters._ 14 | import scala.concurrent.{ExecutionContext, Future} 15 | import scala.util.Random 16 | 17 | /** 18 | * 中国法院网新闻搜索 19 | * Created by yangjing on 15-11-9. 
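 * 
 * Unlike the other crawlers, this one POSTs a form (keyword plus a publish-date window from
 * 2002-01-01 to today) to the chinacourt.org search endpoint and parses the returned
 * `div.search_content dl` entries; the hit count is read from `div.search_br span`.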
20 | */ 21 | class CourtNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.court) { 22 | private def fetchPagePost(url: String, data: Seq[(String, String)]) = { 23 | val headers = defaultHeaders(Random.nextInt(defaultHeaders.length)) 24 | httpClient.post(url).header(headers: _*).addFormParam(data: _*).execute() 25 | } 26 | 27 | private def parseNewsItem(elem: Element) = { 28 | val a = elem.select("dt").select("a").first() 29 | val dds = elem.select("dd") 30 | val item = NewsItem( 31 | a.text(), 32 | CourtNews.SITE_URL + a.attr("href"), 33 | "中国法院网", 34 | Option(TimeUtils.toLocalDateTime(dds.last().text().split("    ").last)), 35 | dds.first().text()) 36 | // println(item) 37 | item 38 | } 39 | 40 | /** 41 | * 抓取搜索页 42 | * 43 | * @param key 搜索关键词 44 | * @return 45 | */ 46 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 47 | fetchPagePost(CourtNews.SEARCH_URL, Seq( 48 | "keyword" -> key, 49 | "button" -> "提交", 50 | "content_time_publish_begin" -> "2002-01-01", 51 | "content_time_publish_end" -> LocalDate.now().toString, 52 | "article_category_id" -> "", 53 | "content_author" -> "" 54 | )).map { resp => 55 | val now = TimeUtils.now() 56 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name), CourtNews.SITE_URL) 57 | val newsDl = doc.select("div.search_content").select("dl") 58 | if (newsDl.isEmpty) { 59 | SearchResult(newsSource, key, now, 0, Nil) 60 | } else { 61 | val newsItems = newsDl.asScala.map(parseNewsItem) 62 | val countText = doc.select("div.search_br").select("span").first().text 63 | val count = 64 | try { 65 | countText.toInt 66 | } catch { 67 | case e: Exception => 68 | logger.warn("count < 1: " + countText) 69 | 0 70 | } 71 | 72 | SearchResult(newsSource, key, now, count, newsItems) 73 | } 74 | } 75 | } 76 | } 77 | 78 | object CourtNews { 79 | val SITE_URL = "http://www.chinacourt.org" 80 | val SEARCH_URL = "http://www.chinacourt.org/article/search.shtml" 81 | } 82 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/HaosouNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | 5 | import crawler.module.news.NewsUtils 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.{NewsItem, SearchResult} 8 | import crawler.util.Utils 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.{ExecutionContext, Future} 16 | 17 | /** 18 | * 360好搜新闻搜索 19 | * Created by yangjing on 15-11-9. 
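 * 
 * Builds the query URL via `HaosouNews.searchUrl(key)`, parses the `ul#news li.res-list` entries
 * into NewsItems and reads the total hit count from `div#page span.nums`, falling back to the
 * number of parsed items when the count cannot be extracted.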
20 | */ 21 | class HaosouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.haosou) { 22 | private def parseItem(elem: Element) = { 23 | val a = elem.select("a") 24 | val newsInfo = elem.select("p.newsinfo") 25 | NewsItem( 26 | a.text(), 27 | a.attr("href"), 28 | newsInfo.select("span.sitename").text(), 29 | Option(TimeUtils.toLocalDateTime(newsInfo.select("span.posttime").attr("title"))), 30 | elem.select("p.content").text()) 31 | } 32 | 33 | /** 34 | * 抓取搜索页 35 | * 36 | * @param key 搜索关键词 37 | * @return 38 | */ 39 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 40 | fetchPage(HaosouNews.searchUrl(key)).map { resp => 41 | val doc = Jsoup.parse(resp.getResponseBody(Utils.CHARSET.name()), NewsUtils.uriToBaseUri(HaosouNews.SEARCH_SITE)) 42 | val now = TimeUtils.now() 43 | val ul = doc.select("ul#news") 44 | if (ul.isEmpty) { 45 | SearchResult(newsSource, key, now, 0, Nil) 46 | } else { 47 | val newsItems = ul.select("li.res-list").asScala.map(parseItem) 48 | val countText = doc.select("div#page").select("span.nums").text() 49 | val count = 50 | try { 51 | """\d+""".r.findAllMatchIn(countText).mkString.toInt 52 | } catch { 53 | case e: Exception => 54 | logger.warn("count < 1") 55 | newsItems.size 56 | } 57 | SearchResult(newsSource, key, now, count, newsItems) 58 | } 59 | } 60 | } 61 | } 62 | 63 | object HaosouNews { 64 | val SEARCH_SITE = "http://news.haosou.com" 65 | 66 | def searchUrl(key: String) = SEARCH_SITE + "/ns?q=%s".format(URLEncoder.encode(key, Utils.CHARSET.name())) 67 | 68 | } 69 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/NewsCrawler.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import com.typesafe.scalalogging.LazyLogging 4 | import crawler.module.news.NewsUtils 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.{NewsPageItem, SearchResult} 7 | import crawler.util.Crawler 8 | import crawler.util.news.contextextractor.ContentExtractor 9 | import org.jsoup.helper.DataUtil 10 | 11 | import scala.concurrent.{ExecutionContext, Future} 12 | 13 | /** 14 | * 新闻爬虫 15 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-03. 
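 * 
 * Concrete crawlers are registered once in a process-wide registry and looked up by source,
 * e.g. (as done in NewsRoute):
 * {{{
 *   NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient))
 *   NewsCrawler.getCrawler(ItemSource.baidu)      // Option[NewsCrawler]
 * }}}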
16 | */ 17 | abstract class NewsCrawler(val newsSource: ItemSource.Value) extends Crawler with LazyLogging { 18 | /** 19 | * 抓取搜索页 20 | * 21 | * @param key 搜索关键词 22 | * @return 23 | */ 24 | def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] 25 | 26 | /** 27 | * 抓取新闻详情页 28 | * 29 | * @param url 网页链接 30 | * @return 31 | */ 32 | def fetchNewsItem(url: String)(implicit ec: ExecutionContext): Future[NewsPageItem] = { 33 | fetchPage(url).map { resp => 34 | val in = resp.getResponseBodyAsStream 35 | val doc = DataUtil.load(in, null, NewsUtils.uriToBaseUri(url)) 36 | val src = doc.toString 37 | val news = ContentExtractor.getNewsByDoc(doc) 38 | NewsPageItem(url, src, news.getContent) 39 | } 40 | } 41 | 42 | } 43 | 44 | object NewsCrawler { 45 | private var _newsCrawler = Map.empty[ItemSource.Value, NewsCrawler] 46 | 47 | def registerCrawler(source: ItemSource.Value, newsCrawler: NewsCrawler): Unit = { 48 | _newsCrawler = _newsCrawler + (source -> newsCrawler) 49 | } 50 | 51 | def getCrawler(source: ItemSource.Value): Option[NewsCrawler] = _newsCrawler.get(source) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/SogouNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | 5 | import akka.util.Timeout 6 | import crawler.SystemUtils 7 | import crawler.module.news.enums.{ItemSource, SearchMethod} 8 | import crawler.module.news.model.{NewsItem, SearchResult} 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.{Await, ExecutionContext, Future} 16 | import scala.util.Try 17 | 18 | /** 19 | * 搜狗新闻搜索 20 | * 21 | * @param httpClient 22 | */ 23 | class SogouNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.sogou) { 24 | 25 | private def parseItem(elem: Element) = { 26 | val header = elem.select("h3.pt") 27 | val title = header.select("a.pp") 28 | val source = header.select("cite") match { 29 | case s if s.isEmpty => Array("", "") 30 | case s => s.text().split(SogouNews.CITE_SPLIT_CHAR) 31 | } 32 | val summary = elem.select("div.ft").text().replace( """>>\d+?条相同新闻""", "") 33 | 34 | NewsItem( 35 | title.text(), 36 | title.attr("href"), 37 | source(0), 38 | Option(TimeUtils.toLocalDateTime(source.tail.mkString(" "))), 39 | summary) 40 | } 41 | 42 | /** 43 | * 抓取搜索页 44 | * 45 | * @param key 搜索关键词 46 | * @return 47 | */ 48 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 49 | // val doc = fetchDocument(SogouCrawler.searchUrl(URLEncoder.encode(key, "UTF-8"))) 50 | fetchPage(SogouNews.searchUrl(URLEncoder.encode(key, "UTF-8"))).map { resp => 51 | val doc = Jsoup.parse(resp.getResponseBody, "http://news.sogou.com") 52 | val now = TimeUtils.now() 53 | // println(doc) 54 | val results = doc.select("div.results") 55 | if (results.isEmpty) { 56 | SearchResult(newsSource, key, now, 0, Nil) 57 | } else { 58 | val newsList = results.select("div.rb").asScala.map(parseItem) 59 | var count = Try( """\d+""".r.findAllMatchIn(doc.select("#pagebar_container").select("div.num").text()).mkString.toInt).getOrElse(0) 60 | if (count < 1) { 61 | logger.warn("count < 1") 62 | count = newsList.size 63 | } 64 | SearchResult(newsSource, key, now, count, 
newsList) 65 | } 66 | } 67 | } 68 | } 69 | 70 | object SogouNews { 71 | val REGEX = """\d+?条相同新闻""".r 72 | val CITE_SPLIT_CHAR = 160.toChar 73 | 74 | def searchUrl(key: String) = s"http://news.sogou.com/news?query=%22$key%22" 75 | 76 | //////////////////////////////////////////////////////////////////////////// 77 | // 以下为测试用例 78 | //////////////////////////////////////////////////////////////////////////// 79 | 80 | def run(newsCrawler: NewsCrawler, 81 | key: String, 82 | method: SearchMethod.Value)(implicit ec: ExecutionContext): Future[SearchResult] = { 83 | val newsResult = newsCrawler.fetchItemList(key) 84 | if (SearchMethod.A == method) { 85 | newsResult 86 | } else { 87 | newsResult.flatMap { result => 88 | val seqs = result.news.map { news => 89 | // newsCrawler.fetchPage(news.url).map { resp => 90 | // (news.url, ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8")).getContent) 91 | // } 92 | newsCrawler.fetchNewsItem(news.url) 93 | } 94 | val f = Future.sequence(seqs) 95 | f.map { pageItems => 96 | val news = result.news.map { news => 97 | pageItems.find(_.url == news.url) match { 98 | case Some(pageItem) => 99 | news.copy(content = Option(pageItem.content)) 100 | case None => 101 | news 102 | } 103 | } 104 | result.copy(news = news) 105 | } 106 | } 107 | } 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/crawlers/WechatNews.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import java.net.URLEncoder 4 | import java.time.Instant 5 | 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.{NewsItem, SearchResult} 8 | import crawler.util.Utils 9 | import crawler.util.http.HttpClient 10 | import crawler.util.time.TimeUtils 11 | import org.jsoup.Jsoup 12 | import org.jsoup.nodes.Element 13 | 14 | import scala.collection.JavaConverters._ 15 | import scala.concurrent.duration._ 16 | import scala.concurrent.{Await, ExecutionContext, Future} 17 | 18 | /** 19 | * 搜狗微信搜索 20 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10. 
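 * 
 * Scrapes Sogou's Weixin search. Result links are 302 redirects, so `find302Location` is used to
 * resolve the final article URL, and a `#seccodeImage` element in the response is treated as a
 * captcha page (an error message is returned instead of items). Registration of this crawler is
 * currently commented out in NewsRoute.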
21 | */ 22 | class WechatNews(val httpClient: HttpClient) extends NewsCrawler(ItemSource.wechat) { 23 | private def parseNewsItem(elem: Element) = { 24 | implicit val duration = 1.second 25 | 26 | try { 27 | val title = elem.select("h4") 28 | val footer = elem.select("div.s-p") 29 | val scriptStr = elem.select("script").last().text() 30 | val timeStr = """'(\d+?)'""".r.findFirstMatchIn(scriptStr).map(_.matched.replace("'", "")) 31 | val href = WechatNews.complateWeixinUrl(title.select("a").attr("href").trim) 32 | val url = Option(WechatNews.find302Location(href, requestHeaders())).getOrElse(href) 33 | NewsItem( 34 | title.text().trim, 35 | url, 36 | footer.select("a#weixin_account").attr("title"), 37 | Option(TimeUtils.toLocalDateTime(Instant.ofEpochSecond(timeStr.map(_.toLong).getOrElse(Instant.now().getEpochSecond)))), 38 | elem.select("p").text()) 39 | } catch { 40 | case e: Exception => 41 | logger.error(elem.toString) 42 | throw e 43 | } 44 | } 45 | 46 | /** 47 | * 抓取搜索页 48 | * 49 | * @param key 搜索关键词 50 | * @return 51 | */ 52 | override def fetchItemList(key: String)(implicit ec: ExecutionContext): Future[SearchResult] = { 53 | fetchPage(WechatNews.searchUrl(key)).map { response => 54 | response.getHeaders.entrySet().asScala.foreach { case entry => println(entry.getKey + ": " + entry.getValue.asScala) } 55 | 56 | val now = TimeUtils.now() 57 | val doc = Jsoup.parse(response.getResponseBody(Utils.CHARSET.name()), "http://weixin.sogou.com") 58 | println(doc) 59 | val results = doc.select("div.wx-rb") 60 | if (!doc.select("#seccodeImage").isEmpty) { 61 | SearchResult(newsSource, key, now, 0, Nil, Some(doc.select("div.content-box").select("p.p2").text())) 62 | } else if (results.isEmpty) { 63 | SearchResult(newsSource, key, now, 0, Nil) 64 | } else { 65 | val newsItems = results.asScala.map(parseNewsItem) 66 | val countText = doc.select("resnum#scd_num").text().replace(",", "").trim 67 | val count = 68 | try { 69 | countText.toInt 70 | } catch { 71 | case e: Exception => 72 | logger.warn("count < 1: " + countText, e) 73 | newsItems.size 74 | } 75 | SearchResult(newsSource, key, now, count, newsItems) 76 | } 77 | } 78 | } 79 | 80 | } 81 | 82 | object WechatNews { 83 | final val WEIXIN_SEARCH_PAGE = "http://weixin.sogou.com" 84 | 85 | def complateWeixinUrl(uri: String) = 86 | if (uri.startsWith("/")) WEIXIN_SEARCH_PAGE + uri else uri 87 | 88 | def searchUrl(key: String) = 89 | WEIXIN_SEARCH_PAGE + "/weixin?query=%s&type=2".format(URLEncoder.encode(key, Utils.CHARSET.name())) 90 | 91 | def find302Location(url: String, headers: Seq[(String, String)])(implicit duration: Duration) = { 92 | val client = HttpClient(false) 93 | try { 94 | val resp = Await.result(client.get(url).header(headers: _*).execute(), duration) 95 | resp.getHeader("Location") 96 | } catch { 97 | case e: Exception => 98 | try { 99 | val respose = Await.result(client.get(url).header(headers: _*).execute(), duration + 1.second) 100 | respose.getHeader("Location") 101 | } catch { 102 | case e: Exception => 103 | // do nothing 104 | null 105 | } 106 | } finally { 107 | client.close() 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/enums/ItemSource.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.enums 2 | 3 | /** 4 | * 新闻来源 5 | * Created by yangjing on 15-11-4. 
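 * 
 * `withToNames` turns a comma-separated query parameter into enum values, e.g. (illustrative):
 * {{{
 *   ItemSource.withToNames("baidu,sogou")  // Seq(baidu, sogou)
 *   ItemSource.withToNames("")             // all values
 *   ItemSource.withToNames("baidu,nope")   // unknown names are silently dropped
 * }}}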
6 | */ 7 | object ItemSource extends Enumeration { 8 | val baidu = Value 9 | val sogou = Value 10 | val haosou = Value 11 | val court = Value 12 | val wechat = Value 13 | 14 | def withToNames(source: String): Traversable[Value] = 15 | if (source == null || source.isEmpty) { 16 | ItemSource.values 17 | } else { 18 | source.split(',').toSeq.collect { 19 | case s if ItemSource.values.exists(_.toString == s) => 20 | ItemSource.withName(s) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/enums/SearchMethod.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.enums 2 | 3 | /** 4 | * 查找方式 5 | * Created by yangjing on 15-11-4. 6 | */ 7 | object SearchMethod extends Enumeration { 8 | // 取摘要 9 | val A = Value 10 | 11 | // 取全文 12 | val F = Value 13 | } 14 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import com.datastax.driver.core.{UDTValue, UserType} 6 | import crawler.module.news.NewsJsonSupport._ 7 | import crawler.util.time.TimeUtils 8 | import org.json4s.Extraction 9 | 10 | /** 11 | * 新闻详情 12 | * Created by yangjing on 15-11-3. 13 | */ 14 | case class NewsItem(title: String, 15 | url: String, 16 | // 新闻来源(站点) 17 | source: String, 18 | time: Option[LocalDateTime], 19 | // 摘要 20 | `abstract`: String, 21 | content: Option[String] = None, 22 | values: Seq[String] = Nil, 23 | error: Option[String] = None) { 24 | def jsonPretty = { 25 | val jv = Extraction.decompose(this) 26 | serialization.writePretty(jv) 27 | } 28 | } 29 | 30 | object NewsItem { 31 | def toUDTValue(userType: UserType, ni: NewsItem): UDTValue = { 32 | userType.newValue() 33 | .setString("title", ni.title) 34 | .setString("url", ni.url) 35 | .setString("source", ni.source) 36 | .setTimestamp("time", ni.time.map(TimeUtils.toDate).orNull) 37 | .setString("abstract", ni.`abstract`) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsPage.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | /** 6 | * 新闻页 7 | * Created by yangjing on 15-11-9. 8 | */ 9 | case class NewsPage(url: String, 10 | title: String, 11 | source: String, 12 | time: Option[LocalDateTime], 13 | `abstract`: String, 14 | content: String) 15 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/NewsPageItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | /** 4 | * 新闻页详情 5 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-05. 
6 | * @param url 网页链接 7 | * @param src 网页源码 8 | // * @param title 新闻标题 9 | // * @param time 发布时间 10 | * @param content 新闻内容 11 | */ 12 | case class NewsPageItem(url: String, 13 | src: String, 14 | // title: String, 15 | // time: String, 16 | content: String) 17 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/model/SearchResult.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import crawler.module.news.enums.ItemSource 6 | 7 | /** 8 | * 搜索结果 9 | * Created by yangjing on 15-11-3. 10 | */ 11 | case class SearchResult(source: ItemSource.Value, 12 | key: String, 13 | time: LocalDateTime, 14 | count: Int, 15 | news: Seq[NewsItem], 16 | error: Option[String] = None) 17 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsDBRepo.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import java.time.LocalDateTime 4 | 5 | import com.datastax.driver.core.{PreparedStatement, Session, UDTValue} 6 | import com.typesafe.scalalogging.LazyLogging 7 | import crawler.SystemUtils 8 | import crawler.module.news.enums.{ItemSource, SearchMethod} 9 | import crawler.module.news.model.{NewsItem, NewsPage, SearchResult} 10 | import crawler.util.persist.CassandraPersists 11 | import crawler.util.time.TimeUtils 12 | 13 | import scala.collection.JavaConverters._ 14 | import scala.collection.mutable 15 | import scala.concurrent.{ExecutionContextExecutor, Future} 16 | 17 | /** 18 | * News DB Service 19 | * Created by yangjing on 15-11-6. 20 | */ 21 | class NewsDBRepo extends LazyLogging { 22 | 23 | val KEYSPACE = SystemUtils.crawlerConfig.getString("cassandra.keyspace") 24 | val cachePrepares = mutable.Map.empty[String, PreparedStatement] 25 | 26 | private def findNews(key: String, 27 | source: ItemSource.Value, 28 | method: SearchMethod.Value, 29 | time: LocalDateTime)( 30 | implicit ec: ExecutionContextExecutor 31 | ): Future[Seq[SearchResult]] = { 32 | 33 | logger.debug(s"key: $key, source: $source, method: $method, time: $time") 34 | 35 | CassandraPersists.using(KEYSPACE) { implicit session => 36 | val stmt = getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? 
AND time > ?") 37 | val futureResultSet = session.executeAsync(stmt.bind(key, source.toString, TimeUtils.toDate(time))) 38 | val list = CassandraPersists.execute(futureResultSet) { rs => 39 | rs.asScala.map { row => 40 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt => 41 | NewsItem( 42 | udt.getString("title"), 43 | udt.getString("url"), 44 | udt.getString("source"), 45 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))), 46 | udt.getString("abstract")) 47 | ) 48 | 49 | val newsItemFuture = Future.sequence(news.map(news => 50 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content))))) 51 | 52 | newsItemFuture.map { newsList => 53 | SearchResult( 54 | ItemSource.withName(row.getString("source")), 55 | row.getString("key"), 56 | TimeUtils.toLocalDateTime(row.getTimestamp("time")), 57 | row.getInt("count"), 58 | newsList) 59 | } 60 | }.toList 61 | } 62 | 63 | list.flatMap(futures => Future.sequence(futures)) 64 | } 65 | } 66 | 67 | def findNews(key: String, 68 | sources: Traversable[ItemSource.Value], 69 | method: SearchMethod.Value, 70 | time: Option[LocalDateTime])( 71 | implicit ec: ExecutionContextExecutor 72 | ): Future[List[SearchResult]] = { 73 | 74 | val futureList = CassandraPersists.using(KEYSPACE) { implicit session => 75 | val pstmt = 76 | if (time.isEmpty) getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ?") 77 | else getPreparedStatement(session, "SELECT * FROM search_page WHERE key = ? AND source = ? AND time > ?") 78 | 79 | sources.flatMap { source => 80 | val stmt = 81 | if (time.isEmpty) pstmt.bind(key, source.toString) 82 | else pstmt.bind(key, source.toString, TimeUtils.toDate(time.get)) 83 | 84 | session.execute(stmt).asScala.map { row => 85 | val news = row.getList("news", classOf[UDTValue]).asScala.map(udt => 86 | NewsItem( 87 | udt.getString("title"), 88 | udt.getString("url"), 89 | udt.getString("source"), 90 | Option(TimeUtils.toLocalDateTime(udt.getTimestamp("time"))), 91 | udt.getString("abstract")) 92 | ) 93 | 94 | val newsItemFuture = Future.sequence(news.map(news => 95 | findOneNewsPageItem(news.url).map(nop => news.copy(content = nop.map(_.content))))) 96 | 97 | newsItemFuture.map(list => 98 | SearchResult( 99 | ItemSource.withName(row.getString("source")), 100 | row.getString("key"), 101 | TimeUtils.toLocalDateTime(row.getTimestamp("time")), 102 | row.getInt("count"), 103 | list) 104 | ) 105 | 106 | } 107 | }.toList 108 | 109 | } 110 | 111 | Future.sequence(futureList) 112 | } 113 | 114 | def findOneNewsPageItem(url: String)( 115 | implicit session: Session, ec: ExecutionContextExecutor 116 | ): Future[Option[NewsPage]] = { 117 | 118 | val stmt = getPreparedStatement(session, "SELECT * FROM news_page WHERE url = ?") 119 | CassandraPersists.execute(session.executeAsync(stmt.bind(url))) { rs => 120 | rs.one match { 121 | case null => 122 | None 123 | case row => 124 | Some(NewsPage( 125 | row.getString("url"), 126 | row.getString("title"), 127 | row.getString("source"), 128 | Option(TimeUtils.toLocalDateTime(row.getTimestamp("time"))), 129 | row.getString("abstract"), 130 | row.getString("content")) 131 | ) 132 | } 133 | } 134 | } 135 | 136 | def saveToNewsPage(page: NewsPage): Unit = { 137 | CassandraPersists.using(KEYSPACE) { session => 138 | val stmt = getPreparedStatement(session, 139 | "INSERT INTO news_page(url, title, source, time, abstract, content) VALUES(?, ?, ?, ?, ?, ?)") 140 | session.executeAsync(stmt.bind( 141 | page.url, 142 | page.title, 143 | 
page.source, 144 | page.time.map(TimeUtils.toDate).orNull, 145 | page.`abstract`, 146 | page.content)) 147 | } 148 | } 149 | 150 | def saveToSearchPage(newsResult: SearchResult) = { 151 | // logger.debug(newsResult.news.mkString("\n")) 152 | logger.info(s"key: ${newsResult.key} found news: ${newsResult.count}, saved: ${newsResult.news.size}") 153 | CassandraPersists.using(KEYSPACE) { session => 154 | val newsType = CassandraPersists.userType(KEYSPACE, "news_type") 155 | val stmt = getPreparedStatement(session, "INSERT INTO search_page(key, source, time, count, news) VALUES(?, ?, ?, ?, ?)") 156 | session.executeAsync(stmt.bind( 157 | newsResult.key, 158 | newsResult.source.toString, 159 | TimeUtils.toDate(newsResult.time), 160 | Integer.valueOf(newsResult.count), 161 | newsResult.news.map(n => NewsItem.toUDTValue(newsType, n)).asJava)) 162 | } 163 | } 164 | 165 | private def getPreparedStatement(session: Session, sql: String): PreparedStatement = { 166 | // println("sql: " + sql) 167 | cachePrepares.getOrElse(sql, { 168 | val p = session.prepare(sql) 169 | cachePrepares.put(sql, p) 170 | p 171 | }) 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsMaster.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.NewsUtils 5 | import crawler.module.news.commands.RequestSearchNews 6 | import crawler.module.news.service.actors.{NewsJob, PersistActor} 7 | import crawler.util.actors.MetricActor 8 | 9 | /** 10 | * News Supervisor 11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 12 | */ 13 | class NewsMaster extends MetricActor { 14 | val persistActor = context.actorOf(PersistActor.props, PersistActor.actorName) 15 | 16 | override val metricReceive: Receive = { 17 | case RequestSearchNews(sources, msg) => 18 | val doSender = sender() 19 | val newsJob = context.actorOf(NewsJob.props(sources, doSender), "news-" + NewsUtils.getIndent) 20 | newsJob ! msg 21 | } 22 | } 23 | 24 | object NewsMaster { 25 | val actorName = "news-master" 26 | 27 | def props = Props(new NewsMaster) 28 | } 29 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/NewsService.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import akka.pattern.ask 4 | import crawler.module.news.commands.{RequestSearchNews, SearchNews} 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.module.news.model.{NewsItem, SearchResult} 7 | import crawler.util.time.TimeUtils 8 | 9 | import scala.concurrent.Future 10 | import scala.concurrent.duration._ 11 | 12 | /** 13 | * 新闻服务 14 | * Created by yangjing on 15-11-3. 15 | */ 16 | class NewsService { 17 | 18 | import crawler.SystemUtils._ 19 | import system.dispatcher 20 | 21 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName) 22 | val dbRepo = new NewsDBRepo 23 | 24 | def fetchNewsApi(_key: String, 25 | sources: Traversable[ItemSource.Value], 26 | method: SearchMethod.Value, 27 | duration: FiniteDuration, 28 | forcedLatest: Boolean): Future[Seq[NewsItem]] = { 29 | fetchNews(_key, sources, method, duration, forcedLatest). 
30 | map(_.flatMap(_.news)) 31 | } 32 | 33 | def fetchNews(_key: String, 34 | sources: Traversable[ItemSource.Value], 35 | method: SearchMethod.Value, 36 | duration: FiniteDuration, 37 | forcedLatest: Boolean): Future[Seq[SearchResult]] = { 38 | val key = _key.trim 39 | val future = dbRepo.findNews(key, sources, method, if (forcedLatest) Some(TimeUtils.nowBegin()) else None) 40 | 41 | future.flatMap(results => 42 | if (results.isEmpty) { 43 | val msg = RequestSearchNews(sources.toSeq, SearchNews(key, method, duration)) 44 | // TODO 最长5分钟 45 | newsMaster.ask(msg)(5.minutes).mapTo[Seq[SearchResult]] 46 | } else { 47 | Future.successful(results) 48 | } 49 | ) 50 | } 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/ItemPageWorker.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.commands.{ItemPageResult, StartFetchItemPage} 5 | import crawler.module.news.crawlers.NewsCrawler 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.module.news.model.NewsItem 8 | import crawler.util.actors.MetricActor 9 | 10 | import scala.util.{Failure, Success} 11 | 12 | /** 13 | * 详情页面 14 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 15 | */ 16 | class ItemPageWorker(source: ItemSource.Value, newsItem: NewsItem) extends MetricActor { 17 | 18 | import context.dispatcher 19 | 20 | override val metricReceive: Receive = { 21 | case StartFetchItemPage => 22 | val doSender = sender() 23 | 24 | NewsCrawler.getCrawler(source) match { 25 | case Some(crawler) => 26 | crawler.fetchNewsItem(newsItem.url).onComplete { 27 | case Success(pageItem) => 28 | logger.debug(s"${newsItem.url} context OK") 29 | doSender ! ItemPageResult(Right(pageItem)) 30 | 31 | case Failure(e) => 32 | logger.warn(s"${newsItem.url} context extractor") 33 | e.printStackTrace() 34 | doSender ! ItemPageResult(Left(e.getLocalizedMessage)) 35 | } 36 | 37 | case None => 38 | doSender ! ItemPageResult(Left(s"Crawler $source not exists, ${newsItem.url} needed.")) 39 | } 40 | } 41 | 42 | } 43 | 44 | object ItemPageWorker { 45 | 46 | def props(source: ItemSource.Value, item: NewsItem) = Props(new ItemPageWorker(source, item)) 47 | 48 | } 49 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/NewsJob.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.{ActorRef, PoisonPill, Props} 4 | import crawler.module.news.commands.{SearchNews, StartSearchNews} 5 | import crawler.module.news.enums.ItemSource 6 | import crawler.module.news.model.SearchResult 7 | import crawler.util.actors.MetricActor 8 | 9 | /** 10 | * NewsJob 11 | * 成功返回: Seq[NewsResult] 12 | * Created by yangjing on 15-11-5. 13 | */ 14 | class NewsJob(sources: Seq[ItemSource.Value], reqSender: ActorRef) extends MetricActor { 15 | @volatile var _completeJobs = 0 16 | @volatile var _newsResults = List.empty[SearchResult] 17 | 18 | override val metricReceive: Receive = { 19 | case SearchNews(key, method, duration) => 20 | sources.foreach { source => 21 | val jobName = source.toString 22 | val jobActor = context.actorOf(NewsSourceJob.props(source, method, key, duration, self), jobName) 23 | jobActor ! 
StartSearchNews 24 | } 25 | 26 | case result: SearchResult => 27 | _completeJobs += 1 28 | _newsResults ::= result 29 | if (sources.size == _completeJobs) { 30 | reqSender ! _newsResults 31 | 32 | // TODO 把 NewsJob 内的超时判断上移到 NewsJob ? 33 | self ! PoisonPill 34 | } 35 | 36 | } 37 | } 38 | 39 | object NewsJob { 40 | def props(sources: Seq[ItemSource.Value], reqSender: ActorRef) = Props(new NewsJob(sources, reqSender)) 41 | } 42 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/NewsSourceJob.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.{ActorRef, Cancellable, PoisonPill, Props} 4 | import crawler.module.news.commands._ 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.module.news.model.SearchResult 7 | import crawler.module.news.service.NewsMaster 8 | import crawler.util.actors.MetricActor 9 | import crawler.util.time.TimeUtils 10 | 11 | import scala.concurrent.duration.FiniteDuration 12 | 13 | /** 14 | * 新闻job 15 | * 16 | * @param source 搜索源 17 | * @param method 搜索方式 18 | * @param key 搜索关键词 19 | * @param duration 持续时间,到期后向未获取完新闻数据向客户端返回Timeout。children actor继续业务处理 20 | * @param reqSender 请求actor 21 | */ 22 | class NewsSourceJob(source: ItemSource.Value, 23 | method: SearchMethod.Value, 24 | key: String, 25 | duration: FiniteDuration, 26 | reqSender: ActorRef) extends MetricActor { 27 | 28 | private val persistActor = context.actorSelection(context.system / NewsMaster.actorName / PersistActor.actorName) 29 | @volatile var _newsResult = SearchResult(source, "", TimeUtils.now(), 0, Nil) 30 | @volatile var _isTimeout: Boolean = false 31 | @volatile var _notCompleteItemPageActorNames = Seq.empty[String] 32 | @volatile var _cancelableSchedule: Cancellable = _ 33 | 34 | import context.dispatcher 35 | 36 | override def metricPreStart(): Unit = { 37 | // 定义超时时间 38 | _cancelableSchedule = context.system.scheduler.scheduleOnce(duration, self, SearchTimeout) 39 | } 40 | 41 | override def metricPostStop(): Unit = { 42 | if (!_cancelableSchedule.isCancelled) { 43 | _cancelableSchedule.cancel() 44 | } 45 | 46 | if (null != _newsResult && _newsResult.count > 0) { 47 | persistActor ! _newsResult 48 | } else { 49 | logger.warn(s"${self.path} [$key]未获取到相关数据: ${_newsResult.error}") 50 | } 51 | } 52 | 53 | override val metricReceive: Receive = { 54 | case s@StartSearchNews => 55 | val searchPage = context.actorOf(SearchPageWorker.props(source, key), "page") 56 | searchPage ! StartFetchSearchPage 57 | 58 | case SearchPageResult(newsResult) => 59 | _newsResult = newsResult 60 | method match { 61 | case SearchMethod.F if _newsResult.count > 0 => // 需要抓取详情内容 62 | _notCompleteItemPageActorNames = newsResult.news.zipWithIndex.map { case (item, idx) => 63 | val childName = "item-" + idx 64 | val itemPage = context.actorOf(ItemPageWorker.props(source, item), childName) 65 | itemPage ! StartFetchItemPage 66 | childName 67 | } 68 | 69 | case _ => // SearchMethod.S => // 只抓取摘要 70 | if (!_isTimeout) { 71 | reqSender ! _newsResult 72 | } 73 | self ! PoisonPill 74 | } 75 | 76 | case ItemPageResult(result) => 77 | val doSender = sender() 78 | println(doSender.path) 79 | _notCompleteItemPageActorNames = _notCompleteItemPageActorNames.filterNot(_ == doSender.path.name) 80 | result match { 81 | case Left(errMsg) => 82 | // TODO 解析新闻详情页失败! 
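          // Detail-page extraction failed: only log it. The item keeps its search-page
          // abstract in _newsResult and still counts toward completion, so one bad page
          // does not block the reply to reqSender.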
83 | logger.error(errMsg) 84 | 85 | case Right(pageItem) => 86 | // 更新 result.news 87 | val news = _newsResult.news.map { 88 | case oldItem if oldItem.url == pageItem.url => 89 | oldItem.copy(content = Option(pageItem.content)) 90 | 91 | case oldItem => 92 | oldItem 93 | } 94 | 95 | _newsResult = _newsResult.copy(news = news) 96 | } 97 | 98 | if (_notCompleteItemPageActorNames.isEmpty) { 99 | if (!_isTimeout) { 100 | reqSender ! _newsResult 101 | } 102 | self ! PoisonPill 103 | } 104 | 105 | case SearchTimeout => 106 | _isTimeout = true 107 | 108 | // 此时向调用客户端返回已存在的数据,但实际的新闻抓取流程仍将继续 109 | reqSender ! _newsResult //Left(new AskTimeoutException("搜索超时")) 110 | 111 | case SearchPageFailure(e) => 112 | logger.warn(self.path + " ", e) 113 | if (!_isTimeout) { 114 | reqSender ! SearchResult(source, key, TimeUtils.now(), 0, Nil, Some(e.getLocalizedMessage)) 115 | } 116 | self ! PoisonPill 117 | } 118 | 119 | } 120 | 121 | object NewsSourceJob { 122 | def props(source: ItemSource.Value, 123 | method: SearchMethod.Value, 124 | key: String, 125 | duration: FiniteDuration, 126 | reqSender: ActorRef) = 127 | Props(new NewsSourceJob(source, method, key, duration, reqSender)) 128 | } 129 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/PersistActor.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.model.{NewsPage, SearchResult} 5 | import crawler.module.news.service.NewsDBRepo 6 | import crawler.util.actors.MetricActor 7 | 8 | /** 9 | * 持久化 10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 11 | */ 12 | class PersistActor extends MetricActor { 13 | val dbRepo = new NewsDBRepo 14 | 15 | override val metricReceive: Receive = { 16 | case newsResult: SearchResult => 17 | dbRepo.saveToSearchPage(newsResult) 18 | 19 | newsResult.news.foreach { item => 20 | val page = NewsPage(item.url, item.title, item.source, item.time, item.`abstract`, item.content.getOrElse("")) 21 | dbRepo.saveToNewsPage(page) 22 | } 23 | } 24 | 25 | } 26 | 27 | object PersistActor { 28 | val BATCH_SIZE = 20 29 | val actorName = "persist" 30 | 31 | def props = Props(new PersistActor) 32 | } 33 | -------------------------------------------------------------------------------- /module-news/src/main/scala/crawler/module/news/service/actors/SearchPageWorker.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import akka.actor.Props 4 | import crawler.module.news.commands.{SearchPageFailure, SearchPageResult, StartFetchSearchPage} 5 | import crawler.module.news.crawlers.NewsCrawler 6 | import crawler.module.news.enums.ItemSource 7 | import crawler.util.actors.MetricActor 8 | 9 | import scala.util.{Failure, Success} 10 | 11 | /** 12 | * 搜索页面 13 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-06. 14 | */ 15 | class SearchPageWorker(source: ItemSource.Value, key: String) extends MetricActor { 16 | 17 | import context.dispatcher 18 | 19 | override val metricReceive: Receive = { 20 | case StartFetchSearchPage => 21 | val doSender = sender() 22 | 23 | NewsCrawler.getCrawler(source) match { 24 | case Some(crawler) => 25 | crawler.fetchItemList(key).onComplete { 26 | case Success(result) => 27 | doSender ! SearchPageResult(result) 28 | stop() 29 | 30 | case Failure(e) => 31 | doSender ! 
SearchPageFailure(e) 32 | stop() 33 | } 34 | 35 | case None => 36 | doSender ! SearchPageFailure(new RuntimeException(s"Crawler $source not exists")) 37 | stop() 38 | } 39 | } 40 | 41 | private def stop(): Unit = context.stop(self) 42 | } 43 | 44 | object SearchPageWorker { 45 | 46 | def props(source: ItemSource.Value, name: String) = Props(new SearchPageWorker(source, name)) 47 | 48 | } -------------------------------------------------------------------------------- /module-news/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/BaiduNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03. 12 | */ 13 | class BaiduNewsTest extends ServiceSpec { 14 | 15 | implicit val timeout = Timeout(30.seconds) 16 | 17 | "BaiduNewsTest" should { 18 | 19 | "fetchNewsList" in { 20 | val baidu = new BaiduNews(HttpClient()) 21 | val result = Await.result(baidu.fetchItemList("阿里巴巴"), timeout.duration) 22 | result.news.foreach(println) 23 | println(result.source + " " + result.key) 24 | println(result.news.size) 25 | result.news must not be empty 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/CourtNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | class CourtNewsTest extends ServiceSpec { 11 | 12 | val timeout = Timeout(30.seconds) 13 | 14 | "CourtNewsTest" should { 15 | "fetchNewsList" in { 16 | val court = new CourtNews(HttpClient()) 17 | val result = Await.result(court.fetchItemList("重庆"), timeout.duration) 18 | result.news.foreach(println) 19 | println(result.key) 20 | result.news must not be empty 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/HaosouNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Created by yangjing on 15-11-9. 
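 * Integration test: runs a live Haosou news search via HttpClient, so network access is required.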
12 | */ 13 | class HaosouNewsTest extends ServiceSpec { 14 | 15 | implicit val timeout = Timeout(30.seconds) 16 | 17 | "HaosouCrawlerTest" should { 18 | 19 | "fetchNewsList" in { 20 | val haosou = new HaosouNews(HttpClient()) 21 | val result = Await.result(haosou.fetchItemList("誉存科技"), timeout.duration) 22 | result.news.foreach(println) 23 | println(result.source + " " + result.key) 24 | result.news must not be empty 25 | } 26 | 27 | } 28 | 29 | override implicit def patienceConfig: PatienceConfig = super.patienceConfig 30 | } 31 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/crawlers/WechatNewsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.crawlers 2 | 3 | import akka.util.Timeout 4 | import crawler.testsuite.ServiceSpec 5 | import crawler.util.http.HttpClient 6 | 7 | import scala.concurrent.Await 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * Wechat News Test 12 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-11-10. 13 | */ 14 | class WechatNewsTest extends ServiceSpec { 15 | 16 | implicit val timeout = Timeout(30.seconds) 17 | "WechatNewsTest" should { 18 | 19 | "fetchNewsList" in { 20 | val wechat = new WechatNews(HttpClient()) 21 | val f = wechat.fetchItemList("成都念念科技有限公司") 22 | val result = Await.result(f, timeout.duration) 23 | result.news.foreach(println) 24 | println(result.count + " " + result.key) 25 | result.news must not be empty 26 | } 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/service/NewsDBRepoTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import crawler.module.news.enums.{ItemSource, SearchMethod} 6 | import crawler.testsuite.ServiceSpec 7 | import crawler.util.time.TimeUtils 8 | 9 | class NewsDBRepoTest extends ServiceSpec { 10 | 11 | "NewsDBRepoTest" should { 12 | val dbRepo = new NewsDBRepo 13 | 14 | "findNews" in { 15 | val result = dbRepo.findNews("阿里巴巴", Seq(ItemSource.baidu), SearchMethod.F, Some(TimeUtils.nowBegin())) 16 | val list = result.futureValue 17 | println(list) 18 | list must not be empty 19 | 20 | TimeUnit.SECONDS.sleep(5) 21 | } 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /module-news/src/test/scala/crawler/module/news/service/actors/NewsJobMasterTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.news.service.actors 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import akka.pattern.ask 6 | import akka.util.Timeout 7 | import crawler.SystemUtils 8 | import crawler.module.news.commands.{SearchNews, RequestSearchNews} 9 | import crawler.module.news.crawlers.{BaiduNews, NewsCrawler} 10 | import crawler.module.news.enums.{SearchMethod, ItemSource} 11 | import crawler.module.news.model.SearchResult 12 | import crawler.module.news.service.NewsMaster 13 | import crawler.testsuite.ServiceSpec 14 | 15 | import scala.concurrent.duration._ 16 | 17 | /** 18 | * NewsMasterTest 19 | * Created by yangjing on 15-11-5. 
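 * Sends RequestSearchNews(baidu) through a NewsMaster instance and expects the aggregated Seq[SearchResult] reply.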
20 | */ 21 | class NewsJobMasterTest extends ServiceSpec { 22 | 23 | implicit val timeout = Timeout(60.seconds) 24 | 25 | "NewsMasterTest" should { 26 | NewsCrawler.registerCrawler(ItemSource.baidu, new BaiduNews(SystemUtils.httpClient)) 27 | 28 | "news-master" in { 29 | val sources = Seq(ItemSource.baidu) 30 | val newsMaster = system.actorOf(NewsMaster.props, NewsMaster.actorName) 31 | val msg = RequestSearchNews(sources, SearchNews("杭州誉存科技有限公司", SearchMethod.F, 3.seconds)) 32 | 33 | val f = (newsMaster ? msg).mapTo[Seq[SearchResult]] 34 | 35 | f onSuccess { case list => 36 | list.foreach(println) 37 | list.size mustBe 1 38 | } 39 | 40 | f onFailure { case e => 41 | println("Failure: " + e) 42 | } 43 | 44 | TimeUnit.SECONDS.sleep(20) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/BaiduSite.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | import java.net.URLEncoder 4 | import java.time.LocalDateTime 5 | import java.util.concurrent.TimeUnit 6 | 7 | import com.typesafe.scalalogging.LazyLogging 8 | import crawler.module.site.model.{SearchRequest, SiteItem, SiteResult} 9 | import crawler.util.Crawler 10 | import crawler.util.http.HttpClient 11 | import crawler.util.time.TimeUtils 12 | import org.jsoup.Jsoup 13 | import org.jsoup.nodes.Element 14 | 15 | import scala.collection.JavaConverters._ 16 | import scala.concurrent.{ExecutionContext, Future, Promise} 17 | import scala.util.{Failure, Success} 18 | 19 | /** 20 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 21 | */ 22 | class BaiduSite(val httpClient: HttpClient, 23 | searchRequest: SearchRequest) extends Crawler with LazyLogging { 24 | 25 | import BaiduSite._ 26 | 27 | override protected val defaultHeaders: Array[Seq[(String, String)]] = 28 | super.defaultHeaders.map(headers => headers :+ ("User-Agent" -> "Baiduspider")) 29 | 30 | val values = searchRequest.params.map(_.value) 31 | 32 | /** 33 | * 抓取搜索页 34 | * 35 | * @return 36 | */ 37 | def fetchItemList()(implicit ec: ExecutionContext): Future[SiteResult] = { 38 | val promise = Promise[Seq[SiteItem]]() 39 | val key = searchRequest.toParam 40 | 41 | val url = BAIDU_SITE_BASE_URL.format(URLEncoder.encode(key, "UTF-8")) 42 | logger.info(s"key: $key, url: $url") 43 | 44 | val newsResultsFuture = fetchPage(url).flatMap { resp => 45 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BAIDU_SITE_HOST).getElementById("wrapper_wrapper") 46 | val now = TimeUtils.now() 47 | val contentNone = doc.select(".content_none") 48 | 49 | if (!contentNone.isEmpty) { 50 | promise.success(Nil) 51 | Future.successful(SiteResult(ITEM_SOURCE, key, now, 0, Nil)) 52 | } else { 53 | val wrapper = doc 54 | val countText = wrapper 55 | .select(".head_nums_cont_outer.OP_LOG") 56 | .select(".nums") 57 | .text() 58 | val count = 59 | """\d+""".r.findAllMatchIn(countText).map(_.matched).mkString.toInt 60 | 61 | val itemDiv = doc.getElementById("content_left") 62 | val itemResults = itemDiv.select(".result.c-container").asScala 63 | 64 | val pages = doc.select("#page a").asScala 65 | val newsItemFutures = pages.take(PAGE_LIMIT - 1).map { page => 66 | TimeUnit.MILLISECONDS.sleep(100) 67 | fetchPageLinks(BAIDU_SITE_HOST + page.attr("href")) 68 | } 69 | 70 | Future.sequence(newsItemFutures).map(_.flatten).onComplete { 71 | case Success(list) => 72 | promise.success(list) 73 | case Failure(e) => 74 | 
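              // Fetching the follow-up result pages failed: log the error and complete the
              // promise with Nil so the first page's items are still returned.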
e.printStackTrace() 75 | promise.success(Nil) 76 | } 77 | 78 | Future.sequence(itemResults.map(parseSiteItem)) 79 | .map(items => SiteResult(ITEM_SOURCE, key, now, count, items)) 80 | } 81 | } 82 | 83 | for { 84 | newsResult <- newsResultsFuture 85 | newsItems <- promise.future 86 | } yield { 87 | newsResult.copy(items = newsResult.items ++ newsItems) 88 | } 89 | } 90 | 91 | def fetchPageLinks(url: String)(implicit ec: ExecutionContext): Future[Seq[SiteItem]] = { 92 | fetchPage(url).flatMap { resp => 93 | val doc = Jsoup.parse(resp.getResponseBodyAsStream, "UTF-8", BaiduSite.BAIDU_SITE_HOST) 94 | if (doc.getElementById("content_none") != null) { 95 | Future.successful(Nil) 96 | } else { 97 | val itemDiv = doc.getElementById("content_left") 98 | val itemResults = itemDiv.select(".result.c-container").asScala 99 | val futures = itemResults.map(parseSiteItem) 100 | Future.sequence(futures) 101 | } 102 | } 103 | } 104 | 105 | def parseSiteItem(elem: Element)(implicit ec: ExecutionContext): Future[SiteItem] = { 106 | val link = elem.select(".t").select("a").first() 107 | val href = link.attr("href") 108 | 109 | extractPageUrl(href).map { url => 110 | val title = link.text() 111 | 112 | val sourceHostDesc = elem.select(".f13 a").first().text() 113 | val source = sourceHostDesc.take(sourceHostDesc.indexOf('/')) 114 | 115 | val abstractElem = elem.select(".c-abstract") 116 | val summary = abstractElem.asScala.filterNot(e => e.attr("class").contains("newTimeFactor_before_abs")).map(_.text()).mkString 117 | val time = BaiduSite.dealTime(abstractElem.select(".newTimeFactor_before_abs").text()) 118 | 119 | SiteItem(title, url, source, time, summary, values) 120 | } 121 | } 122 | 123 | def extractPageUrl(href: String): Future[String] = { 124 | implicit val ec = ExecutionContext.Implicits.global 125 | 126 | if (searchRequest.followUrl) { 127 | HttpClient.find302Location(httpClient, href, requestHeaders()).map(v => if (v == null) href else v) 128 | } else { 129 | Future.successful(href) 130 | } 131 | } 132 | 133 | } 134 | 135 | object BaiduSite { 136 | // 抓取前5页 137 | val PAGE_LIMIT = 5 138 | 139 | val BAIDU_SITE_BASE_URL = "https://www.baidu.com/s?wd=%s&rsv_spt=1&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_n=2&rsv_sug3=1" 140 | 141 | val BAIDU_SITE_HOST = "https://www.baidu.com" 142 | 143 | val TIME_PATTERN = """(\d{4})年(\d{1,2})月(\d{1,2})日""".r 144 | 145 | val ITEM_SOURCE = "baiduSite" 146 | 147 | def dealTime(timeStr: String): Option[LocalDateTime] = timeStr.substring(0, timeStr.indexOf('日') + 1) match { 148 | case TIME_PATTERN(year, month, day) => Some(LocalDateTime.of(year.toInt, month.toInt, day.toInt, 0, 0)) 149 | case _ => None 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/QueryCond.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | /** 4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 5 | */ 6 | object QueryCond extends Enumeration { 7 | val - = Value("-") 8 | val + = Value("+") 9 | } 10 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/SearchSyntax.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | /** 4 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-20. 
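 * Search-operator names (intitle / insite / inurl) used by SearchParam when building a query string.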
5 | */ 6 | object SearchSyntax { 7 | final val Intitle = "intitle" 8 | final val Insite = "insite" 9 | final val Inurl = "inurl" 10 | } 11 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SearchRequest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import crawler.module.site.QueryCond 4 | 5 | /** 6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 7 | */ 8 | case class SearchRequest(params: Seq[SearchParam], followUrl: Boolean = true) { 9 | 10 | def toParam = params.map(_.toParam).mkString(" ") 11 | 12 | } 13 | 14 | case class SearchParam(value: String, 15 | syntax: Option[String] = None, 16 | cond: Option[QueryCond.Value] = None, 17 | filetypeDoc: Seq[String] = Nil, 18 | strict: Boolean = true) { 19 | 20 | def toParam = 21 | syntax.map(v => if (strict) s"""$v:"$value"""" else s"$v:$value") orElse 22 | cond.map(v => v + value) getOrElse 23 | value 24 | 25 | } 26 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SiteItem.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | import org.json4s.Extraction 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 9 | */ 10 | case class SiteItem(title: String, 11 | url: String, 12 | // 新闻来源(站点) 13 | source: String, 14 | time: Option[LocalDateTime], 15 | // 摘要 16 | `abstract`: String, 17 | values: Seq[String] = Nil) { 18 | 19 | def jsonPretty = { 20 | import crawler.util.http.TJsonSupport._ 21 | val jv = Extraction.decompose(this) 22 | serialization.writePretty(jv) 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /module-site-search/src/main/scala/crawler/module/site/model/SiteResult.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site.model 2 | 3 | import java.time.LocalDateTime 4 | 5 | /** 6 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-22. 7 | */ 8 | case class SiteResult(source: String, 9 | key: String, 10 | time: LocalDateTime, 11 | count: Int, 12 | items: Seq[SiteItem], 13 | error: Option[String] = None) 14 | -------------------------------------------------------------------------------- /module-site-search/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %date - [%level] - from %logger in %thread %n%message%n%xException%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /module-site-search/src/test/scala/crawler/module/site/BaiduSiteTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.module.site 2 | 3 | import akka.util.Timeout 4 | import crawler.module.site.model.{SearchParam, SearchRequest} 5 | import crawler.testsuite.ServiceSpec 6 | import crawler.util.http.HttpClient 7 | 8 | import scala.concurrent.Await 9 | import scala.concurrent.duration._ 10 | 11 | /** 12 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 
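 * Live test: builds a SearchRequest with an intitle parameter and fetches result pages from Baidu.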
13 | */ 14 | class BaiduSiteTest extends ServiceSpec { 15 | 16 | implicit val timeout = Timeout(30.seconds) 17 | 18 | "BaiduSiteTest" should { 19 | 20 | "fetchItemList" in { 21 | val requestParams = SearchRequest( 22 | SearchParam("晋渝地产", Some(SearchSyntax.Intitle)) :: 23 | // SearchParam("阿里巴巴kakakakaak", Some(SearchSyntax.Intitle)) :: 24 | // SearchParam("失信", syntax = Some(SearchSyntax.Intitle), strict = false) :: 25 | Nil 26 | ) 27 | val baidu = new BaiduSite(HttpClient(), requestParams) 28 | 29 | val key = requestParams.toParam 30 | val f = baidu.fetchItemList() 31 | val result = Await.result(f, timeout.duration) 32 | result.items.foreach(v => println(v.jsonPretty)) 33 | println(result.items.size) 34 | result.items must not be empty 35 | } 36 | 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import _root_.sbt.Keys._ 2 | import _root_.sbt._ 3 | import com.typesafe.sbt.SbtNativePackager.{Linux, Debian} 4 | import com.typesafe.sbt.packager.Keys._ 5 | import com.typesafe.sbt.packager.archetypes.JavaServerAppPackaging 6 | import com.typesafe.sbt.packager.universal.UniversalPlugin.autoImport._ 7 | import sbtassembly.AssemblyKeys._ 8 | import sbtassembly.{MergeStrategy, PathList} 9 | 10 | object Build extends Build { 11 | 12 | import BuildSettings._ 13 | 14 | val DependsConfigure = "test->test;compile->compile" 15 | 16 | override lazy val settings = super.settings :+ { 17 | shellPrompt := (s => Project.extract(s).currentProject.id + " > ") 18 | } 19 | 20 | lazy val root = Project("crawler-high-search", file(".")) 21 | .aggregate( 22 | appApi, 23 | crawlerSiteSearch, 24 | moduleSiteSearch, moduleNews, 25 | util) 26 | 27 | /////////////////////////////////////////////////////////////// 28 | // projects 29 | /////////////////////////////////////////////////////////////// 30 | lazy val packageDebianProd = taskKey[File]("creates deb-prod package") 31 | lazy val appApi = Project("app-api", file("app-api")) 32 | .enablePlugins(JavaServerAppPackaging) 33 | .dependsOn(moduleSiteSearch % DependsConfigure, moduleNews % DependsConfigure, util % DependsConfigure) 34 | .settings(basicSettings: _*) 35 | .settings( 36 | description := "app-api", 37 | 38 | packageDescription := "一个高级异步多线程实时爬虫API", 39 | mainClass in Compile := Some("crawler.app.Main"), 40 | maintainer in Linux := "Jing Yang ", 41 | packageSummary in Linux := "Crawler High Search API", 42 | daemonUser in Linux := "nobody", 43 | bashScriptConfigLocation := Some("${app_home}/../conf/jvmopts"), 44 | bashScriptExtraDefines += """addJava "-Dlogback.configurationFile=${app_home}/../conf/logback.xml"""", 45 | 46 | // |; bashScriptExtraDefines := Seq("addJava \"-Dconfig.file=${app_home}/../conf/application.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\"") 47 | addCommandAlias("packageProd", 48 | """; clean 49 | |; bashScriptExtraDefines += "addJava \"-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml\"" 50 | |; packageDebianProd 51 | """.stripMargin), 52 | packageDebianProd := { 53 | bashScriptExtraDefines += """addJava "-Dconfig.file=${app_home}/../conf/application-test.conf -Dlogback.configurationFile=${app_home}/../conf/logback.xml"""" 54 | val output = baseDirectory.value / "package" / "deb-prod.deb" 55 | val debianFile = (packageBin in Debian).value 56 | IO.move(debianFile, output) 57 | output 58 | }, 
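    // packageDebianProd (above) packages the app with sbt-native-packager and moves the
    // resulting .deb to <baseDirectory>/package/deb-prod.deb; the commented block below is
    // the earlier sbt-assembly fat-jar configuration.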
59 | 60 | // assemblyJarName in assembly := "crawler-app.jar", 61 | // mappings in Universal <<= (mappings in Universal, assembly in Compile) map { (mappings, fatJar) => 62 | // val filtered = mappings filter { case (file, name) => !name.endsWith(".jar") } 63 | // filtered :+ (fatJar -> ("lib/" + fatJar.getName)) 64 | // }, 65 | // test in assembly := {}, 66 | // assemblyMergeStrategy in assembly := { 67 | // case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.discard 68 | // case x => 69 | // val oldStrategy = (assemblyMergeStrategy in assembly).value 70 | // oldStrategy(x) 71 | // }, 72 | 73 | libraryDependencies ++= Seq( 74 | _akkaHttp) 75 | ) 76 | 77 | lazy val crawlerSiteSearch = Project("crawler-site-search", file("crawler-site-search")) 78 | .dependsOn(moduleSiteSearch % DependsConfigure, util % DependsConfigure) 79 | .settings(basicSettings: _*) 80 | .settings( 81 | description := "crawler-site-search", 82 | libraryDependencies ++= Seq( 83 | _activemqSTOMP, 84 | _cassandraDriverCore, 85 | _mongoScala) 86 | ) 87 | 88 | lazy val moduleSiteSearch = Project("module-site-search", file("module-site-search")) 89 | .dependsOn(util % DependsConfigure) 90 | .settings(basicSettings: _*) 91 | .settings( 92 | description := "module-site-search" 93 | ) 94 | 95 | lazy val moduleNews = Project("module-news", file("module-news")) 96 | .dependsOn(util % DependsConfigure) 97 | .settings(basicSettings: _*) 98 | .settings( 99 | description := "module-news", 100 | libraryDependencies ++= Seq( 101 | _cassandraDriverCore, 102 | _akkaActor) 103 | ) 104 | 105 | lazy val util = Project("util", file("util")) 106 | .settings(basicSettings: _*) 107 | .settings( 108 | description := "util", 109 | libraryDependencies ++= Seq( 110 | _activemqSTOMP % "provided", 111 | _cassandraDriverCore % "provided", 112 | _mongoScala % "provided", 113 | _akkaHttp % "provided", 114 | _akkaStream, 115 | _json4sJackson, 116 | _json4sExt, 117 | _scalaLogging, 118 | _asyncHttpClient, 119 | _jsoup, 120 | _akkaActor, 121 | _akkaSlf4j, 122 | _logbackClassic) 123 | ) 124 | 125 | } 126 | -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | object BuildSettings { 5 | 6 | lazy val basicSettings = Seq( 7 | version := "0.0.1", 8 | homepage := Some(new URL("https://github.com/yangbajing/crawler-service")), 9 | organization := "cn.socialcredits.crawler", 10 | organizationHomepage := Some(new URL("https://github.com/yangbajing/crawler-service")), 11 | startYear := Some(2015), 12 | scalaVersion := "2.11.7", 13 | scalacOptions := Seq( 14 | "-encoding", "utf8", 15 | "-unchecked", 16 | "-feature", 17 | "-deprecation" 18 | ), 19 | javacOptions := Seq( 20 | "-encoding", "utf8", 21 | "-Xlint:unchecked", 22 | "-Xlint:deprecation" 23 | ), 24 | resolvers ++= Seq( 25 | "snapshots" at "http://oss.sonatype.org/content/repositories/snapshots", 26 | "releases" at "http://oss.sonatype.org/content/repositories/releases", 27 | "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/", 28 | "Typesafe Snapshots" at "http://repo.typesafe.com/typesafe/snapshots/"), 29 | libraryDependencies ++= Seq( 30 | _scalaReflect, 31 | _scalatest 32 | ), 33 | offline := true, 34 | fork := true 35 | ) 36 | 37 | lazy val noPublishing = Seq( 38 | publish :=(), 39 | publishLocal :=() 40 | ) 41 | 42 | val verAkka = "2.3.14" 43 | val _akkaActor = "com.typesafe.akka" 
%% "akka-actor" % verAkka 44 | val _akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % verAkka 45 | 46 | lazy val _scalaReflect = "org.scala-lang" % "scala-reflect" % "2.11.7" 47 | 48 | val verAkkaHttp = "2.0.2" 49 | lazy val _akkaStream = ("com.typesafe.akka" %% "akka-stream-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor") 50 | lazy val _akkaHttp = ("com.typesafe.akka" %% "akka-http-experimental" % verAkkaHttp).exclude("com.typesafe.akka", "akka-actor") 51 | 52 | lazy val _scalatest = "org.scalatest" %% "scalatest" % "2.2.5" % "test" 53 | 54 | lazy val _scalaLogging = ("com.typesafe.scala-logging" %% "scala-logging" % "3.1.0").exclude("org.scala-lang", "scala-reflect").exclude("org.slf4j", "slf4j-api") 55 | 56 | lazy val _mongoScala = ("org.mongodb.scala" %% "mongo-scala-driver" % "1.1.0").exclude("com.typesafe.akka", "akka-actor") 57 | 58 | lazy val varJson4s = "3.3.0" 59 | lazy val _json4sJackson = "org.json4s" %% "json4s-jackson" % varJson4s 60 | lazy val _json4sExt = "org.json4s" %% "json4s-ext" % varJson4s 61 | 62 | lazy val _jsoup = "org.jsoup" % "jsoup" % "1.8.3" 63 | 64 | lazy val _asyncHttpClient = ("com.ning" % "async-http-client" % "1.9.31").exclude("io.netty", "netty") 65 | 66 | lazy val _logbackClassic = "ch.qos.logback" % "logback-classic" % "1.1.3" 67 | 68 | lazy val _cassandraDriverCore = "com.datastax.cassandra" % "cassandra-driver-core" % "2.2.0-rc3" 69 | 70 | lazy val _activemqSTOMP = "org.apache.activemq" % "activemq-stomp" % "5.13.0" 71 | 72 | } 73 | 74 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.9 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | 3 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.7.5") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.0") 6 | 7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.0.6") 8 | -------------------------------------------------------------------------------- /project/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangjing/crawler-service/462c198f0ea22cc673d59a2e725628712f96f79b/project/sbt-launch.jar -------------------------------------------------------------------------------- /sbt: -------------------------------------------------------------------------------- 1 | SCRIPT_DIR=`dirname $0` 2 | java -Xmx1024M -Dsbt.override.build.repos=true -Dfile.encoding=UTF-8 -XX:+CMSClassUnloadingEnabled -jar "$SCRIPT_DIR/project/sbt-launch.jar" $@ 3 | -------------------------------------------------------------------------------- /util/src/main/java/crawler/util/news/contextextractor/ContentExtractor.java: -------------------------------------------------------------------------------- 1 | package crawler.util.news.contextextractor; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.nodes.Node; 7 | import org.jsoup.nodes.TextNode; 8 | import org.jsoup.select.Elements; 9 | import org.jsoup.select.NodeVisitor; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import java.util.ArrayList; 14 | import java.util.HashMap; 15 | import 
java.util.Map; 16 | import java.util.concurrent.atomic.AtomicInteger; 17 | import java.util.regex.Matcher; 18 | import java.util.regex.Pattern; 19 | 20 | /** 21 | * Created by yangjing on 15-11-3. 22 | */ 23 | public class ContentExtractor { 24 | 25 | public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class); 26 | 27 | protected Document doc; 28 | 29 | ContentExtractor(Document doc) { 30 | this.doc = doc; 31 | } 32 | 33 | protected HashMap infoMap = new HashMap(); 34 | 35 | class CountInfo { 36 | 37 | int textCount = 0; 38 | int linkTextCount = 0; 39 | int tagCount = 0; 40 | int linkTagCount = 0; 41 | double density = 0; 42 | double densitySum = 0; 43 | double score = 0; 44 | int pCount = 0; 45 | ArrayList leafList = new ArrayList(); 46 | 47 | } 48 | 49 | protected void clean() { 50 | doc.select("script,noscript,style,iframe,br").remove(); 51 | } 52 | 53 | protected CountInfo computeInfo(Node node) { 54 | 55 | if (node instanceof Element) { 56 | Element tag = (Element) node; 57 | 58 | CountInfo countInfo = new CountInfo(); 59 | for (Node childNode : tag.childNodes()) { 60 | CountInfo childCountInfo = computeInfo(childNode); 61 | countInfo.textCount += childCountInfo.textCount; 62 | countInfo.linkTextCount += childCountInfo.linkTextCount; 63 | countInfo.tagCount += childCountInfo.tagCount; 64 | countInfo.linkTagCount += childCountInfo.linkTagCount; 65 | countInfo.leafList.addAll(childCountInfo.leafList); 66 | countInfo.densitySum += childCountInfo.density; 67 | countInfo.pCount += childCountInfo.pCount; 68 | } 69 | countInfo.tagCount++; 70 | String tagName = tag.tagName(); 71 | if (tagName.equals("a")) { 72 | countInfo.linkTextCount = countInfo.textCount; 73 | countInfo.linkTagCount++; 74 | } else if (tagName.equals("p")) { 75 | countInfo.pCount++; 76 | } 77 | 78 | int pureLen = countInfo.textCount - countInfo.linkTextCount; 79 | int len = countInfo.tagCount - countInfo.linkTagCount; 80 | if (pureLen == 0 || len == 0) { 81 | countInfo.density = 0; 82 | } else { 83 | countInfo.density = (pureLen + 0.0) / len; 84 | } 85 | 86 | infoMap.put(tag, countInfo); 87 | 88 | return countInfo; 89 | } else if (node instanceof TextNode) { 90 | TextNode tn = (TextNode) node; 91 | CountInfo countInfo = new CountInfo(); 92 | String text = tn.text(); 93 | int len = text.length(); 94 | countInfo.textCount = len; 95 | countInfo.leafList.add(len); 96 | return countInfo; 97 | } else { 98 | return new CountInfo(); 99 | } 100 | } 101 | 102 | protected double computeScore(Element tag) { 103 | CountInfo countInfo = infoMap.get(tag); 104 | double var = Math.sqrt(computeVar(countInfo.leafList) + 1); 105 | double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2); 106 | return score; 107 | } 108 | 109 | protected double computeVar(ArrayList data) { 110 | if (data.size() == 0) { 111 | return 0; 112 | } 113 | if (data.size() == 1) { 114 | return data.get(0) / 2; 115 | } 116 | double sum = 0; 117 | for (Integer i : data) { 118 | sum += i; 119 | } 120 | double ave = sum / data.size(); 121 | sum = 0; 122 | for (Integer i : data) { 123 | sum += (i - ave) * (i - ave); 124 | } 125 | sum = sum / data.size(); 126 | return sum; 127 | } 128 | 129 | public Element getContentElement() throws Exception { 130 | clean(); 131 | computeInfo(doc.body()); 132 | double maxScore = 0; 133 | Element content = null; 134 | for (Map.Entry entry : infoMap.entrySet()) { 135 | Element tag = entry.getKey(); 136 | if 
(tag.tagName().equals("a") || tag == doc.body()) { 137 | continue; 138 | } 139 | double score = computeScore(tag); 140 | if (score > maxScore) { 141 | maxScore = score; 142 | content = tag; 143 | } 144 | } 145 | if (content == null) { 146 | throw new Exception("extraction failed"); 147 | } 148 | return content; 149 | } 150 | 151 | public News getNews() throws Exception { 152 | News news = new News(); 153 | Element contentElement; 154 | try { 155 | contentElement = getContentElement(); 156 | news.setContentElement(contentElement); 157 | } catch (Exception ex) { 158 | // LOG.info("news content extraction failed,extraction abort", ex); 159 | throw new Exception(ex); 160 | } 161 | 162 | if (doc.baseUri() != null) { 163 | news.setUrl(doc.baseUri()); 164 | } 165 | 166 | // try { 167 | // news.setTime(getTime(contentElement)); 168 | // } catch (Exception ex) { 169 | //// LOG.info("news title extraction failed", ex); 170 | // } 171 | 172 | // try { 173 | // news.setTitle(getTitle(contentElement)); 174 | // } catch (Exception ex) { 175 | // LOG.info("title extraction failed", ex); 176 | // } 177 | return news; 178 | } 179 | 180 | protected String getTime(Element contentElement) throws Exception { 181 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"; 182 | Pattern pattern = Pattern.compile(regex); 183 | Element current = contentElement; 184 | for (int i = 0; i < 2; i++) { 185 | if (current != null && current != doc.body()) { 186 | Element parent = current.parent(); 187 | if (parent != null) { 188 | current = parent; 189 | } 190 | } 191 | } 192 | for (int i = 0; i < 6; i++) { 193 | if (current == null) { 194 | break; 195 | } 196 | String currentHtml = current.outerHtml(); 197 | Matcher matcher = pattern.matcher(currentHtml); 198 | if (matcher.find()) { 199 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6); 200 | } 201 | if (current != doc.body()) { 202 | current = current.parent(); 203 | } 204 | } 205 | 206 | try { 207 | return getDate(contentElement); 208 | } catch (Exception ex) { 209 | ex.printStackTrace(); 210 | // throw new Exception("time not found"); 211 | return ""; 212 | } 213 | 214 | } 215 | 216 | protected String getDate(Element contentElement) throws Exception { 217 | String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"; 218 | Pattern pattern = Pattern.compile(regex); 219 | Element current = contentElement; 220 | for (int i = 0; i < 2; i++) { 221 | if (current != null && current != doc.body()) { 222 | Element parent = current.parent(); 223 | if (parent != null) { 224 | current = parent; 225 | } 226 | } 227 | } 228 | for (int i = 0; i < 6; i++) { 229 | if (current == null) { 230 | break; 231 | } 232 | String currentHtml = current.outerHtml(); 233 | Matcher matcher = pattern.matcher(currentHtml); 234 | if (matcher.find()) { 235 | return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3); 236 | } 237 | if (current != doc.body()) { 238 | current = current.parent(); 239 | } 240 | } 241 | throw new Exception("date not found"); 242 | } 243 | 244 | protected double strSim(String a, String b) { 245 | int len1 = a.length(); 246 | int len2 = b.length(); 247 | if (len1 == 0 || len2 == 0) { 248 | return 0; 249 | } 250 | double ratio; 251 | if (len1 > len2) { 252 | ratio = (len1 + 0.0) / len2; 253 | } else { 254 | ratio = (len2 + 0.0) / len1; 255 | } 
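        // Strings whose lengths differ by a factor of 3 or more are treated as dissimilar;
        // otherwise similarity is the LCS length normalized by the longer string.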
256 | if (ratio >= 3) { 257 | return 0; 258 | } 259 | return (lcs(a, b) + 0.0) / Math.max(len1, len2); 260 | } 261 | 262 | protected String getTitle(final Element contentElement) throws Exception { 263 | final ArrayList titleList = new ArrayList(); 264 | final ArrayList titleSim = new ArrayList(); 265 | final AtomicInteger contentIndex = new AtomicInteger(); 266 | final String metaTitle = doc.title().trim(); 267 | if (!metaTitle.isEmpty()) { 268 | doc.body().traverse(new NodeVisitor() { 269 | @Override 270 | public void head(Node node, int i) { 271 | if (node instanceof Element) { 272 | Element tag = (Element) node; 273 | if (tag == contentElement) { 274 | contentIndex.set(titleList.size()); 275 | return; 276 | } 277 | String tagName = tag.tagName(); 278 | if (Pattern.matches("h[1-6]", tagName)) { 279 | String title = tag.text().trim(); 280 | double sim = strSim(title, metaTitle); 281 | titleSim.add(sim); 282 | titleList.add(tag); 283 | } 284 | } 285 | } 286 | 287 | @Override 288 | public void tail(Node node, int i) { 289 | } 290 | }); 291 | int index = contentIndex.get(); 292 | if (index > 0) { 293 | double maxScore = 0; 294 | int maxIndex = -1; 295 | for (int i = 0; i < index; i++) { 296 | double score = (i + 1) * titleSim.get(i); 297 | if (score > maxScore) { 298 | maxScore = score; 299 | maxIndex = i; 300 | } 301 | } 302 | if (maxIndex != -1) { 303 | return titleList.get(maxIndex).text(); 304 | } 305 | } 306 | } 307 | 308 | Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]"); 309 | if (titles.size() > 0) { 310 | String title = titles.first().text(); 311 | if (title.length() > 5 && title.length() < 40) { 312 | return titles.first().text(); 313 | } 314 | } 315 | try { 316 | return getTitleByEditDistance(contentElement); 317 | } catch (Exception ex) { 318 | throw new Exception("title not found"); 319 | } 320 | 321 | } 322 | 323 | protected String getTitleByEditDistance(Element contentElement) throws Exception { 324 | final String metaTitle = doc.title(); 325 | 326 | final ArrayList max = new ArrayList(); 327 | max.add(0.0); 328 | final StringBuilder sb = new StringBuilder(); 329 | doc.body().traverse(new NodeVisitor() { 330 | 331 | public void head(Node node, int i) { 332 | 333 | if (node instanceof TextNode) { 334 | TextNode tn = (TextNode) node; 335 | String text = tn.text().trim(); 336 | double sim = strSim(text, metaTitle); 337 | if (sim > 0) { 338 | if (sim > max.get(0)) { 339 | max.set(0, sim); 340 | sb.setLength(0); 341 | sb.append(text); 342 | } 343 | } 344 | 345 | } 346 | } 347 | 348 | public void tail(Node node, int i) { 349 | } 350 | }); 351 | if (sb.length() > 0) { 352 | return sb.toString(); 353 | } 354 | throw new Exception(); 355 | 356 | } 357 | 358 | protected int lcs(String x, String y) { 359 | 360 | int M = x.length(); 361 | int N = y.length(); 362 | if (M == 0 || N == 0) { 363 | return 0; 364 | } 365 | int[][] opt = new int[M + 1][N + 1]; 366 | 367 | for (int i = M - 1; i >= 0; i--) { 368 | for (int j = N - 1; j >= 0; j--) { 369 | if (x.charAt(i) == y.charAt(j)) { 370 | opt[i][j] = opt[i + 1][j + 1] + 1; 371 | } else { 372 | opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]); 373 | } 374 | } 375 | } 376 | 377 | return opt[0][0]; 378 | 379 | } 380 | 381 | protected int editDistance(String word1, String word2) { 382 | int len1 = word1.length(); 383 | int len2 = word2.length(); 384 | 385 | int[][] dp = new int[len1 + 1][len2 + 1]; 386 | 387 | for (int i = 0; i <= len1; i++) { 388 | dp[i][0] = i; 389 | } 390 | 391 | for (int j = 
0; j <= len2; j++) { 392 | dp[0][j] = j; 393 | } 394 | 395 | for (int i = 0; i < len1; i++) { 396 | char c1 = word1.charAt(i); 397 | for (int j = 0; j < len2; j++) { 398 | char c2 = word2.charAt(j); 399 | 400 | if (c1 == c2) { 401 | dp[i + 1][j + 1] = dp[i][j]; 402 | } else { 403 | int replace = dp[i][j] + 1; 404 | int insert = dp[i][j + 1] + 1; 405 | int delete = dp[i + 1][j] + 1; 406 | 407 | int min = replace > insert ? insert : replace; 408 | min = delete > min ? min : delete; 409 | dp[i + 1][j + 1] = min; 410 | } 411 | } 412 | } 413 | 414 | return dp[len1][len2]; 415 | } 416 | 417 | /*输入Jsoup的Document,获取正文所在Element*/ 418 | public static Element getContentElementByDoc(Document doc) throws Exception { 419 | ContentExtractor ce = new ContentExtractor(doc); 420 | return ce.getContentElement(); 421 | } 422 | 423 | /*输入HTML,获取正文所在Element*/ 424 | public static Element getContentElementByHtml(String html) throws Exception { 425 | Document doc = Jsoup.parse(html); 426 | return getContentElementByDoc(doc); 427 | } 428 | 429 | /*输入HTML和URL,获取正文所在Element*/ 430 | public static Element getContentElementByHtml(String html, String url) throws Exception { 431 | Document doc = Jsoup.parse(html, url); 432 | return getContentElementByDoc(doc); 433 | } 434 | 435 | /*输入URL,获取正文所在Element*/ 436 | // public static Element getContentElementByUrl(String url) throws Exception { 437 | // HttpRequest request = new HttpRequest(url); 438 | // String html = request.getResponse().getHtmlByCharsetDetect(); 439 | // return getContentElementByHtml(html, url); 440 | // } 441 | 442 | /*输入Jsoup的Document,获取正文文本*/ 443 | public static String getContentByDoc(Document doc) throws Exception { 444 | ContentExtractor ce = new ContentExtractor(doc); 445 | return ce.getContentElement().text(); 446 | } 447 | 448 | /*输入HTML,获取正文文本*/ 449 | public static String getContentByHtml(String html) throws Exception { 450 | Document doc = Jsoup.parse(html); 451 | return getContentElementByDoc(doc).text(); 452 | } 453 | 454 | /*输入HTML和URL,获取正文文本*/ 455 | public static String getContentByHtml(String html, String url) throws Exception { 456 | Document doc = Jsoup.parse(html, url); 457 | return getContentElementByDoc(doc).text(); 458 | } 459 | 460 | /*输入URL,获取正文文本*/ 461 | // public static String getContentByUrl(String url) throws Exception { 462 | // HttpRequest request = new HttpRequest(url); 463 | // String html = request.getResponse().getHtmlByCharsetDetect(); 464 | // return getContentByHtml(html, url); 465 | // } 466 | 467 | /*输入Jsoup的Document,获取结构化新闻信息*/ 468 | public static News getNewsByDoc(Document doc) throws Exception { 469 | ContentExtractor ce = new ContentExtractor(doc); 470 | return ce.getNews(); 471 | } 472 | 473 | /*输入HTML,获取结构化新闻信息*/ 474 | public static News getNewsByHtml(String html) throws Exception { 475 | Document doc = Jsoup.parse(html); 476 | return getNewsByDoc(doc); 477 | } 478 | 479 | /*输入HTML和URL,获取结构化新闻信息*/ 480 | public static News getNewsByHtml(String html, String url) throws Exception { 481 | Document doc = Jsoup.parse(html, url); 482 | return getNewsByDoc(doc); 483 | } 484 | 485 | /*输入URL,获取结构化新闻信息*/ 486 | // public static News getNewsByUrl(String url) throws Exception { 487 | // HttpRequest request = new HttpRequest(url); 488 | // String html = request.getResponse().getHtmlByCharsetDetect(); 489 | // return getNewsByHtml(html, url); 490 | // } 491 | 492 | public static void main(String[] args) throws Exception { 493 | 494 | // News news = ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html"); 495 | 
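        // Offline usage sketch (hypothetical HTML input; getNewsByHtml is defined above):
        //   News n = ContentExtractor.getNewsByHtml("<html>...</html>", "http://example.com/a");
        //   System.out.println(n.getContent());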
// System.out.println(news.getUrl()); 496 | // System.out.println(news.getTitle()); 497 | // System.out.println(news.getTime()); 498 | // System.out.println(news.getContent()); 499 | //System.out.println(news.getContentElement()); 500 | 501 | //System.out.println(news); 502 | } 503 | 504 | } -------------------------------------------------------------------------------- /util/src/main/java/crawler/util/news/contextextractor/News.java: -------------------------------------------------------------------------------- 1 | package crawler.util.news.contextextractor; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * Created by yangjing on 15-11-3. 7 | */ 8 | public class News { 9 | 10 | protected String url = null; 11 | // protected String title = null; 12 | protected String content = null; 13 | // protected String time = null; 14 | 15 | protected Element contentElement = null; 16 | 17 | public String getUrl() { 18 | return url; 19 | } 20 | 21 | public void setUrl(String url) { 22 | this.url = url; 23 | } 24 | 25 | // public String getTitle() { 26 | // return title; 27 | // } 28 | // 29 | // public void setTitle(String title) { 30 | // this.title = title; 31 | // } 32 | 33 | public String getContent() { 34 | if (content == null) { 35 | if (contentElement != null) { 36 | content = contentElement.text(); 37 | } 38 | } 39 | return content; 40 | } 41 | 42 | 43 | public void setContent(String content) { 44 | this.content = content; 45 | } 46 | 47 | // public String getTime() { 48 | // return time; 49 | // } 50 | // 51 | // public void setTime(String time) { 52 | // this.time = time; 53 | // } 54 | 55 | @Override 56 | public String toString() { 57 | return "URL:\n" + url + /*"\nTITLE:\n" + title + "\nTIME:\n" + time +*/ "\nCONTENT:\n" + getContent() + "\nCONTENT(SOURCE):\n" + contentElement; 58 | } 59 | 60 | public Element getContentElement() { 61 | return contentElement; 62 | } 63 | 64 | public void setContentElement(Element contentElement) { 65 | this.contentElement = contentElement; 66 | } 67 | 68 | 69 | } -------------------------------------------------------------------------------- /util/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loggers = ["akka.event.slf4j.Slf4jLogger"] 3 | loglevel = INFO 4 | log-dead-letters = off 5 | log-dead-letters-during-shutdown = off 6 | fork-join-executor { 7 | parallelism-factor = 3.0 8 | parallelism-min = 16 9 | parallelism-max = 64 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/SystemUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler 2 | 3 | import java.util.concurrent.TimeoutException 4 | 5 | import akka.actor.ActorSystem 6 | import akka.stream.ActorMaterializer 7 | import com.ning.http.client.AsyncHttpClientConfig 8 | import com.typesafe.config.ConfigFactory 9 | import com.typesafe.scalalogging.StrictLogging 10 | import crawler.util.http.HttpClient 11 | 12 | import scala.concurrent.duration._ 13 | 14 | /** 15 | * System Utils 16 | * Created by yangjing on 15-11-5. 
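 * Holds the shared ActorSystem, ActorMaterializer and a tuned AsyncHttpClient, plus a
 * shutdown() helper that closes the HTTP client and terminates the actor system.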
17 | */ 18 | object SystemUtils extends StrictLogging { 19 | val crawlerConfig = ConfigFactory.load().getConfig("crawler") 20 | 21 | implicit val system = ActorSystem(crawlerConfig.getString("akka-system-name")) 22 | implicit val materializer = ActorMaterializer() 23 | 24 | val httpClient = { 25 | crawlerConfig.getConfig("http-client") 26 | val builder = new AsyncHttpClientConfig.Builder() 27 | builder.setMaxConnections(8192) 28 | builder.setMaxConnectionsPerHost(4) 29 | builder.setConnectTimeout(10 * 1000) 30 | builder.setPooledConnectionIdleTimeout(40 * 1000) 31 | builder.setRequestTimeout(90 * 1000) 32 | builder.setAllowPoolingConnections(true) 33 | builder.setFollowRedirect(true) 34 | HttpClient(builder.build(), Nil) 35 | } 36 | 37 | def shutdown(): Unit = { 38 | httpClient.close() 39 | system.shutdown() 40 | try { 41 | system.awaitTermination(5.seconds) 42 | System.exit(0) 43 | } catch { 44 | case e: TimeoutException => 45 | logger.error(e.getLocalizedMessage, e) 46 | System.exit(3) 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/Crawler.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import crawler.util.http.HttpClient 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by Yang Jing (yangbajing@gmail.com) on 2016-01-18. 9 | */ 10 | trait Crawler { 11 | val httpClient: HttpClient 12 | 13 | protected def defaultHeaders = Array( 14 | Seq( 15 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36", 16 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 17 | "Accept-Encoding" -> "gzip, deflate, sdch", 18 | "Accept-Language" -> "zh-CN,zh;q=0.8,en;q=0.6", 19 | "Connection" -> "keep-alive" 20 | ), 21 | Seq( 22 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7", 23 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" 24 | ), 25 | Seq( 26 | "User-Agent" -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0", 27 | "Accept" -> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 28 | "Accept-Encoding" -> "gzip, deflate", 29 | "Accept-Language" -> "en-US,en;q=0.5", 30 | "Connection" -> "keep-alive" 31 | ) 32 | ) 33 | 34 | def requestHeaders() = defaultHeaders(Random.nextInt(defaultHeaders.length)) 35 | 36 | def fetchPage(url: String) = { 37 | httpClient.get(url).setFollowRedirects(true).header(requestHeaders(): _*).execute() 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/JsoupImplicits.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import org.jsoup.nodes.Element 4 | import org.jsoup.select.Elements 5 | 6 | /** 7 | * Jsoup 相关辅助方法 8 | * Created by yangjing on 15-11-3. 
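 * A small usage sketch (the HTML snippet and class names are illustrative placeholders):
 * {{{
 *   import crawler.util.JsoupImplicits._
 *   import org.jsoup.Jsoup
 *
 *   val doc    = Jsoup.parse("""<div class="news"><a class="title">t1</a><a class="title">t2</a></div>""")
 *   val titles = doc.findByClass("news").findByClass("title")  // chained lookups over Element and Elements
 * }}}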
9 | */ 10 | object JsoupImplicits { 11 | 12 | implicit class JsoupElementFindByClassname(element: Element) { 13 | def findByClass(cn: String): Elements = { 14 | element.getElementsByClass(cn) 15 | } 16 | } 17 | 18 | implicit class JsoupElementsFindByClassname(elements: Elements) { 19 | def findByClass(cn: String): Elements = { 20 | val list = new java.util.LinkedList[Element]() 21 | val iter = elements.iterator() 22 | while (iter.hasNext) { 23 | val elements = iter.next().getElementsByClass(cn) 24 | list.addAll(elements) 25 | } 26 | new Elements(list) 27 | } 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package crawler.util 2 | 3 | import java.lang.management.ManagementFactory 4 | import java.nio.charset.Charset 5 | 6 | import crawler.util.time.TimeUtils 7 | 8 | /** 9 | * Utils 10 | * Created by Yang Jing (yangbajing@gmail.com) on 2015-12-03. 11 | */ 12 | object Utils { 13 | val CHARSET = Charset.forName("UTF-8") 14 | 15 | def getPid = { 16 | val runtime = ManagementFactory.getRuntimeMXBean 17 | runtime.getName.split('@')(0) 18 | } 19 | 20 | def lastYearPeriods(): Seq[Int] = { 21 | val now = TimeUtils.now() 22 | val (curMonth, curYear, preYear) = (now.getMonthValue, now.getYear * 100, now.getYear * 100 - 100) 23 | (curMonth + 1 to 12).map(preYear + _) ++ (1 to curMonth).map(curYear + _) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/actors/MetricActor.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.actors 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | 5 | import akka.actor.Actor 6 | import com.typesafe.scalalogging.LazyLogging 7 | 8 | /** 9 | * Metric Actor 10 | * Created by yangjing on 15-11-4. 
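 * A minimal sketch of a concrete subclass (EchoActor is a hypothetical example, not part of this module):
 * {{{
 *   class EchoActor extends MetricActor {
 *     override val metricReceive: Receive = {
 *       case msg: String => sender() ! msg
 *     }
 *   }
 * }}}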
11 | */ 12 | trait MetricActor extends Actor with LazyLogging { 13 | final override def preStart(): Unit = { 14 | logger.trace(s"${self.path} preStart") 15 | MetricActor.incrementActorSize() 16 | metricPreStart() 17 | } 18 | 19 | final override def postStop(): Unit = { 20 | metricPostStop() 21 | MetricActor.decrementActorSize() 22 | logger.trace(s"${self.path} postStop") 23 | } 24 | 25 | final override def receive: Receive = { 26 | case s => 27 | if (metricReceive.isDefinedAt(s)) { 28 | logger.trace(s"${self.path} receive message: $s") 29 | metricReceive(s) 30 | } else { 31 | logger.warn(s"${self.path} receive message: $s") 32 | unhandled(s) 33 | } 34 | } 35 | 36 | def metricPreStart(): Unit = () 37 | 38 | def metricPostStop(): Unit = () 39 | 40 | val metricReceive: Receive 41 | 42 | } 43 | 44 | object MetricActor { 45 | private val _currentActiveActors = new AtomicInteger(0) 46 | 47 | def incrementActorSize() = _currentActiveActors.incrementAndGet() 48 | 49 | def decrementActorSize() = _currentActiveActors.decrementAndGet() 50 | 51 | def currentActorSize() = _currentActiveActors.get() 52 | } 53 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/http/HttpClient.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.http 2 | 3 | import com.ning.http.client._ 4 | import com.ning.http.client.cookie.Cookie 5 | import com.ning.http.client.multipart.Part 6 | import com.typesafe.config.Config 7 | 8 | import scala.concurrent.{ExecutionContext, Future, Promise} 9 | import scala.util.{Failure, Success} 10 | 11 | class HttpClientBuilder(builder: AsyncHttpClient#BoundRequestBuilder) { 12 | 13 | def queryParam(params: (String, String)*) = { 14 | params.foreach { case (name, value) => builder.addQueryParam(name, value) } 15 | this 16 | } 17 | 18 | def header(headers: (String, String)*) = { 19 | headers.foreach { case (name, value) => builder.addHeader(name, value) } 20 | this 21 | } 22 | 23 | def cookie(cookie: Cookie) = { 24 | builder.addCookie(cookie) 25 | this 26 | } 27 | 28 | def part(part: Part) = { 29 | builder.addBodyPart(part) 30 | this 31 | } 32 | 33 | def addFormParam(params: (String, String)*) = { 34 | params.foreach { case (key, value) => builder.addFormParam(key, value) } 35 | this 36 | } 37 | 38 | def setFollowRedirects(followRedirects: Boolean) = { 39 | builder.setFollowRedirects(followRedirects) 40 | this 41 | } 42 | 43 | def execute(): Future[Response] = { 44 | val promise = Promise[Response]() 45 | try { 46 | builder.execute(new AsyncCompletionHandler[Unit] { 47 | override def onCompleted(response: Response): Unit = { 48 | // println(response.getStatusCode + ": " + response.getStatusText) 49 | promise.success(response) 50 | } 51 | 52 | override def onThrowable(t: Throwable): Unit = { 53 | promise.failure(t) 54 | } 55 | }) 56 | } catch { 57 | case e: Throwable => 58 | promise.failure(e) 59 | } 60 | promise.future 61 | } 62 | 63 | } 64 | 65 | /** 66 | * HttpClient 67 | * Created by yangjing on 15-11-3. 
68 | */ 69 | class HttpClient private(config: AsyncHttpClientConfig, 70 | defaultHeaders: Iterable[(String, String)]) { 71 | 72 | private val client = new AsyncHttpClient(config) 73 | 74 | def close() = client.close() 75 | 76 | def get(url: String) = new HttpClientBuilder(client.prepareGet(url)) 77 | 78 | def post(url: String) = new HttpClientBuilder(client.preparePost(url)) 79 | 80 | def delete(url: String) = new HttpClientBuilder(client.prepareDelete(url)) 81 | 82 | def put(url: String) = new HttpClientBuilder(client.preparePut(url)) 83 | } 84 | 85 | object HttpClient { 86 | def apply(): HttpClient = apply(Nil) 87 | 88 | def apply(config: Config): HttpClient = { 89 | // TODO 解析config to AsyncHttpClientConfig 90 | 91 | apply(Nil) 92 | } 93 | 94 | def apply(defaultHeaders: Iterable[(String, String)]): HttpClient = 95 | apply(new AsyncHttpClientConfig.Builder().build, defaultHeaders) 96 | 97 | def apply(config: AsyncHttpClientConfig, defaultHeaders: Iterable[(String, String)]): HttpClient = 98 | new HttpClient(config, defaultHeaders) 99 | 100 | def apply(allowRedirect: Boolean): HttpClient = { 101 | val builder = new AsyncHttpClientConfig.Builder() 102 | builder.setFollowRedirect(false) 103 | apply(builder.build(), Nil) 104 | } 105 | 106 | def find302Location(client: HttpClient, url: String, headers: Seq[(String, String)])(implicit ec: ExecutionContext) = { 107 | val promise = Promise[String]() 108 | 109 | def findLocation() = client.get(url).header(headers: _*).setFollowRedirects(false).execute().map(_.getHeader("Location")) 110 | 111 | findLocation().onComplete { 112 | case Success(location) => promise.success(location) 113 | case Failure(e) => 114 | findLocation().onComplete { 115 | case Success(location) => promise.success(location) 116 | case Failure(t) => promise.failure(t) 117 | } 118 | } 119 | 120 | promise.future 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/http/TJsonSupport.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.http 2 | 3 | import java.time.LocalDateTime 4 | 5 | import akka.http.scaladsl.marshalling._ 6 | import akka.http.scaladsl.model.{ContentType, ContentTypes, HttpCharsets, MediaTypes} 7 | import akka.http.scaladsl.unmarshalling._ 8 | import akka.stream.Materializer 9 | import crawler.util.time.TimeUtils 10 | import org.json4s._ 11 | import org.json4s.jackson.Serialization 12 | 13 | /** 14 | * Akka Http Json Supoort 15 | * Created by yangjing on 15-11-5. 
16 | */ 17 | trait TJsonSupport { 18 | def defaultFormats: Formats = DefaultFormats + new LocalDateTimeSerializer() 19 | 20 | implicit val serialization = Serialization 21 | implicit val formats: Formats 22 | 23 | } 24 | 25 | object TJsonSupport extends TJsonSupport { 26 | override implicit val formats: Formats = defaultFormats 27 | } 28 | 29 | class LocalDateTimeSerializer extends CustomSerializer[LocalDateTime](format => 30 | ( { 31 | case JString(s) => LocalDateTime.parse(s, TimeUtils.formatterDateTime) 32 | case JNull => null 33 | }, { 34 | case d: LocalDateTime => JString(TimeUtils.formatterDateTime.format(d)) 35 | }) 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/persist/CassandraPersists.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.persist 2 | 3 | import com.datastax.driver.core._ 4 | import com.google.common.util.concurrent.{FutureCallback, Futures} 5 | import com.typesafe.scalalogging.LazyLogging 6 | import crawler.SystemUtils 7 | 8 | import scala.collection.JavaConverters._ 9 | import scala.concurrent.{ExecutionContextExecutor, Future, Promise} 10 | import scala.util.Try 11 | 12 | /** 13 | * CassandraPersists 14 | * Created by yangjing on 15-11-6. 15 | */ 16 | abstract class CassandraPersists(nodes: Seq[String]) { 17 | val cluster = { 18 | Cluster.builder().addContactPoints(nodes: _*) 19 | } 20 | } 21 | 22 | object CassandraPersists extends LazyLogging { 23 | 24 | val cluster = { 25 | val nodes = SystemUtils.crawlerConfig.getStringList("cassandra.nodes").asScala 26 | logger.info("cassandra.nodes: " + nodes) 27 | Cluster.builder().addContactPoints(nodes: _*).build() 28 | } 29 | 30 | def userType(keyspace: String, userType: String): UserType = 31 | cluster.getMetadata.getKeyspace(keyspace).getUserType(userType) 32 | 33 | def using[R](keyspace: String)(func: Session => R): R = { 34 | val session = cluster.connect(keyspace) 35 | try { 36 | func(session) 37 | } finally { 38 | session.closeAsync() 39 | } 40 | } 41 | 42 | def execute[R](resultSetFuture: ResultSetFuture)(func: ResultSet => R)(implicit ec: ExecutionContextExecutor): Future[R] = { 43 | val promise = Promise[R]() 44 | Futures.addCallback( 45 | resultSetFuture, 46 | new FutureCallback[ResultSet] { 47 | override def onFailure(t: Throwable): Unit = { 48 | promise.failure(t) 49 | } 50 | 51 | override def onSuccess(rs: ResultSet): Unit = { 52 | promise.complete(Try(func(rs))) 53 | } 54 | }, 55 | ec) 56 | promise.future 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /util/src/main/scala/crawler/util/time/TimeUtils.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.time 2 | 3 | import java.time._ 4 | import java.time.format.DateTimeFormatter 5 | import java.util.Date 6 | 7 | /** 8 | * DateTimeUtils 9 | * Created by yangjing on 15-11-6. 
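 * A usage sketch of the length-based parsing in toLocalDateTime:
 * {{{
 *   TimeUtils.toLocalDateTime("2015-11-06 09:30")     // 16 chars, parsed with formatterDateMinus
 *   TimeUtils.toLocalDateTime("2015-11-06 09:30:00")  // 19 chars, parsed with formatterDateTime
 *   TimeUtils.toDate(TimeUtils.now())
 * }}}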
10 | */ 11 | object TimeUtils { 12 | val ZONE_OFFSET = ZoneOffset.ofHours(8) 13 | val formatterDate = DateTimeFormatter.ofPattern("yyyy-MM-dd") 14 | val formatterDateTime = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 15 | val formatterDateMinus = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm") 16 | val formatterMinus = DateTimeFormatter.ofPattern("HH:mm") 17 | 18 | def toLocalDateTime(instant: Instant): LocalDateTime = LocalDateTime.ofInstant(instant, ZONE_OFFSET) 19 | 20 | def toLocalDateTime(s: String): LocalDateTime = { 21 | s.length match { 22 | case 5 => 23 | LocalDateTime.parse(s, formatterMinus) 24 | case 16 => 25 | LocalDateTime.parse(s, formatterDateMinus) 26 | case 19 => 27 | LocalDateTime.parse(s, formatterDateTime) 28 | case _ => 29 | LocalDateTime.parse(s) 30 | } 31 | } 32 | 33 | def toLocalDateTime(date: Date): LocalDateTime = 34 | LocalDateTime.ofInstant(Instant.ofEpochMilli(date.getTime), ZONE_OFFSET) 35 | 36 | def toDate(ldt: LocalDateTime): Date = 37 | new Date(ldt.toInstant(ZONE_OFFSET).toEpochMilli) 38 | 39 | def now() = LocalDateTime.now() 40 | 41 | /** 42 | * @return 一天的开始: 43 | */ 44 | def nowBegin(): LocalDateTime = LocalDate.now().atTime(0, 0, 0, 0) 45 | 46 | /** 47 | * @return 一天的结尾: 48 | */ 49 | def nowEnd(): LocalDateTime = LocalTime.of(23, 59, 59, 999999999).atDate(LocalDate.now()) 50 | } 51 | -------------------------------------------------------------------------------- /util/src/test/scala/crawler/testsuite/ServiceSpec.scala: -------------------------------------------------------------------------------- 1 | package crawler.testsuite 2 | 3 | import crawler.SystemUtils 4 | import org.scalatest._ 5 | import org.scalatest.concurrent.ScalaFutures 6 | import org.scalatest.time.{Seconds, Span} 7 | 8 | /** 9 | * Created by yangjing on 15-11-4. 10 | */ 11 | abstract class ServiceSpec 12 | extends WordSpec 13 | with BeforeAndAfterAll 14 | with MustMatchers 15 | with OptionValues 16 | with EitherValues 17 | with ScalaFutures { 18 | 19 | implicit def system = SystemUtils.system 20 | implicit def materializer = SystemUtils.materializer 21 | implicit def dispatcher = system.dispatcher 22 | implicit val defaultPatience = PatienceConfig(Span(30, Seconds)) 23 | 24 | override protected def afterAll(): Unit = { 25 | SystemUtils.shutdown() 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /util/src/test/scala/crawler/util/persist/CassandraPersistsTest.scala: -------------------------------------------------------------------------------- 1 | package crawler.util.persist 2 | 3 | import java.util.Date 4 | 5 | import crawler.SystemUtils 6 | import org.scalatest.WordSpec 7 | 8 | /** 9 | * Created by yangjing on 15-11-6. 
10 | */ 11 | class CassandraPersistsTest extends WordSpec { 12 | 13 | "CassandraPersistsTest" should { 14 | 15 | "save" in { 16 | val keyspace = SystemUtils.crawlerConfig.getString("cassandra.keyspace") 17 | CassandraPersists.using(keyspace) { session => 18 | val newsItem = Map( 19 | "url" -> "http://hostname/news/1.html", 20 | "source" -> "网易新闻", 21 | "title" -> "标题", 22 | "time" -> new Date(), 23 | "abstract" -> "新闻摘要") 24 | val bstmt = session.prepare("INSERT INTO search_page(source, key, count, news) VALUES(?, ?, ?, ?);") 25 | 26 | val newsTypeUDT = session.getCluster.getMetadata.getKeyspace(keyspace).getUserType("news_type") 27 | val nit = newsTypeUDT.newValue() 28 | newsItem.foreach { 29 | case ("time", value: Date) => nit.setTimestamp("time", value) 30 | case (key, value: String) => nit.setString(key, value) 31 | } 32 | 33 | val result = session.execute(bstmt.bind( 34 | "网易新闻", 35 | "杭州誉存科技有限公司", 36 | Integer.valueOf(2), 37 | java.util.Arrays.asList(nit) 38 | )) 39 | println(result) 40 | 41 | } 42 | } 43 | 44 | } 45 | } 46 | --------------------------------------------------------------------------------
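As a closing illustration, a sketch of how the utilities above can be combined to fetch a page and extract its main content (the URL is a placeholder and the wiring is an assumption, not code from this repository):

```
import crawler.SystemUtils
import crawler.util.news.contextextractor.ContentExtractor

implicit val ec = SystemUtils.system.dispatcher

val url = "http://www.example.com/news/1.html"
// Fetch the page through the shared AsyncHttpClient, then run the content extractor over the HTML.
SystemUtils.httpClient.get(url).execute().map { resp =>
  val news = ContentExtractor.getNewsByHtml(resp.getResponseBody("UTF-8"), url)
  println(news.getContent)
}
```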