├── .scalafix.conf
├── .scalafmt.conf
├── .sonarcloud.properties
├── .travis.yml
├── README.md
├── bin
├── run-example.sh
├── stop.sh
└── test.sh
├── build.sbt
├── crawler-core
├── build.sbt
└── src
│ ├── main
│ ├── resources
│ │ ├── log4j2.xml
│ │ └── reference.conf
│ └── scala
│ │ └── io
│ │ └── github
│ │ └── wtog
│ │ └── crawler
│ │ ├── actor
│ │ └── ActorManager.scala
│ │ ├── downloader
│ │ ├── AsyncHttpClientDownloader.scala
│ │ ├── ChromeHeadlessDownloader.scala
│ │ ├── Downloader.scala
│ │ ├── DownloaderActorReceiver.scala
│ │ └── proxy
│ │ │ ├── ProxyCrawlerPipeline.scala
│ │ │ ├── ProxyProvider.scala
│ │ │ └── crawler
│ │ │ ├── A2UPageProcessor.scala
│ │ │ ├── Data5UPageProcessor.scala
│ │ │ ├── IP89Processor.scala
│ │ │ └── ProxyProcessorTrait.scala
│ │ ├── dto
│ │ ├── Event.scala
│ │ ├── Page.scala
│ │ └── RequestSetting.scala
│ │ ├── exceptions
│ │ └── NonNullArgumentsException.scala
│ │ ├── pipeline
│ │ ├── ConsolePipeline.scala
│ │ ├── Pipeline.scala
│ │ └── PipelineActorReceiver.scala
│ │ ├── processor
│ │ ├── PageProcessor.scala
│ │ └── PageProcessorActorReceiver.scala
│ │ ├── queue
│ │ ├── DuplicateRemovedQueue.scala
│ │ ├── RequestQueue.scala
│ │ ├── TargetRequestTaskQueue.scala
│ │ └── duplicate
│ │ │ ├── BitSetStrategy.scala
│ │ │ ├── DuplicateRemovedStrategy.scala
│ │ │ └── HashMapStrategy.scala
│ │ ├── rest
│ │ ├── NettyServer.scala
│ │ ├── Router.scala
│ │ └── Server.scala
│ │ ├── schedule
│ │ └── ScheduleJobs.scala
│ │ ├── selector
│ │ ├── HtmlParser.scala
│ │ └── Selector.scala
│ │ └── spider
│ │ ├── Spider.scala
│ │ └── SpiderPool.scala
│ └── test
│ ├── resources
│ ├── application-test.conf
│ └── log4j2-test.xml
│ └── scala
│ └── io
│ └── github
│ └── wtog
│ └── crawler
│ └── test
│ ├── BaseCoreTest.scala
│ ├── actor
│ ├── ActorTestBase.scala
│ └── PageProcessorActorTestkit.scala
│ ├── download
│ ├── AsyncHttpClientTest.scala
│ └── ChromeHeadlessDownloaderTest.scala
│ ├── processor
│ └── HtmlParserSpec.scala
│ ├── proxy
│ └── ProxyProviderTest.scala
│ ├── queue
│ └── DuplicateStrategyTest.scala
│ ├── schedule
│ └── ScheduleTest.scala
│ └── server
│ └── TestMockServer.scala
├── crawler-example
└── src
│ └── main
│ ├── resources
│ ├── log4j2.xml
│ └── reference.conf
│ └── scala
│ └── io
│ └── github
│ └── wtog
│ └── example
│ ├── ExampleTrait.scala
│ ├── Main.scala
│ └── impl
│ ├── BaiduPageProcessor.scala
│ ├── LianjiaErshouFangProcessor.scala
│ ├── LianjiaRentingProcessor.scala
│ ├── ZhihuAnswerPageProcessor.scala
│ └── flight
│ └── QunarPageProcessor.scala
├── crawler-pipeline
└── src
│ ├── main
│ └── scala
│ │ └── io
│ │ └── github
│ │ └── wtog
│ │ └── crawler
│ │ └── pipeline
│ │ ├── db
│ │ ├── DataSource.scala
│ │ ├── DataSourceInfo.scala
│ │ └── PostgreSQLPipeline.scala
│ │ └── file
│ │ ├── CsvFilePipeline.scala
│ │ └── FilePipeline.scala
│ └── test
│ └── scala
│ └── io
│ └── github
│ └── wtog
│ └── crawler
│ └── pipeline
│ └── test
│ ├── BasePipelineTest.scala
│ └── DataSourceTest.scala
├── docker
├── Dockerfile
└── build.sh
├── project
├── .gnupg
│ ├── pubring.gpg
│ └── secring.gpg
├── Dependencies.scala
├── Publish.scala
├── build.properties
└── plugins.sbt
├── push.sh
├── utils
└── src
│ ├── main
│ └── scala
│ │ └── io
│ │ └── github
│ │ └── wtog
│ │ └── utils
│ │ ├── ConfigUtils.scala
│ │ ├── JsonUtils.scala
│ │ ├── ReflectionUtils.scala
│ │ ├── RetryUtils.scala
│ │ ├── StringUtils.scala
│ │ └── logger
│ │ └── Logging.scala
│ └── test
│ └── scala
│ └── io
│ └── github
│ └── wtog
│ └── utils
│ └── test
│ ├── BaseTest.scala
│ ├── JsonUtilsTest.scala
│ ├── RetryUtilsTest.scala
│ ├── jmh
│ └── StringUtilsBenchmark.scala
│ └── reflection
│ └── ReflectionUtilsTest.scala
└── version.sbt
/.scalafix.conf:
--------------------------------------------------------------------------------
1 | rules = [
2 | RemoveUnused,
3 | ExplicitResultTypes,
4 | LeakingImplicitClassVal,
5 | NoValInForComprehension,
6 | ProcedureSyntax
7 | ]
--------------------------------------------------------------------------------
/.scalafmt.conf:
--------------------------------------------------------------------------------
1 | version=2.0.0-RC5
2 | align = true
3 | danglingParentheses = true
4 | maxColumn = 400
5 | rewrite.rules = [AvoidInfix, RedundantParens, SortModifiers, PreferCurlyFors, RedundantBraces, SortImports]
6 | spaces.inImportCurlyBraces = true
7 | continuationIndent.defnSite = 4
8 | verticalMultiline.atDefnSite = true
9 | verticalMultiline.arityThreshold = 5
10 | project.excludeFilters = [
11 | io.github.wtog.test.jmh
12 | ]
13 |
--------------------------------------------------------------------------------
/.sonarcloud.properties:
--------------------------------------------------------------------------------
1 | # Path to sources
2 | sonar.projectKey=io.github.wtog:crawler
3 | sonar.organization=wtog
4 | sonar.sources=crawler-core/src/main,crawler-pipeline/src/main,utils/src/main
5 | sonar.exclusions=**/resources/**
6 | sonar.tests=crawler-core/src/test,crawler-pipeline/src/test,utils/src/test
7 | sonar.test.inclusions=**/*Test*
8 | sonar.sourceEncoding=UTF-8
9 | sonar.scala.coverage.reportPaths=crawler-core/target/scala-2.12/scoverage-report/scoverage.xml,crawler-pipeline/target/scala-2.12/scoverage-report/scoverage.xml,utils/target/scala-2.12/scoverage-report/scoverage.xml
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # use Docker-based container (instead of OpenVZ)
2 | sudo: false
3 |
4 | branches:
5 | only:
6 | - dev
7 |
8 | cache:
9 | directories:
10 | - $HOME/.m2/repository
11 | - $HOME/.sbt
12 | - $HOME/.ivy2
13 |
14 | language: scala
15 |
16 | addons:
17 | chrome: stable
18 |
19 | script:
20 | - bash bin/test.sh
21 |
22 | jdk:
23 | - openjdk8
24 |
25 | after_success:
26 | - bash <(curl -s https://codecov.io/bash)
27 | - sbt ';set credentials += Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", System.getenv("NEXUS_USER"), System.getenv("NEXUS_PASS")); set pgpPassphrase := Some(System.getenv("PGP_PASSPHRASE").toArray[Char]); +clean; +publish';
28 | - git config --global user.email wtgeeker@163.com
29 | - git config --global user.name wtog
30 | - current=`git log | head -n 1 | awk '{print $2}'`
31 | - git fetch origin master:master; git checkout master; git merge $current -s recursive -X theirs --no-commit;
32 | - git commit -m `git log $current --oneline | head -n 1 | awk '{print $2}'`
33 | - git push --force --quiet "https://${GITHUB_TOKEN}@github.com/wtog/web-crawler.git" master:master
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # web-crawler
2 |
3 | [![Build Status](https://travis-ci.com/wtog/web-crawler.svg?branch=dev)](https://travis-ci.com/wtog/web-crawler) [![codecov](https://codecov.io/gh/wtog/web-crawler/branch/dev/graph/badge.svg)](https://codecov.io/gh/wtog/web-crawler) [![Sonarcloud Status](https://sonarcloud.io/api/project_badges/measure?project=io.github.wtog%3Acrawler&metric=alert_status)](https://sonarcloud.io/dashboard?id=io.github.wtog:crawler)
4 |
5 | ## 项目介绍
6 |
7 | 参考 webmagic [http://webmagic.io](http://webmagic.io) 撸的 [scala + akka] 爬虫
8 |
9 | ## 使用说明
10 |
11 | - 爬虫例子 [爬知乎R大 JVM 回答]
12 |
13 | ```scala
14 | package io.github.wtog.example
15 |
16 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
17 | import io.github.wtog.crawler.processor.PageProcessor
17 |
18 | import scala.concurrent.duration._
19 |
20 | // 爬虫解析逻辑
21 | case class ZhihuAnswerPageProcessor() extends PageProcessor {
22 |
23 | val link = "https://www.zhihu.com/api/v4/members/rednaxelafx/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=10&sort_by=created"
24 |
25 | override def doProcess(page: Page): Unit = {
26 | val result = page.json[Map[String, Any]]()
27 |
28 | result.get("data").foreach { answers =>
29 | answers.asInstanceOf[List[Map[String, Any]]].foreach { answer =>
30 | val question = answer("question").asInstanceOf[Map[String, String]]("title")
31 | val answerContent = answer("content")
32 | page.addPageResultItem(Map("question" -> question, "answer" -> answerContent))
33 | }
34 | }
35 |
36 | val nextPage = result("paging").asInstanceOf[Map[String, String]].get("next")
37 |
38 | nextPage.foreach { url =>
39 | page.addTargetRequest(url.replaceAll("https://www.zhihu.com", "$0/api/v4"))
40 | }
41 | }
42 |
43 | override def requestSetting: RequestSetting =
44 | RequestSetting(
45 | domain = "www.zhihu.com",
46 | sleepTime = 3 seconds
47 | )
48 |
49 | override def targetUrls: List[String] = List(link)
50 |
51 | override def cronExpression: Option[String] = None
52 | }
53 |
54 | // 启动爬虫
55 | Spider(pageProcessor = ZhihuAnswerPageProcessor()).start()
56 | ```
57 |
58 | [更多例子](https://github.com/wtog/web-crawler/tree/master/crawler-example/src/main/scala/io.github.wtog.example)
59 |
60 | - sbt
61 |
62 | 1. sbt 'project example; assembly' # 打 jar 包
63 | 2. java -jar crawler-example/target/scala-2.12/web-crawler-assembly.jar
64 |
65 | - docker
66 |
67 | 1. build
68 |
69 | ```docker
70 | sh docker/build.sh
71 | ```
72 |
73 | 2. start container
74 |
75 | ```docker
76 | docker run -it --name web-crawler wtog/web-crawler:latest
77 | ```
78 |
--------------------------------------------------------------------------------
/bin/run-example.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Build the example assembly jar and launch it in the background.
# All script arguments are forwarded to the crawler jar.

set -e

# Allow invocation from inside bin/ as well as from the repo root.
base=$(basename "$(pwd)")

if [ "$base" == "bin" ]; then
  cd ../
fi

sbt 'project example; assembly'

jar=crawler-example/target/scala-2.12/web-crawler-assembly.jar

echo "java -jar $jar $* > /tmp/crawler.log 2>&1 &"
echo "crawler starting..."
# "$@" preserves each argument as one word; unquoted $* would re-split
# arguments that contain spaces.
nohup java -jar "$jar" "$@" > /tmp/crawler.log 2>&1 &
16 |
--------------------------------------------------------------------------------
/bin/stop.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Kill the running crawler example JVM (started by run-example.sh).
# The [w] bracket trick stops grep from matching its own process entry,
# and xargs -r skips kill entirely when no matching process exists.
ps -ef | grep '[w]eb-crawler-assembly' | awk '{print $2}' | xargs -r kill -9
--------------------------------------------------------------------------------
/bin/test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# CI test entrypoint: installs a pinned chromedriver (required by the
# ChromeHeadless downloader tests), then runs the suite with coverage.

# Abort on the first failure; previously a failed download or unzip was
# ignored and the suite ran against a missing/broken driver.
set -e

wget -O chromedriver_linux64.zip -q https://npm.taobao.org/mirrors/chromedriver/79.0.3945.36/chromedriver_linux64.zip

unzip -uxo chromedriver_linux64.zip -d /opt

chmod +x /opt/chromedriver

rm chromedriver_linux64.zip

sbt ';clean ;coverage ;test ;coverageReport'
12 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
import Dependencies.crossVersion
import sbt.Keys.organization
import sbtassembly.{Assembly, MergeStrategy, PathList}

// Target JDK 8 bytecode for all Java sources.
javacOptions ++= Seq("-source", "1.8", "-target", "1.8")

// Scalafix rules need the semanticdb compiler plugin; its output is kept in
// the system temp dir so it never pollutes the project tree.
lazy val scalafixSettings = Seq(
  addCompilerPlugin(scalafixSemanticdb),
  scalacOptions ++= List(
    s"-P:semanticdb:targetroot:${System.getProperty("java.io.tmpdir")}/semanticdb",
    "-Yrangepos",
    "-language:postfixOps"))

// Run JMH benchmarks straight from the Test source tree instead of a
// dedicated Jmh source set.
lazy val jmhSettings = Seq(
  sourceDirectory in Jmh := (sourceDirectory in Test).value,
  classDirectory in Jmh := (classDirectory in Test).value,
  dependencyClasspath in Jmh := (dependencyClasspath in Test).value
)

// Settings shared by every module: cross-build against the configured Scala
// versions and fork JVMs for run/test.
lazy val commonSettings = Seq(
  scalaVersion := crossVersion.head,
  fork := true,
  crossScalaVersions := crossVersion,
  parallelExecution in Test := true
)

// Expose a module's test classes to the tests of its dependents.
lazy val dependenciesScope = "compile->compile;test->test"

// Shared helpers (config/json/reflection/retry utilities).
lazy val utils = (project in file("utils"))
  .settings(commonSettings: _*)
  .settings(scalafixSettings: _*)
  .settings(jmhSettings: _*)
  .settings(Seq(name := "utils", organization := "io.github.wtog.utils"))
  .settings(libraryDependencies ++= Dependencies.utils.dependencies)
  .enablePlugins(JmhPlugin)
  .disablePlugins(AssemblyPlugin)

// Crawler engine: actors, downloaders, processors, queues, spider.
lazy val core = (project in file("crawler-core"))
  .settings(commonSettings: _*)
  .settings(scalafixSettings: _*)
  .settings(jmhSettings: _*)
  .settings(Seq(name := "crawler-core", organization := "io.github.wtog.crawler"))
  .settings(libraryDependencies ++= Dependencies.core.dependencies)
  .dependsOn(utils % dependenciesScope)
  .disablePlugins(AssemblyPlugin)

// Persistence pipelines (PostgreSQL, CSV/file output).
lazy val pipeline = (project in file("crawler-pipeline"))
  .settings(commonSettings: _*)
  .settings(scalafixSettings: _*)
  .settings(jmhSettings: _*)
  .settings(Seq(name := "crawler-pipeline", organization := "io.github.wtog.crawler.pipeline"))
  .settings(libraryDependencies ++= Dependencies.pipeline.dependencies)
  .dependsOn(core, utils % dependenciesScope)
  .disablePlugins(AssemblyPlugin)

// Runnable examples, packaged as a fat jar via sbt-assembly.
lazy val example = (project in file("crawler-example"))
  .settings(commonSettings: _*)
  .settings(scalafixSettings: _*)
  .settings(Seq(name := "crawler-example", organization := "io.github.wtog.example"))
  .settings(libraryDependencies ++= Dependencies.example.dependencies)
  .settings(
    Seq(
      assemblyJarName in assembly := s"web-crawler-assembly.jar",
      mainClass in Compile := Some("io.github.wtog.example.Main"),
      // Skip tests when building the assembly jar.
      test in assembly := {},
      // Fat-jar merge policy: concat config files, rename readmes/licenses,
      // drop signatures and build-tool metadata, de-duplicate service files.
      assemblyMergeStrategy in assembly := {
        case x if Assembly.isConfigFile(x) =>
          MergeStrategy.concat
        case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) =>
          MergeStrategy.rename
        case PathList(ps@_*) if Assembly.isSystemJunkFile(ps.last) =>
          MergeStrategy.discard
        case PathList("META-INF", xs@_*) =>
          xs.map(_.toLowerCase) match {
            case (x :: Nil) if Seq("manifest.mf", "index.list", "dependencies") contains x =>
              MergeStrategy.discard
            case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") || ps.last.endsWith(".rsa") =>
              MergeStrategy.discard
            case "maven" :: xs =>
              MergeStrategy.discard
            case "plexus" :: xs =>
              MergeStrategy.discard
            case "services" :: xs =>
              MergeStrategy.filterDistinctLines
            case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) | ("spring.tooling" :: Nil) =>
              MergeStrategy.filterDistinctLines
            case _ => MergeStrategy.first
          }
        case _ => MergeStrategy.first
      }
    )
  )
  .dependsOn(core, pipeline)
  .enablePlugins(DisablePublish, AssemblyPlugin)
  .disablePlugins(ScoverageSbtPlugin)

// Aggregate build: compiles/tests all modules; publishes nothing itself.
lazy val root = (project in file("."))
  .settings(commonSettings: _*)
  .settings(Seq(name := "web-crawler"))
  .aggregate(utils, core, pipeline, example)
  .enablePlugins(JmhPlugin, DisablePublish)
  .disablePlugins(AssemblyPlugin)

// ScalaTest: run suites in parallel across available processors.
testOptions in Test += Tests.Argument(s"-P${java.lang.Runtime.getRuntime.availableProcessors()}")

javaOptions in test := Seq(
  "-Dlog4j.configurationFile=log4j2-test.xml",
  "-Xms512m", "-Xmx512m"
)

javaOptions in run := Seq(
  "-Dlog4j.configurationFile=log4j2.xml",
  "-Xms512m", "-Xmx512m"
)
--------------------------------------------------------------------------------
/crawler-core/build.sbt:
--------------------------------------------------------------------------------
// Drop the bundled log4j2.xml from the published crawler-core jar so that
// downstream projects provide their own logging configuration.
mappings in (Compile, packageBin) ~= { entries =>
  entries.filterNot(_._1.getName.contentEquals("log4j2.xml"))
}
5 |
--------------------------------------------------------------------------------
/crawler-core/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/crawler-core/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | crawler {
2 |
3 | downloader-dispatcher {
4 | type = Dispatcher
5 | executor = "thread-pool-executor"
6 |
7 | thread-pool-executor {
8 | core-pool-size-min = 5
9 | core-pool-size-factor = 2
10 | core-pool-size-max = 10
11 | }
12 | }
13 |
14 | processor-dispatcher {
15 | type = Dispatcher
16 | executor = "fork-join-executor"
17 | fork-join-executor {
18 | parallelism-min = 2
19 | parallelism-factor = 4.0
20 | parallelism-max = 10
21 | }
22 | throughput = 50
23 | }
24 |
25 | pipeline-dispatcher {
26 | type = Dispatcher
27 | executor = "thread-pool-executor"
28 |
29 | thread-pool-executor {
30 | core-pool-size-min = 4
31 | core-pool-size-factor = 2
32 | core-pool-size-max = 8
33 | }
34 | }
35 |
36 | download.retry.exception = ["java.util.concurrent.TimeoutException"]
37 |
38 | server.port = 19000
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/actor/ActorManager.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.actor
2 |
3 | import akka.actor.{ ActorSystem, Props }
4 |
5 | import scala.concurrent.ExecutionContext
6 | import akka.actor.{ ActorRef, ActorSelection }
7 | import akka.dispatch.MessageDispatcher
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 5/16/18 11:56 PM
12 | * @version : 1.0.0
13 | */
object ActorManager {

  // Single shared actor system for the whole crawler.
  lazy val system: ActorSystem = ActorSystem("crawler")

  /** Creates a top-level actor bound to the given crawler dispatcher id. */
  def getNewSystemActor(dispatcher: String, actorName: String, props: Props): ActorRef = {
    val dispatcherId = s"crawler.${dispatcher}"
    system.actorOf(props.withDispatcher(dispatcherId), actorName)
  }

  /** Looks up an already-created actor by its path. */
  def getExistedActor(path: String): ActorSelection = system.actorSelection(path)
}
21 |
object ExecutionContexts {
  // Dedicated execution contexts for the three crawler stages; each falls
  // back to the default global dispatcher when its id is not configured.
  implicit lazy val downloadDispatcher: ExecutionContext  = dispatcher("crawler.downloader-dispatcher")
  implicit lazy val processorDispatcher: ExecutionContext = dispatcher("crawler.processor-dispatcher")
  implicit lazy val pipelineDispatcher: ExecutionContext  = dispatcher("crawler.pipeline-dispatcher")

  /** Resolves a configured dispatcher by id, or the default global one. */
  def dispatcher(id: String): MessageDispatcher = {
    val all = ActorManager.system.dispatchers
    if (all.hasDispatcher(id)) all.lookup(id) else all.defaultGlobalDispatcher
  }
}
36 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/AsyncHttpClientDownloader.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader
2 |
3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO
4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
5 | import io.github.wtog.crawler.exceptions.IllegalArgumentsException
6 | import io.netty.handler.codec.http.{ DefaultHttpHeaders, HttpHeaderNames }
7 | import org.asynchttpclient.Dsl.asyncHttpClient
8 | import org.asynchttpclient._
9 | import org.asynchttpclient.proxy.ProxyServer
10 |
11 | import scala.concurrent.{ Future, Promise, TimeoutException }
12 |
/**
 * Downloader backed by async-http-client; one client instance is pooled per
 * domain via getDownloaderClient.
 *
 * @author : tong.wang
 * @since : 5/16/18 11:13 PM
 * @version : 1.0.0
 */
object AsyncHttpClientDownloader extends Downloader[AsyncHttpClient] {

  /**
   * Assembles the request: method/body, headers and optional proxy.
   *
   * BUGFIX: the ProxyServer used to be built inside `foreach` and then
   * discarded, so proxied requests never actually used the proxy. It is now
   * attached to the builder with setProxyServer.
   */
  private[this] def buildRequest(driver: AsyncHttpClient, request: RequestSetting, proxyOpt: Option[ProxyDTO] = None): BoundRequestBuilder = {
    val builder = builderMethod(driver, request)

    proxyOpt.foreach { proxy =>
      builder.setProxyServer(buildProxy(proxy)(p => new ProxyServer.Builder(p.host, p.port).build()))
    }

    val httpHeaders = new DefaultHttpHeaders
    request.headers.foreach { case (k, v) => httpHeaders.add(k, v) }
    httpHeaders.add(HttpHeaderNames.USER_AGENT, request.userAgent)
    httpHeaders.add(HttpHeaderNames.ACCEPT_CHARSET, request.charset)
    builder.setHeaders(httpHeaders)
  }

  /**
   * Prepares a GET or POST request for the configured url.
   *
   * @throws IllegalArgumentsException for any other http method
   */
  def builderMethod(driver: AsyncHttpClient, requestSetting: RequestSetting): BoundRequestBuilder = {
    val url = requestSetting.url.get
    requestSetting.method.toUpperCase match {
      case "GET" =>
        driver.prepareGet(url)
      case "POST" =>
        val post = driver.preparePost(url)
        requestSetting.requestBody.foreach(post.setBody)
        post
      case other =>
        logger.warn(s"unknown http method ${other}")
        throw IllegalArgumentsException(other)
    }
  }

  // Executes the request asynchronously; the client's in-flight counter is
  // decremented on both completion and failure.
  override def doDownload(request: RequestSetting): Future[Page] = {
    val response = executeRequest(request) { proxyOpt =>
      val promise = Promise[Response]

      val client = getOrCreateClient(request)
      buildRequest(client.driver, request, proxyOpt).execute(new AsyncCompletionHandler[Response]() {
        override def onCompleted(response: Response): Response = {
          // trySuccess/tryFailure: defensive against double completion.
          promise.trySuccess(response)
          client.decrement()
          response
        }

        override def onThrowable(t: Throwable): Unit = {
          if (t.isInstanceOf[TimeoutException]) {
            logger.error("download error ", t)
          }
          client.decrement()
          promise.tryFailure(t)
        }
      })

      promise.future
    }

    // Only HTTP 200 counts as a successful download.
    response.map { r =>
      pageResult(request, Some(r.getResponseBodyAsBytes), r.getStatusCode == 200, Some(s"return ${r.getStatusCode}"))
    }(io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher)
  }

  def closeClient(): Unit = closeDownloaderClient(_.close())

  // Request and connect timeouts both come from the request's timeOut setting.
  override protected def getOrCreateClient(requestSetting: RequestSetting): DownloaderClient[AsyncHttpClient] =
    getDownloaderClient(requestSetting.domain) {
      asyncHttpClient(
        new DefaultAsyncHttpClientConfig.Builder()
          .setRequestTimeout(requestSetting.timeOut.toMillis.toInt)
          .setConnectTimeout(requestSetting.timeOut.toMillis.toInt)
          .setFollowRedirect(true)
          .setConnectionPoolCleanerPeriod(5)
          .build()
      )
    }
}
94 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/ChromeHeadlessDownloader.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader
2 |
3 | import java.io.File
4 | import java.util
5 | import java.util.logging.Level
6 |
7 | import io.github.wtog.crawler.downloader.ChromeHeadlessConfig._
8 | import io.github.wtog.crawler.dto.{ Page, RequestSetting, XhrResponse }
9 | import io.github.wtog.utils.{ ConfigUtils, JsonUtils }
10 | import org.openqa.selenium.chrome.{ ChromeDriver, ChromeOptions }
11 | import org.openqa.selenium.logging.{ LogType, LoggingPreferences }
12 | import org.openqa.selenium.remote.UnreachableBrowserException
13 |
14 | import scala.collection.JavaConverters._
15 | import scala.collection.mutable.ListBuffer
16 | import scala.concurrent.Future
17 | import scala.util.control.NonFatal
18 |
19 | /**
20 | * @author : tong.wang
21 | * @since : 2019-07-18 22:28
22 | * @version : 1.0.0
23 | */
object ChromeHeadlessDownloader extends Downloader[ChromeDriver] {

  // Downloads a page with a headless Chrome browser and, in the same pass,
  // captures the bodies of the XHR calls listed in requestSetting.xhrRequests
  // by scanning Chrome's performance log.
  override protected def doDownload(requestSetting: RequestSetting): Future[Page] = {

    val client = getOrCreateClient(requestSetting)

    Future {
      try {
        val driver = client.driver

        driver.get(requestSetting.url.get)
        val performanceLog = driver.manage().logs().get(LogType.PERFORMANCE)

        val iterator = performanceLog.iterator()
        // Flipped to false once every requested XHR response is captured,
        // stopping the scan early.
        var returnAllXhrRequest = true
        val xhrResponseBuffer = new ListBuffer[XhrResponse]
        while (iterator.hasNext && returnAllXhrRequest) {
          val xhrResponse = iterator.next()
          // Each performance-log entry is a JSON envelope of the form
          // {"message": {..., "params": {...}}}.
          val message = JsonUtils.parseFrom[Map[String, Any]](xhrResponse.getMessage).get("message").get.asInstanceOf[Map[String, Any]]
          message.get("params").foreach {
            case params: Map[String, Any] @unchecked =>
              val headers = params.getOrElse("headers", Map.empty[String, Any]).asInstanceOf[Map[String, Any]]
              getXhrRequestUriByHeaders(headers).foreach {
                case xhrResponseUri: String if requestSetting.xhrRequests.contains(xhrResponseUri) =>
                  xhrResponseBuffer.append(XhrResponse(xhrResponseUri, getXhrResponse(driver, params.get("requestId").get.asInstanceOf[String])))
                case _ if (xhrResponseBuffer.size == requestSetting.xhrRequests.size) =>
                  // NOTE(review): this early exit only fires on an XHR entry
                  // that is not itself wanted, so a few extra log entries may
                  // be scanned after the last capture — confirm if intended.
                  returnAllXhrRequest = false
                case _ =>
              }
          }
        }

        Page(requestSetting = requestSetting, bytes = Some(driver.getPageSource.getBytes()), xhrResponses = xhrResponseBuffer.toSeq)
      } catch {
        case NonFatal(exception) =>
          Page.failed(requestSetting, exception)
      } finally {
        // Release the pooled driver whether the download succeeded or not.
        client.decrement()
      }
    }(io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher)
  }

  // Fetches a captured request's response body via the Chrome DevTools
  // protocol command Network.getResponseBody.
  private def getXhrResponse(driver: ChromeDriver, requestId: String): Map[String, Any] = {
    val cdpMap = new util.HashMap[String, Object]()
    cdpMap.put("requestId", requestId)
    driver.executeCdpCommand("Network.getResponseBody", cdpMap).asScala.toMap
  }

  // Rebuilds "scheme://authority/path" (query string stripped) from HTTP/2
  // pseudo-headers, but only for requests marked x-requested-with: XMLHttpRequest;
  // returns None for every other log entry.
  private def getXhrRequestUriByHeaders(headers: Map[String, Any]): Option[String] =
    headers.get("x-requested-with") match {
      case Some("XMLHttpRequest") =>
        val schema: String = headers.getOrElse(":scheme", "").asInstanceOf[String]
        val domain: String = headers.getOrElse(":authority", "").asInstanceOf[String]
        val uri: String = headers.get(":path").fold[String]("") {
          case p: String =>
            val queryIndex = p.indexOf('?')
            p.substring(0, if (queryIndex > 0) queryIndex else p.length)
        }

        Some(s"$schema://$domain$uri")
      case _ =>
        None
    }

  // Builds ChromeOptions for a stealth-ish headless session: hides the
  // enable-automation switch, enables full logging (the performance log is
  // required by the XHR capture above) and applies the caller's user agent.
  private[this] def buildOptions(requestSetting: RequestSetting): ChromeOptions = {
    val options = new ChromeOptions()
    options.setExperimentalOption("excludeSwitches", Array[String]("enable-automation"))

    val perf = new util.HashMap[String, Any]()
    perf.put("enableNetwork", true)
    options.setExperimentalOption("prefs", perf)

    val logPrefs = new LoggingPreferences
    logPrefs.enable(LogType.PERFORMANCE, Level.ALL)
    logPrefs.enable(LogType.SERVER, Level.ALL)
    logPrefs.enable(LogType.BROWSER, Level.ALL)
    logPrefs.enable(LogType.DRIVER, Level.ALL)
    logPrefs.enable(LogType.CLIENT, Level.ALL)
    options.setCapability("goog:loggingPrefs", logPrefs)
    options.addArguments(
      "--no-sandbox",
      "--headless",
      "--start-maximized",
      "--disable-dev-shm-usage",
      "--disable-plugins-discovery",
      "--enable-logging",
      "--v=1",
      "--disable-gpu",
      "--ignore-certificate-errors",
      s"--user-agent=${requestSetting.userAgent}"
    )

    options
  }

  // Creates (or reuses) a ChromeDriver per domain. The injected script masks
  // common webdriver fingerprints (navigator.webdriver/plugins/languages)
  // on every new document before page scripts run.
  override def getOrCreateClient(requestSetting: RequestSetting): DownloaderClient[ChromeDriver] = getDownloaderClient(requestSetting.domain) {
    System.setProperty("webdriver.chrome.driver", chromeDriverPath)
    System.setProperty("webdriver.chrome.logfile", chromeDriverLog)

    val driver = new ChromeDriver(buildOptions(requestSetting))
    val map = new util.HashMap[String, Object]()
    map.put(
      "source",
      """
        |Object.defineProperty(navigator, 'webdriver', {
        |  get: () => false
        |});
        |Object.defineProperty(navigator, 'plugins', {
        |  get: () => [1, 2, 3, 4, 5]
        |});
        |Object.defineProperty(navigator, 'languages', {
        |  get: () => ["zh-CN","zh","en-US","en"]
        |});
        |""".stripMargin
    )
    driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", map)

    driver
  }

  // A browser that already died raises UnreachableBrowserException on quit();
  // that case is treated as already-closed and swallowed.
  override def closeClient(): Unit = closeDownloaderClient { driver =>
    try (driver.quit())
    catch {
      case _: UnreachableBrowserException =>
        ()
      case e: Throwable =>
        throw e
    }
  }

}
155 |
object ChromeHeadlessConfig {
  // Driver binary and log locations, overridable via configuration.
  lazy val chromeDriverPath: String = ConfigUtils.getStringOpt("crawler.chrome.driver").getOrElse("/opt/chromedriver")
  lazy val chromeDriverLog: String  = ConfigUtils.getStringOpt("crawler.chrome.log").getOrElse("/tmp/chromedriver.log")

  /** True when no executable chromedriver binary exists at the configured path. */
  def chromeDriverNotExecutable: Boolean = {
    val driverFile = new File(chromeDriverPath)
    !(driverFile.exists() && driverFile.canExecute)
  }
}
166 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/Downloader.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader
2 |
3 | import java.util.concurrent.atomic.AtomicInteger
4 | import java.util.concurrent.{ ConcurrentHashMap, Executors, ScheduledFuture, TimeUnit }
5 |
6 | import io.github.wtog.crawler.downloader.proxy.{ ProxyDTO, ProxyProvider }
7 | import scala.collection.JavaConverters._
8 | import io.github.wtog.crawler.dto
9 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
10 | import io.github.wtog.crawler.spider.Spider
11 | import io.github.wtog.utils.RetryUtils._
12 | import io.github.wtog.utils.{ ConfigUtils, RetryInfo }
13 | import org.slf4j.{ Logger, LoggerFactory }
14 |
15 | import scala.concurrent.Future
16 | import scala.util.control.NonFatal
17 | import scala.util.{ Failure, Success, Try }
18 |
19 | /**
20 | * @author : tong.wang
21 | * @since : 5/16/18 9:56 PM
22 | * @version : 1.0.0
23 | */
24 | trait Downloader[Driver] {
25 | protected lazy val logger: Logger = LoggerFactory.getLogger(this.getClass)
26 |
27 | private val downloadRetryException: Seq[String] = ConfigUtils.getSeq[String]("crawler.download.retry.exception")
28 |
29 | protected val clientsPool = new ConcurrentHashMap[String, DownloaderClient[Driver]]()
30 |
31 | protected def doDownload(requestSetting: RequestSetting): Future[Page]
32 |
33 | protected def buildProxy[P](proxyDto: ProxyDTO)(buildProxy: ProxyDTO => P): P = buildProxy(proxyDto)
34 |
35 | protected def closeClient(): Unit
36 |
37 | protected def getOrCreateClient(requestSetting: RequestSetting): DownloaderClient[Driver]
38 |
39 | def getClient(domain: String): Option[DownloaderClient[Driver]] = Option(clientsPool.get(domain))
40 |
41 | /**
42 | * common schedule job to close download client
43 | */
44 | val scheduleClose: ScheduledFuture[_] = Executors
45 | .newSingleThreadScheduledExecutor()
46 | .scheduleAtFixedRate(new Runnable {
47 | override def run(): Unit =
48 | try (closeClient())
49 | catch { case e: Throwable => logger.error("", e) }
50 | }, 3, 3, TimeUnit.MINUTES)
51 |
52 | def download(spider: Spider, request: RequestSetting): Future[Page] = {
53 | import io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher
54 | futureRetryWhen(doDownload(requestSetting = request), retryTime = request.retryTime, RetryInfo(duration = request.sleepTime, downloadRetryException))
55 | .map { page =>
56 | spider.CrawlMetric.record(page.isDownloadSuccess, page.url)
57 | page
58 | }
59 | .recover {
60 | case NonFatal(e) =>
61 | spider.CrawlMetric.record(success = false, request.url.get)
62 | throw e
63 | }
64 | }
65 |
66 | protected def executeRequest[HttpResponse](requestSetting: RequestSetting)(execute: Option[ProxyDTO] => Future[HttpResponse]): Future[HttpResponse] =
67 | if (requestSetting.useProxy) {
68 | execute(ProxyProvider.getProxy)
69 | } else {
70 | execute(None)
71 | }
72 |
  // builds the Page result for a finished download attempt
  // NOTE(review): `msg` is accepted but never propagated into the Page — confirm whether
  // it was meant to carry an error description (Page has no field for it)
  protected def pageResult(requestSetting: RequestSetting, results: Option[Array[Byte]] = None, downloadSuccess: Boolean = true, msg: Option[String] = None): Page =
    dto.Page(downloadSuccess, bytes = results, requestSetting = requestSetting)
75 |
76 | protected def getDownloaderClient(domain: String)(driver: => Driver): DownloaderClient[Driver] = {
77 | val clientCache = Option(clientsPool.get(domain))
78 |
79 | val downloaderClient = clientCache.getOrElse {
80 | val downloaderClient = DownloaderClient(domain = domain, driver = driver)
81 | clientsPool.put(domain, downloaderClient)
82 | downloaderClient
83 | }
84 |
85 | downloaderClient.increment()
86 | downloaderClient
87 | }
88 |
  /**
   * Closes and evicts every pooled client that currently has no consumers.
   *
   * @param close driver-specific shutdown callback supplied by the concrete downloader
   */
  def closeDownloaderClient(close: Driver => Unit): Unit =
    for (e <- clientsPool.entrySet().asScala) {
      val (domain, downloaderClient) = (e.getKey, e.getValue)
      if (downloaderClient.idle()) {
        Try(close(downloaderClient.driver)) match {
          case Success(_) =>
            logger.info(s"${domain} downloader driver[${downloaderClient.driver.getClass.getSimpleName}] has been closed.")
          case Failure(exception) =>
            logger.error(s"${domain} downloader driver failed to close. ${exception.getLocalizedMessage}")
        }
        // NOTE(review): a client that gains a consumer between idle() and this remove()
        // is still evicted with a closed driver — confirm in-flight requests tolerate that
        clientsPool.remove(domain)
      }
    }
102 |
  // best-effort cleanup of any live download clients when the JVM exits
  sys.addShutdownHook {
    try (closeClient())
    catch { case e: Throwable => logger.error("", e) }
  }
107 | }
108 |
/**
 * Reference-counted wrapper around a pooled download driver.
 *
 * @param domain    domain this driver serves
 * @param driver    the underlying client handle
 * @param consumers number of requests currently using the driver
 */
case class DownloaderClient[C](domain: String, driver: C, consumers: AtomicInteger = new AtomicInteger(0)) {

  // a client with zero consumers may be closed safely
  def idle(): Boolean = consumers.get() == 0

  // register one more request using this driver
  def increment(): Int = consumers.incrementAndGet()

  // release one request's hold on this driver
  def decrement(): Int = consumers.decrementAndGet()
}
114 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/DownloaderActorReceiver.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader
2 |
import akka.actor.{ Actor, ActorRef, Props }
import io.github.wtog.crawler.dto.{ DownloadEvent, ProcessorEvent }
import io.github.wtog.crawler.processor.PageProcessorActorReceiver
import org.slf4j.{ Logger, LoggerFactory }

import scala.util.{ Failure, Success }
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 5/16/18 11:54 PM
12 | * @version : 1.0.0
13 | */
/**
 * Actor that executes download requests and forwards successful pages to the
 * page-processor child actor.
 */
class DownloaderActorReceiver extends Actor {

  private val logger: Logger = LoggerFactory.getLogger(classOf[DownloaderActorReceiver])

  // child actor that parses downloaded pages; lazy so it is created on first use
  lazy val processorActor: ActorRef = context.actorOf(
    Props[PageProcessorActorReceiver].withDispatcher("crawler.processor-dispatcher"),
    "page-processor"
  )

  override def receive: Receive = {
    case downloadEvent: DownloadEvent ⇒
      val spider = downloadEvent.spider

      import io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher
      spider.pageProcessor.downloader.download(spider, downloadEvent.request).onComplete {
        case Success(page) if page.isDownloadSuccess ⇒
          processorActor ! ProcessorEvent(spider, page)
        case Success(page) =>
          // guard page.source: a failed download may carry no bytes, in which case
          // source throws and would have crashed this callback
          logger.warn(s"page failed to download cause ${scala.util.Try(page.source).getOrElse("unknown")}")
        case Failure(e) =>
          // BUG FIX: with the previous `.foreach`, a failed download Future was
          // dropped silently — no log line, no metric, no trace of the error
          logger.error(s"download failed for ${downloadEvent.request.url.getOrElse("")}", e)
      }
    case other ⇒
      logger.warn(s"${self.path} received wrong msg ${other}")
  }

  override def postStop(): Unit =
    if (logger.isWarnEnabled())
      logger.warn(s"downloader-processor [${self.path}] stopped!")

  override def postRestart(reason: Throwable): Unit =
    if (logger.isWarnEnabled())
      logger.warn(s"downloader-processor restart! ${reason.getLocalizedMessage}")
}
46 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/ProxyCrawlerPipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy
2 |
3 | import io.github.wtog.crawler.pipeline.Pipeline
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 6/2/18 11:57 PM
8 | * @version : 1.0.0
9 | */
/**
 * Pipeline that funnels crawled proxies into the shared ProxyProvider queue.
 */
object ProxyCrawlerPipeline extends Pipeline {

  override def process[R](pageResultItem: (String, R)): Unit = {
    val (url, result) = pageResultItem

    val proxy = result.asInstanceOf[ProxyDTO]
    logger.trace(s"${url} => ${proxy}")
    // BUG FIX: offer() returns false when the bounded queue (capacity 100) is full;
    // the return value was ignored, so proxies were dropped without any trace
    if (!ProxyProvider.proxyList.offer(proxy)) {
      logger.debug(s"proxy queue is full, dropped ${proxy}")
    }
  }

}
21 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/ProxyProvider.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy
2 |
3 | import java.net.{ HttpURLConnection, InetSocketAddress, URL }
4 | import java.util.Objects
5 | import java.util.concurrent.atomic.{ AtomicBoolean, AtomicInteger }
6 | import java.util.concurrent.{ ArrayBlockingQueue, Executors }
7 |
8 | import io.github.wtog.crawler.downloader.proxy.ProxyProvider._
9 | import io.github.wtog.crawler.downloader.proxy.ProxyStatusEnums.ProxyStatusEnums
10 | import io.github.wtog.crawler.processor.PageProcessor
11 | import io.github.wtog.crawler.schedule.{ ScheduleJob, ScheduleJobs }
12 | import io.github.wtog.crawler.spider.Spider
13 | import io.github.wtog.utils.ReflectionUtils
14 | import org.quartz.{ Job, JobExecutionContext }
15 | import org.slf4j.{ Logger, LoggerFactory }
16 |
17 | import scala.util.Try
18 | import java.util.concurrent.ExecutorService
19 |
20 | /**
21 | * @author : tong.wang
22 | * @since : 5/20/18 11:08 AM
23 | * @version : 1.0.0
24 | */
object ProxyProvider {
  lazy val logger: Logger = LoggerFactory.getLogger(ProxyProvider.getClass)

  // worker pool used by ProxyCheckScheduleJob to validate queued proxies
  val checkThread: ExecutorService = Executors.newFixedThreadPool(5)

  // bounded buffer of crawled proxies; capacity 100, offers beyond that are rejected
  val proxyList: ArrayBlockingQueue[ProxyDTO] = new ArrayBlockingQueue[ProxyDTO](100)

  // guards against starting the proxy crawler spiders more than once
  val proxySpiderCrawling: AtomicBoolean = new AtomicBoolean(false)

  // one spider per PageProcessor implementation found in the proxy.crawler package
  private lazy val proxyCrawlerList: Seq[Spider] = ReflectionUtils
    .implementationClasses(
      classOf[PageProcessor],
      "io.github.wtog.crawler.downloader.proxy.crawler"
    )
    .map(proxy ⇒ Spider(pageProcessor = proxy.newInstance()))

  // starts (or restarts) every discovered proxy-crawler spider
  private def crawlCronJob(restart: Boolean = false) =
    if (restart) proxyCrawlerList.foreach(_.restart())
    else proxyCrawlerList.foreach(_.start())

  /**
   * Starts the proxy crawl exactly once and schedules the recurring proxy
   * health check (every 2 seconds). Later calls are no-ops while the
   * crawling flag is set.
   */
  def startProxyCrawl(restart: Boolean = false): Unit =
    if (!proxySpiderCrawling.getAndSet(true)) {
      crawlCronJob(restart)
      ScheduleJobs.addJob(scheduleJob = ScheduleJob(jobName = "proxy-check", cronExpression = "*/2 * * ? * *", task = classOf[ProxyCheckScheduleJob]))
    }

  // peeks (does not remove) the queue head; only proxies with usability > 0.5 are handed out
  def getProxy: Option[ProxyDTO] = Option(proxyList.peek()).filter(_.usability > 0.5)
}
53 |
/**
 * Quartz job that re-validates queued proxies: each run submits five workers,
 * each of which polls one proxy, probes it, and re-queues it only when the
 * usability check still passes.
 */
class ProxyCheckScheduleJob extends Job {

  override def execute(context: JobExecutionContext): Unit = {
    if (logger.isDebugEnabled) {
      logger.debug(s"proxy list is ${proxyList.size()}")
    }

    val verifyOneProxy: Runnable = new Runnable {
      override def run(): Unit =
        Option(proxyList.poll()).foreach { proxy ⇒
          val (_, usable) = proxy.usabilityCheck(proxy.connect2Baidu())
          // unusable proxies are simply dropped; usable ones go back in the queue
          if (usable) proxyList.put(proxy)
        }
    }

    (1 to 5).foreach(_ => checkThread.execute(verifyOneProxy))
  }

}
77 |
/**
 * A crawled proxy endpoint plus its rolling health statistics.
 *
 * @param host       proxy host
 * @param port       proxy port
 * @param username   optional credentials (not used by the baidu probe)
 * @param password   optional credentials
 * @param status     informational lifecycle flag
 * @param usability  rolling success ratio in [0, 1], updated by usabilityCheck
 */
final case class ProxyDTO(
    host: String,
    port: Int,
    username: Option[String] = None,
    password: Option[String] = None,
    var status: ProxyStatusEnums = ProxyStatusEnums.IDLE,
    var usability: Float = 0f) {

  val checkUrl: URL = new URL("http://www.baidu.com")
  val checkTimes: AtomicInteger = new AtomicInteger(0)
  val successTimes: AtomicInteger = new AtomicInteger(0)
  val usabilityLimit = 0.5

  /** Probes the proxy with a 1s-timeout GET against baidu; any failure counts as unusable. */
  def connect2Baidu(): Boolean = {
    import java.net.Proxy
    Try {
      val proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(host, port))
      val connection = checkUrl.openConnection(proxy).asInstanceOf[HttpURLConnection]
      connection.setConnectTimeout(1000)
      connection.setReadTimeout(1000)

      // BUG FIX: the connection was never disconnected, leaking the underlying socket
      try connection.getResponseCode == 200
      finally connection.disconnect()
    }.recover {
      case _: Throwable =>
        false
    }.get

  }

  /**
   * Records one probe result and returns (new usability, usability above limit).
   *
   * BUG FIX: the ratio used integer division (Int / Int), so usability was 0
   * unless every single check succeeded — which made `usability > 0.5` filters
   * reject almost every proxy. The dividend is now converted to Float first.
   */
  def usabilityCheck(checkWay: => Boolean): (Float, Boolean) = {
    usability = {
      val checkTimeValue = checkTimes.incrementAndGet()
      if (checkWay)
        successTimes.incrementAndGet().toFloat / checkTimeValue
      else
        successTimes.get().toFloat / checkTimeValue
    }

    (usability, usability > usabilityLimit)
  }

  // identity is host:port only; the mutable stats fields are deliberately excluded
  override def hashCode(): Int = Objects.hash(this.host.asInstanceOf[Object], this.port.asInstanceOf[Object])

  override def equals(obj: scala.Any): Boolean = obj match {
    case t: ProxyDTO ⇒
      t.host == this.host && t.port == this.port
    case _ ⇒
      false
  }

  override def toString: String = s"${host}:${port}"
}
130 |
// lifecycle states of a proxy entry
// NOTE(review): `status` is written (default IDLE) but never read within this file — confirm use
object ProxyStatusEnums extends Enumeration {
  type ProxyStatusEnums = Value
  val USING: Value = Value("using")
  val IDLE: Value = Value("idle")
}
136 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/crawler/A2UPageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy.crawler
2 |
3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO
4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
5 |
6 | import scala.concurrent.duration._
7 |
8 | /**
9 | * https://raw.githubusercontent.com/a2u/free-proxy-list/master/free-proxy-list.txt
10 | * @author : tong.wang
11 | * @since : 6/3/18 12:33 AM
12 | * @version : 1.0.0
13 | */
/**
 * Crawls the a2u plain-text proxy list (whitespace-separated `host:port` lines).
 */
case class A2UPageProcessor() extends ProxyProcessorTrait {

  /**
   * Parses each `host:port` entry into a ProxyDTO.
   *
   * BUG FIX: the previous head/last + toInt parsing threw NumberFormatException on
   * any malformed entry (blank token, missing port), aborting the whole page;
   * malformed entries are now skipped with a warning.
   */
  override def doProcess(page: Page): Unit =
    page.body.text().split(" ").foreach { entry ⇒
      entry.split(":") match {
        case Array(host, port) if port.nonEmpty && port.length <= 5 && port.forall(_.isDigit) =>
          page.addPageResultItem(ProxyDTO(host, port.toInt))
        case _ =>
          logger.warn(s"skip malformed proxy entry: ${entry}")
      }
    }

  override def requestSetting: RequestSetting = RequestSetting(domain = "raw.githubusercontent.com", sleepTime = 2 seconds)

  override def targetUrls: List[String] =
    List(
      "https://raw.githubusercontent.com/a2u/free-proxy-list/master/free-proxy-list.txt"
    )

}
33 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/crawler/Data5UPageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy.crawler
2 |
3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO
4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
5 |
6 | import scala.concurrent.duration._
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 6/7/18 11:27 PM
11 | * @version : 1.0.0
12 | */
/**
 * Crawls data5u.com's free proxy table: for each row, span 1 holds the host
 * and span 2 holds the port.
 */
case class Data5UPageProcessor() extends ProxyProcessorTrait {

  override def doProcess(page: Page): Unit = {
    val rows = page.dom(".wlist > ul > li:nth-child(2) .l2")

    (0 until rows.size()).foreach { idx ⇒
      val row = rows.get(idx)
      val host = row.select("span:nth-child(1)").text()
      val port = row.select("span:nth-child(2)").text()

      page.addPageResultItem(ProxyDTO(host, port.toInt))
    }
  }

  override def cronExpression: Option[String] = Some("*/5 * * ? * *")

  override def requestSetting: RequestSetting = RequestSetting(domain = "www.data5u.com", sleepTime = 2 second)

  override def targetUrls: List[String] = List("http://www.data5u.com")

}
36 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/crawler/IP89Processor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy.crawler
2 |
3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO
4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
5 |
6 | import scala.concurrent.duration._
7 | import scala.util.Try
8 |
9 | /**
10 | * http://www.89ip.cn
11 | * @author : tong.wang
12 | * @since : 6/3/18 12:33 AM
13 | * @version : 1.0.0
14 | */
/**
 * Crawls 89ip.cn's proxy table (first td = host, second td = port).
 */
case class IP89Processor() extends ProxyProcessorTrait {

  override def doProcess(page: Page): Unit = {
    val iprows = page.table("tbody tr")
    iprows.foreach { ip =>
      val tds = ip.select("td")
      // fall back to port 80 when the port cell is missing or not numeric
      val proxDto = ProxyDTO(host = tds.get(0).text(), port = Try(tds.get(1).text().toInt).getOrElse(80))
      page.addPageResultItem(proxDto)
    }
  }

  // BUG FIX: the domain previously said "www.89ip.com" while the crawled site
  // (file header comment and targetUrls) is www.89ip.cn — the mismatch made the
  // request setting inconsistent with the actual target
  override def requestSetting: RequestSetting = RequestSetting(domain = "www.89ip.cn", sleepTime = 2 seconds)

  override def targetUrls: List[String] = List("http://www.89ip.cn")

}
30 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/crawler/ProxyProcessorTrait.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.downloader.proxy.crawler
2 |
3 | import io.github.wtog.crawler.downloader.proxy.ProxyCrawlerPipeline
4 | import io.github.wtog.crawler.pipeline.Pipeline
5 | import io.github.wtog.crawler.processor.PageProcessor
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 9/16/18 10:34 AM
10 | * @version : 1.0.0
11 | */
/**
 * Base trait for the built-in proxy-site processors: every implementation runs
 * on a shared five-second cron and funnels its results into ProxyCrawlerPipeline.
 */
trait ProxyProcessorTrait extends PageProcessor {
  // default crawl schedule shared by all proxy processors (every 5 seconds)
  override def cronExpression: Option[String] = Some("*/5 * * ? * *")

  // all crawled proxies go through the proxy pipeline into ProxyProvider's queue
  override val pipelines: Set[Pipeline] = Set(ProxyCrawlerPipeline)
}
17 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/dto/Event.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.dto
2 |
3 | import io.github.wtog.crawler.pipeline.Pipeline
4 | import io.github.wtog.crawler.spider.Spider
5 | import io.github.wtog.utils.logger.Logging
6 |
7 | import scala.util.{ Failure, Success, Try }
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 2019-05-01 22:56
12 | * @version : 1.0.0
13 | */
// marker for all messages exchanged between the crawler actors
sealed trait Event

// asks the downloader actor to fetch `request` on behalf of `spider`
case class DownloadEvent(spider: Spider, request: RequestSetting) extends Event

// hands a downloaded page to the processor actor
case class ProcessorEvent(spider: Spider, page: Page) extends Event
19 |
case class PipelineEvent[R](pipelineList: Set[Pipeline], pageResultItems: (String, R)) extends Event with Logging {

  /**
   * Opens every pipeline (each failure is logged, none aborts the loop) and
   * returns Some(this) only when all of them opened successfully.
   */
  def initPipelines(): Option[PipelineEvent[R]] = {
    val failures = pipelineList.foldLeft(0) { (failed, pipeline) =>
      Try(pipeline.open()) match {
        case Success(_) =>
          failed
        case Failure(exception) =>
          logger.error(s"failed to init pipeline ${exception.getLocalizedMessage}")
          failed + 1
      }
    }

    if (failures == 0) Some(this) else None
  }
}
36 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/dto/Page.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.dto
2 |
3 | import java.net.URL
4 | import java.util.concurrent.LinkedBlockingQueue
5 |
6 | import io.github.wtog.crawler.queue.TargetRequestTaskQueue
7 | import io.github.wtog.crawler.selector.HtmlParser
8 | import io.github.wtog.crawler.selector.HtmlParser.parseJson
9 | import io.github.wtog.utils.logger.Logging
10 |
11 | import scala.util.Try
12 |
13 | /**
14 | * @author : tong.wang
15 | * @since : 1/2/20 9:41 PM
16 | * @version : 1.0.0
17 | */
/**
 * Result of one download: the raw bytes plus accumulators for extracted items
 * and follow-up requests discovered while processing.
 */
case class Page(
    isDownloadSuccess: Boolean = true,
    bytes: Option[Array[Byte]] = None,
    responseHeaders: Map[String, String] = Map.empty[String, String],
    xhrResponses: Seq[XhrResponse] = Seq.empty[XhrResponse],
    requestSetting: RequestSetting) {

  // items extracted by the processor, drained later by the pipeline actor
  lazy val resultItems: LinkedBlockingQueue[Any] = new LinkedBlockingQueue[Any]

  // follow-up requests queued while parsing this page
  lazy val requestQueue: TargetRequestTaskQueue = new TargetRequestTaskQueue()

  // NOTE: throws when the request carried no url
  lazy val url = requestSetting.url.get

  /** Decoded page body; fails when the download produced no bytes. */
  def source: String =
    bytes
      .map(content ⇒ HtmlParser.getHtmlSourceWithCharset(content, requestSetting.charset))
      .getOrElse(throw new IllegalStateException("no page source text found "))

  /** Parses the page body (or the given text) as JSON into T. */
  def json[T: Manifest](text: Option[String] = None): T = parseJson[T](text.getOrElse(this.source))

  def addTargetRequest(urlAdd: String): Unit = addRequest(this.requestSetting.withUrl(url = urlAdd))

  def addTargetRequest(requestUri: RequestUri): Unit = addRequest(this.requestSetting.withRequestUri(requestUri))

  // only well-formed URLs are queued; malformed ones are dropped silently
  private[this] def addRequest(setting: RequestSetting): Unit = {
    val target = setting.url.get

    if (Try(new URL(target)).isSuccess) {
      this.requestQueue.push(setting)
    }
  }

  def addPageResultItem[R](result: R): Unit = this.resultItems.add(result)

  override def toString: String = s"${requestSetting.url.get} downloaded ${isDownloadSuccess}"
}
55 |
object Page extends Logging {
  /** Builds the failure-marker page (no bytes) for a request whose download threw. */
  def failed(requestSetting: RequestSetting, exceptionMessage: Throwable): Page = {
    logger.warn(s"failed to download cause ${exceptionMessage.getLocalizedMessage}")
    Page(requestSetting = requestSetting, isDownloadSuccess = false)
  }
}
62 |
// one captured XHR call made while rendering a page: the request uri and its parsed body
case class XhrResponse(xhrUri: String, result: Map[String, Any])
64 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/dto/RequestSetting.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.dto
2 |
3 | import java.nio.charset.Charset
4 |
5 | import io.netty.handler.codec.http.HttpMethod
6 |
7 | import scala.collection.mutable
8 | import scala.concurrent.duration.Duration
9 | import scala.concurrent.duration._
10 |
11 | /**
12 | * @author : tong.wang
13 | * @since : 1/2/20 9:43 PM
14 | * @version : 1.0.0
15 | */
/**
 * Immutable-ish description of one HTTP request (url, method, headers, timing).
 *
 * NOTE(review): `headers` is a shared *mutable* map and `copy` copies only the
 * reference — every RequestSetting derived via the with* helpers aliases the
 * same map, so `withHeaders`/`addHeader` mutate all of them. Confirm this
 * sharing is intentional before relying on per-copy header isolation.
 */
case class RequestSetting(
    domain: String = "",
    method: String = HttpMethod.GET.toString,
    url: Option[String] = None,
    userAgent: String = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
    requestBody: Option[String] = None,
    headers: mutable.Map[String, String] = mutable.Map.empty[String, String],
    sleepTime: Duration = 1 seconds,
    cookies: Option[Map[String, String]] = None,
    charset: String = Charset.defaultCharset().name(),
    retryTime: Int = 0,
    timeOut: Duration = 3 seconds,
    useProxy: Boolean = false,
    xhrRequests: Set[String] = Set.empty[String]) {

  def withUrlAndMethod(url: String, method: String = HttpMethod.GET.toString): RequestSetting =
    this.copy(url = Some(url), method = method)

  def withUrl(url: String): RequestSetting = this.copy(url = Some(url))

  def withSleepTime(sleepTime: Duration): RequestSetting = this.copy(sleepTime = sleepTime)

  // mutates the shared headers map in place and returns this same instance
  def withHeaders(extraHeaders: Map[String, String]): RequestSetting = {
    this.headers ++= extraHeaders.toSeq
    this
  }

  // mutates the shared headers map in place and returns this same instance
  def addHeader(header: String, value: String): RequestSetting = {
    this.headers += (header -> value)
    this
  }

  def withMethodAndRequestBody(method: String, requestBody: Option[String]): RequestSetting =
    this.copy(method = method, requestBody = requestBody)

  /** Builds the setting for a follow-up request, merging any extra headers it carries. */
  def withRequestUri(requestUri: RequestUri): RequestSetting = {
    val basic = this.copy(
      url = Some(requestUri.url),
      method = requestUri.method,
      requestBody = requestUri.requestBody,
      xhrRequests = requestUri.xhrRequests
    )

    requestUri.headers.fold(basic) { extra ⇒
      basic.withHeaders(extra)
    }
  }

  def withXhrRequests(xhrRequest: String*): RequestSetting = {
    val requests = xhrRequest.foldLeft(this.xhrRequests) { (xhrRequests, xhrRequest) =>
      xhrRequests + xhrRequest
    }
    this.copy(xhrRequests = requests)
  }

  // reflective dump of all fields, skipping empty strings (None renders as "" and is skipped);
  // reading this class's own private fields reflectively is legal from inside the class
  override def toString: String = {
    val fields = this.getClass.getDeclaredFields
      .map { field =>
        val value = field.get(this) match {
          case v: Option[Any] =>
            v.getOrElse("")
          case v =>
            v
        }

        (s"${field.getName}: $value", value)
      }
      .collect {
        case (v: String, t: String) if !t.isEmpty => v
        case (v: String, t: Any) if !t.isInstanceOf[String] => v
      }

    s"${fields.mkString(", ")}"
  }
}
91 |
/**
 * Lightweight description of a follow-up request queued by a page processor;
 * merged into the spider's base RequestSetting via withRequestUri.
 */
case class RequestUri(
    url: String,
    method: String = HttpMethod.GET.toString,
    requestBody: Option[String] = None,
    headers: Option[Map[String, String]] = None,
    xhrRequests: Set[String] = Set.empty[String])
98 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/exceptions/NonNullArgumentsException.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.exceptions
2 |
3 | /**
4 | * @author : tong.wang
5 | * @since : 5/19/18 1:38 PM
6 | * @version : 1.0.0
7 | */
/**
 * Thrown when one or more required arguments are null/missing; the message
 * lists the offending argument names.
 */
case class NonNullArgumentsException(arguments: String*) extends IllegalArgumentException {
  // BUG FIX: message typo "cant" -> "can't"
  override def getLocalizedMessage: String = arguments.mkString(",") + " can't be null"
}
11 |
/**
 * Thrown when arguments have an unexpected type; the message lists the
 * offending argument names.
 */
case class IllegalArgumentsException(arguments: String*) extends IllegalArgumentException {
  // BUG FIX: message typo "illeage" -> "illegal"
  override def getLocalizedMessage: String = arguments.mkString(",") + " type is illegal"
}
15 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/pipeline/ConsolePipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline
2 |
3 | /**
4 | * @author : tong.wang
5 | * @since : 10/18/19 10:07 PM
6 | * @version : 1.0.0
7 | */
/**
 * Debugging sink: emits every crawl result to the trace log. It is appended
 * automatically by PipelineActorReceiver when trace logging is enabled.
 */
object ConsolePipeline extends Pipeline {

  def process[R](pageResultItem: (String, R)): Unit =
    logger.trace(s"crawl result: ${pageResultItem._1} - ${pageResultItem._2}")

}
15 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/pipeline/Pipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline
2 |
3 | import java.util.concurrent.atomic.AtomicBoolean
4 |
5 | import io.github.wtog.utils.logger.Logging
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 5/16/18 9:09 PM
10 | * @version : 1.0.0
11 | */
/**
 * A sink for crawl results. open() is idempotent: init() runs at most once
 * per pipeline instance no matter how many events reference it.
 */
trait Pipeline extends Logging {

  // flips to true exactly once, on the first open()
  private val inited = new AtomicBoolean(false)

  def open(): Unit =
    if (!inited.getAndSet(true)) init()

  // one-time setup hook for concrete pipelines (connections, files, ...)
  protected def init(): Unit = ()

  /** Consumes one (url, result) pair extracted from a page. */
  def process[Result](pageResultItem: (String, Result)): Unit
}
25 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/pipeline/PipelineActorReceiver.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline
2 |
3 | import akka.actor.Actor
4 | import io.github.wtog.crawler.dto.PipelineEvent
5 | import org.slf4j.{ Logger, LoggerFactory }
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 5/16/18 11:54 PM
10 | * @version : 1.0.0
11 | */
/**
 * Actor that fans one PipelineEvent out to every configured pipeline;
 * ConsolePipeline is added automatically when trace logging is on.
 */
class PipelineActorReceiver extends Actor {

  private lazy val logger: Logger = LoggerFactory.getLogger(classOf[PipelineActorReceiver])

  override def receive: Receive = {
    case event: PipelineEvent[_] ⇒
      val targets =
        if (logger.isTraceEnabled()) event.pipelineList + ConsolePipeline
        else event.pipelineList

      targets.foreach(_.process(event.pageResultItems))
    case other ⇒
      logger.warn(s"${this.getClass.getSimpleName} received wrong msg ${other}")
  }
}
29 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/processor/PageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.processor
2 |
3 | import io.github.wtog.crawler.actor.ExecutionContexts.processorDispatcher
4 | import io.github.wtog.crawler.downloader.{AsyncHttpClientDownloader, Downloader}
5 | import io.github.wtog.crawler.dto.{Page, RequestSetting, RequestUri}
6 | import io.github.wtog.crawler.pipeline.{ConsolePipeline, Pipeline}
7 | import io.github.wtog.crawler.selector.HtmlParser
8 | import io.github.wtog.utils.logger.Logging
9 |
10 | import scala.concurrent.Future
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 5/16/18 9:48 PM
15 | * @version : 1.0.0
16 | */
/**
 * Contract for a site-specific crawler: declares what to fetch (targetRequests),
 * how to fetch it (downloader, requestSetting), how to parse it (doProcess) and
 * where results go (pipelines).
 */
trait PageProcessor extends HtmlParser with Logging {

  val name: String = this.getClass.getSimpleName

  /**
   * download client
   */
  val downloader: Downloader[_] = AsyncHttpClientDownloader

  /**
   * the target urls for processor to crawl
   *
   * @return
   */
  @deprecated
  def targetUrls: List[String] = Nil

  /**
   * the target request for processor to crawl
   * @return
   */
  def targetRequests: List[RequestUri] =
    if (targetUrls.nonEmpty) {
      targetUrls.map(url => RequestUri(url))
    } else {
      List.empty[RequestUri]
    }

  /**
   * handle the crawled result
   *
   * @return
   */
  val pipelines: Set[Pipeline] = Set(ConsolePipeline)

  /**
   * parse the html source code
   *
   * @param page
   */
  def process(page: Page): Future[Unit] = Future {
    try {
      doProcess(page)
    } catch {
      case e: Throwable =>
        // BUG FIX: page.source throws IllegalStateException when the page carries no
        // bytes, and that secondary failure used to mask the original processing error
        val sourceText = scala.util.Try(page.source).getOrElse("<source unavailable>")
        logger.error(s"failed to process page ${page.url} -> ${sourceText}", e)
        throw e
    }
  }

  protected def doProcess(page: Page): Unit

  /**
   * set request config for processor
   *
   * @return
   */
  def requestSetting: RequestSetting = RequestSetting(url = None)

  /**
   * schedule cron job expression
   *
   * @return
   */
  def cronExpression: Option[String] = None

}
84 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/processor/PageProcessorActorReceiver.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.processor
2 |
3 | import java.util.concurrent.LinkedBlockingQueue
4 |
5 | import akka.actor.{ Actor, ActorRef, PoisonPill, Props }
6 | import io.github.wtog.crawler.actor.ExecutionContexts.processorDispatcher
7 | import io.github.wtog.crawler.dto.{ DownloadEvent, PipelineEvent, ProcessorEvent }
8 | import io.github.wtog.crawler.pipeline.{ Pipeline, PipelineActorReceiver }
9 | import io.github.wtog.crawler.queue.RequestQueue
10 | import io.github.wtog.crawler.spider.Spider
11 | import org.slf4j.{ Logger, LoggerFactory }
12 |
13 | import scala.util.{ Failure, Success }
14 |
15 | /**
16 | * @author : tong.wang
17 | * @since : 5/16/18 11:54 PM
18 | * @version : 1.0.0
19 | */
class PageProcessorActorReceiver extends Actor {

  private lazy val logger: Logger = LoggerFactory.getLogger(classOf[PageProcessorActorReceiver])

  // downstream actor that runs the pipelines on extracted result items
  val pipelineActor: ActorRef = context.actorOf(Props[PipelineActorReceiver].withDispatcher("crawler.pipeline-dispatcher"), "pipeline-processor")

  override def receive: Receive = {
    case processorEvent: ProcessorEvent ⇒
      val page = processorEvent.page
      val spider = processorEvent.spider
      // capture sender() now: it is not stable inside the Future callback below
      val downloadSender = sender()

      // NOTE(review): a failed process() Future is dropped here — only the success
      // path records metrics and schedules follow-ups; confirm this is intended
      spider.pageProcessor.process(page).foreach { _ ⇒
        pipelineProcess(page.requestSetting.url.get, page.resultItems)(spider.pageProcessor.pipelines)(downloadSender)
        spider.CrawlMetric.processedSuccessCounter

        continueRequest(page.requestQueue)(spider)(downloadSender)
      }
    case other ⇒
      logger.warn(s"${self.path} received wrong msg ${other}")
  }

  // drains the page's extracted items and hands each to the pipeline actor;
  // when a pipeline fails to open, the sending downloader actor is stopped
  private[this] def pipelineProcess(url: String, pageResultItems: LinkedBlockingQueue[Any])(pipelines: Set[Pipeline])(downloadSender: ActorRef): Unit =
    while (!pageResultItems.isEmpty) {
      Option(pageResultItems.poll()).foreach { item ⇒
        PipelineEvent(pipelines, (url, item)).initPipelines() match {
          case Some(e) => pipelineActor ! e
          case None => downloadSender ! PoisonPill
        }
      }
    }

  // feeds every newly discovered request back to the downloader actor
  private[this] def continueRequest(targetRequests: RequestQueue)(spider: Spider)(downloadSender: ActorRef): Unit =
    while (targetRequests.nonEmpty) {
      targetRequests.poll().foreach { targetRequest ⇒
        downloadSender ! DownloadEvent(spider, targetRequest)
      }
    }
}
59 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/DuplicateRemovedQueue.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue
2 |
3 | import io.github.wtog.crawler.dto.RequestSetting
4 | import io.github.wtog.crawler.queue.duplicate.DuplicateRemovedStrategy
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 5/16/18 10:07 PM
9 | * @version : 1.0.0
10 | */
/**
 * Request queue decorator that drops requests already seen according to the
 * given strategy. GET requests are keyed by url, POST requests by url+body;
 * unknown methods are always admitted.
 */
abstract class DuplicateRemovedQueue(duplicateRemovedStrategy: DuplicateRemovedStrategy) extends RequestQueue {

  override def push(request: RequestSetting): Unit =
    if (isNotDuplicateRequest(request)) pushWhenNoDuplicate(request)

  private def isNotDuplicateRequest(request: RequestSetting): Boolean = {
    // None means "no dedup key" (unknown method) and admits the request
    val dedupKey: Option[String] = request.method match {
      case "GET" ⇒
        Some(request.url.get)
      case "POST" ⇒
        Some(request.url.get + request.requestBody.getOrElse(""))
      case other ⇒
        logger.warn(s"unknown request method type: ${other}")
        None
    }

    dedupKey.forall(key => !duplicateRemovedStrategy.isDuplicate(key))
  }

  // concrete queues store the request once it has passed dedup
  protected def pushWhenNoDuplicate(request: RequestSetting): Unit
}
34 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/RequestQueue.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import io.github.wtog.crawler.dto.RequestSetting
6 | import org.slf4j.{ Logger, LoggerFactory }
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 5/16/18 10:03 PM
11 | * @version : 1.0.0
12 | */
/**
 * FIFO of pending crawl requests. poll() enforces each request's politeness
 * delay by sleeping the caller for the request's sleepTime before returning it.
 */
trait RequestQueue {
  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def push(request: RequestSetting): Unit

  def poll(): Option[RequestSetting] = {
    val polled = doPoll()
    // politeness delay: blocks the polling thread, not a scheduler
    polled.foreach(r => TimeUnit.MILLISECONDS.sleep(r.sleepTime.toMillis))
    polled
  }

  protected def doPoll(): Option[RequestSetting]

  def isEmpty: Boolean

  def nonEmpty: Boolean = !isEmpty
}
30 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/TargetRequestTaskQueue.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue
2 |
3 | import java.util.concurrent.LinkedBlockingQueue
4 |
5 | import io.github.wtog.crawler.dto.RequestSetting
6 | import io.github.wtog.crawler.queue.duplicate.{ DuplicateRemovedStrategy, HashMapStrategy }
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 5/16/18 10:12 PM
11 | * @version : 1.0.0
12 | */
/**
 * In-memory FIFO request queue backed by an unbounded [[LinkedBlockingQueue]],
 * with duplicates filtered by the supplied strategy (HashMap-based by default).
 */
class TargetRequestTaskQueue(duplicateRemovedStrategy: DuplicateRemovedStrategy = HashMapStrategy) extends DuplicateRemovedQueue(duplicateRemovedStrategy) {

  // Unbounded, thread-safe FIFO of pending requests; created on first use.
  private lazy val pending: LinkedBlockingQueue[RequestSetting] = new LinkedBlockingQueue[RequestSetting]()

  override def pushWhenNoDuplicate(request: RequestSetting): Unit = {
    pending.add(request)
    ()
  }

  override def isEmpty: Boolean = pending.isEmpty

  // Non-blocking take; None when the queue is empty.
  override def doPoll(): Option[RequestSetting] = Option(pending.poll())
}
23 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/BitSetStrategy.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue.duplicate
2 |
3 | import scala.collection.BitSet
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 11/8/18 1:31 PM
8 | * @version : 1.0.0
9 | */
/**
 * Duplicate detection backed by a bit set of 31-bit URL hash codes.
 *
 * NOTE(review): a BitSet sized by arbitrary masked hash codes can allocate a
 * very large backing array for URLs with large hashes; prefer HashMapStrategy
 * for general use. `urlBitSet` stays a public var for backward compatibility.
 */
object BitSetStrategy extends DuplicateRemovedStrategy {
  var urlBitSet = BitSet.empty

  /**
   * Returns true when the URL's hash was seen before; otherwise records it
   * and returns false.
   *
   * Fixed: the check-then-update is now synchronized. The original read and
   * reassigned the shared `var` without any synchronization, so two threads
   * could both observe "not duplicate" and one update could be lost.
   */
  override def isDuplicate(url: String): Boolean = this.synchronized {
    val urlHashCode = urlToHashCode(url)
    val isDuplicate = urlBitSet.contains(urlHashCode)

    if (!isDuplicate) {
      urlBitSet += urlHashCode
    }

    isDuplicate
  }

  // Non-negative hash (sign bit masked off, since BitSet rejects negative elements).
  def urlToHashCode(url: String): Int =
    url.hashCode & 0x7FFFFFFF
}
27 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/DuplicateRemovedStrategy.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue.duplicate
2 |
3 | /**
4 | * @author : tong.wang
5 | * @since : 6/1/18 8:44 AM
6 | * @version : 1.0.0
7 | */
/**
 * Strategy deciding whether a URL (or URL+body key) has already been crawled.
 * Implementations in this package record the key as a side effect of the
 * first check, so a second call with the same key returns true.
 */
trait DuplicateRemovedStrategy {

  // True if `url` was seen before; implementations may also mark it as seen.
  def isDuplicate(url: String): Boolean

}
13 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/HashMapStrategy.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.queue.duplicate
2 |
3 | import java.util.concurrent.ConcurrentHashMap
4 |
5 | import scala.concurrent.duration._
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 6/1/18 11:59 PM
10 | * @version : 1.0.0
11 | */
/**
 * Duplicate detection with a 10-minute expiry: a URL counts as duplicate only
 * if it was admitted less than 10 minutes ago; older entries are refreshed and
 * the URL is allowed to be crawled again.
 */
object HashMapStrategy extends DuplicateRemovedStrategy {
  // url hash -> epoch millis of the last time the url was admitted.
  private[this] val urlMap: ConcurrentHashMap[Int, Long] = new ConcurrentHashMap()

  /**
   * Returns true when the URL was admitted within the last 10 minutes;
   * otherwise records/refreshes its timestamp and returns false.
   */
  override def isDuplicate(url: String): Boolean = {
    val urlHashCode = url.hashCode

    urlMap.containsKey(urlHashCode) match {
      case duplicated @ true if (passedMinutes(urlMap.get(urlHashCode), 10 minutes)) =>
        // Entry expired: refresh the timestamp and allow a re-crawl.
        urlMap.put(urlHashCode, System.currentTimeMillis())
        !duplicated
      case duplicated @ true =>
        duplicated
      case nonDuplicated @ false =>
        urlMap.put(urlHashCode, System.currentTimeMillis())
        nonDuplicated
    }

  }

  /**
   * True when `duration` has elapsed since `latest`.
   * Fixed: the original computed `latest - now`, which is negative for any
   * past timestamp, so entries never expired and URLs were deduped forever.
   */
  private[this] def passedMinutes(latest: Long, duration: Duration) = (System.currentTimeMillis() - latest) > duration.toMillis

}
34 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/rest/NettyServer.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.rest
2 |
3 | import java.net.URI
4 |
5 | import io.netty.bootstrap.ServerBootstrap
6 | import io.netty.buffer.Unpooled
7 | import io.netty.channel._
8 | import io.netty.channel.nio.NioEventLoopGroup
9 | import io.netty.channel.socket.SocketChannel
10 | import io.netty.channel.socket.nio.NioServerSocketChannel
11 | import io.netty.handler.codec.http._
12 | import io.netty.handler.logging.{ LogLevel, LoggingHandler }
13 | import io.netty.handler.ssl.SslContext
14 |
15 | /**
16 | * @author : tong.wang
17 | * @since : 2019-08-28 10:24
18 | * @version : 1.0.0
19 | */
/**
 * HTTP server implementation backed by Netty's NIO transport.
 */
object NettyServer extends Server {

  /**
   * Boots the server on [[port]] and blocks the calling thread until the
   * server channel closes; both event-loop groups are shut down on exit.
   */
  override def doStart(routes: Set[Router]): Unit = {
    val acceptorGroup = new NioEventLoopGroup(1)
    val ioGroup       = new NioEventLoopGroup()
    try {
      val serverChannel = new ServerBootstrap()
        .group(acceptorGroup, ioGroup)
        .channel(classOf[NioServerSocketChannel])
        .handler(new LoggingHandler(LogLevel.INFO))
        .childHandler(new ServerInitializer(routes = routes ++ defaultRoutes))
        .bind(port)
        .sync()
        .channel()

      // Block here until the listening socket is closed.
      serverChannel.closeFuture().sync()
    } finally {
      acceptorGroup.shutdownGracefully()
      ioGroup.shutdownGracefully()
    }
  }

}
41 |
/**
 * Per-connection pipeline setup: optional TLS, HTTP codec, aggregation of
 * chunked messages into a single FullHttpRequest, then route dispatch.
 *
 * @param routes     routes served by the RouterHandler at the pipeline tail
 * @param sslContext optional TLS context; when present an SSL handler is
 *                   installed first. Fixed: the original accepted this
 *                   parameter but never used it.
 */
class ServerInitializer(routes: Set[Router], sslContext: Option[SslContext] = None) extends ChannelInitializer[SocketChannel] {

  override def channelReadComplete(ctx: ChannelHandlerContext): Unit = ctx.flush()

  override def initChannel(channel: SocketChannel): Unit = {
    val pipeline = channel.pipeline()
    sslContext.foreach(ssl => pipeline.addLast(ssl.newHandler(channel.alloc())))
    pipeline
      .addLast(new HttpRequestDecoder)
      .addLast(new HttpResponseEncoder)
      // NOTE(review): 1024 bytes is a very small aggregation limit; request
      // bodies over 1 KiB will be rejected — confirm this is intentional.
      .addLast(new HttpObjectAggregator(1024))
      .addLast(new RouterHandler(routes))
    ()
  }

}
55 |
/**
 * Terminal inbound handler: matches the request (method + path) against the
 * registered routes and writes a plain-text response.
 */
class RouterHandler(routes: Set[Router]) extends SimpleChannelInboundHandler[FullHttpRequest](true) {

  import io.netty.handler.codec.http.HttpResponseStatus._

  /**
   * Dispatches to the first route whose method and path both match;
   * 404 when none matches, 400 when route handling throws.
   */
  def channelRead0(ctx: ChannelHandlerContext, request: FullHttpRequest): Unit =
    try {
      val uri = new URI(request.uri()).getPath
      val method = request.method().name()

      routes.find(route => route.method == method && route.route == uri) match {
        case Some(handler) =>
          responseOk(request, handler.handleRequest(request))(ctx)
        case None =>
          responseNotFound(request, "not found".getBytes())(ctx)
      }
    } catch {
      case e: Throwable =>
        responseBadRequest(request, e.getLocalizedMessage.getBytes())(ctx)
    }

  override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = {
    cause.printStackTrace() // NOTE(review): no logger in scope; consider routing through slf4j.
    ctx.close()
  }

  private[this] def responseBadRequest(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, BAD_REQUEST, request, resp)

  private[this] def responseOk(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, OK, request, resp)

  private[this] def responseNotFound(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, NOT_FOUND, request, resp)

  /**
   * Writes a full plain-text HTTP/1.1 response; closes the connection after
   * the write unless the client requested keep-alive.
   */
  private[this] def response(ctx: ChannelHandlerContext, status: HttpResponseStatus, req: FullHttpRequest, resp: Array[Byte]) = {
    import io.netty.handler.codec.http.HttpHeaderNames._
    import io.netty.handler.codec.http.HttpVersion._

    val keepAlive = HttpUtil.isKeepAlive(req)
    val content = Unpooled.copiedBuffer(resp)
    val response = new DefaultFullHttpResponse(HTTP_1_1, status, content)

    response.headers.set(CONTENT_TYPE, "text/plain")
    response.headers.set(CONTENT_LENGTH, content.readableBytes)

    if (!keepAlive) {
      ctx.write(response).addListener(ChannelFutureListener.CLOSE)
    } else {
      // Fixed: the Connection header *value* must come from HttpHeaderValues;
      // the original used the header-name constant HttpHeaderNames.KEEP_ALIVE.
      response.headers().set(CONNECTION, HttpHeaderValues.KEEP_ALIVE)
      ctx.write(response)
    }
    ctx.flush()
  }

}
111 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/rest/Router.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.rest
2 |
3 | import io.github.wtog.crawler.spider.SpiderPool
4 | import io.github.wtog.utils.JsonUtils
5 | import io.netty.handler.codec.http.FullHttpRequest
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 2019-08-28 10:38
10 | * @version : 1.0.0
11 | */
/**
 * A single HTTP endpoint: an HTTP method, an absolute path, and a handler
 * producing the raw response body.
 */
trait Router {

  // HTTP method this route answers to, e.g. "GET".
  def method: String

  // Absolute request path this route answers to, e.g. "/spiders".
  def route: String

  // Produces the raw response body for a matched request.
  def handleRequest(request: FullHttpRequest): Array[Byte]

  // Lets implementations return plain strings where Array[Byte] is expected.
  implicit def toBytes(content: String): Array[Byte] = content.getBytes()
}
22 |
/**
 * GET /spiders — renders the metric snapshot of every pooled spider as JSON.
 */
object SpiderStatusRoute extends Router {
  override def method: String = "GET"

  override def route: String = "/spiders"

  override def handleRequest(request: FullHttpRequest): Array[Byte] = {
    // map + reverse reproduces the original foldLeft-with-prepend ordering.
    val metrics = SpiderPool
      .fetchAllSpiders()
      .map(_.CrawlMetric.metricInfo())
      .reverse
      .toList
    JsonUtils.toJson(metrics)
  }
}
35 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/rest/Server.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.rest
2 |
3 | import java.util.concurrent.Executors
4 |
5 | import io.github.wtog.utils.ConfigUtils
6 | import io.github.wtog.utils.logger.Logging
7 |
8 | import scala.concurrent.{ ExecutionContext, Future }
9 | import scala.util.control.NonFatal
10 |
11 | /**
12 | * @author : tong.wang
13 | * @since : 2019-08-28 10:24
14 | * @version : 1.0.0
15 | */
/**
 * Base server contract: starts the implementation asynchronously on its own
 * single-thread executor and tracks a best-effort `running` flag.
 */
trait Server extends Logging {

  // Whether the server is (believed to be) up; written from the boot thread.
  @volatile var running = false

  /**
   * Starts the server once, asynchronously. Returns the value of `running`
   * at the time of the call — the actual boot happens in the background and
   * may still fail afterwards (which resets the flag).
   */
  def start(routes: Set[Router]): Boolean = {
    if (!running) {
      Future {
        running = true
        try {
          doStart(routes)
        } catch {
          case NonFatal(e) =>
            // Fixed: pass the throwable so the full stack trace is logged,
            // not just the message.
            logger.error(e.getLocalizedMessage, e)
            running = false
        }
      }(ExecutionContext.fromExecutor(Executors.newSingleThreadExecutor()))
    } else {
      running = true
    }
    running
  }

  // Performs the actual (usually blocking) server boot.
  protected def doStart(routes: Set[Router]): Unit

  // Routes every server exposes in addition to user-supplied ones.
  val defaultRoutes: Set[Router] = Set(SpiderStatusRoute)

  // Listen port; configurable via crawler.server.port, defaults to 19000.
  val port: Int = ConfigUtils.getIntOpt("crawler.server.port").getOrElse(19000)

}
45 |
// Facade over the default server implementation (Netty).
object Server {
  val serverInstance = NettyServer

  // Starts the shared server with the given routes (defaults are added by the trait).
  def start(routes: Set[Router] = Set.empty[Router]): Boolean = serverInstance.start(routes)

  // True once the shared server has been started.
  def running: Boolean = serverInstance.running
}
53 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/schedule/ScheduleJobs.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.schedule
2 |
3 | import org.quartz.impl.StdSchedulerFactory
4 | import org.quartz.{ JobBuilder, _ }
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 2018-12-08 23:48
9 | * @version : 1.0.0
10 | */
/**
 * Thin facade over a shared Quartz scheduler for cron-style jobs.
 */
object ScheduleJobs {
  // Single process-wide Quartz scheduler, created on first use.
  private lazy val scheduler = new StdSchedulerFactory().getScheduler()

  /**
   * Registers `scheduleJob` under its job key unless one already exists,
   * then ensures the scheduler is started (with a one-second delay).
   */
  def addJob[C <: Job](scheduleJob: ScheduleJob[C]): Unit =
    if (!scheduler.checkExists(scheduleJob.jobKey)) {
      val cronTrigger =
        TriggerBuilder
          .newTrigger()
          .withSchedule(CronScheduleBuilder.cronSchedule(scheduleJob.cronExpression))
          .build()
      val jobDetail =
        JobBuilder
          .newJob(scheduleJob.task)
          .withIdentity(scheduleJob.jobKey)
          .build

      scheduler.scheduleJob(jobDetail, cronTrigger)
      scheduler.startDelayed(1)
    }

  // Shuts the scheduler down, waiting for running jobs to complete.
  def shutdown(): Unit = scheduler.shutdown(true)
}
25 |
// Describes a Quartz job: name, cron expression, the Job class to run, and an
// optional group name (defaults to the job name).
case class ScheduleJob[C <: Job](jobName: String, cronExpression: String, task: Class[C], groupName: Option[String] = None) {
  val group: String = groupName.getOrElse(jobName)
  val jobKey = new JobKey(jobName, group)
}
30 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/selector/HtmlParser.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.selector
2 |
3 | import java.nio.charset.Charset
4 |
5 | import io.github.wtog.crawler.dto.Page
6 | import io.github.wtog.utils.JsonUtils
7 | import org.jsoup.Jsoup
8 | import org.jsoup.nodes.Element
9 | import org.jsoup.select.Elements
10 |
11 | import scala.collection.JavaConverters._
12 | import org.jsoup.nodes.Document
13 |
14 | /**
15 | * @author : tong.wang
16 | * @since : 5/18/18 12:32 AM
17 | * @version : 1.0.0
18 | */
/**
 * Mixin providing jsoup-backed selection helpers on crawled [[Page]]s,
 * [[Elements]] and [[Element]]s via implicit wrapper classes.
 */
trait HtmlParser {

  implicit class PageWrapper(page: Page) {

    // Parsed DOM of the page source; the page URL (if any) is the base URI for link resolution.
    lazy val document: Document = Jsoup.parse(page.source, page.requestSetting.url.getOrElse(""))

    lazy val title: String = document.title()

    lazy val body: Element = document.body()

    // NOTE(review): identical to `dom` below; kept for source compatibility.
    def div(element: String): Elements = document.select(element)

    // Selects elements matching the CSS query.
    def dom(query: String): Elements = document.select(query)

    // Selects `query` scoped under <table> elements.
    def table(query: String): Seq[Element] = document.select(s"table ${query}").asScala.toSeq

    // All absolute (http/https) link targets on the page.
    def hrefs: Seq[String] = document.select("a").toSeq.collect {
      case e if e.attr("href").startsWith("http") =>
        e.attr("href")
    }

  }

  implicit class ElementsWrapper(elements: Elements) {
    // Combined text of all elements matching `query`.
    def getText(query: String): String = elements.select(query).text()

    def getElements(query: String): Elements = elements.select(query)

    def toSeq: Seq[Element] = elements.asScala.toSeq
  }

  implicit class ElementWrapper(element: Element) {
    // Combined text of descendants matching `query`.
    def getText(query: String): String = element.select(query).text()
  }

}
55 |
object HtmlParser {

  // Extracts the value under `key` from a JSON object string, if present.
  def getValueFromJson[T: Manifest](json: String, key: String): Option[T] = JsonUtils.parseFrom[Map[String, T]](json).get(key)

  // Parses a JSON string into the requested type.
  def parseJson[T: Manifest](json: String): T = JsonUtils.parseFrom[T](json)

  /**
   * Decodes raw page bytes, honouring the charset declared in the page's own
   * <meta> tags (html4 `content="...; charset=..."` or html5 `charset="..."`)
   * when it differs from `defaultCharset`.
   *
   * Fixed: the original called `split("=")(1)`, which threw when "charset"
   * appeared without an '=' value, and passed unvalidated charset names to
   * `new String`, which threw on garbage declarations. Both now fall back to
   * the default-charset decode. Also trims the charset token and drops a
   * redundant `.toString` on `defaultCharset`.
   */
  def getHtmlSourceWithCharset(contentBytes: Array[Byte], defaultCharset: String = Charset.defaultCharset().name()): String = {

    val content = new String(contentBytes, defaultCharset)
    val metas: Elements = Jsoup.parse(content).select("meta")

    val metaContent = metas.attr("content")

    val actualCharset = if (metaContent.contains("charset")) { // html4 style declaration
      metaContent
        .substring(metaContent.indexOf("charset"), metaContent.length)
        .split("=") match {
          case Array(_, declared, _*) => declared.trim
          case _                      => "" // "charset" present but no '=' value
        }
    } else { // html5 style declaration
      metas.attr("charset").trim
    }

    // Re-decode only for a non-empty, different, *valid and supported* charset;
    // Charset.isSupported throws on illegal names, hence the Try guard.
    val reDecode = actualCharset.nonEmpty &&
      !actualCharset.equalsIgnoreCase(defaultCharset) &&
      scala.util.Try(Charset.isSupported(actualCharset)).getOrElse(false)

    if (reDecode) {
      new String(contentBytes, actualCharset)
    } else {
      content
    }
  }
}
86 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/selector/Selector.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.selector
2 |
3 | /**
4 | * @author : tong.wang
5 | * @since : 5/18/18 12:27 AM
6 | * @version : 1.0.0
7 | */
// Generic text-selector contract: extract the string of interest from raw text.
trait Selector {
  def select(text: String): String

}
12 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/spider/Spider.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.spider
2 |
3 | import java.util.concurrent.TimeUnit
4 | import java.util.concurrent.atomic.{ AtomicBoolean, AtomicInteger }
5 |
6 | import akka.actor.{ ActorRef, PoisonPill, Props }
7 | import io.github.wtog.crawler.actor.ActorManager
8 | import io.github.wtog.crawler.downloader.{ ChromeHeadlessConfig, ChromeHeadlessDownloader, DownloaderActorReceiver }
9 | import io.github.wtog.crawler.dto.DownloadEvent
10 | import io.github.wtog.crawler.processor.PageProcessor
11 | import org.slf4j.LoggerFactory
12 |
13 | import scala.concurrent.ExecutionContext.Implicits._
14 | import scala.concurrent.Future
15 |
16 | /**
17 | * @author : tong.wang
18 | * @since : 4/10/18 11:34 AM
19 | * @version : 1.0.0
20 | */
/**
 * A crawl driver binding a [[PageProcessor]] to a downloader actor. Starting
 * a spider feeds the processor's seed requests to a freshly created downloader
 * actor and registers the spider in the [[SpiderPool]]; metrics are tracked in
 * the inner [[CrawlMetric]] object.
 */
case class Spider(pageProcessor: PageProcessor) {

  private lazy val logger = LoggerFactory.getLogger(classOf[Spider])

  // Actor path of the downloader backing the current run; assigned in getDownloadActor.
  private var downloaderActorPath = ""

  // True while the spider is started; guards start/stop idempotence.
  val running: AtomicBoolean = new AtomicBoolean(false)

  val name: String = pageProcessor.name

  /** Starts the crawl once; a no-op when already running. */
  def start(): Unit =
    if (!running.getAndSet(true)) {
      // if (pageProcessor.downloader.isInstanceOf[ChromeHeadlessDownloader.type] && ChromeHeadlessConfig.chromeDriverNotExecutable) {
      //   throw new IllegalStateException("""
      //     |cant find chrome driver to execute.
      //     |choose one chrome driver from https://npm.taobao.org/mirrors/chromedriver/70.0.3538.16/ to download and install into your system
      //   """.stripMargin)
      // }

      val downloaderActor: ActorRef = getDownloadActor
      execute(downloaderActor)
      SpiderPool.addSpider(this)
    }

  // Creates a fresh downloader actor for this run and remembers its path for stop().
  private def getDownloadActor: ActorRef = {
    downloaderActorPath = s"downloader-${name}-${System.currentTimeMillis()}"
    val downloaderActor = ActorManager.getNewSystemActor(
      "downloader-dispatcher",
      downloaderActorPath,
      props = Props[DownloaderActorReceiver]
    )
    downloaderActor
  }

  /** Stops the spider if running, then starts it again. */
  def restart(): Unit = {
    if (running.get()) {
      this.stop()
    }

    start()
  }

  /** Stops the crawl: kills the downloader actor, deregisters, clears metrics. */
  def stop(): Unit =
    if (running.getAndSet(false)) {
      ActorManager.getExistedActor(downloaderActorPath) ! PoisonPill
      SpiderPool.removeSpider(this)
      this.CrawlMetric.clean()
    }

  // Feeds the processor's seed requests to the downloader, pausing `sleepTime` between sends.
  private def execute(downloaderActor: ActorRef): Future[Unit] =
    Future {
      this.pageProcessor.targetRequests.foreach { url =>
        downloaderActor ! DownloadEvent(
          spider = this,
          request = pageProcessor.requestSetting.withRequestUri(url)
        )
        TimeUnit.MILLISECONDS.sleep(this.pageProcessor.requestSetting.sleepTime.toMillis)
      }
    }

  /** Crawl counters for this spider. */
  object CrawlMetric {
    private val downloadPageSuccessNum = new AtomicInteger(0)
    private val downloadPageFailedNum  = new AtomicInteger(0)
    private val processPageSuccessNum  = new AtomicInteger(0)

    def downloadedPageSum: Int = downloadPageSuccessNum.get() + downloadPageFailedNum.get()

    // NOTE: despite their names, the three "counter" methods below INCREMENT
    // the counter and return its previous value; use .get() for read-only access.
    def downloadSuccessCounter: Int = downloadPageSuccessNum.getAndIncrement()

    def downloadFailedCounter: Int = downloadPageFailedNum.getAndIncrement()

    def processedSuccessCounter: Int = processPageSuccessNum.getAndIncrement()

    def clean(): Unit = {
      downloadPageSuccessNum.set(0)
      downloadPageFailedNum.set(0)
      processPageSuccessNum.set(0)
    }

    // Records one download outcome (incrementing the matching counter).
    def record(success: Boolean, url: String): Unit = {
      if (logger.isDebugEnabled())
        logger.debug(s"downloaded: ${success} ${url}")
      if (success) downloadSuccessCounter
      else downloadFailedCounter
    }

    /**
     * Read-only metric snapshot.
     * Fixed: the original reported "downloaded" via downloadSuccessCounter,
     * which increments the success count as a side effect every time the
     * metrics endpoint is rendered.
     */
    def metricInfo(): Map[String, Any] = Map(
      "spider" -> name,
      "total" -> downloadedPageSum,
      "downloaded" -> downloadPageSuccessNum.get(),
      "processed" -> processPageSuccessNum.get()
    )

  }

  override def toString: String = s"spider-${name}: ${CrawlMetric.downloadedPageSum}"
}
118 |
--------------------------------------------------------------------------------
/crawler-core/src/main/scala/io/github/wtog/crawler/spider/SpiderPool.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.spider
2 |
3 | import java.util.concurrent.ConcurrentHashMap
4 |
5 | import io.github.wtog.crawler.downloader.proxy.ProxyProvider
6 | import io.github.wtog.crawler.schedule.{ ScheduleJob, ScheduleJobs }
7 | import org.quartz.{ Job, JobExecutionContext }
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 9/15/18 10:51 AM
12 | * @version : 1.0.0
13 | */
/**
 * Registry of live spiders, plus the hooks that fire on registration:
 * cron-based re-crawl scheduling and proxy-crawler bootstrapping.
 */
object SpiderPool {

  private[this] val spiders = new ConcurrentHashMap[String, Spider]()

  /** Registers the spider (first registration per name wins) and wires cron/proxy support. */
  def addSpider(spider: Spider): Unit = {
    spiders.putIfAbsent(spider.name, spider)

    // Schedule periodic restarts when the processor declares a cron expression.
    spider.pageProcessor.cronExpression.foreach { cron =>
      ScheduleJobs.addJob(ScheduleJob(jobName = spider.name, cronExpression = cron, task = classOf[SpiderScheduleJob]))
    }

    if (spider.pageProcessor.requestSetting.useProxy) {
      ProxyProvider.startProxyCrawl()
    }
  }

  def removeSpider(spider: Spider): Spider = spiders.remove(spider.name)

  def getSpiderByName(name: String): Option[Spider] = Option(spiders.get(name))

  // Snapshot of all registered spiders; the typed toArray avoids the
  // per-element asInstanceOf cast of the untyped Object[] variant.
  def fetchAllSpiders(): Array[Spider] = spiders.values().toArray(new Array[Spider](0))

  def fetchAllUsingProxySpiders(): Array[Spider] = fetchAllSpiders().filter(_.pageProcessor.requestSetting.useProxy)

}
39 |
// Quartz job that restarts the pooled spider whose name matches the job key.
class SpiderScheduleJob() extends Job {
  override def execute(jobExecutionContext: JobExecutionContext): Unit =
    SpiderPool
      .getSpiderByName(jobExecutionContext.getJobDetail.getKey.getName)
      .foreach(_.restart())
}
46 |
--------------------------------------------------------------------------------
/crawler-core/src/test/resources/application-test.conf:
--------------------------------------------------------------------------------
1 | include "reference.conf"
2 |
--------------------------------------------------------------------------------
/crawler-core/src/test/resources/log4j2-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/BaseCoreTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test
2 |
3 | import java.util.concurrent.atomic.AtomicInteger
4 |
5 | import io.github.wtog.crawler.dto.{Page, RequestSetting}
6 | import io.github.wtog.crawler.processor.PageProcessor
7 | import io.github.wtog.crawler.rest.Server
8 | import io.github.wtog.crawler.test.server.TestMockServer
9 | import io.github.wtog.utils.ConfigUtils
10 | import io.github.wtog.utils.test.BaseTest
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 5/16/18 9:19 PM
15 | * @version : 1.0.0
16 | */
/**
 * Base fixture for crawler-core tests: points the config/log loaders at the
 * test resources and boots the local mock HTTP server once per suite.
 */
trait BaseCoreTest extends BaseTest {

  lazy val port = ConfigUtils.getIntOpt("crawler.server.port").getOrElse(19000)

  lazy val localServerHost = s"http://localhost:${port}"

  override def beforeAll() = {
    // Fixed: the original called System.getProperty(key, default), which only
    // READS a property and discards the result — the test config/log resources
    // were never actually registered. setProperty applies them.
    System.setProperty("config.resource", "application-test.conf")
    System.setProperty("log4j.resource", "log4j2-test.xml")
    if (!Server.running)
      TestMockServer.start
  }

  lazy val requestSettingTest = RequestSetting(
    domain = "www.baidu.com",
    url = Some("https://www.baidu.com/s?wd=wtog%20web-crawler")
  )

  /**
   * Page processor pointed at the local mock server; each processed page adds
   * one follow-up request with an incrementing ?id= counter.
   */
  case class LocalProcessor(requestSettingTest: Option[RequestSetting] = None) extends PageProcessor {

    // Counter feeding the ?id= parameter of generated follow-up requests.
    val link = new AtomicInteger(0)

    override def targetUrls: List[String] = List(localServerHost)

    override protected def doProcess(page: Page): Unit = {
      assert(page.isDownloadSuccess)
      page.addTargetRequest(s"${page.url}?id=${link.incrementAndGet()}")
    }

    override def requestSetting: RequestSetting = {
      requestSettingTest.getOrElse(
        RequestSetting(
          domain = "localhost",
          url = Some(s"http://localhost:${port}/mock/get")
        ))
    }

    // Builds an absolute URL for a mock-server route.
    def getUrl(route: String): String = s"http://localhost:${port}${route}"
  }

  object LocalProcessor {
    def apply(): LocalProcessor = new LocalProcessor()

    def apply(requestSetting: Option[RequestSetting] = None): LocalProcessor = new LocalProcessor(requestSetting)

    def apply(requestSetting: RequestSetting): LocalProcessor = new LocalProcessor(Some(requestSetting))

  }

}
67 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/actor/ActorTestBase.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.actor
2 |
3 | import akka.actor.ActorSystem
4 | import akka.testkit.TestKit
5 | import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike}
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 2019-04-22 08:06
10 | * @version : 1.0.0
11 | */
// Base for akka TestKit specs: provides a shared "testsystem" ActorSystem
// plus the usual scalatest WordSpec/Matchers/lifecycle mixins.
abstract class ActorTestBase extends TestKit(ActorSystem("testsystem")) with WordSpecLike
  with Matchers
  with BeforeAndAfterAll
15 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/actor/PageProcessorActorTestkit.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.actor
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import akka.actor.Props
6 | import akka.testkit.TestProbe
7 | import io.github.wtog.crawler.dto
8 | import io.github.wtog.crawler.dto.{DownloadEvent, Page, ProcessorEvent, RequestSetting}
9 | import io.github.wtog.crawler.processor._
10 | import io.github.wtog.crawler.spider.Spider
11 |
12 | import scala.concurrent.duration._
13 |
14 | /**
15 | * @author : tong.wang
16 | * @since : 2019-04-19 22:52
17 | * @version : 1.0.0
18 | */
/**
 * Verifies that PageProcessorActorReceiver turns every target request added
 * while processing a page into a DownloadEvent sent back to the sender.
 */
class PageProcessorActorTestkit extends ActorTestBase {

  // Processor that fans each processed page out into ten follow-up requests.
  private class TestProcessor extends PageProcessor {

    override def targetUrls: List[String] = List("http://test")

    override protected def doProcess(page: Page): Unit = {
      (1 to 10).foreach(i => page.addTargetRequest(s"${page.url}/$i"))
    }

    override def requestSetting: RequestSetting = RequestSetting()
  }

  "pageProcessor" must {
    "send" in {
      val pageProcessorActorRevicer = system.actorOf(props = Props[PageProcessorActorReceiver])
      val testProb = new TestProbe(system)

      // One synthetic page per seed URL, with a dummy body.
      val testProcessor = new TestProcessor()
      val pages = testProcessor.targetUrls.map { url =>
        dto.Page(bytes = Some("hh".getBytes()), requestSetting = testProcessor.requestSetting.withUrl(url))
      }

      // The probe is the sender, so follow-up DownloadEvents come back to it.
      val spider = Spider(pageProcessor = testProcessor)
      pages.foreach { p =>
        testProb.send(pageProcessorActorRevicer, ProcessorEvent(spider, p))
      }

      // Expect one DownloadEvent per target request added by doProcess, in order.
      (1 to 10).foreach{i =>
        TimeUnit.SECONDS.sleep(1)
        testProb.expectMsg(2 seconds, DownloadEvent(spider, RequestSetting(url = Some(s"http://test/$i"))))}
    }

  }
}
54 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/download/AsyncHttpClientTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.download
2 |
3 | import com.google.common.net.{HttpHeaders, MediaType}
4 | import io.github.wtog.crawler.downloader.AsyncHttpClientDownloader
5 | import io.github.wtog.crawler.dto.RequestUri
6 | import io.github.wtog.crawler.spider.Spider
7 | import io.github.wtog.crawler.test.BaseCoreTest
8 | import io.github.wtog.crawler.test.server.TestPostMockRoute
9 | import io.github.wtog.utils.JsonUtils
10 | import org.scalatest.BeforeAndAfter
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 5/20/18 11:22 AM
15 | * @version : 1.0.0
16 | */
/**
 * Exercises AsyncHttpClientDownloader against the local mock server:
 * GET download + resource cleanup, and a form-encoded POST download.
 */
class AsyncHttpClientTest extends BaseCoreTest with BeforeAndAfter {

  // Ensure the shared client is released after every test.
  after {
    AsyncHttpClientDownloader.closeClient()
  }

  lazy val localProcessor = LocalProcessor()
  lazy val url = localProcessor.requestSetting.url.get

  test("asynchttpclient driver close resource safely") {
    val page = await(AsyncHttpClientDownloader.download(Spider(pageProcessor = localProcessor), request = localProcessor.requestSetting.withUrl(s"${url}?id=0")))

    assert(page.isDownloadSuccess)
    assert(page.source.nonEmpty)
    AsyncHttpClientDownloader.closeClient()

    // After closeClient() no cached client should remain for the domain.
    assert(AsyncHttpClientDownloader.getClient(localProcessor.requestSetting.domain).isEmpty)
  }

  test("async http client post download") {
    // POST with a JSON-encoded body and a form-data content type header.
    val request = RequestUri(
      url = localProcessor.getUrl(TestPostMockRoute.route),
      method = TestPostMockRoute.method,
      requestBody = Some(JsonUtils.toJson(Map("a" -> "b"))),
      headers = Some(Map(HttpHeaders.CONTENT_TYPE -> MediaType.FORM_DATA.toString))
    )
    val page = await(AsyncHttpClientDownloader.download(Spider(pageProcessor = localProcessor), request = localProcessor.requestSetting.withRequestUri(request)))

    assert(page.isDownloadSuccess)
    assert(page.source.nonEmpty)

    AsyncHttpClientDownloader.closeClient()
  }
}
51 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/download/ChromeHeadlessDownloaderTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.download
2 |
3 | import io.github.wtog.crawler.downloader.ChromeHeadlessDownloader
4 | import io.github.wtog.crawler.spider.Spider
5 | import io.github.wtog.crawler.test.BaseCoreTest
6 | import org.scalatest.BeforeAndAfter
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 1/12/20 10:48 PM
11 | * @version : 1.0.0
12 | */
/**
 * Exercises ChromeHeadlessDownloader. Both cases are `ignore`d because they
 * require a locally installed chromedriver and reach external sites.
 */
class ChromeHeadlessDownloaderTest extends BaseCoreTest with BeforeAndAfter {

  // Release the browser/driver after each test.
  after {
    ChromeHeadlessDownloader.closeClient()
  }

  lazy val localProcessor = LocalProcessor()

  ignore("chrome driver get xhr response") {
    val url = "https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E6%88%90%E9%83%BD&searchDepartureTime=2020-03-11&searchArrivalTime=2020-03-12&nextNDays=0&startSearch=true&fromCode=BJS&toCode=CTU&from=qunarindex&lowestPrice=nul"

    // Capture a specific XHR endpoint's response alongside the page source.
    val page = await(ChromeHeadlessDownloader.download(
      spider = Spider(pageProcessor = localProcessor),
      request = localProcessor.requestSetting.withUrl(url).withXhrRequests("https://flight.qunar.com/touch/api/domestic/wbdflightlist")
    ))

    assert(page.isDownloadSuccess)

    println(page.xhrResponses)
  }

  ignore("chrome driver on linux") {
    val page = await(ChromeHeadlessDownloader.download(
      spider = Spider(pageProcessor = localProcessor),
      request = localProcessor.requestSetting
    ))

    assert(page.isDownloadSuccess)
    println(page.source)
  }

}
45 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/processor/HtmlParserSpec.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.processor
2 |
3 | import io.github.wtog.crawler.dto
4 | import io.github.wtog.crawler.dto.{Page, RequestSetting}
5 | import io.github.wtog.crawler.selector.HtmlParser
6 | import io.github.wtog.crawler.test.BaseCoreTest
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 2019-05-02 21:49
11 | * @version : 1.0.0
12 | */
/**
 * Checks that Page.json deserializes both a JSON object (to a Map) and a
 * JSON array (to a List of Maps).
 */
class HtmlParserSpec extends BaseCoreTest with HtmlParser {

  test("page json") {
    // A JSON object body parses into a Map.
    val objectJson =
      """
        |{
        |  "id": 1,
        |  "name": "test"
        |}
      """.stripMargin

    val parsedObject = Page(bytes = Some(objectJson.getBytes()), requestSetting = RequestSetting()).json[Map[String, Any]]()

    assert(parsedObject("id") == 1)
    assert(parsedObject("name") == "test")

    // A JSON array body parses into a List of Maps.
    val arrayJson =
      """
        |[
        |  {
        |    "id": 1,
        |    "name": "test"
        |  }
        |]
      """.stripMargin

    val parsedArray = dto.Page(bytes = Some(arrayJson.getBytes()), requestSetting = RequestSetting()).json[List[Map[String, Any]]]()

    assert(parsedArray.head("id") == 1)
    assert(parsedArray.head("name") == "test")

  }
}
46 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/proxy/ProxyProviderTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.proxy
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import io.github.wtog.crawler.downloader.proxy.ProxyProvider
6 | import io.github.wtog.crawler.downloader.proxy.crawler.{A2UPageProcessor, Data5UPageProcessor, IP89Processor}
7 | import io.github.wtog.crawler.dto.RequestSetting
8 | import io.github.wtog.crawler.spider.Spider
9 | import io.github.wtog.crawler.test.BaseCoreTest
10 |
11 | /**
12 | * @author : tong.wang
13 | * @since : 2019-05-14 22:37
14 | * @version : 1.0.0
15 | */
class ProxyProviderTest extends BaseCoreTest {

  /** Starts the given spider and lets it crawl for a few seconds. */
  private def crawlBriefly(spider: Spider): Unit = {
    spider.start()
    TimeUnit.SECONDS.sleep(10)
  }

  test("spider use proxy with request setting useProxy=true") {
    val setting = RequestSetting(
      url = Some(localServerHost),
      useProxy = true
    )

    Spider(pageProcessor = LocalProcessor(setting)).start()
    // Requesting with useProxy=true must kick off the proxy-source crawler.
    assert(ProxyProvider.proxySpiderCrawling.get())
  }

  ignore("a2u proxy") {
    crawlBriefly(Spider(pageProcessor = A2UPageProcessor()))
  }

  ignore("data5u proxy") {
    crawlBriefly(Spider(pageProcessor = Data5UPageProcessor()))
  }

  ignore("ip89 proxy") {
    crawlBriefly(Spider(pageProcessor = IP89Processor()))
  }
}
43 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/queue/DuplicateStrategyTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.queue
2 |
3 | import io.github.wtog.crawler.queue.duplicate.{BitSetStrategy, HashMapStrategy}
4 | import org.scalatest.FunSuite
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 12/8/19 12:27 AM
9 | * @version : 1.0.0
10 | */
class DuplicateStrategyTest extends FunSuite {
  // "url1" appears twice so each strategy must drop exactly one occurrence.
  val urls = Seq("url1", "url1", "url2")

  /** Keeps only the urls the strategy has not seen before, in order. */
  private def removeDuplicated(isDuplicate: String => Boolean): Seq[String] =
    urls.filterNot(isDuplicate)

  test("hashMap remove duplicate") {
    assert(urls.distinct == removeDuplicated(HashMapStrategy.isDuplicate))
  }

  test("bitset remove duplicate") {
    assert(urls.distinct == removeDuplicated(BitSetStrategy.isDuplicate))
  }
}
27 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/schedule/ScheduleTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.schedule
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import io.github.wtog.crawler.schedule.{ScheduleJob, ScheduleJobs}
6 | import io.github.wtog.crawler.test.BaseCoreTest
7 | import org.quartz.{Job, JobExecutionContext}
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 2019-05-12 23:11
12 | * @version : 1.0.0
13 | */
class ScheduleTest extends BaseCoreTest with Job {

  // Fires every second; Quartz instantiates ScheduleTest itself as the Job implementation.
  val intervalPrintJob = ScheduleJob(jobName = "intervalPrintJob", cronExpression = "*/1 * * ? * *", classOf[ScheduleTest], groupName = Some("test"))

  // Quartz callback: verify the triggered job detail matches the job registered above.
  override def execute(context: JobExecutionContext): Unit = {
    assert(context.getJobDetail.getKey == intervalPrintJob.jobKey)
  }

  test("addJob") {
    ScheduleJobs.addJob(intervalPrintJob)
    // Sleep past one full cron interval so execute() gets at least one chance to fire.
    TimeUnit.SECONDS.sleep(2)
  }

}
28 |
--------------------------------------------------------------------------------
/crawler-core/src/test/scala/io/github/wtog/crawler/test/server/TestMockServer.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.test.server
2 |
3 | import java.nio.charset.Charset
4 |
5 | import io.github.wtog.crawler.rest.{Router, Server}
6 | import io.github.wtog.utils.JsonUtils
7 | import io.github.wtog.utils.logger.Logging
8 | import io.netty.handler.codec.http.{FullHttpRequest, HttpMethod}
9 |
10 | import scala.collection.JavaConverters._
11 | import scala.collection.mutable
12 |
13 | /**
14 | * @author : tong.wang
15 | * @since : 2019-08-01 09:10
16 | * @version : 1.0.0
17 | */
object TestMockServer {
  // Boots the embedded test HTTP server with the GET and POST mock routes.
  def start = Server.start(Set(TestGetMockRoute, TestPostMockRoute))
}
21 |
object TestGetMockRoute extends Router with Logging {
  override def method: String = HttpMethod.GET.toString

  override def route: String = s"/mock/get"

  /** Echoes the incoming request headers back to the caller as a JSON object. */
  override def handleRequest(request: FullHttpRequest): Array[Byte] = {
    val headerMap = request
      .headers()
      .entries()
      .asScala
      .map(entry => entry.getKey -> entry.getValue)
      .toMap

    val json = JsonUtils.toJson(headerMap)
    logger.info(s"TestGetMock ${json}")
    json.getBytes()
  }
}
37 |
object TestPostMockRoute extends Router {
  override def method: String = HttpMethod.POST.toString

  override def route: String = s"/mock/post"

  /** Replies with a JSON document holding the request headers and the parsed JSON body. */
  override def handleRequest(request: FullHttpRequest): Array[Byte] = {
    val headerMap = request
      .headers()
      .entries()
      .asScala
      .map(entry => entry.getKey -> entry.getValue)
      .toMap

    val rawBody = request.content().toString(Charset.defaultCharset())
    val parsedBody = JsonUtils.parseFrom[Map[String, Any]](rawBody)

    val resp = mutable.Map.empty[String, Any]
    resp += ("requestHeaders" -> headerMap)
    resp += ("body" -> parsedBody)

    JsonUtils.toJson(resp).getBytes()
  }
}
58 |
--------------------------------------------------------------------------------
/crawler-example/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/crawler-example/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | crawler {
2 |
3 | downloader-dispatcher {
4 | type = Dispatcher
5 | executor = "thread-pool-executor"
6 |
7 | thread-pool-executor {
8 | core-pool-size-min = 5
9 | core-pool-size-factor = 2
10 | core-pool-size-max = 10
11 | }
12 | }
13 |
14 | processor-dispatcher {
15 | type = Dispatcher
16 | executor = "fork-join-executor"
17 | fork-join-executor {
18 | parallelism-min = 2
19 | parallelism-factor = 4.0
20 | parallelism-max = 10
21 | }
22 | throughput = 50
23 | }
24 |
25 | pipeline-dispatcher {
26 | type = Dispatcher
27 | executor = "thread-pool-executor"
28 |
29 | thread-pool-executor {
30 | core-pool-size-min = 4
31 | core-pool-size-factor = 2
32 | core-pool-size-max = 8
33 | }
34 | }
35 |
36 | download.retry.exception = ["java.util.concurrent.TimeoutException"]
37 |
38 | server.port = 19000
39 |
40 | chrome {
41 | driver: "/usr/bin/chromedriver"
42 | }
43 | }
44 |
45 | crawler-examples {
46 | BaiduPageProcessor.enable: false
47 |
48 | LianjiaErshouFangProcessor.enable: false
49 |
50 | LianjiaRentingProcessor.enable: false
51 |
52 | QunarPageProcessor.enable: true
53 |
54 | ZhihuAnswerPageProcessor.enable: false
55 | }
56 |
57 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/ExampleTrait.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example
2 |
3 | import io.github.wtog.crawler.processor.PageProcessor
4 | import io.github.wtog.utils.ConfigUtils
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 1/14/20 8:31 PM
9 | * @version : 1.0.0
10 | */
trait ExampleTrait extends PageProcessor {

  /** Whether this example runs, read from `crawler-examples.<ClassName>.enable`; defaults to false. */
  val enable: Boolean = {
    val configKey = s"crawler-examples.${this.getClass.getSimpleName}.enable"
    ConfigUtils.getBooleanOpt(configKey) match {
      case Some(flag) => flag
      case None       => false
    }
  }

}
16 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/Main.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example
2 |
3 | import io.github.wtog.crawler.processor.PageProcessor
4 | import io.github.wtog.crawler.spider.Spider
5 | import io.github.wtog.utils.ReflectionUtils
6 |
7 | import scala.io.StdIn
8 | import scala.util.Try
9 |
10 | /**
11 | * @author : tong.wang
12 | * @since : 6/14/18 11:40 PM
13 | * @version : 1.0.0
14 | */
object Main {

  /** Prints the numbered list of available example processors to stdout. */
  def printProcessors(processorList: Seq[(ExampleTrait, Int)]): Unit = {
    for ((service, order) <- processorList) {
      println(s"\t${order}. ${service.getClass.getSimpleName}")
    }
    println("")
  }

  /** Keeps only the processors whose 1-based order number appears in `orders`.
    * Extracted because the same selection logic was duplicated in two match arms. */
  private def selectByOrders(processorList: Seq[(ExampleTrait, Int)], orders: Seq[String]): Seq[(ExampleTrait, Int)] =
    processorList.filter {
      case (_, order) =>
        orders.contains(order.toString)
    }

  def main(args: Array[String]): Unit = {
    // Discover all enabled ExampleTrait implementations, ordered by class name,
    // and pair each with a 1-based selection number.
    val processorList = ReflectionUtils
      .implementationClasses(classOf[ExampleTrait], "io.github.wtog.example")
      .map(_.newInstance())
      .filter(_.enable)
      .sortWith(_.getClass.getSimpleName < _.getClass.getSimpleName)
      .zip(Stream.from(1))

    val execProcessors = args match {
      // No args or explicit "0": run everything.
      case args: Array[String] if args.isEmpty || args.contains("0") =>
        println("executing all enabled processors")
        printProcessors(processorList)
        processorList
      // All args numeric: run only the selected order numbers.
      case args: Array[String] if (args.nonEmpty && args.toSeq.forall(arg => Try(arg.toInt).isSuccess)) =>
        val processors = selectByOrders(processorList, args.toSeq)
        println(s"executing ${processors.map(_._1.getClass.getSimpleName).mkString(",")}")
        processors
      // Otherwise fall back to an interactive menu on stdin.
      case _ =>
        println("\nshow page processor list: ")
        println("\t0. all")

        printProcessors(processorList)

        println("\nchoose number to execute.")
        println("input like 1,2,3 means to execute 1 and 2 and 3 processor")

        val chooseNumber = StdIn.readLine()
        val chosen = chooseNumber.split(",").distinct

        if (chosen.isEmpty || chooseNumber.contains("0")) {
          println("execute all processor")
          processorList
        } else {
          selectByOrders(processorList, chosen)
        }
    }

    startSpiders(execProcessors)
  }

  /** Starts one Spider per selected processor. */
  def startSpiders(processorList: Seq[(PageProcessor, Int)]): Unit =
    processorList.foreach {
      case (processor, _) =>
        Spider(pageProcessor = processor).start()
    }
}
78 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/impl/BaiduPageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example.impl
2 |
3 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
4 | import io.github.wtog.example.ExampleTrait
5 |
6 | import scala.collection.mutable
7 | import scala.concurrent.duration._
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 5/16/18 11:42 PM
12 | * @version : 1.0.0
13 | */
class BaiduPageProcessor() extends ExampleTrait {

  /** Scrapes Baidu's hot-search side table and records href/content/hot rank per row. */
  override def doProcess(page: Page): Unit = {
    val topListRows = page.div("#content_right .opr-toplist1-table tr")
    val anchor = topListRows.select("td a")

    page.addPageResultItem(
      Map(
        "href"    -> anchor.attr("href"),
        "content" -> anchor.text(),
        "hot"     -> topListRows.select("td").last().text()
      )
    )
  }

  override def requestSetting: RequestSetting =
    RequestSetting(
      domain = "www.baidu.com",
      headers = mutable.Map("Content-Type" -> "text/html; charset=GB2312"),
      sleepTime = 1 seconds
    )

  override def targetUrls: List[String] = List("https://www.baidu.com/s?wd=wtog%20web-crawler")

  override def cronExpression: Option[String] = Some("*/30 * * ? * *")
}
40 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/impl/LianjiaErshouFangProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example.impl
2 |
3 | import java.time.ZonedDateTime
4 | import java.time.format.DateTimeFormatter
5 | import java.util.concurrent.atomic.AtomicInteger
6 |
7 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
8 | import io.github.wtog.crawler.pipeline.Pipeline
9 | import io.github.wtog.crawler.pipeline.db.{ DataSource, DataSourceInfo, PostgreSQLPipeline }
10 | import io.github.wtog.example.ExampleTrait
11 | import io.github.wtog.utils.JsonUtils
12 | import io.github.wtog.utils.StringUtils._
13 | import org.jsoup.nodes.Element
14 |
15 | import scala.concurrent.duration._
16 | import scala.util.Random
17 | import scala.util.matching.Regex
18 |
19 | /**
20 | * @author : tong.wang
21 | * @since : 10/11/19 10:01 APM
22 | * @version : 1.0.0
23 | */
/**
  * Crawls Lianjia second-hand ("ershoufang") house listings for Beijing and
  * upserts each scraped house detail into the `house` PostgreSQL table.
  *
  * Two page shapes are handled, distinguished by URL regex:
  *  - list pages (`.../ershoufang/pgN/`): enqueue every detail link plus the next list page
  *  - detail pages (`.../ershoufang/<code>.html`): scrape the overview/basic/transaction blocks
  */
class LianjiaErshouFangProcessor extends ExampleTrait {

  // Rolling list-page counter; wraps back to 1 after page 100 (see getPage).
  val pageNo = new AtomicInteger(1)
  val houseDetailRegex: Regex = """(.*ershoufang)/([\d]+).(html$)""".r
  val houseListRegex: Regex = """(.*ershoufang/pg[\d]+/$)""".r

  // Picks the DOM elements grouped under the given label (e.g. "建筑类型") and extracts
  // a text value from the first one; returns "" when the label is absent.
  val queryDomValue: (String, Map[String, Seq[Element]], Element => String) => String = (typ: String, elements: Map[String, Seq[Element]], getDomValue: Element => String) => elements.get(typ).fold("")(e => getDomValue(e.head))
  // Text node that follows the <span> label inside an <li>.
  val getLiText: Element => String = (e: Element) => e.childNodes.get(1).toString
  // Text of the last <span> inside an element.
  val getLastSpanText: Element => String = (e: Element) => e.select("span").last().text()

  // Next list-page number; resets to 1 once 100 is reached.
  def getPage: Int = if (pageNo.get() >= 100) pageNo.getAndSet(1) else pageNo.incrementAndGet()

  override def doProcess(page: Page): Unit =
    page.requestSetting.url.get match {
      case houseListRegex(_) =>
        addHouseDetail(page)
        page.addTargetRequest(s"https://bj.lianjia.com/ershoufang/pg${getPage}/")
      case houseDetailRegex(_, houseCode, _) =>
        val overviewContent = page.dom(".overview .content")
        // Embedded mortgage-calculator JSON carrying evaluation and total price.
        val pageShoufu = page.dom(".new-calculator").attr("data-shoufu")

        val (evaluationPrice, priceTotal) = pageShoufu match {
          case shoufu if shoufu.nonEmpty =>
            val newCalculator = JsonUtils.parseFrom[Map[String, Any]](shoufu)
            // NOTE(review): unguarded `.get` throws when "evaluation" is missing, unlike the
            // fold-with-default used for "price" on the next line — confirm this is intended.
            val evaluation = newCalculator.get("evaluation").get.asInstanceOf[Int]
            val total = newCalculator.get("price").fold(0)(_.asInstanceOf[String].toInt)
            (evaluation, total)
          case "" =>
            (0, 0)
        }

        val price = overviewContent.select(".price")
        // "isRemove" class on the price block marks a delisted house.
        val removed = price.hasClass("isRemove")
        val pricePerMeter = price.getText(".unitPriceValue").replace("元/平米", "")
        val room = overviewContent.getElements(".room")
        val roomMainInfoText = room.getText(".mainInfo")
        val roomSubInfoText = room.getText(".subInfo")
        val roomType = overviewContent.getElements(".type")
        val roomTypeMainInfoText = roomType.getText(".mainInfo")
        val roomTypeSubInfoText = roomType.getText(".subInfo")
        val roomArea = overviewContent.getElements(".area")
        val roomAreaMainInfo = roomArea.getText(".mainInfo").replace("平米", "")
        val roomAreaSubInfo = roomArea.getText(".subInfo")
        val aroundInfo = overviewContent.getElements(".aroundInfo")
        val subdistrict = aroundInfo.getElements("a").first().text()
        // ".areaName" text is "所在区域 <area> <community> ..."; strip the label and split on spaces.
        val communityAreaName = aroundInfo.getText(".areaName").replace("所在区域", "").split(" ")

        val (areaName, community, communityDetail) = (communityAreaName.headOption, communityAreaName.tail.headOption, communityAreaName.lastOption)

        // Basic-attributes list: each <li> is "<span>label</span>value".
        val infoContent = page.dom(".m-content .base .content li")
        val basic = infoContent.toSeq.groupBy(e => e.select("span").text)

        val buildType = queryDomValue("建筑类型", basic, getLiText)
        val buildStruct = queryDomValue("建筑结构", basic, getLiText)
        val decoration: String = queryDomValue("装修情况", basic, getLiText)
        val householdLadder: String = queryDomValue("梯户比例", basic, getLiText)
        val heating: String = queryDomValue("供暖方式", basic, getLiText)
        val elevator: String = queryDomValue("配备电梯", basic, getLiText)
        val houseRight: String = queryDomValue("产权年限", basic, getLiText)

        // Transaction-attributes list: label in the first <span>, value in the last one.
        val transactionContent = page.dom(".m-content .transaction .content li")

        val info = transactionContent.toSeq.groupBy(e => e.select("span").first().text)
        val saleTime: String = queryDomValue("挂牌时间", info, getLastSpanText)
        val tradingRight: String = queryDomValue("交易权属", info, getLastSpanText)
        val lastSale: String = queryDomValue("上次交易", info, getLastSpanText)
        val housingUse: String = queryDomValue("房屋用途", info, getLastSpanText)
        val houseYears: String = queryDomValue("房屋年限", info, getLastSpanText)
        val houseRightOwner: String = queryDomValue("产权所属", info, getLastSpanText)
        val mortgageInfo: String = queryDomValue("抵押信息", info, getLastSpanText)

        val house = House(
          houseCode = houseCode,
          totalPrice = priceTotal.toInt,
          evaluationPrice = evaluationPrice,
          meterPrice = pricePerMeter.toInt,
          roomMainInfo = roomMainInfoText,
          roomSubInfo = roomSubInfoText,
          roomTypeMainInfo = roomTypeMainInfoText,
          roomTypeSubInfo = roomTypeSubInfoText,
          roomAreaMainInfo = roomAreaMainInfo,
          roomAreaSubInfo = roomAreaSubInfo,
          subdistrict = subdistrict,
          areaName = areaName,
          community = community,
          communityDetail = communityDetail,
          buildType = buildType,
          buildStruct = buildStruct,
          decoration = decoration,
          householdladder = householdLadder,
          heating = heating,
          elevator = elevator,
          houseRight = houseRight,
          saleTime = saleTime,
          tradingRight = tradingRight,
          lastSale = lastSale,
          housingUse = housingUse,
          houseYears = houseYears,
          houseRightOwner = houseRightOwner,
          mortgageInfo = mortgageInfo,
          removed = removed
        )

        page.addPageResultItem[Map[String, Any]](house.toMap)
      case other ⇒
        println(other)
    }

  // Queues every detail-page link found on a list page.
  def addHouseDetail(page: Page): Unit = {
    val detailHrefs = page.dom(".sellListContent li div.title a")
    detailHrefs.toSeq.foreach(d => page.addTargetRequest(d.attr("href")))
  }

  // Upserts the scraped house into PostgreSQL, keyed by house_code.
  override val pipelines: Set[Pipeline] = Set(
    // CsvFilePipeline(Some("ershoufang.csv")),
    PostgreSQLPipeline(DataSourceInfo(database = this.getClass.getSimpleName, jdbcUrl = "jdbc:postgresql://127.0.0.1:5432/magicbox", username = "wtog", password = "")) { (db: String, result: Map[String, Any]) =>
      val (keys, values) = result.unzip
      DataSource.rows[Int]("select count(1) from house where house_code = ?", Seq(result("houseCode").asInstanceOf[String]))(r => r.getInt(1))(db).headOption.getOrElse(0) match {
        case 0 =>
          DataSource.executeUpdate(s"insert into house (${keys.map(_.toUnderscore).mkString(",")}) values (${Seq.fill[String](keys.size)("?").mkString(",")})", values.toSeq)(db)
        case _ =>
          DataSource.executeUpdate(
            s"update house set ${keys.map(c => s"${c.toUnderscore} = ?").mkString(",")}, updated_at = '${ZonedDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}' where house_code = ? ",
            (values.toSeq ++ Seq(result("houseCode")))
          )(db)
      }
    }
  )

  override def requestSetting: RequestSetting = RequestSetting(
    domain = "www.lianjia.com",
    // Randomized 5-7s delay plus a proxy to reduce the chance of being blocked.
    sleepTime = (Random.nextInt(3) + 5) seconds,
    useProxy = true
  )

  override def targetUrls: List[String] = List("https://bj.lianjia.com/ershoufang/pg1/")

}
162 |
/**
  * Flattened snapshot of one second-hand ("ershoufang") house listing.
  *
  * Field names mirror the scraped page sections; values are stored as the raw
  * page text (units presumably as shown on the site — TODO confirm: totalPrice
  * looks like 万元, meterPrice like 元/平米). `removed` marks listings whose
  * price block carries the `isRemove` CSS class.
  */
case class House(
    id: Option[Int] = None,
    houseCode: String,
    totalPrice: Int,
    evaluationPrice: Int,
    meterPrice: Int,
    roomMainInfo: String,
    roomSubInfo: String,
    roomTypeMainInfo: String,
    roomTypeSubInfo: String,
    roomAreaMainInfo: String,
    roomAreaSubInfo: String,
    subdistrict: String,
    areaName: Option[String],
    community: Option[String],
    communityDetail: Option[String],
    buildType: String,
    buildStruct: String,
    decoration: String,
    householdladder: String,
    heating: String,
    elevator: String,
    houseRight: String,
    saleTime: String,
    tradingRight: String,
    lastSale: String,
    housingUse: String,
    houseYears: String,
    houseRightOwner: String,
    mortgageInfo: String,
    removed: Boolean = false)
194 |
object House {

  /** Adds a `toMap` view (camelCase field names to values) used by the SQL pipeline. */
  implicit class HouseWrapper(house: House) {
    def toMap: Map[String, Any] = JsonUtils.toMap(house)
  }

}
202 |
// Listing category: second-hand ("ershou") vs new-build ("new").
object HouseType extends Enumeration {
  type HouseType = Value

  val ERSHOU: Value = Value("ershou")
  val NEW: Value = Value("new")
}
209 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/impl/LianjiaRentingProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example.impl
2 |
3 | import java.time.ZonedDateTime
4 | import java.time.format.DateTimeFormatter
5 | import java.util.concurrent.atomic.AtomicInteger
6 |
7 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
8 | import io.github.wtog.crawler.pipeline.Pipeline
9 | import io.github.wtog.crawler.pipeline.db.{ DataSource, DataSourceInfo, PostgreSQLPipeline }
10 | import io.github.wtog.example.ExampleTrait
11 | import io.github.wtog.utils.JsonUtils
12 | import io.github.wtog.utils.StringUtils._
13 | import org.jsoup.nodes.Element
14 |
15 | import scala.concurrent.duration._
16 | import scala.util.Random
17 | import scala.util.matching.Regex
18 |
19 | /**
20 | * @author : tong.wang
21 | * @since : 10/11/19 10:01 APM
22 | * @version : 1.0.0
23 | */
/**
  * Crawls Lianjia Beijing rental ("zufang") list pages and upserts each listing
  * into the `renting` PostgreSQL table, keyed by house_code.
  */
class LianjiaRentingProcessor extends ExampleTrait {
  // Rolling list-page counter; wraps back to 1 after page 100 (see getPage).
  val pageNo = new AtomicInteger(1)
  val houseDetailRegex: Regex = """(.*zufang)/([BJ\d]+).(html$)""".r
  val houseListRegex: Regex = """(.*)/(zufang)/(pg[\d]+/$)""".r
  // Listing code embedded in the detail href (letters B/J plus digits).
  val houseCodeRegex: Regex = """[BJ\d]+""".r
  val houseFloor: Regex = """[\d]+""".r

  // Same label-lookup helpers as the ershoufang processor; currently unused here
  // (the detail-page branch below is still a TODO).
  val queryDomValue: (String, Map[String, Seq[Element]], Element => String) => String = (typ: String, elements: Map[String, Seq[Element]], getDomValue: Element => String) => elements.get(typ).fold("")(e => getDomValue(e.head))
  val getLiText: Element => String = (e: Element) => e.childNodes.get(1).toString
  val getLastSpanText: Element => String = (e: Element) => e.select("span").last().text()

  // Next list-page number; resets to 1 once 100 is reached.
  def getPage: Int = if (pageNo.get() >= 100) pageNo.getAndSet(1) else pageNo.incrementAndGet()

  override def doProcess(page: Page): Unit =
    page.requestSetting.url.get match {
      case houseListRegex(domain, _, _) =>
        val details = page.dom(".content__list--item").toSeq

        details.foreach { detail =>
          // First two characters of the title are the renting way (e.g. 整租/合租) — presumably;
          // verify against the live page markup.
          val rentingWay = detail.select(".content__list--item--title").text().substring(0, 2)
          val href = s"${domain}${detail.select("a").attr("href")}"

          // Description line looks like "area-community-detail / <m²>㎡ / direction / type / floor".
          val des = detail.select(".content__list--item--des").text().split("/")

          val area = des.head.split("-")
          val areaName = area.head.trim
          // NOTE(review): `.lift(1).get` still throws when the segment has fewer than two parts,
          // defeating the purpose of lift — confirm listings always include a community part.
          val community = area.lift(1).get.trim
          val communityDetail = area.last.trim
          val meter = des(1).replace("㎡", "").trim.toInt
          val direction = des(2).trim
          val typ = des(3).trim
          val floor = des(4).trim

          val bottom = detail.select(".content__list--item--bottom i")

          // Optional tags; empty text means the tag is absent and maps to None below.
          val subway = bottom.select(".content__item__tag--is_subway_house").text().trim
          val decoration = bottom.select(".content__item__tag--decoration").text().trim
          val heating = bottom.select(".content__item__tag--central_heating").text().trim

          val rentingHouse = RentingHouse(
            houseCode = houseCodeRegex.findFirstIn(href).get,
            price = Some(detail.select(".content__list--item-price em").text().toInt),
            rentingWay = Some(rentingWay),
            area = Some(areaName),
            community = Some(community),
            communityDetail = Some(communityDetail),
            meter = Some(meter),
            direction = Some(direction),
            typ = Some(typ),
            floor = houseFloor.findFirstIn(floor).map(_.toInt),
            subway = if (subway.nonEmpty) Some(subway) else None,
            decoration = if (decoration.nonEmpty) Some(decoration) else None,
            heating = if (heating.nonEmpty) Some(heating) else None
          )
          page.addPageResultItem(JsonUtils.toMap(rentingHouse))
          // page.addTargetRequest(s"${domain}${detail.select("a").attr("href")}")
        }
        page.addTargetRequest(s"https://bj.lianjia.com/zufang/pg${getPage}/")
      case houseDetailRegex(_, houseCode, _) =>
        //todo append detail

      case other ⇒
        println(other)
    }

  // Upserts each listing into the `renting` table keyed by house_code.
  override val pipelines: Set[Pipeline] = Set(
    PostgreSQLPipeline(DataSourceInfo(database = "renting", jdbcUrl = "jdbc:postgresql://127.0.0.1:5432/magicbox", username = "wtog", password = "")) { (db: String, result: Map[String, Any]) =>
      val (keys, values) = result.unzip
      DataSource.rows[Int]("select count(1) from renting where house_code = ?", Seq(result("houseCode").asInstanceOf[String]))(r => r.getInt(1))(db).headOption.getOrElse(0) match {
        case 0 =>
          DataSource.executeUpdate(s"insert into renting (${keys.map(_.toUnderscore).mkString(",")}) values (${Seq.fill[String](keys.size)("?").mkString(",")})", values.toSeq)(db)
        case _ =>
          DataSource.executeUpdate(
            s"update renting set ${keys.map(c => s"${c.toUnderscore} = ?").mkString(",")}, updated_at = '${ZonedDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}' where house_code = ? ",
            (values.toSeq ++ Seq(result("houseCode")))
          )(db)
      }
    }
  )

  override def requestSetting: RequestSetting = RequestSetting(
    domain = "bj.lianjia.com",
    // Randomized 5-7s delay plus a proxy to reduce the chance of being blocked.
    sleepTime = (Random.nextInt(3) + 5) seconds,
    useProxy = true
  )

  override def targetUrls: List[String] = List("https://bj.lianjia.com/zufang/pg1/")

  override def cronExpression: Option[String] = Some("0 0/30 * * * ?")
}
114 |
/**
  * One rental listing scraped from a Lianjia zufang list page.
  * Optional fields map to None when the corresponding page tag is absent.
  */
case class RentingHouse(
    id: Option[Int] = None,
    houseCode: String,
    price: Option[Int] = None,
    rentingWay: Option[String] = None,
    area: Option[String] = None,
    community: Option[String] = None,
    communityDetail: Option[String] = None,
    meter: Option[Int] = None,
    direction: Option[String] = None,
    typ: Option[String] = None,
    floor: Option[Int] = None,
    subway: Option[String] = None,
    decoration: Option[String] = None,
    heating: Option[String] = None)
130 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/impl/ZhihuAnswerPageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example.impl
2 |
3 | import io.github.wtog.crawler.dto.{ Page, RequestSetting }
4 | import io.github.wtog.example.ExampleTrait
5 |
6 | import scala.concurrent.duration._
7 |
case class ZhihuAnswerPageProcessor() extends ExampleTrait {

  /** Parses one page of the Zhihu answers API, emits (question, answer) pairs
    * and enqueues the "next" paging link when present. */
  override def doProcess(page: Page): Unit = {
    val body = page.json[Map[String, Any]]()

    for {
      rawAnswers <- body.get("data")
      answer     <- rawAnswers.asInstanceOf[List[Map[String, Any]]]
    } {
      val title = answer("question").asInstanceOf[Map[String, String]]("title")
      page.addPageResultItem(Map("question" -> title, "answer" -> answer("content")))
    }

    body("paging")
      .asInstanceOf[Map[String, String]]
      .get("next")
      .foreach { nextUrl =>
        // Rewrite the public host into its API base before following.
        page.addTargetRequest(nextUrl.replaceAll("https://www.zhihu.com", "$0/api/v4"))
      }
  }

  override def requestSetting: RequestSetting =
    RequestSetting(
      domain = "www.zhihu.com",
      timeOut = 10 seconds,
      sleepTime = 3 seconds
    )

  override def targetUrls: List[String] = List(
    "https://www.zhihu.com/api/v4/members/rednaxelafx/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=10&sort_by=created"
  )

  override def cronExpression: Option[String] = None
}
41 |
--------------------------------------------------------------------------------
/crawler-example/src/main/scala/io/github/wtog/example/impl/flight/QunarPageProcessor.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.example.impl.flight
2 |
3 | import io.github.wtog.crawler.downloader.{ ChromeHeadlessDownloader, Downloader }
4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting, RequestUri }
5 | import io.github.wtog.example.ExampleTrait
6 |
7 | import scala.concurrent.duration._
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 1/14/20 8:41 PM
12 | * @version : 1.0.0
13 | */
/**
  * Crawls a Qunar one-way flight search page via headless Chrome and prints the
  * intercepted XHR flight-list responses.
  */
class QunarPageProcessor extends ExampleTrait {

  override def targetRequests: List[RequestUri] = List(
    RequestUri("https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E6%88%90%E9%83%BD&searchDepartureTime=2020-01-11&searchArrivalTime=2020-01-15&nextNDays=0&startSearch=true&fromCode=BJS&toCode=CTU&from=qunarindex&lowestPrice=null")
  )

  override def requestSetting: RequestSetting = RequestSetting(
    domain = "flight.qunar.com",
    sleepTime = 1 seconds,
    // XHR endpoint whose responses are captured and exposed via page.xhrResponses.
    xhrRequests = Set("https://flight.qunar.com/touch/api/domestic/wbdflightlist")
  )

  // Prints each captured XHR payload; no result items are emitted to pipelines.
  override protected def doProcess(page: Page): Unit =
    page.xhrResponses.foreach { response =>
      println(response.result)
    }

  // The page is JavaScript-rendered, so use the Chrome headless downloader instead of the default.
  override val downloader: Downloader[_] = ChromeHeadlessDownloader
}
33 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/DataSource.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.db
2 |
3 | import java.sql._
4 | import java.util.concurrent.ConcurrentHashMap
5 |
6 | import com.zaxxer.hikari.HikariDataSource
7 | import io.github.wtog.utils.logger.Logging
8 |
9 | import scala.collection.mutable.ListBuffer
10 | import scala.util.control.NonFatal
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 10/29/19 8:27 PM
15 | * @version : 1.0.0
16 | */
trait DataSource extends Logging {

  // Fully-qualified JDBC driver class name supplied by each concrete datasource implementation.
  protected val driverClass: String

}
22 |
object DataSource extends Logging {

  // One HikariCP pool per logical database name.
  private val pools: ConcurrentHashMap[String, HikariDataSource] = new ConcurrentHashMap[String, HikariDataSource]()

  /**
   * Creates (at most once) a HikariCP connection pool for the given data source
   * and registers it under its database name.
   *
   * @param driverClass   fully-qualified JDBC driver class name
   * @param dataSouceInfo connection settings; its `database` field is the registry key
   */
  def initConnection(driverClass: String, dataSouceInfo: DataSourceInfo): Unit = {
    // computeIfAbsent makes the check-then-create atomic, so concurrent callers
    // cannot race and build two pools for the same database (the previous
    // get-then-put sequence could).
    pools.computeIfAbsent(
      dataSouceInfo.database,
      _ => {
        val hikariDataSource = new HikariDataSource()
        hikariDataSource.setDriverClassName(driverClass)
        hikariDataSource.setJdbcUrl(dataSouceInfo.jdbcUrl)
        hikariDataSource.setUsername(dataSouceInfo.username)
        hikariDataSource.setPassword(dataSouceInfo.password)
        hikariDataSource.setMaximumPoolSize(dataSouceInfo.maxPoolSize)
        hikariDataSource.setMinimumIdle(dataSouceInfo.minIdleSize)
        hikariDataSource.setAutoCommit(true)
        hikariDataSource.setIdleTimeout(dataSouceInfo.idleTimeout.toMillis)
        hikariDataSource.setValidationTimeout(1000)
        hikariDataSource.setPoolName(dataSouceInfo.database)
        hikariDataSource
      }
    )
    ()
  }

  /**
   * Borrows a connection from the pool registered for `db`.
   *
   * @throws IllegalStateException if no pool was initialized for `db`
   *                               (previously this failed with a bare NPE)
   */
  def getConnection(db: String): Connection =
    Option(pools.get(db)) match {
      case Some(dataSource) => dataSource.getConnection
      case None             => throw new IllegalStateException(s"no datasource initialized for database '$db'; call initConnection first")
    }

  /** Runs a query and maps every row of the ResultSet through `wrapper`. */
  def executeQuery[R](sql: SQL)(wrapper: ResultSet => R)(implicit db: String): Seq[R] =
    buildStatement(sql) { statement =>
      val resultSet = statement.executeQuery()
      val results = new ListBuffer[R]

      while (resultSet.next()) {
        results.append(wrapper(resultSet))
      }
      results.toSeq
    }

  /** Executes an INSERT/UPDATE/DELETE and returns the affected row count. */
  def executeUpdate(sql: String, parameters: Seq[Any])(implicit db: String): Int =
    buildStatement(SQL(sql, parameters))(statement => statement.executeUpdate())

  /**
   * Prepares `sql` on a pooled connection, binds its parameters and runs `exec`.
   * Both the statement and the connection are released even on failure.
   */
  private def buildStatement[R](sql: SQL)(exec: PreparedStatement => R)(implicit database: String): R = {
    val conn = DataSource.getConnection(database)
    try {
      val statement = conn.prepareStatement(sql.sql)
      try {
        sql.parameters.zipWithIndex.foreach {
          case (p, i) => bindParameter(statement, i + 1, p)
        }
        logger.debug(s"${sql}")
        exec(statement)
      } finally {
        statement.close() // also closes any open ResultSet
      }
    } finally {
      conn.close() // returns the connection to the pool
    }
  }

  // Binds one parameter at the given 1-based JDBC index; extend here for new types.
  private def bindParameter(statement: PreparedStatement, index: Int, parameter: Any): Unit =
    parameter match {
      case p: Int =>
        statement.setInt(index, p)
      case p: Long =>
        statement.setLong(index, p)
      case p: Double =>
        statement.setDouble(index, p)
      case p: String =>
        statement.setString(index, p)
      case p: Boolean =>
        statement.setBoolean(index, p)
      case other =>
        throw new UnsupportedOperationException(s"parameter ${other}:${other.getClass.getName} not support by now ")
    }

  /** Convenience alias for [[executeQuery]] taking a raw SQL string. */
  def rows[R](sql: String, parameters: Seq[Any])(wrapper: ResultSet => R)(implicit db: String): Seq[R] =
    executeQuery(SQL(sql, parameters))(wrapper)
}
95 |
// Pairs a parameterized SQL string with its bind values.
case class SQL(sql: String, parameters: Seq[Any]) {

  import io.github.wtog.utils.StringUtils._

  // Debug-friendly rendering: substitutes each '?' placeholder with its bound
  // parameter (strings single-quoted). Used only for logging, never executed.
  override def toString: String = sql.placeholderReplacedBy("\\?", parameters: _*)
}
102 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/DataSourceInfo.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.db
2 |
3 | import scala.concurrent.duration._
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 10/28/19 11:32 PM
8 | * @version : 1.0.0
9 | */
/**
 * Connection settings used to build a HikariCP pool.
 *
 * @param database    logical pool name; also the key under which the pool is registered
 * @param jdbcUrl     JDBC connection URL
 * @param username    database user
 * @param password    database password
 * @param maxPoolSize maximum number of pooled connections
 * @param minIdleSize minimum number of idle connections kept warm
 * @param idleTimeout how long an idle connection may live before retirement
 */
case class DataSourceInfo(
    database: String = "default",
    jdbcUrl: String,
    username: String,
    password: String,
    maxPoolSize: Int = 5,
    minIdleSize: Int = 1,
    idleTimeout: Duration = 10 seconds)
18 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/PostgreSQLPipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.db
2 |
3 | import io.github.wtog.crawler.pipeline.Pipeline
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 10/28/19 11:29 PM
8 | * @version : 1.0.0
9 | */
case class PostgreSQLPipeline(dataSouceInfo: DataSourceInfo)(statement: (String, Map[String, Any]) => Unit) extends DataSource with Pipeline {

  override val driverClass: String = "org.postgresql.Driver"

  /**
   * Hands the page's result map to the caller-supplied `statement` callback,
   * tagged with this pipeline's database name. The page URL (first tuple
   * element) is ignored.
   */
  override def process[Result](pageResultItem: (String, Result)): Unit = {
    val resultMap = pageResultItem._2.asInstanceOf[Map[String, Any]]
    statement(dataSouceInfo.database, resultMap)
  }

  /** Registers a connection pool for this pipeline's database. */
  override def init(): Unit = DataSource.initConnection(driverClass, dataSouceInfo)
}
21 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/file/CsvFilePipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.file
2 |
3 | import java.io.RandomAccessFile
4 | import java.util.concurrent._
5 |
6 | import io.github.wtog.crawler.pipeline.Pipeline
7 | import io.github.wtog.utils.logger.Logging
8 |
9 | import scala.collection.mutable.ListBuffer
10 | import scala.concurrent.Future
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 5/20/18 11:01 PM
15 | * @version : 1.0.0
16 | */
case class CsvFilePipeline(fileName: Option[String]) extends Pipeline {

  /**
   * Buffers the page's result map in [[IOContentCache]] under the configured
   * file name, falling back to the page URL when none was given.
   */
  override def process[R](pageResultItem: (String, R)): Unit = {
    val (pageUrl, resultItems) = pageResultItem
    val cacheKey = fileName.getOrElse(pageUrl)
    IOContentCache.add(cacheKey, resultItems.asInstanceOf[Map[String, Any]])
  }

}
25 |
object IOContentCache extends Logging {

  // Buffered rows per output key; drained to CSV by the background flusher below.
  private val cache: ConcurrentHashMap[String, ListBuffer[Map[String, Any]]] = new ConcurrentHashMap[String, ListBuffer[Map[String, Any]]]()

  // Fixed on first use so every flush in this JVM run appends to the same file.
  lazy val timestamp: Long = System.currentTimeMillis()

  /**
   * Appends `value` to the buffer for `key` and returns that buffer.
   *
   * The previous get-then-put sequence could lose rows under concurrent calls
   * and returned ConcurrentHashMap.put's result — the *previous* mapping,
   * which is null for a fresh key. computeIfAbsent registers the buffer
   * atomically; appends are synchronized on the buffer because the flusher
   * thread clears it concurrently.
   */
  def add(key: String, value: Map[String, Any]): ListBuffer[Map[String, Any]] = {
    val listValue = cache.computeIfAbsent(key, _ => ListBuffer.empty[Map[String, Any]])
    listValue.synchronized {
      listValue += value
    }
    listValue
  }

  /**
   * Appends the buffered rows to /tmp/web-crawler-<file>-<timestamp>.csv and
   * drains the buffer. A brand-new (empty) file first gets a header line built
   * from the first row's keys.
   *
   * Note: the original implementation removed elements from `contentList`
   * while iterating it with foreach (which skips elements) and, on a new
   * file, flushed only the first row per tick. Rows are now written first
   * and the buffer cleared afterwards, under the same lock `add` uses.
   */
  def writeContentFile(fileName: String, contentList: ListBuffer[Map[String, Any]]): Any =
    if (contentList.nonEmpty) {
      // '/' would be interpreted as a directory separator in the target path
      val file = if (fileName.contains("/")) fileName.replace("/", "_") else fileName

      val randomFile = new RandomAccessFile(
        s"/tmp/web-crawler-${file}-${timestamp}.csv",
        "rw"
      )
      try {
        val fileLength = randomFile.length()
        randomFile.seek(fileLength) // position at end of file to append
        contentList.synchronized {
          if (fileLength == 0) {
            // fresh file: emit the CSV header from the first row's keys
            val title = contentList.head.keys.mkString(",") + "\n"
            randomFile.write(title.getBytes("UTF-8"))
          }
          contentList.foreach { map =>
            val row = map.values.mkString(",") + "\n"
            randomFile.write(row.getBytes("UTF-8"))
          }
          contentList.clear() // safe: iteration above has completed
        }
      } catch {
        case ex: Throwable => ex.printStackTrace()
      } finally {
        randomFile.close()
      }
    }

  // Background flusher: every 3 seconds writes all buffered rows to disk.
  // On a startup failure the scheduling is attempted once more.
  val expire: Future[ScheduledFuture[_]] = {
    def scheduleFlush(): ScheduledFuture[_] = {
      import collection.JavaConverters._
      val schedule = Executors.newScheduledThreadPool(1)

      schedule.scheduleWithFixedDelay(new Runnable {
        override def run(): Unit =
          cache.asScala.foreach {
            case (url, list) => writeContentFile(url, list)
          }
      }, 3, 3, TimeUnit.SECONDS)
    }

    import scala.concurrent.ExecutionContext.Implicits.global

    Future {
      scheduleFlush()
    }.recover {
      case ex =>
        logger.error(ex.getLocalizedMessage)
        scheduleFlush()
    }
  }

}
95 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/file/FilePipeline.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.file
2 |
3 | import java.io.{ File, PrintWriter }
4 |
5 | import io.github.wtog.crawler.pipeline.Pipeline
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 2019-01-20 00:47
10 | * @version : 1.0.0
11 | */
case class FilePipeline(fileBaseDir: Option[String] = None) extends Pipeline {

  /**
   * Writes a [[FileDTO]] result to disk.
   *
   * The DTO's fileName may contain a directory component; it is created under
   * `fileBaseDir` (default "/") if missing, and the content is written to
   * `<dir>/<name>.<fileType>`, overwriting any existing file.
   */
  override def process[R](pageResultItem: (String, R)): Unit = {
    val fileDto = pageResultItem._2.asInstanceOf[FileDTO]

    // Split fileName into directory part (joined onto the base dir) and leaf name.
    val (fileDirPath, fileName) = fileDto.fileName.lastIndexOf('/') match {
      case -1 =>
        (filePathFormat(fileBaseDir.getOrElse("/")), if (fileDto.fileName.startsWith("/")) fileDto.fileName else s"/${fileDto.fileName}")
      case fileDirIndex =>
        (filePathFormat(fileBaseDir.getOrElse("/")) + filePathFormat(fileDto.fileName.substring(0, fileDirIndex)), fileDto.fileName.substring(fileDirIndex))
    }

    val fileDir = new File(fileDirPath)

    // Previously written as `!fileDir.exists() && fileDir.mkdirs()` with the
    // boolean result discarded; an explicit statement states the intent.
    if (!fileDir.exists()) {
      fileDir.mkdirs()
    }

    val writer = new PrintWriter(s"${fileDirPath}${fileName}.${fileDto.fileType}")
    try {
      writer.write(s"${fileDto.content}")
    } finally {
      writer.close()
    }
  }

  /** Ensures `path` carries a leading slash so segments join cleanly. */
  def filePathFormat(path: String): String = if (path.startsWith("/")) path else s"/${path}"

}
40 |
41 | case class FileDTO(fileName: String, fileType: String = "html", content: String)
42 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/test/scala/io/github/wtog/crawler/pipeline/test/BasePipelineTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.test
2 |
3 | import io.github.wtog.utils.test.BaseTest
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 10/30/19 11:43 PM
8 | * @version : 1.0.0
9 | */
trait BasePipelineTest extends BaseTest {

  /** Suite-level setup hook, run once before any test. */
  protected def init(): Unit

  /** Suite-level teardown hook, run once after all tests. */
  protected def cleanup(): Unit

  // Procedure syntax (`def init()` without `: Unit`) is deprecated in 2.13;
  // explicit result types keep the trait source-compatible going forward.
  override def beforeAll(): Unit = init()

  override def afterAll(): Unit = cleanup()
}
20 |
--------------------------------------------------------------------------------
/crawler-pipeline/src/test/scala/io/github/wtog/crawler/pipeline/test/DataSourceTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.crawler.pipeline.test
2 |
3 | import io.github.wtog.crawler.pipeline.db.DataSource
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 10/30/19 11:50 PM
8 | * @version : 1.0.0
9 | */
class DataSourceTest extends BasePipelineTest {
  // Minimal concrete DataSource exercising the trait's abstract member.
  object PgDataSource extends DataSource {
    override protected val driverClass: String = "org.postgresql.Driver"
  }

  test("pg process") {

    // NOTE(review): `datas` is never used and the test makes no assertions —
    // presumably a placeholder awaiting a real pipeline round-trip; confirm.
    val datas = Map[String, Any](
      "a" -> 1,
      "b" -> 2
    )
  }

  override protected def init(): Unit = {}

  override protected def cleanup(): Unit = {}

}
28 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM alpine:3.8

# MAINTAINER is deprecated since Docker 1.13; LABEL is the supported form.
LABEL maintainer="wangtong"
USER root

# Use the USTC mirror for faster apk fetches; edge repos provide chromium.
RUN echo http://mirrors.ustc.edu.cn/alpine/v3.8/main > /etc/apk/repositories && \
    echo http://mirrors.ustc.edu.cn/alpine/v3.8/community >> /etc/apk/repositories && \
    echo @edge http://mirrors.ustc.edu.cn/alpine/edge/community >> /etc/apk/repositories && \
    echo @edge http://mirrors.ustc.edu.cn/alpine/edge/main >> /etc/apk/repositories

# Runtime libraries for headless Chromium plus the JDK for the crawler itself.
RUN apk update && apk upgrade && \
    apk add --no-cache \
      bash \
      alsa-lib \
      at-spi2-atk \
      atk \
      cairo \
      cups-libs \
      dbus-libs \
      eudev-libs \
      expat \
      flac \
      gdk-pixbuf \
      glib \
      harfbuzz@edge \
      libgcc \
      libjpeg-turbo \
      libpng \
      libwebp \
      libx11 \
      libxcomposite \
      libstdc++@edge \
      libxdamage \
      libxext \
      libxfixes \
      libexif \
      chromium@edge \
      chromium-chromedriver@edge \
      openjdk8

# Expose JVM diagnostic tools on the PATH for in-container debugging.
RUN ln -s /usr/lib/jvm/default-jvm/bin/jstat /usr/local/bin/jstat && \
    ln -s /usr/lib/jvm/default-jvm/bin/jcmd /usr/bin/jcmd && \
    ln -s /usr/lib/jvm/default-jvm/bin/jstack /usr/bin/jstack && \
    rm -rf /openjdk8*

ADD web-crawler-assembly.jar /apps/web-crawler.jar

CMD ["java", "-jar", "/apps/web-crawler.jar"]
49 |
--------------------------------------------------------------------------------
/docker/build.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Build the fat jar and package it into the wtog/web-crawler Docker image.
# Abort on the first failure so a broken assembly never gets imaged.
set -e

# Remove stale build output so the assembly is built fresh.
# -print0/-0 keeps paths with spaces intact.
find crawler-example -type d -iname 'target' -print0 | xargs -0 rm -rf

sbt clean assembly

version='latest'

docker build -f docker/Dockerfile -t "wtog/web-crawler:${version}" ./crawler-example/target/scala-2.12/
--------------------------------------------------------------------------------
/project/.gnupg/pubring.gpg:
--------------------------------------------------------------------------------
1 | -----BEGIN PGP PUBLIC KEY BLOCK-----
2 |
3 | mQENBFxCjvEBCADMp/qjTLav0haO4zMDKCR2xn0Tx9X/SPxjyzqCXiuQRhJslyUv
4 | mRwsRbi4bUYEoDcB9xPZVnHBf/fNE8+Jo0tdnaKho1sTmwMc/inIxXubfgTjsqI1
5 | 8eZvLJfqGPPpyBgE6Ijt+//wUiKNS/JmoUnDHIQZ3Az5mf/faUI5Qn8N9OoWmAZ3
6 | f+qk03ZgXNmgfkW7KSUSwtr7S0AFEc+co/YmLvlH16B7mTrmgJJg8O9iKdheFU7p
7 | YivVjOkoE5r9Cs7M01hc1OfMFgFShqCEpLLi9P7nEKhtQdAGR+bdAuPxmEetbkVU
8 | MNnj73prrfrh9kft+P6gPQbGww6QDW+aM5IdABEBAAG0G3dhbmd0b25nIDx3dGdl
9 | ZWtlckAxNjMuY29tPokBVAQTAQgAPhYhBBJOwv/hbFas+DoxHHoKB+L9p6NGBQJc
10 | Qo7xAhsDBQkDwmcABQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAAAoJEHoKB+L9p6NG
11 | rZcH+QGz10cZ+z3lYc1K9drLse88gp8oUBGmOmyQsP64gJTKt0Ni/0sg96TK6Q/K
12 | ssRSPdR9xAp0UKsQBwgLouQb6bBlRM63HBfwVPM3wLkWJRzeNCNu1+/EFlIa7y9E
13 | T6KB5cc/B/tbhkDrM9+0Yl8HslY8SMBQW6S2asjW8K4QaOd1wWJcGUuKDRupohgo
14 | SJOZNs+fr5fI9/zlpGVodyNOzZe4dGNk+/fKVQQYcsT2HQEtxX6Wi1wwnMksoFAX
15 | 8JbJHm5a8zoxq5YdES26bOcJWDiYc7Qm9r3DC5pm0j4FYLtRVv6kZIAjeBimM3nE
16 | lLKRliKAU1DE/8qHov3tbm0hCY25AQ0EXEKO8QEIAKTCNvDJi/2EclZHX9FQI/7c
17 | PmfNlSv3+//3eQ2MdFVQxbuoetFp/ogTi4FeMuMPrvtS8eWCS+2T/bM0lLokry7y
18 | YUE7jrxnBf8Gvslz0t+EvKmdH9CBzBWHk2keKjzNBC7wyHmi6YignS0o6pHZciOz
19 | sDr1n1q/BWii034rB8g/oVm7kIPQk/bMeN8tUprqF0S/BE8CiqFPiM6OhNR6enA8
20 | h+ZKcSmkTNLeuTvRoCcyqfyfjgroBaFN616eZFnExVCXIGvVe/TAWhZPtRzUmq6C
21 | r1D08KIsc6lWzrnru+hhYfjfXyNmoJFdxCqiArC4g0Egck9Wwj5z6RlSNmi+dCEA
22 | EQEAAYkBPAQYAQgAJhYhBBJOwv/hbFas+DoxHHoKB+L9p6NGBQJcQo7xAhsMBQkD
23 | wmcAAAoJEHoKB+L9p6NG40YH/1bV3tIlIfntfY6P9qDRmLB3cEAHKYg7r1XSqrKw
24 | YpqJHim1CQxM4GJ1l/XT9W+s0YpQn3BMR41nmRIGJ5tztB6xebCW3tcNOK8YMjcG
25 | h6/PRBPyu3uGjxii2MJvibLBNUvwoX6pdEQqVqGjU+xyQrN6Gm9RY115AsabgNPm
26 | QrcJx1pUizqvA46t5T1N1yCaAu164BuTeO8SLhsG6QCydcDPRy22Ezsl5eoQilI5
27 | mRGJqHaJL0phbeOaPrgERx4sAUlB6jUt/uSwiapzzsBNUthjO1J/m+CnPDU7JPi4
28 | Reenvw2AT/upHHKtoJxwRPS/gDZk4GDA9obLKqWaXnOsV8k=
29 | =Xx3w
30 | -----END PGP PUBLIC KEY BLOCK-----
31 |
--------------------------------------------------------------------------------
/project/.gnupg/secring.gpg:
--------------------------------------------------------------------------------
1 | -----BEGIN PGP PRIVATE KEY BLOCK-----
2 |
3 | lQPGBFxCjvEBCADMp/qjTLav0haO4zMDKCR2xn0Tx9X/SPxjyzqCXiuQRhJslyUv
4 | mRwsRbi4bUYEoDcB9xPZVnHBf/fNE8+Jo0tdnaKho1sTmwMc/inIxXubfgTjsqI1
5 | 8eZvLJfqGPPpyBgE6Ijt+//wUiKNS/JmoUnDHIQZ3Az5mf/faUI5Qn8N9OoWmAZ3
6 | f+qk03ZgXNmgfkW7KSUSwtr7S0AFEc+co/YmLvlH16B7mTrmgJJg8O9iKdheFU7p
7 | YivVjOkoE5r9Cs7M01hc1OfMFgFShqCEpLLi9P7nEKhtQdAGR+bdAuPxmEetbkVU
8 | MNnj73prrfrh9kft+P6gPQbGww6QDW+aM5IdABEBAAH+BwMCDD/DVB1tuWDkxVqe
9 | 4onaPqVM0n912iKPbMLY7lHDQ5GPZNIRIN5qanr5bZlIaN28ptmDvKtGwaahBTXE
10 | I0JJEuuNuGC2y+/Kmw4wXTdlG/GnT/8Ktz91w8toDOR80JdaLzGKC7H+0fFQ3pOa
11 | 7BGquuBfoJFMEnSSZXs26/mtdHcNkYzKw1il5hUYGrdkqeuK2uqMW3VAy3AQOB60
12 | xihg2zSj8P9VjF4yd0y0rY4r/ZlThx6KAZ8H+iiS4MlmTMh21L5V5UXx3zsnMywI
13 | 18spMUFkz06zIgLmtjbcLL+If5OpIl+mpdvuwrv04N3c6CF+Di10j5SZiNNwisvz
14 | jHmvPBnthbxx2kXSCXBJzKHm6XNVTHRHSGK+OM1anbDcu4NkTskMZrlzUpwfDGBB
15 | rqgzEqu8GRhwqP+v7fk3rd2dLb5uceaxovHI+6BNy0WRzUNuo5t3IzhlEcoK+1ED
16 | oxqr/n7xzw6JYHcKCR3wzrhIHNIjZ/Dew3YT1cQCPLjg8xc4sd68He/HBl9ZHnDy
17 | hXrF4oUEGLoD+ExvARjPfzl5P+2d/V3949oOtnmV3DNHGvE8ew4+XbXn8fQfmBiR
18 | FOzK01+RtukyU/pqaShX1VNb2tlyf/d0CMPxaMKOt1+BGndXzTwzaEv9X61Cl/+F
19 | KrYGWabBGiavBe7dDTfBXY/8lMTVmBqtwwoom+dNatWeytmXyRXK0o2lMwK5mKSh
20 | jaXwcoIa+uTjyZyjhuUKRdKadBunJz9w3EGB6cet1iTNruOHZkYhMk+AzWNskz+w
21 | yXxU908DHQWdAI0S/2pGJ76JitIT+t69i04Efg+L8j7vJPBMkmoZyxomKRsWoKDQ
22 | Lacm0Fo6bvKGRy0B6urHbHu9ylGo/SsihMiTRQse7W7g2dtqnq6BAX68a6r9+Gy1
23 | iPok4KE/HjuOtBt3YW5ndG9uZyA8d3RnZWVrZXJAMTYzLmNvbT6JAVQEEwEIAD4W
24 | IQQSTsL/4WxWrPg6MRx6Cgfi/aejRgUCXEKO8QIbAwUJA8JnAAULCQgHAgYVCgkI
25 | CwIEFgIDAQIeAQIXgAAKCRB6Cgfi/aejRq2XB/kBs9dHGfs95WHNSvXay7HvPIKf
26 | KFARpjpskLD+uICUyrdDYv9LIPekyukPyrLEUj3UfcQKdFCrEAcIC6LkG+mwZUTO
27 | txwX8FTzN8C5FiUc3jQjbtfvxBZSGu8vRE+igeXHPwf7W4ZA6zPftGJfB7JWPEjA
28 | UFuktmrI1vCuEGjndcFiXBlLig0bqaIYKEiTmTbPn6+XyPf85aRlaHcjTs2XuHRj
29 | ZPv3ylUEGHLE9h0BLcV+lotcMJzJLKBQF/CWyR5uWvM6MauWHREtumznCVg4mHO0
30 | Jva9wwuaZtI+BWC7UVb+pGSAI3gYpjN5xJSykZYigFNQxP/Kh6L97W5tIQmNnQPG
31 | BFxCjvEBCACkwjbwyYv9hHJWR1/RUCP+3D5nzZUr9/v/93kNjHRVUMW7qHrRaf6I
32 | E4uBXjLjD677UvHlgkvtk/2zNJS6JK8u8mFBO468ZwX/Br7Jc9LfhLypnR/QgcwV
33 | h5NpHio8zQQu8Mh5oumIoJ0tKOqR2XIjs7A69Z9avwVootN+KwfIP6FZu5CD0JP2
34 | zHjfLVKa6hdEvwRPAoqhT4jOjoTUenpwPIfmSnEppEzS3rk70aAnMqn8n44K6AWh
35 | TetenmRZxMVQlyBr1Xv0wFoWT7Uc1Jqugq9Q9PCiLHOpVs6567voYWH4318jZqCR
36 | XcQqogKwuINBIHJPVsI+c+kZUjZovnQhABEBAAH+BwMC8trkrtDSBhXk2XGVB+oz
37 | EBN1TKVKkXaF8Gnm6OhYhErC+r6P7kmacoFVY4Ji/+dxybnpRUadKnULsUoRiM/N
38 | 0tE9jUgFdvS4AjXd9Xkx/KU3onpXQ7WgwOT6T1pgFYh9oY1sJMvC6AA6aVThFS7b
39 | ZeiufBernRxishDzHxVklniNVk0k+LnpnQhUuf/DCkWZWeAqE3R+ebwv8Rjh4XvR
40 | tpoxkCQQnVr1klwF9qSSwqpiZK4nkKbuukV0nCkmUut3bXFwan9XXwj7mgY5oA1O
41 | R2jeMXQ8mDo7Jp4XdVCAbKCUTQy9BX90QxCiLQw9/em06KYaXVQZZdnEW96BI9aa
42 | 2epqc73di4YwZHZHC5A7r5hvlZqhnNnWO8ZUJoSIJlfgA8Jl6W2lD7JkXFSYNh9h
43 | xXSI3iK5SRppkUOy9drHZi0WlRn482OdYyiwgdOpg2ZDIqmIGd6y11iLwSpTE3Ku
44 | vhss7q80qXbMTb9EO9BYteYEPUSxvpAW0bFUZqOA3PoMO+GKxM+d/z8/qzbAz78k
45 | NYxAz/a7UkRnxOkaqD4LyZbL98Ce2YTxXqikgMAEuFEl18BX4Zek3OxjcpGnzY3a
46 | 76O7t8Ud5pnZkk5OV+fBgNOkSfMVeaLb8P0xqIGaRzXt9Z17OH+M10Bi+djPEnNh
47 | l9guvFdFtyimSygaG4yNHUYnP2BNtrZoGLyd6Z63bm6V0k7pUrH5aIEj0OpP1sBV
48 | BLxvpqsMFi15Nn79jdzoQeXJfyMQ3wVmnrEKdGyHQj7CLWU6of/ZnvNcfN3lBCJ3
49 | 7Nw3JZijzM2IPrSYG/zFJCtD3WYyAEQnkqe/6cXSTJjihjOIm55PK/K2sKi0/uU9
50 | c8lVfdl9LMo0NSu53CF0wyZbrozvC1JvU7Dv6zVtIebsZXlrr67lw7Z3pEKYHew2
51 | NR4JMK08iQE8BBgBCAAmFiEEEk7C/+FsVqz4OjEcegoH4v2no0YFAlxCjvECGwwF
52 | CQPCZwAACgkQegoH4v2no0bjRgf/VtXe0iUh+e19jo/2oNGYsHdwQAcpiDuvVdKq
53 | srBimokeKbUJDEzgYnWX9dP1b6zRilCfcExHjWeZEgYnm3O0HrF5sJbe1w04rxgy
54 | NwaHr89EE/K7e4aPGKLYwm+JssE1S/Chfql0RCpWoaNT7HJCs3oab1FjXXkCxpuA
55 | 0+ZCtwnHWlSLOq8Djq3lPU3XIJoC7XrgG5N47xIuGwbpALJ1wM9HLbYTOyXl6hCK
56 | UjmZEYmodokvSmFt45o+uARHHiwBSUHqNS3+5LCJqnPOwE1S2GM7Un+b4Kc8NTsk
57 | +LhF56e/DYBP+6kccq2gnHBE9L+ANmTgYMD2hssqpZpec6xXyQ==
58 | =uCVh
59 | -----END PGP PRIVATE KEY BLOCK-----
60 |
--------------------------------------------------------------------------------
/project/Dependencies.scala:
--------------------------------------------------------------------------------
1 | import sbt._
2 |
3 | /**
4 | * @author : tong.wang
5 | * @since : 2018-12-08 00:25
6 | * @version : 1.0.0
7 | */
object Dependencies {

  // Centralised version numbers shared by every module's dependency list.
  object Versions {
    val akkaVersion = "2.6.3"
    val log4j2 = "2.12.1"
    val seleniumhq = "4.0.0-alpha-3"
    val httpClient = "2.10.1"
    val jackson = "2.10.2"
    val guava = "28.2-jre"
    val typesafeConfig = "1.4.0"
    val scalatest = "3.0.8"
    val hikariCP = "3.4.1"
  }


  // Small sugar so dependency lists can say `module.provided` / `module.test`.
  implicit class ModuleIDWrapper(moduleID: ModuleID) {
    def provided: ModuleID = moduleID.withConfigurations(Some("provided"))

    def test: ModuleID = moduleID.withConfigurations(Some("test"))
  }

  // Scala versions the library is cross-built against.
  lazy val crossVersion = Seq("2.13.1", "2.12.10")

  lazy val guava = "com.google.guava" % "guava" % Versions.guava

  lazy val typesafeConfig = "com.typesafe" % "config" % Versions.typesafeConfig

  lazy val jackson = Seq("com.fasterxml.jackson.module" %% "jackson-module-scala" % Versions.jackson)

  lazy val scalatest = "org.scalatest" %% "scalatest" % Versions.scalatest

  lazy val log = Seq(
    "org.apache.logging.log4j" % "log4j-slf4j-impl" % Versions.log4j2,
    "org.apache.logging.log4j" % "log4j-core" % Versions.log4j2)


  // Per-module dependency bundles, referenced from build.sbt.
  object utils {
    lazy val dependencies = scalatest.test +: (Seq(guava, typesafeConfig) ++ jackson ++ log).map(_.provided)
  }

  object core {

    lazy val akka = Seq(("com.typesafe.akka" %% "akka-actor" % Versions.akkaVersion).provided)

    // Excludes the java7 HikariCP fork quartz drags in; the pipeline module
    // supplies the current HikariCP.
    lazy val quartz = "org.quartz-scheduler" % "quartz" % "2.3.1" exclude("com.zaxxer", "HikariCP-java7")

    lazy val httpUtils = Seq("org.asynchttpclient" % "async-http-client" % Versions.httpClient)

    lazy val httpParser = Seq("us.codecraft" % "xsoup" % "0.3.1")

    lazy val selenium = Seq(
      "org.seleniumhq.selenium" % "selenium-chrome-driver" % Versions.seleniumhq
    )

    lazy val test = Seq(
      "com.typesafe.akka" %% "akka-testkit" % Versions.akkaVersion, scalatest
    ).map(_.test)

    lazy val dependencies = Seq(quartz, guava, typesafeConfig) ++ jackson ++ akka ++ log ++ httpParser ++ httpUtils ++ test ++ selenium
  }

  object pipeline {
    lazy val postgresql = "42.2.6"

    lazy val pg: ModuleID = "org.postgresql" % "postgresql" % postgresql

    lazy val hikari = "com.zaxxer" % "HikariCP" % Versions.hikariCP

    // In-memory database used only by the pipeline test suite.
    lazy val h2 = "com.h2database" % "h2" % "1.4.192"

    lazy val dependencies = Seq(scalatest, h2).map(_.test) ++ Seq(pg, hikari).map(_.provided)
  }

  object example {
    lazy val dependencies = core.dependencies ++ log ++ Seq(pipeline.pg, pipeline.hikari)
  }

}
86 |
--------------------------------------------------------------------------------
/project/Publish.scala:
--------------------------------------------------------------------------------
1 | import sbt._
2 | import sbt.Keys._
3 | import com.typesafe.sbt.SbtPgp.autoImportImpl._
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 2018-12-20 22:00
8 | * @version : 1.0.0
9 | */
10 | object Publish extends AutoPlugin {
11 | override def trigger: PluginTrigger = allRequirements
12 |
13 | override def projectSettings: Seq[Def.Setting[_]] = Seq(
14 | useGpg := false,
15 | usePgpKeyHex("124EC2FFE16C56ACF83A311C7A0A07E2FDA7A346"),
16 | pgpPublicRing := baseDirectory.value / "project" / ".gnupg" / "pubring.gpg",
17 | pgpSecretRing := baseDirectory.value / "project" / ".gnupg" / "secring.gpg",
18 |
19 | credentials += Credentials(Path.userHome / ".sbt" / ".credentials-center"),
20 | publishTo := {
21 | val nexus = "https://oss.sonatype.org/"
22 | if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
23 | else Some("releases" at nexus + "service/local/staging/deploy/maven2/")
24 | },
25 |
26 | publishMavenStyle := true,
27 | publishArtifact in Test := false,
28 | pomIncludeRepository := { _ ⇒ false },
29 |
30 | pomExtra in Global := {
31 | https://github.com/wtog/web-crawler
32 |
33 |
34 | Apache 2
35 | http://www.apache.org/licenses/LICENSE-2.0.txt
36 |
37 |
38 |
39 | git@github.com:wtog/web-crawler.git
40 | scm:git:git@github.com:wtog/web-crawler.git
41 |
42 |
43 |
44 | wangtong
45 | wangtong
46 | https://github.com/wtog/
47 |
48 |
49 | })
50 | }
51 |
52 | object DisablePublish extends AutoPlugin {
53 |
54 | override def requires: Plugins = plugins.IvyPlugin
55 |
56 | override def projectSettings: Seq[_root_.sbt.Def.Setting[_]] = Seq(
57 | publishArtifact := false,
58 | publish := Unit,
59 | publishLocal := Unit)
60 |
61 | }
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.7.1
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn

resolvers += Classpaths.sbtPluginReleases
resolvers += Resolver.sonatypeRepo("releases")
resolvers += Resolver.sonatypeRepo("snapshots")

// fat-jar packaging for the example/deployment image
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
// artifact signing for Sonatype releases (see project/Publish.scala)
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
// code formatting / linting
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.26")
// coverage and benchmarking
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1")
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.4")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")
14 |
--------------------------------------------------------------------------------
/push.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Stage everything, commit and push. With no arguments the previous commit's
# subject is reused; otherwise the arguments become the commit message.
set -e

function deploy_git() {
  git add .
  if [ $# -eq 0 ]; then
    # reuse the most recent commit subject (drop the abbreviated hash column)
    msg=$(git log --pretty=oneline --abbrev-commit | awk '{if(NR<2) for(i=2;i<=NF;i++) printf $i" " }')
  else
    # "$*" keeps a multi-word message as one string (unquoted $@ split it)
    msg="$*"
  fi
  echo "$msg"
  git commit -m "$msg"
  git push
}

deploy_git "$@"
19 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/ConfigUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils
2 |
3 | import com.typesafe.config.{Config, ConfigFactory, ConfigObject, ConfigValue}
4 |
5 | import scala.collection.JavaConverters._
6 | import java._
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 2019-05-09 00:14
11 | * @version : 1.0.0
12 | */
object ConfigUtils {

  // Root configuration, loaded once; init() layers extra resources beneath it.
  @volatile private[this] var config: Config = ConfigFactory.load()

  /** Layers `resource` below the current config (existing values win). */
  def init(resource: String) =
    config = config.withFallback(ConfigFactory.load(resource))

  /** Reads a list at `path`, casting each element to T. */
  def getSeq[T](path: String): Seq[T] =
    config.getList(path).unwrapped().asScala.toSeq.map(_.asInstanceOf[T])

  /** Reads a list of objects at `path` as Scala maps. */
  def getSeqMap(path: String): Seq[Map[String, Any]] =
    getSeq[util.Map[String, Any]](path).map(_.asScala.toMap)

  def getStringOpt(path: String): Option[String] = getOpt[String](path)(config.getString)

  def getIntOpt(path: String): Option[Int] = getOpt[Int](path)(config.getInt)

  def getBooleanOpt(path: String): Option[Boolean] = getOpt[Boolean](path)(config.getBoolean)

  def getConfig(name: String): Config = config.getConfig(name)

  def getConfigObjectOpt(path: String): Option[ConfigObject] = getOpt[ConfigObject](path)(config.getObject)

  /** Flattens the config section `name` into a key -> unwrapped-value map. */
  def getKeyAndValue(name: String): Map[String, Any] =
    getConfig(name)
      .entrySet()
      .asScala
      .map(entry => entry.getKey -> entry.getValue.unwrapped())
      .toMap

  // Option-returning accessor: None when the path is absent, avoiding the
  // ConfigException the raw getters would throw.
  private[this] def getOpt[T](path: String)(getConfig: String => T): Option[T] =
    if (config.hasPath(path)) Some(getConfig(path)) else None
}
46 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/JsonUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils
2 |
3 | import com.fasterxml.jackson.annotation.JsonInclude.Include
4 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
5 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
6 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 9/24/19 11:05 PM
11 | * @version : 1.0.0
12 | */
object JsonUtils {

  // Shared, lazily-built Jackson mapper configured for Scala types:
  // null/absent fields are omitted, empty beans and unknown properties
  // do not fail (de)serialization.
  private lazy val mapper: ObjectMapper with ScalaObjectMapper = {
    val mapper = new ObjectMapper() with ScalaObjectMapper
    mapper.setSerializationInclusion(Include.NON_NULL)
    // NON_ABSENT overrides NON_NULL above and additionally skips empty Options
    mapper.setSerializationInclusion(Include.NON_ABSENT)
    mapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS)
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    mapper.registerModule(DefaultScalaModule)

    mapper
  }

  /** Serializes any value to a JSON string. */
  def toJson[T](t: T): String = mapper.writeValueAsString(t)

  /** Converts a value to a field-name -> value map (no string round-trip). */
  def toMap(t: Any): Map[String, Any] = mapper.convertValue[Map[String, Any]](t)

  /** Deserializes a JSON string into T. */
  def parseFrom[T: Manifest](json: String): T = mapper.readValue[T](json)

  /** Deserializes UTF-8 JSON bytes into T. */
  def parseFrom[T: Manifest](bytes: Array[Byte]): T = mapper.readValue[T](bytes)

  /** Builds a T from a field map (inverse of toMap). */
  def parseFrom[T: Manifest](map: Map[String, Any]): T = mapper.convertValue[T](map)
}
36 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/ReflectionUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils
2 |
3 | import com.google.common.reflect.{ ClassPath, TypeToken }
4 |
5 | import scala.collection.JavaConverters._
6 | import scala.collection.mutable
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 11/19/18 11:39 PM
11 | * @version : 1.0.0
12 | */
object ReflectionUtils {
  // Guava ClassPath scanner over this class's classloader, built once.
  private[this] lazy val CLASS_PATH = ClassPath.from(this.getClass.getClassLoader)

  /**
   * Finds every non-interface class under `packageName` whose type hierarchy
   * contains `clazz` (i.e. concrete implementations/subclasses of it).
   */
  def implementationClasses[T](clazz: Class[T], packageName: String): Seq[Class[T]] =
    getClasses[T](packageName).filter { c ⇒
      !c.isInterface && TypeToken
        .of(c)
        .getTypes
        .asScala
        .exists(t ⇒ t.getRawType == clazz)
    }.toSeq

  // Loads all top-level classes under the package (recursively); the cast is
  // unchecked — callers are expected to filter, as implementationClasses does.
  def getClasses[T](packageName: String): mutable.Set[Class[T]] = CLASS_PATH.getTopLevelClassesRecursive(packageName).asScala.map(_.load().asInstanceOf[Class[T]])

}
28 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/RetryUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import scala.concurrent.duration._
6 | import scala.concurrent.{ ExecutionContext, Future }
7 | import scala.util.Try
8 |
9 | /**
10 | * @author : tong.wang
11 | * @since : 2019-05-07 23:10
12 | * @version : 1.0.0
13 | */
object RetryUtils {

  /**
   * True when the thrown exception matches one of the configured retryable
   * class names — either exactly, or because a listed class is a supertype.
   *
   * The previous exact `contains(ex.getClass.getName)` check meant the
   * default `RetryInfo` (which lists java.lang.Throwable) never matched any
   * concrete exception, so nothing was ever retried by default.
   */
  private def shouldRetry(ex: Throwable, retryInfo: RetryInfo): Boolean =
    retryInfo.exceptions.exists { name =>
      name == ex.getClass.getName || {
        try Class.forName(name).isAssignableFrom(ex.getClass)
        catch { case _: ClassNotFoundException => false }
      }
    }

  /**
   * Re-invokes the future-producing `invoke` up to `retryTime` more times when
   * it fails with a retryable exception, sleeping `retryInfo.duration` between
   * attempts. Non-retryable failures propagate immediately.
   */
  def futureRetryWhen[T](invoke: => Future[T], retryTime: Int = 0, retryInfo: RetryInfo = RetryInfo())(implicit context: ExecutionContext = ExecutionContext.Implicits.global): Future[T] =
    invoke.recoverWith {
      case ex if shouldRetry(ex, retryInfo) && retryTime > 0 =>
        TimeUnit.MILLISECONDS.sleep(retryInfo.duration.toMillis)
        futureRetryWhen(invoke, retryTime - 1, retryInfo)(context)
      case other =>
        throw other
    }

  /**
   * Synchronous counterpart of [[futureRetryWhen]]: re-invokes `invoke` up to
   * `retryTime` more times on retryable failure, otherwise rethrows.
   */
  def retryWhen[T](invoke: => T, retryTime: Int = 0, retryInfo: RetryInfo = RetryInfo()): T =
    Try(invoke).recover {
      case ex: Throwable if shouldRetry(ex, retryInfo) && retryTime > 0 =>
        TimeUnit.MILLISECONDS.sleep(retryInfo.duration.toMillis)
        retryWhen(invoke, retryTime - 1, retryInfo)
      case other =>
        throw other
    }.get
}

/**
 * Retry policy: pause between attempts and the exception class names that are
 * retryable (a listed class also covers its subclasses).
 */
case class RetryInfo(duration: Duration = 1.seconds, exceptions: Seq[String] = Seq(classOf[Throwable].getName))
36 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/StringUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils
2 |
3 | import com.google.common.base.CaseFormat
4 | import com.google.common.base.Converter
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 10/31/19 12:25 AM
9 | * @version : 1.0.0
10 | */
object StringUtils {

  // Guava converters between lowerCamel and lower_underscore naming.
  lazy val underscoreConverter: Converter[String, String] = CaseFormat.LOWER_CAMEL.converterTo(CaseFormat.LOWER_UNDERSCORE)
  lazy val lowlandersConverter: Converter[String, String] = CaseFormat.LOWER_UNDERSCORE.converterTo(CaseFormat.LOWER_CAMEL)

  implicit class StringWrapper(s: String) {

    /** lowerCamel -> lower_underscore. */
    def toUnderscore: String = underscoreConverter.convert(s)

    /** lower_underscore -> lowerCamel. */
    def toLowercamel: String = lowlandersConverter.convert(s)

    /**
     * Replaces each occurrence of the `placeholder` regex with the next value
     * from `replacement` (strings are single-quoted). Extra placeholders are
     * left without substitutions; extra replacements are ignored.
     *
     * The previous `split(...).zip(...)` implementation silently dropped all
     * text following the last substituted placeholder; trailing segments are
     * now preserved.
     */
    def placeholderReplacedBy(placeholder: String, replacement: Any*): String = {
      // limit -1 keeps trailing empty segments so no text is lost
      val parts  = s.split(placeholder, -1)
      val buffer = new StringBuilder()
      parts.zipWithIndex.foreach {
        case (part, idx) =>
          buffer.append(part)
          if (idx < replacement.length) {
            replacement(idx) match {
              case p: String => buffer.append(s"'$p'")
              case p         => buffer.append(p)
            }
          }
      }
      buffer.toString()
    }
  }

}
36 |
--------------------------------------------------------------------------------
/utils/src/main/scala/io/github/wtog/utils/logger/Logging.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.logger
2 |
3 | import org.slf4j.{ Logger, LoggerFactory }
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 3/3/20 11:33 PM
8 | * @version : 1.0.0
9 | */
/**
 * Mixin that supplies an SLF4J logger named after the concrete runtime class
 * of whatever mixes this trait in.
 */
trait Logging {
  // Resolved once per instance; `getClass` yields the most-derived class name.
  protected val logger: Logger = LoggerFactory.getLogger(getClass)
}
13 |
--------------------------------------------------------------------------------
/utils/src/test/scala/io/github/wtog/utils/test/BaseTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.test
2 |
3 | import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
4 |
5 | import scala.concurrent.{Await, Future}
6 | import scala.concurrent.duration._
7 |
8 | /**
9 | * @author : tong.wang
10 | * @since : 10/30/19 11:54 PM
11 | * @version : 1.0.0
12 | */
/**
 * Base suite for utils tests: scalatest FunSuite with Matchers and lifecycle
 * hooks, plus a blocking `await` helper for Future-returning code under test.
 */
trait BaseTest extends FunSuite with Matchers with BeforeAndAfterAll {

  /**
   * Blocks until `future` completes and returns its value (or rethrows its failure).
   *
   * @param future  the future to wait on (by-name, evaluated once here)
   * @param timeout maximum wait; defaults to 1 minute, matching the previous hard-coded value
   */
  def await[T](future: => Future[T], timeout: Duration = 1.minute): T = Await.result(future, timeout)

}
18 |
--------------------------------------------------------------------------------
/utils/src/test/scala/io/github/wtog/utils/test/JsonUtilsTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.test
2 |
3 | import io.github.wtog.utils.JsonUtils
4 |
5 | /**
6 | * @author : tong.wang
7 | * @since : 12/29/19 8:25 PM
8 | * @version : 1.0.0
9 | */
/**
 * Round-trip tests for JsonUtils: a value must survive serialization to a
 * JSON string and to a Map, and deserialization back again.
 */
class JsonUtilsTest extends BaseTest {

  test("json serialize and deserialize") {
    val original = Test("test")

    // round-trip through a JSON string
    val fromJson = JsonUtils.parseFrom[Test](JsonUtils.toJson(original))
    assert(original == fromJson)

    // round-trip through a Map representation
    val fromMap = JsonUtils.parseFrom[Test](JsonUtils.toMap(original))
    assert(original == fromMap)
  }
}
22 |
23 | case class Test(t: String)
24 |
--------------------------------------------------------------------------------
/utils/src/test/scala/io/github/wtog/utils/test/RetryUtilsTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.test
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import io.github.wtog.utils.RetryInfo
6 | import io.github.wtog.utils.RetryUtils._
7 |
8 | import scala.concurrent.duration._
9 | import scala.concurrent.{ExecutionContext, Future}
10 | import scala.util.Try
11 |
12 | /**
13 | * @author : tong.wang
14 | * @since : 2019-05-07 23:35
15 | * @version : 1.0.0
16 | */
/**
 * Tests for RetryUtils.retryWhen and RetryUtils.futureRetryWhen.
 */
class RetryUtilsTest extends BaseTest {

  test("retry") {
    var invokeTime = 0

    // Throws until it has been invoked `limit` times, then returns the count.
    def method(limit: Int): Int = {
      invokeTime += 1
      if (invokeTime < limit) throw new Exception(s"invokeTime: ${invokeTime}") else invokeTime
    }

    // Retry on any Exception (hoisted: previously rebuilt inline at every call site).
    val retryOnException = RetryInfo(exceptions = Seq(classOf[Exception].getName))

    assert(Try(retryWhen(method(6), retryTime = 3, retryOnException)).isFailure)
    invokeTime = 0
    assert(3 == retryWhen(method(3), retryTime = 3, retryOnException))
    invokeTime = 0
    assert(2 == retryWhen(method(2), retryTime = 3, retryOnException))
    invokeTime = 0
    assert(2 == retryWhen(method(2), retryTime = 1, retryOnException))
    invokeTime = 0
    assert(Try(retryWhen(method(3), retryTime = 1, retryOnException)).isFailure)
  }

  test("futureRetryWhen") {

    import scala.concurrent.ExecutionContext.Implicits.global

    class Test(invokeTime: Int = 0) {
      var _invokeTime = invokeTime

      // Fails the returned Future until invoked `limit` times, then yields the count.
      def method(limit: Int): Future[Int] = {
        _invokeTime += 1
        Future {
          // fixed: the message previously interpolated the constructor param
          // `invokeTime` (always 0) instead of the mutable counter.
          if (_invokeTime < limit) throw new Exception(s"invokeTime: ${_invokeTime}") else _invokeTime
        }(ExecutionContext.Implicits.global)
      }
    }

    def retryOnException = RetryInfo(exceptions = Seq(classOf[Exception].getName), duration = 10.millis)

    // fixed: assertions used to run inside onComplete callbacks, whose failures
    // are swallowed on the executor thread and never fail the test (the trailing
    // 1-second sleep only masked this). Awaiting makes each assertion effective.
    val t1 = new Test()
    assert(Try(await(futureRetryWhen(t1.method(6), retryTime = 3, retryOnException))).isFailure)

    val t2 = new Test()
    assert(3 == await(futureRetryWhen(t2.method(3), retryTime = 3, retryOnException)))

    val t3 = new Test()
    assert(2 == await(futureRetryWhen(t3.method(2), retryTime = 3, retryOnException)))

    val t4 = new Test()
    assert(2 == await(futureRetryWhen(t4.method(2), retryTime = 3, retryOnException)))

    val t5 = new Test()
    assert(Try(await(futureRetryWhen(t5.method(3), retryTime = 1, retryOnException))).isFailure)

    // default RetryInfo does not match NoSuchElementException by name, so no retry happens
    val t6 = new Test()
    assert(Try(await(futureRetryWhen(t6.method(0).failed, retryTime = 3, RetryInfo(10.millis)))).isFailure)
  }
}
75 |
--------------------------------------------------------------------------------
/utils/src/test/scala/io/github/wtog/utils/test/jmh/StringUtilsBenchmark.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.test.jmh
2 |
3 | import org.openjdk.jmh.annotations.Benchmark
4 |
5 | import scala.collection.mutable.ListBuffer
6 |
7 | /**
8 | * @author : tong.wang
9 | * @since : 11/10/19 11:25 AM
10 | * @version : 1.0.0
11 | */
/**
 * JMH benchmarks comparing strategies for splicing parameter values into a
 * '?'-placeholder SQL string; each benchmark delegates to a companion helper.
 */
class StringUtilsBenchmark {
  import StringUtilsBenchmark._

  @Benchmark
  def foldleft(): String = foldLeft()

  @Benchmark
  def stringBuild: String = stringbuilder()

  @Benchmark
  def mapMkString: String = map()

  @Benchmark
  def stringutils: String = stringUtils()

  @Benchmark
  def replace: String = replaceFirstFold()
}
40 |
/**
 * Shared fixtures and the candidate implementations exercised by the
 * benchmarks above. Each helper splices `params` into `sql` at the '?' marks.
 */
object StringUtilsBenchmark {
  // NOTE(review): the stray '"' near the end looks like a typo in the fixture —
  // kept byte-for-byte so benchmark inputs stay comparable across runs.
  val sql =
    """
      |update house set
      |trading_right = ?,
      |house_right_owner = ?,
      |community_name = ?,
      |community_area_name = ?,
      |house_code = ?,
      |room_type_sub_info = ?,
      |meter_price = ?,
      |heating = ?,
      |total_price = ?,
      |sale_time = ?,
      |elevator = ?,
      |room_type_main_info = ?,
      |householdladder = ?,
      |room_main_info = ?,
      |housing_use = ?,
      |last_sale = ?,
      |house_right = ?,
      |room_area_sub_info = ?,
      |build_type = ?,
      |build_struct = ?,
      |evaluation_price = ?,
      |mortgage_info = ?,
      |room_sub_info = ?,
      |decoration = ?,
      |room_area_main_info = ?,
      |house_years = ?
      |where
      |house_code = ? "
      |""".stripMargin

  // 28 parameters, alternating String and Int, matching the 28 placeholders.
  val params = Seq.tabulate[Any](28){a => if (a % 2 == 0) "test" else a}

  /** Splices via foldLeft over (segment, param) pairs. */
  def foldLeft(): String = {
    val list = sql.split('?') zip params

    list.foldLeft("") {
      case (s, (sql, param)) =>
        s"$s$sql$param"
    }
  }

  /** Splices via a mutable StringBuilder. */
  def stringbuilder(): String = {
    val list = sql.split('?') zip params
    val buffer = new StringBuilder()
    list.foreach {
      case (s,p) =>
        buffer.append(s).append(p)
    }
    buffer.toString()
  }

  /** Splices via map + mkString. */
  def map(): String = {
    val list = sql.split('?') zip params

    list.map {
      case (s, p) =>
        s"$s$p"
    }.mkString("")
  }

  /** Repeated replaceFirst per param (note: appends a full copy of sql per param). */
  def replaceFirstFold(): String = {
    val buffer = new ListBuffer[Any]
    params.foreach(p =>
      buffer.append(sql.replaceFirst("\\?", p.toString)))
    buffer.mkString("")
  }

  /** Splices via StringUtils.placeholderReplacedBy. */
  def stringUtils(): String = {
    import io.github.wtog.utils.StringUtils._

    // fixed: `params` must be splatted into the varargs parameter; passing the
    // Seq itself supplied a single collection value for the first placeholder only,
    // making this benchmark measure different work than the others.
    sql.placeholderReplacedBy("\\?", params: _*)
  }
}
--------------------------------------------------------------------------------
/utils/src/test/scala/io/github/wtog/utils/test/reflection/ReflectionUtilsTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.wtog.utils.test.reflection
2 |
3 | import io.github.wtog.utils.ReflectionUtils
4 | import io.github.wtog.utils.test.BaseTest
5 |
6 | /**
7 | * @author : tong.wang
8 | * @since : 3/7/20 9:26 AM
9 | * @version : 1.0.0
10 | */
// Verifies that implementationClasses does not report the queried trait itself
// among its implementations.
class ReflectionUtilsTest extends BaseTest {

  test("get implementation classes") {
    // NOTE(review): the string "io.github.wtog.utils.BaseTest" does not match
    // BaseTest's actual location (io.github.wtog.utils.test.BaseTest) — confirm
    // whether this second argument is a package prefix and whether the mismatch
    // is intentional.
    val implementationClasses = ReflectionUtils
      .implementationClasses(
        classOf[BaseTest],
        "io.github.wtog.utils.BaseTest"
      )
    assert(!implementationClasses.contains(classOf[BaseTest]))
  }

}
23 |
24 |
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "0.1.3-SNAPSHOT"
2 |
--------------------------------------------------------------------------------