├── .scalafix.conf ├── .scalafmt.conf ├── .sonarcloud.properties ├── .travis.yml ├── README.md ├── bin ├── run-example.sh ├── stop.sh └── test.sh ├── build.sbt ├── crawler-core ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── log4j2.xml │ │ └── reference.conf │ └── scala │ │ └── io │ │ └── github │ │ └── wtog │ │ └── crawler │ │ ├── actor │ │ └── ActorManager.scala │ │ ├── downloader │ │ ├── AsyncHttpClientDownloader.scala │ │ ├── ChromeHeadlessDownloader.scala │ │ ├── Downloader.scala │ │ ├── DownloaderActorReceiver.scala │ │ └── proxy │ │ │ ├── ProxyCrawlerPipeline.scala │ │ │ ├── ProxyProvider.scala │ │ │ └── crawler │ │ │ ├── A2UPageProcessor.scala │ │ │ ├── Data5UPageProcessor.scala │ │ │ ├── IP89Processor.scala │ │ │ └── ProxyProcessorTrait.scala │ │ ├── dto │ │ ├── Event.scala │ │ ├── Page.scala │ │ └── RequestSetting.scala │ │ ├── exceptions │ │ └── NonNullArgumentsException.scala │ │ ├── pipeline │ │ ├── ConsolePipeline.scala │ │ ├── Pipeline.scala │ │ └── PipelineActorReceiver.scala │ │ ├── processor │ │ ├── PageProcessor.scala │ │ └── PageProcessorActorReceiver.scala │ │ ├── queue │ │ ├── DuplicateRemovedQueue.scala │ │ ├── RequestQueue.scala │ │ ├── TargetRequestTaskQueue.scala │ │ └── duplicate │ │ │ ├── BitSetStrategy.scala │ │ │ ├── DuplicateRemovedStrategy.scala │ │ │ └── HashMapStrategy.scala │ │ ├── rest │ │ ├── NettyServer.scala │ │ ├── Router.scala │ │ └── Server.scala │ │ ├── schedule │ │ └── ScheduleJobs.scala │ │ ├── selector │ │ ├── HtmlParser.scala │ │ └── Selector.scala │ │ └── spider │ │ ├── Spider.scala │ │ └── SpiderPool.scala │ └── test │ ├── resources │ ├── application-test.conf │ └── log4j2-test.xml │ └── scala │ └── io │ └── github │ └── wtog │ └── crawler │ └── test │ ├── BaseCoreTest.scala │ ├── actor │ ├── ActorTestBase.scala │ └── PageProcessorActorTestkit.scala │ ├── download │ ├── AsyncHttpClientTest.scala │ └── ChromeHeadlessDownloaderTest.scala │ ├── processor │ └── HtmlParserSpec.scala │ ├── proxy │ └── 
ProxyProviderTest.scala │ ├── queue │ └── DuplicateStrategyTest.scala │ ├── schedule │ └── ScheduleTest.scala │ └── server │ └── TestMockServer.scala ├── crawler-example └── src │ └── main │ ├── resources │ ├── log4j2.xml │ └── reference.conf │ └── scala │ └── io │ └── github │ └── wtog │ └── example │ ├── ExampleTrait.scala │ ├── Main.scala │ └── impl │ ├── BaiduPageProcessor.scala │ ├── LianjiaErshouFangProcessor.scala │ ├── LianjiaRentingProcessor.scala │ ├── ZhihuAnswerPageProcessor.scala │ └── flight │ └── QunarPageProcessor.scala ├── crawler-pipeline └── src │ ├── main │ └── scala │ │ └── io │ │ └── github │ │ └── wtog │ │ └── crawler │ │ └── pipeline │ │ ├── db │ │ ├── DataSource.scala │ │ ├── DataSourceInfo.scala │ │ └── PostgreSQLPipeline.scala │ │ └── file │ │ ├── CsvFilePipeline.scala │ │ └── FilePipeline.scala │ └── test │ └── scala │ └── io │ └── github │ └── wtog │ └── crawler │ └── pipeline │ └── test │ ├── BasePipelineTest.scala │ └── DataSourceTest.scala ├── docker ├── Dockerfile └── build.sh ├── project ├── .gnupg │ ├── pubring.gpg │ └── secring.gpg ├── Dependencies.scala ├── Publish.scala ├── build.properties └── plugins.sbt ├── push.sh ├── utils └── src │ ├── main │ └── scala │ │ └── io │ │ └── github │ │ └── wtog │ │ └── utils │ │ ├── ConfigUtils.scala │ │ ├── JsonUtils.scala │ │ ├── ReflectionUtils.scala │ │ ├── RetryUtils.scala │ │ ├── StringUtils.scala │ │ └── logger │ │ └── Logging.scala │ └── test │ └── scala │ └── io │ └── github │ └── wtog │ └── utils │ └── test │ ├── BaseTest.scala │ ├── JsonUtilsTest.scala │ ├── RetryUtilsTest.scala │ ├── jmh │ └── StringUtilsBenchmark.scala │ └── reflection │ └── ReflectionUtilsTest.scala └── version.sbt /.scalafix.conf: -------------------------------------------------------------------------------- 1 | rules = [ 2 | RemoveUnused, 3 | ExplicitResultTypes, 4 | LeakingImplicitClassVal, 5 | NoValInForComprehension, 6 | ProcedureSyntax 7 | ] 
-------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version=2.0.0-RC5 2 | align = true 3 | danglingParentheses = true 4 | maxColumn = 400 5 | rewrite.rules = [AvoidInfix, RedundantParens, SortModifiers, PreferCurlyFors, RedundantBraces, SortImports] 6 | spaces.inImportCurlyBraces = true 7 | continuationIndent.defnSite = 4 8 | verticalMultiline.atDefnSite = true 9 | verticalMultiline.arityThreshold = 5 10 | project.excludeFilters = [ 11 | io.github.wtog.test.jmh 12 | ] 13 | -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | # Path to sources 2 | sonar.projectKey=io.github.wtog:crawler 3 | sonar.organization=wtog 4 | sonar.sources=crawler-core/src/main,crawler-pipeline/src/main,utils/src/main 5 | sonar.exclusions=**/resources/** 6 | sonar.tests=crawler-core/src/test,crawler-pipeline/src/test,utils/src/test 7 | sonar.test.inclusions=**/*Test* 8 | sonar.sourceEncoding=UTF-8 9 | sonar.scala.coverage.reportPaths=crawler-core/target/scala-2.12/scoverage-report/scoverage.xml,crawler-pipeline/target/scala-2.12/scoverage-report/scoverage.xml,utils/target/scala-2.12/scoverage-report/scoverage.xml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # use Docker-based container (instead of OpenVZ) 2 | sudo: false 3 | 4 | branches: 5 | only: 6 | - dev 7 | 8 | cache: 9 | directories: 10 | - $HOME/.m2/repository 11 | - $HOME/.sbt 12 | - $HOME/.ivy2 13 | 14 | language: scala 15 | 16 | addons: 17 | chrome: stable 18 | 19 | script: 20 | - bash bin/test.sh 21 | 22 | jdk: 23 | - openjdk8 24 | 25 | after_success: 26 | - bash <(curl -s https://codecov.io/bash) 27 | - sbt ';set credentials += 
Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", System.getenv("NEXUS_USER"), System.getenv("NEXUS_PASS")); set pgpPassphrase := Some("PGP_PASSPHRASE".toArray[Char]); +clean; +publish'; 28 | - git config --global user.email wtgeeker@163.com 29 | - git config --global user.name wtog 30 | - current=`git log | head -n 1 | awk '{print $2}'` 31 | - git fetch origin master:master; git checkout master; git merge $current -s recursive -X theirs --no-commit; 32 | - git commit -m `git log $current --oneline | head -n 1 | awk '{print $2}'` 33 | - git push --force --quiet "https://${GITHUB_TOKEN}@github.com/wtog/web-crawler.git" master:master 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-crawler 2 | 3 | [![Build Status](https://travis-ci.com/wtog/web-crawler.svg?branch=dev)](https://travis-ci.com/wtog/web-crawler.svg?branch=dev) [![codecov](https://codecov.io/gh/wtog/web-crawler/branch/dev/graph/badge.svg)](https://codecov.io/gh/wtog/web-crawler) ![Sonatype Nexus (Snapshots)](https://img.shields.io/nexus/s/https/oss.sonatype.org/io.github.wtog/web-crawler_2.12.svg) [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=io.github.wtog:crawler&metric=alert_status)](https://sonarcloud.io/dashboard?id=io.github.wtog:crawler) 4 | 5 | ## 项目介绍 6 | 7 | 参考 webmagic [http://webmagic.io](http://webmagic.io) 撸的 [scala + akka] 爬虫 8 | 9 | ## 使用说明 10 | 11 | - 爬虫例子 [爬知乎R大 JVM 回答] 12 | 13 | ```scala 14 | package io.github.wtog.example 15 | 16 | import io.github.wtog.processor.{ Page, PageProcessor, RequestSetting } 17 | 18 | import scala.concurrent.duration._ 19 | 20 | // 爬虫解析逻辑 21 | case class ZhihuAnswerPageProcessor() extends PageProcessor { 22 | 23 | val link = 
"https://www.zhihu.com/api/v4/members/rednaxelafx/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=10&sort_by=created" 24 | 25 | override def doProcess(page: Page): Unit = { 26 | val result = page.json[Map[String, Any]]() 27 | 28 | result.get("data").foreach { answers => 29 | answers.asInstanceOf[List[Map[String, Any]]].foreach { answer => 30 | val question = answer("question").asInstanceOf[Map[String, String]]("title") 31 | val answerContent = answer("content") 32 | page.addPageResultItem(Map("question" -> question, "answer" -> answerContent)) 33 | } 34 | } 35 | 36 | val nextPage = result("paging").asInstanceOf[Map[String, String]].get("next") 37 | 38 | nextPage.foreach { url => 39 | page.addTargetRequest(url.replaceAll("https://www.zhihu.com", "$0/api/v4")) 40 | } 41 | } 42 | 43 | override def requestSetting: RequestSetting = 44 | RequestSetting( 45 | domain = "www.zhihu.com", 46 | sleepTime = 3 seconds 47 | ) 48 | 49 | override def targetUrls: List[String] = List(link) 50 | 51 | override def cronExpression: Option[String] = None 52 | } 53 | 54 | // 启动爬虫 55 | Spider(pageProcessor = ZhihuAnswerPageProcessor()).start() 56 | ``` 57 | 58 | [更多例子](https://github.com/wtog/web-crawler/tree/master/crawler-example/src/main/scala/io.github.wtog.example) 59 | 60 | - sbt 61 | 62 | 1. sbt 'project example; assembly' # 打 jar 包 63 | 2. java -jar crawler-example/target/scala-2.12/web-crawler-assembly.jar 64 | 65 | - docker 66 | 67 | 1. 
build 68 | 69 | ```docker 70 | sh docker/build.sh 71 | ``` 72 | 73 | 2. start container 74 | 75 | ```docker 76 | docker run -it --name web-crawler wtog/web-crawler:latest 77 | ``` 78 | -------------------------------------------------------------------------------- /bin/run-example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | base=$(basename $(pwd)) 6 | 7 | if [ "$base" == "bin" ]; then 8 | cd ../ 9 | fi 10 | 11 | sbt 'project example; assembly' 12 | 13 | echo "java -jar crawler-example/target/scala-2.12/web-crawler-assembly.jar $* > /tmp/crawler.log 2>&1 &" 14 | echo "crawler starting..." 15 | nohup java -jar crawler-example/target/scala-2.12/web-crawler-assembly.jar $* > /tmp/crawler.log 2>&1 & 16 | -------------------------------------------------------------------------------- /bin/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ps -ef |grep web-crawler-assembly | awk '{print $2}' | xargs kill -9 -------------------------------------------------------------------------------- /bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget -O chromedriver_linux64.zip -q https://npm.taobao.org/mirrors/chromedriver/79.0.3945.36/chromedriver_linux64.zip 4 | 5 | unzip -uxo chromedriver_linux64.zip -d /opt 6 | 7 | chmod +x /opt/chromedriver 8 | 9 | rm chromedriver_linux64.zip 10 | 11 | sbt ';clean ;coverage ;test ;coverageReport' 12 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies.crossVersion 2 | import sbt.Keys.organization 3 | import sbtassembly.{Assembly, MergeStrategy, PathList} 4 | 5 | javacOptions ++= Seq("-source", "1.8", "-target", "1.8") 6 | 7 | lazy val scalafixSettings = Seq( 8 | 
addCompilerPlugin(scalafixSemanticdb), 9 | scalacOptions ++= List( 10 | s"-P:semanticdb:targetroot:${System.getProperty("java.io.tmpdir")}/semanticdb", 11 | "-Yrangepos", 12 | "-language:postfixOps")) 13 | 14 | lazy val jmhSettings = Seq( 15 | sourceDirectory in Jmh := (sourceDirectory in Test).value, 16 | classDirectory in Jmh := (classDirectory in Test).value, 17 | dependencyClasspath in Jmh := (dependencyClasspath in Test).value 18 | ) 19 | 20 | lazy val commonSettings = Seq( 21 | scalaVersion := crossVersion.head, 22 | fork := true, 23 | crossScalaVersions := crossVersion, 24 | parallelExecution in Test := true 25 | ) 26 | 27 | lazy val dependenciesScope = "compile->compile;test->test" 28 | 29 | lazy val utils = (project in file("utils")) 30 | .settings(commonSettings: _*) 31 | .settings(scalafixSettings: _*) 32 | .settings(jmhSettings: _*) 33 | .settings(Seq(name := "utils", organization := "io.github.wtog.utils")) 34 | .settings(libraryDependencies ++= Dependencies.utils.dependencies) 35 | .enablePlugins(JmhPlugin) 36 | .disablePlugins(AssemblyPlugin) 37 | 38 | lazy val core = (project in file("crawler-core")) 39 | .settings(commonSettings: _*) 40 | .settings(scalafixSettings: _*) 41 | .settings(jmhSettings: _*) 42 | .settings(Seq(name := "crawler-core", organization := "io.github.wtog.crawler")) 43 | .settings(libraryDependencies ++= Dependencies.core.dependencies) 44 | .dependsOn(utils % dependenciesScope) 45 | .disablePlugins(AssemblyPlugin) 46 | 47 | lazy val pipeline = (project in file("crawler-pipeline")) 48 | .settings(commonSettings: _*) 49 | .settings(scalafixSettings: _*) 50 | .settings(jmhSettings: _*) 51 | .settings(Seq(name := "crawler-pipeline", organization := "io.github.wtog.crawler.pipeline")) 52 | .settings(libraryDependencies ++= Dependencies.pipeline.dependencies) 53 | .dependsOn(core, utils % dependenciesScope) 54 | .disablePlugins(AssemblyPlugin) 55 | 56 | lazy val example = (project in file("crawler-example")) 57 | 
.settings(commonSettings: _*) 58 | .settings(scalafixSettings: _*) 59 | .settings(Seq(name := "crawler-example", organization := "io.github.wtog.example")) 60 | .settings(libraryDependencies ++= Dependencies.example.dependencies) 61 | .settings( 62 | Seq( 63 | assemblyJarName in assembly := s"web-crawler-assembly.jar", 64 | mainClass in Compile := Some("io.github.wtog.example.Main"), 65 | test in assembly := {}, 66 | assemblyMergeStrategy in assembly := { 67 | case x if Assembly.isConfigFile(x) => 68 | MergeStrategy.concat 69 | case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => 70 | MergeStrategy.rename 71 | case PathList(ps@_*) if Assembly.isSystemJunkFile(ps.last) => 72 | MergeStrategy.discard 73 | case PathList("META-INF", xs@_*) => 74 | xs.map(_.toLowerCase) match { 75 | case (x :: Nil) if Seq("manifest.mf", "index.list", "dependencies") contains x => 76 | MergeStrategy.discard 77 | case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") || ps.last.endsWith(".rsa") => 78 | MergeStrategy.discard 79 | case "maven" :: xs => 80 | MergeStrategy.discard 81 | case "plexus" :: xs => 82 | MergeStrategy.discard 83 | case "services" :: xs => 84 | MergeStrategy.filterDistinctLines 85 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) | ("spring.tooling" :: Nil) => 86 | MergeStrategy.filterDistinctLines 87 | case _ => MergeStrategy.first 88 | } 89 | case _ => MergeStrategy.first 90 | } 91 | ) 92 | ) 93 | .dependsOn(core, pipeline) 94 | .enablePlugins(DisablePublish, AssemblyPlugin) 95 | .disablePlugins(ScoverageSbtPlugin) 96 | 97 | lazy val root = (project in file(".")) 98 | .settings(commonSettings: _*) 99 | .settings(Seq(name := "web-crawler")) 100 | .aggregate(utils, core, pipeline, example) 101 | .enablePlugins(JmhPlugin, DisablePublish) 102 | .disablePlugins(AssemblyPlugin) 103 | 104 | testOptions in Test += Tests.Argument(s"-P${java.lang.Runtime.getRuntime.availableProcessors()}") 105 | 106 | 
javaOptions in test := Seq( 107 | "-Dlog4j.configurationFile=log4j2-test.xml", 108 | "-Xms512m", "-Xmx512m" 109 | ) 110 | 111 | javaOptions in run := Seq( 112 | "-Dlog4j.configurationFile=log4j2.xml", 113 | "-Xms512m", "-Xmx512m" 114 | ) 115 | 116 | -------------------------------------------------------------------------------- /crawler-core/build.sbt: -------------------------------------------------------------------------------- 1 | mappings in(Compile, packageBin) ~= { 2 | files => 3 | files.filter(!_._1.getName.contentEquals("log4j2.xml")) 4 | } 5 | -------------------------------------------------------------------------------- /crawler-core/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /crawler-core/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | crawler { 2 | 3 | downloader-dispatcher { 4 | type = Dispatcher 5 | executor = "thread-pool-executor" 6 | 7 | thread-pool-executor { 8 | core-pool-size-min = 5 9 | core-pool-size-factor = 2 10 | core-pool-size-max = 10 11 | } 12 | } 13 | 14 | processor-dispatcher { 15 | type = Dispatcher 16 | executor = "fork-join-executor" 17 | fork-join-executor { 18 | parallelism-min = 2 19 | parallelism-factor = 4.0 20 | parallelism-max = 10 21 | } 22 | throughput = 50 23 | } 24 | 25 | pipeline-dispatcher { 26 | type = Dispatcher 27 | executor = "thread-pool-executor" 28 | 29 | thread-pool-executor { 30 | core-pool-size-min = 4 31 | core-pool-size-factor = 2 32 | core-pool-size-max = 8 33 | } 34 | } 35 | 36 | download.retry.exception = ["java.util.concurrent.TimeoutException"] 37 | 38 | server.port = 19000 39 | } 40 | 41 | 
-------------------------------------------------------------------------------- /crawler-core/src/main/scala/io/github/wtog/crawler/actor/ActorManager.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.actor 2 | 3 | import akka.actor.{ ActorSystem, Props } 4 | 5 | import scala.concurrent.ExecutionContext 6 | import akka.actor.{ ActorRef, ActorSelection } 7 | import akka.dispatch.MessageDispatcher 8 | 9 | /** 10 | * @author : tong.wang 11 | * @since : 5/16/18 11:56 PM 12 | * @version : 1.0.0 13 | */ 14 | object ActorManager { 15 | lazy val system: ActorSystem = ActorSystem("crawler") 16 | 17 | def getNewSystemActor(dispatcher: String, actorName: String, props: Props): ActorRef = system.actorOf(props.withDispatcher(s"crawler.${dispatcher}"), actorName) 18 | 19 | def getExistedActor(path: String): ActorSelection = system.actorSelection(path) 20 | } 21 | 22 | object ExecutionContexts { 23 | implicit lazy val downloadDispatcher: ExecutionContext = dispatcher("crawler.downloader-dispatcher") 24 | implicit lazy val processorDispatcher: ExecutionContext = dispatcher("crawler.processor-dispatcher") 25 | implicit lazy val pipelineDispatcher: ExecutionContext = dispatcher("crawler.pipeline-dispatcher") 26 | 27 | def dispatcher(id: String): MessageDispatcher = { 28 | val dispatchers = ActorManager.system.dispatchers 29 | if (dispatchers.hasDispatcher(id)) { 30 | dispatchers.lookup(id) 31 | } else { 32 | dispatchers.defaultGlobalDispatcher 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /crawler-core/src/main/scala/io/github/wtog/crawler/downloader/AsyncHttpClientDownloader.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.downloader 2 | 3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO 4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 5 | import 
io.github.wtog.crawler.exceptions.IllegalArgumentsException 6 | import io.netty.handler.codec.http.{ DefaultHttpHeaders, HttpHeaderNames } 7 | import org.asynchttpclient.Dsl.asyncHttpClient 8 | import org.asynchttpclient._ 9 | import org.asynchttpclient.proxy.ProxyServer 10 | 11 | import scala.concurrent.{ Future, Promise, TimeoutException } 12 | 13 | /** 14 | * @author : tong.wang 15 | * @since : 5/16/18 11:13 PM 16 | * @version : 1.0.0 17 | */ 18 | object AsyncHttpClientDownloader extends Downloader[AsyncHttpClient] { 19 | 20 | private[this] def buildRequest(driver: AsyncHttpClient, request: RequestSetting, proxyOpt: Option[ProxyDTO] = None): BoundRequestBuilder = { 21 | proxyOpt.foreach { proxy => 22 | buildProxy(proxy)(p => new ProxyServer.Builder(p.host, p.port).build()) 23 | } 24 | 25 | val builder = builderMethod(driver, request) 26 | 27 | val httpHeaders = new DefaultHttpHeaders 28 | request.headers.foreach { case (k, v) ⇒ httpHeaders.add(k, v) } 29 | httpHeaders.add(HttpHeaderNames.USER_AGENT, request.userAgent) 30 | httpHeaders.add(HttpHeaderNames.ACCEPT_CHARSET, request.charset) 31 | builder.setHeaders(httpHeaders) 32 | 33 | } 34 | 35 | def builderMethod(driver: AsyncHttpClient, requestSetting: RequestSetting): BoundRequestBuilder = { 36 | val url = requestSetting.url.get 37 | requestSetting.method.toUpperCase match { 38 | case "GET" ⇒ 39 | driver.prepareGet(url) 40 | case "POST" ⇒ 41 | val post = driver.preparePost(url) 42 | requestSetting.requestBody.foreach(post.setBody) 43 | post 44 | case other ⇒ 45 | logger.warn(s"unknown http method ${other}") 46 | throw IllegalArgumentsException(other) 47 | } 48 | 49 | } 50 | 51 | override def doDownload(request: RequestSetting): Future[Page] = { 52 | val response = executeRequest(request) { proxyOpt => 53 | val promise = Promise[Response] 54 | 55 | val client = getOrCreateClient(request) 56 | buildRequest(client.driver, request, proxyOpt).execute(new AsyncCompletionHandler[Response]() { 57 | override def 
onCompleted(response: Response): Response = { 58 | promise.success(response) 59 | client.decrement() 60 | response 61 | } 62 | 63 | override def onThrowable(t: Throwable): Unit = { 64 | if (t.isInstanceOf[TimeoutException]) { 65 | logger.error("download error ", t) 66 | } 67 | client.decrement() 68 | promise.failure(t) 69 | } 70 | }) 71 | 72 | promise.future 73 | } 74 | 75 | response.map { r => 76 | pageResult(request, Some(r.getResponseBodyAsBytes), r.getStatusCode == 200, Some(s"return ${r.getStatusCode}")) 77 | }(io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher) 78 | } 79 | 80 | def closeClient(): Unit = closeDownloaderClient(_.close()) 81 | 82 | override protected def getOrCreateClient(requestSetting: RequestSetting): DownloaderClient[AsyncHttpClient] = 83 | getDownloaderClient(requestSetting.domain) { 84 | asyncHttpClient( 85 | new DefaultAsyncHttpClientConfig.Builder() 86 | .setRequestTimeout(requestSetting.timeOut.toMillis.toInt) 87 | .setConnectTimeout(requestSetting.timeOut.toMillis.toInt) 88 | .setFollowRedirect(true) 89 | .setConnectionPoolCleanerPeriod(5) 90 | .build() 91 | ) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /crawler-core/src/main/scala/io/github/wtog/crawler/downloader/ChromeHeadlessDownloader.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.downloader 2 | 3 | import java.io.File 4 | import java.util 5 | import java.util.logging.Level 6 | 7 | import io.github.wtog.crawler.downloader.ChromeHeadlessConfig._ 8 | import io.github.wtog.crawler.dto.{ Page, RequestSetting, XhrResponse } 9 | import io.github.wtog.utils.{ ConfigUtils, JsonUtils } 10 | import org.openqa.selenium.chrome.{ ChromeDriver, ChromeOptions } 11 | import org.openqa.selenium.logging.{ LogType, LoggingPreferences } 12 | import org.openqa.selenium.remote.UnreachableBrowserException 13 | 14 | import scala.collection.JavaConverters._ 15 
| import scala.collection.mutable.ListBuffer 16 | import scala.concurrent.Future 17 | import scala.util.control.NonFatal 18 | 19 | /** 20 | * @author : tong.wang 21 | * @since : 2019-07-18 22:28 22 | * @version : 1.0.0 23 | */ 24 | object ChromeHeadlessDownloader extends Downloader[ChromeDriver] { 25 | 26 | override protected def doDownload(requestSetting: RequestSetting): Future[Page] = { 27 | 28 | val client = getOrCreateClient(requestSetting) 29 | 30 | Future { 31 | try { 32 | val driver = client.driver 33 | 34 | driver.get(requestSetting.url.get) 35 | val performanceLog = driver.manage().logs().get(LogType.PERFORMANCE) 36 | 37 | val iterator = performanceLog.iterator() 38 | var returnAllXhrRequest = true 39 | val xhrResponseBuffer = new ListBuffer[XhrResponse] 40 | while (iterator.hasNext && returnAllXhrRequest) { 41 | val xhrResponse = iterator.next() 42 | val message = JsonUtils.parseFrom[Map[String, Any]](xhrResponse.getMessage).get("message").get.asInstanceOf[Map[String, Any]] 43 | message.get("params").foreach { 44 | case params: Map[String, Any] @unchecked => 45 | val headers = params.getOrElse("headers", Map.empty[String, Any]).asInstanceOf[Map[String, Any]] 46 | getXhrRequestUriByHeaders(headers).foreach { 47 | case xhrResponseUri: String if requestSetting.xhrRequests.contains(xhrResponseUri) => 48 | xhrResponseBuffer.append(XhrResponse(xhrResponseUri, getXhrResponse(driver, params.get("requestId").get.asInstanceOf[String]))) 49 | case _ if (xhrResponseBuffer.size == requestSetting.xhrRequests.size) => 50 | returnAllXhrRequest = false 51 | case _ => 52 | } 53 | } 54 | } 55 | 56 | Page(requestSetting = requestSetting, bytes = Some(driver.getPageSource.getBytes()), xhrResponses = xhrResponseBuffer.toSeq) 57 | } catch { 58 | case NonFatal(exception) => 59 | Page.failed(requestSetting, exception) 60 | } finally { 61 | client.decrement() 62 | } 63 | }(io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher) 64 | } 65 | 66 | private def 
getXhrResponse(driver: ChromeDriver, requestId: String): Map[String, Any] = { 67 | val cdpMap = new util.HashMap[String, Object]() 68 | cdpMap.put("requestId", requestId) 69 | driver.executeCdpCommand("Network.getResponseBody", cdpMap).asScala.toMap 70 | } 71 | 72 | private def getXhrRequestUriByHeaders(headers: Map[String, Any]): Option[String] = 73 | headers.get("x-requested-with") match { 74 | case Some("XMLHttpRequest") => 75 | val schema: String = headers.getOrElse(":scheme", "").asInstanceOf[String] 76 | val domain: String = headers.getOrElse(":authority", "").asInstanceOf[String] 77 | val uri: String = headers.get(":path").fold[String]("") { 78 | case p: String => 79 | val queryIndex = p.indexOf('?') 80 | p.substring(0, if (queryIndex > 0) queryIndex else p.length) 81 | } 82 | 83 | Some(s"$schema://$domain$uri") 84 | case _ => 85 | None 86 | } 87 | 88 | private[this] def buildOptions(requestSetting: RequestSetting): ChromeOptions = { 89 | val options = new ChromeOptions() 90 | options.setExperimentalOption("excludeSwitches", Array[String]("enable-automation")) 91 | 92 | val perf = new util.HashMap[String, Any]() 93 | perf.put("enableNetwork", true) 94 | options.setExperimentalOption("prefs", perf) 95 | 96 | val logPrefs = new LoggingPreferences 97 | logPrefs.enable(LogType.PERFORMANCE, Level.ALL) 98 | logPrefs.enable(LogType.SERVER, Level.ALL) 99 | logPrefs.enable(LogType.BROWSER, Level.ALL) 100 | logPrefs.enable(LogType.DRIVER, Level.ALL) 101 | logPrefs.enable(LogType.CLIENT, Level.ALL) 102 | options.setCapability("goog:loggingPrefs", logPrefs) 103 | options.addArguments( 104 | "--no-sandbox", 105 | "--headless", 106 | "--start-maximized", 107 | "--disable-dev-shm-usage", 108 | "--disable-plugins-discovery", 109 | "--enable-logging", 110 | "--v=1", 111 | "--disable-gpu", 112 | "--ignore-certificate-errors", 113 | s"--user-agent=${requestSetting.userAgent}" 114 | ) 115 | 116 | options 117 | } 118 | 119 | override def getOrCreateClient(requestSetting: 
RequestSetting): DownloaderClient[ChromeDriver] = getDownloaderClient(requestSetting.domain) { 120 | System.setProperty("webdriver.chrome.driver", chromeDriverPath) 121 | System.setProperty("webdriver.chrome.logfile", chromeDriverLog) 122 | 123 | val driver = new ChromeDriver(buildOptions(requestSetting)) 124 | val map = new util.HashMap[String, Object]() 125 | map.put( 126 | "source", 127 | """ 128 | |Object.defineProperty(navigator, 'webdriver', { 129 | | get: () => false 130 | |}); 131 | |Object.defineProperty(navigator, 'plugins', { 132 | | get: () => [1, 2, 3, 4, 5] 133 | |}); 134 | |Object.defineProperty(navigator, 'languages', { 135 | | get: () => ["zh-CN","zh","en-US","en"] 136 | |}); 137 | |""".stripMargin 138 | ) 139 | driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", map) 140 | 141 | driver 142 | } 143 | 144 | override def closeClient(): Unit = closeDownloaderClient { driver => 145 | try (driver.quit()) 146 | catch { 147 | case _: UnreachableBrowserException => 148 | () 149 | case e: Throwable => 150 | throw e 151 | } 152 | } 153 | 154 | } 155 | 156 | object ChromeHeadlessConfig { 157 | lazy val chromeDriverPath: String = ConfigUtils.getStringOpt("crawler.chrome.driver").getOrElse("/opt/chromedriver") 158 | lazy val chromeDriverLog: String = ConfigUtils.getStringOpt("crawler.chrome.log").getOrElse("/tmp/chromedriver.log") 159 | 160 | def chromeDriverNotExecutable: Boolean = { 161 | val file = new File(chromeDriverPath) 162 | val canExecute = file.exists() && file.canExecute 163 | !canExecute 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /crawler-core/src/main/scala/io/github/wtog/crawler/downloader/Downloader.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.downloader 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | import java.util.concurrent.{ ConcurrentHashMap, Executors, ScheduledFuture, 
/**
  * Base abstraction shared by the concrete downloaders (async HTTP client,
  * headless Chrome). Maintains a per-domain client pool, periodically closes
  * idle clients, retries failed downloads and records crawl metrics.
  *
  * @author : tong.wang
  * @since : 5/16/18 9:56 PM
  * @version : 1.0.0
  */
trait Downloader[Driver] {
  protected lazy val logger: Logger = LoggerFactory.getLogger(this.getClass)

  // Exception class names (from configuration) that should trigger a download retry.
  private val downloadRetryException: Seq[String] = ConfigUtils.getSeq[String]("crawler.download.retry.exception")

  // One reusable client per domain, shared by concurrent download tasks.
  protected val clientsPool = new ConcurrentHashMap[String, DownloaderClient[Driver]]()

  /** Perform the actual HTTP fetch; implemented by each concrete downloader. */
  protected def doDownload(requestSetting: RequestSetting): Future[Page]

  /** Adapt a [[ProxyDTO]] into the driver-specific proxy representation. */
  protected def buildProxy[P](proxyDto: ProxyDTO)(buildProxy: ProxyDTO => P): P = buildProxy(proxyDto)

  /** Close all clients held by this downloader; implemented by subclasses. */
  protected def closeClient(): Unit

  protected def getOrCreateClient(requestSetting: RequestSetting): DownloaderClient[Driver]

  /** Look up the pooled client for `domain`, if one exists. */
  def getClient(domain: String): Option[DownloaderClient[Driver]] = Option(clientsPool.get(domain))

  /**
    * Common schedule job closing idle download clients every 3 minutes.
    */
  val scheduleClose: ScheduledFuture[_] = Executors
    .newSingleThreadScheduledExecutor()
    .scheduleAtFixedRate(new Runnable {
      override def run(): Unit =
        // NonFatal: log recoverable errors but never swallow fatal VM errors,
        // and never let an exception kill the scheduled task silently.
        try closeClient()
        catch { case NonFatal(e) => logger.error("failed to close download client", e) }
    }, 3, 3, TimeUnit.MINUTES)

  /**
    * Download `request`, retrying per the configured retry exceptions, and
    * record a success/failure metric on the spider either way. The failure is
    * re-thrown after being recorded so callers still observe it.
    */
  def download(spider: Spider, request: RequestSetting): Future[Page] = {
    import io.github.wtog.crawler.actor.ExecutionContexts.downloadDispatcher

    futureRetryWhen(doDownload(requestSetting = request), retryTime = request.retryTime, RetryInfo(duration = request.sleepTime, downloadRetryException))
      .map { page =>
        spider.CrawlMetric.record(page.isDownloadSuccess, page.url)
        page
      }
      .recover {
        case NonFatal(e) =>
          spider.CrawlMetric.record(success = false, request.url.get)
          throw e
      }
  }

  /** Run `execute` with a proxy when the request asks for one, otherwise without. */
  protected def executeRequest[HttpResponse](requestSetting: RequestSetting)(execute: Option[ProxyDTO] => Future[HttpResponse]): Future[HttpResponse] =
    if (requestSetting.useProxy) {
      execute(ProxyProvider.getProxy)
    } else {
      execute(None)
    }

  // NOTE(review): `msg` is accepted but unused — Page has no message field;
  // kept for interface compatibility with existing callers.
  protected def pageResult(requestSetting: RequestSetting, results: Option[Array[Byte]] = None, downloadSuccess: Boolean = true, msg: Option[String] = None): Page =
    dto.Page(downloadSuccess, bytes = results, requestSetting = requestSetting)

  /**
    * Fetch the pooled client for `domain`, creating it from the by-name
    * `driver` on first use, and bump its consumer count.
    */
  protected def getDownloaderClient(domain: String)(driver: => Driver): DownloaderClient[Driver] = {
    val clientCache = Option(clientsPool.get(domain))

    val downloaderClient = clientCache.getOrElse {
      val downloaderClient = DownloaderClient(domain = domain, driver = driver)
      clientsPool.put(domain, downloaderClient)
      downloaderClient
    }

    downloaderClient.increment()
    downloaderClient
  }

  /**
    * Close and evict every pooled client that currently has no consumers.
    * Iterating a ConcurrentHashMap's entry set while removing is safe.
    */
  def closeDownloaderClient(close: Driver => Unit): Unit =
    for (e <- clientsPool.entrySet().asScala) {
      val (domain, downloaderClient) = (e.getKey, e.getValue)
      if (downloaderClient.idle()) {
        Try(close(downloaderClient.driver)) match {
          case Success(_) =>
            logger.info(s"${domain} downloader driver[${downloaderClient.driver.getClass.getSimpleName}] has been closed.")
          case Failure(exception) =>
            logger.error(s"${domain} downloader driver failed to close. ${exception.getLocalizedMessage}")
        }
        clientsPool.remove(domain)
      }
    }

  sys.addShutdownHook {
    // Best-effort cleanup on JVM shutdown; NonFatal keeps fatal errors visible.
    try closeClient()
    catch { case NonFatal(e) => logger.error("failed to close download client on shutdown", e) }
  }
}

/**
  * A pooled download client for one domain, with a reference count of the
  * tasks currently using it; `idle()` means it is safe to close.
  */
case class DownloaderClient[C](domain: String, driver: C, consumers: AtomicInteger = new AtomicInteger(0)) {
  def idle(): Boolean = consumers.get() == 0
  def increment(): Int = consumers.incrementAndGet()
  def decrement(): Int = consumers.decrementAndGet()
}
/**
  * Pipeline that feeds proxies discovered by the proxy-crawler processors
  * into [[ProxyProvider]]'s shared blocking queue.
  *
  * @author : tong.wang
  * @since : 6/2/18 11:57 PM
  * @version : 1.0.0
  */
object ProxyCrawlerPipeline extends Pipeline {

  override def process[R](pageResultItem: (String, R)): Unit = {
    val (pageUrl, item) = pageResultItem

    // Proxy crawlers only ever emit ProxyDTO items, so the cast is safe here.
    val proxy = item.asInstanceOf[ProxyDTO]
    logger.trace(s"${pageUrl} => ${proxy}")
    ProxyProvider.proxyList.offer(proxy)
  }

}
io.github.wtog.crawler.downloader.proxy.ProxyStatusEnums.ProxyStatusEnums 10 | import io.github.wtog.crawler.processor.PageProcessor 11 | import io.github.wtog.crawler.schedule.{ ScheduleJob, ScheduleJobs } 12 | import io.github.wtog.crawler.spider.Spider 13 | import io.github.wtog.utils.ReflectionUtils 14 | import org.quartz.{ Job, JobExecutionContext } 15 | import org.slf4j.{ Logger, LoggerFactory } 16 | 17 | import scala.util.Try 18 | import java.util.concurrent.ExecutorService 19 | 20 | /** 21 | * @author : tong.wang 22 | * @since : 5/20/18 11:08 AM 23 | * @version : 1.0.0 24 | */ 25 | object ProxyProvider { 26 | lazy val logger: Logger = LoggerFactory.getLogger(ProxyProvider.getClass) 27 | 28 | val checkThread: ExecutorService = Executors.newFixedThreadPool(5) 29 | 30 | val proxyList: ArrayBlockingQueue[ProxyDTO] = new ArrayBlockingQueue[ProxyDTO](100) 31 | 32 | val proxySpiderCrawling: AtomicBoolean = new AtomicBoolean(false) 33 | 34 | private lazy val proxyCrawlerList: Seq[Spider] = ReflectionUtils 35 | .implementationClasses( 36 | classOf[PageProcessor], 37 | "io.github.wtog.crawler.downloader.proxy.crawler" 38 | ) 39 | .map(proxy ⇒ Spider(pageProcessor = proxy.newInstance())) 40 | 41 | private def crawlCronJob(restart: Boolean = false) = 42 | if (restart) proxyCrawlerList.foreach(_.restart()) 43 | else proxyCrawlerList.foreach(_.start()) 44 | 45 | def startProxyCrawl(restart: Boolean = false): Unit = 46 | if (!proxySpiderCrawling.getAndSet(true)) { 47 | crawlCronJob(restart) 48 | ScheduleJobs.addJob(scheduleJob = ScheduleJob(jobName = "proxy-check", cronExpression = "*/2 * * ? 
/**
  * A crawled proxy endpoint plus its health bookkeeping. `usability` is the
  * rolling success ratio of reachability checks; proxies below 0.5 are
  * considered unusable by [[ProxyProvider.getProxy]].
  *
  * @param host     proxy host name or IP
  * @param port     proxy port
  * @param username optional basic-auth user
  * @param password optional basic-auth password
  * @param status   whether the proxy is currently handed out or idle
  * @param usability success ratio in [0, 1], updated by `usabilityCheck`
  */
final case class ProxyDTO(
    host: String,
    port: Int,
    username: Option[String] = None,
    password: Option[String] = None,
    var status: ProxyStatusEnums.ProxyStatusEnums = ProxyStatusEnums.IDLE,
    var usability: Float = 0f) {

  val checkUrl: URL                 = new URL("http://www.baidu.com")
  val checkTimes: AtomicInteger     = new AtomicInteger(0)
  val successTimes: AtomicInteger   = new AtomicInteger(0)
  val usabilityLimit                = 0.5

  /**
    * Probe reachability by opening a connection to baidu through this proxy.
    * Any exception (timeout, refused, DNS) is treated as an unreachable proxy.
    */
  def connect2Baidu(): Boolean = {
    import java.net.Proxy
    Try {
      val proxy      = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(host, port))
      val connection = checkUrl.openConnection(proxy).asInstanceOf[HttpURLConnection]
      connection.setConnectTimeout(1000)
      connection.setReadTimeout(1000)

      connection.getResponseCode == 200
    }.recover {
      case _: Throwable =>
        false
    }.get

  }

  /**
    * Record one check result and recompute the success ratio.
    *
    * @param checkWay by-name check outcome (evaluated exactly once)
    * @return (new usability, whether it is above the usability limit)
    */
  def usabilityCheck(checkWay: => Boolean): (Float, Boolean) = {
    usability = {
      val totalChecks = checkTimes.incrementAndGet()
      val succeeded   = if (checkWay) successTimes.incrementAndGet() else successTimes.get()
      // BUG FIX: both counters are Int, so the original `succeeded / totalChecks`
      // was integer division — usability collapsed to 0 after any single failed
      // check, permanently failing the `> 0.5` threshold. Divide as Float.
      succeeded.toFloat / totalChecks
    }

    (usability, usability > usabilityLimit)
  }

  // Identity is (host, port) only; the mutable health fields are excluded so a
  // proxy keeps its identity in hash-based collections while its stats change.
  override def hashCode(): Int = Objects.hash(this.host.asInstanceOf[Object], this.port.asInstanceOf[Object])

  override def equals(obj: scala.Any): Boolean = obj match {
    case t: ProxyDTO =>
      t.host == this.host && t.port == this.port
    case _ =>
      false
  }

  override def toString: String = s"${host}:${port}"
}

/** Lifecycle states for a pooled proxy. */
object ProxyStatusEnums extends Enumeration {
  type ProxyStatusEnums = Value
  val USING: Value = Value("using")
  val IDLE: Value  = Value("idle")
}
/crawler-core/src/main/scala/io/github/wtog/crawler/downloader/proxy/crawler/Data5UPageProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.downloader.proxy.crawler 2 | 3 | import io.github.wtog.crawler.downloader.proxy.ProxyDTO 4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 5 | 6 | import scala.concurrent.duration._ 7 | 8 | /** 9 | * @author : tong.wang 10 | * @since : 6/7/18 11:27 PM 11 | * @version : 1.0.0 12 | */ 13 | case class Data5UPageProcessor() extends ProxyProcessorTrait { 14 | 15 | override def doProcess(page: Page): Unit = { 16 | val ipRow = page.dom(".wlist > ul > li:nth-child(2) .l2") 17 | val ipSize = ipRow.size() 18 | 19 | (0 until ipSize).foreach(i ⇒ { 20 | val ip = ipRow.get(i).select("span:nth-child(1)").text() 21 | val port = ipRow.get(i).select("span:nth-child(2)").text() 22 | 23 | val proxy = ProxyDTO(ip, port.toInt) 24 | page.addPageResultItem(proxy) 25 | }) 26 | 27 | } 28 | 29 | override def cronExpression: Option[String] = Some("*/5 * * ? 
/**
  * Scrapes free proxies from http://www.89ip.cn's HTML table.
  *
  * @author : tong.wang
  * @since : 6/3/18 12:33 AM
  * @version : 1.0.0
  */
case class IP89Processor() extends ProxyProcessorTrait {
  override def doProcess(page: Page): Unit = {
    val iprows = page.table("tbody tr")
    iprows.foreach { ip =>
      val tds = ip.select("td")
      // Port column can contain junk; fall back to 80 rather than failing the page.
      val proxDto = ProxyDTO(host = tds.get(0).text(), port = Try(tds.get(1).text().toInt).getOrElse(80))
      page.addPageResultItem(proxDto)
    }
  }

  // BUG FIX: domain was "www.89ip.com" while the crawled URL is www.89ip.cn;
  // the domain keys the downloader's per-domain client pool, so it must match.
  override def requestSetting: RequestSetting = RequestSetting(domain = "www.89ip.cn", sleepTime = 2 seconds)

  override def targetUrls: List[String] = List("http://www.89ip.cn")

}
/**
  * Messages exchanged between the downloader, processor and pipeline actors.
  *
  * @author : tong.wang
  * @since : 2019-05-01 22:56
  * @version : 1.0.0
  */
sealed trait Event

/** Ask the downloader actor to fetch `request` on behalf of `spider`. */
case class DownloadEvent(spider: Spider, request: RequestSetting) extends Event

/** Hand a successfully downloaded page over to the page-processor actor. */
case class ProcessorEvent(spider: Spider, page: Page) extends Event

/** One crawl result plus the pipelines that should consume it. */
case class PipelineEvent[R](pipelineList: Set[Pipeline], pageResultItems: (String, R)) extends Event with Logging {

  /**
    * Opens every pipeline (open() is idempotent) and returns this event only
    * if all succeeded. Each pipeline is attempted — no short-circuit — so every
    * individual failure gets logged.
    */
  def initPipelines(): Option[PipelineEvent[R]] = {
    val allOpened = pipelineList
      .map { pipeline =>
        Try(pipeline.open()) match {
          case Success(_) => true
          case Failure(exception) =>
            logger.error(s"failed to init pipeline ${exception.getLocalizedMessage}")
            false
        }
      }
      .forall(identity) // idiomatic replacement for `.forall(_ == true)`

    if (allOpened) Some(this) else None
  }
}
/**
  * The result of one download: raw bytes plus response metadata, and the
  * mutable work products of processing (result items, follow-up requests).
  *
  * @param isDownloadSuccess whether the download completed successfully
  * @param bytes             raw response body, if any
  * @param responseHeaders   response headers keyed by name
  * @param xhrResponses      captured XHR responses (headless-browser downloads)
  * @param requestSetting    the request that produced this page
  *
  * @author : tong.wang
  * @since : 1/2/20 9:41 PM
  * @version : 1.0.0
  */
case class Page(
    isDownloadSuccess: Boolean = true,
    bytes: Option[Array[Byte]] = None,
    responseHeaders: Map[String, String] = Map.empty[String, String],
    xhrResponses: Seq[XhrResponse] = Seq.empty[XhrResponse],
    requestSetting: RequestSetting) {

  // Items extracted by the processor, drained later by the pipeline actor.
  lazy val resultItems: LinkedBlockingQueue[Any] = new LinkedBlockingQueue[Any]
  // Follow-up requests discovered on this page.
  lazy val requestQueue: TargetRequestTaskQueue = new TargetRequestTaskQueue()

  // Explicit result type per the project's scalafix ExplicitResultTypes rule.
  // NOTE(review): assumes requestSetting.url is always set for a fetched page.
  lazy val url: String = requestSetting.url.get

  /** Decode the body bytes using the request's charset, or fail if there are none. */
  def source: String = bytes match {
    case Some(byte) =>
      HtmlParser.getHtmlSourceWithCharset(byte, requestSetting.charset)
    case None =>
      throw new IllegalStateException("no page source text found ")
  }

  /** Parse `text` (default: the page source) as JSON into `T`. */
  def json[T: Manifest](text: Option[String] = None): T = parseJson[T](text.getOrElse(this.source))

  /** Queue a follow-up GET inheriting this page's request settings. */
  def addTargetRequest(urlAdd: String): Unit = addRequest(this.requestSetting.withUrl(url = urlAdd))

  /** Queue a fully-specified follow-up request. */
  def addTargetRequest(requestUri: RequestUri): Unit = addRequest(this.requestSetting.withRequestUri(requestUri))

  // Only syntactically valid URLs are enqueued; malformed ones are silently dropped.
  private[this] def addRequest(requestSetting: RequestSetting): Unit = {
    val url = requestSetting.url.get

    if (Try(new URL(url)).isSuccess) {
      this.requestQueue.push(requestSetting)
    }
  }

  /** Record one extracted result item for the pipelines. */
  def addPageResultItem[R](result: R): Unit = this.resultItems.add(result)

  override def toString: String = s"${requestSetting.url.get} downloaded ${isDownloadSuccess}"
}

object Page extends Logging {

  /** Build a failed Page for `requestSetting`, logging the cause. */
  def failed(requestSetting: RequestSetting, exceptionMessage: Throwable): Page = {
    logger.warn(s"failed to download cause ${exceptionMessage.getLocalizedMessage}")
    Page(requestSetting = requestSetting, isDownloadSuccess = false)
  }
}

/** One captured XHR call made while rendering a page: its URI and parsed body. */
case class XhrResponse(xhrUri: String, result: Map[String, Any])
/**
  * A single target to crawl, used to override per-request fields of a
  * processor's base [[RequestSetting]].
  *
  * @param url         absolute URL to fetch
  * @param method      HTTP method name (defaults to GET)
  * @param requestBody optional request body (e.g. for POST)
  * @param headers     extra headers merged onto the base setting, if any
  * @param xhrRequests XHR URI fragments to capture during headless rendering
  */
case class RequestUri(
    url: String,
    method: String = HttpMethod.GET.toString,
    requestBody: Option[String] = None,
    headers: Option[Map[String, String]] = None,
    xhrRequests: Set[String] = Set.empty[String])
/**
  * Base contract for crawl-result pipelines. `open()` is idempotent: the
  * underlying `init()` hook runs at most once however many times (and from
  * however many threads) it is invoked.
  *
  * @author : tong.wang
  * @since : 5/16/18 9:09 PM
  * @version : 1.0.0
  */
trait Pipeline extends Logging {

  // Flips false -> true exactly once, guarding init() against repeat calls.
  private val initialized = new AtomicBoolean(false)

  def open(): Unit =
    if (initialized.compareAndSet(false, true)) init()

  /** One-time setup hook for subclasses; no-op by default. */
  protected def init(): Unit = ()

  /** Consume one (url, result) pair produced by a page processor. */
  def process[Result](pageResultItem: (String, Result)): Unit
}
/**
  * Actor that fans each crawl result out to its configured pipelines.
  *
  * @author : tong.wang
  * @since : 5/16/18 11:54 PM
  * @version : 1.0.0
  */
class PipelineActorReceiver extends Actor {

  private lazy val logger: Logger = LoggerFactory.getLogger(classOf[PipelineActorReceiver])

  override def receive: Receive = {
    case event: PipelineEvent[_] =>
      // With trace logging enabled, mirror every result to the console too.
      val targets =
        if (logger.isTraceEnabled()) event.pipelineList + ConsolePipeline
        else event.pipelineList

      targets.foreach(pipeline => pipeline.process(event.pageResultItems))
    case other =>
      logger.warn(s"${this.getClass.getSimpleName} received wrong msg ${other}")
  }
}
AsyncHttpClientDownloader 25 | 26 | /** 27 | * the target urls for processor to crawl 28 | * 29 | * @return 30 | */ 31 | @deprecated 32 | def targetUrls: List[String] = Nil 33 | 34 | /** 35 | * the target request for processor to crawl 36 | * @return 37 | */ 38 | def targetRequests: List[RequestUri] = 39 | if (targetUrls.nonEmpty) { 40 | targetUrls.map(url => RequestUri(url)) 41 | } else { 42 | List.empty[RequestUri] 43 | } 44 | 45 | /** 46 | * handle the crawled result 47 | * 48 | * @return 49 | */ 50 | val pipelines: Set[Pipeline] = Set(ConsolePipeline) 51 | 52 | /** 53 | * parse the html source code 54 | * 55 | * @param page 56 | */ 57 | def process(page: Page): Future[Unit] = Future { 58 | try { 59 | doProcess(page) 60 | } catch { 61 | case e: Throwable => 62 | logger.error(s"failed to process page ${page.url} -> ${page.source}", e) 63 | throw e 64 | } 65 | } 66 | 67 | protected def doProcess(page: Page): Unit 68 | 69 | /** 70 | * set request config for processor 71 | * 72 | * @return 73 | */ 74 | def requestSetting: RequestSetting = RequestSetting(url = None) 75 | 76 | /** 77 | * schedule cron job expression 78 | * 79 | * @return 80 | */ 81 | def cronExpression: Option[String] = None 82 | 83 | } 84 | -------------------------------------------------------------------------------- /crawler-core/src/main/scala/io/github/wtog/crawler/processor/PageProcessorActorReceiver.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.processor 2 | 3 | import java.util.concurrent.LinkedBlockingQueue 4 | 5 | import akka.actor.{ Actor, ActorRef, PoisonPill, Props } 6 | import io.github.wtog.crawler.actor.ExecutionContexts.processorDispatcher 7 | import io.github.wtog.crawler.dto.{ DownloadEvent, PipelineEvent, ProcessorEvent } 8 | import io.github.wtog.crawler.pipeline.{ Pipeline, PipelineActorReceiver } 9 | import io.github.wtog.crawler.queue.RequestQueue 10 | import io.github.wtog.crawler.spider.Spider 11 | 
import org.slf4j.{ Logger, LoggerFactory }

import scala.util.{ Failure, Success }

/**
 * Receives [[ProcessorEvent]]s, runs the spider's page processor on the
 * downloaded page, forwards parsed items to the pipeline actor and feeds
 * newly discovered requests back to the downloader.
 *
 * @author : tong.wang
 * @since : 5/16/18 11:54 PM
 * @version : 1.0.0
 */
class PageProcessorActorReceiver extends Actor {

  private lazy val logger: Logger = LoggerFactory.getLogger(classOf[PageProcessorActorReceiver])

  // Child actor running pipeline work on its own dedicated dispatcher.
  val pipelineActor: ActorRef = context.actorOf(Props[PipelineActorReceiver].withDispatcher("crawler.pipeline-dispatcher"), "pipeline-processor")

  override def receive: Receive = {
    case processorEvent: ProcessorEvent =>
      val page           = processorEvent.page
      val spider         = processorEvent.spider
      val downloadSender = sender()

      // FIX: the original used `foreach`, which silently discards a failed
      // Future (and left the imported Success/Failure unused); onComplete
      // logs processing failures instead of letting them vanish.
      spider.pageProcessor.process(page).onComplete {
        case Success(_) =>
          pipelineProcess(page.requestSetting.url.get, page.resultItems)(spider.pageProcessor.pipelines)(downloadSender)
          spider.CrawlMetric.processedSuccessCounter

          continueRequest(page.requestQueue)(spider)(downloadSender)
        case Failure(e) =>
          logger.error(s"failed to process page ${page.requestSetting.url}", e)
      }
    case other =>
      logger.warn(s"${self.path} received wrong msg ${other}")
  }

  /** Drain the page's result items into the pipeline actor; an event whose
    * pipelines fail to initialize stops the download sender. */
  private[this] def pipelineProcess(url: String, pageResultItems: LinkedBlockingQueue[Any])(pipelines: Set[Pipeline])(downloadSender: ActorRef): Unit =
    while (!pageResultItems.isEmpty) {
      Option(pageResultItems.poll()).foreach { item =>
        PipelineEvent(pipelines, (url, item)).initPipelines() match {
          case Some(e) => pipelineActor ! e
          case None    => downloadSender ! PoisonPill
        }
      }
    }

  /** Push every queued follow-up request back to the downloader actor. */
  private[this] def continueRequest(targetRequests: RequestQueue)(spider: Spider)(downloadSender: ActorRef): Unit =
    while (targetRequests.nonEmpty) {
      targetRequests.poll().foreach { targetRequest =>
        downloadSender ! DownloadEvent(spider, targetRequest)
      }
    }
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/DuplicateRemovedQueue.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue

import io.github.wtog.crawler.dto.RequestSetting
import io.github.wtog.crawler.queue.duplicate.DuplicateRemovedStrategy

/**
 * Request queue that drops requests already seen, according to the supplied
 * [[DuplicateRemovedStrategy]].
 *
 * @author : tong.wang
 * @since : 5/16/18 10:07 PM
 * @version : 1.0.0
 */
abstract class DuplicateRemovedQueue(duplicateRemovedStrategy: DuplicateRemovedStrategy) extends RequestQueue {

  override def push(request: RequestSetting): Unit =
    if (isNotDuplicateRequest(request)) {
      pushWhenNoDuplicate(request)
    }

  // GET requests are keyed by url alone; POST requests also include the body,
  // so identical urls with different payloads are deduplicated separately.
  private def isNotDuplicateRequest(requestHeaderGeneral: RequestSetting): Boolean =
    requestHeaderGeneral.method match {
      case "GET" =>
        !duplicateRemovedStrategy.isDuplicate(requestHeaderGeneral.url.get)
      case "POST" =>
        !duplicateRemovedStrategy.isDuplicate(
          requestHeaderGeneral.url.get + requestHeaderGeneral.requestBody
            .getOrElse("")
        )
      case other =>
        logger.warn(s"unknown request method type: ${other}")
        true
    }

  /** Enqueue a request that passed duplicate filtering. */
  protected def pushWhenNoDuplicate(request: RequestSetting): Unit
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/RequestQueue.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue

import java.util.concurrent.TimeUnit

import io.github.wtog.crawler.dto.RequestSetting
import org.slf4j.{ Logger, LoggerFactory }

/**
 * @author : tong.wang
 * @since : 5/16/18 10:03 PM
 * @version : 1.0.0
 */
trait RequestQueue {
  protected val logger:
Logger = LoggerFactory.getLogger(this.getClass)

  /** Enqueue a request (implementations may apply duplicate filtering). */
  def push(request: RequestSetting): Unit

  /**
    * Take the next request, honouring its configured politeness delay.
    * NOTE(review): the sleep runs on the caller's thread — confirm this is
    * the intended back-pressure mechanism.
    */
  def poll(): Option[RequestSetting] =
    doPoll().map { request =>
      TimeUnit.MILLISECONDS.sleep(request.sleepTime.toMillis)
      request
    }

  /** Dequeue without any delay handling. */
  protected def doPoll(): Option[RequestSetting]

  def isEmpty: Boolean

  def nonEmpty: Boolean = !isEmpty
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/TargetRequestTaskQueue.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue

import java.util.concurrent.LinkedBlockingQueue

import io.github.wtog.crawler.dto.RequestSetting
import io.github.wtog.crawler.queue.duplicate.{ DuplicateRemovedStrategy, HashMapStrategy }

/**
 * In-memory FIFO of pending requests with duplicate removal
 * (hash-map based by default).
 *
 * @author : tong.wang
 * @since : 5/16/18 10:12 PM
 * @version : 1.0.0
 */
class TargetRequestTaskQueue(duplicateRemovedStrategy: DuplicateRemovedStrategy = HashMapStrategy) extends DuplicateRemovedQueue(duplicateRemovedStrategy) {
  private lazy val queue: LinkedBlockingQueue[RequestSetting] = new LinkedBlockingQueue[RequestSetting]()

  override def pushWhenNoDuplicate(request: RequestSetting): Unit = this.queue.add(request)

  override def isEmpty: Boolean = queue.isEmpty

  override def doPoll(): Option[RequestSetting] = Option(this.queue.poll())

}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/BitSetStrategy.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue.duplicate

import scala.collection.BitSet

/**
 * Duplicate detection backed by a bit set of url hash codes.
 *
 * NOTE(review): `urlBitSet` is an unsynchronized `var`; concurrent callers
 * may race on check-then-update — confirm single-threaded use or add locking.
 *
 * @author : tong.wang
 * @since : 11/8/18 1:31 PM
 * @version : 1.0.0
 */
object BitSetStrategy extends DuplicateRemovedStrategy {
  var urlBitSet = BitSet.empty

  override def isDuplicate(url: String): Boolean = {
    val bit = urlToHashCode(url)
    if (urlBitSet.contains(bit)) {
      true
    } else {
      // First sighting: remember it and report "not a duplicate".
      urlBitSet += bit
      false
    }
  }

  /** Map a url to a non-negative hash usable as a bit index. */
  def urlToHashCode(url: String): Int =
    url.hashCode & 0x7FFFFFFF
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/DuplicateRemovedStrategy.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue.duplicate

/**
 * Strategy deciding whether a url has been seen before.
 *
 * @author : tong.wang
 * @since : 6/1/18 8:44 AM
 * @version : 1.0.0
 */
trait DuplicateRemovedStrategy {

  /** True when the url was already recorded by this strategy. */
  def isDuplicate(url: String): Boolean

}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/queue/duplicate/HashMapStrategy.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.queue.duplicate

import java.util.concurrent.ConcurrentHashMap

import scala.concurrent.duration._

/**
 * Duplicate detection keyed by url hash code with a last-seen timestamp,
 * allowing a url to become eligible again after a fixed interval.
 *
 * @author : tong.wang
 * @since : 6/1/18 11:59 PM
 * @version : 1.0.0
 */
object HashMapStrategy extends DuplicateRemovedStrategy {
  private[this] val urlMap: ConcurrentHashMap[Int, Long] = new ConcurrentHashMap()

  override def isDuplicate(url: String): Boolean = {
    val urlHashCode = url.hashCode

    urlMap.containsKey(urlHashCode) match {
      case duplicated @ true if (passedMinutes(urlMap.get(urlHashCode), 10 minutes)) =>
        // Seen before but stale: refresh the timestamp and allow a re-crawl.
        urlMap.put(urlHashCode, System.currentTimeMillis())
        !duplicated
      case duplicated @ true =>
        duplicated
      case nonDuplicated @ false =>
        urlMap.put(urlHashCode,
System.currentTimeMillis())
        nonDuplicated
    }

  }

  /**
   * True when `duration` has elapsed since the stored `latest` timestamp.
   *
   * FIX: the original computed `latest - System.currentTimeMillis()`, which is
   * always negative for a past timestamp, so the threshold was never exceeded
   * and urls were never eligible for re-crawling. Compare `now - latest`.
   */
  private[this] def passedMinutes(latest: Long, duration: Duration) = (System.currentTimeMillis() - latest) > duration.toMillis

}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/rest/NettyServer.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.rest

import java.net.URI

import io.netty.bootstrap.ServerBootstrap
import io.netty.buffer.Unpooled
import io.netty.channel._
import io.netty.channel.nio.NioEventLoopGroup
import io.netty.channel.socket.SocketChannel
import io.netty.channel.socket.nio.NioServerSocketChannel
import io.netty.handler.codec.http._
import io.netty.handler.logging.{ LogLevel, LoggingHandler }
import io.netty.handler.ssl.SslContext

/**
 * Netty-backed HTTP server exposing the crawler's REST routes.
 *
 * @author : tong.wang
 * @since : 2019-08-28 10:24
 * @version : 1.0.0
 */
object NettyServer extends Server {
  override def doStart(routes: Set[Router]): Unit = {
    val bossGroup   = new NioEventLoopGroup(1)
    val workerGroup = new NioEventLoopGroup()
    try {
      val bootstrap = new ServerBootstrap()
      bootstrap
        .group(bossGroup, workerGroup)
        .channel(classOf[NioServerSocketChannel])
        .handler(new LoggingHandler(LogLevel.INFO))
        .childHandler(new ServerInitializer(routes = routes ++ defaultRoutes))

      // Blocks until the channel closes; Server.start runs this on a
      // dedicated single-thread executor, so blocking here is intentional.
      val channel = bootstrap.bind(port).sync().channel()
      channel.closeFuture().sync()
    } finally {
      bossGroup.shutdownGracefully()
      workerGroup.shutdownGracefully()
    }
  }

}

/** Wires the HTTP codec and the router handler into each accepted channel. */
class ServerInitializer(routes: Set[Router], sslContext: Option[SslContext] = None) extends ChannelInitializer[SocketChannel] {

  override def channelReadComplete(ctx: ChannelHandlerContext): Unit = ctx.flush()

  override def
initChannel(channel: SocketChannel): Unit =
    channel
      .pipeline()
      .addLast(new HttpRequestDecoder)
      .addLast(new HttpResponseEncoder)
      // NOTE(review): 1024-byte aggregation limit — confirm requests never
      // exceed this, otherwise Netty rejects them.
      .addLast(new HttpObjectAggregator(1024))
      .addLast(new RouterHandler(routes))

}

/** Dispatches aggregated HTTP requests to the matching [[Router]]. */
class RouterHandler(routes: Set[Router]) extends SimpleChannelInboundHandler[FullHttpRequest](true) {

  import io.netty.handler.codec.http.HttpResponseStatus._

  def channelRead0(ctx: ChannelHandlerContext, request: FullHttpRequest): Unit =
    try {
      val path       = new URI(request.uri()).getPath
      val httpMethod = request.method().name()

      routes.find(r => r.method == httpMethod && r.route == path) match {
        case Some(handler) =>
          responseOk(request, handler.handleRequest(request))(ctx)
        case None =>
          responseNotFound(request, "not found".getBytes())(ctx)
      }
    } catch {
      // Route handlers may throw anything; reported to the client as a 400.
      case e: Throwable =>
        responseBadRequest(request, e.getLocalizedMessage.getBytes())(ctx)
    }

  override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = {
    // TODO(review): route through a logger instead of stderr.
    cause.printStackTrace()
    ctx.close()
  }

  private[this] def responseBadRequest(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, BAD_REQUEST, request, resp)

  private[this] def responseOk(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, OK, request, resp)

  private[this] def responseNotFound(request: FullHttpRequest, resp: Array[Byte])(ctx: ChannelHandlerContext) =
    response(ctx, NOT_FOUND, request, resp)

  /** Write a plain-text response, honouring HTTP keep-alive. */
  private[this] def response(ctx: ChannelHandlerContext, status: HttpResponseStatus, req: FullHttpRequest, resp: Array[Byte]) = {
    import io.netty.handler.codec.http.HttpHeaderNames._
    import io.netty.handler.codec.http.HttpVersion._

    val keepAlive = HttpUtil.isKeepAlive(req)
    val content   = Unpooled.copiedBuffer(resp)
    val response  = new DefaultFullHttpResponse(HTTP_1_1, status, content)

    response.headers.set(HttpHeaderNames.CONTENT_TYPE, "text/plain")
    response.headers.set(HttpHeaderNames.CONTENT_LENGTH, content.readableBytes)

    if (!keepAlive) {
      ctx.write(response).addListener(ChannelFutureListener.CLOSE)
    } else {
      response.headers().set(CONNECTION, HttpHeaderNames.KEEP_ALIVE)
      ctx.write(response)
    }
    ctx.flush()
  }

}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/rest/Router.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.rest

import io.github.wtog.crawler.spider.SpiderPool
import io.github.wtog.utils.JsonUtils
import io.netty.handler.codec.http.FullHttpRequest

/**
 * A single HTTP endpoint: method + path + handler.
 *
 * @author : tong.wang
 * @since : 2019-08-28 10:38
 * @version : 1.0.0
 */
trait Router {

  def method: String

  def route: String

  def handleRequest(request: FullHttpRequest): Array[Byte]

  implicit def toBytes(content: String): Array[Byte] = content.getBytes()
}

/** GET /spiders — crawl metrics for every registered spider, as JSON. */
object SpiderStatusRoute extends Router {
  override def method: String = "GET"

  override def route: String = "/spiders"

  override def handleRequest(request: FullHttpRequest): Array[Byte] = {
    // Same ordering as the original foldLeft-with-prepend (reversed order).
    val results = SpiderPool.fetchAllSpiders().map(_.CrawlMetric.metricInfo()).reverse.toList
    JsonUtils.toJson(results)
  }
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/rest/Server.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.rest

import java.util.concurrent.Executors

import io.github.wtog.utils.ConfigUtils
import io.github.wtog.utils.logger.Logging

import scala.concurrent.{ ExecutionContext, Future }
import scala.util.control.NonFatal

/**
 * Lifecycle for the embedded REST server; `doStart` is supplied by the
 * concrete implementation and runs on its own single-thread executor.
 *
 * @author : tong.wang
 * @since : 2019-08-28 10:24
 * @version : 1.0.0
 */
trait Server extends Logging {

  @volatile var running = false

  /**
    * Start the server once; subsequent calls are no-ops.
    * NOTE(review): `running` is set inside the Future, so this may return
    * `false` on the very first call, and returns `true` even though `doStart`
    * can still fail asynchronously (which resets `running`) — confirm callers
    * tolerate this.
    */
  def start(routes: Set[Router]): Boolean = {
    if (!running) {
      Future {
        running = true
        try {
          doStart(routes)
        } catch {
          case NonFatal(e) =>
            logger.error(e.getLocalizedMessage)
            running = false
        }
      }(ExecutionContext.fromExecutor(Executors.newSingleThreadExecutor()))
    } else {
      running = true
    }
    running
  }

  protected def doStart(routes: Set[Router]): Unit

  val defaultRoutes: Set[Router] = Set(SpiderStatusRoute)

  // Configurable listen port, defaulting to 19000.
  val port: Int = ConfigUtils.getIntOpt("crawler.server.port").getOrElse(19000)

}

object Server {
  val serverInstance = NettyServer

  def start(routes: Set[Router] = Set.empty[Router]): Boolean = serverInstance.start(routes)

  def running: Boolean = serverInstance.running
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/schedule/ScheduleJobs.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.schedule

import org.quartz.impl.StdSchedulerFactory
import org.quartz.{ JobBuilder, _ }

/**
 * Thin wrapper around the Quartz scheduler for cron-style re-crawls.
 *
 * @author : tong.wang
 * @since : 2018-12-08 23:48
 * @version : 1.0.0
 */
object ScheduleJobs {
  private lazy val scheduler = new StdSchedulerFactory().getScheduler()

  /** Register and start a job unless one with the same key already exists. */
  def addJob[C <: Job](scheduleJob: ScheduleJob[C]): Unit =
    if (!scheduler.checkExists(scheduleJob.jobKey)) {
      val trigger =
TriggerBuilder.newTrigger().withSchedule(CronScheduleBuilder.cronSchedule(scheduleJob.cronExpression)).build()
      val job     = JobBuilder.newJob(scheduleJob.task).withIdentity(scheduleJob.jobKey).build

      scheduler.scheduleJob(job, trigger)
      // Start the scheduler (delayed) once at least one job is registered.
      scheduler.startDelayed(1)
    }

  /** Stop the scheduler, waiting for running jobs to finish. */
  def shutdown(): Unit = scheduler.shutdown(true)
}

/**
 * Job definition: name, cron expression and the Quartz Job class to run.
 * The group defaults to the job name when none is given.
 */
case class ScheduleJob[C <: Job](jobName: String, cronExpression: String, task: Class[C], groupName: Option[String] = None) {
  val group: String = groupName.getOrElse(jobName)
  val jobKey        = new JobKey(jobName, group)
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/selector/HtmlParser.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.selector

import java.nio.charset.Charset

import io.github.wtog.crawler.dto.Page
import io.github.wtog.utils.JsonUtils
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

import scala.collection.JavaConverters._
import org.jsoup.nodes.Document

/**
 * Jsoup-based convenience wrappers for querying a crawled [[Page]].
 *
 * @author : tong.wang
 * @since : 5/18/18 12:32 AM
 * @version : 1.0.0
 */
trait HtmlParser {

  implicit class PageWrapper(page: Page) {

    // Parsed lazily so pages that never inspect the DOM pay no parsing cost.
    lazy val document: Document = Jsoup.parse(page.source, page.requestSetting.url.getOrElse(""))

    lazy val title: String = document.title()

    lazy val body: Element = document.body()

    def div(element: String): Elements = document.select(element)

    def dom(query: String): Elements = document.select(query)

    def table(query: String): Seq[Element] = document.select(s"table ${query}").asScala.toSeq

    /** All absolute (http/https) link targets on the page. */
    def hrefs: Seq[String] = document.select("a").toSeq.collect {
      case e if e.attr("href").startsWith("http") =>
        e.attr("href")
    }

  }

  implicit class ElementsWrapper(elements: Elements) {
    def getText(query: String): String = elements.select(query).text()

    def getElements(query: String): Elements = elements.select(query)

    def toSeq: Seq[Element] = elements.asScala.toSeq
  }

  implicit class ElementWrapper(element: Element) {
    def getText(query: String): String = element.select(query).text()
  }

}

object HtmlParser {

  /** Read a single top-level key from a JSON object. */
  def getValueFromJson[T: Manifest](json: String, key: String): Option[T] = JsonUtils.parseFrom[Map[String, T]](json).get(key)

  def parseJson[T: Manifest](json: String): T = JsonUtils.parseFrom[T](json)

  /**
    * Decode html bytes, honouring the charset declared in the document's own
    * meta tags (html4 `content="...charset=x"` or html5 `charset="x"`),
    * falling back to `defaultCharset`.
    */
  def getHtmlSourceWithCharset(contentBytes: Array[Byte], defaultCharset: String = Charset.defaultCharset().name()): String = {

    val content         = new String(contentBytes, defaultCharset)
    val metas: Elements = Jsoup.parse(content).select("meta")

    val metaContent = metas.attr("content")

    val actualCharset =
      if (metaContent.contains("charset")) { // html4 style declaration
        metaContent
          .substring(metaContent.indexOf("charset"), metaContent.length)
          .split("=")(1)
      } else { // html5 style declaration
        metas.attr("charset")
      }

    // Re-decode only when the declared charset differs from the default
    // (charset names are ASCII, so case-insensitive compare is equivalent).
    if (actualCharset.isEmpty || actualCharset.equalsIgnoreCase(defaultCharset)) {
      content
    } else {
      new String(contentBytes, actualCharset)
    }
  }
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/selector/Selector.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.selector

/**
 * Minimal text-selection contract.
 *
 * @author : tong.wang
 * @since : 5/18/18 12:27 AM
 * @version : 1.0.0
 */
trait Selector {
  def select(text: String): String

}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/spider/Spider.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.spider

import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.{ AtomicBoolean, AtomicInteger }

import akka.actor.{ ActorRef, PoisonPill, Props }
import io.github.wtog.crawler.actor.ActorManager
import io.github.wtog.crawler.downloader.{ ChromeHeadlessConfig, ChromeHeadlessDownloader, DownloaderActorReceiver }
import io.github.wtog.crawler.dto.DownloadEvent
import io.github.wtog.crawler.processor.PageProcessor
import org.slf4j.LoggerFactory

import scala.concurrent.ExecutionContext.Implicits._
import scala.concurrent.Future

/**
 * Drives one crawl: creates a downloader actor, feeds it the processor's seed
 * requests and tracks crawl metrics.
 *
 * @author : tong.wang
 * @since : 4/10/18 11:34 AM
 * @version : 1.0.0
 */
case class Spider(pageProcessor: PageProcessor) {

  private lazy val logger = LoggerFactory.getLogger(classOf[Spider])

  private var downloaderActorPath = ""

  val running: AtomicBoolean = new AtomicBoolean(false)

  val name: String = pageProcessor.name

  /** Start crawling once; repeated calls while running are no-ops. */
  def start(): Unit =
    if (!running.getAndSet(true)) {
      // if (pageProcessor.downloader.isInstanceOf[ChromeHeadlessDownloader.type] && ChromeHeadlessConfig.chromeDriverNotExecutable) {
      //   throw new IllegalStateException("""
      //     |cant find chrome driver to execute.
      //     |choose one chrome driver from https://npm.taobao.org/mirrors/chromedriver/70.0.3538.16/ to download and install into your system
      //   """.stripMargin)
      // }

      val downloaderActor: ActorRef = getDownloadActor
      execute(downloaderActor)
      SpiderPool.addSpider(this)
    }

  /** Create a fresh downloader actor with a unique, timestamped path. */
  private def getDownloadActor: ActorRef = {
    downloaderActorPath = s"downloader-${name}-${System.currentTimeMillis()}"
    val downloaderActor = ActorManager.getNewSystemActor(
      "downloader-dispatcher",
      downloaderActorPath,
      props = Props[DownloaderActorReceiver]
    )
    downloaderActor
  }

  def restart(): Unit = {
    if (running.get()) {
      this.stop()
    }

    start()
  }

  /** Stop the downloader actor, deregister the spider and reset metrics. */
  def stop(): Unit =
    if (running.getAndSet(false)) {
      ActorManager.getExistedActor(downloaderActorPath) ! PoisonPill
      SpiderPool.removeSpider(this)
      this.CrawlMetric.clean()
    }

  // Seeds the downloader with the processor's initial requests, pausing
  // between sends according to the configured politeness delay.
  private def execute(downloaderActor: ActorRef): Future[Unit] =
    Future {
      this.pageProcessor.targetRequests.foreach { url =>
        downloaderActor ! DownloadEvent(
          spider = this,
          request = pageProcessor.requestSetting.withRequestUri(url)
        )
        TimeUnit.MILLISECONDS.sleep(this.pageProcessor.requestSetting.sleepTime.toMillis)
      }
    }

  /** Crawl counters. The *Counter methods increment and return the old value. */
  object CrawlMetric {
    private val downloadPageSuccessNum = new AtomicInteger(0)
    private val downloadPageFailedNum  = new AtomicInteger(0)
    private val processPageSuccessNum  = new AtomicInteger(0)

    def downloadedPageSum: Int = downloadPageSuccessNum.get() + downloadPageFailedNum.get()

    def downloadSuccessCounter: Int = downloadPageSuccessNum.getAndIncrement()

    def downloadFailedCounter: Int = downloadPageFailedNum.getAndIncrement()

    def processedSuccessCounter: Int = processPageSuccessNum.getAndIncrement()

    def clean(): Unit = {
      downloadPageSuccessNum.set(0)
      downloadPageFailedNum.set(0)
      processPageSuccessNum.set(0)
    }

    def record(success: Boolean, url: String): Unit = {
      if (logger.isDebugEnabled())
        logger.debug(s"downloaded: ${success} ${url}")
      if (success) downloadSuccessCounter
      else downloadFailedCounter
    }

    /**
      * Read-only metrics snapshot.
      *
      * FIX: the original reported `"downloaded" -> downloadSuccessCounter`,
      * which calls getAndIncrement and therefore corrupted the success
      * counter on every metrics read; use a plain `get()` instead.
      */
    def metricInfo(): Map[String, Any] = Map(
      "spider"     -> name,
      "total"      -> downloadedPageSum,
      "downloaded" -> downloadPageSuccessNum.get(),
      "processed"  -> processPageSuccessNum.get()
    )

  }

  override def toString: String = s"spider-${name}: ${CrawlMetric.downloadedPageSum}"
}

// ---------------------------------------------------------------------------
// crawler-core/src/main/scala/io/github/wtog/crawler/spider/SpiderPool.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.spider

import java.util.concurrent.ConcurrentHashMap

import io.github.wtog.crawler.downloader.proxy.ProxyProvider
import io.github.wtog.crawler.schedule.{ ScheduleJob, ScheduleJobs }
import org.quartz.{ Job, JobExecutionContext }

/**
 * Registry of running spiders; wires up cron re-crawls and the proxy crawler
 * when a spider needs them.
 *
 * @author : tong.wang
 * @since : 9/15/18 10:51 AM
 * @version : 1.0.0
 */
object SpiderPool {

  private[this] val spiders = new ConcurrentHashMap[String, Spider]()

  def addSpider(spider: Spider): Unit = {
    spiders.putIfAbsent(spider.name, spider)

    spider.pageProcessor.cronExpression.foreach { cron =>
      ScheduleJobs.addJob(ScheduleJob(jobName = spider.name, cronExpression = cron, task = classOf[SpiderScheduleJob]))
    }

    if (spider.pageProcessor.requestSetting.useProxy) {
      ProxyProvider.startProxyCrawl()
    }
  }

  def removeSpider(spider: Spider): Spider = spiders.remove(spider.name)

  def getSpiderByName(name: String): Option[Spider] = Option(spiders.get(name))

  def fetchAllSpiders(): Array[Spider] = spiders.values().toArray().map(_.asInstanceOf[Spider])

  def fetchAllUsingProxySpiders(): Array[Spider] = fetchAllSpiders().filter(_.pageProcessor.requestSetting.useProxy)

}

/** Quartz job that restarts the spider whose name matches the job key. */
class SpiderScheduleJob() extends Job {
  override def execute(jobExecutionContext: JobExecutionContext): Unit =
    SpiderPool
      .getSpiderByName(jobExecutionContext.getJobDetail.getKey.getName)
      .foreach(_.restart())
}

// ---------------------------------------------------------------------------
// crawler-core/src/test/resources/application-test.conf
//   include "reference.conf"
// ---------------------------------------------------------------------------
// crawler-core/src/test/resources/log4j2-test.xml
//   (xml markup was stripped in this scrape; content not reproducible here)
// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/BaseCoreTest.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test

import java.util.concurrent.atomic.AtomicInteger

import io.github.wtog.crawler.dto.{ Page, RequestSetting }
import io.github.wtog.crawler.processor.PageProcessor
import io.github.wtog.crawler.rest.Server
import io.github.wtog.crawler.test.server.TestMockServer
import io.github.wtog.utils.ConfigUtils
import io.github.wtog.utils.test.BaseTest

/**
 * Shared fixture for core tests: boots the mock HTTP server and provides a
 * processor pointed at localhost.
 *
 * @author : tong.wang
 * @since : 5/16/18 9:19 PM
 * @version : 1.0.0
 */
trait BaseCoreTest extends BaseTest {

  lazy val port = ConfigUtils.getIntOpt("crawler.server.port").getOrElse(19000)

  lazy val localServerHost = s"http://localhost:${port}"

  override def beforeAll() = {
    // FIX: the original called System.getProperty(key, default), which only
    // READS a property and has no side effect; the evident intent is to point
    // the test JVM at the test config/logging resources, so set them instead.
    System.setProperty("config.resource", "application-test.conf")
    System.setProperty("log4j.resource", "log4j2-test.xml")
    if (!Server.running)
      TestMockServer.start
  }

  lazy val requestSettingTest = RequestSetting(
    domain = "www.baidu.com",
    url = Some("https://www.baidu.com/s?wd=wtog%20web-crawler")
  )

  /** Processor hitting the local mock server; counts discovered links. */
  case class LocalProcessor(requestSettingTest: Option[RequestSetting] = None) extends PageProcessor {

    val link = new AtomicInteger(0)

    override def targetUrls: List[String] = List(localServerHost)

    override protected def doProcess(page: Page): Unit = {
      assert(page.isDownloadSuccess)
      page.addTargetRequest(s"${page.url}?id=${link.incrementAndGet()}")
    }

    override def requestSetting: RequestSetting = {
      requestSettingTest.getOrElse(
        RequestSetting(
          domain = "localhost",
          url = Some(s"http://localhost:${port}/mock/get")
        ))
    }

    /** Absolute url for a mock-server route. */
    def getUrl(route: String): String = s"http://localhost:${port}${route}"
  }

  // NOTE(review): apply() overlaps with apply(Option = None) via its default
  // argument — confirm overload resolution matches callers' intent.
  object LocalProcessor {
    def apply(): LocalProcessor = new LocalProcessor()

    def apply(requestSetting: Option[RequestSetting] = None): LocalProcessor = new LocalProcessor(requestSetting)

    def apply(requestSetting: RequestSetting): LocalProcessor = new LocalProcessor(Some(requestSetting))

  }

}

// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/actor/ActorTestBase.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test.actor

import akka.actor.ActorSystem
import akka.testkit.TestKit
import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpecLike }

/**
 * Base harness for actor tests: Akka TestKit plus WordSpec-style matchers.
 *
 * @author : tong.wang
 * @since : 2019-04-22 08:06
 * @version : 1.0.0
 */
abstract class ActorTestBase extends TestKit(ActorSystem("testsystem")) with WordSpecLike
  with Matchers
  with BeforeAndAfterAll

// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/actor/PageProcessorActorTestkit.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test.actor

import java.util.concurrent.TimeUnit

import akka.actor.Props
import akka.testkit.TestProbe
import io.github.wtog.crawler.dto
import io.github.wtog.crawler.dto.{ DownloadEvent, Page, ProcessorEvent, RequestSetting }
import io.github.wtog.crawler.processor._
import io.github.wtog.crawler.spider.Spider

import scala.concurrent.duration._

/**
 * @author : tong.wang
 * @since : 2019-04-19 22:52
 * @version : 1.0.0
 */
class PageProcessorActorTestkit extends ActorTestBase {

  private class TestProcessor extends PageProcessor {
override def targetUrls: List[String] = List("http://test")

    override protected def doProcess(page: Page): Unit = {
      (1 to 10).foreach(i => page.addTargetRequest(s"${page.url}/$i"))
    }

    override def requestSetting: RequestSetting = RequestSetting()
  }

  "pageProcessor" must {
    "send" in {
      val processorReceiver = system.actorOf(props = Props[PageProcessorActorReceiver])
      val probe             = new TestProbe(system)

      val testProcessor = new TestProcessor()
      val pages = testProcessor.targetUrls.map { url =>
        dto.Page(bytes = Some("hh".getBytes()), requestSetting = testProcessor.requestSetting.withUrl(url))
      }

      val spider = Spider(pageProcessor = testProcessor)
      pages.foreach { p =>
        probe.send(processorReceiver, ProcessorEvent(spider, p))
      }

      // One DownloadEvent is expected per link added by the processor.
      (1 to 10).foreach { i =>
        TimeUnit.SECONDS.sleep(1)
        probe.expectMsg(2 seconds, DownloadEvent(spider, RequestSetting(url = Some(s"http://test/$i"))))
      }
    }

  }
}

// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/download/AsyncHttpClientTest.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test.download

import com.google.common.net.{ HttpHeaders, MediaType }
import io.github.wtog.crawler.downloader.AsyncHttpClientDownloader
import io.github.wtog.crawler.dto.RequestUri
import io.github.wtog.crawler.spider.Spider
import io.github.wtog.crawler.test.BaseCoreTest
import io.github.wtog.crawler.test.server.TestPostMockRoute
import io.github.wtog.utils.JsonUtils
import org.scalatest.BeforeAndAfter

/**
 * @author : tong.wang
 * @since : 5/20/18 11:22 AM
 * @version : 1.0.0
 */
class AsyncHttpClientTest extends BaseCoreTest with BeforeAndAfter {

  after {
    AsyncHttpClientDownloader.closeClient()
  }

  lazy val localProcessor = LocalProcessor()
  lazy val url            = localProcessor.requestSetting.url.get

  test("asynchttpclient driver close resource safely") {
    val page = await(AsyncHttpClientDownloader.download(Spider(pageProcessor = localProcessor), request = localProcessor.requestSetting.withUrl(s"${url}?id=0")))

    assert(page.isDownloadSuccess)
    assert(page.source.nonEmpty)
    AsyncHttpClientDownloader.closeClient()

    // After closing, no client should remain cached for the domain.
    assert(AsyncHttpClientDownloader.getClient(localProcessor.requestSetting.domain).isEmpty)
  }

  test("async http client post download") {
    val request = RequestUri(
      url = localProcessor.getUrl(TestPostMockRoute.route),
      method = TestPostMockRoute.method,
      requestBody = Some(JsonUtils.toJson(Map("a" -> "b"))),
      headers = Some(Map(HttpHeaders.CONTENT_TYPE -> MediaType.FORM_DATA.toString))
    )
    val page = await(AsyncHttpClientDownloader.download(Spider(pageProcessor = localProcessor), request = localProcessor.requestSetting.withRequestUri(request)))

    assert(page.isDownloadSuccess)
    assert(page.source.nonEmpty)

    AsyncHttpClientDownloader.closeClient()
  }
}

// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/download/ChromeHeadlessDownloaderTest.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test.download

import io.github.wtog.crawler.downloader.ChromeHeadlessDownloader
import io.github.wtog.crawler.spider.Spider
import io.github.wtog.crawler.test.BaseCoreTest
import org.scalatest.BeforeAndAfter

/**
 * @author : tong.wang
 * @since : 1/12/20 10:48 PM
 * @version : 1.0.0
 */
class ChromeHeadlessDownloaderTest extends BaseCoreTest with BeforeAndAfter {

  after {
    ChromeHeadlessDownloader.closeClient()
  }

  lazy val localProcessor = LocalProcessor()

  // Requires a locally installed chromedriver; kept ignored for CI.
  ignore("chrome driver get xhr response") {
    val url = "https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E6%88%90%E9%83%BD&searchDepartureTime=2020-03-11&searchArrivalTime=2020-03-12&nextNDays=0&startSearch=true&fromCode=BJS&toCode=CTU&from=qunarindex&lowestPrice=nul"

    val page = await(ChromeHeadlessDownloader.download(
      spider = Spider(pageProcessor = localProcessor),
      request = localProcessor.requestSetting.withUrl(url).withXhrRequests("https://flight.qunar.com/touch/api/domestic/wbdflightlist")
    ))

    assert(page.isDownloadSuccess)

    println(page.xhrResponses)
  }

  ignore("chrome driver on linux") {
    val page = await(ChromeHeadlessDownloader.download(
      spider = Spider(pageProcessor = localProcessor),
      request = localProcessor.requestSetting
    ))

    assert(page.isDownloadSuccess)
    println(page.source)
  }

}

// ---------------------------------------------------------------------------
// crawler-core/src/test/scala/io/github/wtog/crawler/test/processor/HtmlParserSpec.scala
// ---------------------------------------------------------------------------

package io.github.wtog.crawler.test.processor

import io.github.wtog.crawler.dto
import io.github.wtog.crawler.dto.{ Page, RequestSetting }
import io.github.wtog.crawler.selector.HtmlParser
import io.github.wtog.crawler.test.BaseCoreTest

/**
 * @author : tong.wang
 * @since : 2019-05-02 21:49
 * @version : 1.0.0
 */
class HtmlParserSpec extends BaseCoreTest with HtmlParser {

  test("page json") {
    val pageJsonObj =
      """
        |{
        |  "id": 1,
        |  "name": "test"
        |}
      """.stripMargin

    val pageJson = Page(bytes = Some(pageJsonObj.getBytes()), requestSetting =
RequestSetting()).json[Map[String, Any]]() 25 | 26 | assert(pageJson("id") == 1) 27 | assert(pageJson("name") == "test") 28 | 29 | val pageListJson = 30 | """ 31 | |[ 32 | | { 33 | | "id": 1, 34 | | "name": "test" 35 | | } 36 | |] 37 | """.stripMargin 38 | 39 | val pageJsonList = dto.Page(bytes = Some(pageListJson.getBytes()), requestSetting = RequestSetting()).json[List[Map[String, Any]]]() 40 | 41 | assert(pageJsonList.head("id") == 1) 42 | assert(pageJsonList.head("name") == "test") 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /crawler-core/src/test/scala/io/github/wtog/crawler/test/proxy/ProxyProviderTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.test.proxy 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import io.github.wtog.crawler.downloader.proxy.ProxyProvider 6 | import io.github.wtog.crawler.downloader.proxy.crawler.{A2UPageProcessor, Data5UPageProcessor, IP89Processor} 7 | import io.github.wtog.crawler.dto.RequestSetting 8 | import io.github.wtog.crawler.spider.Spider 9 | import io.github.wtog.crawler.test.BaseCoreTest 10 | 11 | /** 12 | * @author : tong.wang 13 | * @since : 2019-05-14 22:37 14 | * @version : 1.0.0 15 | */ 16 | class ProxyProviderTest extends BaseCoreTest { 17 | 18 | test("spider use proxy with request setting useProxy=true") { 19 | val request = RequestSetting( 20 | url = Some(localServerHost), 21 | useProxy = true 22 | ) 23 | 24 | Spider(pageProcessor = LocalProcessor(request)).start() 25 | assert(ProxyProvider.proxySpiderCrawling.get()) 26 | } 27 | 28 | ignore("a2u proxy") { 29 | Spider(pageProcessor = A2UPageProcessor()).start() 30 | TimeUnit.SECONDS.sleep(10) 31 | } 32 | 33 | ignore("data5u proxy") { 34 | Spider(pageProcessor = Data5UPageProcessor()).start() 35 | TimeUnit.SECONDS.sleep(10) 36 | } 37 | 38 | ignore("ip89 proxy") { 39 | Spider(pageProcessor = IP89Processor()).start() 40 | 
TimeUnit.SECONDS.sleep(10) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /crawler-core/src/test/scala/io/github/wtog/crawler/test/queue/DuplicateStrategyTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.test.queue 2 | 3 | import io.github.wtog.crawler.queue.duplicate.{BitSetStrategy, HashMapStrategy} 4 | import org.scalatest.FunSuite 5 | 6 | /** 7 | * @author : tong.wang 8 | * @since : 12/8/19 12:27 AM 9 | * @version : 1.0.0 10 | */ 11 | class DuplicateStrategyTest extends FunSuite { 12 | val urls = Seq("url1", "url1", "url2") 13 | 14 | 15 | private def removeDuplicated(isDuplicate: String => Boolean): Seq[String] = { 16 | urls.collect { case x if (!isDuplicate(x)) => x } 17 | } 18 | 19 | test("hashMap remove duplicate") { 20 | assert(urls.distinct == removeDuplicated(HashMapStrategy.isDuplicate)) 21 | } 22 | 23 | test("bitset remove duplicate") { 24 | assert(urls.distinct == removeDuplicated(BitSetStrategy.isDuplicate)) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /crawler-core/src/test/scala/io/github/wtog/crawler/test/schedule/ScheduleTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.test.schedule 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import io.github.wtog.crawler.schedule.{ScheduleJob, ScheduleJobs} 6 | import io.github.wtog.crawler.test.BaseCoreTest 7 | import org.quartz.{Job, JobExecutionContext} 8 | 9 | /** 10 | * @author : tong.wang 11 | * @since : 2019-05-12 23:11 12 | * @version : 1.0.0 13 | */ 14 | class ScheduleTest extends BaseCoreTest with Job { 15 | 16 | val intervalPrintJob = ScheduleJob(jobName = "intervalPrintJob", cronExpression = "*/1 * * ? 
* *", classOf[ScheduleTest], groupName = Some("test")) 17 | 18 | override def execute(context: JobExecutionContext): Unit = { 19 | assert(context.getJobDetail.getKey == intervalPrintJob.jobKey) 20 | } 21 | 22 | test("addJob") { 23 | ScheduleJobs.addJob(intervalPrintJob) 24 | TimeUnit.SECONDS.sleep(2) 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /crawler-core/src/test/scala/io/github/wtog/crawler/test/server/TestMockServer.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.test.server 2 | 3 | import java.nio.charset.Charset 4 | 5 | import io.github.wtog.crawler.rest.{Router, Server} 6 | import io.github.wtog.utils.JsonUtils 7 | import io.github.wtog.utils.logger.Logging 8 | import io.netty.handler.codec.http.{FullHttpRequest, HttpMethod} 9 | 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.mutable 12 | 13 | /** 14 | * @author : tong.wang 15 | * @since : 2019-08-01 09:10 16 | * @version : 1.0.0 17 | */ 18 | object TestMockServer { 19 | def start = Server.start(Set(TestGetMockRoute, TestPostMockRoute)) 20 | } 21 | 22 | object TestGetMockRoute extends Router with Logging { 23 | override def method: String = HttpMethod.GET.toString 24 | 25 | override def route: String = s"/mock/get" 26 | 27 | override def handleRequest(request: FullHttpRequest): Array[Byte] = { 28 | val resp = request.headers().entries().asScala.foldLeft(Map.empty[String, String]){ (map, entry) => 29 | map ++ Map(entry.getKey -> entry.getValue) 30 | } 31 | 32 | val json = JsonUtils.toJson(resp) 33 | logger.info(s"TestGetMock ${json}") 34 | json.getBytes() 35 | } 36 | } 37 | 38 | object TestPostMockRoute extends Router { 39 | override def method: String = HttpMethod.POST.toString 40 | 41 | override def route: String = s"/mock/post" 42 | 43 | override def handleRequest(request: FullHttpRequest): Array[Byte] = { 44 | val resp = 
mutable.Map.empty[String, Any] 45 | val headers = request.headers().entries().asScala.foldLeft(Map.empty[String, String]){ (map, entry) => 46 | map ++ Map(entry.getKey -> entry.getValue) 47 | } 48 | 49 | val content = request.content().toString(Charset.defaultCharset()) 50 | val bodyMap = JsonUtils.parseFrom[Map[String, Any]](content) 51 | 52 | resp += ("requestHeaders" -> headers) 53 | resp += ("body" -> bodyMap) 54 | 55 | JsonUtils.toJson(resp).getBytes() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /crawler-example/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /crawler-example/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | crawler { 2 | 3 | downloader-dispatcher { 4 | type = Dispatcher 5 | executor = "thread-pool-executor" 6 | 7 | thread-pool-executor { 8 | core-pool-size-min = 5 9 | core-pool-size-factor = 2 10 | core-pool-size-max = 10 11 | } 12 | } 13 | 14 | processor-dispatcher { 15 | type = Dispatcher 16 | executor = "fork-join-executor" 17 | fork-join-executor { 18 | parallelism-min = 2 19 | parallelism-factor = 4.0 20 | parallelism-max = 10 21 | } 22 | throughput = 50 23 | } 24 | 25 | pipeline-dispatcher { 26 | type = Dispatcher 27 | executor = "thread-pool-executor" 28 | 29 | thread-pool-executor { 30 | core-pool-size-min = 4 31 | core-pool-size-factor = 2 32 | core-pool-size-max = 8 33 | } 34 | } 35 | 36 | download.retry.exception = ["java.util.concurrent.TimeoutException"] 37 | 38 | server.port = 19000 39 | 40 | chrome { 41 | driver: "/usr/bin/chromedriver" 42 | } 43 | } 44 | 45 | crawler-examples { 
46 | BaiduPageProcessor.enable: false 47 | 48 | LianjiaErshouFangProcessor.enable: false 49 | 50 | LianjiaRentingProcessor.enable: false 51 | 52 | QunarPageProcessor.enable: true 53 | 54 | ZhihuAnswerPageProcessor.enable: false 55 | } 56 | 57 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/ExampleTrait.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example 2 | 3 | import io.github.wtog.crawler.processor.PageProcessor 4 | import io.github.wtog.utils.ConfigUtils 5 | 6 | /** 7 | * @author : tong.wang 8 | * @since : 1/14/20 8:31 PM 9 | * @version : 1.0.0 10 | */ 11 | trait ExampleTrait extends PageProcessor { 12 | 13 | val enable: Boolean = ConfigUtils.getBooleanOpt(s"crawler-examples.${this.getClass.getSimpleName}.enable").getOrElse(false) 14 | 15 | } 16 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/Main.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example 2 | 3 | import io.github.wtog.crawler.processor.PageProcessor 4 | import io.github.wtog.crawler.spider.Spider 5 | import io.github.wtog.utils.ReflectionUtils 6 | 7 | import scala.io.StdIn 8 | import scala.util.Try 9 | 10 | /** 11 | * @author : tong.wang 12 | * @since : 6/14/18 11:40 PM 13 | * @version : 1.0.0 14 | */ 15 | object Main { 16 | 17 | def printProcessors(processorList: Seq[(ExampleTrait, Int)]): Unit = { 18 | for ((service, order) ← processorList) { 19 | println(s"\t${order}. 
${service.getClass.getSimpleName}") 20 | } 21 | println("") 22 | } 23 | 24 | def main(args: Array[String]): Unit = { 25 | val processorList = ReflectionUtils 26 | .implementationClasses(classOf[ExampleTrait], "io.github.wtog.example") 27 | .map(_.newInstance()) 28 | .filter(_.enable) 29 | .sortWith(_.getClass.getSimpleName < _.getClass.getSimpleName) 30 | .zip(Stream.from(1)) 31 | 32 | val execProcessors = args match { 33 | case args: Array[String] if args.isEmpty || args.contains("0") ⇒ 34 | println("executing all enabled processors") 35 | printProcessors(processorList) 36 | processorList 37 | case args: Array[String] if (args.nonEmpty && args.toSeq.forall(arg ⇒ Try(arg.toInt).isSuccess)) ⇒ 38 | val processors = processorList.filter { 39 | case (_, order) ⇒ 40 | args.contains(order.toString) 41 | } 42 | println(s"executing ${processors.map(_._1.getClass.getSimpleName).mkString(",")}") 43 | processors 44 | case _ ⇒ 45 | println("\nshow page processor list: ") 46 | println("\t0. all") 47 | 48 | printProcessors(processorList) 49 | 50 | println("\nchoose number to execute.") 51 | println("input like 1,2,3 means to execute 1 and 2 and 3 processor") 52 | 53 | val chooseNumber = StdIn.readLine() 54 | val chosen = chooseNumber.split(",").distinct 55 | 56 | val executeProcessor = 57 | if (chosen.isEmpty || chooseNumber.contains("0")) { 58 | println("execute all processor") 59 | processorList 60 | } else { 61 | processorList.filter { 62 | case (_, order) ⇒ chosen.contains(order.toString) 63 | } 64 | } 65 | 66 | executeProcessor 67 | } 68 | 69 | startSpiders(execProcessors) 70 | } 71 | 72 | def startSpiders(processorList: Seq[(PageProcessor, Int)]): Unit = 73 | processorList.foreach { 74 | case (processor, _) ⇒ 75 | Spider(pageProcessor = processor).start() 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/impl/BaiduPageProcessor.scala: 
-------------------------------------------------------------------------------- 1 | package io.github.wtog.example.impl 2 | 3 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 4 | import io.github.wtog.example.ExampleTrait 5 | 6 | import scala.collection.mutable 7 | import scala.concurrent.duration._ 8 | 9 | /** 10 | * @author : tong.wang 11 | * @since : 5/16/18 11:42 PM 12 | * @version : 1.0.0 13 | */ 14 | class BaiduPageProcessor() extends ExampleTrait { 15 | 16 | override def doProcess(page: Page): Unit = { 17 | val hotSearched = page.div("#content_right .opr-toplist1-table tr") 18 | 19 | val href = hotSearched.select("td a").attr("href") 20 | val content = hotSearched.select("td a").text() 21 | val hot = hotSearched.select("td").last().text() 22 | 23 | page.addPageResultItem( 24 | Map("href" -> href, "content" -> content, "hot" -> hot) 25 | ) 26 | 27 | } 28 | 29 | override def requestSetting: RequestSetting = 30 | RequestSetting( 31 | domain = "www.baidu.com", 32 | headers = mutable.Map("Content-Type" -> "text/html; charset=GB2312"), 33 | sleepTime = 1 seconds 34 | ) 35 | 36 | override def targetUrls: List[String] = List("https://www.baidu.com/s?wd=wtog%20web-crawler") 37 | 38 | override def cronExpression: Option[String] = Some("*/30 * * ? 
* *") 39 | } 40 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/impl/LianjiaErshouFangProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example.impl 2 | 3 | import java.time.ZonedDateTime 4 | import java.time.format.DateTimeFormatter 5 | import java.util.concurrent.atomic.AtomicInteger 6 | 7 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 8 | import io.github.wtog.crawler.pipeline.Pipeline 9 | import io.github.wtog.crawler.pipeline.db.{ DataSource, DataSourceInfo, PostgreSQLPipeline } 10 | import io.github.wtog.example.ExampleTrait 11 | import io.github.wtog.utils.JsonUtils 12 | import io.github.wtog.utils.StringUtils._ 13 | import org.jsoup.nodes.Element 14 | 15 | import scala.concurrent.duration._ 16 | import scala.util.Random 17 | import scala.util.matching.Regex 18 | 19 | /** 20 | * @author : tong.wang 21 | * @since : 10/11/19 10:01 APM 22 | * @version : 1.0.0 23 | */ 24 | class LianjiaErshouFangProcessor extends ExampleTrait { 25 | 26 | val pageNo = new AtomicInteger(1) 27 | val houseDetailRegex: Regex = """(.*ershoufang)/([\d]+).(html$)""".r 28 | val houseListRegex: Regex = """(.*ershoufang/pg[\d]+/$)""".r 29 | 30 | val queryDomValue: (String, Map[String, Seq[Element]], Element => String) => String = (typ: String, elements: Map[String, Seq[Element]], getDomValue: Element => String) => elements.get(typ).fold("")(e => getDomValue(e.head)) 31 | val getLiText: Element => String = (e: Element) => e.childNodes.get(1).toString 32 | val getLastSpanText: Element => String = (e: Element) => e.select("span").last().text() 33 | 34 | def getPage: Int = if (pageNo.get() >= 100) pageNo.getAndSet(1) else pageNo.incrementAndGet() 35 | 36 | override def doProcess(page: Page): Unit = 37 | page.requestSetting.url.get match { 38 | case houseListRegex(_) => 39 | addHouseDetail(page) 40 | 
page.addTargetRequest(s"https://bj.lianjia.com/ershoufang/pg${getPage}/") 41 | case houseDetailRegex(_, houseCode, _) => 42 | val overviewContent = page.dom(".overview .content") 43 | val pageShoufu = page.dom(".new-calculator").attr("data-shoufu") 44 | 45 | val (evaluationPrice, priceTotal) = pageShoufu match { 46 | case shoufu if shoufu.nonEmpty => 47 | val newCalculator = JsonUtils.parseFrom[Map[String, Any]](shoufu) 48 | val evaluation = newCalculator.get("evaluation").get.asInstanceOf[Int] 49 | val total = newCalculator.get("price").fold(0)(_.asInstanceOf[String].toInt) 50 | (evaluation, total) 51 | case "" => 52 | (0, 0) 53 | } 54 | 55 | val price = overviewContent.select(".price") 56 | val removed = price.hasClass("isRemove") 57 | val pricePerMeter = price.getText(".unitPriceValue").replace("元/平米", "") 58 | val room = overviewContent.getElements(".room") 59 | val roomMainInfoText = room.getText(".mainInfo") 60 | val roomSubInfoText = room.getText(".subInfo") 61 | val roomType = overviewContent.getElements(".type") 62 | val roomTypeMainInfoText = roomType.getText(".mainInfo") 63 | val roomTypeSubInfoText = roomType.getText(".subInfo") 64 | val roomArea = overviewContent.getElements(".area") 65 | val roomAreaMainInfo = roomArea.getText(".mainInfo").replace("平米", "") 66 | val roomAreaSubInfo = roomArea.getText(".subInfo") 67 | val aroundInfo = overviewContent.getElements(".aroundInfo") 68 | val subdistrict = aroundInfo.getElements("a").first().text() 69 | val communityAreaName = aroundInfo.getText(".areaName").replace("所在区域", "").split(" ") 70 | 71 | val (areaName, community, communityDetail) = (communityAreaName.headOption, communityAreaName.tail.headOption, communityAreaName.lastOption) 72 | 73 | val infoContent = page.dom(".m-content .base .content li") 74 | val basic = infoContent.toSeq.groupBy(e => e.select("span").text) 75 | 76 | val buildType = queryDomValue("建筑类型", basic, getLiText) 77 | val buildStruct = queryDomValue("建筑结构", basic, getLiText) 78 | val 
decoration: String = queryDomValue("装修情况", basic, getLiText) 79 | val householdLadder: String = queryDomValue("梯户比例", basic, getLiText) 80 | val heating: String = queryDomValue("供暖方式", basic, getLiText) 81 | val elevator: String = queryDomValue("配备电梯", basic, getLiText) 82 | val houseRight: String = queryDomValue("产权年限", basic, getLiText) 83 | 84 | val transactionContent = page.dom(".m-content .transaction .content li") 85 | 86 | val info = transactionContent.toSeq.groupBy(e => e.select("span").first().text) 87 | val saleTime: String = queryDomValue("挂牌时间", info, getLastSpanText) 88 | val tradingRight: String = queryDomValue("交易权属", info, getLastSpanText) 89 | val lastSale: String = queryDomValue("上次交易", info, getLastSpanText) 90 | val housingUse: String = queryDomValue("房屋用途", info, getLastSpanText) 91 | val houseYears: String = queryDomValue("房屋年限", info, getLastSpanText) 92 | val houseRightOwner: String = queryDomValue("产权所属", info, getLastSpanText) 93 | val mortgageInfo: String = queryDomValue("抵押信息", info, getLastSpanText) 94 | 95 | val house = House( 96 | houseCode = houseCode, 97 | totalPrice = priceTotal.toInt, 98 | evaluationPrice = evaluationPrice, 99 | meterPrice = pricePerMeter.toInt, 100 | roomMainInfo = roomMainInfoText, 101 | roomSubInfo = roomSubInfoText, 102 | roomTypeMainInfo = roomTypeMainInfoText, 103 | roomTypeSubInfo = roomTypeSubInfoText, 104 | roomAreaMainInfo = roomAreaMainInfo, 105 | roomAreaSubInfo = roomAreaSubInfo, 106 | subdistrict = subdistrict, 107 | areaName = areaName, 108 | community = community, 109 | communityDetail = communityDetail, 110 | buildType = buildType, 111 | buildStruct = buildStruct, 112 | decoration = decoration, 113 | householdladder = householdLadder, 114 | heating = heating, 115 | elevator = elevator, 116 | houseRight = houseRight, 117 | saleTime = saleTime, 118 | tradingRight = tradingRight, 119 | lastSale = lastSale, 120 | housingUse = housingUse, 121 | houseYears = houseYears, 122 | houseRightOwner = 
houseRightOwner, 123 | mortgageInfo = mortgageInfo, 124 | removed = removed 125 | ) 126 | 127 | page.addPageResultItem[Map[String, Any]](house.toMap) 128 | case other ⇒ 129 | println(other) 130 | } 131 | 132 | def addHouseDetail(page: Page): Unit = { 133 | val detailHrefs = page.dom(".sellListContent li div.title a") 134 | detailHrefs.toSeq.foreach(d => page.addTargetRequest(d.attr("href"))) 135 | } 136 | 137 | override val pipelines: Set[Pipeline] = Set( 138 | // CsvFilePipeline(Some("ershoufang.csv")), 139 | PostgreSQLPipeline(DataSourceInfo(database = this.getClass.getSimpleName, jdbcUrl = "jdbc:postgresql://127.0.0.1:5432/magicbox", username = "wtog", password = "")) { (db: String, result: Map[String, Any]) => 140 | val (keys, values) = result.unzip 141 | DataSource.rows[Int]("select count(1) from house where house_code = ?", Seq(result("houseCode").asInstanceOf[String]))(r => r.getInt(1))(db).headOption.getOrElse(0) match { 142 | case 0 => 143 | DataSource.executeUpdate(s"insert into house (${keys.map(_.toUnderscore).mkString(",")}) values (${Seq.fill[String](keys.size)("?").mkString(",")})", values.toSeq)(db) 144 | case _ => 145 | DataSource.executeUpdate( 146 | s"update house set ${keys.map(c => s"${c.toUnderscore} = ?").mkString(",")}, updated_at = '${ZonedDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}' where house_code = ? 
", 147 | (values.toSeq ++ Seq(result("houseCode"))) 148 | )(db) 149 | } 150 | } 151 | ) 152 | 153 | override def requestSetting: RequestSetting = RequestSetting( 154 | domain = "www.lianjia.com", 155 | sleepTime = (Random.nextInt(3) + 5) seconds, 156 | useProxy = true 157 | ) 158 | 159 | override def targetUrls: List[String] = List("https://bj.lianjia.com/ershoufang/pg1/") 160 | 161 | } 162 | 163 | case class House( 164 | id: Option[Int] = None, 165 | houseCode: String, 166 | totalPrice: Int, 167 | evaluationPrice: Int, 168 | meterPrice: Int, 169 | roomMainInfo: String, 170 | roomSubInfo: String, 171 | roomTypeMainInfo: String, 172 | roomTypeSubInfo: String, 173 | roomAreaMainInfo: String, 174 | roomAreaSubInfo: String, 175 | subdistrict: String, 176 | areaName: Option[String], 177 | community: Option[String], 178 | communityDetail: Option[String], 179 | buildType: String, 180 | buildStruct: String, 181 | decoration: String, 182 | householdladder: String, 183 | heating: String, 184 | elevator: String, 185 | houseRight: String, 186 | saleTime: String, 187 | tradingRight: String, 188 | lastSale: String, 189 | housingUse: String, 190 | houseYears: String, 191 | houseRightOwner: String, 192 | mortgageInfo: String, 193 | removed: Boolean = false) 194 | 195 | object House { 196 | 197 | implicit class HouseWrapper(house: House) { 198 | def toMap: Map[String, Any] = JsonUtils.toMap(house) 199 | } 200 | 201 | } 202 | 203 | object HouseType extends Enumeration { 204 | type HouseType = Value 205 | 206 | val ERSHOU: Value = Value("ershou") 207 | val NEW: Value = Value("new") 208 | } 209 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/impl/LianjiaRentingProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example.impl 2 | 3 | import java.time.ZonedDateTime 4 | import java.time.format.DateTimeFormatter 5 | import 
java.util.concurrent.atomic.AtomicInteger 6 | 7 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 8 | import io.github.wtog.crawler.pipeline.Pipeline 9 | import io.github.wtog.crawler.pipeline.db.{ DataSource, DataSourceInfo, PostgreSQLPipeline } 10 | import io.github.wtog.example.ExampleTrait 11 | import io.github.wtog.utils.JsonUtils 12 | import io.github.wtog.utils.StringUtils._ 13 | import org.jsoup.nodes.Element 14 | 15 | import scala.concurrent.duration._ 16 | import scala.util.Random 17 | import scala.util.matching.Regex 18 | 19 | /** 20 | * @author : tong.wang 21 | * @since : 10/11/19 10:01 APM 22 | * @version : 1.0.0 23 | */ 24 | class LianjiaRentingProcessor extends ExampleTrait { 25 | val pageNo = new AtomicInteger(1) 26 | val houseDetailRegex: Regex = """(.*zufang)/([BJ\d]+).(html$)""".r 27 | val houseListRegex: Regex = """(.*)/(zufang)/(pg[\d]+/$)""".r 28 | val houseCodeRegex: Regex = """[BJ\d]+""".r 29 | val houseFloor: Regex = """[\d]+""".r 30 | 31 | val queryDomValue: (String, Map[String, Seq[Element]], Element => String) => String = (typ: String, elements: Map[String, Seq[Element]], getDomValue: Element => String) => elements.get(typ).fold("")(e => getDomValue(e.head)) 32 | val getLiText: Element => String = (e: Element) => e.childNodes.get(1).toString 33 | val getLastSpanText: Element => String = (e: Element) => e.select("span").last().text() 34 | 35 | def getPage: Int = if (pageNo.get() >= 100) pageNo.getAndSet(1) else pageNo.incrementAndGet() 36 | 37 | override def doProcess(page: Page): Unit = 38 | page.requestSetting.url.get match { 39 | case houseListRegex(domain, _, _) => 40 | val details = page.dom(".content__list--item").toSeq 41 | 42 | details.foreach { detail => 43 | val rentingWay = detail.select(".content__list--item--title").text().substring(0, 2) 44 | val href = s"${domain}${detail.select("a").attr("href")}" 45 | 46 | val des = detail.select(".content__list--item--des").text().split("/") 47 | 48 | val area = 
des.head.split("-") 49 | val areaName = area.head.trim 50 | val community = area.lift(1).get.trim 51 | val communityDetail = area.last.trim 52 | val meter = des(1).replace("㎡", "").trim.toInt 53 | val direction = des(2).trim 54 | val typ = des(3).trim 55 | val floor = des(4).trim 56 | 57 | val bottom = detail.select(".content__list--item--bottom i") 58 | 59 | val subway = bottom.select(".content__item__tag--is_subway_house").text().trim 60 | val decoration = bottom.select(".content__item__tag--decoration").text().trim 61 | val heating = bottom.select(".content__item__tag--central_heating").text().trim 62 | 63 | val rentingHouse = RentingHouse( 64 | houseCode = houseCodeRegex.findFirstIn(href).get, 65 | price = Some(detail.select(".content__list--item-price em").text().toInt), 66 | rentingWay = Some(rentingWay), 67 | area = Some(areaName), 68 | community = Some(community), 69 | communityDetail = Some(communityDetail), 70 | meter = Some(meter), 71 | direction = Some(direction), 72 | typ = Some(typ), 73 | floor = houseFloor.findFirstIn(floor).map(_.toInt), 74 | subway = if (subway.nonEmpty) Some(subway) else None, 75 | decoration = if (decoration.nonEmpty) Some(decoration) else None, 76 | heating = if (heating.nonEmpty) Some(heating) else None 77 | ) 78 | page.addPageResultItem(JsonUtils.toMap(rentingHouse)) 79 | // page.addTargetRequest(s"${domain}${detail.select("a").attr("href")}") 80 | } 81 | page.addTargetRequest(s"https://bj.lianjia.com/zufang/pg${getPage}/") 82 | case houseDetailRegex(_, houseCode, _) => 83 | //todo append detail 84 | 85 | case other ⇒ 86 | println(other) 87 | } 88 | 89 | override val pipelines: Set[Pipeline] = Set( 90 | PostgreSQLPipeline(DataSourceInfo(database = "renting", jdbcUrl = "jdbc:postgresql://127.0.0.1:5432/magicbox", username = "wtog", password = "")) { (db: String, result: Map[String, Any]) => 91 | val (keys, values) = result.unzip 92 | DataSource.rows[Int]("select count(1) from renting where house_code = ?", 
Seq(result("houseCode").asInstanceOf[String]))(r => r.getInt(1))(db).headOption.getOrElse(0) match { 93 | case 0 => 94 | DataSource.executeUpdate(s"insert into renting (${keys.map(_.toUnderscore).mkString(",")}) values (${Seq.fill[String](keys.size)("?").mkString(",")})", values.toSeq)(db) 95 | case _ => 96 | DataSource.executeUpdate( 97 | s"update renting set ${keys.map(c => s"${c.toUnderscore} = ?").mkString(",")}, updated_at = '${ZonedDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}' where house_code = ? ", 98 | (values.toSeq ++ Seq(result("houseCode"))) 99 | )(db) 100 | } 101 | } 102 | ) 103 | 104 | override def requestSetting: RequestSetting = RequestSetting( 105 | domain = "bj.lianjia.com", 106 | sleepTime = (Random.nextInt(3) + 5) seconds, 107 | useProxy = true 108 | ) 109 | 110 | override def targetUrls: List[String] = List("https://bj.lianjia.com/zufang/pg1/") 111 | 112 | override def cronExpression: Option[String] = Some("0 0/30 * * * ?") 113 | } 114 | 115 | case class RentingHouse( 116 | id: Option[Int] = None, 117 | houseCode: String, 118 | price: Option[Int] = None, 119 | rentingWay: Option[String] = None, 120 | area: Option[String] = None, 121 | community: Option[String] = None, 122 | communityDetail: Option[String] = None, 123 | meter: Option[Int] = None, 124 | direction: Option[String] = None, 125 | typ: Option[String] = None, 126 | floor: Option[Int] = None, 127 | subway: Option[String] = None, 128 | decoration: Option[String] = None, 129 | heating: Option[String] = None) 130 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/impl/ZhihuAnswerPageProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example.impl 2 | 3 | import io.github.wtog.crawler.dto.{ Page, RequestSetting } 4 | import io.github.wtog.example.ExampleTrait 5 | 6 | import scala.concurrent.duration._ 7 
| 8 | case class ZhihuAnswerPageProcessor() extends ExampleTrait { 9 | 10 | override def doProcess(page: Page): Unit = { 11 | val result = page.json[Map[String, Any]]() 12 | 13 | result.get("data").foreach { answers => 14 | answers.asInstanceOf[List[Map[String, Any]]].foreach { answer => 15 | val question = answer("question").asInstanceOf[Map[String, String]]("title") 16 | val answerContent = answer("content") 17 | page.addPageResultItem(Map("question" -> question, "answer" -> answerContent)) 18 | } 19 | } 20 | 21 | val nextPage = result("paging").asInstanceOf[Map[String, String]].get("next") 22 | 23 | nextPage.foreach { url => 24 | page.addTargetRequest(url.replaceAll("https://www.zhihu.com", "$0/api/v4")) 25 | } 26 | } 27 | 28 | override def requestSetting: RequestSetting = 29 | RequestSetting( 30 | domain = "www.zhihu.com", 31 | timeOut = 10 seconds, 32 | sleepTime = 3 seconds 33 | ) 34 | 35 | override def targetUrls: List[String] = List( 36 | "https://www.zhihu.com/api/v4/members/rednaxelafx/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=10&sort_by=created" 37 | ) 38 | 39 | override def cronExpression: Option[String] = None 40 | } 41 | -------------------------------------------------------------------------------- /crawler-example/src/main/scala/io/github/wtog/example/impl/flight/QunarPageProcessor.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.example.impl.flight 2 | 3 | import 
io.github.wtog.crawler.downloader.{ ChromeHeadlessDownloader, Downloader } 4 | import io.github.wtog.crawler.dto.{ Page, RequestSetting, RequestUri } 5 | import io.github.wtog.example.ExampleTrait 6 | 7 | import scala.concurrent.duration._ 8 | 9 | /** 10 | * @author : tong.wang 11 | * @since : 1/14/20 8:41 PM 12 | * @version : 1.0.0 13 | */ 14 | class QunarPageProcessor extends ExampleTrait { 15 | 16 | override def targetRequests: List[RequestUri] = List( 17 | RequestUri("https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E6%88%90%E9%83%BD&searchDepartureTime=2020-01-11&searchArrivalTime=2020-01-15&nextNDays=0&startSearch=true&fromCode=BJS&toCode=CTU&from=qunarindex&lowestPrice=null") 18 | ) 19 | 20 | override def requestSetting: RequestSetting = RequestSetting( 21 | domain = "flight.qunar.com", 22 | sleepTime = 1 seconds, 23 | xhrRequests = Set("https://flight.qunar.com/touch/api/domestic/wbdflightlist") 24 | ) 25 | 26 | override protected def doProcess(page: Page): Unit = 27 | page.xhrResponses.foreach { response => 28 | println(response.result) 29 | } 30 | 31 | override val downloader: Downloader[_] = ChromeHeadlessDownloader 32 | } 33 | -------------------------------------------------------------------------------- /crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/DataSource.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.db 2 | 3 | import java.sql._ 4 | import java.util.concurrent.ConcurrentHashMap 5 | 6 | import com.zaxxer.hikari.HikariDataSource 7 | import io.github.wtog.utils.logger.Logging 8 | 9 | import scala.collection.mutable.ListBuffer 10 | import scala.util.control.NonFatal 11 | 12 | /** 13 | * @author : tong.wang 14 | * @since : 10/29/19 8:27 PM 15 | * @version : 1.0.0 16 | */ 17 | trait DataSource extends Logging { 18 | 19 | protected val driverClass: String 20 | 21 | } 22 | 23 | object 
DataSource extends Logging { 24 | private val pools: ConcurrentHashMap[String, HikariDataSource] = new ConcurrentHashMap[String, HikariDataSource]() 25 | 26 | def initConnection(driverClass: String, dataSouceInfo: DataSourceInfo): Unit = 27 | try { 28 | val database = dataSouceInfo.database 29 | Option(pools.get(database)).getOrElse { 30 | val hikariDataSource = new HikariDataSource() 31 | hikariDataSource.setDriverClassName(driverClass) 32 | hikariDataSource.setJdbcUrl(dataSouceInfo.jdbcUrl) 33 | hikariDataSource.setUsername(dataSouceInfo.username) 34 | hikariDataSource.setPassword(dataSouceInfo.password) 35 | hikariDataSource.setMaximumPoolSize(dataSouceInfo.maxPoolSize) 36 | hikariDataSource.setMinimumIdle(dataSouceInfo.minIdleSize) 37 | hikariDataSource.setAutoCommit(true) 38 | hikariDataSource.setIdleTimeout(dataSouceInfo.idleTimeout.toMillis) 39 | hikariDataSource.setValidationTimeout(1000) 40 | hikariDataSource.setPoolName(database) 41 | 42 | pools.put(database, hikariDataSource) 43 | } 44 | } catch { 45 | case NonFatal(e) => 46 | throw e 47 | } 48 | 49 | def getConnection(db: String): Connection = pools.get(db).getConnection 50 | 51 | def executeQuery[R](sql: SQL)(wrapper: ResultSet => R)(implicit db: String): Seq[R] = 52 | buildStatement(sql) { statement => 53 | val resultSet = statement.executeQuery() 54 | val results = new ListBuffer[R] 55 | 56 | while (resultSet.next()) { 57 | results.append(wrapper(resultSet)) 58 | } 59 | results.toSeq 60 | } 61 | 62 | def executeUpdate(sql: String, parameters: Seq[Any])(implicit db: String): Int = 63 | buildStatement(SQL(sql, parameters))(statement => statement.executeUpdate()) 64 | 65 | private def buildStatement[R](sql: SQL)(exec: PreparedStatement => R)(implicit database: String): R = { 66 | val conn = DataSource.getConnection(database) 67 | try { 68 | val statement = conn.prepareStatement(sql.sql) 69 | var index: Int = 1 70 | 71 | for (p <- sql.parameters) { 72 | p match { 73 | case p: Int => 74 | 
statement.setInt(index, p) 75 | case p: String => 76 | statement.setString(index, p) 77 | case p: Boolean => 78 | statement.setBoolean(index, p) 79 | case other => 80 | throw new UnsupportedOperationException(s"parameter ${other}:${other.getClass.getName} not support by now ") 81 | } 82 | index += 1 83 | } 84 | logger.debug(s"${sql}") 85 | exec(statement) 86 | } finally { 87 | conn.close() 88 | } 89 | 90 | } 91 | 92 | def rows[R](sql: String, parameters: Seq[Any])(wrapper: ResultSet => R)(implicit db: String): Seq[R] = 93 | executeQuery(SQL(sql, parameters))(wrapper) 94 | } 95 | 96 | case class SQL(sql: String, parameters: Seq[Any]) { 97 | 98 | import io.github.wtog.utils.StringUtils._ 99 | 100 | override def toString: String = sql.placeholderReplacedBy("\\?", parameters: _*) 101 | } 102 | -------------------------------------------------------------------------------- /crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/DataSourceInfo.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.db 2 | 3 | import scala.concurrent.duration._ 4 | 5 | /** 6 | * @author : tong.wang 7 | * @since : 10/28/19 11:32 PM 8 | * @version : 1.0.0 9 | */ 10 | case class DataSourceInfo( 11 | database: String = "default", 12 | jdbcUrl: String, 13 | username: String, 14 | password: String, 15 | maxPoolSize: Int = 5, 16 | minIdleSize: Int = 1, 17 | idleTimeout: Duration = 10 seconds) 18 | -------------------------------------------------------------------------------- /crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/db/PostgreSQLPipeline.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.db 2 | 3 | import io.github.wtog.crawler.pipeline.Pipeline 4 | 5 | /** 6 | * @author : tong.wang 7 | * @since : 10/28/19 11:29 PM 8 | * @version : 1.0.0 9 | */ 10 | case class PostgreSQLPipeline(dataSouceInfo: 
DataSourceInfo)(statement: (String, Map[String, Any]) => Unit) extends DataSource with Pipeline { 11 | 12 | override val driverClass: String = "org.postgresql.Driver" 13 | 14 | override def process[Result](pageResultItem: (String, Result)): Unit = { 15 | val (_, resultMap) = (pageResultItem._1, pageResultItem._2.asInstanceOf[Map[String, Any]]) 16 | statement(dataSouceInfo.database, resultMap) 17 | } 18 | 19 | override def init(): Unit = DataSource.initConnection(driverClass, dataSouceInfo) 20 | } 21 | -------------------------------------------------------------------------------- /crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/file/CsvFilePipeline.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.file 2 | 3 | import java.io.RandomAccessFile 4 | import java.util.concurrent._ 5 | 6 | import io.github.wtog.crawler.pipeline.Pipeline 7 | import io.github.wtog.utils.logger.Logging 8 | 9 | import scala.collection.mutable.ListBuffer 10 | import scala.concurrent.Future 11 | 12 | /** 13 | * @author : tong.wang 14 | * @since : 5/20/18 11:01 PM 15 | * @version : 1.0.0 16 | */ 17 | case class CsvFilePipeline(fileName: Option[String]) extends Pipeline { 18 | 19 | override def process[R](pageResultItem: (String, R)): Unit = { 20 | val (pageUrl, resultItems) = pageResultItem 21 | IOContentCache.add(fileName.getOrElse(pageUrl), resultItems.asInstanceOf[Map[String, Any]]) 22 | } 23 | 24 | } 25 | 26 | object IOContentCache extends Logging { 27 | private val cache: ConcurrentHashMap[String, ListBuffer[Map[String, Any]]] = new ConcurrentHashMap[String, ListBuffer[Map[String, Any]]]() 28 | 29 | lazy val timestamp: Long = System.currentTimeMillis() 30 | 31 | def add(key: String, value: Map[String, Any]): ListBuffer[Map[String, Any]] = { 32 | val listValue = cache.getOrDefault(key, ListBuffer.empty[Map[String, Any]]) 33 | listValue.append(value) 34 | cache.put(key, listValue) 35 | } 36 | 37 
| def writeContentFile(fileName: String, contentList: ListBuffer[Map[String, Any]]): Any = 38 | if (contentList.nonEmpty) { 39 | val file = if (fileName.contains("/")) fileName.replace("/", "_") else fileName 40 | 41 | val randomFile = new RandomAccessFile( 42 | s"/tmp/web-crawler-${file}-${timestamp}.csv", 43 | "rw" 44 | ) 45 | try { 46 | val fileLength = randomFile.length() 47 | randomFile.seek(fileLength) //指针指向文件末尾 48 | fileLength match { 49 | case 0 ⇒ 50 | val head = contentList.head 51 | val title = head.keys.mkString(",") + "\n" 52 | randomFile.write((title).getBytes("UTF-8")) 53 | val row = head.values.mkString(",") + "\n" 54 | randomFile.write((row).getBytes("UTF-8")) 55 | contentList -= head 56 | case _ ⇒ 57 | contentList.foreach(map ⇒ { 58 | val row = map.values.mkString(",") + "\n" 59 | randomFile.write((row).getBytes("UTF-8")) //写入数据 60 | contentList -= map 61 | }) 62 | } 63 | } catch { 64 | case ex: Throwable ⇒ ex.printStackTrace() 65 | } finally { 66 | randomFile.close() 67 | } 68 | } 69 | 70 | val expire: Future[ScheduledFuture[_]] = { 71 | def removeExpire() = { 72 | import collection.JavaConverters._ 73 | val schedule = Executors.newScheduledThreadPool(1) 74 | 75 | schedule.scheduleWithFixedDelay(new Runnable { 76 | override def run(): Unit = 77 | cache.asScala.foreach { 78 | case (url, list) ⇒ writeContentFile(url, list) 79 | } 80 | }, 3, 3, TimeUnit.SECONDS) 81 | } 82 | 83 | import scala.concurrent.ExecutionContext.Implicits.global 84 | 85 | Future { 86 | removeExpire() 87 | }.recover { 88 | case ex ⇒ 89 | logger.error(ex.getLocalizedMessage) 90 | removeExpire() 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /crawler-pipeline/src/main/scala/io/github/wtog/crawler/pipeline/file/FilePipeline.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.file 2 | 3 | import java.io.{ File, PrintWriter } 4 | 5 | 
import io.github.wtog.crawler.pipeline.Pipeline 6 | 7 | /** 8 | * @author : tong.wang 9 | * @since : 2019-01-20 00:47 10 | * @version : 1.0.0 11 | */ 12 | case class FilePipeline(fileBaseDir: Option[String] = None) extends Pipeline { 13 | 14 | override def process[R](pageResultItem: (String, R)): Unit = { 15 | val fileDto = pageResultItem._2.asInstanceOf[FileDTO] 16 | 17 | val (fileDirPath, fileName) = fileDto.fileName.lastIndexOf('/') match { 18 | case -1 => 19 | (filePathFormat(fileBaseDir.getOrElse("/")), if (fileDto.fileName.startsWith("/")) fileDto.fileName else s"/${fileDto.fileName}") 20 | case fileDirIndex => 21 | (filePathFormat(fileBaseDir.getOrElse("/")) + filePathFormat(fileDto.fileName.substring(0, fileDirIndex)), fileDto.fileName.substring(fileDirIndex)) 22 | } 23 | 24 | val fileDir = new File(fileDirPath) 25 | 26 | !fileDir.exists() && fileDir.mkdirs() 27 | 28 | new PrintWriter(s"${fileDirPath}${fileName}.${fileDto.fileType}") { 29 | try { 30 | write(s"${fileDto.content}") 31 | } finally { 32 | close() 33 | } 34 | } 35 | } 36 | 37 | def filePathFormat(path: String): String = if (path.startsWith("/")) path else s"/${path}" 38 | 39 | } 40 | 41 | case class FileDTO(fileName: String, fileType: String = "html", content: String) 42 | -------------------------------------------------------------------------------- /crawler-pipeline/src/test/scala/io/github/wtog/crawler/pipeline/test/BasePipelineTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.test 2 | 3 | import io.github.wtog.utils.test.BaseTest 4 | 5 | /** 6 | * @author : tong.wang 7 | * @since : 10/30/19 11:43 PM 8 | * @version : 1.0.0 9 | */ 10 | trait BasePipelineTest extends BaseTest { 11 | 12 | protected def init() 13 | 14 | protected def cleanup() 15 | 16 | override def beforeAll(): Unit = init() 17 | 18 | override def afterAll(): Unit = cleanup() 19 | } 20 | 
-------------------------------------------------------------------------------- /crawler-pipeline/src/test/scala/io/github/wtog/crawler/pipeline/test/DataSourceTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.crawler.pipeline.test 2 | 3 | import io.github.wtog.crawler.pipeline.db.DataSource 4 | 5 | /** 6 | * @author : tong.wang 7 | * @since : 10/30/19 11:50 PM 8 | * @version : 1.0.0 9 | */ 10 | class DataSourceTest extends BasePipelineTest { 11 | object PgDataSource extends DataSource { 12 | override protected val driverClass: String = "org.postgresql.Driver" 13 | } 14 | 15 | test("pg process") { 16 | 17 | val datas = Map[String, Any]( 18 | "a" -> 1, 19 | "b" -> 2 20 | ) 21 | } 22 | 23 | override protected def init(): Unit = {} 24 | 25 | override protected def cleanup(): Unit = {} 26 | 27 | } 28 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.8 2 | 3 | MAINTAINER wangtong 4 | USER root 5 | 6 | RUN echo http://mirrors.ustc.edu.cn/alpine/v3.8/main > /etc/apk/repositories && \ 7 | echo http://mirrors.ustc.edu.cn/alpine/v3.8/community >> /etc/apk/repositories && \ 8 | echo @edge http://mirrors.ustc.edu.cn/alpine/edge/community >> /etc/apk/repositories && \ 9 | echo @edge http://mirrors.ustc.edu.cn/alpine/edge/main >> /etc/apk/repositories 10 | 11 | RUN apk update && apk upgrade && \ 12 | apk add --no-cache \ 13 | bash \ 14 | alsa-lib \ 15 | at-spi2-atk \ 16 | atk \ 17 | cairo \ 18 | cups-libs \ 19 | dbus-libs \ 20 | eudev-libs \ 21 | expat \ 22 | flac \ 23 | gdk-pixbuf \ 24 | glib \ 25 | harfbuzz@edge \ 26 | libgcc \ 27 | libjpeg-turbo \ 28 | libpng \ 29 | libwebp \ 30 | libx11 \ 31 | libxcomposite \ 32 | libstdc++@edge \ 33 | libxdamage \ 34 | libxext \ 35 | libxfixes \ 36 | libexif \ 37 | chromium@edge \ 38 | chromium-chromedriver@edge \ 39 | 
openjdk8 40 | 41 | RUN ln -s /usr/lib/jvm/default-jvm/bin/jstat /usr/local/bin/jstat && \ 42 | ln -s /usr/lib/jvm/default-jvm/bin/jcmd /usr/bin/jcmd && \ 43 | ln -s /usr/lib/jvm/default-jvm/bin/jstack /usr/bin/jstack && \ 44 | rm -rf /openjdk8* 45 | 46 | ADD web-crawler-assembly.jar /apps/web-crawler.jar 47 | 48 | CMD ["java", "-jar", "/apps/web-crawler.jar"] 49 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find crawler-example -type d -iname 'target' | xargs rm -rf 4 | 5 | sbt clean assembly 6 | 7 | version='latest' 8 | 9 | docker build -f docker/Dockerfile -t wtog/web-crawler:${version} ./crawler-example/target/scala-2.12/ -------------------------------------------------------------------------------- /project/.gnupg/pubring.gpg: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mQENBFxCjvEBCADMp/qjTLav0haO4zMDKCR2xn0Tx9X/SPxjyzqCXiuQRhJslyUv 4 | mRwsRbi4bUYEoDcB9xPZVnHBf/fNE8+Jo0tdnaKho1sTmwMc/inIxXubfgTjsqI1 5 | 8eZvLJfqGPPpyBgE6Ijt+//wUiKNS/JmoUnDHIQZ3Az5mf/faUI5Qn8N9OoWmAZ3 6 | f+qk03ZgXNmgfkW7KSUSwtr7S0AFEc+co/YmLvlH16B7mTrmgJJg8O9iKdheFU7p 7 | YivVjOkoE5r9Cs7M01hc1OfMFgFShqCEpLLi9P7nEKhtQdAGR+bdAuPxmEetbkVU 8 | MNnj73prrfrh9kft+P6gPQbGww6QDW+aM5IdABEBAAG0G3dhbmd0b25nIDx3dGdl 9 | ZWtlckAxNjMuY29tPokBVAQTAQgAPhYhBBJOwv/hbFas+DoxHHoKB+L9p6NGBQJc 10 | Qo7xAhsDBQkDwmcABQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAAAoJEHoKB+L9p6NG 11 | rZcH+QGz10cZ+z3lYc1K9drLse88gp8oUBGmOmyQsP64gJTKt0Ni/0sg96TK6Q/K 12 | ssRSPdR9xAp0UKsQBwgLouQb6bBlRM63HBfwVPM3wLkWJRzeNCNu1+/EFlIa7y9E 13 | T6KB5cc/B/tbhkDrM9+0Yl8HslY8SMBQW6S2asjW8K4QaOd1wWJcGUuKDRupohgo 14 | SJOZNs+fr5fI9/zlpGVodyNOzZe4dGNk+/fKVQQYcsT2HQEtxX6Wi1wwnMksoFAX 15 | 8JbJHm5a8zoxq5YdES26bOcJWDiYc7Qm9r3DC5pm0j4FYLtRVv6kZIAjeBimM3nE 16 | 
lLKRliKAU1DE/8qHov3tbm0hCY25AQ0EXEKO8QEIAKTCNvDJi/2EclZHX9FQI/7c 17 | PmfNlSv3+//3eQ2MdFVQxbuoetFp/ogTi4FeMuMPrvtS8eWCS+2T/bM0lLokry7y 18 | YUE7jrxnBf8Gvslz0t+EvKmdH9CBzBWHk2keKjzNBC7wyHmi6YignS0o6pHZciOz 19 | sDr1n1q/BWii034rB8g/oVm7kIPQk/bMeN8tUprqF0S/BE8CiqFPiM6OhNR6enA8 20 | h+ZKcSmkTNLeuTvRoCcyqfyfjgroBaFN616eZFnExVCXIGvVe/TAWhZPtRzUmq6C 21 | r1D08KIsc6lWzrnru+hhYfjfXyNmoJFdxCqiArC4g0Egck9Wwj5z6RlSNmi+dCEA 22 | EQEAAYkBPAQYAQgAJhYhBBJOwv/hbFas+DoxHHoKB+L9p6NGBQJcQo7xAhsMBQkD 23 | wmcAAAoJEHoKB+L9p6NG40YH/1bV3tIlIfntfY6P9qDRmLB3cEAHKYg7r1XSqrKw 24 | YpqJHim1CQxM4GJ1l/XT9W+s0YpQn3BMR41nmRIGJ5tztB6xebCW3tcNOK8YMjcG 25 | h6/PRBPyu3uGjxii2MJvibLBNUvwoX6pdEQqVqGjU+xyQrN6Gm9RY115AsabgNPm 26 | QrcJx1pUizqvA46t5T1N1yCaAu164BuTeO8SLhsG6QCydcDPRy22Ezsl5eoQilI5 27 | mRGJqHaJL0phbeOaPrgERx4sAUlB6jUt/uSwiapzzsBNUthjO1J/m+CnPDU7JPi4 28 | Reenvw2AT/upHHKtoJxwRPS/gDZk4GDA9obLKqWaXnOsV8k= 29 | =Xx3w 30 | -----END PGP PUBLIC KEY BLOCK----- 31 | -------------------------------------------------------------------------------- /project/.gnupg/secring.gpg: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PRIVATE KEY BLOCK----- 2 | 3 | lQPGBFxCjvEBCADMp/qjTLav0haO4zMDKCR2xn0Tx9X/SPxjyzqCXiuQRhJslyUv 4 | mRwsRbi4bUYEoDcB9xPZVnHBf/fNE8+Jo0tdnaKho1sTmwMc/inIxXubfgTjsqI1 5 | 8eZvLJfqGPPpyBgE6Ijt+//wUiKNS/JmoUnDHIQZ3Az5mf/faUI5Qn8N9OoWmAZ3 6 | f+qk03ZgXNmgfkW7KSUSwtr7S0AFEc+co/YmLvlH16B7mTrmgJJg8O9iKdheFU7p 7 | YivVjOkoE5r9Cs7M01hc1OfMFgFShqCEpLLi9P7nEKhtQdAGR+bdAuPxmEetbkVU 8 | MNnj73prrfrh9kft+P6gPQbGww6QDW+aM5IdABEBAAH+BwMCDD/DVB1tuWDkxVqe 9 | 4onaPqVM0n912iKPbMLY7lHDQ5GPZNIRIN5qanr5bZlIaN28ptmDvKtGwaahBTXE 10 | I0JJEuuNuGC2y+/Kmw4wXTdlG/GnT/8Ktz91w8toDOR80JdaLzGKC7H+0fFQ3pOa 11 | 7BGquuBfoJFMEnSSZXs26/mtdHcNkYzKw1il5hUYGrdkqeuK2uqMW3VAy3AQOB60 12 | xihg2zSj8P9VjF4yd0y0rY4r/ZlThx6KAZ8H+iiS4MlmTMh21L5V5UXx3zsnMywI 13 | 18spMUFkz06zIgLmtjbcLL+If5OpIl+mpdvuwrv04N3c6CF+Di10j5SZiNNwisvz 14 | 
jHmvPBnthbxx2kXSCXBJzKHm6XNVTHRHSGK+OM1anbDcu4NkTskMZrlzUpwfDGBB 15 | rqgzEqu8GRhwqP+v7fk3rd2dLb5uceaxovHI+6BNy0WRzUNuo5t3IzhlEcoK+1ED 16 | oxqr/n7xzw6JYHcKCR3wzrhIHNIjZ/Dew3YT1cQCPLjg8xc4sd68He/HBl9ZHnDy 17 | hXrF4oUEGLoD+ExvARjPfzl5P+2d/V3949oOtnmV3DNHGvE8ew4+XbXn8fQfmBiR 18 | FOzK01+RtukyU/pqaShX1VNb2tlyf/d0CMPxaMKOt1+BGndXzTwzaEv9X61Cl/+F 19 | KrYGWabBGiavBe7dDTfBXY/8lMTVmBqtwwoom+dNatWeytmXyRXK0o2lMwK5mKSh 20 | jaXwcoIa+uTjyZyjhuUKRdKadBunJz9w3EGB6cet1iTNruOHZkYhMk+AzWNskz+w 21 | yXxU908DHQWdAI0S/2pGJ76JitIT+t69i04Efg+L8j7vJPBMkmoZyxomKRsWoKDQ 22 | Lacm0Fo6bvKGRy0B6urHbHu9ylGo/SsihMiTRQse7W7g2dtqnq6BAX68a6r9+Gy1 23 | iPok4KE/HjuOtBt3YW5ndG9uZyA8d3RnZWVrZXJAMTYzLmNvbT6JAVQEEwEIAD4W 24 | IQQSTsL/4WxWrPg6MRx6Cgfi/aejRgUCXEKO8QIbAwUJA8JnAAULCQgHAgYVCgkI 25 | CwIEFgIDAQIeAQIXgAAKCRB6Cgfi/aejRq2XB/kBs9dHGfs95WHNSvXay7HvPIKf 26 | KFARpjpskLD+uICUyrdDYv9LIPekyukPyrLEUj3UfcQKdFCrEAcIC6LkG+mwZUTO 27 | txwX8FTzN8C5FiUc3jQjbtfvxBZSGu8vRE+igeXHPwf7W4ZA6zPftGJfB7JWPEjA 28 | UFuktmrI1vCuEGjndcFiXBlLig0bqaIYKEiTmTbPn6+XyPf85aRlaHcjTs2XuHRj 29 | ZPv3ylUEGHLE9h0BLcV+lotcMJzJLKBQF/CWyR5uWvM6MauWHREtumznCVg4mHO0 30 | Jva9wwuaZtI+BWC7UVb+pGSAI3gYpjN5xJSykZYigFNQxP/Kh6L97W5tIQmNnQPG 31 | BFxCjvEBCACkwjbwyYv9hHJWR1/RUCP+3D5nzZUr9/v/93kNjHRVUMW7qHrRaf6I 32 | E4uBXjLjD677UvHlgkvtk/2zNJS6JK8u8mFBO468ZwX/Br7Jc9LfhLypnR/QgcwV 33 | h5NpHio8zQQu8Mh5oumIoJ0tKOqR2XIjs7A69Z9avwVootN+KwfIP6FZu5CD0JP2 34 | zHjfLVKa6hdEvwRPAoqhT4jOjoTUenpwPIfmSnEppEzS3rk70aAnMqn8n44K6AWh 35 | TetenmRZxMVQlyBr1Xv0wFoWT7Uc1Jqugq9Q9PCiLHOpVs6567voYWH4318jZqCR 36 | XcQqogKwuINBIHJPVsI+c+kZUjZovnQhABEBAAH+BwMC8trkrtDSBhXk2XGVB+oz 37 | EBN1TKVKkXaF8Gnm6OhYhErC+r6P7kmacoFVY4Ji/+dxybnpRUadKnULsUoRiM/N 38 | 0tE9jUgFdvS4AjXd9Xkx/KU3onpXQ7WgwOT6T1pgFYh9oY1sJMvC6AA6aVThFS7b 39 | ZeiufBernRxishDzHxVklniNVk0k+LnpnQhUuf/DCkWZWeAqE3R+ebwv8Rjh4XvR 40 | tpoxkCQQnVr1klwF9qSSwqpiZK4nkKbuukV0nCkmUut3bXFwan9XXwj7mgY5oA1O 41 | R2jeMXQ8mDo7Jp4XdVCAbKCUTQy9BX90QxCiLQw9/em06KYaXVQZZdnEW96BI9aa 42 | 
2epqc73di4YwZHZHC5A7r5hvlZqhnNnWO8ZUJoSIJlfgA8Jl6W2lD7JkXFSYNh9h 43 | xXSI3iK5SRppkUOy9drHZi0WlRn482OdYyiwgdOpg2ZDIqmIGd6y11iLwSpTE3Ku 44 | vhss7q80qXbMTb9EO9BYteYEPUSxvpAW0bFUZqOA3PoMO+GKxM+d/z8/qzbAz78k 45 | NYxAz/a7UkRnxOkaqD4LyZbL98Ce2YTxXqikgMAEuFEl18BX4Zek3OxjcpGnzY3a 46 | 76O7t8Ud5pnZkk5OV+fBgNOkSfMVeaLb8P0xqIGaRzXt9Z17OH+M10Bi+djPEnNh 47 | l9guvFdFtyimSygaG4yNHUYnP2BNtrZoGLyd6Z63bm6V0k7pUrH5aIEj0OpP1sBV 48 | BLxvpqsMFi15Nn79jdzoQeXJfyMQ3wVmnrEKdGyHQj7CLWU6of/ZnvNcfN3lBCJ3 49 | 7Nw3JZijzM2IPrSYG/zFJCtD3WYyAEQnkqe/6cXSTJjihjOIm55PK/K2sKi0/uU9 50 | c8lVfdl9LMo0NSu53CF0wyZbrozvC1JvU7Dv6zVtIebsZXlrr67lw7Z3pEKYHew2 51 | NR4JMK08iQE8BBgBCAAmFiEEEk7C/+FsVqz4OjEcegoH4v2no0YFAlxCjvECGwwF 52 | CQPCZwAACgkQegoH4v2no0bjRgf/VtXe0iUh+e19jo/2oNGYsHdwQAcpiDuvVdKq 53 | srBimokeKbUJDEzgYnWX9dP1b6zRilCfcExHjWeZEgYnm3O0HrF5sJbe1w04rxgy 54 | NwaHr89EE/K7e4aPGKLYwm+JssE1S/Chfql0RCpWoaNT7HJCs3oab1FjXXkCxpuA 55 | 0+ZCtwnHWlSLOq8Djq3lPU3XIJoC7XrgG5N47xIuGwbpALJ1wM9HLbYTOyXl6hCK 56 | UjmZEYmodokvSmFt45o+uARHHiwBSUHqNS3+5LCJqnPOwE1S2GM7Un+b4Kc8NTsk 57 | +LhF56e/DYBP+6kccq2gnHBE9L+ANmTgYMD2hssqpZpec6xXyQ== 58 | =uCVh 59 | -----END PGP PRIVATE KEY BLOCK----- 60 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | /** 4 | * @author : tong.wang 5 | * @since : 2018-12-08 00:25 6 | * @version : 1.0.0 7 | */ 8 | object Dependencies { 9 | 10 | object Versions { 11 | val akkaVersion = "2.6.3" 12 | val log4j2 = "2.12.1" 13 | val seleniumhq = "4.0.0-alpha-3" 14 | val httpClient = "2.10.1" 15 | val jackson = "2.10.2" 16 | val guava = "28.2-jre" 17 | val typesafeConfig = "1.4.0" 18 | val scalatest = "3.0.8" 19 | val hikariCP = "3.4.1" 20 | } 21 | 22 | 23 | implicit class ModuleIDWrapper(moduleID: ModuleID) { 24 | def provided: ModuleID = moduleID.withConfigurations(Some("provided")) 25 | 26 | def test: ModuleID = 
moduleID.withConfigurations(Some("test")) 27 | } 28 | 29 | lazy val crossVersion = Seq("2.13.1", "2.12.10") 30 | 31 | lazy val guava = "com.google.guava" % "guava" % Versions.guava 32 | 33 | lazy val typesafeConfig = "com.typesafe" % "config" % Versions.typesafeConfig 34 | 35 | lazy val jackson = Seq("com.fasterxml.jackson.module" %% "jackson-module-scala" % Versions.jackson) 36 | 37 | lazy val scalatest = "org.scalatest" %% "scalatest" % Versions.scalatest 38 | 39 | lazy val log = Seq( 40 | "org.apache.logging.log4j" % "log4j-slf4j-impl" % Versions.log4j2, 41 | "org.apache.logging.log4j" % "log4j-core" % Versions.log4j2) 42 | 43 | 44 | object utils { 45 | lazy val dependencies = scalatest.test +: (Seq(guava, typesafeConfig) ++ jackson ++ log).map(_.provided) 46 | } 47 | 48 | object core { 49 | 50 | lazy val akka = Seq(("com.typesafe.akka" %% "akka-actor" % Versions.akkaVersion).provided) 51 | 52 | lazy val quartz = "org.quartz-scheduler" % "quartz" % "2.3.1" exclude("com.zaxxer", "HikariCP-java7") 53 | 54 | lazy val httpUtils = Seq("org.asynchttpclient" % "async-http-client" % Versions.httpClient) 55 | 56 | lazy val httpParser = Seq("us.codecraft" % "xsoup" % "0.3.1") 57 | 58 | lazy val selenium = Seq( 59 | "org.seleniumhq.selenium" % "selenium-chrome-driver" % Versions.seleniumhq 60 | ) 61 | 62 | lazy val test = Seq( 63 | "com.typesafe.akka" %% "akka-testkit" % Versions.akkaVersion, scalatest 64 | ).map(_.test) 65 | 66 | lazy val dependencies = Seq(quartz, guava, typesafeConfig) ++ jackson ++ akka ++ log ++ httpParser ++ httpUtils ++ test ++ selenium 67 | } 68 | 69 | object pipeline { 70 | lazy val postgresql = "42.2.6" 71 | 72 | lazy val pg: ModuleID = "org.postgresql" % "postgresql" % postgresql 73 | 74 | lazy val hikari = "com.zaxxer" % "HikariCP" % Versions.hikariCP 75 | 76 | lazy val h2 = "com.h2database" % "h2" % "1.4.192" 77 | 78 | lazy val dependencies = Seq(scalatest, h2).map(_.test) ++ Seq(pg, hikari).map(_.provided) 79 | } 80 | 81 | object example { 82 
| lazy val dependencies = core.dependencies ++ log ++ Seq(pipeline.pg, pipeline.hikari) 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /project/Publish.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | import com.typesafe.sbt.SbtPgp.autoImportImpl._ 4 | 5 | /** 6 | * @author : tong.wang 7 | * @since : 2018-12-20 22:00 8 | * @version : 1.0.0 9 | */ 10 | object Publish extends AutoPlugin { 11 | override def trigger: PluginTrigger = allRequirements 12 | 13 | override def projectSettings: Seq[Def.Setting[_]] = Seq( 14 | useGpg := false, 15 | usePgpKeyHex("124EC2FFE16C56ACF83A311C7A0A07E2FDA7A346"), 16 | pgpPublicRing := baseDirectory.value / "project" / ".gnupg" / "pubring.gpg", 17 | pgpSecretRing := baseDirectory.value / "project" / ".gnupg" / "secring.gpg", 18 | 19 | credentials += Credentials(Path.userHome / ".sbt" / ".credentials-center"), 20 | publishTo := { 21 | val nexus = "https://oss.sonatype.org/" 22 | if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") 23 | else Some("releases" at nexus + "service/local/staging/deploy/maven2/") 24 | }, 25 | 26 | publishMavenStyle := true, 27 | publishArtifact in Test := false, 28 | pomIncludeRepository := { _ ⇒ false }, 29 | 30 | pomExtra in Global := { 31 | https://github.com/wtog/web-crawler 32 | 33 | 34 | Apache 2 35 | http://www.apache.org/licenses/LICENSE-2.0.txt 36 | 37 | 38 | 39 | git@github.com:wtog/web-crawler.git 40 | scm:git:git@github.com:wtog/web-crawler.git 41 | 42 | 43 | 44 | wangtong 45 | wangtong 46 | https://github.com/wtog/ 47 | 48 | 49 | }) 50 | } 51 | 52 | object DisablePublish extends AutoPlugin { 53 | 54 | override def requires: Plugins = plugins.IvyPlugin 55 | 56 | override def projectSettings: Seq[_root_.sbt.Def.Setting[_]] = Seq( 57 | publishArtifact := false, 58 | publish := Unit, 59 | publishLocal := Unit) 60 | 61 | } 
-------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.7.1 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | resolvers += Classpaths.sbtPluginReleases 4 | resolvers += Resolver.sonatypeRepo("releases") 5 | resolvers += Resolver.sonatypeRepo("snapshots") 6 | 7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 8 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") 9 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0") 10 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.26") 11 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1") 12 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.4") 13 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1") 14 | -------------------------------------------------------------------------------- /push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | function deploy_git() { 6 | git add . 
7 | params=$@ 8 | if [ "X$params" = "X" ]; then 9 | msg=$(git log --pretty=oneline --abbrev-commit | awk '{if(NR<2) for(i=2;i<=NF;i++) printf $i" " }') 10 | else 11 | msg=$params 12 | fi 13 | echo $msg 14 | git commit -m "$msg" 15 | git push 16 | } 17 | 18 | deploy_git $@ 19 | -------------------------------------------------------------------------------- /utils/src/main/scala/io/github/wtog/utils/ConfigUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.utils 2 | 3 | import com.typesafe.config.{Config, ConfigFactory, ConfigObject, ConfigValue} 4 | 5 | import scala.collection.JavaConverters._ 6 | import java._ 7 | 8 | /** 9 | * @author : tong.wang 10 | * @since : 2019-05-09 00:14 11 | * @version : 1.0.0 12 | */ 13 | object ConfigUtils { 14 | 15 | @volatile private[this] var config: Config = ConfigFactory.load() 16 | 17 | def init(resource: String) = { 18 | config = config.withFallback(ConfigFactory.load(resource)) 19 | } 20 | 21 | def getSeq[T](path: String): Seq[T] = config.getList(path).unwrapped().asScala.map(v => v.asInstanceOf[T]).toSeq 22 | 23 | def getSeqMap(path: String): Seq[Map[String, Any]] = getSeq[util.Map[String, Any]](path).map(i => i.asScala.toMap) 24 | 25 | def getStringOpt(path: String): Option[String] = getOpt[String](path)(config.getString) 26 | 27 | def getIntOpt(path: String): Option[Int] = getOpt[Int](path)(config.getInt) 28 | 29 | def getBooleanOpt(path: String): Option[Boolean] = getOpt[Boolean](path)(config.getBoolean) 30 | 31 | def getConfig(name: String): Config = config.getConfig(name) 32 | 33 | def getConfigObjectOpt(path: String): Option[ConfigObject] = getOpt[ConfigObject](path)(config.getObject) 34 | 35 | def getKeyAndValue(name: String): Map[String, Any] = getConfig(name).entrySet().asScala.foldLeft(Map.empty[String, Any]) { (map, entry) => 36 | map + (entry.getKey -> entry.getValue.unwrapped()) 37 | } 38 | 39 | private[this] def getOpt[T](path: 
String)(getConfig: String => T): Option[T] = 40 | if (config.hasPath(path)) { 41 | Some(getConfig(path)) 42 | } else { 43 | None 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /utils/src/main/scala/io/github/wtog/utils/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.utils 2 | 3 | import com.fasterxml.jackson.annotation.JsonInclude.Include 4 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} 5 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 6 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper 7 | 8 | /** 9 | * @author : tong.wang 10 | * @since : 9/24/19 11:05 PM 11 | * @version : 1.0.0 12 | */ 13 | object JsonUtils { 14 | 15 | private lazy val mapper: ObjectMapper with ScalaObjectMapper = { 16 | val mapper = new ObjectMapper() with ScalaObjectMapper 17 | mapper.setSerializationInclusion(Include.NON_NULL) 18 | mapper.setSerializationInclusion(Include.NON_ABSENT) 19 | mapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS) 20 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 21 | mapper.registerModule(DefaultScalaModule) 22 | 23 | mapper 24 | } 25 | 26 | def toJson[T](t: T): String = mapper.writeValueAsString(t) 27 | 28 | def toMap(t: Any): Map[String, Any] = mapper.convertValue[Map[String, Any]](t) 29 | 30 | def parseFrom[T: Manifest](json: String): T = mapper.readValue[T](json) 31 | 32 | def parseFrom[T: Manifest](bytes: Array[Byte]): T = mapper.readValue[T](bytes) 33 | 34 | def parseFrom[T: Manifest](map: Map[String, Any]): T = mapper.convertValue[T](map) 35 | } 36 | -------------------------------------------------------------------------------- /utils/src/main/scala/io/github/wtog/utils/ReflectionUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.wtog.utils 

import com.google.common.reflect.{ ClassPath, TypeToken }

import scala.collection.JavaConverters._
import scala.collection.mutable

/**
  * Classpath-scanning helpers built on Guava's `ClassPath`.
  *
  * @author : tong.wang
  * @since : 11/19/18 11:39 PM
  * @version : 1.0.0
  */
object ReflectionUtils {
  // Scan the classpath visible to this class's loader once and cache it.
  private[this] lazy val CLASS_PATH = ClassPath.from(this.getClass.getClassLoader)

  /**
    * Finds all concrete (non-interface) top-level classes under `packageName`
    * whose type hierarchy contains `clazz`.
    */
  def implementationClasses[T](clazz: Class[T], packageName: String): Seq[Class[T]] =
    getClasses[T](packageName).filter { c =>
      !c.isInterface && TypeToken
        .of(c)
        .getTypes
        .asScala
        .exists(t => t.getRawType == clazz)
    }.toSeq

  /** Loads every top-level class found recursively under `packageName`. */
  def getClasses[T](packageName: String): mutable.Set[Class[T]] =
    CLASS_PATH.getTopLevelClassesRecursive(packageName).asScala.map(_.load().asInstanceOf[Class[T]])

}
--------------------------------------------------------------------------------
utils/src/main/scala/io/github/wtog/utils/RetryUtils.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._
import scala.concurrent.{ ExecutionContext, Future }
import scala.util.Try

/**
  * Retry helpers for synchronous and Future-returning computations.
  *
  * @author : tong.wang
  * @since : 2019-05-07 23:10
  * @version : 1.0.0
  */
object RetryUtils {

  /**
    * Re-invokes `invoke` up to `retryTime` additional times when it fails with
    * an exception whose class name is listed in `retryInfo.exceptions`.
    * Any other failure is rethrown immediately.
    *
    * NOTE(review): the inter-attempt delay is a blocking sleep on the execution
    * context's thread; consider a scheduler for non-blocking backoff.
    */
  def futureRetryWhen[T](invoke: => Future[T], retryTime: Int = 0, retryInfo: RetryInfo = RetryInfo())(implicit context: ExecutionContext = ExecutionContext.Implicits.global): Future[T] =
    invoke.recoverWith {
      case ex if retryInfo.exceptions.contains(ex.getClass.getName) && retryTime > 0 =>
        TimeUnit.MILLISECONDS.sleep(retryInfo.duration.toMillis)
        futureRetryWhen(invoke, retryTime - 1, retryInfo)(context)
      case other =>
        throw other
    }

  /**
    * Synchronous variant of [[futureRetryWhen]]: retries `invoke` up to
    * `retryTime` additional times for the configured exception class names,
    * sleeping `retryInfo.duration` between attempts.
    */
  def retryWhen[T](invoke: => T, retryTime: Int = 0, retryInfo: RetryInfo = RetryInfo()): T =
    Try(invoke).recover {
      case ex: Throwable if retryInfo.exceptions.contains(ex.getClass.getName) && retryTime > 0 =>
        TimeUnit.MILLISECONDS.sleep(retryInfo.duration.toMillis)
        retryWhen(invoke, retryTime - 1, retryInfo)
      case other =>
        throw other
    }.get
}

/** Retry policy: delay between attempts plus retryable exception class names. */
case class RetryInfo(duration: Duration = 1.second, exceptions: Seq[String] = Seq(classOf[Throwable].getName))
--------------------------------------------------------------------------------
utils/src/main/scala/io/github/wtog/utils/StringUtils.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils

import com.google.common.base.CaseFormat
import com.google.common.base.Converter

/**
  * String helpers: case-format conversion and placeholder substitution.
  *
  * @author : tong.wang
  * @since : 10/31/19 12:25 AM
  * @version : 1.0.0
  */
object StringUtils {

  lazy val underscoreConverter: Converter[String, String] = CaseFormat.LOWER_CAMEL.converterTo(CaseFormat.LOWER_UNDERSCORE)
  lazy val lowlandersConverter: Converter[String, String] = CaseFormat.LOWER_UNDERSCORE.converterTo(CaseFormat.LOWER_CAMEL)

  implicit class StringWrapper(s: String) {

    /** lowerCamelCase -> lower_underscore. */
    def toUnderscore: String = underscoreConverter.convert(s)

    /** lower_underscore -> lowerCamelCase. */
    def toLowercamel: String = lowlandersConverter.convert(s)

    /**
      * Replaces each occurrence of `placeholder` (a regex, e.g. "\\?") with the
      * corresponding element of `replacement`, single-quoting String values.
      *
      * Fix: previously `split(...).zip(replacement)` silently dropped the text
      * after the LAST placeholder whenever `split` produced one more segment
      * than there were replacements. All segments are now appended; surplus
      * replacements beyond the number of placeholders are still ignored, as
      * before.
      */
    def placeholderReplacedBy(placeholder: String, replacement: Any*): String = {
      val parts  = s.split(placeholder)
      val buffer = new StringBuilder()
      parts.zipWithIndex.foreach {
        case (part, i) =>
          buffer.append(part)
          if (i < replacement.length) {
            replacement(i) match {
              case str: String => buffer.append(s"'$str'")
              case other       => buffer.append(other)
            }
          }
      }
      buffer.toString()
    }
  }

}
--------------------------------------------------------------------------------
utils/src/main/scala/io/github/wtog/utils/logger/Logging.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.logger

import org.slf4j.{ Logger,
LoggerFactory }

/**
  * Mix-in providing an slf4j logger named after the concrete class.
  *
  * @author : tong.wang
  * @since : 3/3/20 11:33 PM
  * @version : 1.0.0
  */
trait Logging {
  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)
}
--------------------------------------------------------------------------------
utils/src/test/scala/io/github/wtog/utils/test/BaseTest.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.test

import org.scalatest.{ BeforeAndAfterAll, FunSuite, Matchers }

import scala.concurrent.{ Await, Future }
import scala.concurrent.duration._

/**
  * Common ScalaTest base trait for the utils module.
  *
  * @author : tong.wang
  * @since : 10/30/19 11:54 PM
  * @version : 1.0.0
  */
trait BaseTest extends FunSuite with Matchers with BeforeAndAfterAll {

  /**
    * Blocks up to one minute for `future` and returns its value.
    * Fix: added the explicit result type required by the project's
    * ExplicitResultTypes scalafix rule (see .scalafix.conf).
    */
  def await[T](future: => Future[T]): T = Await.result(future, 1.minute)

}
--------------------------------------------------------------------------------
utils/src/test/scala/io/github/wtog/utils/test/JsonUtilsTest.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.test

import io.github.wtog.utils.JsonUtils

/**
  * Round-trip tests for [[JsonUtils]]: case class -> json/map -> case class.
  *
  * @author : tong.wang
  * @since : 12/29/19 8:25 PM
  * @version : 1.0.0
  */
class JsonUtilsTest extends BaseTest {

  test("json serialize and deserialize") {
    val test = Test("test")
    val json = JsonUtils.toJson(test)
    val map = JsonUtils.toMap(test)
    val jsonParsed = JsonUtils.parseFrom[Test](json)
    assert(test == jsonParsed)
    val mapParsed = JsonUtils.parseFrom[Test](map)
    assert(test == mapParsed)
  }
}

// Minimal fixture type for the round-trip test above.
case class Test(t: String)
--------------------------------------------------------------------------------
utils/src/test/scala/io/github/wtog/utils/test/RetryUtilsTest.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.test

import io.github.wtog.utils.RetryInfo
import io.github.wtog.utils.RetryUtils._

import scala.concurrent.duration._
import scala.concurrent.{ ExecutionContext, Future }
import scala.util.Try

/**
  * Tests for [[io.github.wtog.utils.RetryUtils]].
  *
  * @author : tong.wang
  * @since : 2019-05-07 23:35
  * @version : 1.0.0
  */
class RetryUtilsTest extends BaseTest {

  test("retry") {
    var invokeTime = 0

    // Throws until it has been invoked `limit` times, then returns the count.
    def method(limit: Int): Int = {
      invokeTime += 1
      if (invokeTime < limit) throw new Exception(s"invokeTime: ${invokeTime}") else invokeTime
    }

    // Needs 6 invocations but only 1 initial try + 3 retries are allowed.
    assert(Try(retryWhen(method(6), retryTime = 3, RetryInfo(exceptions = Seq(classOf[Exception].getName)))).isFailure)
    invokeTime = 0
    assert(3 == retryWhen(method(3), retryTime = 3, RetryInfo(exceptions = Seq(classOf[Exception].getName))))
    invokeTime = 0
    assert(2 == retryWhen(method(2), retryTime = 3, RetryInfo(exceptions = Seq(classOf[Exception].getName))))
    invokeTime = 0
    assert(2 == retryWhen(method(2), retryTime = 1, RetryInfo(exceptions = Seq(classOf[Exception].getName))))
    invokeTime = 0
    assert(Try(retryWhen(method(3), retryTime = 1, RetryInfo(exceptions = Seq(classOf[Exception].getName)))).isFailure)
  }

  test("futureRetryWhen") {

    import scala.concurrent.ExecutionContext.Implicits.global

    class Test(invokeTime: Int = 0) {
      var _invokeTime = invokeTime

      def method(limit: Int): Future[Int] = {
        _invokeTime += 1
        Future {
          // Fix: interpolate the mutable counter `_invokeTime`; the original
          // reported the immutable constructor arg, which is always 0.
          if (_invokeTime < limit) throw new Exception(s"invokeTime: ${_invokeTime}") else _invokeTime
        }(ExecutionContext.Implicits.global)
      }
    }

    val retryable = Seq(classOf[Exception].getName)

    // Fix: assertions previously ran inside onComplete callbacks on the global
    // execution context, where assertion failures are swallowed and never fail
    // the test, and a trailing 1-second sleep was needed to (racily) wait for
    // them. Awaiting each future (BaseTest.await) makes every assertion run on
    // the test thread, reliably and without the sleep.
    assert(Try(await(futureRetryWhen(new Test().method(6), retryTime = 3, RetryInfo(exceptions = retryable, duration = 10.millis)))).isFailure)

    assert(3 == await(futureRetryWhen(new Test().method(3), retryTime = 3, RetryInfo(exceptions = retryable, duration = 10.millis))))

    assert(2 == await(futureRetryWhen(new Test().method(2), retryTime = 3, RetryInfo(exceptions = retryable, duration = 10.millis))))

    assert(2 == await(futureRetryWhen(new Test().method(2), retryTime = 3, RetryInfo(exceptions = retryable, duration = 10.millis))))

    assert(Try(await(futureRetryWhen(new Test().method(3), retryTime = 1, RetryInfo(exceptions = retryable, duration = 10.millis)))).isFailure)

    // `.failed` fails with NoSuchElementException, which is not in the default
    // retryable list, so it propagates without retries.
    assert(Try(await(futureRetryWhen(new Test().method(0).failed, retryTime = 3, RetryInfo(10.millis)))).isFailure)
  }
}
--------------------------------------------------------------------------------
utils/src/test/scala/io/github/wtog/utils/test/jmh/StringUtilsBenchmark.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.test.jmh

import org.openjdk.jmh.annotations.Benchmark

import scala.collection.mutable.ListBuffer

/**
  * JMH harness comparing placeholder-substitution strategies on a fixed SQL
  * template with 28 parameters.
  *
  * @author : tong.wang
  * @since : 11/10/19 11:25 AM
  * @version : 1.0.0
  */
class StringUtilsBenchmark {
  import StringUtilsBenchmark._

  @Benchmark
  def foldleft(): String = foldLeft()

  @Benchmark
  def stringBuild: String = stringbuilder()

  @Benchmark
  def mapMkString: String = map()

  @Benchmark
  def stringutils: String = stringUtils()

  @Benchmark
  def replace: String = replaceFirstFold()
}

object StringUtilsBenchmark {

  val sql =
    """
      |update house set
      |trading_right = ?,
      |house_right_owner = ?,
      |community_name = ?,
      |community_area_name = ?,
      |house_code = ?,
      |room_type_sub_info = ?,
      |meter_price = ?,
      |heating = ?,
      |total_price = ?,
      |sale_time = ?,
      |elevator = ?,
      |room_type_main_info = ?,
      |householdladder = ?,
      |room_main_info = ?,
      |housing_use = ?,
      |last_sale = ?,
      |house_right = ?,
      |room_area_sub_info = ?,
      |build_type = ?,
      |build_struct = ?,
      |evaluation_price = ?,
      |mortgage_info = ?,
      |room_sub_info = ?,
      |decoration = ?,
      |room_area_main_info = ?,
      |house_years = ?
      |where
      |house_code = ? "
      |""".stripMargin

  // Alternating String/Int parameters, one per placeholder.
  val params = Seq.tabulate[Any](28) { a => if (a % 2 == 0) "test" else a }

  /** Substitution via foldLeft over (segment, param) pairs. */
  def foldLeft(): String = {
    val list = sql.split('?') zip params

    list.foldLeft("") {
      case (s, (sql, param)) =>
        s"$s$sql$param"
    }
  }

  /** Substitution via a manually driven StringBuilder. */
  def stringbuilder(): String = {
    val list = sql.split('?') zip params
    val buffer = new StringBuilder()
    list.foreach {
      case (s, p) =>
        buffer.append(s).append(p)
    }
    buffer.toString()
  }

  /** Substitution via map + mkString. */
  def map(): String = {
    val list = sql.split('?') zip params

    list.map {
      case (s, p) =>
        s"$s$p"
    }.mkString("")
  }

  /** Repeated replaceFirst (note: intentionally naive baseline). */
  def replaceFirstFold(): String = {
    val buffer = new ListBuffer[Any]
    params.foreach(p =>
      buffer.append(sql.replaceFirst("\\?", p.toString)))
    buffer.mkString("")
  }

  /** Substitution via StringUtils.placeholderReplacedBy. */
  def stringUtils(): String = {
    import io.github.wtog.utils.StringUtils._

    // Fix: expand the Seq into varargs with `: _*`; the original passed the
    // whole Seq as ONE replacement value, so only the first placeholder was
    // substituted and this benchmark was not comparable to the others.
    sql.placeholderReplacedBy("\\?", params: _*)
  }
}
--------------------------------------------------------------------------------
utils/src/test/scala/io/github/wtog/utils/test/reflection/ReflectionUtilsTest.scala:
--------------------------------------------------------------------------------
package io.github.wtog.utils.test.reflection

import io.github.wtog.utils.ReflectionUtils
import io.github.wtog.utils.test.BaseTest

/**
  * Tests for [[ReflectionUtils]].
  *
  * @author : tong.wang
  * @since : 3/7/20 9:26 AM
  * @version : 1.0.0
  */
class ReflectionUtilsTest extends BaseTest {

  test("get implementation classes") {
    // NOTE(review): "io.github.wtog.utils.BaseTest" reads like a class name,
    // not a package, so the scan presumably finds nothing and the negative
    // assertion below passes vacuously — confirm the intended package name.
    val impls = ReflectionUtils.implementationClasses(
      classOf[BaseTest],
      "io.github.wtog.utils.BaseTest"
    )
    assert(!impls.contains(classOf[BaseTest]))
  }

}

--------------------------------------------------------------------------------
version.sbt:
--------------------------------------------------------------------------------
version in ThisBuild := "0.1.3-SNAPSHOT"