├── project
│   ├── build.properties
│   └── plugins.sbt
├── version.sbt
├── doc
│   ├── images
│   │   ├── dashboard.png
│   │   └── greenish-with-background.svg
│   └── api.md
├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── test-partial-period
│   │   │   ├── test-duplicate-period
│   │   │   ├── test-ls-env
│   │   │   ├── test-ls
│   │   │   ├── ls-sleep
│   │   │   └── application.conf
│   │   └── scala
│   │       ├── models
│   │       │   ├── EnvVarSpec.scala
│   │       │   ├── GroupStatusSpec.scala
│   │       │   └── JsonSerdeSpec.scala
│   │       ├── checker
│   │       │   ├── CheckerSpec.scala
│   │       │   └── CommandRunnerSpec.scala
│   │       ├── AppSpec.scala
│   │       ├── stats
│   │       │   └── StatsCollectorSpec.scala
│   │       └── AppConfigSpec.scala
│   └── main
│       ├── scala
│       │   ├── models
│       │   │   ├── Lag.scala
│       │   │   ├── Group.scala
│       │   │   ├── PeriodHealth.scala
│       │   │   ├── AlertLevel.scala
│       │   │   ├── AlertLevels.scala
│       │   │   ├── GroupStatusSummary.scala
│       │   │   ├── JobStatus.scala
│       │   │   ├── JobStatusSummary.scala
│       │   │   ├── models.scala
│       │   │   ├── GroupStatus.scala
│       │   │   ├── Job.scala
│       │   │   ├── CheckFrequency.scala
│       │   │   └── EnvVar.scala
│       │   ├── stats
│       │   │   ├── messages.scala
│       │   │   └── StatsCollector.scala
│       │   ├── checker
│       │   │   ├── checker.scala
│       │   │   ├── Message.scala
│       │   │   ├── CommandRunner.scala
│       │   │   └── StatusChecker.scala
│       │   ├── App.scala
│       │   ├── endpoints
│       │   │   └── Routes.scala
│       │   └── AppConfig.scala
│       └── resources
│           ├── dashboard
│           │   ├── time_container.jsx
│           │   ├── version_container.jsx
│           │   ├── index.html
│           │   ├── namespace_container.jsx
│           │   ├── state_container.jsx
│           │   ├── common_lib.jsx
│           │   ├── group_container.jsx
│           │   ├── job_container.jsx
│           │   ├── main.css
│           │   ├── main_container.jsx
│           │   ├── summary_container.jsx
│           │   ├── greenish-favicon.svg
│           │   └── greenish-logo.svg
│           └── reference.conf
├── package.json
├── release.sh
├── .travis.yml
├── COPYING
├── .gitignore
└── README.md
/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.12 2 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.8.0-SNAPSHOT" 2 | -------------------------------------------------------------------------------- /doc/images/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amanjpro/greenish/HEAD/doc/images/dashboard.png -------------------------------------------------------------------------------- /src/test/resources/test-partial-period: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | shift 4 | 5 | for period in "$@"; do 6 | echo -e "greenish-period\t$period\t1" 7 | done 8 | -------------------------------------------------------------------------------- /src/test/resources/test-duplicate-period: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | for period in "$@"; do 4 | echo -e "greenish-period\t$period\t1" 5 | echo -e "greenish-period\t$period\t1" 6 | done 7 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 2 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.3") 3 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1") 4 | -------------------------------------------------------------------------------- /src/test/resources/test-ls-env:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | fst=$1 4 | 5 | shift 6 | 7 | for period in "$@"; do 8 | if ls "$GREENISH_VALUE_FOR_TEST/$fst/$period"; then 9 | echo -e "greenish-period\t$period\t1" 10 | else 11 | echo -e "greenish-period\t$period\t0" 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "greenish", 3 | "version": "1.8.0-SNAPSHOT", 4 | "devDependencies": { 5 | "babel-cli": "^6.0.0", 6 | "babel-preset-react-app": "^3.0.0" 7 | }, 8 | "dependencies": { 9 | "react-router-hash-link": "^2.0.0", 10 | "react-router-dom": "^5.0.0" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/test/resources/test-ls: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | fst=$1 4 | 5 | shift 6 | 7 | echo "LETS PRINT THINGS" 8 | 9 | for period in "$@"; do 10 | echo "DEBUG HERE TOO" 11 | if ls "$fst/$period"; then 12 | echo -e "greenish-period\t$period\t1" 13 | else 14 | echo -e "greenish-period\t$period\t0" 15 | fi 16 | done 17 | 18 | 19 | echo "DEBUG HERE" 20 | -------------------------------------------------------------------------------- /src/test/resources/ls-sleep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sleep 10 4 | 5 | fst=$1 6 | 7 | shift 8 | 9 | echo "LETS PRINT THINGS" 10 | 11 | for period in "$@"; do 12 | echo "DEBUG HERE TOO" 13 | if ls "$fst/$period"; then 14 | echo -e "greenish-period\t$period\t1" 15 | else 16 | echo -e "greenish-period\t$period\t0" 17 | fi 18 | done 19 | 20 | 21 | echo "DEBUG HERE" 22 | -------------------------------------------------------------------------------- /src/main/scala/models/Lag.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class Lag(lag: Int) 7 | object Lag { 8 | implicit val lagDecoder: Decoder[Lag] = deriveConfiguredDecoder 9 | implicit val lagEncoder: Encoder[Lag] = deriveConfiguredEncoder 10 | } 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/models/Group.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class Group ( 7 | groupId: Int, 8 | name: String, 9 | jobs: Seq[Job], 10 | ) 11 | object Group { 12 | implicit val checkGroupDecoder: Decoder[Group] = deriveConfiguredDecoder 13 | implicit val checkGroupEncoder: Encoder[Group] = deriveConfiguredEncoder 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/models/PeriodHealth.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class PeriodHealth ( 7 | period: String, 8 | ok: Boolean, 9 | ) 10 | 11 | object 
PeriodHealth { 12 | implicit val periodHealthDecoder: Decoder[PeriodHealth] = deriveConfiguredDecoder 13 | implicit val periodHealthEncoder: Encoder[PeriodHealth] = deriveConfiguredEncoder 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/models/AlertLevel.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.Codec 4 | import io.circe.generic.extras.Configuration 5 | import io.circe.generic.extras.semiauto.deriveEnumerationCodec 6 | 7 | sealed trait AlertLevel 8 | object AlertLevel { 9 | implicit val modeCodec: Codec[AlertLevel] = deriveEnumerationCodec[AlertLevel] 10 | } 11 | case object Critical extends AlertLevel 12 | case object Warn extends AlertLevel 13 | case object Normal extends AlertLevel 14 | case object Great extends AlertLevel 15 | -------------------------------------------------------------------------------- /src/main/scala/models/AlertLevels.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class AlertLevels( 7 | great: Int, 8 | normal: Int, 9 | warn: Int, 10 | critical: Int, 11 | ) 12 | object AlertLevels { 13 | implicit val alertLevelsDecoder: Decoder[AlertLevels] = deriveConfiguredDecoder 14 | implicit val alertLevelsEncoder: Encoder[AlertLevels] = deriveConfiguredEncoder 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/models/GroupStatusSummary.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class GroupStatusSummary( 7 | groupId: Int, 8 | name: String, 9 | status: Seq[JobStatusSummary], 10 | ) 11 | object GroupStatusSummary { 12 | implicit val groupStatusSummaryDecoder: Decoder[GroupStatusSummary] = deriveConfiguredDecoder 13 | implicit val groupStatusSummaryEncoder: Encoder[GroupStatusSummary] = deriveConfiguredEncoder 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/models/JobStatus.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class JobStatus ( 7 | job: Job, 8 | updatedAt: Long, 9 | periodHealth: Seq[PeriodHealth], 10 | ) { 11 | def countMissing = periodHealth.count(!_.ok) 12 | } 13 | object JobStatus { 14 | implicit val jobStatusDecoder: Decoder[JobStatus] = deriveConfiguredDecoder 15 | implicit val jobStatusEncoder: Encoder[JobStatus] = deriveConfiguredEncoder 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/models/JobStatusSummary.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class JobStatusSummary( 7 | jobId: Int, 8 | name: String, 9 | 
missing: Int, 10 | oldestMissingPeriod: Int, 11 | alertLevel: AlertLevel, 12 | ) 13 | object JobStatusSummary { 14 | implicit val jobStatusSummaryDecoder: Decoder[JobStatusSummary] = deriveConfiguredDecoder 15 | implicit val jobStatusSummaryEncoder: Encoder[JobStatusSummary] = deriveConfiguredEncoder 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/stats/messages.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.stats 2 | 3 | import akka.http.scaladsl.model.HttpMethod 4 | 5 | sealed trait Message 6 | case class IncRefresh(job: String) extends Message 7 | case class DecRefresh(job: String) extends Message 8 | case class RefreshTime(job: String, duration: Double) extends Message 9 | case class IncBadRefresh(job: String) extends Message 10 | case class MissingPeriods(job: String, num: Int) extends Message 11 | case class OldestMissingPeriod(job: String, num: Int) extends Message 12 | case class IncExpiredRefresh(job: String) extends Message 13 | case object GetPrometheus extends Message 14 | case object GetStats extends Message 15 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/time_container.jsx: -------------------------------------------------------------------------------- 1 | class TimeContainer extends React.Component { 2 | intervalID 3 | constructor(props) { 4 | super(props); 5 | this.state = { 6 | now: "" 7 | }; 8 | } 9 | 10 | componentDidMount() { 11 | this.refresh() 12 | } 13 | 14 | componentWillUnmount() { 15 | clearTimeout(this.intervalID); 16 | } 17 | 18 | refresh = () => { 19 | const d = new Date() 20 | this.setState({now: d.toUTCString()}) 21 | this.intervalID = setTimeout(this.refresh, fetchInterval); 22 | } 23 | 24 | render() { 25 | const { now } = this.state; 26 | return ( 27 | 28 | {now} 29 | 30 | ) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/checker/checker.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import me.amanj.greenish.models.PeriodHealth 4 | import java.io.File 5 | 6 | package object checker { 7 | protected[checker] def computeOldest(periodHealths: Seq[PeriodHealth]): Int = { 8 | val missingIndex = periodHealths.indexWhere(!_.ok) 9 | if(missingIndex == -1) 0 10 | else periodHealths.length - missingIndex 11 | } 12 | 13 | private[this] implicit class FileOps(p: String) { 14 | def /(c: String): String = s"$p${File.separator}$c" 15 | } 16 | 17 | def debugFile(scratchDir: File, groupId: Int, jobId: Int): String = { 18 | scratchDir.mkdirs 19 | val fileName = 20 | scratchDir.toString / s"group-$groupId-job-$jobId-stdout.txt" 21 | fileName 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | NEW_VERSION=$1 3 | NEXT_SNAPSHOT=$2 4 | 5 | git checkout master 6 | git pull 7 | 8 | echo "version in ThisBuild := \"$NEW_VERSION\"" > version.sbt 9 | jq --arg version "$NEW_VERSION" '.
+ {"version": $version}' package.json > package-tmp.json 10 | mv package-tmp.json package.json 11 | git add version.sbt 12 | git add package.json 13 | git commit -m "Bump version to $NEW_VERSION" 14 | git tag -a "$NEW_VERSION" -m "Release $NEW_VERSION" 15 | 16 | echo "version in ThisBuild := \"$NEXT_SNAPSHOT\"" > version.sbt 17 | jq --arg version "$NEXT_SNAPSHOT" '. + {"version": $version}' package.json > package-tmp.json 18 | mv package-tmp.json package.json 19 | git add version.sbt 20 | git add package.json 21 | git commit -m "Bump version to $NEXT_SNAPSHOT" 22 | 23 | git push origin HEAD --tags 24 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | jdk: openjdk8 4 | 5 | node_js: 6 | - 14 7 | 8 | scala: 9 | - 2.13.2 10 | 11 | before_install: 12 | - npm install 13 | script: 14 | - sbt ++$TRAVIS_SCALA_VERSION clean coverage test coverageReport 15 | - sbt assembly 16 | - sbt docker:publishLocal 17 | 18 | after_success: 19 | - bash <(curl -s https://codecov.io/bash) 20 | - docker build -t docker.pkg.github.com/amanjpro/greenish/greenish:$TRAVIS_TAG target/docker/stage 21 | - echo $GITHUB_RELEASES_TOKEN | docker login https://docker.pkg.github.com -u amanjpro --password-stdin 22 | - docker push docker.pkg.github.com/amanjpro/greenish/greenish:$TRAVIS_TAG 23 | 24 | deploy: 25 | provider: releases 26 | file_glob: true 27 | file: target/scala-2.13/greenish-assembly-*.jar 28 | skip_cleanup: true 29 | api_key: $GITHUB_RELEASES_TOKEN 30 | on: 31 | tags: true 32 | -------------------------------------------------------------------------------- /src/main/scala/checker/Message.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.checker 2 | 3 | import java.time.ZonedDateTime 4 | import me.amanj.greenish.models.{JobStatus, PeriodHealth} 5 | 6 | sealed trait Message 7 | case class Refresh(now: () => ZonedDateTime) extends Message 8 | case class RefreshGroup(now: () => ZonedDateTime, groupId: Int) extends Message 9 | case class RefreshJob(now: () => ZonedDateTime, groupId: Int, jobId: Int) extends Message 10 | case object MaxLag extends Message 11 | case object AllEntries extends Message 12 | case object GetMissing extends Message 13 | case object Summary extends Message 14 | case class GetJobStatus(groupId: Int, jobId: Int) extends Message 15 | case class GetGroupStatus(groupId: Int) extends Message 16 | case class BatchRun(cmd: String, periods: Seq[String], 17 | env: Seq[(String, String)], 18 | groupId: Int, jobId: Int, 19 | prometheusId: String, clockCounter: Long, 20 | expireAt: Long) extends Message 21 | case class RunResult(periodHealth: Seq[PeriodHealth], 22 | groupId: Int, jobId: Int, clockCounter: Long) extends Message 23 | -------------------------------------------------------------------------------- /src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | # Default configuration for Greenish 2 | check-groups: { 3 | refresh-in-seconds: 30 4 | binding-address: "127.0.0.1" 5 | port: 8080 6 | scratch-dir: "/tmp/greenish/stdout" 7 | default-period-check-offset: 1 8 | default-period-pattern: "yyyy-MM/dd/HH" 9 | default-job-run-frequency: "hourly" 10 | default-timezone: "UTC" 11 | default-lookback: 200 12 | default-great-at: 0 13 | default-normal-at: 1 14 | default-warn-at: 3 15 | default-error-at: 4 16 | 
default-start-at: 0 17 | env: {} 18 | } 19 | 20 | # This section is used to tune the performance of Greenish 21 | akka { 22 | # This is the thread-pool for running monitoring scripts 23 | # If Greenish is unresponsive, you should look into this. 24 | # As monitoring scripts are expected to be IO-bound, you 25 | # may want to maximize parallelism. 26 | refresh-dispatcher { 27 | type = Dispatcher 28 | executor = "thread-pool-executor" 29 | thread-pool-executor { 30 | fixed-pool-size = 100 31 | } 32 | throughput = 1 33 | mailbox-capacity = -1 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2020 Amanj Sherwany 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE.
20 | -------------------------------------------------------------------------------- /src/main/scala/models/models.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import io.circe.generic.extras.Configuration 4 | import io.circe.Json 5 | import io.circe.syntax.EncoderOps 6 | import java.lang.management.ManagementFactory 7 | 8 | package object models { 9 | private[models] implicit val customConfig: Configuration = 10 | Configuration.default.withSnakeCaseMemberNames.withDefaults 11 | .copy(transformConstructorNames = _.toLowerCase) 12 | 13 | def errorJson(str: String): Json = Json.obj ( 14 | "error" -> str.asJson 15 | ) 16 | 17 | def sysinfo(maybeNamespace: Option[String]): Json = { 18 | val maybeVersion = Option(getClass.getPackage.getImplementationVersion()) 19 | Json.obj ( 20 | "service" -> "Greenish".asJson, 21 | "namespace" -> maybeNamespace.asJson, 22 | "version" -> maybeVersion.asJson, 23 | "uptime" -> ManagementFactory.getRuntimeMXBean().getUptime().asJson, 24 | ) 25 | } 26 | 27 | def okJson(str: String): Json = Json.obj ( 28 | "ok" -> str.asJson 29 | ) 30 | 31 | def healthJson(status: Boolean): Json = Json.obj ( 32 | "health" -> (if(status) "good".asJson else "bad".asJson) 33 | ) 34 | } 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Simple Build Tool 2 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control 3 | 4 | .metals/* 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/metals.sbt 10 | project/.bloop 11 | project/boot/ 12 | project/plugins/project/ 13 | .history 14 | .bloop 15 | .cache 16 | .lib/ 17 | *.class 18 | *.log 19 | 20 | # General 21 | .DS_Store 22 | .AppleDouble 23 | .LSOverride 24 | 25 | # Icon must end with two \r 26 | Icon 27 | 28 | # Thumbnails 29 | ._* 30 | 31 | # Files that might appear in the root of a volume 32 | .DocumentRevisions-V100 33 | .fseventsd 34 | .Spotlight-V100 35 | .TemporaryItems 36 | .Trashes 37 | .VolumeIcon.icns 38 | .com.apple.timemachine.donotpresent 39 | 40 | # Directories potentially created on remote AFP share 41 | .AppleDB 42 | .AppleDesktop 43 | Network Trash Folder 44 | Temporary Items 45 | .apdisk 46 | 47 | [._]*.sw[a-p] 48 | [._]s[a-rt-v][a-z] 49 | [._]ss[a-gi-z] 50 | [._]sw[a-p] 51 | 52 | # Session 53 | Session.vim 54 | Sessionx.vim 55 | 56 | # Temporary 57 | .netrwhist 58 | *~ 59 | # Auto-generated tag files 60 | tags 61 | # Persistent undo 62 | [._]*.un~ 63 | 64 | # NPM junks 65 | node_modules/ 66 | package-lock.json 67 | -------------------------------------------------------------------------------- /src/main/scala/models/GroupStatus.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.{Encoder, Decoder} 4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 5 | 6 | case class GroupStatus( 7 | group: Group, 8 | status: Array[JobStatus], 9 | ) { 10 | def canEqual(a: Any) = a.isInstanceOf[GroupStatus] 11 | 12 | override def equals(that: Any): Boolean = 13 | that match { 14 | case that: GroupStatus => { 15 | that.canEqual(this) && 16 | this.group == that.group && 17 | this.status.sameElements(that.status) 18 | } 19 | case _ => false 20 | } 21 | 22 | override def hashCode: Int = { 23 | val prime = 31 24 | var result = 1 25 
| result = prime * result + (if (group == null) 0 else group.hashCode) 26 | result = prime * result + (if (status == null) 0 else status.toVector.hashCode) 27 | result 28 | } 29 | 30 | override def toString: String = { 31 | s"GroupStatus($group, ${status.mkString("Array(", ", ", ")")})" 32 | } 33 | } 34 | 35 | object GroupStatus { 36 | implicit val groupStatusDecoder: Decoder[GroupStatus] = deriveConfiguredDecoder 37 | implicit val groupStatusEncoder: Encoder[GroupStatus] = deriveConfiguredEncoder 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/version_container.jsx: -------------------------------------------------------------------------------- 1 | class VersionContainer extends React.Component { 2 | constructor(props) { 3 | super(props); 4 | this.state = { 5 | error: null, 6 | isLoaded: false, 7 | version: null 8 | }; 9 | } 10 | 11 | componentDidMount() { 12 | this.fetchData() 13 | } 14 | 15 | fetchData = () => { 16 | fetch(`/system`) 17 | .then(res => res.json()) 18 | .then( 19 | (info) => { 20 | this.setState({ 21 | isLoaded: true, 22 | version: info.version 23 | }); 24 | }, 25 | // Note: it's important to handle errors here 26 | // instead of a catch() block so that we don't swallow 27 | // exceptions from actual bugs in components. 28 | (error) => { 29 | this.setState({ 30 | isLoaded: true, 31 | error 32 | }); 33 | } 34 | ) 35 | } 36 | 37 | render() { 38 | const { error, isLoaded, version} = this.state; 39 | if (error) { 40 | return ( 41 | <div>Error: {error.message}</div> 42 | ) 43 | } else if (!isLoaded) { 44 | return ( 45 | <div>Loading...</div> 46 | ) 47 | } else { 48 | return ( 49 | <div>Version {version}</div> 50 | ) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Greenish dashboard 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/scala/models/Job.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import java.time.ZonedDateTime 4 | import java.time.format.DateTimeFormatter 5 | import java.time.ZoneId 6 | import io.circe.{Encoder, Decoder, HCursor, Json} 7 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder} 8 | 9 | case class Job( 10 | jobId: Int, 11 | name: String, 12 | owner: Option[String], 13 | prometheusId: String, 14 | cmd: String, 15 | timePattern: String, 16 | frequency: CheckFrequency, 17 | periodCheckOffset: Int, 18 | timezone: ZoneId, 19 | lookback: Int, 20 | startAt: Long, 21 | alertLevels: AlertLevels, 22 | info: Option[String], 23 | env: Seq[EnvVar] 24 | ) { 25 | val timeFormat = DateTimeFormatter.ofPattern(timePattern) 26 | } 27 | 28 | object Job { 29 | implicit val zoneIdEncoder: Encoder[ZoneId] = 30 | new Encoder[ZoneId] { 31 | final def apply(zid: ZoneId): Json = Json.obj( 32 | ("zone_id", Json.fromString(zid.getId)) 33 | ) 34 | } 35 | implicit val zoneIdDecoer: Decoder[ZoneId] = new Decoder[ZoneId] { 36 | final def apply(c: HCursor): Decoder.Result[ZoneId] = 37 | for { 38 | zoneId <- c.downField("zone_id").as[String] 39 | } yield ZoneId.of(zoneId) 40 | } 41 | 42 | implicit val jobDecoder: Decoder[Job] = deriveConfiguredDecoder 43 | implicit val jobEncoder: Encoder[Job] = deriveConfiguredEncoder 44 | } 45 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/namespace_container.jsx: -------------------------------------------------------------------------------- 1 | class NamespaceContainer extends React.Component { 2 | constructor(props) { 3 | super(props); 4 | this.state = { 5 | error: null, 6 | isLoaded: false, 7 | namespace: null 8 | }; 9 | } 10 | 11 | componentDidMount() { 12 | this.fetchData() 13 | } 14 | 15 | fetchData = () => { 16 | fetch(`/system`) 17 | .then(res => res.json()) 18 | .then( 19 | (info) => { 20 | if('namespace' in info) { 21 | this.setState({ 22 | isLoaded: true, 23 | namespace: info.namespace 24 | }); 25 | } else { 26 | this.setState({ 27 | isLoaded: true 28 | }); 29 | } 30 | }, 31 | // Note: it's important to handle errors here 32 | // instead of a catch() block so that we don't swallow 33 | // exceptions from actual bugs in components. 34 | (error) => { 35 | this.setState({ 36 | isLoaded: true, 37 | error 38 | }); 39 | } 40 | ) 41 | } 42 | 43 | render() { 44 | const { error, isLoaded, namespace} = this.state; 45 | if (error) { 46 | return (Error: {error.message}) 47 | } else if (!isLoaded) { 48 | return (Loading...) 
49 | } else { 50 | return (namespace != null?{namespace}:null) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/state_container.jsx: -------------------------------------------------------------------------------- 1 | class StateContainer extends React.Component { 2 | intervalID 3 | constructor(props) { 4 | super(props); 5 | this.state = { 6 | error: null, 7 | isLoaded: false, 8 | groups: [], 9 | }; 10 | } 11 | 12 | componentDidMount() { 13 | this.fetchData() 14 | } 15 | 16 | componentWillUnmount() { 17 | clearTimeout(this.intervalID); 18 | } 19 | 20 | fetchData = () => { 21 | fetch(`/${this.props.endpoint}`) 22 | .then(res => res.json()) 23 | .then( 24 | (groups) => { 25 | this.setState({ 26 | isLoaded: true, 27 | groups: groups 28 | }); 29 | }, 30 | // Note: it's important to handle errors here 31 | // instead of a catch() block so that we don't swallow 32 | // exceptions from actual bugs in components. 33 | (error) => { 34 | this.setState({ 35 | isLoaded: true, 36 | error 37 | }); 38 | } 39 | ) 40 | this.intervalID = setTimeout(this.fetchData, fetchInterval); 41 | } 42 | 43 | render() { 44 | const { error, isLoaded, groups } = this.state; 45 | if (error) { 46 | return ( 47 |
<div>Error: {error.message}</div>
48 | ) 49 | } else if (!isLoaded) { 50 | return ( 51 |
<div>Loading...</div>
52 | ) 53 | } else { 54 | return ( 55 | <div className="grid-container">
56 | {renderState(groups, this.props.endpoint, 'grid-item')} 57 | </div>
58 | ) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/test/scala/models/EnvVarSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import org.scalatest.matchers.should.Matchers 4 | import org.scalatest.wordspec.AnyWordSpecLike 5 | 6 | class EnvVarSpec() extends Matchers with AnyWordSpecLike { 7 | "EnvVar.apply" must { 8 | "create PlainEnvVar when secure flag is not provided" in { 9 | val (name, value) = ("username", "Homa") 10 | val expected = PlainEnvVar(name, value) 11 | val actual = EnvVar(name, value) 12 | actual shouldBe expected 13 | } 14 | 15 | "create SecureEnvVar when secure flag is provided" in { 16 | val (name, value) = ("username", "secure(Homa)") 17 | val expected = SecureEnvVar(name, "Homa".toSeq) 18 | val actual = EnvVar(name, value) 19 | actual shouldBe expected 20 | } 21 | 22 | "create SecureEnvVar when secure flag is provided but value is empty" in { 23 | val (name, value) = ("username", "secure()") 24 | val expected = SecureEnvVar(name, "".toSeq) 25 | val actual = EnvVar(name, value) 26 | actual shouldBe expected 27 | } 28 | } 29 | 30 | "EnvVar.tupled" must { 31 | "work for secure variables" in { 32 | val (name, value) = ("username", "secure(Homa)") 33 | val origin = EnvVar(name, value) 34 | val expected = (name, "Homa") 35 | val actual = origin.tupled 36 | actual shouldBe expected 37 | } 38 | 39 | "work for plain variables" in { 40 | val (name, value) = ("username", "Homa") 41 | val origin = EnvVar(name, value) 42 | val expected = (name, value) 43 | val actual = origin.tupled 44 | actual shouldBe expected 45 | } 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/test/scala/models/GroupStatusSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import org.scalatest.matchers.should.Matchers 4 | import org.scalatest.wordspec.AnyWordSpecLike 5 | import java.time.ZoneId 6 | 7 | class GroupStatusSpec() extends Matchers 8 | with AnyWordSpecLike { 9 | 10 | val job1 = Job(1, "job1", None, "p1", "foo", 11 | "yyyy-MM-dd-HH", Hourly, 1, ZoneId.of("UTC"), 12 | 4, 0, AlertLevels(0, 1, 2, 3), None, Seq(EnvVar("a", "b")) 13 | ) 14 | 15 | val job2 = Job(2, "job2", None, "p2", "bar", 16 | "yyyy-MM-dd-HH", Hourly, 1, ZoneId.of("UTC"), 17 | 4, 0, AlertLevels(0, 1, 2, 3), None, Seq(EnvVar("a", "secure(b)")) 18 | ) 19 | 20 | val group1 = Group(0, "group1", Seq(job1)) 21 | val group2 = Group(1, "group2", Seq(job2)) 22 | 23 | val gs1 = GroupStatus(group1, Array(JobStatus(job1, -1, Seq.empty))) 24 | val gs1Copy = GroupStatus(group1, Array(JobStatus(job1, -1, Seq.empty))) 25 | val gs2 = GroupStatus(group2, Array(JobStatus(job2, -1, Seq.empty))) 26 | 27 | "equals" must { 28 | "work if that is null" in { 29 | val actual = gs1 == null 30 | actual shouldBe false 31 | } 32 | 33 | "work if that is this" in { 34 | val actual = gs1 == gs1 35 | actual shouldBe true 36 | } 37 | 38 | "work if that is a clone of this" in { 39 | val actual = gs1 == gs1Copy 40 | actual shouldBe true 41 | } 42 | 43 | "not be equal to non-GroupStatus objects" in { 44 | val actual = gs1 == job1 45 | actual shouldBe false 46 | } 47 | } 48 | 49 | "hashCode" must { 50 | "be predictive" in { 51 | val actual = gs1.## == gs1.## 52 | actual shouldBe true 53 | } 54 | 55 | "produce the same value for equivalent objects" in { 56 | val actual = gs1.## 
== gs1Copy.## 57 | actual shouldBe true 58 | } 59 | 60 | "produce different values for different objects" in { 61 | val actual = gs1.## == gs2.## 62 | actual shouldBe false 63 | } 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/common_lib.jsx: -------------------------------------------------------------------------------- 1 | const fetchInterval = 5000 2 | 3 | function renderState(groups, subClassName, keyPrefix) { 4 | return ( 5 | groups.map(groupStatus => renderGroup(groupStatus, subClassName, keyPrefix, "")) 6 | ); 7 | } 8 | 9 | function encloseInTable(trs, gid, keyPrefix) { 10 | return ( 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | { trs } 19 | 20 |
<th>Job name</th><th>Data set period</th><th>Last updated</th>
21 | ); 22 | } 23 | 24 | function renderGroup(groupStatus, subClassName, keyPrefix, sub) { 25 | const group = groupStatus.group; 26 | const gid = group.group_id; 27 | const jobs = groupStatus["status"].map(jobStatus => ( 28 | renderJob(jobStatus, gid, keyPrefix))) 29 | return ( 30 |
31 |

{group.name}{sub}

32 | {encloseInTable(jobs, keyPrefix, gid)} 33 |
); 34 | } 35 | 36 | function renderJob(jobStatus, gid, keyPrefix) { 37 | if(jobStatus["period_health"] != undefined) { 38 | const job = jobStatus.job; 39 | const jid = job.job_id; 40 | const date = new Date(jobStatus.updated_at).toUTCString(); 41 | return(jobStatus.period_health.map((ph, i) => ( 42 | 44 | {job.name} 45 | {ph.period} 46 | {date} 47 | 48 | ))); 49 | } 50 | } 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/group_container.jsx: -------------------------------------------------------------------------------- 1 | const Link = ReactRouterDOM.Link; 2 | 3 | class GroupContainer extends React.Component { 4 | intervalID 5 | constructor(props) { 6 | super(props); 7 | this.state = { 8 | error: null, 9 | isLoaded: false, 10 | group: null 11 | }; 12 | this.handleBack = this.handleBack.bind(this); 13 | } 14 | 15 | componentWillUnmount() { 16 | clearTimeout(this.intervalID); 17 | } 18 | 19 | componentDidMount() { 20 | this.fetchData() 21 | } 22 | 23 | fetchData = () => { 24 | fetch(`/group/${this.props.group}`) 25 | .then(res => res.json()) 26 | .then( 27 | (group) => { 28 | this.setState({ 29 | isLoaded: true, 30 | group: group 31 | }); 32 | }, 33 | // Note: it's important to handle errors here 34 | // instead of a catch() block so that we don't swallow 35 | // exceptions from actual bugs in components. 36 | (error) => { 37 | this.setState({ 38 | isLoaded: true, 39 | error 40 | }); 41 | } 42 | ); 43 | this.intervalID = setTimeout(this.fetchData, fetchInterval); 44 | } 45 | 46 | handleBack() { 47 | this.props.handler("main", null, null); 48 | } 49 | 50 | render() { 51 | const { error, isLoaded, group } = this.state; 52 | if (error) { 53 | return ( 54 |
<div>Error: {error.message}</div>
55 | ) 56 | } else if (!isLoaded) { 57 | return ( 58 |
<div>Loading...</div>
59 | ) 60 | } else { 61 | const sub = ( 62 | 63 | `${loc.pathname}?page=main`} 64 | className="link" onClick={this.handleBack}> 65 |  See main dashboard 66 | 67 | 68 | ) 69 | return ( 70 |
71 | {renderGroup(group, 'group-view', 'grid-item', sub)} 72 |
73 | ) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/App.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import akka.actor.{ActorSystem, Props} 4 | import akka.http.scaladsl.Http 5 | import scala.language.postfixOps 6 | import akka.stream.ActorMaterializer 7 | import scala.concurrent.duration._ 8 | import java.time.ZonedDateTime 9 | import checker.{StatusChecker, Refresh} 10 | import endpoints.Routes 11 | import akka.event.{Logging, LogSource} 12 | 13 | object App { 14 | 15 | private[this] implicit val system = ActorSystem("greenish-system") 16 | private[this] implicit val executionContext = system.dispatcher 17 | private[this] val schedulerActor = system.actorOf(Props.empty) 18 | 19 | implicit val logSource: LogSource[AnyRef] = new LogSource[AnyRef] { 20 | def genString(o: AnyRef): String = o.getClass.getName 21 | override def getClazz(o: AnyRef): Class[_] = o.getClass 22 | } 23 | 24 | private[this] val logger = Logging(system, this) 25 | 26 | def main(args: Array[String]): Unit = { 27 | 28 | val appConfig = AppConfig() 29 | 30 | val statsActor = system.actorOf( 31 | Props(new stats.StatsCollector(getPrometheusIds(appConfig)))) 32 | 33 | val statusChecker = system.actorOf( 34 | Props(new StatusChecker(appConfig.groups, statsActor, 35 | appConfig.refreshInSeconds * 3, appConfig.scratchDir))) 36 | 37 | system.scheduler.scheduleWithFixedDelay( 38 | 0 seconds, 39 | appConfig.refreshInSeconds seconds, 40 | statusChecker, Refresh(() => ZonedDateTime.now())) 41 | 42 | val bindingFuture = Http() 43 | .bindAndHandle( 44 | new Routes(appConfig.namespace, appConfig.scratchDir, statusChecker, 45 | statsActor, 46 | // There should be at least one good run within the last 5 refresh sets 47 | appConfig.refreshInSeconds * 1000 * 5).routes, 48 | appConfig.address, appConfig.port) 49 | 50 | println(s"Server online at http://${appConfig.address}:${appConfig.port}...") 51 | } 52 | 53 | def getPrometheusIds(appConfig: AppConfig): Set[String] = { 54 | val prometheusIds = appConfig.groups.flatMap ( g => 55 | g.jobs.map(j => j.prometheusId)) 56 | 57 | val prometheusIdsSet = prometheusIds.toSet 58 | if(prometheusIdsSet.size < prometheusIds.size) { 59 | logger.warning( 60 | "prometheus-id should be unique across the entire configuration") 61 | } 62 | 63 | prometheusIdsSet 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/test/scala/checker/CheckerSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.checker 2 | 3 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} 4 | import org.scalatest.matchers.should.Matchers 5 | import org.scalatest.wordspec.AnyWordSpecLike 6 | import me.amanj.greenish.models.PeriodHealth 7 | 8 | class CheckerSpec() extends AnyWordSpecLike with Matchers { 9 | 10 | "computeOldest" must { 11 | "work for empty period health lists" in { 12 | val periods = Seq.empty[PeriodHealth] 13 | val actual = computeOldest(periods) 14 | val expected = 0 15 | actual shouldBe expected 16 | } 17 | 18 | "work when the first period is missing" in { 19 | val periods = Seq( 20 | PeriodHealth("kaka", false), 21 | PeriodHealth("kaka", true), 22 | PeriodHealth("kaka", true), 23 | PeriodHealth("kaka", true), 24 | ) 25 | val actual = computeOldest(periods) 26 | val expected = 4 27 | actual shouldBe expected 28 | } 29 | 30 | "work
when a middle period is missing" in { 31 | val periods = Seq( 32 | PeriodHealth("kaka", true), 33 | PeriodHealth("kaka", false), 34 | PeriodHealth("kaka", true), 35 | PeriodHealth("kaka", true), 36 | ) 37 | val actual = computeOldest(periods) 38 | val expected = 3 39 | actual shouldBe expected 40 | } 41 | 42 | "work when the last period is missing" in { 43 | val periods = Seq( 44 | PeriodHealth("kaka", true), 45 | PeriodHealth("kaka", true), 46 | PeriodHealth("kaka", true), 47 | PeriodHealth("kaka", false), 48 | ) 49 | val actual = computeOldest(periods) 50 | val expected = 1 51 | actual shouldBe expected 52 | } 53 | 54 | "work when more than a period is missing" in { 55 | val periods = Seq( 56 | PeriodHealth("kaka", true), 57 | PeriodHealth("kaka", false), 58 | PeriodHealth("kaka", true), 59 | PeriodHealth("kaka", false), 60 | ) 61 | val actual = computeOldest(periods) 62 | val expected = 3 63 | actual shouldBe expected 64 | } 65 | 66 | "work when no period is missing" in { 67 | val periods = Seq( 68 | PeriodHealth("kaka", true), 69 | PeriodHealth("kaka", true), 70 | PeriodHealth("kaka", true), 71 | PeriodHealth("kaka", true), 72 | ) 73 | val actual = computeOldest(periods) 74 | val expected = 0 75 | actual shouldBe expected 76 | } 77 | } 78 | } 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/main/scala/models/CheckFrequency.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import java.time.ZonedDateTime 4 | import com.cronutils.model.time.ExecutionTime 5 | import com.cronutils.model.CronType.UNIX 6 | import com.cronutils.parser.CronParser 7 | import com.cronutils.model.definition.CronDefinitionBuilder 8 | import io.circe.{Printer, Decoder, Encoder, HCursor, Json} 9 | import io.circe.syntax.EncoderOps 10 | import io.circe.generic.extras.semiauto.{ 11 | deriveEnumerationCodec, deriveConfiguredDecoder, deriveConfiguredEncoder} 12 | 13 | sealed trait CheckFrequency { 14 | def prev(date: ZonedDateTime): ZonedDateTime 15 | } 16 | 17 | object CheckFrequency { 18 | implicit val freqDecoder: Decoder[CheckFrequency] = new Decoder[CheckFrequency] { 19 | final def apply(obj: HCursor): Decoder.Result[CheckFrequency] = { 20 | obj.as[String].map { 21 | case "hourly" => Right(Hourly) 22 | case "daily" => Right(Daily) 23 | case "monthly" => Right(Monthly) 24 | case "annually" => Right(Annually) 25 | }.getOrElse(obj.as[Cron]) 26 | } 27 | } 28 | 29 | implicit val freqEncoder: Encoder[CheckFrequency] = Encoder.instance { 30 | case Hourly => "hourly".asJson 31 | case Daily => "daily".asJson 32 | case Monthly => "monthly".asJson 33 | case Annually => "annually".asJson 34 | case other: Cron => other.asJson 35 | } 36 | } 37 | 38 | case class Cron(pattern: String) extends CheckFrequency { 39 | private[this] val parser = new CronParser( 40 | CronDefinitionBuilder.instanceDefinitionFor(UNIX)) 41 | private[this] val executionTime = ExecutionTime.forCron( 42 | parser.parse(pattern)) 43 | 44 | def prev(date: ZonedDateTime): ZonedDateTime = 45 | executionTime.lastExecution(date).get() 46 | } 47 | object Cron { 48 | implicit val cronDecoder: Decoder[Cron] = deriveConfiguredDecoder 49 | implicit val checkGroupEncoder: Encoder[Cron] = deriveConfiguredEncoder 50 | } 51 | 52 | case object Hourly extends CheckFrequency { 53 | def prev(date: ZonedDateTime): ZonedDateTime = 54 | date 55 | .minusHours(1L) 56 | } 57 | 58 | case object Daily extends CheckFrequency { 59 | def prev(date: 
ZonedDateTime): ZonedDateTime = 60 | date 61 | .minusDays(1L) 62 | } 63 | 64 | case object Monthly extends CheckFrequency { 65 | def prev(date: ZonedDateTime): ZonedDateTime = 66 | date 67 | .minusMonths(1L) 68 | } 69 | 70 | case object Annually extends CheckFrequency { 71 | def prev(date: ZonedDateTime): ZonedDateTime = 72 | date 73 | .minusYears(1L) 74 | } 75 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/job_container.jsx: -------------------------------------------------------------------------------- 1 | const Link = ReactRouterDOM.Link; 2 | 3 | class JobContainer extends React.Component { 4 | intervalID 5 | constructor(props) { 6 | super(props); 7 | this.state = { 8 | error: null, 9 | isLoaded: false, 10 | job: null 11 | }; 12 | this.handleBack = this.handleBack.bind(this); 13 | } 14 | 15 | componentDidMount() { 16 | this.fetchData() 17 | } 18 | 19 | componentWillUnmount() { 20 | clearTimeout(this.intervalID); 21 | } 22 | 23 | fetchData = () => { 24 | fetch(`/group/${this.props.group}/job/${this.props.job}`) 25 | .then(res => res.json()) 26 | .then( 27 | (job) => { 28 | this.setState({ 29 | isLoaded: true, 30 | job: job 31 | }); 32 | }, 33 | // Note: it's important to handle errors here 34 | // instead of a catch() block so that we don't swallow 35 | // exceptions from actual bugs in components. 36 | (error) => { 37 | this.setState({ 38 | isLoaded: true, 39 | error 40 | }); 41 | } 42 | ) 43 | this.intervalID = setTimeout(this.fetchData, fetchInterval); 44 | } 45 | 46 | handleBack() { 47 | this.props.handler("main", null, null); 48 | } 49 | 50 | render() { 51 | const { error, isLoaded, job } = this.state; 52 | if (error) { 53 | return ( 54 |
<div>Error: {error.message}</div>
55 | ) 56 | } else if (!isLoaded) { 57 | return ( 58 |
<div>Loading...</div>
59 | ) 60 | } else { 61 | const jobs = renderJob(job, this.props.group, 'job-view') 62 | return ( 63 |
64 |

65 | {job.job.name}  66 | 67 | `${loc.pathname}?page=main`} 68 | onClick={this.handleBack} 69 | className="link"> 70 | See main dashboard 71 | 72 | 73 |

74 | {"owner" in job.job?
:
} 75 | {"info" in job.job?
:
} 76 |
77 | stdout 79 |
80 | {encloseInTable(jobs, 'job-view', this.props.group)} 81 |
82 | ) 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #233502; 3 | color: darkolivegreen; 4 | padding: 0px; 5 | margin: 0px; 6 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 7 | } 8 | 9 | td { 10 | margin: 5px; 11 | padding: 5px; 12 | } 13 | 14 | th { 15 | margin: 5px; 16 | padding: 5px; 17 | } 18 | 19 | table { 20 | border-collapse: collapse; 21 | width: 100%; 22 | text-align: left; 23 | } 24 | 25 | .grid-container { 26 | display: grid; 27 | grid-gap: 50px 100px; 28 | grid-template-columns: repeat(3, 1fr); 29 | justify-content: center; 30 | } 31 | 32 | .grid-item { 33 | vertical-align: top; 34 | } 35 | 36 | .grid-container-detail { 37 | display: grid; 38 | grid-gap: 50px 100px; 39 | grid-template-columns: repeat(2, 1fr); 40 | justify-content: center; 41 | } 42 | 43 | .detail-div { 44 | padding: 10px; 45 | background-color: #f3f2f2 46 | } 47 | 48 | .summary-div { 49 | padding: 10px; 50 | background-color: #e8ece9; 51 | } 52 | 53 | .great { 54 | background-color: #c9daa7; 55 | color: #345f34; 56 | } 57 | 58 | .normal { 59 | background-color: lightblue; 60 | color: steelblue; 61 | } 62 | 63 | .warn { 64 | background-color: palegoldenrod; 65 | color: sienna; 66 | } 67 | 68 | .critical { 69 | background-color: salmon; 70 | color: maroon; 71 | } 72 | 73 | .link { 74 | color: steelblue; 75 | text-decoration: none; 76 | } 77 | 78 | .link:hover { 79 | cursor: pointer 80 | } 81 | 82 | a { 83 | color: steelblue; 84 | text-decoration: none; 85 | } 86 | 87 | a:hover { 88 | cursor: pointer 89 | } 90 | 91 | .detail-box { 92 | background: whitesmoke; 93 | padding: 10px; 94 | box-shadow: 5px 5px lightgray; 95 | } 96 | 97 | .summary-box { 98 | background: #f6fdfc; 99 | padding: 10px; 100 | box-shadow: 5px 5px lightgray; 101 | } 102 | 103 | .header-div { 104 | width: 100%; 105 | margin-right: 8px; 106 | height: 100px; 107 | } 108 | 109 | .header-left { 110 | float: left; 111 | } 112 | 113 | .header-right { 114 | float: right; 115 | margin-top: 65px; 116 | } 117 | 118 | .namespace-span { 119 | margin-left: 16px; 120 | line-height: 100px; 121 | vertical-align: top; 122 | } 123 | 124 | .owner-div { 125 | margin-bottom: 8px; 126 | } 127 | 128 | .info-div { 129 | margin-bottom: 8px; 130 | } 131 | 132 | .stdout-div { 133 | margin-bottom: 20px; 134 | } 135 | 136 | .greenish-header { 137 | margin: 8px; 138 | color: snow; 139 | } 140 | 141 | .version-div { 142 | text-align: right; 143 | font-size: x-small; 144 | margin-right:4px; 145 | margin-top:4px; 146 | margin-bottom:4px; 147 | color: snow; 148 | } 149 | 150 | .time-div { 151 | text-align: right; 152 | margin-right:4px; 153 | color: snow; 154 | } 155 | -------------------------------------------------------------------------------- /src/main/scala/models/EnvVar.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.models 2 | 3 | import io.circe.syntax.EncoderOps 4 | import io.circe.{Encoder, Decoder, HCursor, Json} 5 | 6 | sealed trait EnvVar { 7 | type T <: AnyRef 8 | def name: String 9 | def value: T 10 | def tupled: (String, String) 11 | } 12 | object EnvVar { 13 | private[this] val pattern = """secure\((.*)\)""".r 14 | def apply(key: String, value: String): EnvVar = { 15 | value match { 16 | case pattern(v) => SecureEnvVar(key, v.toSeq) 17 | case _ => PlainEnvVar(key, 
value) 18 | } 19 | } 20 | 21 | implicit val envVarDecoer: Decoder[EnvVar] = new Decoder[EnvVar] { 22 | final def apply(obj: HCursor): Decoder.Result[EnvVar] = { 23 | obj.downField("type").as[String].flatMap { 24 | case "secure" => obj.as[SecureEnvVar] 25 | case "plain" => obj.as[PlainEnvVar] 26 | } 27 | } 28 | } 29 | 30 | implicit val envVarEncoder: Encoder[EnvVar] = Encoder.instance { 31 | case sec: SecureEnvVar => sec.asJson 32 | case plain: PlainEnvVar => plain.asJson 33 | } 34 | } 35 | 36 | private[models] case class SecureEnvVar(name: String, value: Seq[Char]) extends EnvVar { 37 | type T = Seq[Char] 38 | def tupled: (String, String) = (name, value.mkString("")) 39 | } 40 | 41 | private[models] object SecureEnvVar { 42 | val HIDDEN_PASSWORD = "****" 43 | implicit val secureEnvVarEncoder: Encoder[SecureEnvVar] = 44 | new Encoder[SecureEnvVar] { 45 | final def apply(v: SecureEnvVar): Json = Json.obj( 46 | ("type", Json.fromString("secure")), 47 | ("name", Json.fromString(v.name)), 48 | ("value", Json.fromString(HIDDEN_PASSWORD)), 49 | ) 50 | } 51 | 52 | implicit val secureEnvVarDecoder: Decoder[SecureEnvVar] = new Decoder[SecureEnvVar] { 53 | final def apply(c: HCursor): Decoder.Result[SecureEnvVar] = 54 | c.downField("type").as[String].flatMap { 55 | case "secure" => 56 | for { 57 | name <- c.downField("name").as[String] 58 | value <- c.downField("value").as[String].map(_.toSeq) 59 | } yield SecureEnvVar(name, value) 60 | } 61 | } 62 | } 63 | 64 | private[models] case class PlainEnvVar(name: String, value: String) extends EnvVar { 65 | type T = String 66 | def tupled: (String, String) = (name, value) 67 | } 68 | private[models] object PlainEnvVar { 69 | implicit val plainEnvVarEncoder: Encoder[PlainEnvVar] = 70 | new Encoder[PlainEnvVar] { 71 | final def apply(v: PlainEnvVar): Json = Json.obj( 72 | ("type", Json.fromString("plain")), 73 | ("name", Json.fromString(v.name)), 74 | ("value", Json.fromString(v.value)), 75 | ) 76 | } 77 | 78 | implicit val secureEnvVarDecoder: Decoder[PlainEnvVar] = new Decoder[PlainEnvVar] { 79 | final def apply(c: HCursor): Decoder.Result[PlainEnvVar] = 80 | c.downField("type").as[String].flatMap { 81 | case "plain" => 82 | for { 83 | name <- c.downField("name").as[String] 84 | value <- c.downField("value").as[String] 85 | } yield PlainEnvVar(name, value) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/main_container.jsx: -------------------------------------------------------------------------------- 1 | const e = React.createElement; 2 | const Link = ReactRouterDOM.Link; 3 | 4 | class MainContainer extends React.Component { 5 | constructor(props) { 6 | super(props); 7 | this.state = { 8 | page: props.page, 9 | gid: props.group, 10 | jid: props.job 11 | } 12 | this.handler = this.handler.bind(this); 13 | this.renderMain = this.renderMain.bind(this); 14 | } 15 | 16 | renderMain(page, gid, jid, handler) { 17 | if (page == 'state') { 18 | return ( 19 |
20 |

All data sets  21 | 22 | `${loc.pathname}?page=main`} 23 | onClick={() => this.setState({page:"main"})} className="link"> 24 | See main dashboard 25 | 26 | 27 |

28 | 29 |
30 | ) 31 | } else if(page == 'group'){ 32 | return( 33 |
34 | 35 |
36 | ) 37 | } else if(page == 'job'){ 38 | return( 39 |
40 | 41 |
42 | ) 43 | } else { // page == 'main' 44 | return( 45 |
46 |
47 |

Summary

48 | 49 |

50 |
51 |
52 |

Detailed missing periods  53 | 54 | `${loc.pathname}?page=state`} 55 | onClick={() => this.setState({page:"state"})} className="link"> 56 | See all periods 57 | 58 | 59 |

60 | 61 |
62 |
63 | ) 64 | } 65 | } 66 | 67 | handler(page, gid, jid) { 68 | 69 | this.setState({ 70 | page: page, 71 | gid: gid, 72 | jid: jid, 73 | }) 74 | } 75 | 76 | render() { 77 | return ( 78 |
79 |
80 |
81 |

82 | 83 | 84 |

85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 | {this.renderMain(this.state.page, this.state.gid, this.state.jid)} 93 |
94 |
95 | ) 96 | } 97 | } 98 | 99 | const domContainer = document.querySelector('#main_container'); 100 | const BrowserRouter = ReactRouterDOM.BrowserRouter; 101 | const Route = ReactRouterDOM.Route; 102 | const useLocation = ReactRouterDOM.useLocation; 103 | 104 | function useQuery() { 105 | return new URLSearchParams(useLocation().search); 106 | } 107 | 108 | function ShowPage() { 109 | let query = useQuery(); 110 | let page = query.get("page"); 111 | let gid = query.get("gid"); 112 | let jid = query.get("jid"); 113 | return (); 114 | } 115 | ReactDOM.render( 116 | , 117 | domContainer 118 | ); 119 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/summary_container.jsx: -------------------------------------------------------------------------------- 1 | const Link = ReactRouterDOM.Link; 2 | 3 | class SummaryContainer extends React.Component { 4 | intervalID 5 | constructor(props) { 6 | super(props); 7 | this.state = { 8 | error: null, 9 | isLoaded: false, 10 | items: [] 11 | }; 12 | this.handleGroupClick = this.handleGroupClick.bind(this); 13 | this.handleJobClick = this.handleJobClick.bind(this); 14 | } 15 | 16 | componentDidMount() { 17 | this.fetchData() 18 | } 19 | 20 | componentWillUnmount() { 21 | clearTimeout(this.intervalID); 22 | } 23 | 24 | fetchData = () => { 25 | fetch("/summary") 26 | .then(res => res.json()) 27 | .then( 28 | (items) => { 29 | this.setState({ 30 | isLoaded: true, 31 | items: items 32 | }); 33 | }, 34 | // Note: it's important to handle errors here 35 | // instead of a catch() block so that we don't swallow 36 | // exceptions from actual bugs in components. 37 | (error) => { 38 | this.setState({ 39 | isLoaded: true, 40 | error 41 | }); 42 | } 43 | ) 44 | this.intervalID = setTimeout(this.fetchData, fetchInterval); 45 | } 46 | 47 | handleGroupClick(gid) { 48 | this.props.handler("group", gid, null); 49 | } 50 | 51 | handleJobClick(gid, jid) { 52 | this.props.handler("job", gid, jid); 53 | } 54 | 55 | render() { 56 | const { error, isLoaded, items } = this.state; 57 | if (error) { 58 | return ( 59 |
<div>Error: {error.message}</div>
60 | ) 61 | } else if (!isLoaded) { 62 | return ( 63 |
<div>Loading...</div>
64 | ) 65 | } else { 66 | return ( 67 |
68 | { 69 | items.map(group => { 70 | const gid = group.group_id; 71 | return ( 72 |
73 |

74 | `${loc.pathname}?page=group&gid=${gid}`} 75 | onClick={() => {this.handleGroupClick(gid)}} className="link"> 76 | {group.name} 77 | 78 |

79 | 80 | 81 | 82 | 83 | 84 | 85 | { 86 | group["status"].map(job =>{ 87 | const jid = job.job_id; 88 | return( 89 | 90 | 96 | 98 | 99 | ) 100 | } 101 | )} 102 | 103 |
<th>Job name</th><th># Missing data sets</th>
91 | `${loc.pathname}?page=job&gid=${gid}&jid=${jid}`} 92 | onClick={() => {this.handleJobClick(gid, jid)}} className="link"> 93 | {job.name} 94 | 95 | {job.missing}
104 |
105 | ) 106 | }) 107 | } 108 |
109 | ) 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/test/scala/AppSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import org.scalatest.matchers.should.Matchers 4 | import org.scalatest.wordspec.AnyWordSpecLike 5 | import models._ 6 | import java.time.ZoneId 7 | import java.io.File 8 | 9 | class AppSpec() extends Matchers 10 | with AnyWordSpecLike { 11 | "getPrometheusIds" must { 12 | "work when there are duplicate IDs" in { 13 | val config = new AppConfig( 14 | Seq( 15 | Group(0, "Group1", Seq( 16 | Job(0, "Job1", None, "job_1", "/tmp/first_script", 17 | "yyyy-MM-dd-HH", Hourly, 3, 18 | ZoneId.of("UTC"), 24, 0, 19 | AlertLevels(0, 1, 2, 3), 20 | None, 21 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 22 | ), 23 | Job(1, "Job2", None, "job_1", "/tmp/second_script job2", 24 | "yyyy-MM-dd-HH", Daily, 2, 25 | ZoneId.of("UTC"), 24, 0, 26 | AlertLevels(0, 1, 2, 3), 27 | None, 28 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 29 | ), 30 | )), 31 | Group(1, "Group2", Seq( 32 | Job(0, "Job3", None, "job_2", "/tmp/third_script", 33 | "yyyy-MM-dd", Monthly, 1, 34 | ZoneId.of("UTC"), 3, 0, 35 | AlertLevels(0, 1, 2, 3), 36 | None, 37 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 38 | ), 39 | Job(1, "Job4", None, "job_2", "/tmp/fourth_script", 40 | "yyyy-01-01", Annually, 1, 41 | ZoneId.of("UTC"), 3, 0, 42 | AlertLevels(0, 1, 2, 3), 43 | None, 44 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 45 | ), 46 | )), 47 | ), 48 | None, 49 | new File("/tmp/greenish/stdout"), 50 | 30, 51 | "127.0.0.1", 52 | 8080, 53 | ) 54 | 55 | val expected = Set("job_1", "job_2") 56 | 57 | val actual = App.getPrometheusIds(config) 58 | 59 | actual shouldBe expected 60 | } 61 | 62 | "work when there are no duplicate IDs" in { 63 | val config = new AppConfig( 64 | Seq( 65 | Group(0, "Group1", Seq( 66 | Job(0, "Job1", None, "job_1", "/tmp/first_script", 67 | "yyyy-MM-dd-HH", Hourly, 3, 68 | ZoneId.of("UTC"), 24, 0, 69 | AlertLevels(0, 1, 2, 3), 70 | None, 71 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 72 | ), 73 | Job(1, "Job2", None, "job_2", "/tmp/second_script job2", 74 | "yyyy-MM-dd-HH", Daily, 2, 75 | ZoneId.of("UTC"), 24, 0, 76 | AlertLevels(0, 1, 2, 3), 77 | None, 78 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 79 | ), 80 | )), 81 | Group(1, "Group2", Seq( 82 | Job(0, "Job3", None, "job_3", "/tmp/third_script", 83 | "yyyy-MM-dd", Monthly, 1, 84 | ZoneId.of("UTC"), 3, 0, 85 | AlertLevels(0, 1, 2, 3), 86 | None, 87 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 88 | ), 89 | Job(1, "Job4", None, "job_4", "/tmp/fourth_script", 90 | "yyyy-01-01", Annually, 1, 91 | ZoneId.of("UTC"), 3, 0, 92 | AlertLevels(0, 1, 2, 3), 93 | None, 94 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")), 95 | ), 96 | )), 97 | ), 98 | None, 99 | new File("/tmp/greenish/stdout"), 100 | 30, 101 | "127.0.0.1", 102 | 8080, 103 | ) 104 | 105 | val expected = Set("job_1", "job_2", "job_3", "job_4") 106 | 107 | val actual = App.getPrometheusIds(config) 108 | 109 | actual shouldBe expected 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/scala/checker/CommandRunner.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.checker 2 | 3 | import 
me.amanj.greenish.stats._ 4 | import me.amanj.greenish.models._ 5 | import java.time.ZonedDateTime 6 | import java.io.{File, PrintWriter} 7 | import java.nio.file.{Files, StandardCopyOption} 8 | import scala.sys.process.Process 9 | import scala.util.control.NonFatal 10 | import akka.actor.{Actor, ActorRef, ActorLogging} 11 | 12 | class CommandRunner(statsActor: ActorRef, 13 | scratchDir: File) extends Actor with ActorLogging { 14 | override def receive: Receive = { 15 | case BatchRun(cmd, periods, env, group, job, 16 | prometheusId, clockCounter, expireAt) => 17 | val startTimeLong = System.currentTimeMillis 18 | if(startTimeLong <= expireAt) { 19 | statsActor ! IncRefresh(prometheusId) 20 | val startTime = startTimeLong.toDouble 21 | try { 22 | run(cmd, periods, env, group, job, prometheusId, clockCounter) 23 | } catch { 24 | case NonFatal(exp) => 25 | log.error(exp.getMessage()) 26 | statsActor ! IncBadRefresh(prometheusId) 27 | } finally { 28 | statsActor ! DecRefresh(prometheusId) 29 | val endTime = System.currentTimeMillis.toDouble 30 | statsActor ! RefreshTime(prometheusId, 31 | (endTime - startTime) / 1000) 32 | } 33 | } else { 34 | statsActor ! IncExpiredRefresh(prometheusId) 35 | } 36 | } 37 | 38 | private[this] def run( 39 | cmd: String, 40 | periods: Seq[String], 41 | env: Seq[(String, String)], 42 | group: Int, 43 | job: Int, 44 | prometheusId: String, 45 | clockCounter: Long): Unit = { 46 | val exec = Seq("bash", "-c", CommandRunner.toBashCommand(cmd, periods)) 47 | val output = Process(exec, None, env:_*).lazyLines 48 | CommandRunner.write(debugFile(scratchDir, group, job), output) 49 | val capturedOutput = CommandRunner.parseOutput(output, periods.toSet) 50 | val distinctReturnedPeriods = capturedOutput.map(_._1).distinct 51 | if(capturedOutput.length < periods.size) { 52 | log.error(s"""|Some periods weren't returned for: 53 | |Group ID: $group, Job ID: $job 54 | |$cmd $periods 55 | |state update aborted""".stripMargin) 56 | statsActor ! IncBadRefresh(prometheusId) 57 | } else if(distinctReturnedPeriods.length != capturedOutput.size) { 58 | log.error(s"""|Some periods were returned more than once for: 59 | |Group ID: $group, Job ID: $job 60 | |$cmd $periods 61 | |state update aborted""" 62 | .stripMargin) 63 | statsActor ! IncBadRefresh(prometheusId) 64 | } else { 65 | val mapped = capturedOutput.toMap 66 | val periodHealths = periods.map { 67 | period => PeriodHealth(period, mapped(period)) } 68 | context.sender ! RunResult(periodHealths, group, job, clockCounter) 69 | statsActor ! MissingPeriods(prometheusId, periodHealths.count(!_.ok)) 70 | val oldestMissingPeriod = computeOldest(periodHealths) 71 | statsActor ! OldestMissingPeriod(prometheusId, oldestMissingPeriod) 72 | } 73 | } 74 | } 75 | 76 | object CommandRunner { 77 | private[this] val Matcher = "^greenish-period\t(.*)\t(1|0)$".r 78 | 79 | protected[checker] def write(file: String, 80 | lines: LazyList[String]): Unit = { 81 | val tmp = new File(s"$file.tmp") 82 | val pw = new PrintWriter(tmp) 83 | lines.foreach(pw.println) 84 | pw.close 85 | // FIXME: There is a slight chance of race, but do we care?
86 | Files.move(tmp.toPath, new File(file).toPath, 87 | StandardCopyOption.ATOMIC_MOVE) 88 | } 89 | 90 | protected[checker] def parseOutput(lines: LazyList[String], 91 | periods: Set[String]): Seq[(String, Boolean)] = 92 | lines.map { line => 93 | line match { 94 | case Matcher(period, "1") => Some((period, true)) 95 | case Matcher(period, "0") => Some((period, false)) 96 | case _ => None 97 | } 98 | }.collect { case Some(periodStatus) => periodStatus } 99 | .filter { case (period, _) => periods.contains(period) } 100 | .toList 101 | 102 | protected[checker] def toBashCommand(command: String, periods: Seq[String]): String = 103 | s"$command ${periods.map(p => s"'$p'").mkString(" ")}" 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/stats/StatsCollector.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.stats 2 | 3 | import akka.actor.Actor 4 | import akka.actor.ActorLogging 5 | import io.prometheus.client.{Counter, Gauge, Histogram, CollectorRegistry} 6 | import io.prometheus.client.Collector.MetricFamilySamples 7 | import io.prometheus.client.exporter.common.TextFormat 8 | import java.util.Enumeration 9 | import java.io.{StringWriter, Writer} 10 | import akka.http.scaladsl.model.{MediaType, HttpCharsets, HttpEntity} 11 | import akka.http.scaladsl.marshalling.{ToEntityMarshaller, Marshaller} 12 | 13 | class StatsCollector(jobIDs: Set[String], 14 | registry: CollectorRegistry = new CollectorRegistry()) extends Actor with ActorLogging { 15 | 16 | // Job related metrics 17 | private[this] val refreshGauge = Gauge.build() 18 | .name("greenish_active_refresh_tasks") 19 | .help("Current number of active state refresh tasks") 20 | .labelNames("job_id") 21 | .register(registry) 22 | 23 | private[this] val refreshTime = Histogram.build() 24 | .name("greenish_state_refresh_time_seconds") 25 | .help("Job state refreshing time") 26 | .labelNames("job_id") 27 | .buckets(StatsCollector.HistogramTimeBuckets:_*) 28 | .register(registry) 29 | 30 | private[this] val refreshCounter = Counter.build() 31 | .name("greenish_state_refresh_total") 32 | .help("Total number of job state refresh instances") 33 | .labelNames("job_id") 34 | .register(registry) 35 | 36 | private[this] val badRefreshCounter = Counter.build() 37 | .name("greenish_state_refresh_failed_total") 38 | .help("Total number of failed job state refresh instances") 39 | .labelNames("job_id") 40 | .register(registry) 41 | 42 | private[this] val expiredRefreshCounter = Counter.build() 43 | .name("greenish_state_refresh_expired_total") 44 | .help("Total number of expired job state refresh instances") 45 | .labelNames("job_id") 46 | .register(registry) 47 | 48 | private[this] val missingPeriods = Gauge.build() 49 | .name("greenish_missing_periods_total") 50 | .help("Current number of missing dataset periods") 51 | .labelNames("job_id") 52 | .register(registry) 53 | 54 | private[this] val oldestMissingPeriod = Gauge.build() 55 | .name("greenish_oldest_missing_period") 56 | .help("The oldest missing period") 57 | .labelNames("job_id") 58 | .register(registry) 59 | 60 | init() 61 | 62 | private[this] def init(): Unit = { 63 | jobIDs.foreach { jobId => 64 | refreshGauge.labels(jobId) 65 | refreshTime.labels(jobId) 66 | refreshCounter.labels(jobId) 67 | expiredRefreshCounter.labels(jobId) 68 | badRefreshCounter.labels(jobId) 69 | missingPeriods.labels(jobId) 70 | oldestMissingPeriod.labels(jobId) 71 | } 72 | } 73 | 74 | override def
receive: Receive = { 75 | case RefreshTime(jobId, time) => 76 | refreshTime.labels(jobId).observe(time) 77 | case IncRefresh(jobId) => 78 | refreshCounter.labels(jobId).inc() 79 | refreshGauge.labels(jobId).inc() 80 | case DecRefresh(jobId) => 81 | refreshGauge.labels(jobId).dec() 82 | case IncBadRefresh(jobId) => 83 | badRefreshCounter.labels(jobId).inc() 84 | case MissingPeriods(jobId, count) => 85 | missingPeriods.labels(jobId).set(count) 86 | case OldestMissingPeriod(jobId, count) => 87 | oldestMissingPeriod.labels(jobId).set(count) 88 | case IncExpiredRefresh(jobId) => 89 | refreshCounter.labels(jobId).inc() 90 | expiredRefreshCounter.labels(jobId).inc() 91 | case GetPrometheus => 92 | import StatsCollector.{fromRegistry, toPrometheusTextFormat} 93 | val metrics = fromRegistry(registry) 94 | context.sender ! metrics 95 | } 96 | } 97 | 98 | object StatsCollector { 99 | case class MetricsEntity(samples: Enumeration[MetricFamilySamples]) 100 | 101 | private [StatsCollector] val HistogramTimeBuckets = 102 | Seq( 103 | 0.1, 0.3, 0.5, 0.8, 1, 1.3, 1.5, 1.8, 2, 2.5, 3, 3.5, 4, 4.5) 104 | 105 | private[this] val mediaTypeParams = Map("version" -> "0.0.4") 106 | private[this] val mediaType = MediaType.customWithFixedCharset( 107 | "text", "plain", HttpCharsets.`UTF-8`, params = mediaTypeParams) 108 | 109 | private[stats] def fromRegistry( 110 | collectorRegistry: CollectorRegistry): MetricsEntity = { 111 | MetricsEntity(collectorRegistry.metricFamilySamples()) 112 | } 113 | 114 | private[stats] def toPrometheusTextFormat(e: MetricsEntity): String = { 115 | val writer: Writer = new StringWriter() 116 | TextFormat.write004(writer, e.samples) 117 | 118 | writer.toString 119 | } 120 | 121 | implicit val metricsMarshaller: ToEntityMarshaller[MetricsEntity] = { 122 | Marshaller.withFixedContentType(mediaType) { s => 123 | HttpEntity(mediaType, toPrometheusTextFormat(s)) 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/main/scala/checker/StatusChecker.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.checker 2 | 3 | import me.amanj.greenish.stats.StatsCollector 4 | import me.amanj.greenish.models._ 5 | import java.time.ZonedDateTime 6 | import java.io.File 7 | import akka.actor.{Actor, Props, ActorRef, ActorLogging} 8 | import scala.sys.process.Process 9 | import scala.concurrent.{Future} 10 | import scala.util.{Success, Failure} 11 | import akka.routing.{ActorRefRoutee, RoundRobinRoutingLogic, Router} 12 | import akka.pattern.pipe 13 | import scala.annotation.tailrec 14 | 15 | trait StatusCheckerApi { 16 | protected[this] var state: IndexedSeq[GroupStatus] 17 | 18 | protected[checker] def getMissing(): IndexedSeq[GroupStatus] = { 19 | state 20 | .map { group => 21 | val newJobs: Array[JobStatus] = group.status.map { job => 22 | job.copy(periodHealth = job.periodHealth.filterNot(_.ok)) 23 | }.filterNot(_.periodHealth.isEmpty) 24 | .toArray 25 | 26 | group.copy(status = newJobs) 27 | }.filterNot(_.status.isEmpty) 28 | } 29 | 30 | protected[checker] def maxLag(): Lag = { 31 | if(state.isEmpty) Lag(0) 32 | else { 33 | val lag = state.map { group => 34 | group.status.map(_.countMissing).max 35 | }.max 36 | Lag(lag) 37 | } 38 | } 39 | 40 | protected[checker] def allEntries(): IndexedSeq[GroupStatus] = state 41 | 42 | protected[checker] def summary(): Seq[GroupStatusSummary] = 43 | state.map { group => 44 | val status = group.status.map { status => 45 | val missing = 
status.countMissing 46 | val alertLevel: AlertLevel = 47 | if(missing <= status.job.alertLevels.great) Great 48 | else if(missing <= status.job.alertLevels.normal) Normal 49 | else if(missing <= status.job.alertLevels.warn) Warn 50 | else Critical 51 | 52 | val oldestMissingPeriod = computeOldest(status.periodHealth) 53 | JobStatusSummary(status.job.jobId, status.job.name, missing, oldestMissingPeriod, alertLevel) 54 | }.toSeq 55 | GroupStatusSummary(group.group.groupId, group.group.name, status) 56 | } 57 | 58 | protected[checker] def getGroupStatus(groupId: Int): Option[GroupStatus] = 59 | state.lift(groupId) 60 | 61 | protected[checker] def getJobStatus(groupId: Int, jobId: Int): Option[JobStatus] = 62 | for { 63 | group <- state.lift(groupId) 64 | job <- group.status.lift(jobId) 65 | } yield job 66 | } 67 | 68 | class StatusChecker(groups: Seq[Group], 69 | statsActor: ActorRef, 70 | refreshValidityInSeconds: Long, 71 | scratchDir: File, 72 | clockCounter: () => Long = () => System.currentTimeMillis()) 73 | extends Actor with ActorLogging with StatusCheckerApi { 74 | override protected[this] var state = StatusChecker.initState(groups) 75 | 76 | import context.dispatcher 77 | 78 | private[this] val parallelism: Int = groups.map(_.jobs.length).sum 79 | 80 | private[this] val router = { 81 | val routees = (0 until parallelism) map { _ => 82 | val runner = context.actorOf( 83 | Props(new CommandRunner(statsActor, scratchDir)) 84 | .withDispatcher("akka.refresh-dispatcher")) 85 | context watch runner 86 | ActorRefRoutee(runner) 87 | } 88 | 89 | Router(RoundRobinRoutingLogic(), routees) 90 | } 91 | 92 | private[this] def refresh(now: ZonedDateTime, group: Group, job: Job): Unit = { 93 | val periods = StatusChecker.periods(job, now) 94 | 95 | val currentClockCounter = clockCounter() 96 | val expireAt = currentClockCounter + 1000 * 97 | refreshValidityInSeconds 98 | self ! BatchRun(job.cmd, periods, job.env.map(_.tupled), 99 | group.groupId, job.jobId, job.prometheusId, 100 | currentClockCounter, expireAt) 101 | } 102 | 103 | private[this] def refresh(now: ZonedDateTime, group: Group): Unit = { 104 | group.jobs.foreach { job => refresh(now, group, job) } 105 | } 106 | 107 | private[this] def refresh(now: ZonedDateTime): Unit = { 108 | 109 | groups.foreach { group => 110 | refresh(now, group) 111 | } 112 | } 113 | 114 | override def receive: Receive = { 115 | case Refresh(now) => 116 | refresh(now()) 117 | case RefreshGroup(now, groupId) => 118 | groups.find(_.groupId == groupId) match { 119 | case Some(group) => 120 | refresh(now(), group) 121 | context.sender ! true 122 | case None => 123 | context.sender ! false 124 | } 125 | case RefreshJob(now, groupId, jobId) => 126 | val result = for { 127 | group <- groups.find(_.groupId == groupId) 128 | job <- group.jobs.lift(jobId) 129 | } yield { 130 | refresh(now(), group, job) 131 | } 132 | context.sender ! result.isDefined 133 | case RunResult(periodHealth, groupId, jobId, clockCounter) => 134 | val bucket = state(groupId) 135 | val currentStatus = bucket.status(jobId) 136 | if(currentStatus.updatedAt < clockCounter) { 137 | bucket.status(jobId) = currentStatus.copy(updatedAt = clockCounter, 138 | periodHealth = periodHealth) 139 | } 140 | case GetMissing => context.sender ! getMissing() 141 | case MaxLag => context.sender ! maxLag() 142 | case AllEntries => context.sender ! allEntries() 143 | case Summary => context.sender ! summary() 144 | case GetGroupStatus(id) => 145 | context.sender ! 
getGroupStatus(id) 146 | case GetJobStatus(gid, jid) => 147 | context.sender ! getJobStatus(gid, jid) 148 | case run: BatchRun => 149 | router.route(run, context.sender) 150 | } 151 | } 152 | 153 | object StatusChecker { 154 | private[checker] def initState(groups: Seq[Group]): IndexedSeq[GroupStatus] = { 155 | groups.map { group => 156 | val jobStatus = group.jobs.map { job => 157 | JobStatus(job, -1, Seq.empty) 158 | } 159 | GroupStatus(group, jobStatus.toArray) 160 | }.toIndexedSeq 161 | } 162 | 163 | private[checker] def periods(entry: Job, now: ZonedDateTime): Seq[String] = { 164 | @tailrec def loop(time: ZonedDateTime, count: Int, acc: Seq[String]): Seq[String] = { 165 | if(time.toEpochSecond < entry.startAt || count == 0) acc.reverse 166 | else 167 | loop(entry.frequency.prev(time), count - 1, 168 | acc :+ time.format(entry.timeFormat)) 169 | } 170 | 171 | loop(nowMinusOffset(entry, now), 172 | entry.lookback, Vector.empty[String]) 173 | } 174 | 175 | private[checker] def nowMinusOffset(entry: Job, 176 | now: ZonedDateTime): ZonedDateTime = 177 | if(entry.periodCheckOffset == 0) 178 | now.withZoneSameInstant(entry.timezone) 179 | else 180 | (1 to entry.periodCheckOffset) 181 | .foldLeft(now.withZoneSameInstant(entry.timezone))( 182 | (acc, next) => entry.frequency.prev(acc)) 183 | } 184 | -------------------------------------------------------------------------------- /src/main/scala/endpoints/Routes.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.endpoints 2 | 3 | import java.time.ZonedDateTime 4 | import java.io.File 5 | import akka.actor.ActorRef 6 | import akka.util.Timeout 7 | import akka.pattern.ask 8 | import akka.http.scaladsl.model.{StatusCodes, ContentTypes} 9 | import akka.http.scaladsl.server.Directive 10 | import akka.http.scaladsl.server.Directives._ 11 | import scala.concurrent.duration.Duration 12 | import io.circe.syntax._ 13 | import io.circe.Printer 14 | import me.amanj.greenish.models._ 15 | import me.amanj.greenish.stats._ 16 | import me.amanj.greenish.checker._ 17 | import akka.http.scaladsl.model.HttpResponse 18 | import scala.util.Success 19 | 20 | class Routes(namespace: Option[String], 21 | scratchDir: File, 22 | statusChecker: ActorRef, 23 | statsActor: ActorRef, 24 | goodRefreshRecency: Long, 25 | now: () => ZonedDateTime = () => ZonedDateTime.now) { 26 | private[this] implicit val timeout = Timeout(Duration.fromNanos(5000000L)) 27 | private[this] val jsonPrinter = Printer ( 28 | dropNullValues = true, 29 | indent="" 30 | ) 31 | 32 | private[this] val maxlag = get { 33 | path("maxlag") { 34 | val lagFuture = ( 35 | statusChecker ? MaxLag 36 | ).mapTo[Lag] 37 | onComplete(lagFuture) { lag => 38 | complete(lag.map(o => jsonPrinter.print(o.asJson))) 39 | } 40 | } 41 | } 42 | 43 | private[this] val summary = get { 44 | path("summary") { 45 | val lagFuture = ( 46 | statusChecker ? Summary 47 | ).mapTo[Seq[GroupStatusSummary]] 48 | onComplete(lagFuture) { lag => 49 | complete(lag.map(o => jsonPrinter.print(o.asJson))) 50 | } 51 | } 52 | } 53 | 54 | private[this] val missing = get { 55 | path("missing") { 56 | val missingFuture = ( 57 | statusChecker ? GetMissing 58 | ).mapTo[Seq[GroupStatus]] 59 | onComplete(missingFuture) { missing => 60 | complete(missing.map(o => jsonPrinter.print(o.asJson))) 61 | } 62 | } 63 | } 64 | 65 | private[this] val state = get { 66 | path("state") { 67 | val allFuture = ( 68 | statusChecker ? 
AllEntries 69 | ).mapTo[Seq[GroupStatus]] 70 | onComplete(allFuture) { completed => 71 | complete(completed.map(o => jsonPrinter.print(o.asJson))) 72 | } 73 | } 74 | } 75 | 76 | private[this] val getGroup = get { 77 | path("group" / IntNumber) { id => 78 | val groupFuture = ( 79 | statusChecker ? GetGroupStatus(id) 80 | ).mapTo[Option[GroupStatus]] 81 | 82 | onComplete(groupFuture) { 83 | case Success(Some(group)) => 84 | complete(jsonPrinter.print(group.asJson)) 85 | case _ => 86 | val error = jsonPrinter.print(errorJson("Group id does not exist")) 87 | complete(HttpResponse(StatusCodes.BadRequest, entity = error)) 88 | } 89 | } 90 | } 91 | 92 | private[this] val getJob = get { 93 | path("group" / IntNumber / "job" / IntNumber) { 94 | (gid, jid) => 95 | val jobFuture = ( 96 | statusChecker ? GetJobStatus(gid, jid) 97 | ).mapTo[Option[JobStatus]] 98 | onComplete(jobFuture) { 99 | case Success(Some(job)) => 100 | complete(jsonPrinter.print(job.asJson)) 101 | case _ => 102 | val error = jsonPrinter 103 | .print(errorJson("Group id and/or job id does not exist")) 104 | complete(HttpResponse(StatusCodes.BadRequest, entity = error)) 105 | } 106 | } 107 | } 108 | 109 | private[this] val getJobOutput = get { 110 | path("group" / IntNumber / "job" / IntNumber / "stdout") { 111 | (gid, jid) => 112 | getFromFile(new File(debugFile(scratchDir, gid, jid)), 113 | ContentTypes.`text/plain(UTF-8)`) 114 | } 115 | } 116 | 117 | private[this] val refreshState = get { 118 | path("state" / "refresh") { 119 | statusChecker ! Refresh(now) 120 | complete(jsonPrinter.print(okJson("State refresh is scheduled"))) 121 | } 122 | } 123 | 124 | private[this] val refreshGroup = get { 125 | path("group" / IntNumber / "refresh") { id => 126 | val statusFuture = ( 127 | statusChecker ? RefreshGroup(now, id) 128 | ).mapTo[Boolean] 129 | 130 | onComplete(statusFuture) { 131 | case Success(true) => 132 | complete(jsonPrinter.print(okJson("Group status refresh is scheduled"))) 133 | case _ => 134 | val error = jsonPrinter.print(errorJson("Group id does not exist")) 135 | complete(HttpResponse(StatusCodes.BadRequest, entity = error)) 136 | } 137 | } 138 | } 139 | 140 | private[this] val refreshJob = get { 141 | path("group" / IntNumber / "job" / IntNumber / "refresh") { 142 | (gid, jid) => 143 | val statusFuture = ( 144 | statusChecker ? RefreshJob(now, gid, jid) 145 | ).mapTo[Boolean] 146 | onComplete(statusFuture) { 147 | case Success(true) => 148 | complete(jsonPrinter.print(okJson("Job status refresh is scheduled"))) 149 | case _ => 150 | val error = jsonPrinter 151 | .print(errorJson("Group id and/or job id does not exist")) 152 | complete(HttpResponse(StatusCodes.BadRequest, entity = error)) 153 | } 154 | } 155 | } 156 | 157 | private[this] val dashboard = 158 | (get & pathPrefix("dashboard")) { 159 | (pathEndOrSingleSlash & 160 | redirectToTrailingSlashIfMissing(StatusCodes.TemporaryRedirect)) { 161 | getFromResource("dashboard/index.html") 162 | } ~ { 163 | getFromResourceDirectory("dashboard") 164 | } 165 | } 166 | 167 | private[this] val system = get { 168 | path("system") { 169 | val json = jsonPrinter.print(sysinfo(namespace)) 170 | complete(json) 171 | } 172 | } 173 | 174 | private[this] val prometheus = get { 175 | path("prometheus") { 176 | val statsFuture = 177 | (statsActor ? 
GetPrometheus) 178 | .mapTo[StatsCollector.MetricsEntity] 179 | onComplete(statsFuture) { entity => 180 | complete(entity) 181 | } 182 | } 183 | } 184 | 185 | private[this] val health = get { 186 | path("health") { 187 | val entriesFuture = (statusChecker ? AllEntries) 188 | .mapTo[Seq[GroupStatus]] 189 | 190 | onComplete(entriesFuture) { entity => 191 | val health = entity.map( groups => 192 | Routes.isHealthy(groups, goodRefreshRecency)).getOrElse(false) 193 | val json = jsonPrinter.print(healthJson(health)) 194 | complete(json) 195 | } 196 | } 197 | } 198 | 199 | val routes = 200 | getJob ~ getJobOutput ~ getGroup ~ refreshState ~ refreshGroup ~ 201 | refreshJob ~ maxlag ~ summary ~ missing ~ state ~ dashboard ~ 202 | system ~ prometheus ~ health 203 | } 204 | 205 | object Routes { 206 | private[endpoints] def isHealthy(groups: Seq[GroupStatus], 207 | recency: Long): Boolean = { 208 | val now = System.currentTimeMillis 209 | groups.map { group => 210 | group.status.filterNot { job => 211 | (now - job.updatedAt) > recency || job.periodHealth.isEmpty 212 | }.length 213 | }.exists(_ > 0) 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/test/scala/stats/StatsCollectorSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.stats 2 | 3 | import akka.actor.{ActorSystem, Props} 4 | import akka.testkit.{ ImplicitSender, TestKit } 5 | import org.scalatest.BeforeAndAfterAll 6 | import org.scalatest.matchers.should.Matchers 7 | import org.scalatest.wordspec.AnyWordSpecLike 8 | import org.scalatest.concurrent.Eventually 9 | import java.time.{ZoneId, ZonedDateTime} 10 | import scala.concurrent.duration._ 11 | import scala.language.postfixOps 12 | import me.amanj.greenish.models._ 13 | import io.prometheus.client.Collector.MetricFamilySamples 14 | import java.io.File 15 | import scala.jdk.CollectionConverters._ 16 | 17 | class StatsCollectorSpec() 18 | extends TestKit(ActorSystem("StatsCollectorSpec")) 19 | with ImplicitSender 20 | with AnyWordSpecLike 21 | with Matchers 22 | with BeforeAndAfterAll 23 | with Eventually { 24 | 25 | import StatsCollectorSpec._ 26 | 27 | "StatsCollector" must { 28 | "initialize labels upon instantiation" in { 29 | val jobs = Set("p1", "p2") 30 | val stats = system.actorOf( 31 | Props(new StatsCollector(jobs))) 32 | 33 | stats ! GetPrometheus 34 | 35 | val received = receiveOne(2 seconds) 36 | assert(received.isInstanceOf[StatsCollector.MetricsEntity]) 37 | 38 | val prometheus = received 39 | .asInstanceOf[StatsCollector.MetricsEntity] 40 | .samples 41 | .asScala 42 | .toList 43 | 44 | prometheus.isEmpty shouldBe false 45 | prometheus.foreach { prom => 46 | val labels = prom.samples.asScala 47 | .flatMap(_.labelValues.asScala) 48 | .filter(jobs.contains(_)) 49 | .toSet 50 | labels shouldBe jobs 51 | } 52 | } 53 | 54 | "properly handle IncExpiredRefresh message" in { 55 | val jobs = Set("p1", "p2") 56 | val stats = system.actorOf( 57 | Props(new StatsCollector(jobs))) 58 | 59 | stats ! IncExpiredRefresh("p2") 60 | stats ! 
GetPrometheus 61 | 62 | val expected = Seq( 63 | (Seq("p1"), 0.0), 64 | (Seq("p2"), 1.0), 65 | ) 66 | 67 | val prom = receiveOne(2 seconds) 68 | .asInstanceOf[StatsCollector.MetricsEntity] 69 | .samples.asScala.toList 70 | 71 | checkSamples(prom, "greenish_state_refresh_total", expected) 72 | checkSamples(prom, "greenish_state_refresh_expired_total", expected) 73 | } 74 | 75 | "properly handle IncRefresh message" in { 76 | val jobs = Set("p1", "p2") 77 | val stats = system.actorOf( 78 | Props(new StatsCollector(jobs))) 79 | 80 | stats ! IncRefresh("p2") 81 | stats ! GetPrometheus 82 | 83 | val expected = Seq( 84 | (Seq("p1"), 0.0), 85 | (Seq("p2"), 1.0), 86 | ) 87 | 88 | val prom = receiveOne(2 seconds) 89 | .asInstanceOf[StatsCollector.MetricsEntity] 90 | .samples.asScala.toList 91 | 92 | checkSamples(prom, "greenish_state_refresh_total", expected) 93 | checkSamples(prom, "greenish_active_refresh_tasks", expected) 94 | } 95 | 96 | "properly handle DecRefresh message" in { 97 | val jobs = Set("p1", "p2") 98 | val stats = system.actorOf( 99 | Props(new StatsCollector(jobs))) 100 | 101 | stats ! IncRefresh("p1") 102 | stats ! DecRefresh("p1") 103 | stats ! GetPrometheus 104 | 105 | val expectedTotal = Seq( 106 | (Seq("p1"), 1.0), 107 | (Seq("p2"), 0.0), 108 | ) 109 | 110 | val expectedActive = Seq( 111 | (Seq("p1"), 0.0), 112 | (Seq("p2"), 0.0), 113 | ) 114 | 115 | val prom = receiveOne(2 seconds) 116 | .asInstanceOf[StatsCollector.MetricsEntity] 117 | .samples.asScala.toList 118 | 119 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 120 | checkSamples(prom, "greenish_active_refresh_tasks", expectedActive) 121 | } 122 | 123 | "properly handle IncBadRefresh message" in { 124 | val jobs = Set("p1", "p2") 125 | val stats = system.actorOf( 126 | Props(new StatsCollector(jobs))) 127 | 128 | stats ! IncBadRefresh("p1") 129 | stats ! GetPrometheus 130 | 131 | val expected = Seq( 132 | (Seq("p1"), 1.0), 133 | (Seq("p2"), 0.0), 134 | ) 135 | 136 | val prom = receiveOne(2 seconds) 137 | .asInstanceOf[StatsCollector.MetricsEntity] 138 | .samples.asScala.toList 139 | 140 | checkSamples(prom, "greenish_state_refresh_failed_total", expected) 141 | } 142 | 143 | "properly handle OldestMissingPeriod message" in { 144 | val jobs = Set("p1", "p2") 145 | val stats = system.actorOf( 146 | Props(new StatsCollector(jobs))) 147 | 148 | stats ! OldestMissingPeriod("p1", 3) 149 | stats ! GetPrometheus 150 | 151 | val expected = Seq( 152 | (Seq("p1"), 3.0), 153 | (Seq("p2"), 0.0), 154 | ) 155 | 156 | val prom = receiveOne(2 seconds) 157 | .asInstanceOf[StatsCollector.MetricsEntity] 158 | .samples.asScala.toList 159 | 160 | checkSamples(prom, "greenish_oldest_missing_period", expected) 161 | } 162 | 163 | "properly handle MissingPeriods message" in { 164 | val jobs = Set("p1", "p2") 165 | val stats = system.actorOf( 166 | Props(new StatsCollector(jobs))) 167 | 168 | stats ! MissingPeriods("p1", 3) 169 | stats ! GetPrometheus 170 | 171 | val expected = Seq( 172 | (Seq("p1"), 3.0), 173 | (Seq("p2"), 0.0), 174 | ) 175 | 176 | val prom = receiveOne(2 seconds) 177 | .asInstanceOf[StatsCollector.MetricsEntity] 178 | .samples.asScala.toList 179 | 180 | checkSamples(prom, "greenish_missing_periods_total", expected) 181 | } 182 | 183 | "properly handle RefreshTime message" in { 184 | val jobs = Set("p1", "p2") 185 | val stats = system.actorOf( 186 | Props(new StatsCollector(jobs))) 187 | 188 | stats ! RefreshTime("p1", 3) 189 | stats ! 
GetPrometheus 190 | 191 | val expected = Set("p1") 192 | 193 | val prom = receiveOne(2 seconds) 194 | .asInstanceOf[StatsCollector.MetricsEntity] 195 | .samples.asScala.toList 196 | 197 | val actual = 198 | getNoneZeroHistogramLabels(prom, 199 | "greenish_state_refresh_time_seconds") 200 | actual shouldBe expected 201 | } 202 | } 203 | } 204 | 205 | object StatsCollectorSpec extends Matchers { 206 | def getNoneZeroHistogramLabels( 207 | prom: List[MetricFamilySamples], 208 | name: String): Set[String] = 209 | prom.filter { prom => 210 | prom.name == name 211 | }.flatMap { metric => 212 | metric.samples.asScala 213 | .map(sample => (sample.labelValues.asScala, sample.value)) 214 | }.filter { case (seq, num) => 215 | // Only keep what is set 216 | num != 0 217 | }.map { case (seq, num) => seq.head } 218 | .toSet 219 | 220 | 221 | def checkSamples( 222 | prom: List[MetricFamilySamples], 223 | name: String, 224 | expected: Seq[(Seq[String], Double)]): Unit = { 225 | 226 | val actual = prom 227 | .filter { prom => 228 | prom.name == name 229 | }.flatMap { metric => 230 | metric.samples.asScala 231 | .map(sample => (sample.labelValues.asScala, sample.value)) 232 | } 233 | 234 | actual shouldBe expected 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/main/scala/AppConfig.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import com.typesafe.config.{Config, ConfigFactory} 4 | import java.time.ZoneId 5 | import java.io.File 6 | import scala.util.Try 7 | import models._ 8 | import scala.jdk.CollectionConverters._ 9 | 10 | case class AppConfig(groups: Seq[Group], namespace: Option[String], 11 | scratchDir: File, refreshInSeconds: Int, 12 | address: String, port: Int, 13 | ) 14 | object AppConfig { 15 | def apply(): AppConfig = { 16 | val config = ConfigFactory.load() 17 | val appConfig = config.getConfig("check-groups") 18 | val refreshRate = appConfig.getInt("refresh-in-seconds") 19 | val port = appConfig.getInt("port") 20 | val namespace = Try(appConfig.getString("namespace")).toOption 21 | val scratchDir = new File(appConfig.getString("scratch-dir")) 22 | scratchDir.delete 23 | val address = appConfig.getString("binding-address") 24 | new AppConfig(readEntries(appConfig), namespace, scratchDir, 25 | refreshRate, address, port) 26 | } 27 | 28 | private[this] def readEntries(config: Config): Seq[Group] = { 29 | val defaultOwner = config.getOptionStringWithDefault("default-owner", None) 30 | val defaultPeriodCheckOffset = config.getInt("default-period-check-offset") 31 | val defaultTimePattern = config.getString("default-period-pattern") 32 | val defaultFrequency = config.getString("default-job-run-frequency") 33 | val defaultTimezone = config.getString("default-timezone") 34 | val defaultLookback = config.getInt("default-lookback") 35 | val defaultGreatAt = config.getInt("default-great-at") 36 | val defaultNormalAt = config.getInt("default-normal-at") 37 | val defaultWarnAt = config.getInt("default-warn-at") 38 | val defaultErrorAt = config.getInt("default-error-at") 39 | val defaultStartAt = config.getLong("default-start-at") 40 | val defaultInfo = config.getOptionStringWithDefault("default-info", None) 41 | val globalEnv = config.getEnv("env", Seq.empty) 42 | 43 | config.getConfigList("groups").asScala.zipWithIndex.map { case (groupConfig, index) => 44 | val groupName = groupConfig.getString("group-name") 45 | val groupOwner = 
groupConfig.getOptionStringWithDefault("group-owner", defaultOwner) 46 | val groupPeriodCheckOffset = 47 | groupConfig.getIntWithDefault("group-period-check-offset", defaultPeriodCheckOffset) 48 | val groupTimePattern = groupConfig.getStringWithDefault( 49 | "group-period-pattern", defaultTimePattern) 50 | val groupFrequency = groupConfig.getStringWithDefault( 51 | "group-job-run-frequency", defaultFrequency) 52 | val groupTimezone = groupConfig.getStringWithDefault( 53 | "group-timezone", defaultTimezone) 54 | val groupLookback = groupConfig.getIntWithDefault( 55 | "group-lookback", defaultLookback) 56 | val groupGreatAt = groupConfig.getIntWithDefault( 57 | "group-great-at", defaultGreatAt) 58 | val groupNormalAt = groupConfig.getIntWithDefault( 59 | "group-normal-at", defaultNormalAt) 60 | val groupWarnAt = groupConfig.getIntWithDefault( 61 | "group-warn-at", defaultWarnAt) 62 | val groupErrorAt = groupConfig.getIntWithDefault( 63 | "group-error-at", defaultErrorAt) 64 | val groupStartAt = groupConfig.getLongWithDefault( 65 | "group-start-at", defaultStartAt) 66 | val groupInfo = groupConfig.getOptionStringWithDefault("group-info", defaultInfo) 67 | val groupEnv = groupConfig.getEnv("env", globalEnv) 68 | 69 | val checkEntries = groupConfig.getConfigList("job-entries") 70 | .asScala.zipWithIndex.map { case (jobConfig, index) => 71 | val jobName = jobConfig.getString("job-name") 72 | val jobOwner = jobConfig.getOptionStringWithDefault("job-owner", groupOwner) 73 | val prometheusId = normalizePrometheusId( 74 | jobConfig.getStringWithDefault( 75 | "prometheus-id", s"$groupName $jobName")) 76 | val cmd = jobConfig.getString("check-command") 77 | val jobPeriodCheckOffset = jobConfig.getIntWithDefault( 78 | "job-period-check-offset", groupPeriodCheckOffset) 79 | val timePattern = jobConfig.getStringWithDefault( 80 | "period-pattern", groupTimePattern) 81 | val frequency = toFrequency( 82 | jobConfig.getStringWithDefault( 83 | "job-run-frequency", groupFrequency)) 84 | val timezone = ZoneId.of( 85 | jobConfig.getStringWithDefault("timezone", groupTimezone)) 86 | val lookback = jobConfig.getIntWithDefault( 87 | "lookback", groupLookback) 88 | val greatAt = jobConfig.getIntWithDefault( 89 | "great-at", groupGreatAt) 90 | val normalAt = jobConfig.getIntWithDefault( 91 | "normal-at", groupNormalAt) 92 | val warnAt = jobConfig.getIntWithDefault( 93 | "warn-at", groupWarnAt) 94 | val errorAt = jobConfig.getIntWithDefault( 95 | "error-at", groupErrorAt) 96 | val startAt = jobConfig.getLongWithDefault( 97 | "start-at", groupStartAt) 98 | val jobInfo = jobConfig.getOptionStringWithDefault("job-info", groupInfo) 99 | val jobEnv = jobConfig.getEnv("env", groupEnv).map { 100 | case (name, value) => EnvVar(name, value) 101 | } 102 | 103 | Job( 104 | index, 105 | jobName, 106 | jobOwner, 107 | prometheusId, 108 | cmd, 109 | timePattern, 110 | frequency, 111 | jobPeriodCheckOffset, 112 | timezone, 113 | lookback, 114 | startAt, 115 | AlertLevels(greatAt, normalAt, warnAt, errorAt), 116 | jobInfo.map(_.stripMargin), 117 | jobEnv, 118 | ) 119 | }.toSeq 120 | Group(index, groupName, checkEntries) 121 | }.toSeq 122 | } 123 | 124 | private[greenish] def normalizePrometheusId(id: String): String = { 125 | val spacelessId = id.replaceAll("(\\s|-)+","_").toLowerCase 126 | val pattern = "[a-zA-Z_][a-zA-Z0-9_]*" 127 | if(!spacelessId.matches(pattern)) { 128 | throw new Exception( 129 | s"""|$id: Invalid prometheus label ID, please provide a valid one. 
130 | |Prometheus label names should match: "$pattern"""".stripMargin) 131 | } 132 | spacelessId 133 | } 134 | 135 | private[greenish] def toFrequency(freq: String): CheckFrequency = { 136 | freq.toLowerCase match { 137 | case "hourly" => Hourly 138 | case "daily" => Daily 139 | case "monthly" => Monthly 140 | case "annually" => Annually 141 | case _ => 142 | try { 143 | Cron(freq) 144 | } catch { 145 | case e: IllegalArgumentException => 146 | throw new Exception( 147 | s"""|${e.getMessage} 148 | |$freq: unsupported frequency, supported frequencies are: 149 | |hourly, daily, monthly, annually and Unix cron syntax""" 150 | .stripMargin) 151 | } 152 | } 153 | } 154 | 155 | implicit class ConfigExt[C <: Config](self: Config) { 156 | def getStringWithDefault(path: String, default: String): String = 157 | if(self.hasPath(path)) 158 | self.getString(path) 159 | else default 160 | 161 | def getIntWithDefault(path: String, default: Int): Int = 162 | if(self.hasPath(path)) 163 | self.getInt(path) 164 | else default 165 | 166 | def getLongWithDefault(path: String, default: Long): Long = 167 | if(self.hasPath(path)) 168 | self.getLong(path) 169 | else default 170 | 171 | def getOptionStringWithDefault(path: String, 172 | default: Option[String]): Option[String] = 173 | if(self.hasPath(path)) 174 | Some(self.getString(path)) 175 | else default 176 | 177 | 178 | def getEnv(path: String, parent: Seq[(String, String)]): Seq[(String, String)] = 179 | if(self.hasPath(path)) { 180 | val localEnv = self.getConfig("env") 181 | .entrySet.asScala 182 | .map(e => (e.getKey, e.getValue.unwrapped.asInstanceOf[String])) 183 | .toMap 184 | 185 | val overriddenParent = parent.filterNot { case (k, _) => 186 | localEnv.contains(k) 187 | } 188 | 189 | (localEnv.toSeq ++ overriddenParent).sorted 190 | } else parent 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/test/scala/AppConfigSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish 2 | 3 | import org.scalatest.matchers.should.Matchers 4 | import org.scalatest.wordspec.AnyWordSpecLike 5 | import java.time.ZoneId 6 | import models._ 7 | import java.io.File 8 | 9 | class AppConfigSpec() extends Matchers 10 | with AnyWordSpecLike { 11 | 12 | "AppConfig" must { 13 | "read config file correctly" in { 14 | val actual = AppConfig() 15 | val expected = new AppConfig( 16 | Seq( 17 | Group(0, "Group1", Seq( 18 | Job(0, "Job1", Some("Data"), "job_1", "/tmp/first_script", 19 | "yyyy-MM-dd-HH", Hourly, 3, 20 | ZoneId.of("UTC"), 24, 2, 21 | AlertLevels(0, 1, 2, 3), 22 | Some("Job info"), 23 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "bazomba"), 24 | EnvVar("VAR3", "bada"), EnvVar("VAR4", "badam")), 25 | ), 26 | Job(1, "Job2", Some("Reporting"), "job_2", "/tmp/second_script job2", 27 | "yyyy-MM-dd-HH", Daily, 2, 28 | ZoneId.of("UTC"), 24, 1, 29 | AlertLevels(0, 1, 2, 3), 30 | Some("Group info"), 31 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"), 32 | EnvVar("VAR3", "bazooka")), 33 | ), 34 | Job(2, "Job5", Some("Reporting"), "group1_job5", "/tmp/second_script job5", 35 | "yyyy-MM-dd-HH", Hourly, 2, 36 | ZoneId.of("US/Alaska"), 24, 1, 37 | AlertLevels(0, 1, 2, 3), 38 | Some("Group info"), 39 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"), 40 | EnvVar("VAR3", "bazooka")), 41 | ), 42 | Job(3, "Job7", Some("Reporting"), "group1_job7", "/tmp/second_script job7", 43 | "yyyy-MM-dd-HH", Cron("0 * * * *"), 2, 44 | ZoneId.of("US/Alaska"), 24,
1, 45 | AlertLevels(0, 1, 2, 3), 46 | Some("Group info"), 47 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"), 48 | EnvVar("VAR3", "bazooka")), 49 | ), 50 | )), 51 | Group(1, "Group2", Seq( 52 | Job(0, "Job3", Some("SRE"), "job_3", "/tmp/third_script", 53 | "yyyy-MM-dd", Monthly, 1, 54 | ZoneId.of("UTC"), 3, 0, 55 | AlertLevels(0, 1, 2, 3), 56 | Some("""| 57 | |Link 58 | |""".stripMargin), 59 | Seq(EnvVar("VAR1", "foo"), EnvVar("VAR2", "secure(bar)")), 60 | ), 61 | Job(1, "Job4", Some("SRE"), "job_4", "/tmp/fourth_script", 62 | "yyyy-01-01", Annually, 1, 63 | ZoneId.of("UTC"), 3, 0, 64 | AlertLevels(0, 1, 2, 3), 65 | Some("""| 66 | |Link 67 | |""".stripMargin), 68 | Seq(EnvVar("VAR1", "foo"), EnvVar("VAR2", "secure(bar)")), 69 | ), 70 | Job(2, "Job6", Some("SRE"), "group2_job6", "/tmp/second_script job6", 71 | "yyyy-MM-dd-HH-mm", Daily, 1, 72 | ZoneId.of("US/Samoa"), 270, 0, 73 | AlertLevels(30, 40, 50, 60), 74 | Some("""| 75 | |Link 76 | |""".stripMargin), 77 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"), 78 | EnvVar("VAR3", "bazooka")), 79 | ), 80 | )), 81 | ), 82 | Some("Test dashboard"), 83 | new File("/tmp/greenish/stdout"), 84 | 30, 85 | "127.0.0.1", 86 | 8080, 87 | ) 88 | actual shouldBe expected 89 | } 90 | } 91 | 92 | "toFrequency" must { 93 | import AppConfig.toFrequency 94 | "handle Unix cron syntax" in { 95 | val patterns = Seq( 96 | "* * * * *", 97 | "1-2 * * * *", 98 | "*/5 * * * *", 99 | "0 23 * * MON-FRI", 100 | "1-5 0 * JAN-DEC 0-4", 101 | ) 102 | patterns.foreach { pattern => 103 | toFrequency(pattern) shouldBe Cron(pattern) 104 | } 105 | } 106 | 107 | "handle both lower and upper case frequencies" in { 108 | toFrequency("hOURly") shouldBe Hourly 109 | toFrequency("AnnuaLLy") shouldBe Annually 110 | toFrequency("monthly") shouldBe Monthly 111 | toFrequency("DAILY") shouldBe Daily 112 | } 113 | 114 | "throw an exception when it doesn't recognize a frequency" in { 115 | intercept[Exception](toFrequency("kkk")) 116 | intercept[Exception](toFrequency("weekly")) 117 | intercept[Exception](toFrequency("minutes")) 118 | } 119 | } 120 | 121 | "normalizePrometheusId" must { 122 | import AppConfig.normalizePrometheusId 123 | "convert prometheus_id to all lowercase" in { 124 | normalizePrometheusId("ABC") shouldBe "abc" 125 | } 126 | 127 | "replace - characters in prometheus_id with _" in { 128 | normalizePrometheusId("a---b") shouldBe "a_b" 129 | } 130 | 131 | "replace whitespace characters in prometheus_id with _" in { 132 | normalizePrometheusId("a b\nc\td\t") shouldBe "a_b_c_d_" 133 | } 134 | 135 | "throw exception when prometheus_id starts with a digit" in { 136 | intercept[Exception](normalizePrometheusId("9a b\nc\td\t")) 137 | } 138 | 139 | "throw exception when prometheus_id contains anything but [a-zA-Z0-9_]" in { 140 | intercept[Exception](normalizePrometheusId("a;a")) 141 | } 142 | 143 | "throw exception when prometheus_id is an empty string" in { 144 | intercept[Exception](normalizePrometheusId("")) 145 | } 146 | "accept valid characters at the beginning of prometheus_id" in { 147 | normalizePrometheusId("a") shouldBe "a" 148 | normalizePrometheusId("A") shouldBe "a" 149 | normalizePrometheusId("_") shouldBe "_" 150 | } 151 | } 152 | 153 | "getIntWithDefault" must { 154 | import com.typesafe.config.ConfigFactory 155 | import AppConfig._ 156 | val config = ConfigFactory.load() 157 | val appConfig = config.getConfig("check-groups") 158 | "get the value of the property if the key exists" in { 159 | val actual =
appConfig.getIntWithDefault("default-error-at", 100) 160 | val expected = 60 161 | actual shouldBe expected 162 | } 163 | 164 | "return default value if the key doesn't exist" in { 165 | val actual = appConfig.getIntWithDefault("naaah", 100) 166 | val expected = 100 167 | actual shouldBe expected 168 | } 169 | } 170 | 171 | "getStringWithDefault" must { 172 | import com.typesafe.config.ConfigFactory 173 | import AppConfig._ 174 | val config = ConfigFactory.load() 175 | val appConfig = config.getConfig("check-groups") 176 | "get the value of the property if the key exists" in { 177 | val actual = appConfig.getStringWithDefault("default-period-pattern", "kkkk") 178 | val expected = "yyyy-MM-dd-HH-mm" 179 | actual shouldBe expected 180 | } 181 | 182 | "return default value if the key doesn't exist" in { 183 | val actual = appConfig.getStringWithDefault("naaah", "kkkk") 184 | val expected = "kkkk" 185 | actual shouldBe expected 186 | } 187 | } 188 | 189 | "getEnv" must { 190 | import com.typesafe.config.ConfigFactory 191 | import AppConfig._ 192 | val config = ConfigFactory.load() 193 | val appConfig = config.getConfig("check-groups") 194 | val groupConfig = appConfig.getConfigList("groups").iterator.next() 195 | val jobConfig = groupConfig.getConfigList("job-entries").iterator.next() 196 | val appEnv = appConfig.getEnv("env", Seq.empty) 197 | 198 | "get value if parent is empty, and key exists" in { 199 | appEnv shouldBe Seq("VAR1" -> "foo", "VAR2" -> "secure(bar)") 200 | } 201 | 202 | "properly dedup parent and child lists, if key exists" in { 203 | val actualGroup = groupConfig.getEnv("env", appEnv) 204 | val expectedGroup = Seq("VAR1" -> "baz", "VAR2" -> "secure(bar)", 205 | "VAR3" -> "bazooka") 206 | 207 | actualGroup shouldBe expectedGroup 208 | 209 | val actualJob = jobConfig.getEnv("env", expectedGroup) 210 | val expectedJob = Seq("VAR1" -> "baz", "VAR2" -> "bazomba", 211 | "VAR3" -> "bada", "VAR4" -> "badam") 212 | 213 | actualJob shouldBe expectedJob 214 | } 215 | 216 | "return parent env if the key doesn't exist" in { 217 | val actual = appConfig.getEnv("naaah", appEnv) 218 | val expected = appEnv 219 | actual shouldBe expected 220 | } 221 | } 222 | } 223 | 224 | 225 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/greenish-favicon.svg: -------------------------------------------------------------------------------- 1 | Greenish Favicon -------------------------------------------------------------------------------- /doc/api.md: -------------------------------------------------------------------------------- 1 | # The API 2 | 3 | ## REST 4 | 5 | Greenish provides a few REST endpoints: 6 | 7 | ### Display the maximum number of missing datasets 8 | 9 | Basically, across all the jobs, find the job that is missing the largest 10 | number of period datasets, and return that number. 11 | 12 | ``` 13 | $ curl --silent -G http://0.0.0.0:8080/maxlag | jq . 14 | { 15 | "lag": 0 16 | } 17 | ``` 18 | 19 | ### Summary 20 | 21 | Display the summary of all the monitoring tasks. It is very good for a quick glance: 22 | 23 | ``` 24 | $ curl --silent -G http://0.0.0.0:8080/summary | jq .
25 | [ 26 | { 27 | "group_id": 0, 28 | "name": "Group1", 29 | "status": [ 30 | { 31 | "job_id": 0, 32 | "name": "Job1", 33 | "missing": 4, 34 | "oldest_missing_period": 10, 35 | "alert_level": "warn" 36 | }, 37 | { 38 | "job_id": 1, 39 | "name": "Job2", 40 | "missing": 2, 41 | "oldest_missing_period": 3, 42 | "alert_level": "normal" 43 | } 44 | ] 45 | }, 46 | { 47 | "group_id": 1, 48 | "name": "Group2", 49 | "status": [ 50 | { 51 | "job_id": 0, 52 | "name": "Job3", 53 | "missing": 6, 54 | "oldest_missing_period": 6, 55 | "alert_level": "critical" 56 | }, 57 | { 58 | "job_id": 1, 59 | "name": "Job4", 60 | "missing": 0, 61 | "oldest_missing_period": 0, 62 | "alert_level": "great" 63 | } 64 | ] 65 | } 66 | ] 67 | ``` 68 | 69 | ### Display all the periods that are missing for all the jobs 70 | 71 | ``` 72 | $ curl --silent -G http://0.0.0.0:8080/missing | jq . 73 | [ 74 | { 75 | "group": { 76 | "group_id": 0, 77 | "name": "Group1", 78 | "jobs": [ 79 | { 80 | "job_id": 0, 81 | "name": "Job1", 82 | "cmd": "/tmp/first_script", 83 | "time_pattern": "yyyy-MM-dd-HH", 84 | "frequency": "hourly", 85 | "timezone": { 86 | "zone_id": "UTC" 87 | }, 88 | "lookback": 24, 89 | "start_at": 1593093930, 90 | "alert_levels": { 91 | "great": 0, 92 | "normal": 1, 93 | "warn": 2, 94 | "critical": 3 95 | }, 96 | "env": [] 97 | } 98 | ] 99 | }, 100 | "status": [ 101 | { 102 | "job": { 103 | "job_id": 0, 104 | "name": "Job1", 105 | "cmd": "/tmp/first_script", 106 | "time_pattern": "yyyy-MM-dd-HH", 107 | "frequency": "hourly", 108 | "timezone": { 109 | "zone_id": "UTC" 110 | }, 111 | "lookback": 24, 112 | "start_at": 1593093930, 113 | "alert_levels": { 114 | "great": 0, 115 | "normal": 1, 116 | "warn": 2, 117 | "critical": 3 118 | }, 119 | "env": [] 120 | }, 121 | "updated_at": 1593567901, 122 | "period_health": [ 123 | { 124 | "period": "2020-06-27-20", 125 | "ok": false 126 | } 127 | 128 | ... 129 | ``` 130 | 131 | ### Display the current state 132 | 133 | A very detailed view for all monitoring tasks: 134 | 135 | ``` 136 | $ curl --silent -G http://0.0.0.0:8080/state | jq . 137 | [ 138 | { 139 | "group": { 140 | "group_id": 0, 141 | "name": "Group1", 142 | "jobs": [ 143 | { 144 | "job_id": 0, 145 | "name": "Job1", 146 | "owner": "Reporting Team", 147 | "cmd": "/tmp/first_script", 148 | "time_pattern": "yyyy-MM-dd-HH", 149 | "frequency": "hourly", 150 | "timezone": { 151 | "zone_id": "UTC" 152 | }, 153 | "lookback": 24, 154 | "start_at": 1593093930, 155 | "alert_levels": { 156 | "great": 0, 157 | "normal": 1, 158 | "warn": 2, 159 | "critical": 3 160 | }, 161 | "env": [] 162 | } 163 | ] 164 | }, 165 | "status": [ 166 | { 167 | "job": { 168 | "job_id": 0, 169 | "name": "Job1", 170 | "cmd": "/tmp/first_script", 171 | "time_pattern": "yyyy-MM-dd-HH", 172 | "frequency": "hourly", 173 | "timezone": { 174 | "zone_id": "UTC" 175 | }, 176 | "lookback": 24, 177 | "start_at": 1593093930, 178 | "alert_levels": { 179 | "great": 0, 180 | "normal": 1, 181 | "warn": 2, 182 | "critical": 3 183 | }, 184 | "env": [] 185 | }, 186 | "updated_at": 1593567901, 187 | "period_health": [ 188 | { 189 | "period": "2020-06-27-20", 190 | "ok": true 191 | }, 192 | { 193 | "period": "2020-06-27-21", 194 | "ok": true 195 | }, 196 | 197 | ... 198 | ``` 199 | 200 | ### Get group and job by id 201 | 202 | You can query a single group by its id: 203 | 204 | ``` 205 | $ curl --silent -G localhost:8080/group/1 | jq .
206 | { 207 | "group": { 208 | "group_id": 1, 209 | "name": "Group2", 210 | "jobs": [ 211 | { 212 | "job_id": 0, 213 | "name": "Job3", 214 | "cmd": "/tmp/third_script", 215 | "time_pattern": "yyyy-MM-dd", 216 | "frequency": "monthly", 217 | "timezone": { 218 | ... 219 | ``` 220 | 221 | You can also focus on a single job, and query it: 222 | 223 | ``` 224 | $ curl --silent -G localhost:8080/group/1/job/0 | jq . 225 | { 226 | "job": { 227 | "job_id": 0, 228 | "name": "Job3", 229 | "cmd": "/tmp/third_script", 230 | "time_pattern": "yyyy-MM-dd", 231 | "frequency": "monthly", 232 | "timezone": { 233 | "zone_id": "UTC" 234 | }, 235 | "lookback": 3, 236 | "start_at": 1593093930, 237 | "alert_levels": { 238 | "great": 0, 239 | "normal": 1, 240 | "warn": 2, 241 | "critical": 3 242 | }, 243 | "env": [] 244 | }, 245 | "updated_at": 1593585049298, 246 | "period_health": [ 247 | { 248 | "period": "2020-05-01", 249 | "ok": true 250 | }, 251 | { 252 | "period": "2020-06-01", 253 | "ok": true 254 | }, 255 | { 256 | "period": "2020-07-01", 257 | "ok": true 258 | } 259 | ] 260 | } 261 | ``` 262 | 263 | Finally, you can also get the output of a check (monitoring script) for a job: 264 | 265 | ``` 266 | $ curl --silent -G localhost:8080/group/0/job/0/stdout 267 | ``` 268 | 269 | This is useful when debugging the monitoring script, or if there is further 270 | information in the stdout that is not exposed in the UI/API. 271 | 272 | 273 | ### Refresh the state 274 | 275 | You can refresh the entire state at once: 276 | 277 | ``` 278 | $ curl --silent -G localhost:8080/state/refresh | jq . 279 | { 280 | "ok": "State refresh is scheduled" 281 | } 282 | ``` 283 | 284 | You can refresh the state of a single group by its id: 285 | 286 | ``` 287 | $ curl --silent -G localhost:8080/group/0/refresh | jq . 288 | { 289 | "ok": "Group status refresh is scheduled" 290 | } 291 | ``` 292 | 293 | You can also refresh the state of a single job by its id: 294 | 295 | ``` 296 | $ curl --silent -G localhost:8080/group/0/job/0/refresh | jq . 297 | { 298 | "ok": "Job status refresh is scheduled" 299 | } 300 | ``` 301 | ### Health-check 302 | 303 | Checks whether any of the last 5 state refreshes succeeded; if so, the service 304 | is considered to be in good health. 305 | 306 | ``` 307 | $ curl --silent -G http://0.0.0.0:8080/health | jq . 308 | { 309 | "health": "good" 310 | } 311 | ``` 312 | 313 | ### System info 314 | 315 | Prints basic information about the service. 316 | 317 | ``` 318 | $ curl --silent -G localhost:8080/system | jq . 319 | { 320 | "service": "Greenish", 321 | "namespace": "Staging", 322 | "version": "1.4.0-SNAPSHOT", 323 | "uptime": 1784338 324 | } 325 | ``` 326 | 327 | ## Prometheus 328 | 329 | Greenish can also export data to Prometheus.
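To consume these metrics, point a Prometheus server at the `/prometheus` endpoint. A minimal scrape-configuration sketch could look like the following; the target address is an assumption that matches the default `127.0.0.1:8080` binding used in the sample configuration:

```
scrape_configs:
  - job_name: 'greenish'
    # Greenish serves the Prometheus text format on /prometheus,
    # not the conventional /metrics path.
    metrics_path: /prometheus
    static_configs:
      - targets: ['127.0.0.1:8080']
```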
These are the supported metrics: 330 | 331 | ``` 332 | TYPE: GAUGE 333 | NAME: greenish_active_refresh_tasks 334 | HELP: Current number of active state refresh tasks 335 | LABELS: job_id 336 | 337 | TYPE: HISTOGRAM 338 | NAME: greenish_state_refresh_time_seconds 339 | HELP: Job state refreshing time 340 | LABELS: job_id 341 | 342 | TYPE: COUNTER 343 | NAME: greenish_state_refresh_total 344 | HELP: Total number of job state refresh instances 345 | LABELS: job_id 346 | 347 | TYPE: COUNTER 348 | NAME: greenish_state_refresh_expired_total 349 | HELP: Total number of expired job state refresh instances 350 | LABELS: job_id 351 | 352 | TYPE: COUNTER 353 | NAME: greenish_state_refresh_failed_total 354 | HELP: Total number of failed job state refresh instances 355 | LABELS: job_id 356 | 357 | TYPE: GAUGE 358 | NAME: greenish_missing_periods_total 359 | HELP: Current number of missing dataset periods 360 | LABELS: job_id 361 | 362 | TYPE: GAUGE 363 | NAME: greenish_oldest_missing_period 364 | HELP: The oldest missing period 365 | LABELS: job_id 366 | 367 | ``` 368 | 369 | Prometheus metrics can be accessed at the `/prometheus` endpoint: 370 | 371 | ``` 372 | $ curl --silent -G localhost:8080/prometheus 373 | # HELP greenish_active_refresh_tasks Current number of active state refresh tasks 374 | # TYPE greenish_active_refresh_tasks gauge 375 | greenish_active_refresh_tasks{job_id="job_2",} 1.0 376 | greenish_active_refresh_tasks{job_id="job_1",} 0.0 377 | greenish_active_refresh_tasks{job_id="job_4",} 1.0 378 | greenish_active_refresh_tasks{job_id="job_3",} 1.0 379 | ... 380 | ``` 381 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![Build Status](https://travis-ci.org/amanjpro/greenish.svg?branch=master)](https://travis-ci.org/amanjpro/greenish) 4 | [![codecov](https://codecov.io/gh/amanjpro/greenish/branch/master/graph/badge.svg)](https://codecov.io/gh/amanjpro/greenish) [![Join the chat at https://gitter.im/greenish-monitoring/greenish](https://badges.gitter.im/greenish-monitoring/greenish.svg)](https://gitter.im/greenish-monitoring/greenish?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | 6 | **Greenish** is a monitoring tool that checks datasets for existence. 7 | 8 | Greenish understands _periods;_ for example, for an hourly job, it can 9 | verify that all datasets for the past _N_ hours exist. 10 | 11 | Configuration files use the [HOCON] syntax (a superset of [JSON]; 12 | similar to [YAML]): 13 | 14 | * [annotated example](src/test/resources/application.conf); 15 | * [default values](src/main/resources/reference.conf). 16 | 17 | [HOCON]: https://github.com/lightbend/config/blob/master/HOCON.md 18 | [JSON]: https://en.wikipedia.org/wiki/JSON 19 | [YAML]: https://en.wikipedia.org/wiki/YAML 20 | 21 | 22 | Greenish runs [monitoring jobs] to collect information about which 23 | datasets are available and which are missing. Those are individual 24 | scripts that can be written in any language. 25 | 26 | [monitoring jobs]: #monitoring-jobs 27 | 28 | 29 | ## Greenish dashboard 30 | 31 | Greenish provides a basic HTML dashboard to visualise the state of the 32 | monitored jobs. The dashboard can be accessed at `/dashboard`. 33 | 34 | Here is a screenshot: 35 | 36 | ![Greenish dashboard screenshot](doc/images/dashboard.png) 37 | 38 | ## API 39 | 40 | [The Greenish API is documented in `api.md`.](doc/api.md) 41 | 42 | ## Who uses Greenish?
43 | 44 | Greenish is still new. As of now, [Samsung 45 | Ads](https://www.samsung.com/us/business/samsungads/) uses Greenish to monitor 46 | _business-critical datasets_. 47 | 48 | ## Greenish vs others 49 | 50 | * **Nagios** is a monitoring tool for systems, networks and 51 | infrastructure. It is very good at keeping track of the instantaneous 52 | state of a system. But it has no notion of datasets that follow a 53 | periodic pattern (e.g., daily jobs or hourly jobs). Making Nagios 54 | aware of periods falls entirely on the shoulders of the check writers, 55 | which can be very tricky to do (if it is possible at all). 56 | 57 | * **Prometheus** is another great tool for monitoring metrics, and the 58 | health of other systems, but again it doesn't know about datasets 59 | that follow periodic patterns. It is worth mentioning that Greenish 60 | provides an endpoint to export metrics to Prometheus. 61 | 62 | * **Airflow** knows about periods, but it is not a monitoring 63 | tool. Airflow can alert when a run fails, but if an existing dataset 64 | gets deleted accidentally, Airflow stays unaware. 65 | 66 | What sets Greenish apart is that it knows about periods, and keeps checking 67 | datasets for existence. 68 | 69 | ## Monitoring Jobs 70 | 71 | As mentioned earlier, monitoring scripts are stand-alone programs, 72 | written in any language, that respect the following contract: 73 | 74 | * The scripts must be executable. 75 | 76 | * The scripts must accept an arbitrary number of `period` arguments at 77 | the end of their parameter list; e.g., for a script named 78 | `monitor-foo`, running on the `staging` environment, asked to check 79 | the status of three hourly periods: 80 | 81 | ```shell 82 | monitor-foo staging 2020-06-20-10 2020-06-20-11 2020-06-20-12 83 | ``` 84 | 85 | The `check-command` entry for the example above could be: 86 | 87 | ```yaml 88 | check-command: "monitor-foo staging" 89 | period-pattern: "yyyy-MM-dd-HH" 90 | ``` 91 | 92 | - The scripts must print one diagnostic line per provided period in 93 | one of the following two formats, where `1` indicates a successful 94 | period, and `0` indicates a failed period: 95 | 96 | ```text 97 | greenish-period\t<period>\t0 98 | greenish-period\t<period>\t1 99 | ``` 100 | 101 | Where: 102 | 103 | * Each value for `<period>` must match one of the periods passed to 104 | the monitoring script. 105 | 106 | * Diagnostic lines are recognized by the regular expression 107 | `^greenish-period\t.*\t(0|1)$`. 108 | 109 | * Any lines not matching the format are ignored by Greenish. This 110 | allows monitoring scripts to print extra debugging data. 111 | 112 | - The scripts must exit with 0, regardless of the status of any 113 | individual check. Exiting in error is reserved for problems 114 | evaluating the checks themselves. 115 | 116 | Example monitoring script: 117 | 118 | ``` 119 | #!/usr/bin/env bash 120 | farm=$1; shift 121 | 122 | echo '# Start of checks' 123 | for period in "$@"; do 124 | echo '# Arbitrary debugging info here' 125 | 126 | ## Note how the `ls` command below does print some output, which 127 | ## Greenish will ignore. (Unless the input directory is malicious, 128 | ## and purposefully includes files named in the way that Greenish 129 | ## expects as representing check output.) 130 | if ls "$farm/$period"; then 131 | printf 'greenish-period\t%s\t%d\n' "$period" 1 132 | else 133 | printf 'greenish-period\t%s\t%d\n' "$period" 0 134 | fi 135 | done 136 | ``` 137 | 138 | ## Performance Tweaking 139 | 140 | The monitoring jobs are usually blocking IO jobs: they make a network call, wait 141 | for an API, connect to a DB, HDFS, etc. That is why they run under 142 | their very own execution context (thread pool), so that they do not block the 143 | rest of the service (namely the endpoints). The execution context for 144 | the monitoring jobs is controlled by a dispatcher named `refresh-dispatcher`. 145 | Greenish comes with a default config that is suitable for IO-bound processes; 146 | you can find it in the default settings mentioned earlier. 147 | 148 | It is best to use a `thread-pool-executor` dispatcher for blocking jobs, as it 149 | is tailored for IO work. More information can be found here, and a config sketch follows the links: 150 | 151 | - [ThreadPoolExecutor Javadoc](https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/ThreadPoolExecutor.html) 152 | - [Akka documentation](https://doc.akka.io/docs/akka-http/current/handling-blocking-operations-in-akka-http-routes.html#solution-dedicated-dispatcher-for-blocking-operations) 153 |
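A minimal sketch of what such a dispatcher override could look like in `application.conf` is shown below. The keys are standard Akka dispatcher settings, and the `akka.refresh-dispatcher` path matches how Greenish looks the dispatcher up; the pool size is an illustrative assumption, not a recommended value:

```
akka {
  refresh-dispatcher {
    type = Dispatcher
    # A thread-pool-executor suits the blocking, IO-bound monitoring scripts.
    executor = "thread-pool-executor"
    thread-pool-executor {
      # Illustrative size only: match it to the number of monitoring
      # scripts you expect to run concurrently.
      fixed-pool-size = 16
    }
    # Hand each worker one message at a time; fair scheduling for blocking jobs.
    throughput = 1
  }
}
```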
138 | ## Performance Tweaking
139 | 
140 | The monitoring jobs are usually blocking IO jobs: they make that network call,
141 | wait for this API, connect to a DB or HDFS, and so on. That is why they run
142 | under their very own execution context (thread pool), so that they do not block
143 | the rest of the service (namely the endpoints). The execution context for
144 | the monitoring jobs is controlled by a dispatcher named `refresh-dispatcher`.
145 | Greenish ships with a default configuration that is suitable for IO-bound
146 | processes; you can find it in the default settings mentioned earlier.
147 | 
148 | It is best to use a `thread-pool-executor` dispatcher for blocking jobs, as it
149 | is tailored for IO work. More information can be found here:
150 | 
151 | - [ThreadPoolExecutor Javadoc](https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/ThreadPoolExecutor.html)
152 | - [Akka documentation](https://doc.akka.io/docs/akka-http/current/handling-blocking-operations-in-akka-http-routes.html#solution-dedicated-dispatcher-for-blocking-operations)
153 | 
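For example, to resize the dedicated pool, override `refresh-dispatcher` in your `application.conf`. The sketch below mirrors the dispatcher block from the annotated example configuration; treat `fixed-pool-size` as a knob to tune for your workload, not a recommendation:

```
akka {
  # The thread pool used for running the monitoring scripts
  refresh-dispatcher {
    type = Dispatcher
    executor = "thread-pool-executor"
    thread-pool-executor {
      # roughly: how many monitoring scripts may run concurrently
      fixed-pool-size = 100
    }
    throughput = 1
    mailbox-capacity = -1
  }
}
```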
154 | ## Logging
155 | 
156 | Greenish uses Akka's simple logging mechanism. In the spirit of the [Twelve-Factor
157 | App](https://12factor.net/logs), all logs are written to STDOUT, and the
158 | configuration can be done via the `application.conf` file. The following
159 | is a summary of some of the most useful options for customizing logging:
160 | 
161 | ```
162 | akka {
163 |   # Log the complete configuration at INFO level when Greenish is started.
164 |   # This is useful when you are uncertain of what configuration is used.
165 |   log-config-on-start = on
166 |   # Options are: OFF, DEBUG, INFO, ERROR, WARN
167 |   loglevel = "DEBUG"
168 |   # To turn off logging completely
169 |   stdout-loglevel = "OFF"
170 | 
171 |   # Not necessarily useful in prod, but can be useful during development.
172 |   # You probably want to skip the following in production.
173 |   log-dead-letters = 10
174 |   log-dead-letters-during-shutdown = on
175 |   actor {
176 |     debug {
177 |       # enable function of LoggingReceive, which is to log any received message at
178 |       # DEBUG level
179 |       receive = on
180 |       # enable DEBUG logging of all AutoReceiveMessages (Kill, PoisonPill etc.)
181 |       autoreceive = on
182 |       # enable DEBUG logging of actor lifecycle changes
183 |       lifecycle = on
184 |       # enable DEBUG logging of unhandled messages
185 |       unhandled = on
186 |       # enable DEBUG logging of all LoggingFSMs for events, transitions and timers
187 |       fsm = on
188 |     }
189 |   }
190 | }
191 | ```
192 | 
193 | ## Pre-built package
194 | 
195 | You can download pre-built packages (both the fat (i.e. assembly) jar and Docker)
196 | from the [releases page](https://github.com/amanjpro/greenish/releases). The
197 | latest Docker image can be found at the [packages
198 | page](https://github.com/amanjpro/greenish/packages).
199 | 
200 | ## Development
201 | 
202 | ### Requirements
203 | 
204 | - Java 8
205 | - SBT 1.3.x
206 | - Bash
207 | - NodeJS 14+
208 | 
209 | ### Building from source
210 | 
211 | First install the `npm` dependencies:
212 | 
213 | `$ npm install`
214 | 
215 | SBT takes care of building and testing both the Scala and the JavaScript/JSX code:
216 | 
217 | `$ sbt clean test package`
218 | 
219 | To run the service from source:
220 | `$ sbt -Dconfig.file=PATH_TO_CONFIG_FILE run`
221 | 
222 | **Note:** Unfortunately, the JavaScript code has no tests yet; this is an issue
223 | that needs to be resolved.
224 | 
225 | #### Packaging
226 | 
227 | Greenish supports both "fat jar" and Docker packaging. A fat jar is a single,
228 | self-contained jar that can be distributed to any *nix environment (as long as
229 | Java and Bash are installed):
230 | 
231 | ```
232 | $ sbt assembly
233 | $ java -Dconfig.file=PATH_TO_CONFIG_FILE -jar target/scala-2.13/greenish-assembly-*.jar
234 | ```
235 | 
236 | You can also build Docker images:
237 | 
238 | ```
239 | $ sbt docker:publishLocal
240 | # The docker image expects the config to be mounted at: /app/config.yml
241 | $ docker run --volume PATH_TO_CONFIG_FILE:/app/config.yml --rm -p 8080:8080 greenish:LATEST_VERSION
242 | ```
243 | 
244 | ## Contributing
245 | 
246 | Contributions are most welcome. Please fork it, use it, open issues, and submit PRs!
247 | 
248 | ## Acknowledgment
249 | 
250 | - Thanks to [Nasrin Zaza](https://www.linkedin.com/in/nasrin-zaza/) for the
251 |   amazing logo
252 | 
--------------------------------------------------------------------------------
/src/test/resources/application.conf:
--------------------------------------------------------------------------------
1 | # The jobs/data-sets to monitor
2 | check-groups: {
3 |   # This is an optional subtitle to show in the dashboard
4 |   namespace: "Test dashboard"
5 |   # Where to store temporary files; this is used to store the debug lines that
6 |   # can be seen in the /group/gid/job/stdout endpoint. You only need to change
7 |   # this setting if you have more than one instance of Greenish running on the
8 |   # same machine; if you leave it unchanged in that case, there will be a race
9 |   # condition between the instances.
10 |   scratch-dir: "/tmp/greenish/stdout"
11 |   # The frequency of pulling data
12 |   refresh-in-seconds: 30
13 |   # The binding address of the HTTP server
14 |   binding-address: "127.0.0.1"
15 |   # The port on which the server is running
16 |   port: 8080
17 |   # Not all jobs are available at the beginning of a period: an hourly job might
18 |   # systematically appear at the end of the hour, a monthly job at the end of
19 |   # the month, and so on. This is a global setting for specifying the default
20 |   # period-offset; if a job is expected to arrive at the end of the period, then
21 |   # the offset should be 1.
22 |   # This setting can be overridden by `group-period-check-offset` at the group
23 |   # level, and `job-period-check-offset` at the job level.
24 |   default-period-check-offset: 1
25 |   # Default period pattern. Please see job-entries' period-pattern property for
26 |   # more information
27 |   default-period-pattern: "yyyy-MM-dd-HH-mm"
28 |   # Default run frequency. Please see job-entries' job-run-frequency property
29 |   # for more information
30 |   default-job-run-frequency: "daily"
31 |   # Default timezone. Please see job-entries' timezone property for more
32 |   # information
33 |   default-timezone: "US/Samoa"
34 |   # Default lookback. Please see job-entries' lookback property for more
35 |   # information
36 |   default-lookback: 270
37 |   # Default great-at. Please see job-entries' great-at property for
38 |   # more information
39 |   default-great-at: 30
40 |   # Default normal-at property. Please see job-entries' normal-at
41 |   # property for more information
42 |   default-normal-at: 40
43 |   # Default warn-at property. Please see job-entries' warn-at
44 |   # property for more information
45 |   default-warn-at: 50
46 |   # Default error-at property. Please see job-entries' error-at
47 |   # property for more information
48 |   default-error-at: 60
49 | 
50 |   # Default start-at property. This is basically where we define the
51 |   # oldest possible available dataset period. This is useful if, for
52 |   # example, we add a new job to be monitored and the lookback is
53 |   # 10: for the next 10 hours the job would start complaining that some
54 |   # data is missing, even though it is correctly missing (the job
55 |   # didn't exist for those periods). The default value is 0. The value
56 |   # is expressed as a Unix epoch timestamp (seconds). This property can
57 |   # be overridden at both the group and the job level.
58 |   default-start-at: 0
59 |   # Default owner of all the following jobs; it can be used for escalation
60 |   # purposes. This appears under the Job view in the dashboard. HTML tags are
61 |   # supported. This setting is optional.
62 |   # Can be overridden by the group-owner and job-owner settings
63 |   default-owner: "SRE"
64 |   # Default info for all the following jobs; it can be used for adding extra
65 |   # information to be shown next to the job in the dashboard. HTML tags are
66 |   # encouraged here. Multiline strings are introduced using `"""` like in
67 |   # Python and Scala. If you start every new line with `|`, then all leading
68 |   # whitespace up to that character will be ignored. This setting is optional.
69 |   # Can be overridden by the group-info and job-info settings
70 |   default-info: """|
71 |   |Link
72 |   |"""
73 |   # Additional environment variables to be passed
74 |   # to the monitoring scripts; you can put AWS profile
75 |   # names here, for example:
76 |   # AWS_PROFILE: "reader-profile"
77 |   # `env` can be set at both the group and the job level
78 |   env: {
79 |     VAR1: "foo"
80 |     # The `secure(...)` pattern tells Greenish that the data should be kept
81 |     # secret and not returned in any of the endpoints.
82 |     VAR2: "secure(bar)"
83 |   }
84 |   # Job groups; a group is a set of jobs/data-sets
85 |   # that have some sort of logical relation
86 |   groups: [
87 |     {
88 |       # Pick a human-friendly name here
89 |       group-name: "Group1",
90 |       # More or less like `default-period-check-offset`, but this is scoped to
91 |       # the group only. Can be overridden by `job-period-check-offset`.
92 |       group-period-check-offset: 2
93 |       # More or less like `default-period-pattern`, but this is scoped to
94 |       # the group only. Can be overridden by `period-pattern`.
95 |       group-period-pattern: "yyyy-MM-dd-HH"
96 |       # More or less like `default-job-run-frequency`, but this is scoped to
97 |       # the group only. Can be overridden by `job-run-frequency`.
98 |       group-job-run-frequency: "hourly"
99 |       # More or less like `default-timezone`, but this is scoped to
100 |       # the group only. Can be overridden by `timezone`.
101 |       group-timezone: "US/Alaska"
102 |       # More or less like `default-lookback`, but this is scoped to
103 |       # the group only. Can be overridden by `lookback`.
104 |       group-lookback: 24
105 |       # More or less like `default-great-at`, but this is scoped to
106 |       # the group only. Can be overridden by `great-at`.
107 |       group-great-at: 0
108 |       # More or less like `default-normal-at`, but this is scoped to
109 |       # the group only. Can be overridden by `normal-at`.
110 |       group-normal-at: 1
111 |       # More or less like `default-warn-at`, but this is scoped to
112 |       # the group only. Can be overridden by `warn-at`.
113 |       group-warn-at: 2
114 |       # More or less like `default-error-at`, but this is scoped to
115 |       # the group only. Can be overridden by `error-at`.
116 |       group-error-at: 3
117 | 
118 |       # Just like default-start-at, but for the group level
119 |       group-start-at: 1
120 |       # Group-level owner; more or less default-owner, but at the group level
121 |       group-owner: "Reporting"
122 |       # Group-level info; more or less default-info, but at the group level
123 |       group-info: "Group info"
124 |       env: {
125 |         VAR1: "baz"
126 |         VAR3: "bazooka"
127 |       }
128 | 
129 |       # A group can have many jobs/data-sets to monitor
130 |       job-entries: [
131 |         {
132 |           # Pick a human-friendly name here
133 |           job-name: "Job1"
134 |           # An id to be used as a label for the exported Prometheus metrics.
135 |           # Each job exports internal metrics to Prometheus under a label,
136 |           # which is controlled here. It is best to make sure that the id is
137 |           # unique per job, but this is not enforced.
138 |           #
139 |           # In case this option is skipped, the combination of the `group` and
140 |           # `job` names is chosen, turned into lower-case, and all whitespace
141 |           # characters and hyphens are replaced with `_`.
142 |           # Prometheus IDs should match this pattern:
143 |           # "[a-zA-Z_][a-zA-Z0-9_]*"
144 |           prometheus-id: "job_1"
145 |           # A check-command is any executable program/script that takes
146 |           # periods, in the form of the `period-pattern` below, as its last
147 |           # arguments, and exits with 0 only if successful. You can add
148 |           # arguments to the script here: `/etc/check job1 production` is
149 |           # perfectly allowed.
150 |           # In case Greenish fails to run the script, please wrap it in a
151 |           # shell script and add a shebang at the top; Java's ProcessBuilder
152 |           # can fail to recognize some scripts/programs.
153 |           check-command: "/tmp/first_script",
154 |           # A valid date/time pattern. Please consult the following page for
155 |           # more info:
156 |           # https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#patterns
157 |           # If the data-set is expected to appear on the first day of every
158 |           # month, you can write a pattern like: yyyy-MM-01
159 |           period-pattern: "yyyy-MM-dd-HH"
160 |           # What is the expected run-frequency of the job?
161 |           # Supported values: hourly, daily, monthly, annually. Unix-Cron-style
162 |           # syntax is also accepted here: `0 * * * *` runs at minute zero every
163 |           # hour, more or less like `hourly`. Cron-style reacts differently to
164 |           # the `period-check-offset` settings; in the case of cron, you might
165 |           # want to increase the offset by 1.
166 |           job-run-frequency: "hourly"
167 |           # More or less like `group-period-check-offset`, but this is scoped
168 |           # to this job only.
169 |           job-period-check-offset: 3
170 |           # The timezone of the periods in the data set. If you have two jobs,
171 |           # one produced in Cairo that follows the Cairo timezone, and another
172 |           # in Canada that follows UTC, you can configure each accordingly
173 |           # using this field. Greenish respects this option when calling the
174 |           # monitoring script.
174 |           timezone: "UTC"
175 |           # How far back do you want to monitor? In this example we monitor
176 |           # the last 24 datasets (hours)
177 |           lookback: 24
178 |           # The following are hints for Greenish, to decide whether a job is
179 |           # in a "great", "normal", "warn", or "critical" state
180 |           great-at: 0
181 |           normal-at: 1
182 |           warn-at: 2
183 |           error-at: 3
184 |           # Just like default-start-at, but for this job only
185 |           start-at: 2
186 |           # Job-level owner; more or less group-owner, but at the job level
187 |           job-owner: "Data"
188 |           # Job-level info; more or less group-info, but at the job level
189 |           job-info: "Job info"
190 |           env: {
191 |             VAR2: "bazomba"
192 |             VAR3: "bada"
193 |             VAR4: "badam"
194 |           }
195 |         },
196 |         {
197 |           job-name: "Job2"
198 |           prometheus-id: "job_2"
199 |           check-command: "/tmp/second_script job2",
200 |           period-pattern: "yyyy-MM-dd-HH"
201 |           job-run-frequency: "daily"
202 |           timezone: "UTC"
203 |           lookback: 24
204 |           great-at: 0
205 |           normal-at: 1
206 |           warn-at: 2
207 |           error-at: 3
208 |         },
209 |         {
210 |           job-name: "Job5"
211 |           check-command: "/tmp/second_script job5",
212 |         },
213 |         {
214 |           job-name: "Job7"
215 |           check-command: "/tmp/second_script job7",
216 |           job-run-frequency: "0 * * * *"
217 |         }
218 |       ]
219 |     },
220 |     {
221 |       group-name: "Group2",
222 |       job-entries: [
223 |         {
224 |           job-name: "Job3"
225 |           prometheus-id: "job_3"
226 |           check-command: "/tmp/third_script",
227 |           period-pattern: "yyyy-MM-dd"
228 |           job-run-frequency: "monthly"
229 |           timezone: "UTC"
230 |           lookback: 3
231 |           great-at: 0
232 |           normal-at: 1
233 |           warn-at: 2
234 |           error-at: 3
235 |         },
236 |         {
237 |           job-name: "Job4"
238 |           prometheus-id: "job_4"
239 |           check-command: "/tmp/fourth_script",
240 |           period-pattern: "yyyy-01-01"
241 |           job-run-frequency: "annually"
242 |           timezone: "UTC"
243 |           lookback: 3
244 |           great-at: 0
245 |           normal-at: 1
246 |           warn-at: 2
247 |           error-at: 3
248 |         },
249 |         {
250 |           job-name: "Job6"
251 |           check-command: "/tmp/second_script job6",
252 |           env: {
253 |             VAR1: "baz"
254 |             VAR3: "bazooka"
255 |           }
256 |         }
257 |       ]
258 |     }
259 |   ]
260 | }
261 | 
262 | # This section is used to tune the performance of Greenish
263 | akka {
264 |   # This is the thread-pool for running the monitoring scripts.
265 |   # If Greenish is unresponsive, you should look into this.
266 |   # As monitoring scripts are expected to be IO-bound, you
267 |   # may want to maximize parallelism.
268 |   refresh-dispatcher {
269 |     type = Dispatcher
270 |     executor = "thread-pool-executor"
271 |     thread-pool-executor {
272 |       fixed-pool-size = 100
273 |     }
274 |     throughput = 1
275 |     mailbox-capacity = -1
276 |   }
277 | }
278 | 
--------------------------------------------------------------------------------
/src/test/scala/models/JsonSerdeSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 | 
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 | import java.time.ZoneId
6 | import io.circe.Json
7 | import io.circe.parser._
8 | import io.circe.syntax.EncoderOps
9 | 
10 | class JsonSerdeSpec() extends Matchers
11 |   with AnyWordSpecLike {
12 |   "healthJson" must {
13 |     "produce correct JSON when health is bad" in {
14 |       val expected = "bad"
15 |       val json = healthJson(false)
16 |       val actual = json.hcursor.downField("health").as[String].getOrElse(???)
17 | actual shouldBe expected 18 | json.hcursor.keys.get.size shouldBe 1 19 | } 20 | 21 | "produce correct JSON when health is good" in { 22 | val expected = "good" 23 | val json = healthJson(true) 24 | val actual = json.hcursor.downField("health").as[String].getOrElse(???) 25 | actual shouldBe expected 26 | json.hcursor.keys.get.size shouldBe 1 27 | } 28 | } 29 | 30 | "errorJson" must { 31 | "produce correct JSON" in { 32 | val expected = "Error" 33 | val json = errorJson(expected) 34 | val actual = json.hcursor.downField("error").as[String].getOrElse(???) 35 | actual shouldBe expected 36 | json.hcursor.keys.get.size shouldBe 1 37 | } 38 | } 39 | 40 | "okJson" must { 41 | "produce correct JSON" in { 42 | val expected = "OK" 43 | val json = okJson(expected) 44 | val actual = json.hcursor.downField("ok").as[String].getOrElse(???) 45 | actual shouldBe expected 46 | json.hcursor.keys.get.size shouldBe 1 47 | } 48 | } 49 | 50 | "AlertLevel" must { 51 | "produce correct JSON" in { 52 | (Great: AlertLevel).asJson shouldBe "great".asJson 53 | (Normal: AlertLevel).asJson shouldBe "normal".asJson 54 | (Warn: AlertLevel).asJson shouldBe "warn".asJson 55 | (Critical: AlertLevel).asJson shouldBe "critical".asJson 56 | } 57 | 58 | "correctly parse JSON string" in { 59 | parse(""""great"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Great 60 | parse(""""normal"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Normal 61 | parse(""""warn"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Warn 62 | parse(""""critical"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Critical 63 | } 64 | } 65 | 66 | "AlertLevels" must { 67 | "produce correct JSON" in { 68 | val expected = Json.obj ( 69 | "great" -> 1.asJson, 70 | "normal" -> 2.asJson, 71 | "warn" -> 3.asJson, 72 | "critical" -> 4.asJson, 73 | ) 74 | val actual = AlertLevels(1, 2, 3, 4).asJson 75 | actual shouldBe expected 76 | } 77 | 78 | "correctly parse JSON" in { 79 | val expected = AlertLevels(1, 2, 3, 4) 80 | val actual = expected.asJson.as[AlertLevels].getOrElse(???) 81 | 82 | actual shouldBe expected 83 | } 84 | } 85 | 86 | "CheckFrequency" must { 87 | "produce correct JSON" in { 88 | (Hourly: CheckFrequency).asJson shouldBe "hourly".asJson 89 | (Daily: CheckFrequency).asJson shouldBe "daily".asJson 90 | (Monthly: CheckFrequency).asJson shouldBe "monthly".asJson 91 | (Annually: CheckFrequency).asJson shouldBe "annually".asJson 92 | val pattern = "* * * * *" 93 | val expected = Json.obj("pattern" -> pattern.asJson) 94 | Cron(pattern).asJson shouldBe expected 95 | (Cron(pattern): CheckFrequency).asJson shouldBe expected 96 | } 97 | 98 | "correctly parse JSON string" in { 99 | parse(""""hourly"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Hourly 100 | parse(""""daily"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Daily 101 | parse(""""monthly"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Monthly 102 | parse(""""annually"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Annually 103 | val pattern = "* * * * *" 104 | val expected = Cron(pattern) 105 | val actualCron = expected.asJson.as[Cron].getOrElse(???) 106 | actualCron shouldBe expected 107 | val actualCheckFrequency = expected.asJson.as[CheckFrequency].getOrElse(???) 
108 | actualCheckFrequency shouldBe expected 109 | } 110 | } 111 | 112 | "EnvVar" must { 113 | "produce correct JSON when it is PlainEnvVar" in { 114 | val actual = EnvVar("username", "Homa").asJson 115 | val expected = Json.obj ( 116 | "type" -> "plain".asJson, 117 | "name" -> "username".asJson, 118 | "value" -> "Homa".asJson, 119 | ) 120 | actual shouldBe expected 121 | } 122 | 123 | "correctly parse JSON string for PlainEnvVar" in { 124 | val expected = EnvVar("username", "Homa") 125 | val actual = expected.asJson.as[EnvVar].getOrElse(???) 126 | actual shouldBe expected 127 | } 128 | 129 | "produce correct JSON when it is SecureEnvVar" in { 130 | val actual = EnvVar("username", "secure(Homa)").asJson 131 | val expected = Json.obj ( 132 | "type" -> "secure".asJson, 133 | "name" -> "username".asJson, 134 | "value" -> SecureEnvVar.HIDDEN_PASSWORD.asJson, 135 | ) 136 | actual shouldBe expected 137 | } 138 | 139 | "correctly parse JSON string for SecureEnvVar" in { 140 | val origin = EnvVar("username", "secure(Homa)") 141 | val expected = EnvVar("username", s"secure(${SecureEnvVar.HIDDEN_PASSWORD})") 142 | val actual = origin.asJson.as[EnvVar].getOrElse(???) 143 | actual shouldBe expected 144 | } 145 | } 146 | 147 | "Group" must { 148 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd", 149 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6), 150 | None, Seq(EnvVar("a", "b"))) 151 | val group = Group(0, "g", Seq(job)) 152 | 153 | "produce correct JSON" in { 154 | val actual = group.asJson 155 | 156 | val expected = Json.obj( 157 | "group_id" -> 0.asJson, 158 | "name" -> "g".asJson, 159 | "jobs" -> Seq(job).asJson 160 | ) 161 | 162 | actual shouldBe expected 163 | } 164 | 165 | "correctly parse JSON" in { 166 | val expected = group 167 | val actual = expected.asJson.as[Group].getOrElse(???) 168 | 169 | actual shouldBe expected 170 | } 171 | } 172 | 173 | "GroupStatus" must { 174 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd", 175 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6), 176 | None, Seq(EnvVar("a", "b"))) 177 | val group = Group(0, "g", Seq(job)) 178 | val periods = Seq(PeriodHealth("1", true), PeriodHealth("2", false)) 179 | val jobStatus = JobStatus(job, 100, periods) 180 | val groupStatus = GroupStatus(group, Array(jobStatus)) 181 | 182 | "produce correct JSON" in { 183 | val expected = Json.obj( 184 | "group" -> group.asJson, 185 | "status" -> Seq(jobStatus).asJson, 186 | ) 187 | 188 | val actual = groupStatus.asJson 189 | actual shouldBe expected 190 | } 191 | 192 | "correctly parse JSON" in { 193 | val expected = groupStatus 194 | val actual = expected.asJson.as[GroupStatus].getOrElse(???) 195 | 196 | actual shouldBe expected 197 | } 198 | } 199 | 200 | "GroupStatusSummary" must { 201 | val jobStatus = Seq(JobStatusSummary(0, "j", 1, 1, Critical)) 202 | val groupStatusSummary = GroupStatusSummary(2, "g", jobStatus) 203 | "produce correct JSON" in { 204 | 205 | val expected = Json.obj( 206 | "group_id" -> 2.asJson, 207 | "name" -> "g".asJson, 208 | "status" -> jobStatus.asJson, 209 | ) 210 | 211 | val actual = groupStatusSummary.asJson 212 | actual shouldBe expected 213 | } 214 | 215 | "correctly parse JSON" in { 216 | val expected = groupStatusSummary 217 | val actual = expected.asJson.as[GroupStatusSummary].getOrElse(???)
218 | 219 | actual shouldBe expected 220 | } 221 | } 222 | 223 | "Job" must { 224 | val alertLevels = AlertLevels(3, 4, 5, 6) 225 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd", 226 | Hourly, 1, ZoneId.of("UTC"), 2, 0, alertLevels, 227 | None, Seq(EnvVar("a", "b"))) 228 | 229 | "produce correct JSON when there is no owner and no info" in { 230 | val alertLevels = AlertLevels(3, 4, 5, 6) 231 | val actual = job.asJson 232 | 233 | val expected = Json.obj( 234 | "job_id" -> 1.asJson, 235 | "name" -> "j".asJson, 236 | "owner" -> Json.Null, 237 | "prometheus_id" -> "p".asJson, 238 | "cmd" -> "c".asJson, 239 | "time_pattern" -> "yyyy-MM-dd".asJson, 240 | "frequency" -> "hourly".asJson, 241 | "period_check_offset" -> 1.asJson, 242 | "timezone" -> Json.obj ("zone_id" -> "UTC".asJson), 243 | "lookback" -> 2.asJson, 244 | "start_at" -> 0.asJson, 245 | "alert_levels" -> alertLevels.asJson, 246 | "info" -> Json.Null, 247 | "env" -> Seq(EnvVar("a", "b")).asJson, 248 | ) 249 | 250 | actual shouldBe expected 251 | } 252 | 253 | "produce correct JSON when owner exists" in { 254 | val alertLevels = AlertLevels(3, 4, 5, 6) 255 | val actual = job.copy(owner=Some("me"), info=Some("you")).asJson 256 | 257 | val expected = Json.obj( 258 | "job_id" -> 1.asJson, 259 | "name" -> "j".asJson, 260 | "owner" -> "me".asJson, 261 | "prometheus_id" -> "p".asJson, 262 | "cmd" -> "c".asJson, 263 | "time_pattern" -> "yyyy-MM-dd".asJson, 264 | "frequency" -> "hourly".asJson, 265 | "period_check_offset" -> 1.asJson, 266 | "timezone" -> Json.obj ("zone_id" -> "UTC".asJson), 267 | "lookback" -> 2.asJson, 268 | "start_at" -> 0.asJson, 269 | "alert_levels" -> alertLevels.asJson, 270 | "info" -> "you".asJson, 271 | "env" -> Seq(EnvVar("a", "b")).asJson, 272 | ) 273 | 274 | actual shouldBe expected 275 | } 276 | 277 | "correctly parse JSON" in { 278 | val expected = job 279 | val actual = expected.asJson.as[Job].getOrElse(???) 280 | 281 | actual shouldBe expected 282 | } 283 | } 284 | 285 | "JobStatus" must { 286 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd", 287 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6), 288 | None, Seq(EnvVar("a", "b")) 289 | ) 290 | val periods = Seq(PeriodHealth("1", true), PeriodHealth("2", false)) 291 | val jobStatus = JobStatus(job, 100, periods) 292 | 293 | "produce correct JSON" in { 294 | val expected = Json.obj( 295 | "job" -> job.asJson, 296 | "updated_at" -> 100.asJson, 297 | "period_health" -> periods.asJson, 298 | ) 299 | 300 | val actual = jobStatus.asJson 301 | actual shouldBe expected 302 | } 303 | 304 | "correctly parse JSON" in { 305 | val expected = jobStatus 306 | val actual = expected.asJson.as[JobStatus].getOrElse(???) 307 | 308 | actual shouldBe expected 309 | } 310 | } 311 | 312 | "JobStatusSummary" must { 313 | val jobStatusSummary = JobStatusSummary(0, "j", 1, 2, Critical) 314 | "produce correct JSON" in { 315 | val expected = Json.obj( 316 | "job_id" -> 0.asJson, 317 | "name" -> "j".asJson, 318 | "missing" -> 1.asJson, 319 | "oldest_missing_period" -> 2.asJson, 320 | "alert_level" -> "critical".asJson, 321 | ) 322 | 323 | val actual = jobStatusSummary.asJson 324 | actual shouldBe expected 325 | } 326 | 327 | "correctly parse JSON" in { 328 | val expected = jobStatusSummary 329 | val actual = expected.asJson.as[JobStatusSummary].getOrElse(???) 
330 | 331 | actual shouldBe expected 332 | } 333 | } 334 | 335 | "Lag" must { 336 | "produce correct JSON" in { 337 | val expected = Json.obj( 338 | "lag" -> 4.asJson, 339 | ) 340 | 341 | val actual = Lag(4).asJson 342 | actual shouldBe expected 343 | } 344 | 345 | "correctly parse JSON" in { 346 | val expected = Lag(5) 347 | val actual = expected.asJson.as[Lag].getOrElse(???) 348 | 349 | actual shouldBe expected 350 | } 351 | } 352 | 353 | "PeriodHealth" must { 354 | "produce correct JSON" in { 355 | val expected = Json.obj( 356 | "period" -> "2020-06-25-18".asJson, 357 | "ok" -> false.asJson, 358 | ) 359 | 360 | val actual = PeriodHealth("2020-06-25-18", false).asJson 361 | actual shouldBe expected 362 | } 363 | 364 | "correctly parse JSON" in { 365 | val expected = PeriodHealth("2020-06-25-18", false) 366 | val actual = expected.asJson.as[PeriodHealth].getOrElse(???) 367 | 368 | actual shouldBe expected 369 | } 370 | } 371 | 372 | "sysinfo" must { 373 | "produce correct JSON when namespace is missing" in { 374 | val json = sysinfo(None) 375 | val cursor = json.hcursor 376 | cursor.downField("version").as[Option[String]].isRight shouldBe true 377 | cursor.downField("namespace").as[Option[String]] shouldBe Right(None) 378 | cursor.downField("service").as[String] shouldBe Right("Greenish") 379 | cursor.downField("uptime").as[Long].isRight shouldBe true 380 | cursor.keys.get.size shouldBe 4 381 | } 382 | 383 | "produce correct JSON when namespace is not missing" in { 384 | val json = sysinfo(Some("my dashboard")) 385 | val cursor = json.hcursor 386 | cursor.downField("version").as[Option[String]].isRight shouldBe true 387 | cursor.downField("namespace").as[Option[String]] shouldBe Right(Some("my dashboard")) 388 | cursor.downField("service").as[String] shouldBe Right("Greenish") 389 | cursor.downField("uptime").as[Long].isRight shouldBe true 390 | cursor.keys.get.size shouldBe 4 391 | } 392 | } 393 | } 394 | 395 | -------------------------------------------------------------------------------- /doc/images/greenish-with-background.svg: -------------------------------------------------------------------------------- 1 | Greenish Logo BG -------------------------------------------------------------------------------- /src/test/scala/checker/CommandRunnerSpec.scala: -------------------------------------------------------------------------------- 1 | package me.amanj.greenish.checker 2 | 3 | import akka.actor.{ActorSystem, Props, ActorRef} 4 | import java.io.File 5 | import scala.concurrent.duration._ 6 | import akka.testkit.{ ImplicitSender, TestKit } 7 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} 8 | import org.scalatest.matchers.should.Matchers 9 | import org.scalatest.wordspec.AnyWordSpecLike 10 | import org.scalatest.concurrent.Eventually 11 | import scala.concurrent.duration._ 12 | import me.amanj.greenish.models.PeriodHealth 13 | import me.amanj.greenish.stats.{StatsCollector, StatsCollectorSpec, GetPrometheus} 14 | import scala.jdk.CollectionConverters._ 15 | 16 | import scala.language.postfixOps 17 | import scala.io.Source 18 | 19 | class CommandRunnerSpec() 20 | extends TestKit(ActorSystem("CommandRunnerSpec")) 21 | with ImplicitSender 22 | with AnyWordSpecLike 23 | with Matchers 24 | with Eventually 25 | with BeforeAndAfterEach 26 | with BeforeAndAfterAll { 27 | 28 | val src = s"/tmp/greenish-test-${System.currentTimeMillis}" 29 | val farFuture = System.currentTimeMillis * 2 30 | val dir = new File("/tmp/2020-06-07-01") 31 | val dirWithSpaces = new 
File("/tmp/2020-06-07 01") 32 | val lsSleep = getClass.getResource("/ls-sleep").getFile 33 | val ls = getClass.getResource("/test-ls").getFile 34 | val lsEnv = getClass.getResource("/test-ls-env").getFile 35 | val lsDup = getClass.getResource("/test-duplicate-period").getFile 36 | val lsPart = getClass.getResource("/test-partial-period").getFile 37 | implicit val patience: PatienceConfig = PatienceConfig(15 seconds, 1 second) 38 | 39 | var stats: ActorRef = _ 40 | val outputDir = new File("/tmp/greenish/stdout") 41 | 42 | override def beforeAll: Unit = { 43 | dirWithSpaces.mkdirs 44 | outputDir.mkdirs 45 | dir.mkdirs 46 | } 47 | 48 | override def afterAll: Unit = { 49 | dir.delete 50 | dirWithSpaces.delete 51 | outputDir.delete 52 | TestKit.shutdownActorSystem(system) 53 | } 54 | 55 | override def afterEach(): Unit = { 56 | new File(src).delete 57 | } 58 | 59 | override def beforeEach(): Unit = { 60 | super.beforeEach() 61 | stats = system.actorOf( 62 | Props(new StatsCollector(Set("p1", "p2", "p3")))) 63 | } 64 | 65 | "parseOutput" must { 66 | "parse output lines correctly" in { 67 | val lines = LazyList( 68 | "greenish-period\t2020-02-17 8\t1", 69 | "greenish-period\t2020-02-17-9\t1", 70 | "greenish-period\t2020-02-17 10\t0", 71 | "greenish-period\t2020-02-17-11\t0", 72 | "greenish-period\t2020-02-17 10 38\t0", 73 | "Other output", 74 | "greenish-period 2020-02-17 10 38 0", 75 | "greenish-period\t2020-02-17 10 38\t9", 76 | ) 77 | val periods = Set( 78 | "2020-02-17 8", 79 | "2020-02-17-9", 80 | "2020-02-17 10", 81 | "2020-02-17-11", 82 | ) 83 | 84 | val expected = Seq( 85 | ("2020-02-17 8", true), 86 | ("2020-02-17-9", true), 87 | ("2020-02-17 10", false), 88 | ("2020-02-17-11", false), 89 | ) 90 | 91 | val actual = CommandRunner.parseOutput(lines, periods) 92 | 93 | actual shouldBe expected 94 | } 95 | 96 | "ignore lines that do not match the period set" in { 97 | val lines = LazyList( 98 | "greenish-period\t2020-02-17-10\t1", 99 | "greenish-period\t2020-02-17-11\t0", 100 | ) 101 | val periods = Set( 102 | "2020-02-17-10", 103 | ) 104 | 105 | val expected = Seq( 106 | ("2020-02-17-10", true), 107 | ) 108 | 109 | val actual = CommandRunner.parseOutput(lines, periods) 110 | 111 | actual shouldBe expected 112 | } 113 | 114 | "capture duplicate periods correctly" in { 115 | val lines = LazyList( 116 | "greenish-period\t2020-02-17-10\t1", 117 | "greenish-period\t2020-02-17-10\t0", 118 | "greenish-period\t2020-02-17-11\t0", 119 | ) 120 | val periods = Set( 121 | "2020-02-17-10", 122 | "2020-02-17-11", 123 | ) 124 | 125 | val expected = Seq( 126 | ("2020-02-17-10", true), 127 | ("2020-02-17-10", false), 128 | ("2020-02-17-11", false), 129 | ) 130 | 131 | val actual = CommandRunner.parseOutput(lines, periods) 132 | 133 | actual shouldBe expected 134 | } 135 | 136 | "Have no problem if a period in the provided period-set wasn't in the output lines" in { 137 | val lines = LazyList( 138 | "greenish-period\t2020-02-17-10\t1", 139 | "greenish-period\t2020-02-17-11\t0", 140 | ) 141 | val periods = Set( 142 | "2020-02-17-10", 143 | "2020-02-17-11", 144 | "2020-02-17-12", 145 | ) 146 | 147 | val expected = Seq( 148 | ("2020-02-17-10", true), 149 | ("2020-02-17-11", false), 150 | ) 151 | 152 | val actual = CommandRunner.parseOutput(lines, periods) 153 | 154 | actual shouldBe expected 155 | } 156 | } 157 | 158 | "toBashCommand" must { 159 | "single-quote the periods to avoid bash splitting" in { 160 | val periods = Seq("20 02", "30 03", "01 10", "400") 161 | val cmd = "hey this is a command" 162 | val 
actual = CommandRunner.toBashCommand(cmd, periods) 163 | val expected = "hey this is a command '20 02' '30 03' '01 10' '400'" 164 | actual shouldBe expected 165 | } 166 | } 167 | 168 | "BatchRun command" must { 169 | 170 | import StatsCollectorSpec.{checkSamples, getNoneZeroHistogramLabels} 171 | 172 | "not run anything if the refresh command is too old" in { 173 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 174 | actor ! BatchRun(lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, 0) 175 | expectNoMessage(4 seconds) 176 | } 177 | 178 | "write debugging lines to disk verbatim" in { 179 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 180 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 181 | 182 | val _ = receiveOne(2 seconds) 183 | 184 | val expected = List("LETS PRINT THINGS", "DEBUG HERE TOO", 185 | "greenish-period\t2020-06-07-01\t1", "DEBUG HERE TOO", 186 | "greenish-period\t2020-06-07-02\t0", "DEBUG HERE") 187 | val actual = Source.fromFile(debugFile(outputDir, 0, 1)).getLines.toList 188 | actual shouldBe expected 189 | } 190 | 191 | "send back nothing, when command does not exist" in { 192 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 193 | actor ! BatchRun("a;kjdw", Seq.empty, Seq.empty, 0, 0, "p1", 0, farFuture) 194 | expectNoMessage(4 seconds) 195 | } 196 | 197 | "send back nothing, when command does not exit with 0" in { 198 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 199 | actor ! BatchRun("exit 1;", Seq.empty, Seq.empty, 0, 0, "p1", 0, farFuture) 200 | expectNoMessage(4 seconds) 201 | } 202 | 203 | "send back nothing, when command exits with 0, but not all periods are printed" in { 204 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 205 | actor ! BatchRun(lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 206 | expectNoMessage(4 seconds) 207 | } 208 | 209 | "send back nothing, when command exits with 0, but some periods are printed more than once" in { 210 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 211 | actor ! BatchRun(lsDup, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 212 | expectNoMessage(4 seconds) 213 | } 214 | 215 | "send back health for all periods, when command does exit with 0 with all periods printed exactly once" in { 216 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 217 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 218 | val expected = RunResult(Seq( 219 | PeriodHealth("2020-06-07-01", true), 220 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2) 221 | expectMsg(expected) 222 | } 223 | 224 | "Support spaces in the period pattern" in { 225 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 226 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07 01", "2020-06-07 02"), Seq.empty, 0, 1, "p1", 2, farFuture) 227 | val expected = RunResult(Seq( 228 | PeriodHealth("2020-06-07 01", true), 229 | PeriodHealth("2020-06-07 02", false)), 0, 1, 2) 230 | expectMsg(expected) 231 | } 232 | 233 | "use provided environment variables" in { 234 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 235 | actor !
BatchRun(s"$lsEnv .", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 236 | val expected1 = RunResult(Seq( 237 | PeriodHealth("2020-06-07-01", false), 238 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2) 239 | expectMsg(expected1) 240 | 241 | actor ! BatchRun(s"$lsEnv .", Seq("2020-06-07-01", "2020-06-07-02"), 242 | Seq("GREENISH_VALUE_FOR_TEST" -> "/tmp"), 0, 1, "p1", 2, 243 | farFuture) 244 | val expected2 = RunResult(Seq( 245 | PeriodHealth("2020-06-07-01", true), 246 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2) 247 | expectMsg(expected2) 248 | } 249 | 250 | "correctly send stats when command run is expired" in { 251 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 252 | actor ! BatchRun( 253 | s"exit 1", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, 0) 254 | 255 | eventually { 256 | stats ! GetPrometheus 257 | 258 | val expectedTotal = Seq( 259 | (Seq("p1"), 1.0), 260 | (Seq("p2"), 0.0), 261 | (Seq("p3"), 0.0), 262 | ) 263 | 264 | val allZeros = Seq( 265 | (Seq("p1"), 0.0), 266 | (Seq("p2"), 0.0), 267 | (Seq("p3"), 0.0), 268 | ) 269 | 270 | val prom = receiveOne(2 seconds) 271 | .asInstanceOf[StatsCollector.MetricsEntity] 272 | .samples.asScala.toList 273 | 274 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 275 | checkSamples(prom, "greenish_state_refresh_expired_total", expectedTotal) 276 | checkSamples(prom, "greenish_state_refresh_failed_total", allZeros) 277 | checkSamples(prom, "greenish_missing_periods_total", allZeros) 278 | checkSamples(prom, "greenish_oldest_missing_period", allZeros) 279 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros) 280 | 281 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds") 282 | actual shouldBe Set.empty 283 | } 284 | } 285 | 286 | "correctly send stats when command run fails" in { 287 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 288 | actor ! BatchRun( 289 | s"exit 1", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture) 290 | 291 | eventually { 292 | stats ! GetPrometheus 293 | 294 | val expectedTotal = Seq( 295 | (Seq("p1"), 1.0), 296 | (Seq("p2"), 0.0), 297 | (Seq("p3"), 0.0), 298 | ) 299 | 300 | val allZeros = Seq( 301 | (Seq("p1"), 0.0), 302 | (Seq("p2"), 0.0), 303 | (Seq("p3"), 0.0), 304 | ) 305 | 306 | val prom = receiveOne(2 seconds) 307 | .asInstanceOf[StatsCollector.MetricsEntity] 308 | .samples.asScala.toList 309 | 310 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 311 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros) 312 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal) 313 | checkSamples(prom, "greenish_missing_periods_total", allZeros) 314 | checkSamples(prom, "greenish_oldest_missing_period", allZeros) 315 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros) 316 | 317 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds") 318 | actual shouldBe Set("p1") 319 | } 320 | } 321 | 322 | "correctly send stats when command run succeeds" in { 323 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 324 | actor ! BatchRun( 325 | s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture) 326 | 327 | eventually { 328 | stats ! 
GetPrometheus 329 | 330 | val expectedTotal = Seq( 331 | (Seq("p1"), 0.0), 332 | (Seq("p2"), 1.0), 333 | (Seq("p3"), 0.0), 334 | ) 335 | 336 | val allZeros = Seq( 337 | (Seq("p1"), 0.0), 338 | (Seq("p2"), 0.0), 339 | (Seq("p3"), 0.0), 340 | ) 341 | 342 | val prom = receiveOne(2 seconds) 343 | .asInstanceOf[StatsCollector.MetricsEntity] 344 | .samples.asScala.toList 345 | 346 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 347 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros) 348 | checkSamples(prom, "greenish_state_refresh_failed_total", allZeros) 349 | checkSamples(prom, "greenish_missing_periods_total", expectedTotal) 350 | checkSamples(prom, "greenish_oldest_missing_period", expectedTotal) 351 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros) 352 | 353 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds") 354 | actual shouldBe Set("p2") 355 | } 356 | } 357 | 358 | "correctly send stats when command run misses some periods" in { 359 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 360 | actor ! BatchRun( 361 | lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture) 362 | 363 | eventually { 364 | stats ! GetPrometheus 365 | 366 | val expectedTotal = Seq( 367 | (Seq("p1"), 0.0), 368 | (Seq("p2"), 1.0), 369 | (Seq("p3"), 0.0), 370 | ) 371 | 372 | val allZeros = Seq( 373 | (Seq("p1"), 0.0), 374 | (Seq("p2"), 0.0), 375 | (Seq("p3"), 0.0), 376 | ) 377 | 378 | val prom = receiveOne(2 seconds) 379 | .asInstanceOf[StatsCollector.MetricsEntity] 380 | .samples.asScala.toList 381 | 382 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 383 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros) 384 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal) 385 | checkSamples(prom, "greenish_missing_periods_total", allZeros) 386 | checkSamples(prom, "greenish_oldest_missing_period", allZeros) 387 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros) 388 | 389 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds") 390 | actual shouldBe Set("p2") 391 | } 392 | } 393 | 394 | "correctly send stats when command run prints duplicate periods" in { 395 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 396 | actor ! BatchRun( 397 | lsDup, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture) 398 | 399 | eventually { 400 | stats ! GetPrometheus 401 | 402 | val expectedTotal = Seq( 403 | (Seq("p1"), 0.0), 404 | (Seq("p2"), 1.0), 405 | (Seq("p3"), 0.0), 406 | ) 407 | 408 | val allZeros = Seq( 409 | (Seq("p1"), 0.0), 410 | (Seq("p2"), 0.0), 411 | (Seq("p3"), 0.0), 412 | ) 413 | 414 | val prom = receiveOne(2 seconds) 415 | .asInstanceOf[StatsCollector.MetricsEntity] 416 | .samples.asScala.toList 417 | 418 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal) 419 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal) 420 | checkSamples(prom, "greenish_missing_periods_total", allZeros) 421 | checkSamples(prom, "greenish_oldest_missing_period", allZeros) 422 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros) 423 | 424 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds") 425 | actual shouldBe Set("p2") 426 | } 427 | } 428 | 429 | "correctly compute active refresh stats" in { 430 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir))) 431 | actor ! 
BatchRun( 432 | lsSleep, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p3", 2, farFuture) 433 | 434 | eventually { 435 | stats ! GetPrometheus 436 | 437 | val expected = Seq( 438 | (Seq("p1"), 0.0), 439 | (Seq("p2"), 0.0), 440 | (Seq("p3"), 1.0), 441 | ) 442 | 443 | val prom = receiveOne(2 seconds) 444 | .asInstanceOf[StatsCollector.MetricsEntity] 445 | .samples.asScala.toList 446 | 447 | checkSamples(prom, "greenish_active_refresh_tasks", expected) 448 | } 449 | 450 | eventually { 451 | stats ! GetPrometheus 452 | 453 | val expected = Seq( 454 | (Seq("p1"), 0.0), 455 | (Seq("p2"), 0.0), 456 | (Seq("p3"), 0.0), 457 | ) 458 | 459 | val prom = receiveOne(2 seconds) 460 | .asInstanceOf[StatsCollector.MetricsEntity] 461 | .samples.asScala.toList 462 | 463 | checkSamples(prom, "greenish_active_refresh_tasks", expected) 464 | } 465 | } 466 | } 467 | 468 | "write" must { 469 | "write lines to disk" in { 470 | val data = LazyList("first", "second") 471 | CommandRunner.write(src, data) 472 | val expected = data.toList 473 | 474 | val actual = Source.fromFile(src).getLines.toList 475 | actual shouldBe expected 476 | } 477 | } 478 | } 479 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/greenish-logo.svg: -------------------------------------------------------------------------------- 1 | Greenish Logog --------------------------------------------------------------------------------