2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7 | of the Software, and to permit persons to whom the Software is furnished to do
8 | so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/src/main/scala/models/models.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish
2 |
3 | import io.circe.generic.extras.Configuration
4 | import io.circe.Json
5 | import io.circe.syntax.EncoderOps
6 | import java.lang.management.ManagementFactory
7 |
8 | package object models {
9 | private[models] implicit val customConfig: Configuration =
10 | Configuration.default.withSnakeCaseMemberNames.withDefaults
11 | .copy(transformConstructorNames = _.toLowerCase)
12 |
13 | def errorJson(str: String): Json = Json.obj (
14 | "error" -> str.asJson
15 | )
16 |
17 | def sysinfo(maybeNamespace: Option[String]): Json = {
18 | val maybeVersion = Option(getClass.getPackage.getImplementationVersion())
19 | Json.obj (
20 | "service" -> "Greenish".asJson,
21 | "namespace" -> maybeNamespace.asJson,
22 | "version" -> maybeVersion.asJson,
23 | "uptime" -> ManagementFactory.getRuntimeMXBean().getUptime().asJson,
24 | )
25 | }
26 |
27 | def okJson(str: String): Json = Json.obj (
28 | "ok" -> str.asJson
29 | )
30 |
31 | def healthJson(status: Boolean): Json = Json.obj (
32 | "health" -> (if(status) "good".asJson else "bad".asJson)
33 | )
34 | }
35 |
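A quick sketch of what the helpers above emit (a hypothetical REPL session, not part of the repository; noSpaces is circe's compact printer):

  import me.amanj.greenish.models._

  errorJson("no such group").noSpaces
  // {"error":"no such group"}
  okJson("State refresh is scheduled").noSpaces
  // {"ok":"State refresh is scheduled"}
  healthJson(true).noSpaces
  // {"health":"good"}
  sysinfo(Some("prod")).noSpaces
  // {"service":"Greenish","namespace":"prod","version":null,"uptime":12345}
  // (version is null outside a packaged jar; the uptime value is runtime-dependent)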
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Simple Build Tool
2 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
3 |
4 | .metals/*
5 | dist/*
6 | target/
7 | lib_managed/
8 | src_managed/
9 | project/metals.sbt
10 | project/.bloop
11 | project/boot/
12 | project/plugins/project/
13 | .history
14 | .bloop
15 | .cache
16 | .lib/
17 | *.class
18 | *.log
19 |
20 | # General
21 | .DS_Store
22 | .AppleDouble
23 | .LSOverride
24 |
25 | # Icon must end with two \r
26 | Icon
27 |
28 | # Thumbnails
29 | ._*
30 |
31 | # Files that might appear in the root of a volume
32 | .DocumentRevisions-V100
33 | .fseventsd
34 | .Spotlight-V100
35 | .TemporaryItems
36 | .Trashes
37 | .VolumeIcon.icns
38 | .com.apple.timemachine.donotpresent
39 |
40 | # Directories potentially created on remote AFP share
41 | .AppleDB
42 | .AppleDesktop
43 | Network Trash Folder
44 | Temporary Items
45 | .apdisk
46 |
47 | [._]*.sw[a-p]
48 | [._]s[a-rt-v][a-z]
49 | [._]ss[a-gi-z]
50 | [._]sw[a-p]
51 |
52 | # Session
53 | Session.vim
54 | Sessionx.vim
55 |
56 | # Temporary
57 | .netrwhist
58 | *~
59 | # Auto-generated tag files
60 | tags
61 | # Persistent undo
62 | [._]*.un~
63 |
64 | # NPM junk
65 | node_modules/
66 | package-lock.json
67 |
--------------------------------------------------------------------------------
/src/main/scala/models/GroupStatus.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import io.circe.{Encoder, Decoder}
4 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder}
5 |
6 | case class GroupStatus(
7 | group: Group,
8 | status: Array[JobStatus],
9 | ) {
10 | def canEqual(a: Any) = a.isInstanceOf[GroupStatus]
11 |
12 | override def equals(that: Any): Boolean =
13 | that match {
14 | case that: GroupStatus => {
15 | that.canEqual(this) &&
16 | this.group == that.group &&
17 | this.status.sameElements(that.status)
18 | }
19 | case _ => false
20 | }
21 |
22 |   override def hashCode: Int = {
23 |     val prime = 31
24 |     var result = 1
25 |     // hash both fields by value; status is an Array, so hash its Vector view
26 |     result = prime * result + (if (group == null) 0 else group.hashCode)
27 |     result = prime * result + (if (status == null) 0 else status.toVector.hashCode)
28 |     result
29 |   }
30 |
31 | override def toString: String = {
32 | s"GroupStatus($group, ${status.mkString("Array(", ", ", ")")})"
33 | }
34 | }
35 |
36 | object GroupStatus {
37 | implicit val groupStatusDecoder: Decoder[GroupStatus] = deriveConfiguredDecoder
38 | implicit val groupStatusEncoder: Encoder[GroupStatus] = deriveConfiguredEncoder
39 | }
40 |
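The hand-written equals/hashCode above exist because status is an Array, and Scala Arrays compare by reference. A minimal illustration (plain Scala, not from the repository):

  val a = Array(1, 2, 3)
  val b = Array(1, 2, 3)
  a == b                    // false: Arrays use reference equality
  a.sameElements(b)         // true:  element-wise, as in GroupStatus.equals
  a.toVector == b.toVector  // true:  Vector equality backs the hashCode above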
--------------------------------------------------------------------------------
/src/main/resources/dashboard/version_container.jsx:
--------------------------------------------------------------------------------
1 | class VersionContainer extends React.Component {
2 | constructor(props) {
3 | super(props);
4 | this.state = {
5 | error: null,
6 | isLoaded: false,
7 | version: null
8 | };
9 | }
10 |
11 | componentDidMount() {
12 | this.fetchData()
13 | }
14 |
15 | fetchData = () => {
16 | fetch(`/system`)
17 | .then(res => res.json())
18 | .then(
19 | (info) => {
20 | this.setState({
21 | isLoaded: true,
22 | version: info.version
23 | });
24 | },
25 | // Note: it's important to handle errors here
26 | // instead of a catch() block so that we don't swallow
27 | // exceptions from actual bugs in components.
28 | (error) => {
29 | this.setState({
30 | isLoaded: true,
31 | error
32 | });
33 | }
34 | )
35 | }
36 |
37 | render() {
38 | const { error, isLoaded, version} = this.state;
39 | if (error) {
40 | return (
41 | Error: {error.message}
42 | )
43 | } else if (!isLoaded) {
44 | return (
45 | Loading...
46 | )
47 | } else {
48 | return (
49 | Version {version}
50 | )
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/index.html:
--------------------------------------------------------------------------------
1 | <!-- page markup was not recovered in this dump; title: "Greenish dashboard" -->
--------------------------------------------------------------------------------
/src/main/scala/models/Job.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import java.time.ZonedDateTime
4 | import java.time.format.DateTimeFormatter
5 | import java.time.ZoneId
6 | import io.circe.{Encoder, Decoder, HCursor, Json}
7 | import io.circe.generic.extras.semiauto.{deriveConfiguredDecoder, deriveConfiguredEncoder}
8 |
9 | case class Job(
10 | jobId: Int,
11 | name: String,
12 | owner: Option[String],
13 | prometheusId: String,
14 | cmd: String,
15 | timePattern: String,
16 | frequency: CheckFrequency,
17 | periodCheckOffset: Int,
18 | timezone: ZoneId,
19 | lookback: Int,
20 | startAt: Long,
21 | alertLevels: AlertLevels,
22 | info: Option[String],
23 | env: Seq[EnvVar]
24 | ) {
25 | val timeFormat = DateTimeFormatter.ofPattern(timePattern)
26 | }
27 |
28 | object Job {
29 | implicit val zoneIdEncoder: Encoder[ZoneId] =
30 | new Encoder[ZoneId] {
31 | final def apply(zid: ZoneId): Json = Json.obj(
32 | ("zone_id", Json.fromString(zid.getId))
33 | )
34 | }
35 |   implicit val zoneIdDecoder: Decoder[ZoneId] = new Decoder[ZoneId] {
36 | final def apply(c: HCursor): Decoder.Result[ZoneId] =
37 | for {
38 | zoneId <- c.downField("zone_id").as[String]
39 | } yield ZoneId.of(zoneId)
40 | }
41 |
42 | implicit val jobDecoder: Decoder[Job] = deriveConfiguredDecoder
43 | implicit val jobEncoder: Encoder[Job] = deriveConfiguredEncoder
44 | }
45 |
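A round-trip sketch for the ZoneId codec above (hypothetical REPL session; decode comes from io.circe.parser):

  import java.time.ZoneId
  import io.circe.parser.decode
  import io.circe.syntax.EncoderOps
  import me.amanj.greenish.models.Job._

  ZoneId.of("UTC").asJson.noSpaces                      // {"zone_id":"UTC"}
  decode[ZoneId]("""{"zone_id":"Europe/Stockholm"}""")  // Right(Europe/Stockholm)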
--------------------------------------------------------------------------------
/src/main/resources/dashboard/namespace_container.jsx:
--------------------------------------------------------------------------------
1 | class NamespaceContainer extends React.Component {
2 | constructor(props) {
3 | super(props);
4 | this.state = {
5 | error: null,
6 | isLoaded: false,
7 | namespace: null
8 | };
9 | }
10 |
11 | componentDidMount() {
12 | this.fetchData()
13 | }
14 |
15 | fetchData = () => {
16 | fetch(`/system`)
17 | .then(res => res.json())
18 | .then(
19 | (info) => {
20 | if('namespace' in info) {
21 | this.setState({
22 | isLoaded: true,
23 | namespace: info.namespace
24 | });
25 | } else {
26 | this.setState({
27 | isLoaded: true
28 | });
29 | }
30 | },
31 | // Note: it's important to handle errors here
32 | // instead of a catch() block so that we don't swallow
33 | // exceptions from actual bugs in components.
34 | (error) => {
35 | this.setState({
36 | isLoaded: true,
37 | error
38 | });
39 | }
40 | )
41 | }
42 |
43 | render() {
44 | const { error, isLoaded, namespace} = this.state;
45 | if (error) {
46 | return (Error: {error.message})
47 | } else if (!isLoaded) {
48 | return (Loading...)
49 | } else {
50 | return (namespace != null?{namespace}:null)
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/state_container.jsx:
--------------------------------------------------------------------------------
1 | class StateContainer extends React.Component {
2 | intervalID
3 | constructor(props) {
4 | super(props);
5 | this.state = {
6 | error: null,
7 | isLoaded: false,
8 | groups: [],
9 | };
10 | }
11 |
12 | componentDidMount() {
13 | this.fetchData()
14 | }
15 |
16 | componentWillUnmount() {
17 | clearTimeout(this.intervalID);
18 | }
19 |
20 | fetchData = () => {
21 | fetch(`/${this.props.endpoint}`)
22 | .then(res => res.json())
23 | .then(
24 | (groups) => {
25 | this.setState({
26 | isLoaded: true,
27 | groups: groups
28 | });
29 | },
30 | // Note: it's important to handle errors here
31 | // instead of a catch() block so that we don't swallow
32 | // exceptions from actual bugs in components.
33 | (error) => {
34 | this.setState({
35 | isLoaded: true,
36 | error
37 | });
38 | }
39 | )
40 | this.intervalID = setTimeout(this.fetchData, fetchInterval);
41 | }
42 |
43 | render() {
44 | const { error, isLoaded, groups } = this.state;
45 | if (error) {
46 | return (
47 | Error: {error.message}
48 | )
49 | } else if (!isLoaded) {
50 | return (
51 | Loading...
52 | )
53 | } else {
54 | return (
55 |
56 | {renderState(groups, this.props.endpoint, 'grid-item')}
57 |
58 | )
59 | }
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/test/scala/models/EnvVarSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 |
6 | class EnvVarSpec() extends Matchers with AnyWordSpecLike {
7 | "EnvVar.apply" must {
8 | "create PlainEnvVar when secure flag is not provided" in {
9 | val (name, value) = ("username", "Homa")
10 | val expected = PlainEnvVar(name, value)
11 | val actual = EnvVar(name, value)
12 | actual shouldBe expected
13 | }
14 |
15 | "create SecureEnvVar when secure flag is provided" in {
16 | val (name, value) = ("username", "secure(Homa)")
17 | val expected = SecureEnvVar(name, "Homa".toSeq)
18 | val actual = EnvVar(name, value)
19 | actual shouldBe expected
20 | }
21 |
22 | "create SecureEnvVar when secure flag is provided but value is empty" in {
23 | val (name, value) = ("username", "secure()")
24 | val expected = SecureEnvVar(name, "".toSeq)
25 | val actual = EnvVar(name, value)
26 | actual shouldBe expected
27 | }
28 | }
29 |
30 | "EnvVar.tupled" must {
31 | "work for secure variables" in {
32 | val (name, value) = ("username", "secure(Homa)")
33 | val origin = EnvVar(name, value)
34 | val expected = (name, "Homa")
35 | val actual = origin.tupled
36 | actual shouldBe expected
37 | }
38 |
39 | "work for plain variables" in {
40 | val (name, value) = ("username", "Homa")
41 | val origin = EnvVar(name, value)
42 | val expected = (name, value)
43 | val actual = origin.tupled
44 | actual shouldBe expected
45 | }
46 | }
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/src/test/scala/models/GroupStatusSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 | import java.time.ZoneId
6 |
7 | class GroupStatusSpec() extends Matchers
8 | with AnyWordSpecLike {
9 |
10 | val job1 = Job(1, "job1", None, "p1", "foo",
11 | "yyyy-MM-dd-HH", Hourly, 1, ZoneId.of("UTC"),
12 | 4, 0, AlertLevels(0, 1, 2, 3), None, Seq(EnvVar("a", "b"))
13 | )
14 |
15 | val job2 = Job(2, "job2", None, "p2", "bar",
16 | "yyyy-MM-dd-HH", Hourly, 1, ZoneId.of("UTC"),
17 | 4, 0, AlertLevels(0, 1, 2, 3), None, Seq(EnvVar("a", "secure(b)"))
18 | )
19 |
20 | val group1 = Group(0, "group1", Seq(job1))
21 | val group2 = Group(1, "group2", Seq(job2))
22 |
23 | val gs1 = GroupStatus(group1, Array(JobStatus(job1, -1, Seq.empty)))
24 | val gs1Copy = GroupStatus(group1, Array(JobStatus(job1, -1, Seq.empty)))
25 | val gs2 = GroupStatus(group2, Array(JobStatus(job2, -1, Seq.empty)))
26 |
27 | "equals" must {
28 | "work if that is null" in {
29 | val actual = gs1 == null
30 | actual shouldBe false
31 | }
32 |
33 | "work if that is this" in {
34 | val actual = gs1 == gs1
35 | actual shouldBe true
36 | }
37 |
38 | "work if that is a clone of this" in {
39 | val actual = gs1 == gs1Copy
40 | actual shouldBe true
41 | }
42 |
43 | "not be equal to non-GroupStatus objects" in {
44 | val actual = gs1 == job1
45 | actual shouldBe false
46 | }
47 | }
48 |
49 | "hashCode" must {
50 | "be predictive" in {
51 | val actual = gs1.## == gs1.##
52 | actual shouldBe true
53 | }
54 |
55 | "produce the same value for equivalent objects" in {
56 | val actual = gs1.## == gs1Copy.##
57 | actual shouldBe true
58 | }
59 |
60 | "produce differe values for different objects" in {
61 | val actual = gs1.## == gs2.##
62 | actual shouldBe false
63 | }
64 | }
65 | }
66 |
67 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/common_lib.jsx:
--------------------------------------------------------------------------------
1 | const fetchInterval = 5000
2 |
3 | function renderState(groups, subClassName, keyPrefix) {
4 | return (
5 | groups.map(groupStatus => renderGroup(groupStatus, subClassName, keyPrefix, ""))
6 | );
7 | }
8 |
9 | function encloseInTable(trs, gid, keyPrefix) {
10 | return (
11 |
12 |
13 |
14 | | Job name |
15 | Data set period |
16 | Last updated |
17 |
18 | { trs }
19 |
20 |
21 | );
22 | }
23 |
24 | function renderGroup(groupStatus, subClassName, keyPrefix, sub) {
25 | const group = groupStatus.group;
26 | const gid = group.group_id;
27 | const jobs = groupStatus["status"].map(jobStatus => (
28 | renderJob(jobStatus, gid, keyPrefix)))
29 | return (
30 |
31 |
{group.name}{sub}
32 | {encloseInTable(jobs, keyPrefix, gid)}
33 | );
34 | }
35 |
36 | function renderJob(jobStatus, gid, keyPrefix) {
37 | if(jobStatus["period_health"] != undefined) {
38 | const job = jobStatus.job;
39 | const jid = job.job_id;
40 | const date = new Date(jobStatus.updated_at).toUTCString();
41 | return(jobStatus.period_health.map((ph, i) => (
42 |
44 | | {job.name} |
45 | {ph.period} |
46 | {date} |
47 |
48 | )));
49 | }
50 | }
51 |
52 |
53 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/group_container.jsx:
--------------------------------------------------------------------------------
1 | const Link = ReactRouterDOM.Link;
2 |
3 | class GroupContainer extends React.Component {
4 | intervalID
5 | constructor(props) {
6 | super(props);
7 | this.state = {
8 | error: null,
9 | isLoaded: false,
10 | group: null
11 | };
12 | this.handleBack = this.handleBack.bind(this);
13 | }
14 |
15 | componentWillUnmount() {
16 | clearTimeout(this.intervalID);
17 | }
18 |
19 | componentDidMount() {
20 | this.fetchData()
21 | }
22 |
23 | fetchData = () => {
24 | fetch(`/group/${this.props.group}`)
25 | .then(res => res.json())
26 | .then(
27 | (group) => {
28 | this.setState({
29 | isLoaded: true,
30 | group: group
31 | });
32 | },
33 | // Note: it's important to handle errors here
34 | // instead of a catch() block so that we don't swallow
35 | // exceptions from actual bugs in components.
36 | (error) => {
37 | this.setState({
38 | isLoaded: true,
39 | error
40 | });
41 | }
42 | );
43 | this.intervalID = setTimeout(this.fetchData, fetchInterval);
44 | }
45 |
46 | handleBack() {
47 | this.props.handler("main", null, null);
48 | }
49 |
50 | render() {
51 | const { error, isLoaded, group } = this.state;
52 | if (error) {
53 | return (
54 | Error: {error.message}
55 | )
56 | } else if (!isLoaded) {
57 | return (
58 | Loading...
59 | )
60 | } else {
61 | const sub = (
62 |
63 | `${loc.pathname}?page=main`}
64 | className="link" onClick={this.handleBack}>
65 | See main dashboard
66 |
67 |
68 | )
69 | return (
70 |
71 | {renderGroup(group, 'group-view', 'grid-item', sub)}
72 |
73 | )
74 | }
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/App.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish
2 |
3 | import akka.actor.{ActorSystem, Props}
4 | import akka.http.scaladsl.Http
5 | import scala.language.postfixOps
6 | import akka.stream.ActorMaterializer
7 | import scala.concurrent.duration._
8 | import java.time.ZonedDateTime
9 | import checker.{StatusChecker, Refresh}
10 | import endpoints.Routes
11 | import akka.event.{Logging, LogSource}
12 |
13 | object App {
14 |
15 | private[this] implicit val system = ActorSystem("greenish-system")
16 | private[this] implicit val executionContext = system.dispatcher
17 | private[this] val schedulerActor = system.actorOf(Props.empty)
18 |
19 | implicit val logSource: LogSource[AnyRef] = new LogSource[AnyRef] {
20 | def genString(o: AnyRef): String = o.getClass.getName
21 | override def getClazz(o: AnyRef): Class[_] = o.getClass
22 | }
23 |
24 | private[this] val logger = Logging(system, this)
25 |
26 | def main(args: Array[String]): Unit = {
27 |
28 | val appConfig = AppConfig()
29 |
30 | val statsActor = system.actorOf(
31 | Props(new stats.StatsCollector(getPrometheusIds(appConfig))))
32 |
33 | val statusChecker = system.actorOf(
34 | Props(new StatusChecker(appConfig.groups, statsActor,
35 | appConfig.refreshInSeconds * 3, appConfig.scratchDir)))
36 |
37 | system.scheduler.scheduleWithFixedDelay(
38 | 0 seconds,
39 | appConfig.refreshInSeconds seconds,
40 | statusChecker, Refresh(() => ZonedDateTime.now()))
41 |
42 | val bindingFuture = Http()
43 | .bindAndHandle(
44 | new Routes(appConfig.namespace, appConfig.scratchDir, statusChecker,
45 | statsActor,
46 |           // There should be at least one good run within the last 5 refresh cycles
47 | appConfig.refreshInSeconds * 1000 * 5).routes,
48 | appConfig.address, appConfig.port)
49 |
50 | println(s"Server online at http://${appConfig.address}:${appConfig.port}...")
51 | }
52 |
53 | def getPrometheusIds(appConfig: AppConfig): Set[String] = {
54 | val prometheusIds = appConfig.groups.flatMap ( g =>
55 | g.jobs.map(j => j.prometheusId))
56 |
57 | val prometheusIdsSet = prometheusIds.toSet
58 | if(prometheusIdsSet.size < prometheusIds.size) {
59 |       logger.warning(
60 |         "Duplicate prometheus-id values found; they should be unique across the entire configuration")
61 | }
62 |
63 | prometheusIdsSet
64 | }
65 |
66 |
67 | }
68 |
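The scheduler wiring above drives the whole pipeline; a sketch of the message flow per refresh tick (message names as defined in the checker and stats packages):

  // every refreshInSeconds:
  //   scheduler     -> statusChecker: Refresh(() => ZonedDateTime.now())
  //   statusChecker -> router       : BatchRun(cmd, periods, env, gid, jid, ...)
  //   CommandRunner -> statusChecker: RunResult(periodHealths, gid, jid, clockCounter)
  //   CommandRunner -> statsActor   : IncRefresh / RefreshTime / MissingPeriods / ...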
--------------------------------------------------------------------------------
/src/test/scala/checker/CheckerSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.checker
2 |
3 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
4 | import org.scalatest.matchers.should.Matchers
5 | import org.scalatest.wordspec.AnyWordSpecLike
6 | import me.amanj.greenish.models.PeriodHealth
7 |
8 | class CheckerSpec() extends AnyWordSpecLike with Matchers {
9 |
10 | "computeOldest" must {
11 | "work for empty period health lists" in {
12 | val periods = Seq.empty[PeriodHealth]
13 | val actual = computeOldest(periods)
14 | val expected = 0
15 | actual shouldBe expected
16 | }
17 |
18 | "work when the first period is missing" in {
19 | val periods = Seq(
20 | PeriodHealth("kaka", false),
21 | PeriodHealth("kaka", true),
22 | PeriodHealth("kaka", true),
23 | PeriodHealth("kaka", true),
24 | )
25 | val actual = computeOldest(periods)
26 | val expected = 4
27 | actual shouldBe expected
28 | }
29 |
30 | "work when a middle period is missing" in {
31 | val periods = Seq(
32 | PeriodHealth("kaka", true),
33 | PeriodHealth("kaka", false),
34 | PeriodHealth("kaka", true),
35 | PeriodHealth("kaka", true),
36 | )
37 | val actual = computeOldest(periods)
38 | val expected = 3
39 | actual shouldBe expected
40 | }
41 |
42 | "work when the last period is missing" in {
43 | val periods = Seq(
44 | PeriodHealth("kaka", true),
45 | PeriodHealth("kaka", true),
46 | PeriodHealth("kaka", true),
47 | PeriodHealth("kaka", false),
48 | )
49 | val actual = computeOldest(periods)
50 | val expected = 1
51 | actual shouldBe expected
52 | }
53 |
54 | "work when more than a period is missing" in {
55 | val periods = Seq(
56 | PeriodHealth("kaka", true),
57 | PeriodHealth("kaka", false),
58 | PeriodHealth("kaka", true),
59 | PeriodHealth("kaka", false),
60 | )
61 | val actual = computeOldest(periods)
62 | val expected = 3
63 | actual shouldBe expected
64 | }
65 |
66 | "work when no period is missing" in {
67 | val periods = Seq(
68 | PeriodHealth("kaka", true),
69 | PeriodHealth("kaka", true),
70 | PeriodHealth("kaka", true),
71 | PeriodHealth("kaka", true),
72 | )
73 | val actual = computeOldest(periods)
74 | val expected = 0
75 | actual shouldBe expected
76 | }
77 | }
78 | }
79 |
80 |
81 |
--------------------------------------------------------------------------------
/src/main/scala/models/CheckFrequency.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import java.time.ZonedDateTime
4 | import com.cronutils.model.time.ExecutionTime
5 | import com.cronutils.model.CronType.UNIX
6 | import com.cronutils.parser.CronParser
7 | import com.cronutils.model.definition.CronDefinitionBuilder
8 | import io.circe.{Printer, Decoder, Encoder, HCursor, Json}
9 | import io.circe.syntax.EncoderOps
10 | import io.circe.generic.extras.semiauto.{
11 | deriveEnumerationCodec, deriveConfiguredDecoder, deriveConfiguredEncoder}
12 |
13 | sealed trait CheckFrequency {
14 | def prev(date: ZonedDateTime): ZonedDateTime
15 | }
16 |
17 | object CheckFrequency {
18 |   implicit val freqDecoder: Decoder[CheckFrequency] = new Decoder[CheckFrequency] {
19 |     final def apply(obj: HCursor): Decoder.Result[CheckFrequency] =
20 |       obj.as[String] match { // total match: no MatchError on unknown strings
21 |         case Right("hourly") => Right(Hourly)
22 |         case Right("daily") => Right(Daily)
23 |         case Right("monthly") => Right(Monthly)
24 |         case Right("annually") => Right(Annually)
25 |         case _ => obj.as[Cron] // objects and unknown strings try the Cron decoder
26 |       }
27 |   }
28 |
29 | implicit val freqEncoder: Encoder[CheckFrequency] = Encoder.instance {
30 | case Hourly => "hourly".asJson
31 | case Daily => "daily".asJson
32 | case Monthly => "monthly".asJson
33 | case Annually => "annually".asJson
34 | case other: Cron => other.asJson
35 | }
36 | }
37 |
38 | case class Cron(pattern: String) extends CheckFrequency {
39 | private[this] val parser = new CronParser(
40 | CronDefinitionBuilder.instanceDefinitionFor(UNIX))
41 | private[this] val executionTime = ExecutionTime.forCron(
42 | parser.parse(pattern))
43 |
44 | def prev(date: ZonedDateTime): ZonedDateTime =
45 | executionTime.lastExecution(date).get()
46 | }
47 | object Cron {
48 | implicit val cronDecoder: Decoder[Cron] = deriveConfiguredDecoder
49 | implicit val checkGroupEncoder: Encoder[Cron] = deriveConfiguredEncoder
50 | }
51 |
52 | case object Hourly extends CheckFrequency {
53 | def prev(date: ZonedDateTime): ZonedDateTime =
54 | date
55 | .minusHours(1L)
56 | }
57 |
58 | case object Daily extends CheckFrequency {
59 | def prev(date: ZonedDateTime): ZonedDateTime =
60 | date
61 | .minusDays(1L)
62 | }
63 |
64 | case object Monthly extends CheckFrequency {
65 | def prev(date: ZonedDateTime): ZonedDateTime =
66 | date
67 | .minusMonths(1L)
68 | }
69 |
70 | case object Annually extends CheckFrequency {
71 | def prev(date: ZonedDateTime): ZonedDateTime =
72 | date
73 | .minusYears(1L)
74 | }
75 |
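How prev behaves for the built-in frequencies versus a cron pattern (illustrative values, not from the repository; the cron line uses the UNIX definition parsed above):

  import java.time.ZonedDateTime

  val t = ZonedDateTime.parse("2020-06-15T10:30:00Z")
  Hourly.prev(t)             // 2020-06-15T09:30Z
  Daily.prev(t)              // 2020-06-14T10:30Z
  Cron("0 * * * *").prev(t)  // 2020-06-15T10:00Z, the last top-of-the-hour tick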
--------------------------------------------------------------------------------
/src/main/resources/dashboard/job_container.jsx:
--------------------------------------------------------------------------------
1 | const Link = ReactRouterDOM.Link;
2 |
3 | class JobContainer extends React.Component {
4 | intervalID
5 | constructor(props) {
6 | super(props);
7 | this.state = {
8 | error: null,
9 | isLoaded: false,
10 | job: null
11 | };
12 | this.handleBack = this.handleBack.bind(this);
13 | }
14 |
15 | componentDidMount() {
16 | this.fetchData()
17 | }
18 |
19 | componentWillUnmount() {
20 | clearTimeout(this.intervalID);
21 | }
22 |
23 | fetchData = () => {
24 | fetch(`/group/${this.props.group}/job/${this.props.job}`)
25 | .then(res => res.json())
26 | .then(
27 | (job) => {
28 | this.setState({
29 | isLoaded: true,
30 | job: job
31 | });
32 | },
33 | // Note: it's important to handle errors here
34 | // instead of a catch() block so that we don't swallow
35 | // exceptions from actual bugs in components.
36 | (error) => {
37 | this.setState({
38 | isLoaded: true,
39 | error
40 | });
41 | }
42 | )
43 | this.intervalID = setTimeout(this.fetchData, fetchInterval);
44 | }
45 |
46 | handleBack() {
47 | this.props.handler("main", null, null);
48 | }
49 |
50 | render() {
51 | const { error, isLoaded, job } = this.state;
52 | if (error) {
53 | return (
54 | Error: {error.message}
55 | )
56 | } else if (!isLoaded) {
57 | return (
58 | Loading...
59 | )
60 | } else {
61 | const jobs = renderJob(job, this.props.group, 'job-view')
62 | return (
63 |
64 |
65 | {job.job.name}
66 |
67 | `${loc.pathname}?page=main`}
68 | onClick={this.handleBack}
69 | className="link">
70 | See main dashboard
71 |
72 |
73 |
74 | {"owner" in job.job?
:
}
75 | {"info" in job.job?
:
}
76 |
80 | {encloseInTable(jobs, 'job-view', this.props.group)}
81 |
82 | )
83 | }
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/main.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color: #233502;
3 | color: darkolivegreen;
4 | padding: 0px;
5 | margin: 0px;
6 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
7 | }
8 |
9 | td {
10 | margin: 5px;
11 | padding: 5px;
12 | }
13 |
14 | th {
15 | margin: 5px;
16 | padding: 5px;
17 | }
18 |
19 | table {
20 | border-collapse: collapse;
21 | width: 100%;
22 | text-align: left;
23 | }
24 |
25 | .grid-container {
26 | display: grid;
27 | grid-gap: 50px 100px;
28 | grid-template-columns: repeat(3, 1fr);
29 | justify-content: center;
30 | }
31 |
32 | .grid-item {
33 | vertical-align: top;
34 | }
35 |
36 | .grid-container-detail {
37 | display: grid;
38 | grid-gap: 50px 100px;
39 | grid-template-columns: repeat(2, 1fr);
40 | justify-content: center;
41 | }
42 |
43 | .detail-div {
44 | padding: 10px;
45 |   background-color: #f3f2f2;
46 | }
47 |
48 | .summary-div {
49 | padding: 10px;
50 | background-color: #e8ece9;
51 | }
52 |
53 | .great {
54 | background-color: #c9daa7;
55 | color: #345f34;
56 | }
57 |
58 | .normal {
59 | background-color: lightblue;
60 | color: steelblue;
61 | }
62 |
63 | .warn {
64 | background-color: palegoldenrod;
65 | color: sienna;
66 | }
67 |
68 | .critical {
69 | background-color: salmon;
70 | color: maroon;
71 | }
72 |
73 | .link {
74 | color: steelblue;
75 | text-decoration: none;
76 | }
77 |
78 | .link:hover {
79 | cursor: pointer
80 | }
81 |
82 | a {
83 | color: steelblue;
84 | text-decoration: none;
85 | }
86 |
87 | a:hover {
88 | cursor: pointer
89 | }
90 |
91 | .detail-box {
92 | background: whitesmoke;
93 | padding: 10px;
94 | box-shadow: 5px 5px lightgray;
95 | }
96 |
97 | .summary-box {
98 | background: #f6fdfc;
99 | padding: 10px;
100 | box-shadow: 5px 5px lightgray;
101 | }
102 |
103 | .header-div {
104 | width: 100%;
105 | margin-right: 8px;
106 | height: 100px;
107 | }
108 |
109 | .header-left {
110 | float: left;
111 | }
112 |
113 | .header-right {
114 | float: right;
115 | margin-top: 65px;
116 | }
117 |
118 | .namespace-span {
119 | margin-left: 16px;
120 | line-height: 100px;
121 | vertical-align: top;
122 | }
123 |
124 | .owner-div {
125 | margin-bottom: 8px;
126 | }
127 |
128 | .info-div {
129 | margin-bottom: 8px;
130 | }
131 |
132 | .stdout-div {
133 | margin-bottom: 20px;
134 | }
135 |
136 | .greenish-header {
137 | margin: 8px;
138 | color: snow;
139 | }
140 |
141 | .version-div {
142 | text-align: right;
143 | font-size: x-small;
144 | margin-right:4px;
145 | margin-top:4px;
146 | margin-bottom:4px;
147 | color: snow;
148 | }
149 |
150 | .time-div {
151 | text-align: right;
152 | margin-right:4px;
153 | color: snow;
154 | }
155 |
--------------------------------------------------------------------------------
/src/main/scala/models/EnvVar.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import io.circe.syntax.EncoderOps
4 | import io.circe.{Encoder, Decoder, DecodingFailure, HCursor, Json}
5 |
6 | sealed trait EnvVar {
7 | type T <: AnyRef
8 | def name: String
9 | def value: T
10 | def tupled: (String, String)
11 | }
12 | object EnvVar {
13 | private[this] val pattern = """secure\((.*)\)""".r
14 | def apply(key: String, value: String): EnvVar = {
15 | value match {
16 | case pattern(v) => SecureEnvVar(key, v.toSeq)
17 | case _ => PlainEnvVar(key, value)
18 | }
19 | }
20 |
21 |   implicit val envVarDecoder: Decoder[EnvVar] = new Decoder[EnvVar] {
22 |     final def apply(obj: HCursor): Decoder.Result[EnvVar] =
23 |       obj.downField("type").as[String].flatMap {
24 |         case "secure" => obj.as[SecureEnvVar]
25 |         case "plain" => obj.as[PlainEnvVar]
26 |         case other => Left(DecodingFailure(s"Unknown env var type: $other", obj.history))
27 |       }
28 |   }
29 |
30 | implicit val envVarEncoder: Encoder[EnvVar] = Encoder.instance {
31 | case sec: SecureEnvVar => sec.asJson
32 | case plain: PlainEnvVar => plain.asJson
33 | }
34 | }
35 |
36 | private[models] case class SecureEnvVar(name: String, value: Seq[Char]) extends EnvVar {
37 | type T = Seq[Char]
38 | def tupled: (String, String) = (name, value.mkString(""))
39 | }
40 |
41 | private[models] object SecureEnvVar {
42 | val HIDDEN_PASSWORD = "****"
43 | implicit val secureEnvVarEncoder: Encoder[SecureEnvVar] =
44 | new Encoder[SecureEnvVar] {
45 | final def apply(v: SecureEnvVar): Json = Json.obj(
46 | ("type", Json.fromString("secure")),
47 | ("name", Json.fromString(v.name)),
48 | ("value", Json.fromString(HIDDEN_PASSWORD)),
49 | )
50 | }
51 |
52 |   implicit val secureEnvVarDecoder: Decoder[SecureEnvVar] = Decoder.instance { c =>
53 |     c.downField("type").as[String].flatMap {
54 |       case "secure" =>
55 |         for {
56 |           name <- c.downField("name").as[String]
57 |           value <- c.downField("value").as[String].map(_.toSeq)
58 |         } yield SecureEnvVar(name, value)
59 |       case other => Left(DecodingFailure(s"Not a secure env var: $other", c.history))
60 |     }
61 |   }
62 | }
63 |
64 | private[models] case class PlainEnvVar(name: String, value: String) extends EnvVar {
65 | type T = String
66 | def tupled: (String, String) = (name, value)
67 | }
68 | private[models] object PlainEnvVar {
69 | implicit val plainEnvVarEncoder: Encoder[PlainEnvVar] =
70 | new Encoder[PlainEnvVar] {
71 | final def apply(v: PlainEnvVar): Json = Json.obj(
72 | ("type", Json.fromString("plain")),
73 | ("name", Json.fromString(v.name)),
74 | ("value", Json.fromString(v.value)),
75 | )
76 | }
77 |
78 |   implicit val plainEnvVarDecoder: Decoder[PlainEnvVar] = Decoder.instance { c =>
79 |     c.downField("type").as[String].flatMap {
80 |       case "plain" =>
81 |         for {
82 |           name <- c.downField("name").as[String]
83 |           value <- c.downField("value").as[String]
84 |         } yield PlainEnvVar(name, value)
85 |       case other => Left(DecodingFailure(s"Not a plain env var: $other", c.history))
86 |     }
87 |   }
88 | }
89 |
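The secure(...) wrapper in a value is what selects the masking behaviour; a sketch of both paths (hypothetical REPL session):

  import io.circe.syntax.EncoderOps

  EnvVar("USER", "amanj").asJson.noSpaces
  // {"type":"plain","name":"USER","value":"amanj"}
  EnvVar("TOKEN", "secure(hunter2)").asJson.noSpaces
  // {"type":"secure","name":"TOKEN","value":"****"}   (never serialized in the clear)
  EnvVar("TOKEN", "secure(hunter2)").tupled
  // ("TOKEN", "hunter2"): the raw value is still available for the process env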
--------------------------------------------------------------------------------
/src/main/resources/dashboard/main_container.jsx:
--------------------------------------------------------------------------------
1 | const e = React.createElement;
2 | const Link = ReactRouterDOM.Link;
3 |
4 | class MainContainer extends React.Component {
5 | constructor(props) {
6 | super(props);
7 | this.state = {
8 | page: props.page,
9 | gid: props.group,
10 | jid: props.job
11 | }
12 | this.handler = this.handler.bind(this);
13 | this.renderMain = this.renderMain.bind(this);
14 | }
15 |
16 | renderMain(page, gid, jid, handler) {
17 | if (page == 'state') {
18 | return (
19 |
20 |
All data sets
21 |
22 | `${loc.pathname}?page=main`}
23 | onClick={() => this.setState({page:"main"})} className="link">
24 | See main dashboard
25 |
26 |
27 |
28 |
29 |
30 | )
31 | } else if(page == 'group'){
32 | return(
33 |
34 |
35 |
36 | )
37 | } else if(page == 'job'){
38 | return(
39 |
40 |
41 |
42 | )
43 | } else { // page == 'main'
44 | return(
45 |
46 |
47 |
Summary
48 |
49 |
50 |
51 |
52 |
Detailed missing periods
53 |
54 | `${loc.pathname}?page=state`}
55 | onClick={() => this.setState({page:"state"})} className="link">
56 | See all periods
57 |
58 |
59 |
60 |
61 |
62 |
63 | )
64 | }
65 | }
66 |
67 | handler(page, gid, jid) {
68 |
69 | this.setState({
70 | page: page,
71 | gid: gid,
72 | jid: jid,
73 | })
74 | }
75 |
76 | render() {
77 | return (
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
90 |
91 |
92 | {this.renderMain(this.state.page, this.state.gid, this.state.jid)}
93 |
94 |
95 | )
96 | }
97 | }
98 |
99 | const domContainer = document.querySelector('#main_container');
100 | const BrowserRouter = ReactRouterDOM.BrowserRouter;
101 | const Route = ReactRouterDOM.Route;
102 | const useLocation = ReactRouterDOM.useLocation;
103 |
104 | function useQuery() {
105 | return new URLSearchParams(useLocation().search);
106 | }
107 |
108 | function ShowPage() {
109 | let query = useQuery();
110 | let page = query.get("page");
111 | let gid = query.get("gid");
112 | let jid = query.get("jid");
113 | return ();
114 | }
115 | ReactDOM.render(
116 | ,
117 | domContainer
118 | );
119 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/summary_container.jsx:
--------------------------------------------------------------------------------
1 | const Link = ReactRouterDOM.Link;
2 |
3 | class SummaryContainer extends React.Component {
4 | intervalID
5 | constructor(props) {
6 | super(props);
7 | this.state = {
8 | error: null,
9 | isLoaded: false,
10 | items: []
11 | };
12 | this.handleGroupClick = this.handleGroupClick.bind(this);
13 | this.handleJobClick = this.handleJobClick.bind(this);
14 | }
15 |
16 | componentDidMount() {
17 | this.fetchData()
18 | }
19 |
20 | componentWillUnmount() {
21 | clearTimeout(this.intervalID);
22 | }
23 |
24 | fetchData = () => {
25 | fetch("/summary")
26 | .then(res => res.json())
27 | .then(
28 | (items) => {
29 | this.setState({
30 | isLoaded: true,
31 | items: items
32 | });
33 | },
34 | // Note: it's important to handle errors here
35 | // instead of a catch() block so that we don't swallow
36 | // exceptions from actual bugs in components.
37 | (error) => {
38 | this.setState({
39 | isLoaded: true,
40 | error
41 | });
42 | }
43 | )
44 | this.intervalID = setTimeout(this.fetchData, fetchInterval);
45 | }
46 |
47 | handleGroupClick(gid) {
48 | this.props.handler("group", gid, null);
49 | }
50 |
51 | handleJobClick(gid, jid) {
52 | this.props.handler("job", gid, jid);
53 | }
54 |
55 | render() {
56 | const { error, isLoaded, items } = this.state;
57 | if (error) {
58 | return (
59 | Error: {error.message}
60 | )
61 | } else if (!isLoaded) {
62 | return (
63 | Loading...
64 | )
65 | } else {
66 | return (
67 |
68 | {
69 | items.map(group => {
70 | const gid = group.group_id;
71 | return (
72 |
73 |
74 | `${loc.pathname}?page=group&gid=${gid}`}
75 | onClick={() => {this.handleGroupClick(gid)}} className="link">
76 | {group.name}
77 |
78 |
79 |
80 |
81 |
82 | | Job name |
83 | # Missing data sets |
84 |
85 | {
86 | group["status"].map(job =>{
87 | const jid = job.job_id;
88 | return(
89 |
90 | |
91 | `${loc.pathname}?page=job&gid=${gid}&jid=${jid}`}
92 | onClick={() => {this.handleJobClick(gid, jid)}} className="link">
93 | {job.name}
94 |
95 | |
96 | {job.missing} |
98 |
99 | )
100 | }
101 | )}
102 |
103 |
104 |
105 | )
106 | })
107 | }
108 |
109 | )
110 | }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/src/test/scala/AppSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish
2 |
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 | import models._
6 | import java.time.ZoneId
7 | import java.io.File
8 |
9 | class AppSpec() extends Matchers
10 | with AnyWordSpecLike {
11 | "getPrometheusIds" must {
12 | "work when there are duplicate IDs" in {
13 | val config = new AppConfig(
14 | Seq(
15 | Group(0, "Group1", Seq(
16 | Job(0, "Job1", None, "job_1", "/tmp/first_script",
17 | "yyyy-MM-dd-HH", Hourly, 3,
18 | ZoneId.of("UTC"), 24, 0,
19 | AlertLevels(0, 1, 2, 3),
20 | None,
21 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
22 | ),
23 | Job(1, "Job2", None, "job_1", "/tmp/second_script job2",
24 | "yyyy-MM-dd-HH", Daily, 2,
25 | ZoneId.of("UTC"), 24, 0,
26 | AlertLevels(0, 1, 2, 3),
27 | None,
28 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
29 | ),
30 | )),
31 | Group(1, "Group2", Seq(
32 | Job(0, "Job3", None, "job_2", "/tmp/third_script",
33 | "yyyy-MM-dd", Monthly, 1,
34 | ZoneId.of("UTC"), 3, 0,
35 | AlertLevels(0, 1, 2, 3),
36 | None,
37 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
38 | ),
39 | Job(1, "Job4", None, "job_2", "/tmp/fourth_script",
40 | "yyyy-01-01", Annually, 1,
41 | ZoneId.of("UTC"), 3, 0,
42 | AlertLevels(0, 1, 2, 3),
43 | None,
44 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
45 | ),
46 | )),
47 | ),
48 | None,
49 | new File("/tmp/greenish/stdout"),
50 | 30,
51 | "127.0.0.1",
52 | 8080,
53 | )
54 |
55 | val expected = Set("job_1", "job_2")
56 |
57 | val actual = App.getPrometheusIds(config)
58 |
59 | actual shouldBe expected
60 | }
61 |
62 | "work when there are no duplicate IDs" in {
63 | val config = new AppConfig(
64 | Seq(
65 | Group(0, "Group1", Seq(
66 | Job(0, "Job1", None, "job_1", "/tmp/first_script",
67 | "yyyy-MM-dd-HH", Hourly, 3,
68 | ZoneId.of("UTC"), 24, 0,
69 | AlertLevels(0, 1, 2, 3),
70 | None,
71 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
72 | ),
73 | Job(1, "Job2", None, "job_2", "/tmp/second_script job2",
74 | "yyyy-MM-dd-HH", Daily, 2,
75 | ZoneId.of("UTC"), 24, 0,
76 | AlertLevels(0, 1, 2, 3),
77 | None,
78 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
79 | ),
80 | )),
81 | Group(1, "Group2", Seq(
82 | Job(0, "Job3", None, "job_3", "/tmp/third_script",
83 | "yyyy-MM-dd", Monthly, 1,
84 | ZoneId.of("UTC"), 3, 0,
85 | AlertLevels(0, 1, 2, 3),
86 | None,
87 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
88 | ),
89 | Job(1, "Job4", None, "job_4", "/tmp/fourth_script",
90 | "yyyy-01-01", Annually, 1,
91 | ZoneId.of("UTC"), 3, 0,
92 | AlertLevels(0, 1, 2, 3),
93 | None,
94 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)")),
95 | ),
96 | )),
97 | ),
98 | None,
99 | new File("/tmp/greenish/stdout"),
100 | 30,
101 | "127.0.0.1",
102 | 8080,
103 | )
104 |
105 | val expected = Set("job_1", "job_2", "job_3", "job_4")
106 |
107 | val actual = App.getPrometheusIds(config)
108 |
109 | actual shouldBe expected
110 | }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/scala/checker/CommandRunner.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.checker
2 |
3 | import me.amanj.greenish.stats._
4 | import me.amanj.greenish.models._
5 | import java.time.ZonedDateTime
6 | import java.io.{File, PrintWriter}
7 | import java.nio.file.{Files, StandardCopyOption}
8 | import scala.sys.process.Process
9 | import scala.util.control.NonFatal
10 | import akka.actor.{Actor, ActorRef, ActorLogging}
11 |
12 | class CommandRunner(statsActor: ActorRef,
13 | scratchDir: File) extends Actor with ActorLogging {
14 | override def receive: Receive = {
15 | case BatchRun(cmd, periods, env, group, job,
16 | prometheusId, clockCounter, expireAt) =>
17 | val startTimeLong = System.currentTimeMillis
18 | if(startTimeLong <= expireAt) {
19 | statsActor ! IncRefresh(prometheusId)
20 | val startTime = startTimeLong.toDouble
21 | try {
22 | run(cmd, periods, env, group, job, prometheusId, clockCounter)
23 | } catch {
24 | case NonFatal(exp) =>
25 | log.error(exp.getMessage())
26 | statsActor ! IncBadRefresh(prometheusId)
27 | } finally {
28 | statsActor ! DecRefresh(prometheusId)
29 | val endTime = System.currentTimeMillis.toDouble
30 | statsActor ! RefreshTime(prometheusId,
31 | (endTime - startTime) / 1000)
32 | }
33 | } else {
34 | statsActor ! IncExpiredRefresh(prometheusId)
35 | }
36 | }
37 |
38 | private[this] def run(
39 | cmd: String,
40 | periods: Seq[String],
41 | env: Seq[(String, String)],
42 | group: Int,
43 | job: Int,
44 | prometheusId: String,
45 | clockCounter: Long): Unit = {
46 | val exec = Seq("bash", "-c", CommandRunner.toBashCommand(cmd, periods))
47 | val output = Process(exec, None, env:_*).lazyLines
48 | CommandRunner.write(debugFile(scratchDir, group, job), output)
49 | val capturedOutput = CommandRunner.parseOutput(output, periods.toSet)
50 | val distinctReturnedPeriods = capturedOutput.map(_._1).distinct
51 | if(capturedOutput.length < periods.size) {
52 | log.error(s"""|Some periods weren't returned for:
53 | |Group ID: $group, Job ID: $job
54 | |$cmd $periods
55 | |state update aborted""".stripMargin)
56 | statsActor ! IncBadRefresh(prometheusId)
57 |     } else if(distinctReturnedPeriods.length != capturedOutput.size) {
58 |       // duplicated periods in the output indicate a misbehaving check command
59 |       log.error(s"""|Some periods were returned more than once for:
60 |                     |Group ID: $group, Job ID: $job
61 |                     |$cmd $periods
62 |                     |state update aborted""".stripMargin)
63 | statsActor ! IncBadRefresh(prometheusId)
64 | } else {
65 | val mapped = capturedOutput.toMap
66 | val periodHealths = periods.map {
67 | period => PeriodHealth(period, mapped(period)) }
68 | context.sender ! RunResult(periodHealths, group, job, clockCounter)
69 | statsActor ! MissingPeriods(prometheusId, periodHealths.count(!_.ok))
70 | val oldestMissingPeriod = computeOldest(periodHealths)
71 | statsActor ! OldestMissingPeriod(prometheusId, oldestMissingPeriod)
72 | }
73 | }
74 | }
75 |
76 | object CommandRunner {
77 | private[this] val Matcher = "^greenish-period\t(.*)\t(1|0)$".r
78 |
79 | protected[checker] def write(file: String,
80 | lines: LazyList[String]): Unit = {
81 | val tmp = new File(s"$file.tmp")
82 | val pw = new PrintWriter(tmp)
83 | lines.foreach(pw.println)
84 | pw.close
85 | // FIXME: There is a slight chance of race, but do we care?
86 | Files.move(tmp.toPath, new File(file).toPath,
87 | StandardCopyOption.ATOMIC_MOVE)
88 | }
89 |
90 | protected[checker] def parseOutput(lines: LazyList[String],
91 | periods: Set[String]): Seq[(String, Boolean)] =
92 | lines.map { line =>
93 | line match {
94 | case Matcher(period, "1") => Some((period, true))
95 | case Matcher(period, "0") => Some((period, false))
96 | case _ => None
97 | }
98 | }.collect { case Some(periodStatus) => periodStatus }
99 | .filter { case (period, _) => periods.contains(period) }
100 | .toList
101 |
102 | protected[checker] def toBashCommand(command: String, periods: Seq[String]): String =
103 | s"$command ${periods.map(p => s"'$p'").mkString(" ")}"
104 | }
105 |
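The Matcher regex above is the contract between Greenish and a check command: one tab-separated "greenish-period<TAB><period><TAB><1|0>" line per queried period on stdout; everything else is ignored. A sketch of how parseOutput treats such output (hypothetical input; parseOutput is package-private, so this would live under me.amanj.greenish.checker):

  val stdout = LazyList(
    "starting check...",                  // ignored: not a protocol line
    "greenish-period\t2020-06-15-09\t1",  // period present
    "greenish-period\t2020-06-15-10\t0",  // period missing
    "greenish-period\t1999-01-01-00\t1")  // ignored: period was never asked for

  CommandRunner.parseOutput(stdout, Set("2020-06-15-09", "2020-06-15-10"))
  // List((2020-06-15-09,true), (2020-06-15-10,false))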
--------------------------------------------------------------------------------
/src/main/scala/stats/StatsCollector.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.stats
2 |
3 | import akka.actor.Actor
4 | import akka.actor.ActorLogging
5 | import io.prometheus.client.{Counter, Gauge, Histogram, CollectorRegistry}
6 | import io.prometheus.client.Collector.MetricFamilySamples
7 | import io.prometheus.client.exporter.common.TextFormat
8 | import java.util.Enumeration
9 | import java.io.{StringWriter, Writer}
10 | import akka.http.scaladsl.model.{MediaType, HttpCharsets, HttpEntity}
11 | import akka.http.scaladsl.marshalling.{ToEntityMarshaller, Marshaller}
12 |
13 | class StatsCollector(jobIDs: Set[String],
14 | registry: CollectorRegistry = new CollectorRegistry()) extends Actor with ActorLogging {
15 |
16 | // Job related metrics
17 | private[this] val refreshGauge = Gauge.build()
18 | .name("greenish_active_refresh_tasks")
19 | .help("Current number active state refresh tasks")
20 | .labelNames("job_id")
21 | .register(registry)
22 |
23 | private[this] val refreshTime = Histogram.build()
24 | .name("greenish_state_refresh_time_seconds")
25 | .help("Job state refreshing time")
26 | .labelNames("job_id")
27 | .buckets(StatsCollector.HistogramTimeBuckets:_*)
28 | .register(registry)
29 |
30 | private[this] val refreshCounter = Counter.build()
31 | .name("greenish_state_refresh_total")
32 | .help("Total number of job state refresh instances")
33 | .labelNames("job_id")
34 | .register(registry)
35 |
36 | private[this] val badRefreshCounter = Counter.build()
37 | .name("greenish_state_refresh_failed_total")
38 | .help("Total number of failed job state refresh instances")
39 | .labelNames("job_id")
40 | .register(registry)
41 |
42 | private[this] val expiredRefreshCounter = Counter.build()
43 | .name("greenish_state_refresh_expired_total")
44 | .help("Total number of expired job state refresh instances")
45 | .labelNames("job_id")
46 | .register(registry)
47 |
48 | private[this] val missingPeriods = Gauge.build()
49 | .name("greenish_missing_periods_total")
50 | .help("Current number of missing dataset periods")
51 | .labelNames("job_id")
52 | .register(registry)
53 |
54 | private[this] val oldestMissingPeriod = Gauge.build()
55 | .name("greenish_oldest_missing_period")
56 | .help("The oldest missing period")
57 | .labelNames("job_id")
58 | .register(registry)
59 |
60 | init()
61 |
62 | private[this] def init(): Unit = {
63 | jobIDs.foreach { jobId =>
64 | refreshGauge.labels(jobId)
65 | refreshTime.labels(jobId)
66 | refreshCounter.labels(jobId)
67 | expiredRefreshCounter.labels(jobId)
68 | badRefreshCounter.labels(jobId)
69 | missingPeriods.labels(jobId)
70 | oldestMissingPeriod.labels(jobId)
71 | }
72 | }
73 |
74 | override def receive: Receive = {
75 | case RefreshTime(jobId, time) =>
76 | refreshTime.labels(jobId).observe(time)
77 | case IncRefresh(jobId) =>
78 | refreshCounter.labels(jobId).inc()
79 | refreshGauge.labels(jobId).inc()
80 | case DecRefresh(jobId) =>
81 | refreshGauge.labels(jobId).dec()
82 | case IncBadRefresh(jobId) =>
83 | badRefreshCounter.labels(jobId).inc()
84 | case MissingPeriods(jobId, count) =>
85 | missingPeriods.labels(jobId).set(count)
86 | case OldestMissingPeriod(jobId, count) =>
87 | oldestMissingPeriod.labels(jobId).set(count)
88 | case IncExpiredRefresh(jobId) =>
89 | refreshCounter.labels(jobId).inc()
90 | expiredRefreshCounter.labels(jobId).inc()
91 | case GetPrometheus =>
92 | import StatsCollector.{fromRegistry, toPrometheusTextFormat}
93 | val metrics = fromRegistry(registry)
94 | context.sender ! metrics
95 | }
96 | }
97 |
98 | object StatsCollector {
99 | case class MetricsEntity(samples: Enumeration[MetricFamilySamples])
100 |
101 | private [StatsCollector] val HistogramTimeBuckets =
102 | Seq(
103 | 0.1, 0.3, 0.5, 0.8, 1, 1.3, 1.5, 1.8, 2, 2.5, 3, 3.5, 4, 4.5)
104 |
105 | private[this] val mediaTypeParams = Map("version" -> "0.0.4")
106 | private[this] val mediaType = MediaType.customWithFixedCharset(
107 | "text", "plain", HttpCharsets.`UTF-8`, params = mediaTypeParams)
108 |
109 | private[stats] def fromRegistry(
110 | collectorRegistry: CollectorRegistry): MetricsEntity = {
111 | MetricsEntity(collectorRegistry.metricFamilySamples())
112 | }
113 |
114 | private[stats] def toPrometheusTextFormat(e: MetricsEntity): String = {
115 | val writer: Writer = new StringWriter()
116 | TextFormat.write004(writer, e.samples)
117 |
118 | writer.toString
119 | }
120 |
121 | implicit val metricsMarshaller: ToEntityMarshaller[MetricsEntity] = {
122 | Marshaller.withFixedContentType(mediaType) { s =>
123 | HttpEntity(mediaType, toPrometheusTextFormat(s))
124 | }
125 | }
126 | }
127 |
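What a scrape looks like once the collector has received a few messages (sample lines only; the values are illustrative):

  // After, e.g.:
  //   statsActor ! IncRefresh("job_1")
  //   statsActor ! MissingPeriods("job_1", 2)
  // the GetPrometheus reply, rendered with toPrometheusTextFormat, contains:
  //   greenish_active_refresh_tasks{job_id="job_1"} 1.0
  //   greenish_state_refresh_total{job_id="job_1"} 1.0
  //   greenish_missing_periods_total{job_id="job_1"} 2.0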
--------------------------------------------------------------------------------
/src/main/scala/checker/StatusChecker.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.checker
2 |
3 | import me.amanj.greenish.stats.StatsCollector
4 | import me.amanj.greenish.models._
5 | import java.time.ZonedDateTime
6 | import java.io.File
7 | import akka.actor.{Actor, Props, ActorRef, ActorLogging}
8 | import scala.sys.process.Process
9 | import scala.concurrent.{Future}
10 | import scala.util.{Success, Failure}
11 | import akka.routing.{ActorRefRoutee, RoundRobinRoutingLogic, Router}
12 | import akka.pattern.pipe
13 | import scala.annotation.tailrec
14 |
15 | trait StatusCheckerApi {
16 | protected[this] var state: IndexedSeq[GroupStatus]
17 |
18 | protected[checker] def getMissing(): IndexedSeq[GroupStatus] = {
19 | state
20 | .map { group =>
21 | val newJobs: Array[JobStatus] = group.status.map { job =>
22 | job.copy(periodHealth = job.periodHealth.filterNot(_.ok))
23 | }.filterNot(_.periodHealth.isEmpty)
24 | .toArray
25 |
26 | group.copy(status = newJobs)
27 | }.filterNot(_.status.isEmpty)
28 | }
29 |
30 | protected[checker] def maxLag(): Lag = {
31 | if(state.isEmpty) Lag(0)
32 | else {
33 | val lag = state.map { group =>
34 | group.status.map(_.countMissing).max
35 | }.max
36 | Lag(lag)
37 | }
38 | }
39 |
40 | protected[checker] def allEntries(): IndexedSeq[GroupStatus] = state
41 |
42 | protected[checker] def summary(): Seq[GroupStatusSummary] =
43 | state.map { group =>
44 | val status = group.status.map { status =>
45 | val missing = status.countMissing
46 | val alertLevel: AlertLevel =
47 | if(missing <= status.job.alertLevels.great) Great
48 | else if(missing <= status.job.alertLevels.normal) Normal
49 | else if(missing <= status.job.alertLevels.warn) Warn
50 | else Critical
51 |
52 | val oldestMissingPeriod = computeOldest(status.periodHealth)
53 | JobStatusSummary(status.job.jobId, status.job.name, missing, oldestMissingPeriod, alertLevel)
54 | }.toSeq
55 | GroupStatusSummary(group.group.groupId, group.group.name, status)
56 | }
57 |
58 | protected[checker] def getGroupStatus(groupId: Int): Option[GroupStatus] =
59 | state.lift(groupId)
60 |
61 | protected[checker] def getJobStatus(groupId: Int, jobId: Int): Option[JobStatus] =
62 | for {
63 | group <- state.lift(groupId)
64 | job <- group.status.lift(jobId)
65 | } yield job
66 | }
67 |
68 | class StatusChecker(groups: Seq[Group],
69 | statsActor: ActorRef,
70 | refreshValidityInSeconds: Long,
71 | scratchDir: File,
72 | clockCounter: () => Long = () => System.currentTimeMillis())
73 | extends Actor with ActorLogging with StatusCheckerApi {
74 | override protected[this] var state = StatusChecker.initState(groups)
75 |
76 | import context.dispatcher
77 |
78 | private[this] val parallelism: Int = groups.map(_.jobs.length).sum
79 |
80 | private[this] val router = {
81 | val routees = (0 until parallelism) map { _ =>
82 | val runner = context.actorOf(
83 | Props(new CommandRunner(statsActor, scratchDir))
84 | .withDispatcher("akka.refresh-dispatcher"))
85 | context watch runner
86 | ActorRefRoutee(runner)
87 | }
88 |
89 | Router(RoundRobinRoutingLogic(), routees)
90 | }
91 |
92 | private[this] def refresh(now: ZonedDateTime, group: Group, job: Job): Unit = {
93 | val periods = StatusChecker.periods(job, now)
94 |
95 | val currentClockCounter = clockCounter()
96 | val expireAt = currentClockCounter + 1000 *
97 | refreshValidityInSeconds
98 | self ! BatchRun(job.cmd, periods, job.env.map(_.tupled),
99 | group.groupId, job.jobId, job.prometheusId,
100 | currentClockCounter, expireAt)
101 | }
102 |
103 | private[this] def refresh(now: ZonedDateTime, group: Group): Unit = {
104 | group.jobs.foreach { job => refresh(now, group, job) }
105 | }
106 |
107 | private[this] def refresh(now: ZonedDateTime): Unit = {
108 |
109 | groups.foreach { group =>
110 | refresh(now, group)
111 | }
112 | }
113 |
114 | override def receive: Receive = {
115 | case Refresh(now) =>
116 | refresh(now())
117 | case RefreshGroup(now, groupId) =>
118 | groups.find(_.groupId == groupId) match {
119 | case Some(group) =>
120 | refresh(now(), group)
121 | context.sender ! true
122 | case None =>
123 | context.sender ! false
124 | }
125 | case RefreshJob(now, groupId, jobId) =>
126 | val result = for {
127 | group <- groups.find(_.groupId == groupId)
128 | job <- group.jobs.lift(jobId)
129 | } yield {
130 | refresh(now(), group, job)
131 | }
132 | context.sender ! result.isDefined
133 | case RunResult(periodHealth, groupId, jobId, clockCounter) =>
134 | val bucket = state(groupId)
135 | val currentStatus = bucket.status(jobId)
136 | if(currentStatus.updatedAt < clockCounter) {
137 | bucket.status(jobId) = currentStatus.copy(updatedAt = clockCounter,
138 | periodHealth = periodHealth)
139 | }
140 | case GetMissing => context.sender ! getMissing()
141 | case MaxLag => context.sender ! maxLag()
142 | case AllEntries => context.sender ! allEntries()
143 | case Summary => context.sender ! summary()
144 | case GetGroupStatus(id) =>
145 | context.sender ! getGroupStatus(id)
146 | case GetJobStatus(gid, jid) =>
147 | context.sender ! getJobStatus(gid, jid)
148 | case run: BatchRun =>
149 | router.route(run, context.sender)
150 | }
151 | }
152 |
153 | object StatusChecker {
154 | private[checker] def initState(groups: Seq[Group]): IndexedSeq[GroupStatus] = {
155 | groups.map { group =>
156 | val jobStatus = group.jobs.map { job =>
157 | JobStatus(job, -1, Seq.empty)
158 | }
159 | GroupStatus(group, jobStatus.toArray)
160 | }.toIndexedSeq
161 | }
162 |
163 | private[checker] def periods(entry: Job, now: ZonedDateTime): Seq[String] = {
164 | @tailrec def loop(time: ZonedDateTime, count: Int, acc: Seq[String]): Seq[String] = {
165 | if(time.toEpochSecond < entry.startAt || count == 0) acc.reverse
166 | else
167 | loop(entry.frequency.prev(time), count - 1,
168 | acc :+ time.format(entry.timeFormat))
169 | }
170 |
171 | loop(nowMinusOffset(entry, now),
172 | entry.lookback, Vector.empty[String])
173 | }
174 |
175 | private[checker] def nowMinusOffset(entry: Job,
176 | now: ZonedDateTime): ZonedDateTime =
177 | if(entry.periodCheckOffset == 0)
178 | now.withZoneSameInstant(entry.timezone)
179 | else
180 | (1 to entry.periodCheckOffset)
181 | .foldLeft(now.withZoneSameInstant(entry.timezone))(
182 | (acc, next) => entry.frequency.prev(acc))
183 | }
184 |
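To make periods concrete: for an Hourly job with timePattern "yyyy-MM-dd-HH", lookback = 4 and periodCheckOffset = 1, a check at 10:30 UTC first steps back one period (the offset), then walks back lookback periods and reverses, so the oldest period comes first (illustrative values):

  // now            = 2020-06-15T10:30Z
  // nowMinusOffset = 2020-06-15T09:30Z  (one Hourly.prev step)
  // periods(job, now) == Seq("2020-06-15-06", "2020-06-15-07",
  //                          "2020-06-15-08", "2020-06-15-09")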
--------------------------------------------------------------------------------
/src/main/scala/endpoints/Routes.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.endpoints
2 |
3 | import java.time.ZonedDateTime
4 | import java.io.File
5 | import akka.actor.ActorRef
6 | import akka.util.Timeout
7 | import akka.pattern.ask
8 | import akka.http.scaladsl.model.{StatusCodes, ContentTypes}
9 | import akka.http.scaladsl.server.Directive
10 | import akka.http.scaladsl.server.Directives._
11 | import scala.concurrent.duration.Duration
12 | import io.circe.syntax._
13 | import io.circe.Printer
14 | import me.amanj.greenish.models._
15 | import me.amanj.greenish.stats._
16 | import me.amanj.greenish.checker._
17 | import akka.http.scaladsl.model.HttpResponse
18 | import scala.util.Success
19 |
20 | class Routes(namespace: Option[String],
21 | scratchDir: File,
22 | statusChecker: ActorRef,
23 | statsActor: ActorRef,
24 | goodRefreshRecency: Long,
25 | now: () => ZonedDateTime = () => ZonedDateTime.now) {
26 |   private[this] implicit val timeout = Timeout(Duration.fromNanos(5000000000L)) // 5 seconds
27 |   private[this] val jsonPrinter = Printer(
28 |     dropNullValues = true,
29 |     indent = ""
30 |   )
31 |
32 | private[this] val maxlag = get {
33 | path("maxlag") {
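      // Ask the checker actor for its current lag and marshal the eventual
      // reply as compact JSON; the implicit `timeout` above bounds the ask.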
34 | val lagFuture = (
35 | statusChecker ? MaxLag
36 | ).mapTo[Lag]
37 | onComplete(lagFuture) { lag =>
38 | complete(lag.map(o => jsonPrinter.print(o.asJson)))
39 | }
40 | }
41 | }
42 |
43 | private[this] val summary = get {
44 | path("summary") {
45 | val lagFuture = (
46 | statusChecker ? Summary
47 | ).mapTo[Seq[GroupStatusSummary]]
48 | onComplete(lagFuture) { lag =>
49 | complete(lag.map(o => jsonPrinter.print(o.asJson)))
50 | }
51 | }
52 | }
53 |
54 | private[this] val missing = get {
55 | path("missing") {
56 | val missingFuture = (
57 | statusChecker ? GetMissing
58 | ).mapTo[Seq[GroupStatus]]
59 | onComplete(missingFuture) { missing =>
60 | complete(missing.map(o => jsonPrinter.print(o.asJson)))
61 | }
62 | }
63 | }
64 |
65 | private[this] val state = get {
66 | path("state") {
67 | val allFuture = (
68 | statusChecker ? AllEntries
69 | ).mapTo[Seq[GroupStatus]]
70 | onComplete(allFuture) { completed =>
71 | complete(completed.map(o => jsonPrinter.print(o.asJson)))
72 | }
73 | }
74 | }
75 |
76 | private[this] val getGroup = get {
77 | path("group" / IntNumber) { id =>
78 | val groupFuture = (
79 | statusChecker ? GetGroupStatus(id)
80 | ).mapTo[Option[GroupStatus]]
81 |
82 | onComplete(groupFuture) {
83 | case Success(Some(group)) =>
84 | complete(jsonPrinter.print(group.asJson))
85 | case _ =>
86 | val error = jsonPrinter.print(errorJson("Group id does not exist"))
87 | complete(HttpResponse(StatusCodes.BadRequest, entity = error))
88 | }
89 | }
90 | }
91 |
92 | private[this] val getJob = get {
93 | path("group" / IntNumber / "job" / IntNumber) {
94 | (gid, jid) =>
95 | val jobFuture = (
96 | statusChecker ? GetJobStatus(gid, jid)
97 | ).mapTo[Option[JobStatus]]
98 | onComplete(jobFuture) {
99 | case Success(Some(job)) =>
100 | complete(jsonPrinter.print(job.asJson))
101 | case _ =>
102 | val error = jsonPrinter
103 | .print(errorJson("Group id and/or job id does not exist"))
104 | complete(HttpResponse(StatusCodes.BadRequest, entity = error))
105 | }
106 | }
107 | }
108 |
109 | private[this] val getJobOutput = get {
110 | path("group" / IntNumber / "job" / IntNumber / "stdout") {
111 | (gid, jid) =>
112 | getFromFile(new File(debugFile(scratchDir, gid, jid)),
113 | ContentTypes.`text/plain(UTF-8)`)
114 | }
115 | }
116 |
117 | private[this] val refreshState = get {
118 | path("state" / "refresh") {
119 | statusChecker ! Refresh(now)
120 | complete(jsonPrinter.print(okJson("State refresh is scheduled")))
121 | }
122 | }
123 |
124 | private[this] val refreshGroup = get {
125 | path("group" / IntNumber / "refresh") { id =>
126 | val statusFuture = (
127 | statusChecker ? RefreshGroup(now, id)
128 | ).mapTo[Boolean]
129 |
130 | onComplete(statusFuture) {
131 | case Success(true) =>
132 | complete(jsonPrinter.print(okJson("Group status refresh is scheduled")))
133 | case _ =>
134 | val error = jsonPrinter.print(errorJson("Group id does not exist"))
135 | complete(HttpResponse(StatusCodes.BadRequest, entity = error))
136 | }
137 | }
138 | }
139 |
140 | private[this] val refreshJob = get {
141 | path("group" / IntNumber / "job" / IntNumber / "refresh") {
142 | (gid, jid) =>
143 | val statusFuture = (
144 | statusChecker ? RefreshJob(now, gid, jid)
145 | ).mapTo[Boolean]
146 | onComplete(statusFuture) {
147 | case Success(true) =>
148 | complete(jsonPrinter.print(okJson("Job status refresh is scheduled")))
149 | case _ =>
150 | val error = jsonPrinter
151 | .print(errorJson("Group id and/or job id does not exist"))
152 | complete(HttpResponse(StatusCodes.BadRequest, entity = error))
153 | }
154 | }
155 | }
156 |
157 | private[this] val dashboard =
158 | (get & pathPrefix("dashboard")) {
159 | (pathEndOrSingleSlash &
160 | redirectToTrailingSlashIfMissing(StatusCodes.TemporaryRedirect)) {
161 | getFromResource("dashboard/index.html")
162 | } ~ {
163 | getFromResourceDirectory("dashboard")
164 | }
165 | }
166 |
167 | private[this] val system = get {
168 | path("system") {
169 | val json = jsonPrinter.print(sysinfo(namespace))
170 | complete(json)
171 | }
172 | }
173 |
174 | private[this] val prometheus = get {
175 | path("prometheus") {
176 | val statsFuture =
177 | (statsActor ? GetPrometheus)
178 | .mapTo[StatsCollector.MetricsEntity]
179 | onComplete(statsFuture) { entity =>
180 | complete(entity)
181 | }
182 | }
183 | }
184 |
185 | private[this] val health = get {
186 | path("health") {
187 | val entriesFuture = (statusChecker ? AllEntries)
188 | .mapTo[Seq[GroupStatus]]
189 |
190 | onComplete(entriesFuture) { entity =>
191 | val health = entity.map( groups =>
192 | Routes.isHealthy(groups, goodRefreshRecency)).getOrElse(false)
193 | val json = jsonPrinter.print(healthJson(health))
194 | complete(json)
195 | }
196 | }
197 | }
198 |
199 | val routes =
200 | getJob ~ getJobOutput ~ getGroup ~ refreshState ~ refreshGroup ~
201 | refreshJob ~ maxlag ~ summary ~ missing ~ state ~ dashboard ~
202 | system ~ prometheus ~ health
203 | }
204 |
205 | object Routes {
206 | private[endpoints] def isHealthy(groups: Seq[GroupStatus],
207 | recency: Long): Boolean = {
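    // A job counts as recently refreshed when its last update happened within
    // `recency` milliseconds and it reported at least one period. The service
    // is healthy as soon as any group contains such a job.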
208 | val now = System.currentTimeMillis
209 | groups.map { group =>
210 | group.status.filterNot { job =>
211 | (now - job.updatedAt) > recency || job.periodHealth.isEmpty
212 | }.length
213 | }.exists(_ > 0)
214 | }
215 | }
216 |
--------------------------------------------------------------------------------
/src/test/scala/stats/StatsCollectorSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.stats
2 |
3 | import akka.actor.{ActorSystem, Props}
4 | import akka.testkit.{ ImplicitSender, TestKit }
5 | import org.scalatest.BeforeAndAfterAll
6 | import org.scalatest.matchers.should.Matchers
7 | import org.scalatest.wordspec.AnyWordSpecLike
8 | import org.scalatest.concurrent.Eventually
9 | import java.time.{ZoneId, ZonedDateTime}
10 | import scala.concurrent.duration._
11 | import scala.language.postfixOps
12 | import me.amanj.greenish.models._
13 | import io.prometheus.client.Collector.MetricFamilySamples
14 | import java.io.File
15 | import scala.jdk.CollectionConverters._
16 |
17 | class StatsCollectorSpec()
18 | extends TestKit(ActorSystem("StatsCollectorSpec"))
19 | with ImplicitSender
20 | with AnyWordSpecLike
21 | with Matchers
22 | with BeforeAndAfterAll
23 | with Eventually {
24 |
25 | import StatsCollectorSpec._
26 |
27 | "StatsCollector" must {
28 | "initialize labels upon instantiation" in {
29 | val jobs = Set("p1", "p2")
30 | val stats = system.actorOf(
31 | Props(new StatsCollector(jobs)))
32 |
33 | stats ! GetPrometheus
34 |
35 | val received = receiveOne(2 seconds)
36 | assert(received.isInstanceOf[StatsCollector.MetricsEntity])
37 |
38 | val prometheus = received
39 | .asInstanceOf[StatsCollector.MetricsEntity]
40 | .samples
41 | .asScala
42 | .toList
43 |
44 | prometheus.isEmpty shouldBe false
45 | prometheus.foreach { prom =>
46 | val labels = prom.samples.asScala
47 | .flatMap(_.labelValues.asScala)
48 | .filter(jobs.contains(_))
49 | .toSet
50 | labels shouldBe jobs
51 | }
52 | }
53 |
54 | "properly handle IncExpiredRefresh message" in {
55 | val jobs = Set("p1", "p2")
56 | val stats = system.actorOf(
57 | Props(new StatsCollector(jobs)))
58 |
59 | stats ! IncExpiredRefresh("p2")
60 | stats ! GetPrometheus
61 |
62 | val expected = Seq(
63 | (Seq("p1"), 0.0),
64 | (Seq("p2"), 1.0),
65 | )
66 |
67 | val prom = receiveOne(2 seconds)
68 | .asInstanceOf[StatsCollector.MetricsEntity]
69 | .samples.asScala.toList
70 |
71 | checkSamples(prom, "greenish_state_refresh_total", expected)
72 | checkSamples(prom, "greenish_state_refresh_expired_total", expected)
73 | }
74 |
75 | "properly handle IncRefresh message" in {
76 | val jobs = Set("p1", "p2")
77 | val stats = system.actorOf(
78 | Props(new StatsCollector(jobs)))
79 |
80 | stats ! IncRefresh("p2")
81 | stats ! GetPrometheus
82 |
83 | val expected = Seq(
84 | (Seq("p1"), 0.0),
85 | (Seq("p2"), 1.0),
86 | )
87 |
88 | val prom = receiveOne(2 seconds)
89 | .asInstanceOf[StatsCollector.MetricsEntity]
90 | .samples.asScala.toList
91 |
92 | checkSamples(prom, "greenish_state_refresh_total", expected)
93 | checkSamples(prom, "greenish_active_refresh_tasks", expected)
94 | }
95 |
96 | "properly handle DecRefresh message" in {
97 | val jobs = Set("p1", "p2")
98 | val stats = system.actorOf(
99 | Props(new StatsCollector(jobs)))
100 |
101 | stats ! IncRefresh("p1")
102 | stats ! DecRefresh("p1")
103 | stats ! GetPrometheus
104 |
105 | val expectedTotal = Seq(
106 | (Seq("p1"), 1.0),
107 | (Seq("p2"), 0.0),
108 | )
109 |
110 | val expectedActive = Seq(
111 | (Seq("p1"), 0.0),
112 | (Seq("p2"), 0.0),
113 | )
114 |
115 | val prom = receiveOne(2 seconds)
116 | .asInstanceOf[StatsCollector.MetricsEntity]
117 | .samples.asScala.toList
118 |
119 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
120 | checkSamples(prom, "greenish_active_refresh_tasks", expectedActive)
121 | }
122 |
123 | "properly handle IncBadRefresh message" in {
124 | val jobs = Set("p1", "p2")
125 | val stats = system.actorOf(
126 | Props(new StatsCollector(jobs)))
127 |
128 | stats ! IncBadRefresh("p1")
129 | stats ! GetPrometheus
130 |
131 | val expected = Seq(
132 | (Seq("p1"), 1.0),
133 | (Seq("p2"), 0.0),
134 | )
135 |
136 | val prom = receiveOne(2 seconds)
137 | .asInstanceOf[StatsCollector.MetricsEntity]
138 | .samples.asScala.toList
139 |
140 | checkSamples(prom, "greenish_state_refresh_failed_total", expected)
141 | }
142 |
143 | "properly handle OldestMissingPeriod message" in {
144 | val jobs = Set("p1", "p2")
145 | val stats = system.actorOf(
146 | Props(new StatsCollector(jobs)))
147 |
148 | stats ! OldestMissingPeriod("p1", 3)
149 | stats ! GetPrometheus
150 |
151 | val expected = Seq(
152 | (Seq("p1"), 3.0),
153 | (Seq("p2"), 0.0),
154 | )
155 |
156 | val prom = receiveOne(2 seconds)
157 | .asInstanceOf[StatsCollector.MetricsEntity]
158 | .samples.asScala.toList
159 |
160 | checkSamples(prom, "greenish_oldest_missing_period", expected)
161 | }
162 |
163 | "properly handle MissingPeriods message" in {
164 | val jobs = Set("p1", "p2")
165 | val stats = system.actorOf(
166 | Props(new StatsCollector(jobs)))
167 |
168 | stats ! MissingPeriods("p1", 3)
169 | stats ! GetPrometheus
170 |
171 | val expected = Seq(
172 | (Seq("p1"), 3.0),
173 | (Seq("p2"), 0.0),
174 | )
175 |
176 | val prom = receiveOne(2 seconds)
177 | .asInstanceOf[StatsCollector.MetricsEntity]
178 | .samples.asScala.toList
179 |
180 | checkSamples(prom, "greenish_missing_periods_total", expected)
181 | }
182 |
183 | "properly handle RefreshTime message" in {
184 | val jobs = Set("p1", "p2")
185 | val stats = system.actorOf(
186 | Props(new StatsCollector(jobs)))
187 |
188 | stats ! RefreshTime("p1", 3)
189 | stats ! GetPrometheus
190 |
191 | val expected = Set("p1")
192 |
193 | val prom = receiveOne(2 seconds)
194 | .asInstanceOf[StatsCollector.MetricsEntity]
195 | .samples.asScala.toList
196 |
197 |       val actual =
198 |         getNonZeroHistogramLabels(prom,
199 |           "greenish_state_refresh_time_seconds")
200 | actual shouldBe expected
201 | }
202 | }
203 | }
204 |
205 | object StatsCollectorSpec extends Matchers {
206 |   def getNonZeroHistogramLabels(
207 | prom: List[MetricFamilySamples],
208 | name: String): Set[String] =
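    // A histogram family exports several series per label set (buckets,
    // count, sum); keep the first label value of any sample that recorded a
    // non-zero value, i.e. the jobs that actually observed a measurement.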
209 | prom.filter { prom =>
210 | prom.name == name
211 | }.flatMap { metric =>
212 | metric.samples.asScala
213 | .map(sample => (sample.labelValues.asScala, sample.value))
214 | }.filter { case (seq, num) =>
215 | // Only keep what is set
216 | num != 0
217 | }.map { case (seq, num) => seq.head }
218 | .toSet
219 |
220 |
221 | def checkSamples(
222 | prom: List[MetricFamilySamples],
223 | name: String,
224 | expected: Seq[(Seq[String], Double)]): Unit = {
225 |
226 | val actual = prom
227 | .filter { prom =>
228 | prom.name == name
229 | }.flatMap { metric =>
230 | metric.samples.asScala
231 | .map(sample => (sample.labelValues.asScala, sample.value))
232 | }
233 |
234 | actual shouldBe expected
235 | }
236 | }
237 |
--------------------------------------------------------------------------------
/src/main/scala/AppConfig.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish
2 |
3 | import com.typesafe.config.{Config, ConfigFactory}
4 | import java.time.ZoneId
5 | import java.io.File
6 | import scala.util.Try
7 | import models._
8 | import scala.jdk.CollectionConverters._
9 |
10 | case class AppConfig(groups: Seq[Group], namespace: Option[String],
11 | scratchDir: File, refreshInSeconds: Int,
12 | address: String, port: Int,
13 | )
14 | object AppConfig {
15 | def apply(): AppConfig = {
16 | val config = ConfigFactory.load()
17 | val appConfig = config.getConfig("check-groups")
18 | val refreshRate = appConfig.getInt("refresh-in-seconds")
19 | val port = appConfig.getInt("port")
20 | val namespace = Try(appConfig.getString("namespace")).toOption
21 | val scratchDir = new File(appConfig.getString("scratch-dir"))
22 | scratchDir.delete
23 | val address = appConfig.getString("binding-address")
24 | new AppConfig(readEntries(appConfig), namespace, scratchDir,
25 | refreshRate, address, port)
26 | }
27 |
28 | private[this] def readEntries(config: Config): Seq[Group] = {
29 | val defaultOwner = config.getOptionStringWithDefault("default-owner", None)
30 | val defaultPeriodCheckOffset = config.getInt("default-period-check-offset")
31 | val defaultTimePattern = config.getString("default-period-pattern")
32 | val defaultFrequency = config.getString("default-job-run-frequency")
33 | val defaultTimezone = config.getString("default-timezone")
34 | val defaultLookback = config.getInt("default-lookback")
35 | val defaultGreatAt = config.getInt("default-great-at")
36 | val defaultNormalAt = config.getInt("default-normal-at")
37 | val defaultWarnAt = config.getInt("default-warn-at")
38 | val defaultErrorAt = config.getInt("default-error-at")
39 | val defaultStartAt = config.getLong("default-start-at")
40 | val defaultInfo = config.getOptionStringWithDefault("default-info", None)
41 | val globalEnv = config.getEnv("env", Seq.empty)
42 |
43 | config.getConfigList("groups").asScala.zipWithIndex.map { case (groupConfig, index) =>
44 | val groupName = groupConfig.getString("group-name")
45 | val groupOwner = groupConfig.getOptionStringWithDefault("group-owner", defaultOwner)
46 | val groupPeriodCheckOffset =
47 | groupConfig.getIntWithDefault("group-period-check-offset", defaultPeriodCheckOffset)
48 | val groupTimePattern = groupConfig.getStringWithDefault(
49 | "group-period-pattern", defaultTimePattern)
50 | val groupFrequency = groupConfig.getStringWithDefault(
51 | "group-job-run-frequency", defaultFrequency)
52 | val groupTimezone = groupConfig.getStringWithDefault(
53 | "group-timezone", defaultTimezone)
54 | val groupLookback = groupConfig.getIntWithDefault(
55 | "group-lookback", defaultLookback)
56 | val groupGreatAt = groupConfig.getIntWithDefault(
57 | "group-great-at", defaultGreatAt)
58 | val groupNormalAt = groupConfig.getIntWithDefault(
59 | "group-normal-at", defaultNormalAt)
60 | val groupWarnAt = groupConfig.getIntWithDefault(
61 | "group-warn-at", defaultWarnAt)
62 | val groupErrorAt = groupConfig.getIntWithDefault(
63 | "group-error-at", defaultErrorAt)
64 | val groupStartAt = groupConfig.getLongWithDefault(
65 | "group-start-at", defaultStartAt)
66 | val groupInfo = groupConfig.getOptionStringWithDefault("group-info", defaultInfo)
67 | val groupEnv = groupConfig.getEnv("env", globalEnv)
68 |
69 | val checkEntries = groupConfig.getConfigList("job-entries")
70 | .asScala.zipWithIndex.map { case (jobConfig, index) =>
71 | val jobName = jobConfig.getString("job-name")
72 | val jobOwner = jobConfig.getOptionStringWithDefault("job-owner", groupOwner)
73 | val prometheusId = normalizePrometheusId(
74 | jobConfig.getStringWithDefault(
75 | "prometheus-id", s"$groupName $jobName"))
76 | val cmd = jobConfig.getString("check-command")
77 | val jobPeriodCheckOffset = jobConfig.getIntWithDefault(
78 | "job-period-check-offset", groupPeriodCheckOffset)
79 | val timePattern = jobConfig.getStringWithDefault(
80 | "period-pattern", groupTimePattern)
81 | val frequency = toFrequency(
82 | jobConfig.getStringWithDefault(
83 | "job-run-frequency", groupFrequency))
84 | val timezone = ZoneId.of(
85 | jobConfig.getStringWithDefault("timezone", groupTimezone))
86 | val lookback = jobConfig.getIntWithDefault(
87 | "lookback", groupLookback)
88 | val greatAt = jobConfig.getIntWithDefault(
89 | "great-at", groupGreatAt)
90 | val normalAt = jobConfig.getIntWithDefault(
91 | "normal-at", groupNormalAt)
92 | val warnAt = jobConfig.getIntWithDefault(
93 | "warn-at", groupWarnAt)
94 | val errorAt = jobConfig.getIntWithDefault(
95 | "error-at", groupErrorAt)
96 | val startAt = jobConfig.getLongWithDefault(
97 | "start-at", groupStartAt)
98 | val jobInfo = jobConfig.getOptionStringWithDefault("job-info", groupInfo)
99 | val jobEnv = jobConfig.getEnv("env", groupEnv).map {
100 | case (name, value) => EnvVar(name, value)
101 | }
102 |
103 | Job(
104 | index,
105 | jobName,
106 | jobOwner,
107 | prometheusId,
108 | cmd,
109 | timePattern,
110 | frequency,
111 | jobPeriodCheckOffset,
112 | timezone,
113 | lookback,
114 | startAt,
115 | AlertLevels(greatAt, normalAt, warnAt, errorAt),
116 | jobInfo.map(_.stripMargin),
117 | jobEnv,
118 | )
119 | }.toSeq
120 | Group(index, groupName, checkEntries)
121 | }.toSeq
122 | }
123 |
124 | private[greenish] def normalizePrometheusId(id: String): String = {
125 | val spacelessId = id.replaceAll("(\\s|-)+","_").toLowerCase
126 | val pattern = "[a-zA-Z_][a-zA-Z0-9_]*"
127 | if(!spacelessId.matches(pattern)) {
128 | throw new Exception(
129 | s"""|$id: Invalid prometheus label ID, please provide a valid one.
130 | |Prometheus label names should match: "$pattern"""".stripMargin)
131 | }
132 | spacelessId
133 | }
134 |
135 | private[greenish] def toFrequency(freq: String): CheckFrequency = {
136 | freq.toLowerCase match {
137 | case "hourly" => Hourly
138 | case "daily" => Daily
139 | case "monthly" => Monthly
140 | case "annually" => Annually
141 | case _ =>
142 | try {
143 | Cron(freq)
144 | } catch {
145 | case e: IllegalArgumentException =>
146 | throw new Exception(
147 | s"""|${e.getMessage}
148 |                 |$freq: unsupported frequency, supported frequencies are:
149 | |hourly, daily, monthly, annually and Unix cron syntax"""
150 | .stripMargin)
151 | }
152 | }
153 | }
154 |
155 | implicit class ConfigExt[C <: Config](self: Config) {
156 | def getStringWithDefault(path: String, default: String): String =
157 | if(self.hasPath(path))
158 | self.getString(path)
159 | else default
160 |
161 | def getIntWithDefault(path: String, default: Int): Int =
162 | if(self.hasPath(path))
163 | self.getInt(path)
164 | else default
165 |
166 | def getLongWithDefault(path: String, default: Long): Long =
167 | if(self.hasPath(path))
168 | self.getLong(path)
169 | else default
170 |
171 | def getOptionStringWithDefault(path: String,
172 | default: Option[String]): Option[String] =
173 | if(self.hasPath(path))
174 | Some(self.getString(path))
175 | else default
176 |
177 |
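    // Merges the local `env` block over `parent`: a key defined locally wins,
    // everything else is inherited; the result is sorted so comparisons are
    // deterministic.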
178 | def getEnv(path: String, parent: Seq[(String, String)]): Seq[(String, String)] =
179 | if(self.hasPath(path)) {
180 | val localEnv = self.getConfig("env")
181 | .entrySet.asScala
182 | .map(e => (e.getKey, e.getValue.unwrapped.asInstanceOf[String]))
183 | .toMap
184 |
185 | val overriddenParent = parent.filterNot { case (k, _) =>
186 | localEnv.contains(k)
187 | }
188 |
189 | (localEnv.toSeq ++ overriddenParent).sorted
190 | } else parent
191 | }
192 | }
193 |
--------------------------------------------------------------------------------
/src/test/scala/AppConfigSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish
2 |
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 | import java.time.ZoneId
6 | import models._
7 | import java.io.File
8 |
9 | class AppConfigSpec() extends Matchers
10 | with AnyWordSpecLike {
11 |
12 | "AppConfig" must {
13 | "read config file correctly" in {
14 | val actual = AppConfig()
15 | val expected = new AppConfig(
16 | Seq(
17 | Group(0, "Group1", Seq(
18 | Job(0, "Job1", Some("Data"), "job_1", "/tmp/first_script",
19 | "yyyy-MM-dd-HH", Hourly, 3,
20 | ZoneId.of("UTC"), 24, 2,
21 | AlertLevels(0, 1, 2, 3),
22 | Some("Job info"),
23 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "bazomba"),
24 | EnvVar("VAR3", "bada"), EnvVar("VAR4", "badam")),
25 | ),
26 | Job(1, "Job2", Some("Reporting"), "job_2", "/tmp/second_script job2",
27 | "yyyy-MM-dd-HH", Daily, 2,
28 | ZoneId.of("UTC"), 24, 1,
29 | AlertLevels(0, 1, 2, 3),
30 | Some("Group info"),
31 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"),
32 | EnvVar("VAR3", "bazooka")),
33 | ),
34 | Job(2, "Job5", Some("Reporting"), "group1_job5", "/tmp/second_script job5",
35 | "yyyy-MM-dd-HH", Hourly, 2,
36 | ZoneId.of("US/Alaska"), 24, 1,
37 | AlertLevels(0, 1, 2, 3),
38 | Some("Group info"),
39 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"),
40 | EnvVar("VAR3", "bazooka")),
41 | ),
42 | Job(3, "Job7", Some("Reporting"), "group1_job7", "/tmp/second_script job7",
43 | "yyyy-MM-dd-HH", Cron("0 * * * *"), 2,
44 | ZoneId.of("US/Alaska"), 24, 1,
45 | AlertLevels(0, 1, 2, 3),
46 | Some("Group info"),
47 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"),
48 | EnvVar("VAR3", "bazooka")),
49 | ),
50 | )),
51 | Group(1, "Group2", Seq(
52 | Job(0, "Job3", Some("SRE"), "job_3", "/tmp/third_script",
53 | "yyyy-MM-dd", Monthly, 1,
54 | ZoneId.of("UTC"), 3, 0,
55 | AlertLevels(0, 1, 2, 3),
56 | Some("""|
57 | |Link
58 | |""".stripMargin),
59 | Seq(EnvVar("VAR1", "foo"), EnvVar("VAR2", "secure(bar)")),
60 | ),
61 | Job(1, "Job4", Some("SRE"), "job_4", "/tmp/fourth_script",
62 | "yyyy-01-01", Annually, 1,
63 | ZoneId.of("UTC"), 3, 0,
64 | AlertLevels(0, 1, 2, 3),
65 | Some("""|
66 | |Link
67 | |""".stripMargin),
68 | Seq(EnvVar("VAR1", "foo"), EnvVar("VAR2", "secure(bar)")),
69 | ),
70 | Job(2, "Job6", Some("SRE"), "group2_job6", "/tmp/second_script job6",
71 | "yyyy-MM-dd-HH-mm", Daily, 1,
72 | ZoneId.of("US/Samoa"), 270, 0,
73 | AlertLevels(30, 40, 50, 60),
74 | Some("""|
75 | |Link
76 | |""".stripMargin),
77 | Seq(EnvVar("VAR1", "baz"), EnvVar("VAR2", "secure(bar)"),
78 | EnvVar("VAR3", "bazooka")),
79 | ),
80 | )),
81 | ),
82 | Some("Test dashboard"),
83 | new File("/tmp/greenish/stdout"),
84 | 30,
85 | "127.0.0.1",
86 | 8080,
87 | )
88 | actual shouldBe expected
89 | }
90 | }
91 |
92 | "toFrequency" must {
93 | import AppConfig.toFrequency
94 | "handle Unix cron syntax" in {
95 | val patterns = Seq(
96 | "* * * * *",
97 | "1-2 * * * *",
98 | "*/5 * * * *",
99 | "0 23 * * MON-FRI",
100 | "1-5 0 * JAN-DEC 0-4",
101 | )
102 | patterns.foreach { pattern =>
103 | toFrequency(pattern) shouldBe Cron(pattern)
104 | }
105 | }
106 |
107 | "handle both lower and upper case frequencies" in {
108 | toFrequency("hOURly") shouldBe Hourly
109 | toFrequency("AnnuaLLy") shouldBe Annually
110 | toFrequency("monthly") shouldBe Monthly
111 | toFrequency("DAILY") shouldBe Daily
112 | }
113 |
114 | "throw an exception when it doesn't recognize a frequency" in {
115 | intercept[Exception](toFrequency("kkk"))
116 | intercept[Exception](toFrequency("weekly"))
117 | intercept[Exception](toFrequency("minutes"))
118 | }
119 | }
120 |
121 | "normalizePrometheusId" must {
122 | import AppConfig.normalizePrometheusId
123 | "convert prometheus_id to all lowercase" in {
124 | normalizePrometheusId("ABC") shouldBe "abc"
125 | }
126 |
127 |     "replace - characters in prometheus_id with _" in {
128 | normalizePrometheusId("a---b") shouldBe "a_b"
129 | }
130 |
131 |     "replace whitespace characters in prometheus_id with _" in {
132 | normalizePrometheusId("a b\nc\td\t") shouldBe "a_b_c_d_"
133 | }
134 |
135 | "throw exception when prometheus_id starts with a digit" in {
136 | intercept[Exception](normalizePrometheusId("9a b\nc\td\t"))
137 | }
138 |
139 | "throw exception when prometheus_id contains anything but [a-zA-Z0-9_]" in {
140 | intercept[Exception](normalizePrometheusId("a;a"))
141 | }
142 |
143 |     "throw exception when prometheus_id is an empty string" in {
144 | intercept[Exception](normalizePrometheusId(""))
145 | }
146 |     "accept valid characters at the beginning of prometheus_id" in {
147 | normalizePrometheusId("a") shouldBe "a"
148 | normalizePrometheusId("A") shouldBe "a"
149 | normalizePrometheusId("_") shouldBe "_"
150 | }
151 | }
152 |
153 | "getIntWithDefault" must {
154 | import com.typesafe.config.ConfigFactory
155 | import AppConfig._
156 | val config = ConfigFactory.load()
157 | val appConfig = config.getConfig("check-groups")
158 |     "get the value of the property if the key exists" in {
159 | val actual = appConfig.getIntWithDefault("default-error-at", 100)
160 | val expected = 60
161 | actual shouldBe expected
162 | }
163 |
164 |     "return the default value if the key doesn't exist" in {
165 | val actual = appConfig.getIntWithDefault("naaah", 100)
166 | val expected = 100
167 | actual shouldBe expected
168 | }
169 | }
170 |
171 | "getStringWithDefault" must {
172 | import com.typesafe.config.ConfigFactory
173 | import AppConfig._
174 | val config = ConfigFactory.load()
175 | val appConfig = config.getConfig("check-groups")
176 |     "get the value of the property if the key exists" in {
177 | val actual = appConfig.getStringWithDefault("default-period-pattern", "kkkk")
178 | val expected = "yyyy-MM-dd-HH-mm"
179 | actual shouldBe expected
180 | }
181 |
182 |     "return the default value if the key doesn't exist" in {
183 | val actual = appConfig.getStringWithDefault("naaah", "kkkk")
184 | val expected = "kkkk"
185 | actual shouldBe expected
186 | }
187 | }
188 |
189 | "getEnv" must {
190 | import com.typesafe.config.ConfigFactory
191 | import AppConfig._
192 | val config = ConfigFactory.load()
193 | val appConfig = config.getConfig("check-groups")
194 | val groupConfig = appConfig.getConfigList("groups").iterator.next()
195 | val jobConfig = groupConfig.getConfigList("job-entries").iterator.next()
196 | val appEnv = appConfig.getEnv("env", Seq.empty)
197 |
198 | "get value if parent is empty, and key exists" in {
199 | appEnv shouldBe Seq("VAR1" -> "foo", "VAR2" -> "secure(bar)")
200 | }
201 |
202 | "properly dedup parent and child lists, if key exists" in {
203 | val actualGroup = groupConfig.getEnv("env", appEnv)
204 | val expectedGroup = Seq("VAR1" -> "baz", "VAR2" -> "secure(bar)",
205 | "VAR3" -> "bazooka")
206 |
207 | actualGroup shouldBe expectedGroup
208 |
209 | val actualJob = jobConfig.getEnv("env", expectedGroup)
210 | val expectedJob = Seq("VAR1" -> "baz", "VAR2" -> "bazomba",
211 | "VAR3" -> "bada", "VAR4" -> "badam")
212 |
213 | actualJob shouldBe expectedJob
214 | }
215 |
216 |     "return the parent env if the key doesn't exist" in {
217 | val actual = appConfig.getEnv("naaah", appEnv)
218 | val expected = appEnv
219 | actual shouldBe expected
220 | }
221 | }
222 | }
223 |
224 |
225 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/greenish-favicon.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc/api.md:
--------------------------------------------------------------------------------
1 | # The API
2 |
3 | ## REST
4 |
5 | Greenish provides a few REST endpoints:
6 |
7 | ### Display the maximum number of missing datasets
8 |
9 | Basically, across all the jobs, find the job that is missing the most
10 | period datasets, and return that number.
11 |
12 | ```
13 | $ curl --silent -G http://0.0.0.0:8080/maxlag | jq .
14 | {
15 | "lag": 0
16 | }
17 | ```
18 |
19 | ### Summary
20 |
21 | Display a summary of all the monitoring tasks; very handy for a quick glance:
22 |
23 | ```
24 | $ curl --silent -G http://0.0.0.0:8080/summary | jq .
25 | [
26 | {
27 | "group_id": 0,
28 | "name": "Group1",
29 | "status": [
30 | {
31 | "job_id": 0,
32 | "name": "Job1",
33 | "missing": 4,
34 |       "oldest_missing_period": 10,
35 | "alert_level": "warn"
36 | },
37 | {
38 | "job_id": 1,
39 | "name": "Job2",
40 | "missing": 2,
41 |       "oldest_missing_period": 3,
42 | "alert_level": "normal"
43 | }
44 | ]
45 | },
46 | {
47 |     "group_id": 1,
48 | "name": "Group2",
49 | "status": [
50 | {
51 | "job_id": 0,
52 | "name": "Job3",
53 | "missing": 6,
54 |       "oldest_missing_period": 6,
55 | "alert_level": "critical"
56 | },
57 | {
58 | "job_id": 1,
59 | "name": "Job4",
60 | "missing": 0,
61 |       "oldest_missing_period": 0,
62 | "alert_level": "great"
63 | }
64 | ]
65 | }
66 | ]
67 | ```
68 |
69 | ### Display all the periods that are missing for all the jobs
70 |
71 | ```
72 | $ curl --silent -G http://0.0.0.0:8080/missing | jq .
73 | [
74 | {
75 | "group": {
76 | "group_id": 0,
77 | "name": "Group1",
78 | "jobs": [
79 | {
80 | "job_id": 0,
81 | "name": "Job1",
82 | "cmd": "/tmp/first_script",
83 | "time_pattern": "yyyy-MM-dd-HH",
84 | "frequency": "hourly",
85 | "timezone": {
86 | "zone_id": "UTC"
87 | },
88 | "lookback": 24,
89 | "start_at": 1593093930,
90 | "alert_levels": {
91 | "great": 0,
92 | "normal": 1,
93 | "warn": 2,
94 | "critical": 3
95 |         },
96 |         "env": []
97 | }
98 | ]
99 | },
100 | "status": [
101 | {
102 | "job": {
103 | "job_id": 0,
104 | "name": "Job1",
105 | "cmd": "/tmp/first_script",
106 | "time_pattern": "yyyy-MM-dd-HH",
107 | "frequency": "hourly",
108 | "timezone": {
109 | "zone_id": "UTC"
110 | },
111 | "lookback": 24,
112 | "start_at": 1593093930,
113 | "alert_levels": {
114 | "great": 0,
115 | "normal": 1,
116 | "warn": 2,
117 | "critical": 3
118 |           },
119 |           "env": []
120 | },
121 | "updated_at": 1593567901,
122 | "period_health": [
123 | {
124 | "period": "2020-06-27-20",
125 | "ok": false
126 | }
127 |
128 | ...
129 | ```
130 |
131 | ### Display the current state
132 |
133 | A very detailed view for all monitoring tasks:
134 |
135 | ```
136 | $ curl --silent -G http://0.0.0.0:8080/state | jq .
137 | [
138 | {
139 | "group": {
140 | "group_id": 0,
141 | "name": "Group1",
142 | "jobs": [
143 | {
144 | "job_id": 0,
145 | "name": "Job1",
146 |             "owner": "Reporting Team",
147 | "cmd": "/tmp/first_script",
148 | "time_pattern": "yyyy-MM-dd-HH",
149 | "frequency": "hourly",
150 | "timezone": {
151 | "zone_id": "UTC"
152 | },
153 | "lookback": 24,
154 | "start_at": 1593093930,
155 | "alert_levels": {
156 | "great": 0,
157 | "normal": 1,
158 | "warn": 2,
159 | "critical": 3
160 |           },
161 |           "env": []
162 | }
163 | ]
164 | },
165 | "status": [
166 | {
167 | "job": {
168 | "job_id": 0,
169 | "name": "Job1",
170 | "cmd": "/tmp/first_script",
171 | "time_pattern": "yyyy-MM-dd-HH",
172 | "frequency": "hourly",
173 | "timezone": {
174 | "zone_id": "UTC"
175 | },
176 | "lookback": 24,
177 | "start_at": 1593093930,
178 | "alert_levels": {
179 | "great": 0,
180 | "normal": 1,
181 | "warn": 2,
182 | "critical": 3
183 |           },
184 |           "env": []
185 | },
186 | "updated_at": 1593567901,
187 | "period_health": [
188 | {
189 | "period": "2020-06-27-20",
190 | "ok": true
191 | },
192 | {
193 | "period": "2020-06-27-21",
194 | "ok": true
195 | },
196 |
197 | ...
198 | ```
199 |
200 | ### Get job and group by id
201 |
202 | You can query a single group by its id:
203 |
204 | ```
205 | $ curl --silent -G localhost:8080/group/1 | jq .
206 | {
207 | "group": {
208 | "group_id": 1,
209 | "name": "Group2",
210 | "jobs": [
211 | {
212 | "job_id": 0,
213 | "name": "Job3",
214 | "cmd": "/tmp/third_script",
215 | "time_pattern": "yyyy-MM-dd",
216 | "frequency": "monthly",
217 | "timezone": {
218 | ...
219 | ```
220 |
221 | You can also focus on a single job, and query it:
222 |
223 | ```
224 | $ curl --silent -G localhost:8080/group/1/job/0 | jq .
225 | {
226 | "job": {
227 | "job_id": 0,
228 | "name": "Job3",
229 | "cmd": "/tmp/third_script",
230 | "time_pattern": "yyyy-MM-dd",
231 | "frequency": "monthly",
232 | "timezone": {
233 | "zone_id": "UTC"
234 | },
235 | "lookback": 3,
236 | "start_at": 1593093930,
237 | "alert_levels": {
238 | "great": 0,
239 | "normal": 1,
240 | "warn": 2,
241 | "critical": 3
242 |     },
243 |     "env": []
244 | },
245 | "updated_at": 1593585049298,
246 | "period_health": [
247 | {
248 | "period": "2020-05-01",
249 | "ok": true
250 | },
251 | {
252 | "period": "2020-06-01",
253 | "ok": true
254 | },
255 | {
256 | "period": "2020-07-01",
257 | "ok": true
258 | }
259 | ]
260 | }
261 | ```
262 |
263 | Finally, you can also get the output of a check (monitoring script) for a job:
264 |
265 | ```
266 | $ curl --silent -G localhost:8080/group/0/job/0/stdout
267 | ```
268 |
269 | This is useful when debugging the monitoring script, or when there is further
270 | information in the stdout that is not exposed in the UI/API.
271 |
272 |
273 | ### Refresh the state
274 |
275 | You can refresh the entire state at once:
276 |
277 | ```
278 | $ curl --silent -G localhost:8080/state/refresh | jq .
279 | {
280 | "ok": "State refresh is scheduled"
281 | }
282 | ```
283 |
284 | You can also refresh the state of a single group by its id:
285 |
286 | ```
287 | $ curl --silent -G localhost:8080/group/0/refresh | jq .
288 | {
289 | "ok": "Group status refresh is scheduled"
290 | }
291 | ```
292 |
293 | Similarly, you can refresh the state of a single job by its id:
294 |
295 | ```
296 | $ curl --silent -G localhost:8080/group/0/job/0/refresh | jq .
297 | {
298 | "ok": "Job status refresh is scheduled"
299 | }
300 | ```
301 | ### Health-check
302 |
303 | Checks whether any of the last 5 state refreshes succeeded; if so, the
304 | service is considered healthy.
305 |
306 | ```
307 | $ curl --silent -G http://0.0.0.0:8080/health | jq .
308 | {
309 | "health": "good"
310 | }
311 | ```
312 |
313 | ### System info
314 |
315 | Print basic information about the service.
316 |
317 | ```
318 | $ curl --silent -G localhost:8080/system | jq .
319 | {
320 | "service": "Greenish",
321 | "namespace": "Staging",
322 | "version": "1.4.0-SNAPSHOT",
323 | "uptime": 1784338
324 | }
325 | ```
326 |
327 | ## Prometheus
328 |
329 | Greenish can also export data to Prometheus. These are the supported metrics:
330 |
331 | ```
332 | TYPE: GAUGE
333 | NAME: greenish_active_refresh_tasks
334 | HELP: Current number active state refresh tasks
335 | LABELS: job_id
336 |
337 | TYPE: HISTOGRAM
338 | NAME: greenish_state_refresh_time_seconds
339 | HELP: Job state refreshing time
340 | LABELS: job_id
341 |
342 | TYPE: COUNTER
343 | NAME: greenish_state_refresh_total
344 | HELP: Total number of job state refresh instances
345 | LABELS: job_id
346 |
347 | TYPE: COUNTER
348 | NAME: greenish_state_refresh_expired_total
349 | HELP: Total number of expired job state refresh instances
350 | LABELS: job_id
351 |
352 | TYPE: COUNTER
353 | NAME: greenish_state_refresh_failed_total
354 | HELP: Total number of failed job state refresh instances
355 | LABELS: job_id
356 |
357 | TYPE: GAUGE
358 | NAME: greenish_missing_periods_total
359 | HELP: Current number of missing dataset periods
360 | LABELS: job_id
361 |
362 | TYPE: GAUGE
363 | NAME: greenish_oldest_missing_period
364 | HELP: The oldest missing period
365 | LABELS: job_id
366 |
367 | ```
368 |
369 | Prometheus metrics can be accessed at the `/prometheus` endpoint:
370 |
371 | ```
372 | $ curl --silent -G localhost:8080/prometheus
373 | # HELP greenish_active_refresh_tasks Current number active state refresh tasks
374 | # TYPE greenish_active_refresh_tasks gauge
375 | greenish_active_refresh_tasks{job_id="job_2",} 1.0
376 | greenish_active_refresh_tasks{job_id="job_1",} 0.0
377 | greenish_active_refresh_tasks{job_id="job_4",} 1.0
378 | greenish_active_refresh_tasks{job_id="job_3",} 1.0
379 | ...
380 | ```
381 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | [](https://travis-ci.org/amanjpro/greenish)
4 | [](https://codecov.io/gh/amanjpro/greenish) [](https://gitter.im/greenish-monitoring/greenish?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
5 |
6 | **Greenish** is a monitoring tool that checks datasets for existence.
7 |
8 | Greenish understands _periods;_ for example, for an hourly job, it can
9 | verify that all datasets for the past _N_ hours exist.
10 |
11 | Configuration files use the [HOCON] syntax (a superset of [JSON];
12 | similar to [YAML]):
13 |
14 | * [annotated example](src/test/resources/application.conf);
15 | * [default values](src/main/resources/reference.conf).
16 |
17 | [HOCON]: https://github.com/lightbend/config/blob/master/HOCON.md
18 | [JSON]: https://en.wikipedia.org/wiki/JSON
19 | [YAML]: https://en.wikipedia.org/wiki/YAML
20 |
21 |
22 | Greenish runs [monitoring jobs] to collect information about which
23 | datasets are available and which are missing. Those are individual
24 | scripts that can be written in any language.
25 |
26 | [monitoring jobs]: #monitoring-jobs
27 |
28 |
29 | ## Greenish dashboard
30 |
31 | Greenish provides a basic HTML dashboard to visualise the state of the
32 | monitored jobs. The dashboard can be accessed at `/dashboard`.
33 |
34 | Here is a screenshot:
35 |
36 | 
37 |
38 | ## API
39 |
40 | [The Greenish API is documented in `api.md`.](doc/api.md)
41 |
42 | ## Who uses Greenish?
43 |
44 | Greenish is still new. As of now, [Samsung
45 | Ads](https://www.samsung.com/us/business/samsungads/) uses Greenish to monitor
46 | _business-critical datasets_.
47 |
48 | ## Greenish vs others
49 |
50 | * **Nagios** is a monitoring tool for systems, network and
51 | infrastructure. It is very good to keep track of the instantaneous
52 | state of a system. But it has no notion of datasets that follow a
53 | periodic pattern (e.g., daily jobs or hourly jobs). Making Nagios
54 |   aware of periods is entirely on the shoulders of the check writers,
55 | which can be very tricky to do (or even impossible?).
56 |
57 | * **Prometheus** is another great tool for monitoring metrics, and the
58 | health of other systems, but again it doesn't know about datasets
59 | that follow periodic patterns. It is worth mentioning that Greenish
60 | provides an endpoint to export metrics to Prometheus.
61 |
62 | * **Airflow** knows about periods, but it is not a monitoring
63 | tool. Airflow can alert when a run fails, but if an existing dataset
64 | gets deleted accidentally, Airflow stays unaware.
65 |
66 | What sets Greenish apart is that it knows about periods, and keeps checking
67 | datasets for existence.
68 |
69 | ## Monitoring Jobs
70 |
71 | As mentioned earlier, monitoring scripts are stand-alone programs,
72 | written in any language, that respect the following contract:
73 |
74 | * The scripts must be executable.
75 |
76 | * The scripts must accept an arbitrary number of `period` arguments at
77 | the end of their parameter list; e.g., for a script named
78 | `monitor-foo`, running on the `staging` environment, asked to check
79 | the status of three hourly periods:
80 |
81 | ```shell
82 |   monitor-foo staging 2020-06-20-10 2020-06-20-11 2020-06-20-12
83 | ```
84 |
85 | The `check-command` entry for the example above could be:
86 |
87 | ```yaml
88 | check-command: "monitor-foo staging"
89 | period-pattern: "yyyy-MM-dd-HH"
90 | ```
91 |
92 | * The scripts must print one diagnostic line per provided period in
93 | one of the following two formats, where `1` indicates a successful
94 | period, and `0` indicates a failed period:
95 |
96 | ```text
97 |   greenish-period	<period>	0
98 |   greenish-period	<period>	1
99 | ```
100 |
101 | Where:
102 |
103 |   * Each value for `<period>` must match one of the periods passed to
104 |     the monitoring script.
105 |
106 | * Diagnostic lines are recognized by regular expression
107 | `^greenish-period\t.*\t(0|1)$`.
108 |
109 | * Any lines not matching the format are ignored by Greenish. This
110 | allows monitoring scripts to print extra debugging data.
111 |
112 | * The scripts must exit with 0, regardless of the status of any
113 | individual check. Exiting in error is reserved for problems
114 | evaluating the checks themselves.
115 |
116 | Example monitoring script:
117 |
118 | ```shell
119 | #!/usr/bin/env bash
120 | farm=$1; shift
121 |
122 | echo '# Start of checks'
123 | for period in "$@"; do
124 | echo '# Arbitrary debugging info here'
125 |
126 | ## Note how the `ls` command below does print some output, which
127 | ## Greenish will ignore. (Unless the input directory is malicious,
128 | ## and purposefully includes files named in the way that Greenish
129 | ## expects as representing check output.)
130 | if ls "$farm/$period"; then
131 | printf 'greenish-period\t%s\t%d\n' "$period" 1
132 | else
133 | printf 'greenish-period\t%s\t%d\n' "$period" 0
134 | fi
135 | done
136 | ```
137 |
138 | ## Performance Tweaking
139 |
140 | The monitoring jobs are usually blocking IO jobs: they make network calls,
141 | wait for APIs, connect to a DB, HDFS, etc. That is why they run under their
142 | very own execution context (thread pool), so that they do not block the rest
143 | of the service (namely the endpoints). The execution context for the
144 | monitoring jobs is controlled by a dispatcher named `refresh-dispatcher`.
145 | Greenish comes with a default config that is suitable for IO-bound processes;
146 | you can find it in the default settings mentioned earlier.
147 |
148 | It is best to use a `thread-pool-executor` dispatcher for blocking jobs, as
149 | it is tailored for IO workloads; a short sketch follows the links below:
150 |
151 | - [ThreadPoolExecutor Javadoc](https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/ThreadPoolExecutor.html)
152 | - [Akka documentation](https://doc.akka.io/docs/akka-http/current/handling-blocking-operations-in-akka-http-routes.html#solution-dedicated-dispatcher-for-blocking-operations)
153 |
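As a rough illustration (the `BlockingWorker` actor below is a hypothetical
stand-in, not Greenish's actual wiring), this is how an Akka actor can be
pinned to the `refresh-dispatcher` defined under the `akka` block of the
sample configuration:

```scala
import akka.actor.{Actor, ActorSystem, Props}

// Hypothetical worker standing in for a blocking monitoring task.
class BlockingWorker extends Actor {
  def receive: Receive = { case cmd: String =>
    // Blocking IO would happen here, on the dedicated thread pool, so the
    // default dispatcher (and the HTTP endpoints) stay responsive.
    println(s"running check: $cmd")
  }
}

object DispatcherExample extends App {
  val system = ActorSystem("example")
  // Pin the actor to the thread-pool-executor configured in application.conf.
  val worker = system.actorOf(
    Props(new BlockingWorker).withDispatcher("akka.refresh-dispatcher"))
  worker ! "monitor-foo staging 2020-06-20-10"
}
```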
154 | ## Logging
155 |
156 | Greenish uses Akka's simple logging mechanism. In the spirit of [12 factor
157 | App](https://12factor.net/logs) all logs are written to STDOUT, and the
158 | configuration can be done via the `application.conf` file. The following
159 | is a summary of some of the most useful options for customizing logging:
160 |
161 | ```
162 | akka {
163 | # Log the complete configuration at INFO level when Greenish is started.
164 | # This is useful when you are uncertain of what configuration is used.
165 | log-config-on-start = on
166 | # Options are: OFF, DEBUG, INFO, ERROR, WARN
167 | loglevel = "DEBUG"
168 | # To turn off logging completely
169 | stdout-loglevel = "OFF"
170 |
171 | # Not necessarily useful in prod, but can be useful during development
172 | # You probably want to skip the following in production
173 | log-dead-letters = 10
174 | log-dead-letters-during-shutdown = on
175 | actor {
176 | debug {
177 | # enable function of LoggingReceive, which is to log any received message at
178 | # DEBUG level
179 | receive = on
180 | # enable DEBUG logging of all AutoReceiveMessages (Kill, PoisonPill etc.)
181 | autoreceive = on
182 | # enable DEBUG logging of actor lifecycle changes
183 | lifecycle = on
184 | # enable DEBUG logging of unhandled messages
185 | unhandled = on
186 | # enable DEBUG logging of all LoggingFSMs for events, transitions and timers
187 | fsm = on
188 | }
189 | }
190 | }
191 | ```
192 |
193 | ## Pre-built package
194 |
195 | You can download pre-built packages (both fat (i.e. assembly) jar and docker)
196 | from the [releases page](https://github.com/amanjpro/greenish/releases). The
197 | latest docker image can be found at the [packages
198 | page](https://github.com/amanjpro/greenish/packages).
199 |
200 | ## Development
201 |
202 | ### Requirements
203 |
204 | - Java 8
205 | - SBT 1.3.x
206 | - Bash
207 | - NodeJS 14+
208 |
209 | ### Building from the source
210 |
211 | First install `npm` dependencies:
212 |
213 | `$ npm install`
214 |
215 | SBT takes care of building/testing both the Scala and JavaScript/JSX:
216 |
217 | `$ sbt clean test package`
218 |
219 | To run the service from the source:
220 | `$ sbt -Dconfig.file=PATH_TO_CONFIG_FILE run`
221 |
222 | **Note** Unfortunately, the JavaScript code has no tests yet; this is an issue
223 | that needs to be resolved.
224 |
225 | #### Packaging
226 |
227 | Greenish supports both "fat jar" and docker packaging. A fat jar is a single,
228 | self-contained jar that can be distributed on any *nix environment (as long as
229 | Java and Bash are installed):
230 |
231 | ```
232 | $ sbt assembly
233 | $ java -Dconfig.file=PATH_TO_CONFIG_FILE -jar target/scala-2.13/greenish-assembly-*.jar
234 | ```
235 |
236 | You can also build docker images:
237 |
238 | ```
239 | $ sbt docker:publishLocal
240 | # The docker image expects config to be mounted at: /app/config.yml
241 | $ docker run --volume PATH_TO_CONFIG_FILE:/app/config.yml --rm -p 8080:8080 greenish:LATEST_VERSION
242 | ```
243 |
244 | ## Contributing
245 |
246 | Contributions are most welcome. Please, fork it, use it, open issues and submit PRs!
247 |
248 | ## Acknowledgment
249 |
250 | - Thanks to [Nasrin Zaza](https://www.linkedin.com/in/nasrin-zaza/) for the
251 | amazing logo
252 |
--------------------------------------------------------------------------------
/src/test/resources/application.conf:
--------------------------------------------------------------------------------
1 | # The jobs/data-sets to monitor
2 | check-groups: {
3 | # This is an optional subtitle to show in the dashboard
4 | namespace: "Test dashboard"
5 |   # Where to store temporary files; this is used to store the debug lines that
6 |   # can be seen in the /group/<gid>/job/<jid>/stdout endpoint. You only need to
7 |   # change this setting if you have more than one instance of Greenish running
8 |   # on the same machine; if you do not change it, there will be a race
9 |   # condition between the instances.
10 | scratch-dir: "/tmp/greenish/stdout"
11 | # The frequency of pulling data
12 | refresh-in-seconds: 30
13 | # The binding address of the HTTP server
14 | binding-address: "127.0.0.1"
15 | # The port on which the server is running
16 | port: 8080
17 |   # Not all jobs are available at the beginning of a period; an hourly job
18 |   # might systematically appear at the end of an hour, a monthly job at the
19 |   # end of the month, etc. This is a global setting for specifying the default
20 |   # period-offset: if a job is expected to arrive at the end of the period,
21 |   # then the offset should be 1.
22 |   # This setting can be overridden by `group-period-check-offset` at the group
23 |   # level, and `job-period-check-offset` at the job level.
24 | default-period-check-offset: 1
25 | # Default period pattern. Please see job-entries' period-pattern property for
26 | # more information
27 | default-period-pattern: "yyyy-MM-dd-HH-mm"
28 | # Default run frequency. Please see job-entries' job-run-frequency property
29 | # for more information
30 | default-job-run-frequency: "daily"
31 | # Default timezone. Please see job-entries' timezone property for more
32 | # information
33 | default-timezone: "US/Samoa"
34 | # Default lookback. Please see job-entries' lookback property for more
35 | # information
36 | default-lookback: 270
37 | # Default great-at. Please see job-entries' great-at property for
38 | # more information
39 | default-great-at: 30
40 | # Default normal-at property. Please see job-entries' normal-at pattern
41 | # property for more information
42 | default-normal-at: 40
43 | # Default warn-at property. Please see job-entries' warn-at pattern
44 | # property for more information
45 | default-warn-at: 50
46 | # Default error-at property. Please see job-entries' error-at pattern
47 | # property for more information
48 | default-error-at: 60
49 |
50 |   # Default start-at property. This is where we define the oldest
51 |   # possible available dataset. This is useful, for example, when we add
52 |   # a new job to be monitored and the lookback is 10: without start-at,
53 |   # the job would complain for the next 10 hours because some data is
54 |   # missing, even though it is correctly missing (the job didn't exist
55 |   # for those periods). The default value is 0. The value is expressed
56 |   # as a Unix epoch timestamp (seconds). This property can be overridden
57 |   # at both the group and job level.
58 | default-start-at: 0
59 | # Default owner of all the following jobs, can be used for escalation
60 | # purposes. This appears under the Job view in the dashboard. HTML tags are
61 | # supported. This setting is Optional.
62 | # Can be overridden by group-owner and job-owner settings
63 | default-owner: "SRE"
64 | # Default info for all the following jobs, can be used for adding extra
65 | # information to be shown next to the job in the dashboard. HTML tags are
66 | # encouraged here. Multiline strings are introduced using `"""` like in
67 | # Python and Scala. If you start every new line with `|`, then every leading
68 | # space to that character will be ignored. This setting is Optional. Can be
69 | # overridden by group-info and job-info settings
70 | default-info: """|
71 | |Link
72 | |"""
73 | # Additional environment variables to be passed
74 | # to the monitoring scripts, you can AWS profile
75 | # names here, for example:
76 | # AWS_PROFILE: "reader-profile"
77 |   # `env` can be set at both group and job level
78 | env: {
79 | VAR1: "foo"
80 | # `secure(...)` pattern tells Greenish that the data should be kept
81 | # secret and not returned in any of the endpoints.
82 | VAR2: "secure(bar)"
83 | }
84 | # Job groups, a group is a set of jobs/data-sets
85 | # that have some sort of logical relation
86 | groups: [
87 | {
88 | # Pick a human friendly name here
89 | group-name: "Group1",
90 |     # More or less like `default-period-check-offset`, but this is scoped to
91 |     # the group only. Can be overridden by `job-period-check-offset`.
92 |     group-period-check-offset: 2
93 |     # More or less like `default-period-pattern`, but this is scoped to the
94 |     # group only. Can be overridden by `period-pattern`.
95 |     group-period-pattern: "yyyy-MM-dd-HH"
96 |     # More or less like `default-job-run-frequency`, but this is scoped to
97 |     # the group only. Can be overridden by `job-run-frequency`.
98 |     group-job-run-frequency: "hourly"
99 |     # More or less like `default-timezone`, but this is scoped to the
100 |     # group only. Can be overridden by `timezone`.
101 |     group-timezone: "US/Alaska"
102 |     # More or less like `default-lookback`, but this is scoped to the
103 |     # group only. Can be overridden by `lookback`.
104 |     group-lookback: 24
105 |     # More or less like `default-great-at`, but this is scoped to the
106 |     # group only. Can be overridden by `great-at`.
107 |     group-great-at: 0
108 |     # More or less like `default-normal-at`, but this is scoped to the
109 |     # group only. Can be overridden by `normal-at`.
110 |     group-normal-at: 1
111 |     # More or less like `default-warn-at`, but this is scoped to the
112 |     # group only. Can be overridden by `warn-at`.
113 |     group-warn-at: 2
114 |     # More or less like `default-error-at`, but this is scoped to the
115 |     # group only. Can be overridden by `error-at`.
116 |     group-error-at: 3
117 |
118 | # Just like default-start-at, but for the group level
119 | group-start-at: 1
120 | # group level owner, more or less default-owner but at the group level
121 | group-owner: "Reporting"
122 | # group level info, more or less default-info but at the group level
123 | group-info: "Group info"
124 | env: {
125 | VAR1: "baz"
126 | VAR3: "bazooka"
127 | }
128 |
129 | # A group can have many jobs/data-sets to monitor
130 | job-entries: [
131 | {
132 | # Pick a human friendly name here
133 | job-name: "Job1"
134 |         # An id to be used as a label for the exported Prometheus metrics.
135 |         # Each job exports internal metrics under a Prometheus label, which
136 |         # is controlled here. It is best to make sure that the id is unique
137 |         # per job, but this is not enforced.
138 |         #
139 |         # In case this option is skipped, the combination of the `group` and
140 |         # `job` names is used, turned into lower-case, and all whitespace
141 |         # characters and hyphens are replaced with _.
142 |         # Prometheus IDs should match this pattern:
143 |         # "[a-zA-Z_][a-zA-Z0-9_]*"
144 | prometheus-id: "job_1"
145 |         # A check-command is any executable program/script that takes
146 |         # `period`s in the form of `period-pattern` below as the last
147 |         # arguments, and exits with 0 only if it ran successfully. You can
148 |         # add arguments to the script here: `/etc/check job1 production` is
149 |         # perfectly allowed.
150 |         # In case Greenish fails to run the script, please wrap it in a
151 |         # shell-script and add a shebang at the top. Java's ProcessBuilder
152 |         # can fail to recognize some scripts/programs.
153 | check-command: "/tmp/first_script",
154 | # A valid date/time pattern. Please consult the following page for
155 | # more info:
156 | # https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#patterns
157 |         # If the data-set is expected to appear on the first day of every
158 |         # month, you can write a pattern like: yyyy-MM-01
159 | period-pattern: "yyyy-MM-dd-HH"
160 | # What is the expected run-frequency of the job?
161 | # Supported values: hourly, daily, monthly, annually. Unix-Cron-style
162 | # syntax is also accepted here: `0 * * * *` runs at minute zero every
163 | # hour, more or less like `hourly`. Cron-style reacts differently to
164 | # `period-check-offset` settings. In the case of cron, you might want
165 | # to increase the offset by 1.
166 | job-run-frequency: "hourly"
167 | # More or less like `group-period-check-offset`, but this scoped to
168 | # this job only.
169 | job-period-check-offset: 3
170 |         # What is the timezone of the periods in the data set? If you have two
171 |         # jobs, one produced in Cairo following the Cairo timezone, and another
172 |         # in Canada following UTC, you can configure them accordingly using this
173 |         # field. Greenish respects the option when calling the monitoring script.
174 | timezone: "UTC"
175 |         # How far back do you want to monitor? In this example we monitor
176 |         # the last 24 datasets (hours).
177 | lookback: 24
178 | # The following are hints for Greenish, to check if a job is
179 | # at "great", "normal", "warn" or "critical" state
180 | great-at: 0
181 | normal-at: 1
182 | warn-at: 2
183 | error-at: 3
184 | # Just like default-start-at, but for the job only
185 | start-at: 2
186 | # job level owner, more or less group-owner but at the job level
187 | job-owner: "Data"
188 | # job level info, more or less group-info but at the job level
189 | job-info: "Job info"
190 | env: {
191 | VAR2: "bazomba"
192 | VAR3: "bada"
193 | VAR4: "badam"
194 | }
195 | },
196 | {
197 | job-name: "Job2"
198 | prometheus-id: "job_2"
199 | check-command: "/tmp/second_script job2",
200 | period-pattern: "yyyy-MM-dd-HH"
201 | job-run-frequency: "daily"
202 | timezone: "UTC"
203 | lookback: 24
204 | great-at: 0
205 | normal-at: 1
206 | warn-at: 2
207 | error-at: 3
208 | },
209 | {
210 | job-name: "Job5"
211 | check-command: "/tmp/second_script job5",
212 | },
213 | {
214 | job-name: "Job7"
215 | check-command: "/tmp/second_script job7",
216 | job-run-frequency: "0 * * * *"
217 | }
218 | ]
219 | },
220 | {
221 | group-name: "Group2",
222 | job-entries: [
223 | {
224 | job-name: "Job3"
225 | prometheus-id: "job_3"
226 | check-command: "/tmp/third_script",
227 | period-pattern: "yyyy-MM-dd"
228 | job-run-frequency: "monthly"
229 | timezone: "UTC"
230 | lookback: 3
231 | great-at: 0
232 | normal-at: 1
233 | warn-at: 2
234 | error-at: 3
235 | },
236 | {
237 | job-name: "Job4"
238 | prometheus-id: "job_4"
239 | check-command: "/tmp/fourth_script",
240 | period-pattern: "yyyy-01-01"
241 | job-run-frequency: "annually"
242 | timezone: "UTC"
243 | lookback: 3
244 | great-at: 0
245 | normal-at: 1
246 | warn-at: 2
247 | error-at: 3
248 | },
249 | {
250 | job-name: "Job6"
251 | check-command: "/tmp/second_script job6",
252 | env: {
253 | VAR1: "baz"
254 | VAR3: "bazooka"
255 | }
256 | }
257 | ]
258 | }
259 | ]
260 | }
261 |
262 | # This section is used to tune the performance of Greenish
263 | akka {
264 | # This is the thread-pool for running monitoring scripts
265 | # If Greenish is unresponsive, you should look into this.
266 | # As, monitoring scripts are expected to be IO bound, you
267 | # may want to maximize parallelism.
268 | refresh-dispatcher {
269 | type = Dispatcher
270 | executor = "thread-pool-executor"
271 | thread-pool-executor {
272 | fixed-pool-size = 100
273 | }
274 | throughput = 1
275 | mailbox-capacity = -1
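 | # Assumed reading of the two settings above: throughput = 1 returns the
 | # thread to the pool after every message (fair scheduling), and a
 | # negative mailbox-capacity means an unbounded mailbox.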
276 | }
277 | }
278 |
--------------------------------------------------------------------------------
/src/test/scala/models/JsonSerdeSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.models
2 |
3 | import org.scalatest.matchers.should.Matchers
4 | import org.scalatest.wordspec.AnyWordSpecLike
5 | import java.time.ZoneId
6 | import io.circe.Json
7 | import io.circe.parser._
8 | import io.circe.syntax.EncoderOps
9 |
10 | class JsonSerdeSpec() extends Matchers
11 | with AnyWordSpecLike {
12 | "healthJson" must {
13 | "produce correct JSON when health is bad" in {
14 | val expected = "bad"
15 | val json = healthJson(false)
16 | val actual = json.hcursor.downField("health").as[String].getOrElse(???)
17 | actual shouldBe expected
18 | json.hcursor.keys.get.size shouldBe 1
19 | }
20 |
21 | "produce correct JSON when health is good" in {
22 | val expected = "good"
23 | val json = healthJson(true)
24 | val actual = json.hcursor.downField("health").as[String].getOrElse(???)
25 | actual shouldBe expected
26 | json.hcursor.keys.get.size shouldBe 1
27 | }
28 | }
29 |
30 | "errorJson" must {
31 | "produce correct JSON" in {
32 | val expected = "Error"
33 | val json = errorJson(expected)
34 | val actual = json.hcursor.downField("error").as[String].getOrElse(???)
35 | actual shouldBe expected
36 | json.hcursor.keys.get.size shouldBe 1
37 | }
38 | }
39 |
40 | "okJson" must {
41 | "produce correct JSON" in {
42 | val expected = "OK"
43 | val json = okJson(expected)
44 | val actual = json.hcursor.downField("ok").as[String].getOrElse(???)
45 | actual shouldBe expected
46 | json.hcursor.keys.get.size shouldBe 1
47 | }
48 | }
49 |
50 | "AlertLevel" must {
51 | "produce correct JSON" in {
52 | (Great: AlertLevel).asJson shouldBe "great".asJson
53 | (Normal: AlertLevel).asJson shouldBe "normal".asJson
54 | (Warn: AlertLevel).asJson shouldBe "warn".asJson
55 | (Critical: AlertLevel).asJson shouldBe "critical".asJson
56 | }
57 |
58 | "correctly parse JSON string" in {
59 | parse(""""great"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Great
60 | parse(""""normal"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Normal
61 | parse(""""warn"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Warn
62 | parse(""""critical"""").flatMap(_.as[AlertLevel]).getOrElse(???) shouldBe Critical
63 | }
64 | }
65 |
66 | "AlertLevels" must {
67 | "produce correct JSON" in {
68 | val expected = Json.obj (
69 | "great" -> 1.asJson,
70 | "normal" -> 2.asJson,
71 | "warn" -> 3.asJson,
72 | "critical" -> 4.asJson,
73 | )
74 | val actual = AlertLevels(1, 2, 3, 4).asJson
75 | actual shouldBe expected
76 | }
77 |
78 | "correctly parse JSON" in {
79 | val expected = AlertLevels(1, 2, 3, 4)
80 | val actual = expected.asJson.as[AlertLevels].getOrElse(???)
81 |
82 | actual shouldBe expected
83 | }
84 | }
85 |
86 | "CheckFrequency" must {
87 | "produce correct JSON" in {
88 | (Hourly: CheckFrequency).asJson shouldBe "hourly".asJson
89 | (Daily: CheckFrequency).asJson shouldBe "daily".asJson
90 | (Monthly: CheckFrequency).asJson shouldBe "monthly".asJson
91 | (Annually: CheckFrequency).asJson shouldBe "annually".asJson
92 | val pattern = "* * * * *"
93 | val expected = Json.obj("pattern" -> pattern.asJson)
94 | Cron(pattern).asJson shouldBe expected
95 | (Cron(pattern): CheckFrequency).asJson shouldBe expected
96 | }
97 |
98 | "correctly parse JSON string" in {
99 | parse(""""hourly"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Hourly
100 | parse(""""daily"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Daily
101 | parse(""""monthly"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Monthly
102 | parse(""""annually"""").flatMap(_.as[CheckFrequency]).getOrElse(???) shouldBe Annually
103 | val pattern = "* * * * *"
104 | val expected = Cron(pattern)
105 | val actualCron = expected.asJson.as[Cron].getOrElse(???)
106 | actualCron shouldBe expected
107 | val actualCheckFrequency = expected.asJson.as[CheckFrequency].getOrElse(???)
108 | actualCheckFrequency shouldBe expected
109 | }
110 | }
111 |
112 | "EnvVar" must {
113 | "produce correct JSON it is PlainEnvVar" in {
114 | val actual = EnvVar("username", "Homa").asJson
115 | val expected = Json.obj (
116 | "type" -> "plain".asJson,
117 | "name" -> "username".asJson,
118 | "value" -> "Homa".asJson,
119 | )
120 | actual shouldBe expected
121 | }
122 |
123 | "correctly parse JSON string for PlainEnvVar" in {
124 | val expected = EnvVar("username", "Homa")
125 | val actual = expected.asJson.as[EnvVar].getOrElse(???)
126 | actual shouldBe expected
127 | }
128 |
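 | // Values wrapped in secure(...) are treated as SecureEnvVar: their JSON
 | // form masks the value with SecureEnvVar.HIDDEN_PASSWORD, so decoding
 | // can only ever recover the masked value (see the round-trip test below).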
129 | "produce correct JSO it is SecureEnvVar" in {
130 | val actual = EnvVar("username", "secure(Homa)").asJson
131 | val expected = Json.obj (
132 | "type" -> "secure".asJson,
133 | "name" -> "username".asJson,
134 | "value" -> SecureEnvVar.HIDDEN_PASSWORD.asJson,
135 | )
136 | actual shouldBe expected
137 | }
138 |
139 | "correctly parse JSON string for SecureEnvVar" in {
140 | val origin = EnvVar("username", "secure(Homa)")
141 | val expected = EnvVar("username", s"secure(${SecureEnvVar.HIDDEN_PASSWORD})")
142 | val actual = origin.asJson.as[EnvVar].getOrElse(???)
143 | actual shouldBe expected
144 | }
145 | }
146 |
147 | "Group" must {
148 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd",
149 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6),
150 | None, Seq(EnvVar("a", "b")))
151 | val group = Group(0, "g", Seq(job))
152 |
153 | "produce correct JSON" in {
154 | val actual = group.asJson
155 |
156 | val expected = Json.obj(
157 | "group_id" -> 0.asJson,
158 | "name" -> "g".asJson,
159 | "jobs" -> Seq(job).asJson
160 | )
161 |
162 | actual shouldBe expected
163 | }
164 |
165 | "correctly parse JSON" in {
166 | val expected = group
167 | val actual = expected.asJson.as[Group].getOrElse(???)
168 |
169 | actual shouldBe expected
170 | }
171 | }
172 |
173 | "GroupStatus" must {
174 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd",
175 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6),
176 | None, Seq(EnvVar("a", "b")))
177 | val group = Group(0, "g", Seq(job))
178 | val periods = Seq(PeriodHealth("1", true), PeriodHealth("2", false))
179 | val jobStatus = JobStatus(job, 100, periods)
180 | val groupStatus = GroupStatus(group, Array(jobStatus))
181 |
182 | "produce correct JSON" in {
183 | val expected = Json.obj(
184 | "group" -> group.asJson,
185 | "status" -> Seq(jobStatus).asJson,
186 | )
187 |
188 | val actual = groupStatus.asJson
189 | actual shouldBe expected
190 | }
191 |
192 | "correctly parse JSON" in {
193 | val expected = groupStatus
194 | val actual = expected.asJson.as[GroupStatus].getOrElse(???)
195 |
196 | actual shouldBe expected
197 | }
198 | }
199 |
200 | "GroupStatusSummary" must {
201 | val jobStatus = Seq(JobStatusSummary(0, "j", 1, 1, Critical))
202 | val groupStatusSummary = GroupStatusSummary(2, "g", jobStatus)
203 | "produce correct JSON" in {
204 |
205 | val expected = Json.obj(
206 | "group_id" -> 2.asJson,
207 | "name" -> "g".asJson,
208 | "status" -> jobStatus.asJson,
209 | )
210 |
211 | val actual = groupStatusSummary.asJson
212 | actual shouldBe expected
213 | }
214 |
215 | "correctly parse JSON" in {
216 | val expected = groupStatusSummary
217 | val actual = expected.asJson.as[GroupStatusSummary].getOrElse(???)
218 |
219 | actual shouldBe expected
220 | }
221 | }
222 |
223 | "Job" must {
224 | val alertLevels = AlertLevels(3, 4, 5, 6)
225 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd",
226 | Hourly, 1, ZoneId.of("UTC"), 2, 0, alertLevels,
227 | None, Seq(EnvVar("a", "b")))
228 |
229 | "produce correct JSON when there is no owner and no info" in {
230 | val alertLevels = AlertLevels(3, 4, 5, 6)
231 | val actual = job.asJson
232 |
233 | val expected = Json.obj(
234 | "job_id" -> 1.asJson,
235 | "name" -> "j".asJson,
236 | "owner" -> Json.Null,
237 | "prometheus_id" -> "p".asJson,
238 | "cmd" -> "c".asJson,
239 | "time_pattern" -> "yyyy-MM-dd".asJson,
240 | "frequency" -> "hourly".asJson,
241 | "period_check_offset" -> 1.asJson,
242 | "timezone" -> Json.obj ("zone_id" -> "UTC".asJson),
243 | "lookback" -> 2.asJson,
244 | "start_at" -> 0.asJson,
245 | "alert_levels" -> alertLevels.asJson,
246 | "info" -> Json.Null,
247 | "env" -> Seq(EnvVar("a", "b")).asJson,
248 | )
249 |
250 | actual shouldBe expected
251 | }
252 |
253 | "produce correct JSON when owner exists" in {
254 | val alertLevels = AlertLevels(3, 4, 5, 6)
255 | val actual = job.copy(owner=Some("me"), info=Some("you")).asJson
256 |
257 | val expected = Json.obj(
258 | "job_id" -> 1.asJson,
259 | "name" -> "j".asJson,
260 | "owner" -> "me".asJson,
261 | "prometheus_id" -> "p".asJson,
262 | "cmd" -> "c".asJson,
263 | "time_pattern" -> "yyyy-MM-dd".asJson,
264 | "frequency" -> "hourly".asJson,
265 | "period_check_offset" -> 1.asJson,
266 | "timezone" -> Json.obj ("zone_id" -> "UTC".asJson),
267 | "lookback" -> 2.asJson,
268 | "start_at" -> 0.asJson,
269 | "alert_levels" -> alertLevels.asJson,
270 | "info" -> "you".asJson,
271 | "env" -> Seq(EnvVar("a", "b")).asJson,
272 | )
273 |
274 | actual shouldBe expected
275 | }
276 |
277 | "correctly parse JSON" in {
278 | val expected = job
279 | val actual = expected.asJson.as[Job].getOrElse(???)
280 |
281 | actual shouldBe expected
282 | }
283 | }
284 |
285 | "JobStatus" must {
286 | val job = Job(1, "j", None, "p", "c", "yyyy-MM-dd",
287 | Hourly, 1, ZoneId.of("UTC"), 2, 0, AlertLevels(3, 4, 5, 6),
288 | None, Seq(EnvVar("a", "b"))
289 | )
290 | val periods = Seq(PeriodHealth("1", true), PeriodHealth("2", false))
291 | val jobStatus = JobStatus(job, 100, periods)
292 |
293 | "produce correct JSON" in {
294 | val expected = Json.obj(
295 | "job" -> job.asJson,
296 | "updated_at" -> 100.asJson,
297 | "period_health" -> periods.asJson,
298 | )
299 |
300 | val actual = jobStatus.asJson
301 | actual shouldBe expected
302 | }
303 |
304 | "correctly parse JSON" in {
305 | val expected = jobStatus
306 | val actual = expected.asJson.as[JobStatus].getOrElse(???)
307 |
308 | actual shouldBe expected
309 | }
310 | }
311 |
312 | "JobStatusSummary" must {
313 | val jobStatusSummary = JobStatusSummary(0, "j", 1, 2, Critical)
314 | "produce correct JSON" in {
315 | val expected = Json.obj(
316 | "job_id" -> 0.asJson,
317 | "name" -> "j".asJson,
318 | "missing" -> 1.asJson,
319 | "oldest_missing_period" -> 2.asJson,
320 | "alert_level" -> "critical".asJson,
321 | )
322 |
323 | val actual = jobStatusSummary.asJson
324 | actual shouldBe expected
325 | }
326 |
327 | "correctly parse JSON" in {
328 | val expected = jobStatusSummary
329 | val actual = expected.asJson.as[JobStatusSummary].getOrElse(???)
330 |
331 | actual shouldBe expected
332 | }
333 | }
334 |
335 | "Lag" must {
336 | "produce correct JSON" in {
337 | val expected = Json.obj(
338 | "lag" -> 4.asJson,
339 | )
340 |
341 | val actual = Lag(4).asJson
342 | actual shouldBe expected
343 | }
344 |
345 | "correctly parse JSON" in {
346 | val expected = Lag(5)
347 | val actual = expected.asJson.as[Lag].getOrElse(???)
348 |
349 | actual shouldBe expected
350 | }
351 | }
352 |
353 | "PeriodHealth" must {
354 | "produce correct JSON" in {
355 | val expected = Json.obj(
356 | "period" -> "2020-06-25-18".asJson,
357 | "ok" -> false.asJson,
358 | )
359 |
360 | val actual = PeriodHealth("2020-06-25-18", false).asJson
361 | actual shouldBe expected
362 | }
363 |
364 | "correctly parse JSON" in {
365 | val expected = PeriodHealth("2020-06-25-18", false)
366 | val actual = expected.asJson.as[PeriodHealth].getOrElse(???)
367 |
368 | actual shouldBe expected
369 | }
370 | }
371 |
372 | "sysinfo" must {
373 | "produce correct JSON when namespace is missing" in {
374 | val json = sysinfo(None)
375 | val cursor = json.hcursor
376 | cursor.downField("version").as[Option[String]].isRight shouldBe true
377 | cursor.downField("namespace").as[Option[String]] shouldBe Right(None)
378 | cursor.downField("service").as[String] shouldBe Right("Greenish")
379 | cursor.downField("uptime").as[Long].isRight shouldBe true
380 | cursor.keys.get.size shouldBe 4
381 | }
382 |
383 | "produce correct JSON when namespace is not missing" in {
384 | val json = sysinfo(Some("my dashboard"))
385 | val cursor = json.hcursor
386 | cursor.downField("version").as[Option[String]].isRight shouldBe true
387 | cursor.downField("namespace").as[Option[String]] shouldBe Right(Some("my dashboard"))
388 | cursor.downField("service").as[String] shouldBe Right("Greenish")
389 | cursor.downField("uptime").as[Long].isRight shouldBe true
390 | cursor.keys.get.size shouldBe 4
391 | }
392 | }
393 | }
394 |
395 |
--------------------------------------------------------------------------------
/doc/images/greenish-with-background.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/test/scala/checker/CommandRunnerSpec.scala:
--------------------------------------------------------------------------------
1 | package me.amanj.greenish.checker
2 |
3 | import akka.actor.{ActorSystem, Props, ActorRef}
4 | import java.io.File
5 | import scala.concurrent.duration._
6 | import akka.testkit.{ ImplicitSender, TestKit }
7 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
8 | import org.scalatest.matchers.should.Matchers
9 | import org.scalatest.wordspec.AnyWordSpecLike
10 | import org.scalatest.concurrent.Eventually
11 |
12 | import me.amanj.greenish.models.PeriodHealth
13 | import me.amanj.greenish.stats.{StatsCollector, StatsCollectorSpec, GetPrometheus}
14 | import scala.jdk.CollectionConverters._
15 |
16 | import scala.language.postfixOps
17 | import scala.io.Source
18 |
19 | class CommandRunnerSpec()
20 | extends TestKit(ActorSystem("CommandRunnerSpec"))
21 | with ImplicitSender
22 | with AnyWordSpecLike
23 | with Matchers
24 | with Eventually
25 | with BeforeAndAfterEach
26 | with BeforeAndAfterAll {
27 |
28 | val src = s"/tmp/greenish-test-${System.currentTimeMillis}"
29 | val farFuture = System.currentTimeMillis * 2
30 | val dir = new File("/tmp/2020-06-07-01")
31 | val dirWithSpaces = new File("/tmp/2020-06-07 01")
32 | val lsSleep = getClass.getResource("/ls-sleep").getFile
33 | val ls = getClass.getResource("/test-ls").getFile
34 | val lsEnv = getClass.getResource("/test-ls-env").getFile
35 | val lsDup = getClass.getResource("/test-duplicate-period").getFile
36 | val lsPart = getClass.getResource("/test-partial-period").getFile
37 | implicit val patience: PatienceConfig = PatienceConfig(15 seconds, 1 second)
38 |
39 | var stats: ActorRef = _
40 | val outputDir = new File("/tmp/greenish/stdout")
41 |
42 | override def beforeAll: Unit = {
43 | dirWithSpaces.mkdirs
44 | outputDir.mkdirs
45 | dir.mkdirs
46 | }
47 |
48 | override def afterAll: Unit = {
49 | dir.delete
50 | dirWithSpaces.delete
51 | outputDir.delete
52 | TestKit.shutdownActorSystem(system)
53 | }
54 |
55 | override def afterEach(): Unit = {
56 | new File(src).delete
57 | }
58 |
59 | override def beforeEach(): Unit = {
60 | super.beforeEach()
61 | stats = system.actorOf(
62 | Props(new StatsCollector(Set("p1", "p2", "p3"))))
63 | }
64 |
65 | "parseOutput" must {
66 | "parse output lines correctly" in {
67 | val lines = LazyList(
68 | "greenish-period\t2020-02-17 8\t1",
69 | "greenish-period\t2020-02-17-9\t1",
70 | "greenish-period\t2020-02-17 10\t0",
71 | "greenish-period\t2020-02-17-11\t0",
72 | "greenish-period\t2020-02-17 10 38\t0",
73 | "Other output",
74 | "greenish-period 2020-02-17 10 38 0",
75 | "greenish-period\t2020-02-17 10 38\t9",
76 | )
77 | val periods = Set(
78 | "2020-02-17 8",
79 | "2020-02-17-9",
80 | "2020-02-17 10",
81 | "2020-02-17-11",
82 | )
83 |
84 | val expected = Seq(
85 | ("2020-02-17 8", true),
86 | ("2020-02-17-9", true),
87 | ("2020-02-17 10", false),
88 | ("2020-02-17-11", false),
89 | )
90 |
91 | val actual = CommandRunner.parseOutput(lines, periods)
92 |
93 | actual shouldBe expected
94 | }
95 |
96 | "ignore lines that do not match the period set" in {
97 | val lines = LazyList(
98 | "greenish-period\t2020-02-17-10\t1",
99 | "greenish-period\t2020-02-17-11\t0",
100 | )
101 | val periods = Set(
102 | "2020-02-17-10",
103 | )
104 |
105 | val expected = Seq(
106 | ("2020-02-17-10", true),
107 | )
108 |
109 | val actual = CommandRunner.parseOutput(lines, periods)
110 |
111 | actual shouldBe expected
112 | }
113 |
114 | "capture duplicate periods correctly" in {
115 | val lines = LazyList(
116 | "greenish-period\t2020-02-17-10\t1",
117 | "greenish-period\t2020-02-17-10\t0",
118 | "greenish-period\t2020-02-17-11\t0",
119 | )
120 | val periods = Set(
121 | "2020-02-17-10",
122 | "2020-02-17-11",
123 | )
124 |
125 | val expected = Seq(
126 | ("2020-02-17-10", true),
127 | ("2020-02-17-10", false),
128 | ("2020-02-17-11", false),
129 | )
130 |
131 | val actual = CommandRunner.parseOutput(lines, periods)
132 |
133 | actual shouldBe expected
134 | }
135 |
136 | "Have no problem if a period in the provided period-set wasn't in the output lines" in {
137 | val lines = LazyList(
138 | "greenish-period\t2020-02-17-10\t1",
139 | "greenish-period\t2020-02-17-11\t0",
140 | )
141 | val periods = Set(
142 | "2020-02-17-10",
143 | "2020-02-17-11",
144 | "2020-02-17-12",
145 | )
146 |
147 | val expected = Seq(
148 | ("2020-02-17-10", true),
149 | ("2020-02-17-11", false),
150 | )
151 |
152 | val actual = CommandRunner.parseOutput(lines, periods)
153 |
154 | actual shouldBe expected
155 | }
156 | }
157 |
158 | "toBashCommand" must {
159 | "single-quote the periods to avoid bash splitting" in {
160 | val periods = Seq("20 02", "30 03", "01 10", "400")
161 | val cmd = "hey this is a command"
162 | val actual = CommandRunner.toBashCommand(cmd, periods)
163 | val expected = "hey this is a command '20 02' '30 03' '01 10' '400'"
164 | actual shouldBe expected
165 | }
166 | }
167 |
168 | "BatchRun command" must {
169 |
170 | import StatsCollectorSpec.{checkSamples, getNoneZeroHistogramLabels}
171 |
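 | // Reading BatchRun's positional arguments from the usage below (the
 | // field names are guesses): command, periods, extra env pairs, group id,
 | // job id, prometheus label, lookback, and a deadline timestamp; runs
 | // whose deadline is already in the past are treated as expired.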
172 | "not run anything if the refresh command is too old" in {
173 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
174 | actor ! BatchRun(lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, 0)
175 | expectNoMessage(4 seconds)
176 | }
177 |
178 | "write debugging lines to disk verbatim" in {
179 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
180 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
181 |
182 | val _ = receiveOne(2 seconds)
183 |
184 | val expected = List("LETS PRINT THINGS", "DEBUG HERE TOO",
185 | "greenish-period\t2020-06-07-01\t1", "DEBUG HERE TOO",
186 | "greenish-period\t2020-06-07-02\t0", "DEBUG HERE")
187 | val actual = Source.fromFile(debugFile(outputDir, 0, 1)).getLines.toList
188 | actual shouldBe expected
189 | }
190 |
191 | "send back nothing, when command does not exit" in {
192 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
193 | actor ! BatchRun("a;kjdw", Seq.empty, Seq.empty, 0, 0, "p1", 0, farFuture)
194 | expectNoMessage(4 seconds)
195 | }
196 |
197 | "send back nothing, when command does not exit with 0" in {
198 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
199 | actor ! BatchRun("exit 1;", Seq.empty, Seq.empty, 0, 0, "p1", 0, farFuture)
200 | expectNoMessage(4 seconds)
201 | }
202 |
203 | "send back nothing, when command exits with 0, but not all periods are printed" in {
204 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
205 | actor ! BatchRun(lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
206 | expectNoMessage(4 seconds)
207 | }
208 |
209 | "send back nothing, when command exits with 0, but some periods are printed more than once" in {
210 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
211 | actor ! BatchRun(lsDup, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
212 | expectNoMessage(4 seconds)
213 | }
214 |
215 | "send back health for all periods, when command does exit with 0 with all periods printed exactly once" in {
216 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
217 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
218 | val expected = RunResult(Seq(
219 | PeriodHealth("2020-06-07-01", true),
220 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2)
221 | expectMsg(expected)
222 | }
223 |
224 | "Support spaces in the period pattern" in {
225 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
226 | actor ! BatchRun(s"$ls /tmp", Seq("2020-06-07 01", "2020-06-07 02"), Seq.empty, 0, 1, "p1", 2, farFuture)
227 | val expected = RunResult(Seq(
228 | PeriodHealth("2020-06-07 01", true),
229 | PeriodHealth("2020-06-07 02", false)), 0, 1, 2)
230 | expectMsg(expected)
231 | }
232 |
233 | "use provided environment variables" in {
234 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
235 | actor ! BatchRun(s"$lsEnv .", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
236 | val expected1 = RunResult(Seq(
237 | PeriodHealth("2020-06-07-01", false),
238 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2)
239 | expectMsg(expected1)
240 |
241 | actor ! BatchRun(s"$lsEnv .", Seq("2020-06-07-01", "2020-06-07-02"),
242 | Seq("GREENISH_VALUE_FOR_TEST" -> "/tmp"), 0, 1, "p1", 2,
243 | farFuture)
244 | val expected2 = RunResult(Seq(
245 | PeriodHealth("2020-06-07-01", true),
246 | PeriodHealth("2020-06-07-02", false)), 0, 1, 2)
247 | expectMsg(expected2)
248 | }
249 |
250 | "correctly send stats when command run is expired" in {
251 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
252 | actor ! BatchRun(
253 | s"exit 1", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, 0)
254 |
255 | eventually {
256 | stats ! GetPrometheus
257 |
258 | val expectedTotal = Seq(
259 | (Seq("p1"), 1.0),
260 | (Seq("p2"), 0.0),
261 | (Seq("p3"), 0.0),
262 | )
263 |
264 | val allZeros = Seq(
265 | (Seq("p1"), 0.0),
266 | (Seq("p2"), 0.0),
267 | (Seq("p3"), 0.0),
268 | )
269 |
270 | val prom = receiveOne(2 seconds)
271 | .asInstanceOf[StatsCollector.MetricsEntity]
272 | .samples.asScala.toList
273 |
274 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
275 | checkSamples(prom, "greenish_state_refresh_expired_total", expectedTotal)
276 | checkSamples(prom, "greenish_state_refresh_failed_total", allZeros)
277 | checkSamples(prom, "greenish_missing_periods_total", allZeros)
278 | checkSamples(prom, "greenish_oldest_missing_period", allZeros)
279 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros)
280 |
281 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds")
282 | actual shouldBe Set.empty
283 | }
284 | }
285 |
286 | "correctly send stats when command run fails" in {
287 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
288 | actor ! BatchRun(
289 | s"exit 1", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p1", 2, farFuture)
290 |
291 | eventually {
292 | stats ! GetPrometheus
293 |
294 | val expectedTotal = Seq(
295 | (Seq("p1"), 1.0),
296 | (Seq("p2"), 0.0),
297 | (Seq("p3"), 0.0),
298 | )
299 |
300 | val allZeros = Seq(
301 | (Seq("p1"), 0.0),
302 | (Seq("p2"), 0.0),
303 | (Seq("p3"), 0.0),
304 | )
305 |
306 | val prom = receiveOne(2 seconds)
307 | .asInstanceOf[StatsCollector.MetricsEntity]
308 | .samples.asScala.toList
309 |
310 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
311 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros)
312 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal)
313 | checkSamples(prom, "greenish_missing_periods_total", allZeros)
314 | checkSamples(prom, "greenish_oldest_missing_period", allZeros)
315 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros)
316 |
317 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds")
318 | actual shouldBe Set("p1")
319 | }
320 | }
321 |
322 | "correctly send stats when command run succeeds" in {
323 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
324 | actor ! BatchRun(
325 | s"$ls /tmp", Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture)
326 |
327 | eventually {
328 | stats ! GetPrometheus
329 |
330 | val expectedTotal = Seq(
331 | (Seq("p1"), 0.0),
332 | (Seq("p2"), 1.0),
333 | (Seq("p3"), 0.0),
334 | )
335 |
336 | val allZeros = Seq(
337 | (Seq("p1"), 0.0),
338 | (Seq("p2"), 0.0),
339 | (Seq("p3"), 0.0),
340 | )
341 |
342 | val prom = receiveOne(2 seconds)
343 | .asInstanceOf[StatsCollector.MetricsEntity]
344 | .samples.asScala.toList
345 |
346 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
347 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros)
348 | checkSamples(prom, "greenish_state_refresh_failed_total", allZeros)
349 | checkSamples(prom, "greenish_missing_periods_total", expectedTotal)
350 | checkSamples(prom, "greenish_oldest_missing_period", expectedTotal)
351 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros)
352 |
353 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds")
354 | actual shouldBe Set("p2")
355 | }
356 | }
357 |
358 | "correctly send stats when command run misses some periods" in {
359 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
360 | actor ! BatchRun(
361 | lsPart, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture)
362 |
363 | eventually {
364 | stats ! GetPrometheus
365 |
366 | val expectedTotal = Seq(
367 | (Seq("p1"), 0.0),
368 | (Seq("p2"), 1.0),
369 | (Seq("p3"), 0.0),
370 | )
371 |
372 | val allZeros = Seq(
373 | (Seq("p1"), 0.0),
374 | (Seq("p2"), 0.0),
375 | (Seq("p3"), 0.0),
376 | )
377 |
378 | val prom = receiveOne(2 seconds)
379 | .asInstanceOf[StatsCollector.MetricsEntity]
380 | .samples.asScala.toList
381 |
382 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
383 | checkSamples(prom, "greenish_state_refresh_expired_total", allZeros)
384 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal)
385 | checkSamples(prom, "greenish_missing_periods_total", allZeros)
386 | checkSamples(prom, "greenish_oldest_missing_period", allZeros)
387 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros)
388 |
389 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds")
390 | actual shouldBe Set("p2")
391 | }
392 | }
393 |
394 | "correctly send stats when command run prints duplicate periods" in {
395 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
396 | actor ! BatchRun(
397 | lsDup, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p2", 2, farFuture)
398 |
399 | eventually {
400 | stats ! GetPrometheus
401 |
402 | val expectedTotal = Seq(
403 | (Seq("p1"), 0.0),
404 | (Seq("p2"), 1.0),
405 | (Seq("p3"), 0.0),
406 | )
407 |
408 | val allZeros = Seq(
409 | (Seq("p1"), 0.0),
410 | (Seq("p2"), 0.0),
411 | (Seq("p3"), 0.0),
412 | )
413 |
414 | val prom = receiveOne(2 seconds)
415 | .asInstanceOf[StatsCollector.MetricsEntity]
416 | .samples.asScala.toList
417 |
418 | checkSamples(prom, "greenish_state_refresh_total", expectedTotal)
419 | checkSamples(prom, "greenish_state_refresh_failed_total", expectedTotal)
420 | checkSamples(prom, "greenish_missing_periods_total", allZeros)
421 | checkSamples(prom, "greenish_oldest_missing_period", allZeros)
422 | checkSamples(prom, "greenish_active_refresh_tasks", allZeros)
423 |
424 | val actual = getNoneZeroHistogramLabels(prom, "greenish_state_refresh_time_seconds")
425 | actual shouldBe Set("p2")
426 | }
427 | }
428 |
429 | "correctly compute active refresh stats" in {
430 | val actor = system.actorOf(Props(new CommandRunner(stats, outputDir)))
431 | actor ! BatchRun(
432 | lsSleep, Seq("2020-06-07-01", "2020-06-07-02"), Seq.empty, 0, 1, "p3", 2, farFuture)
433 |
434 | eventually {
435 | stats ! GetPrometheus
436 |
437 | val expected = Seq(
438 | (Seq("p1"), 0.0),
439 | (Seq("p2"), 0.0),
440 | (Seq("p3"), 1.0),
441 | )
442 |
443 | val prom = receiveOne(2 seconds)
444 | .asInstanceOf[StatsCollector.MetricsEntity]
445 | .samples.asScala.toList
446 |
447 | checkSamples(prom, "greenish_active_refresh_tasks", expected)
448 | }
449 |
450 | eventually {
451 | stats ! GetPrometheus
452 |
453 | val expected = Seq(
454 | (Seq("p1"), 0.0),
455 | (Seq("p2"), 0.0),
456 | (Seq("p3"), 0.0),
457 | )
458 |
459 | val prom = receiveOne(2 seconds)
460 | .asInstanceOf[StatsCollector.MetricsEntity]
461 | .samples.asScala.toList
462 |
463 | checkSamples(prom, "greenish_active_refresh_tasks", expected)
464 | }
465 | }
466 | }
467 |
468 | "write" must {
469 | "write lines to disk" in {
470 | val data = LazyList("first", "second")
471 | CommandRunner.write(src, data)
472 | val expected = data.toList
473 |
474 | val actual = Source.fromFile(src).getLines.toList
475 | actual shouldBe expected
476 | }
477 | }
478 | }
479 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/greenish-logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------