├── .gitignore ├── docker ├── build.sh ├── README └── Dockerfile ├── docs └── gitbook │ ├── appendix │ ├── README.md │ └── maelstromd_env_vars.md │ ├── how_works │ ├── README.md │ └── architecture.md │ ├── production │ ├── README.md │ ├── prune.md │ └── shutdown.md │ ├── getting_started │ ├── README.md │ ├── next_steps.md │ ├── remove_project.md │ ├── prerequisites.md │ ├── create_project.md │ ├── update_project.md │ ├── what_happened.md │ ├── activate_component.md │ └── installation.md │ ├── event_sources │ ├── README.md │ ├── toggle.md │ ├── cron.md │ ├── http.md │ ├── aws_stepfunc.md │ └── aws_sqs.md │ ├── SUMMARY.md │ └── README.md ├── pkg ├── maelstrom │ ├── notify.go │ ├── puller.go │ ├── pullstate.go │ ├── aws.go │ ├── resolver_test.go │ ├── gateway.go │ ├── sort.go │ ├── integration_test.go │ ├── pruner.go │ ├── resolver.go │ ├── logs.go │ ├── cluster.go │ └── placement.go ├── v1 │ ├── component.go │ └── event_source.go ├── common │ ├── time.go │ ├── model.go │ ├── math.go │ ├── http.go │ ├── crypto.go │ ├── string_test.go │ ├── util.go │ └── string.go ├── evsource │ ├── evsource.go │ ├── cron │ │ └── cron.go │ ├── poller │ │ └── evpoller.go │ └── aws │ │ ├── stepfunc │ │ └── stepfunc.go │ │ └── sqs │ │ └── sqs.go ├── revproxy │ ├── pool.go │ ├── request.go │ ├── dispenser.go │ └── revproxy.go ├── test │ ├── event_source.go │ ├── component.go │ └── project.go ├── vm │ ├── create.go │ └── iface.go ├── router │ ├── registry.go │ ├── router_test.go │ └── router.go ├── db │ └── db.go ├── converge │ ├── complock.go │ ├── converger_test.go │ └── registry.go ├── config │ ├── config_test.go │ └── config.go └── cert │ └── letsencrypt.go ├── scripts ├── gofmt_check.sh └── install_deps.sh ├── .gitlab-ci.yml ├── Dockerfile ├── cloud └── aws │ └── mael-init-node.sh ├── .github └── workflows │ └── test.yml ├── go.mod ├── roadmap.md ├── README.md ├── Makefile └── design └── overview.md /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | *.iml 3 | pkg/v1/test.db 4 | tmp 5 | docs/gitbook/_book 6 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | docker build -t coopernurse/maelstrom-build . 4 | -------------------------------------------------------------------------------- /docs/gitbook/appendix/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Appendixes 3 | 4 | This section contains reference material on `maelstromd` and the project YAML format. -------------------------------------------------------------------------------- /docs/gitbook/how_works/README.md: -------------------------------------------------------------------------------- 1 | # How it Works 2 | 3 | This section describes various aspects of how **maelstrom** is designed and implemented. 4 | 5 | -------------------------------------------------------------------------------- /docs/gitbook/production/README.md: -------------------------------------------------------------------------------- 1 | # Running in Production 2 | 3 | This section covers how to reliably run **maelstrom** in production environments. 
4 | 5 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This chapter explains how to install **maelstrom** and get a simple project running. 4 | 5 | -------------------------------------------------------------------------------- /pkg/maelstrom/notify.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import v1 "github.com/coopernurse/maelstrom/pkg/v1" 4 | 5 | type ComponentSubscriber interface { 6 | OnComponentNotification(change v1.DataChangedUnion) 7 | } 8 | -------------------------------------------------------------------------------- /pkg/v1/component.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | func HealthCheckSeconds(d *DockerComponent) int64 { 4 | seconds := d.HttpStartHealthCheckSeconds 5 | if seconds <= 0 { 6 | seconds = 60 7 | } 8 | return seconds 9 | } 10 | -------------------------------------------------------------------------------- /scripts/gofmt_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | out=$(gofmt -l cmd pkg) 4 | if [ -n "$out" ]; then 5 | echo "ERROR: Some files require gofmt formatting:" 6 | echo $out 7 | echo 8 | echo "Failing build" 9 | exit 1 10 | fi 11 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/next_steps.md: -------------------------------------------------------------------------------- 1 | 2 | # Next Steps 3 | 4 | If you've made it this far you've gotten a good overview of what **maelstrom** can do. 5 | 6 | Next try creating a new `maelstrom.yml` file for your project, registering components for your own 7 | docker images. 8 | -------------------------------------------------------------------------------- /docs/gitbook/event_sources/README.md: -------------------------------------------------------------------------------- 1 | # Event Sources 2 | 3 | Event Sources are external gateways into maelstrom that map to a particular 4 | component. When an event is received, the component associated with the event 5 | source is invoked. 6 | 7 | Over time **maelstrom** will support additional event sources.
8 | 9 | -------------------------------------------------------------------------------- /pkg/common/time.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import "time" 4 | 5 | func NowMillis() int64 { 6 | return TimeToMillis(time.Now()) 7 | } 8 | 9 | func TimeToMillis(t time.Time) int64 { 10 | return t.UnixNano() / 1e6 11 | } 12 | 13 | func MillisToTime(millis int64) time.Time { 14 | return time.Unix(0, millis*1e6) 15 | } 16 | -------------------------------------------------------------------------------- /pkg/evsource/evsource.go: -------------------------------------------------------------------------------- 1 | package evsource 2 | 3 | import ( 4 | "context" 5 | ) 6 | 7 | type PollCreator interface { 8 | NewPoller() Poller 9 | ComponentName() string 10 | RoleIdPrefix() string 11 | MaxConcurrency() int 12 | MaxConcurrencyPerPoller() int 13 | } 14 | 15 | type Poller func(ctx context.Context, concurrency int, roleId string) 16 | -------------------------------------------------------------------------------- /pkg/common/model.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | ) 7 | 8 | const PingLogMsg = "PING" 9 | 10 | type LogMsg struct { 11 | Component string 12 | Stream string 13 | Data string 14 | } 15 | 16 | func (m LogMsg) Format() string { 17 | return fmt.Sprintf("[%s]\t%s", m.Component, strings.TrimSpace(m.Data)) 18 | } 19 | -------------------------------------------------------------------------------- /docker/README: -------------------------------------------------------------------------------- 1 | This docker image is used by the CI job. 2 | 3 | It's derived from the standard golang docker image and adds the current 4 | version of docker. We link it to a docker-in-docker image via gitlab-ci.yml 5 | and this allows the CI job to start/stop docker containers as required by 6 | the tests. 7 | 8 | This image will need to be updated whenever we want to move to a newer 9 | version of Go. 10 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/remove_project.md: -------------------------------------------------------------------------------- 1 | 2 | # Remove Project 3 | 4 | Removing a project is as simple as: 5 | 6 | ``` 7 | $ /usr/local/bin/maelctl project rm hello-mael 8 | Project removed: hello-mael 9 | ``` 10 | 11 | Or via docker: 12 | 13 | ``` 14 | docker exec maelstromd maelctl project rm hello-mael 15 | ``` 16 | 17 | This removes all components and event sources contained in the project file. 
18 | -------------------------------------------------------------------------------- /pkg/common/math.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | func MaxInt64(vals ...int64) int64 { 4 | if len(vals) == 0 { 5 | return 0 6 | } 7 | 8 | max := vals[0] 9 | for i := 1; i < len(vals); i++ { 10 | if vals[i] > max { 11 | max = vals[i] 12 | } 13 | } 14 | return max 15 | } 16 | 17 | func DefaultInt64(v int64, defaultVal int64) int64 { 18 | if v == 0 { 19 | return defaultVal 20 | } 21 | return v 22 | } 23 | -------------------------------------------------------------------------------- /pkg/revproxy/pool.go: -------------------------------------------------------------------------------- 1 | package revproxy 2 | 3 | import "sync" 4 | 5 | func NewProxyBufferPool() *ProxyBufferPool { 6 | pool := &sync.Pool{ 7 | New: func() interface{} { 8 | return make([]byte, 32*1024) 9 | }, 10 | } 11 | return &ProxyBufferPool{pool: pool} 12 | } 13 | 14 | type ProxyBufferPool struct { 15 | pool *sync.Pool 16 | } 17 | 18 | func (p *ProxyBufferPool) Get() []byte { 19 | b, ok := p.pool.Get().([]byte) 20 | if !ok { 21 | panic("pool did not return a []byte") 22 | } 23 | return b 24 | } 25 | 26 | func (p *ProxyBufferPool) Put(b []byte) { 27 | p.pool.Put(b) 28 | } 29 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.11-stretch 2 | 3 | # install docker 4 | RUN apt-get update && \ 5 | apt-get install -y \ 6 | apt-transport-https \ 7 | ca-certificates \ 8 | curl \ 9 | gnupg2 \ 10 | software-properties-common && \ 11 | bash -c 'curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -' && \ 12 | add-apt-repository \ 13 | "deb [arch=amd64] https://download.docker.com/linux/debian \ 14 | $(lsb_release -cs) \ 15 | stable" && \ 16 | apt-get update && \ 17 | apt-get install -y docker-ce libsqlite3-dev 18 | 19 | -------------------------------------------------------------------------------- /pkg/revproxy/request.go: -------------------------------------------------------------------------------- 1 | package revproxy 2 | 3 | import ( 4 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 5 | "net/http" 6 | "time" 7 | ) 8 | 9 | func NewRequest(req *http.Request, rw http.ResponseWriter, comp *v1.Component, preferLocal bool) *Request { 10 | return &Request{ 11 | Req: req, 12 | Rw: rw, 13 | Component: comp, 14 | StartTime: time.Now(), 15 | PreferLocal: preferLocal, 16 | Done: make(chan bool, 1), 17 | } 18 | } 19 | 20 | type Request struct { 21 | Req *http.Request 22 | Rw http.ResponseWriter 23 | Component *v1.Component 24 | StartTime time.Time 25 | PreferLocal bool 26 | Done chan bool 27 | } 28 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: coopernurse/maelstrom-build:latest 2 | services: 3 | - docker:dind 4 | 5 | variables: 6 | REPO_NAME: gitlab.com/coopernurse/maelstrom 7 | DOCKER_HOST: tcp://docker:2375/ 8 | DOCKER_DRIVER: overlay2 9 | DIND_HOST: docker 10 | 11 | before_script: 12 | - export CI_CACHEDIR=`pwd`/.cache 13 | - export GOPATH="$CI_CACHEDIR" 14 | - export PATH="$PATH:$CI_CACHEDIR/bin" 15 | - export PYTHONPATH="$CI_CACHEDIR/lib/python2.7/site-packages" 16 | - mkdir -p "$CI_CACHEDIR" 17 | - ./scripts/install_deps.sh 18 | 19 | cache: 20 | paths: 21 | - 
.cache 22 | 23 | stages: 24 | - test 25 | 26 | test: 27 | stage: test 28 | script: 29 | - make idl 30 | - make test 31 | - make maelctl 32 | - make maelstromd 33 | -------------------------------------------------------------------------------- /pkg/test/event_source.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import v1 "github.com/coopernurse/maelstrom/pkg/v1" 4 | 5 | func ValidPutEventSourceInput(eventSourceName string, componentName string) v1.PutEventSourceInput { 6 | return v1.PutEventSourceInput{ 7 | EventSource: v1.EventSource{ 8 | Name: eventSourceName, 9 | ComponentName: componentName, 10 | Http: &v1.HttpEventSource{ 11 | Hostname: "www.example.com", 12 | }, 13 | }, 14 | } 15 | } 16 | 17 | func SanitizeEventSources(list []v1.EventSourceWithStatus) { 18 | for i, es := range list { 19 | list[i] = SanitizeEventSource(&es) 20 | } 21 | } 22 | 23 | func SanitizeEventSource(es *v1.EventSourceWithStatus) v1.EventSourceWithStatus { 24 | es.EventSource.Version = 0 25 | es.EventSource.ModifiedAt = 0 26 | return *es 27 | } 28 | -------------------------------------------------------------------------------- /pkg/test/component.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import v1 "github.com/coopernurse/maelstrom/pkg/v1" 4 | 5 | func ValidComponent(componentName string) v1.PutComponentInput { 6 | return v1.PutComponentInput{ 7 | Component: v1.Component{ 8 | Name: componentName, 9 | Docker: &v1.DockerComponent{ 10 | Image: "coopernurse/foo", 11 | HttpPort: 8080, 12 | }, 13 | }, 14 | } 15 | } 16 | 17 | func SanitizeComponentsWithEventSources(list []v1.ComponentWithEventSources) { 18 | for i, ces := range list { 19 | list[i].Component = SanitizeComponent(&ces.Component) 20 | for x, es := range ces.EventSources { 21 | ces.EventSources[x] = SanitizeEventSource(&es) 22 | } 23 | } 24 | } 25 | 26 | func SanitizeComponent(c *v1.Component) v1.Component { 27 | c.Version = 0 28 | c.ModifiedAt = 0 29 | return *c 30 | } 31 | -------------------------------------------------------------------------------- /pkg/v1/event_source.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func StrToEventSourceType(esType string) (EventSourceType, error) { 8 | es := EventSourceType(esType) 9 | all := []EventSourceType{EventSourceTypeSqs, EventSourceTypeCron, EventSourceTypeHttp, 10 | EventSourceTypeAwsstepfunc} 11 | for _, t := range all { 12 | if t == es { 13 | return t, nil 14 | } 15 | } 16 | return "", fmt.Errorf("invalid EventSourceType: %s", esType) 17 | } 18 | 19 | func GetEventSourceType(e EventSource) EventSourceType { 20 | if e.Http != nil { 21 | return EventSourceTypeHttp 22 | } else if e.Cron != nil { 23 | return EventSourceTypeCron 24 | } else if e.Sqs != nil { 25 | return EventSourceTypeSqs 26 | } else if e.Awsstepfunc != nil { 27 | return EventSourceTypeAwsstepfunc 28 | } else { 29 | panic("Unknown eventType for EventSource") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # build stage 2 | FROM debian:stretch-slim AS build-env 3 | RUN apt-get update && apt install -y ca-certificates 4 | RUN apt install -y curl python-pip make 5 | RUN cd /usr/local && curl -LO https://dl.google.com/go/go1.13.linux-amd64.tar.gz && \ 6 | tar zxf 
go1.13.linux-amd64.tar.gz && rm -f go1.13.linux-amd64.tar.gz 7 | RUN pip install --pre barrister 8 | RUN apt install -y git libsqlite3-dev 9 | ENV GOROOT=/usr/local/go 10 | ENV PATH="${GOROOT}/bin:/root/go/bin:${PATH}" 11 | RUN go get github.com/coopernurse/barrister-go && go install github.com/coopernurse/barrister-go/idl2go 12 | ADD . /src 13 | RUN cd /src && make idl && make maelstromd && make maelctl 14 | 15 | # final stage 16 | FROM debian:stretch-slim 17 | RUN apt-get update && apt install -y ca-certificates libsqlite3-dev 18 | WORKDIR /app 19 | COPY --from=build-env /src/dist/maelstromd /usr/bin 20 | COPY --from=build-env /src/dist/maelctl /usr/bin 21 | -------------------------------------------------------------------------------- /pkg/vm/create.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "github.com/coopernurse/maelstrom/pkg/common" 5 | "github.com/coopernurse/maelstrom/pkg/config" 6 | ) 7 | 8 | func CreateCluster(cfg config.Config, opts CreateClusterOptions, adapter Adapter) (CreateClusterOut, error) { 9 | out := CreateClusterOut{ClusterName: cfg.Cluster.Name} 10 | 11 | // - create ssh key pair 12 | 13 | err := common.MakeSSHKeyPair(opts.SSHPublicKeyFile, opts.SSHPrivateKeyFile, 2048) 14 | if err != nil { 15 | return out, err 16 | } 17 | 18 | // - Create CA key pair (for cluster node<->node communication) 19 | // - Cloud: 20 | // - register ssh key 21 | // - Create firewall 22 | // - Create load balancer 23 | // - Create first node 24 | return adapter.CreateCluster(cfg, opts) 25 | 26 | // - copy files to node: 27 | // - ssh private key 28 | // - Maelstrom binaries 29 | // - CA key pair 30 | // - Config file 31 | // - systemd unit file 32 | // - register and start maelstromd service 33 | } 34 | -------------------------------------------------------------------------------- /pkg/maelstrom/puller.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "github.com/coopernurse/maelstrom/pkg/db" 5 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 6 | docker "github.com/docker/docker/client" 7 | log "github.com/mgutz/logxi/v1" 8 | ) 9 | 10 | func NewImagePuller(dockerClient *docker.Client, db db.Db, pullState *PullState) *ImagePuller { 11 | return &ImagePuller{ 12 | dockerClient: dockerClient, 13 | db: db, 14 | pullState: pullState, 15 | } 16 | } 17 | 18 | type ImagePuller struct { 19 | dockerClient *docker.Client 20 | db db.Db 21 | pullState *PullState 22 | } 23 | 24 | func (i *ImagePuller) OnComponentNotification(change v1.DataChangedUnion) { 25 | if change.PutComponent != nil { 26 | comp, err := i.db.GetComponent(change.PutComponent.Name) 27 | if err == nil { 28 | if comp.Docker.PullImageOnPut { 29 | i.pullState.Pull(comp, false) 30 | } 31 | } else { 32 | log.Error("puller: unable to GetComponent", "err", err, "component", change.PutComponent.Name) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/prerequisites.md: -------------------------------------------------------------------------------- 1 | 2 | # Prerequisites 3 | 4 | ## Docker 5 | 6 | You must have Docker installed and running before you can install `maelstromd`. 7 | If you don't have Docker running, follow the 8 | [Docker Engine installation guide](https://docs.docker.com/install/) 9 | 10 | Test your Docker installation by running the `hello-world` image. 
You should see something like this: 11 | 12 | ``` 13 | $ docker run hello-world 14 | 15 | Hello from Docker! 16 | This message shows that your installation appears to be working correctly. 17 | ``` 18 | 19 | Once this is working you're ready to install maelstrom. 20 | 21 | ## Root access? 22 | 23 | Recent Docker installations create a `docker` group. Members of this group can connect to the local Docker daemon 24 | without being root. 25 | 26 | If you try the above `docker run hello-world` and get a permission denied error you might try running the command 27 | via sudo `sudo docker run hello-world`. If this works, you'll need to run `maelstromd` via sudo as well since it 28 | needs the ability to communicate with the Docker daemon. 29 | -------------------------------------------------------------------------------- /pkg/test/project.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "fmt" 5 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 6 | "math/rand" 7 | ) 8 | 9 | func ValidProject(projectName string) v1.PutProjectInput { 10 | num := rand.Intn(5) + 1 11 | components := make([]v1.ComponentWithEventSources, num) 12 | for i := 0; i < num; i++ { 13 | compName := fmt.Sprintf("comp-%d", i) 14 | comp := ValidComponent(compName) 15 | comp.Component.ProjectName = projectName 16 | esNum := rand.Intn(5) 17 | eventSources := make([]v1.EventSourceWithStatus, esNum) 18 | for x := 0; x < esNum; x++ { 19 | esName := fmt.Sprintf("es-%d-%d", i, x) 20 | es := ValidPutEventSourceInput(esName, compName).EventSource 21 | es.ProjectName = projectName 22 | eventSources[x] = v1.EventSourceWithStatus{ 23 | EventSource: es, 24 | Enabled: true, 25 | } 26 | } 27 | components[i] = v1.ComponentWithEventSources{ 28 | Component: comp.Component, 29 | EventSources: eventSources, 30 | } 31 | } 32 | 33 | return v1.PutProjectInput{ 34 | Project: v1.Project{ 35 | Name: projectName, 36 | Components: components, 37 | }, 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /scripts/install_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # install barrister 4 | cmd=$(command -v barrister) 5 | if [ -z "$cmd" ]; then 6 | echo "Installing barrister" 7 | set -e 8 | # opts="--user" 9 | # if [ -n "$CI_CACHEDIR" ]; then 10 | # apt-get update 11 | # apt-get install -y python-setuptools 12 | # easy_install pip 13 | # opts="--install-option=--prefix=$CI_CACHEDIR --ignore-installed" 14 | # pip install $opts setuptools 15 | # fi 16 | pip install --pre --user barrister 17 | set +e 18 | else 19 | echo "Found barrister: $cmd" 20 | fi 21 | 22 | # install barrister-go 23 | cmd=$(command -v idl2go) 24 | if [ -z "$cmd" ]; then 25 | echo "Installing barrister-go / idl2go" 26 | set -e 27 | go get github.com/coopernurse/barrister-go 28 | go install github.com/coopernurse/barrister-go/idl2go 29 | set +e 30 | else 31 | echo "Found idl2go: $cmd" 32 | fi 33 | 34 | # install errcheck 35 | cmd=$(command -v errcheck) 36 | if [ -z "$cmd" ]; then 37 | echo "Installing errcheck" 38 | set -e 39 | go get github.com/kisielk/errcheck 40 | set +e 41 | else 42 | echo "Found errcheck: $cmd" 43 | fi 44 | -------------------------------------------------------------------------------- /docs/gitbook/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Maelstrom 2 | 3 | * [Introduction](README.md) 4 | * [Getting 
Started](getting_started/README.md) 5 | * [Prerequisites](getting_started/prerequisites.md) 6 | * [Download and Install](getting_started/installation.md) 7 | * [Create Project](getting_started/create_project.md) 8 | * [Activate Component](getting_started/activate_component.md) 9 | * [What Happened](getting_started/what_happened.md) 10 | * [Update Project](getting_started/update_project.md) 11 | * [Remove Project](getting_started/remove_project.md) 12 | * [Next Steps](getting_started/next_steps.md) 13 | * [Event Sources](event_sources/README.md) 14 | * [HTTP](event_sources/http.md) 15 | * [Cron](event_sources/cron.md) 16 | * [AWS SQS](event_sources/aws_sqs.md) 17 | * [AWS Step Functions](event_sources/aws_stepfunc.md) 18 | * [Toggle On/Off](event_sources/toggle.md) 19 | * [How it Works](how_works/README.md) 20 | * [Architecture](how_works/architecture.md) 21 | * [Running in Production](production/README.md) 22 | * [Pruning Images](production/prune.md) 23 | * [Graceful Shutdown](production/shutdown.md) 24 | * [Appendixes](appendix/README.md) 25 | * [maelstromd Environment Variables](appendix/maelstromd_env_vars.md) 26 | * [Project YAML reference](appendix/project_yaml_ref.md) 27 | -------------------------------------------------------------------------------- /pkg/maelstrom/pullstate.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "github.com/coopernurse/maelstrom/pkg/common" 5 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 6 | docker "github.com/docker/docker/client" 7 | log "github.com/mgutz/logxi/v1" 8 | "sync" 9 | ) 10 | 11 | func NewPullState(dockerClient *docker.Client) *PullState { 12 | return &PullState{ 13 | dockerClient: dockerClient, 14 | pulledVerByComponent: make(map[string]int64), 15 | lock: &sync.Mutex{}, 16 | } 17 | } 18 | 19 | type PullState struct { 20 | dockerClient *docker.Client 21 | pulledVerByComponent map[string]int64 22 | lock *sync.Mutex 23 | } 24 | 25 | func (p *PullState) Pull(c v1.Component, forcePull bool) { 26 | p.lock.Lock() 27 | ver := p.pulledVerByComponent[c.Name] 28 | p.lock.Unlock() 29 | 30 | if ver != c.Version || forcePull { 31 | err := common.PullImage(p.dockerClient, c) 32 | if err != nil { 33 | log.Warn("component: unable to pull image", "err", err.Error(), "component", c.Name, 34 | "image", c.Docker.Image) 35 | } else { 36 | p.lock.Lock() 37 | p.pulledVerByComponent[c.Name] = c.Version 38 | p.lock.Unlock() 39 | log.Info("component: successfully pulled image", "component", c.Name, "image", c.Docker.Image) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/create_project.md: -------------------------------------------------------------------------------- 1 | # Create Project 2 | 3 | ## Create maelstrom.yml 4 | 5 | Open another shell and create a new file: `maelstrom.yml` with the following contents: 6 | 7 | ```yaml 8 | # example project 9 | --- 10 | name: hello-mael 11 | components: 12 | hello: 13 | image: docker.io/coopernurse/go-hello-http 14 | httpport: 8080 15 | httphealthcheckpath: / 16 | reservememory: 128 17 | eventsources: 18 | hello_http: 19 | http: 20 | hostname: hello.localhost 21 | ``` 22 | 23 | ## Register file 24 | 25 | Use `maelctl` to register this project with `maelstromd`: 26 | 27 | ``` 28 | $ /usr/local/bin/maelctl project put 29 | ``` 30 | 31 | Or via docker (we volume mounted the current dir to `/app` so we can access it there in the container): 32 | 33 | ``` 34 | 
docker exec maelstromd maelctl project put 35 | ``` 36 | 37 | You should see this output: 38 | 39 | ``` 40 | Project saved: hello-mael from file: maelstrom.yml 41 | Type Name Action 42 | Component hello-mael_hello Added 43 | EventSource hello-mael_hello_http Added 44 | ``` 45 | 46 | `maelctl project put` uses `maelstrom.yml` by default, but you can specify a different 47 | path using the `--file` switch if desired. 48 | -------------------------------------------------------------------------------- /cloud/aws/mael-init-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | role="maelnode" 4 | 5 | EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` 6 | EC2_INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` 7 | EC2_REGION="`echo \"$EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" 8 | 9 | # create systemd unit 10 | cat <<EOF > /etc/systemd/system/maelstromd.service 11 | [Unit] 12 | Description=maelstromd 13 | After=docker.service 14 | [Service] 15 | TimeoutStartSec=0 16 | Restart=always 17 | RestartSec=5 18 | Environment=AWS_REGION=${EC2_REGION} 19 | Environment=MAEL_INSTANCE_ID=${EC2_INSTANCE_ID} 20 | Environment=MAEL_SQL_DRIVER=${MAEL_SQL_DRIVER} 21 | Environment=MAEL_SQL_DSN=${MAEL_SQL_DSN} 22 | Environment=MAEL_AWS_TERMINATE_QUEUE_URL=${MAEL_AWS_TERMINATE_QUEUE_URL} 23 | Environment=MAEL_SHUTDOWN_PAUSE_SECONDS=5 24 | ExecStartPre=/bin/mkdir -p /var/maelstrom 25 | ExecStartPre=/bin/chmod 700 /var/maelstrom 26 | ExecStart=/usr/bin/maelstromd 27 | [Install] 28 | WantedBy=multi-user.target 29 | EOF 30 | chmod 600 /etc/systemd/system/maelstromd.service 31 | 32 | # set hostname 33 | hostname="${role}-${EC2_INSTANCE_ID}" 34 | sudo hostname ${hostname} 35 | sudo bash -c "echo ${hostname} > /etc/hostname" 36 | 37 | # start docker 38 | systemctl restart docker 39 | 40 | # start maelstromd 41 | systemctl daemon-reload 42 | systemctl enable maelstromd 43 | systemctl start maelstromd 44 | -------------------------------------------------------------------------------- /pkg/vm/iface.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "fmt" 5 | "github.com/coopernurse/maelstrom/pkg/config" 6 | "time" 7 | ) 8 | 9 | var NotFound = fmt.Errorf("not found") 10 | 11 | type Adapter interface { 12 | Name() string 13 | CreateCluster(cfg config.Config, opts CreateClusterOptions) (CreateClusterOut, error) 14 | DestroyCluster(cfg config.Config) error 15 | GetClusterInfo(cfg config.Config) (ClusterInfo, error) 16 | CreateVM(cfg config.Config, opts CreateVMOptions) (VM, error) 17 | DestroyVM(opts DestroyVMOptions) error 18 | ListVMs(opts ListVMsOptions) ([]VM, error) 19 | } 20 | 21 | type VM struct { 22 | Id string 23 | CreatedAt time.Time 24 | PublicIpAddr string 25 | PrivateIpAddr string 26 | } 27 | 28 | type LoadBalancer struct { 29 | PublicIpAddr string 30 | Hostname string 31 | } 32 | 33 | type CreateClusterOptions struct { 34 | SSHPublicKeyFile string 35 | SSHPrivateKeyFile string 36 | } 37 | 38 | type CreateClusterOut struct { 39 | ClusterName string 40 | RootVM VM 41 | } 42 | 43 | type ClusterInfo struct { 44 | ClusterName string 45 | LoadBalancer *LoadBalancer 46 | VMs []VM 47 | Meta map[string]string 48 | } 49 | 50 | type CreateVMOptions struct { 51 | VMName string 52 | Tags []string 53 | } 54 | 55 | type DestroyVMOptions struct { 56 | Ids []string 57 | } 58 | 59 | type ListVMsOptions struct { 60 | ClusterName string 61 | } 62 | 
-------------------------------------------------------------------------------- /pkg/common/http.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "golang.org/x/net/http2" 5 | "net" 6 | "net/http" 7 | "time" 8 | ) 9 | 10 | type HTTPClientSettings struct { 11 | ConnectTimeout time.Duration 12 | ConnKeepAlive time.Duration 13 | ExpectContinue time.Duration 14 | IdleConnTimeout time.Duration 15 | MaxAllIdleConns int 16 | MaxHostIdleConns int 17 | ResponseHeaderTimeout time.Duration 18 | TLSHandshakeTimeout time.Duration 19 | } 20 | 21 | func NewHTTPClientWithSettings(httpSettings HTTPClientSettings) (*http.Client, error) { 22 | tr := &http.Transport{ 23 | ResponseHeaderTimeout: httpSettings.ResponseHeaderTimeout, 24 | Proxy: http.ProxyFromEnvironment, 25 | DialContext: (&net.Dialer{ 26 | KeepAlive: httpSettings.ConnKeepAlive, 27 | DualStack: true, 28 | Timeout: httpSettings.ConnectTimeout, 29 | }).DialContext, 30 | MaxIdleConns: httpSettings.MaxAllIdleConns, 31 | IdleConnTimeout: httpSettings.IdleConnTimeout, 32 | TLSHandshakeTimeout: httpSettings.TLSHandshakeTimeout, 33 | MaxIdleConnsPerHost: httpSettings.MaxHostIdleConns, 34 | ExpectContinueTimeout: httpSettings.ExpectContinue, 35 | } 36 | 37 | // So client makes HTTP/2 requests 38 | err := http2.ConfigureTransport(tr) 39 | if err != nil { 40 | return nil, err 41 | } 42 | 43 | return &http.Client{ 44 | Transport: tr, 45 | }, nil 46 | } 47 | -------------------------------------------------------------------------------- /pkg/router/registry.go: -------------------------------------------------------------------------------- 1 | package router 2 | 3 | import ( 4 | "github.com/coopernurse/maelstrom/pkg/revproxy" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | func NewRegistry(nodeId string, startCompFunc StartComponentFunc) *Registry { 10 | return &Registry{ 11 | nodeId: nodeId, 12 | startCompFunc: startCompFunc, 13 | byCompName: make(map[string]*Router), 14 | bufferPool: revproxy.NewProxyBufferPool(), 15 | lock: &sync.Mutex{}, 16 | } 17 | } 18 | 19 | type Registry struct { 20 | nodeId string 21 | startCompFunc StartComponentFunc 22 | byCompName map[string]*Router 23 | bufferPool *revproxy.ProxyBufferPool 24 | lock *sync.Mutex 25 | } 26 | 27 | func (r *Registry) ByComponent(componentName string) (router *Router) { 28 | r.lock.Lock() 29 | router = r.byCompName[componentName] 30 | if router == nil { 31 | router = NewRouter(componentName, r.nodeId, r.bufferPool, r.startCompFunc) 32 | r.byCompName[componentName] = router 33 | } 34 | r.lock.Unlock() 35 | return 36 | } 37 | 38 | func (r *Registry) WaitForInflightToDrain() { 39 | for { 40 | r.lock.Lock() 41 | byName := r.byCompName 42 | r.lock.Unlock() 43 | 44 | drained := true 45 | for _, router := range byName { 46 | if router.GetInflightReqs() > 0 { 47 | drained = false 48 | break 49 | } 50 | } 51 | if drained { 52 | return 53 | } 54 | time.Sleep(100 * time.Millisecond) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /docs/gitbook/event_sources/toggle.md: -------------------------------------------------------------------------------- 1 | 2 | # Toggle On and Off 3 | 4 | New event sources are enabled by default, but **maelstrom** provides a way to toggle event sources 5 | on and off. This can be used to pause cron or SQS pollers during maintenance windows, for example. 
6 | 7 | Toggling the status of an event source does not modify the event source itself, so the modified 8 | time and version of the event source does not change. 9 | 10 | The current status of an event source is included in the `mael es ls` output. 11 | 12 | Enabled status is cached in memory, so changes to event source status will not take effect immediately. 13 | The default cache interval is 1 minute for SQS and cron event sources and 1 second for HTTP event sources. 14 | 15 | ## Examples 16 | 17 | ### Example 1: Basic usage. Disable or enable all. 18 | 19 | #### Disable all event sources: 20 | ``` 21 | maelctl es disable 22 | ``` 23 | 24 | #### Enable all event sources: 25 | ``` 26 | maelctl es enable 27 | ``` 28 | 29 | ### Example 2: Disable all event sources by type 30 | 31 | ``` 32 | # type can be: http, sqs, cron 33 | maelctl es disable --type=http 34 | ``` 35 | 36 | ### Example 3: Enable all event sources by name prefix 37 | 38 | ``` 39 | # only event sources whose names start with "foo" will be modified 40 | maelctl es enable --prefix=foo 41 | ``` 42 | 43 | ### Example 4: Disable all events by project 44 | 45 | ``` 46 | maelctl es disable --project=finance 47 | ``` 48 | -------------------------------------------------------------------------------- /pkg/common/crypto.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "crypto/rand" 5 | "crypto/rsa" 6 | "crypto/x509" 7 | "encoding/pem" 8 | "golang.org/x/crypto/ssh" 9 | "io/ioutil" 10 | "os" 11 | ) 12 | 13 | // Adapted from: 14 | // https://stackoverflow.com/questions/21151714/go-generate-an-ssh-public-key 15 | 16 | // MakeSSHKeyPair make a pair of public and private keys for SSH access. 17 | // Public key is encoded in the format for inclusion in an OpenSSH authorized_keys file. 
18 | // Private Key generated is PEM encoded 19 | // keySize is the size of the private key in bytes 20 | func MakeSSHKeyPair(pubKeyPath, privateKeyPath string, keySize int) error { 21 | privateKey, err := rsa.GenerateKey(rand.Reader, keySize) 22 | if err != nil { 23 | return err 24 | } 25 | 26 | // generate and write private key as PEM 27 | privateKeyFile, err := os.Create(privateKeyPath) 28 | defer CheckClose(privateKeyFile, &err) 29 | if err != nil { 30 | return err 31 | } 32 | err = privateKeyFile.Chmod(os.FileMode(0600)) 33 | if err != nil { 34 | return err 35 | } 36 | privateKeyPEM := &pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(privateKey)} 37 | if err := pem.Encode(privateKeyFile, privateKeyPEM); err != nil { 38 | return err 39 | } 40 | 41 | // generate and write public key 42 | pub, err := ssh.NewPublicKey(&privateKey.PublicKey) 43 | if err != nil { 44 | return err 45 | } 46 | return ioutil.WriteFile(pubKeyPath, ssh.MarshalAuthorizedKey(pub), 0655) 47 | } 48 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v1 10 | - name: Run tests 11 | shell: bash 12 | run: | 13 | set -x 14 | lsb_release -d 15 | sudo apt install python2 -y 16 | curl https://bootstrap.pypa.io/pip/2.7/get-pip.py --output get-pip.py 17 | sudo python2 get-pip.py 18 | sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 1 19 | sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 2 20 | export CI_CACHEDIR=`pwd`/.cache 21 | export GOPATH="$CI_CACHEDIR" 22 | export PATH="$PATH:$CI_CACHEDIR/bin:$HOME/.local/bin" 23 | export PYTHONPATH="$CI_CACHEDIR/lib/python2.7/site-packages" 24 | mkdir -p "$CI_CACHEDIR" 25 | pip2 install --user wheel 26 | ./scripts/install_deps.sh 27 | sudo apt-get update 28 | sudo apt-get install -y libsqlite3-dev 29 | go version 30 | make idl 31 | make maelctl 32 | make maelstromd 33 | make test 34 | - uses: jakejarvis/s3-sync-action@master 35 | if: github.ref == 'refs/heads/master' 36 | env: 37 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 38 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 39 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 40 | AWS_REGION: 'us-west-2' 41 | SOURCE_DIR: 'dist' 42 | DEST_DIR: 'download/latest/linux_x86_64' 43 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/update_project.md: -------------------------------------------------------------------------------- 1 | 2 | # Update Project 3 | 4 | ## Add cron event source 5 | 6 | Edit `maelstrom.yml` and add a `cron` event source. 
The full file should look like this: 7 | 8 | ```yaml 9 | # example project 10 | --- 11 | name: hello-mael 12 | components: 13 | hello: 14 | image: docker.io/coopernurse/go-hello-http 15 | httpport: 8080 16 | httphealthcheckpath: / 17 | reservememory: 128 18 | eventsources: 19 | hello_http: 20 | http: 21 | hostname: hello.localhost 22 | # cron event source here: 23 | hello_cron: 24 | cron: 25 | schedule: "@every 10s" 26 | http: 27 | method: GET 28 | path: /log 29 | ``` 30 | 31 | Then re-run `maelctl project put` 32 | 33 | ``` 34 | $ /usr/local/bin/maelctl project put 35 | ``` 36 | 37 | Or via docker: 38 | 39 | ``` 40 | docker exec maelstromd maelctl project put 41 | ``` 42 | 43 | ## Wait 10 seconds.. 44 | 45 | We just told `maelstromd` to make a `GET` request to `/log` on this component every 10 seconds. 46 | 47 | If the component is not already running `maelstromd` will start it. 48 | 49 | ## Watch the logs 50 | 51 | Run this to see the logs for the container. The `/log` endpoint prints the time to STDOUT each time it's invoked. 52 | 53 | ``` 54 | $ /usr/local/bin/maelctl logs 55 | [hello-mael_hello] Current time: 2019-09-17 21:12:11.001191209 +0000 UTC m=+942.075217013 56 | [hello-mael_hello] Current time: 2019-09-17 21:12:21.001580013 +0000 UTC m=+952.075605806 57 | [hello-mael_hello] Current time: 2019-09-17 21:12:31.001557084 +0000 UTC m=+962.075582883 58 | ``` 59 | 60 | Or via docker: 61 | 62 | ``` 63 | docker exec maelstromd maelctl logs 64 | ``` -------------------------------------------------------------------------------- /pkg/db/db.go: -------------------------------------------------------------------------------- 1 | package db 2 | 3 | import ( 4 | "fmt" 5 | "github.com/coopernurse/maelstrom/pkg/v1" 6 | "time" 7 | ) 8 | 9 | var NotFound = fmt.Errorf("Not Found") 10 | var AlreadyExists = fmt.Errorf("Entity already exists") 11 | var IncorrectPreviousVersion = fmt.Errorf("Incorrect PreviousVersion") 12 | 13 | const ( 14 | RolePlacement = "placement" 15 | RoleAutoScale = "autoscale" 16 | RoleCron = "cron" 17 | ) 18 | 19 | type Db interface { 20 | Migrate() error 21 | 22 | AcquireOrRenewRole(roleId string, nodeId string, lockDur time.Duration) (bool, string, error) 23 | ReleaseRole(roleId string, nodeId string) error 24 | ReleaseAllRoles(nodeId string) error 25 | 26 | ListProjects(input v1.ListProjectsInput) (v1.ListProjectsOutput, error) 27 | 28 | PutComponent(component v1.Component) (int64, error) 29 | GetComponent(componentName string) (v1.Component, error) 30 | ListComponents(input v1.ListComponentsInput) (v1.ListComponentsOutput, error) 31 | RemoveComponent(componentName string) (bool, error) 32 | 33 | GetComponentDeployCount(componentName string, version int64) (int, error) 34 | IncrementComponentDeployCount(componentName string, version int64) error 35 | 36 | PutEventSource(eventSource v1.EventSource) (int64, error) 37 | GetEventSource(eventSourceName string) (v1.EventSource, error) 38 | ListEventSources(input v1.ListEventSourcesInput) (v1.ListEventSourcesOutput, error) 39 | RemoveEventSource(eventSourceName string) (bool, error) 40 | SetEventSourcesEnabled(eventSourceNames []string, enabled bool) (int64, error) 41 | 42 | PutNodeStatus(status v1.NodeStatus) error 43 | ListNodeStatus() ([]v1.NodeStatus, error) 44 | RemoveNodeStatusOlderThan(observedAt time.Time) (int64, error) 45 | RemoveNodeStatus(nodeId string) (bool, error) 46 | } 47 | -------------------------------------------------------------------------------- /docs/gitbook/README.md:
-------------------------------------------------------------------------------- 1 | 2 | # Maelstrom Documentation 3 | 4 | ## Welcome! 5 | 6 | Thanks for checking out **maelstrom**, the simple container orchestrator. 7 | This book will explain how to install maelstrom, how to create your first project, 8 | and how to configure different event sources. 9 | 10 | The appendix sections provide a reference for `maelstromd` configuration variables 11 | and the `maelstrom.yml` project YAML format. 12 | 13 | ## Concepts 14 | 15 | **maelstrom** is built on top of [Docker](https://www.docker.com/) and provides a way to 16 | start and stop Docker containers in response to inbound requests. 17 | 18 | In **maelstrom** all running containers **must** expose a HTTP server. Requests will be reverse 19 | proxied to this web server. 20 | 21 | ### Docker concepts 22 | 23 | | Term | Meaning | 24 | |----------------------|--------------------------------------------------------| 25 | | [image](https://docs.docker.com/glossary/?term=image)| A named ordered collection of root filesystem layers 26 | | [container](https://docs.docker.com/glossary/?term=container) | Runtime instance of a docker image 27 | 28 | ### Maelstrom concepts 29 | 30 | | Term | Meaning | 31 | |--------------|----------------------------------------------------------------------| 32 | | component | Spec defining how to run a container. Specifies min/max limits, RAM requirements, logging. 33 | | event source | Spec that defines an external request source to map to a component. Event sources include HTTP, Amazon SQS, and cron. 34 | | project | Collection of components and event sources. Defined in a YAML file. 35 | | node | A physical or virtual machine running the `maelstromd` daemon. 36 | | cluster | A collection of 1..n maelstrom nodes that share a common database. 37 | 38 | These concepts will be explained in more detail in later chapters. 
39 | -------------------------------------------------------------------------------- /pkg/maelstrom/aws.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/coopernurse/maelstrom/pkg/common" 6 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 7 | log "github.com/mgutz/logxi/v1" 8 | "net/http" 9 | "time" 10 | ) 11 | 12 | type AwsSpotInstanceAction struct { 13 | Action string 14 | Time string 15 | } 16 | 17 | func awsSpotInstanceTerminate() bool { 18 | // See docs: 19 | // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html 20 | // 21 | resp, err := http.Get("http://169.254.169.254/latest/meta-data/spot/instance-action") 22 | if err != nil { 23 | return false 24 | } 25 | defer common.CheckClose(resp.Body, &err) 26 | 27 | if resp.StatusCode == http.StatusOK { 28 | var inst AwsSpotInstanceAction 29 | err = json.NewDecoder(resp.Body).Decode(&inst) 30 | if err == nil { 31 | return inst.Action == "stop" || inst.Action == "terminate" 32 | } 33 | } 34 | return false 35 | } 36 | 37 | type AwsLifecycleHookMessage struct { 38 | QueueUrl string 39 | MessageReceiptHandle string 40 | AccountId string 41 | RequestId string 42 | Time string 43 | Service string 44 | AutoScalingGroupName string 45 | EC2InstanceId string 46 | LifecycleActionToken string 47 | LifecycleHookName string 48 | } 49 | 50 | func (h *AwsLifecycleHookMessage) ToAwsLifecycleHook() *v1.AwsLifecycleHook { 51 | return &v1.AwsLifecycleHook{ 52 | QueueUrl: h.QueueUrl, 53 | MessageReceiptHandle: h.MessageReceiptHandle, 54 | AutoScalingGroupName: h.AutoScalingGroupName, 55 | InstanceId: h.EC2InstanceId, 56 | LifecycleActionToken: h.LifecycleActionToken, 57 | LifecycleHookName: h.LifecycleHookName, 58 | } 59 | } 60 | 61 | func (h *AwsLifecycleHookMessage) TryParseAge() *time.Duration { 62 | t, err := time.Parse("2006-01-02T15:04:05.000Z", h.Time) 63 | if err == nil { 64 | dur := time.Now().Sub(t) 65 | return &dur 66 | } else { 67 | log.Warn("aws: unable to parse lifecycle hook time: " + h.Time) 68 | return nil 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /pkg/revproxy/dispenser.go: -------------------------------------------------------------------------------- 1 | package revproxy 2 | 3 | import ( 4 | "context" 5 | "net/http/httputil" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | type Proxy func(req *Request) 11 | 12 | func NewGetProxyRequest() *GetProxyRequest { 13 | return &GetProxyRequest{ 14 | Proxy: make(chan Proxy), 15 | } 16 | } 17 | 18 | type GetProxyRequest struct { 19 | Proxy chan Proxy 20 | } 21 | 22 | func NewDispenser(maxConcurrency int, reqCh <-chan *GetProxyRequest, 23 | myNodeId string, componentName string, 24 | proxy *httputil.ReverseProxy, statCh chan<- time.Duration, ctx context.Context) *Dispenser { 25 | 26 | doneChSize := maxConcurrency 27 | if doneChSize < 1 { 28 | doneChSize = 1 29 | } 30 | doneCh := make(chan bool, doneChSize) 31 | doneFx := func() { doneCh <- true } 32 | 33 | proxyFx := func(req *Request) { 34 | defer doneFx() 35 | handleReq(req, myNodeId, componentName, proxy, statCh, ctx) 36 | } 37 | 38 | return &Dispenser{ 39 | maxConcurrency: maxConcurrency, 40 | reqCh: reqCh, 41 | doneCh: doneCh, 42 | proxy: proxyFx, 43 | } 44 | } 45 | 46 | type Dispenser struct { 47 | maxConcurrency int 48 | reqCh <-chan *GetProxyRequest 49 | doneCh chan bool 50 | proxy Proxy 51 | } 52 | 53 | func (d *Dispenser) Run(ctx context.Context, reqWaitGroup 
*sync.WaitGroup) { 54 | if reqWaitGroup != nil { 55 | defer reqWaitGroup.Done() 56 | } 57 | 58 | concur := 0 59 | active := true 60 | for active { 61 | if d.maxConcurrency < 1 || concur < d.maxConcurrency { 62 | // under concurrency limit, accept a request, or a 'done' msg 63 | select { 64 | case <-ctx.Done(): 65 | active = false 66 | case req := <-d.reqCh: 67 | concur++ 68 | req.Proxy <- d.proxy 69 | case <-d.doneCh: 70 | concur-- 71 | } 72 | } else { 73 | // at limit - wait for 'done' msg 74 | select { 75 | case <-ctx.Done(): 76 | active = false 77 | case <-d.doneCh: 78 | concur-- 79 | } 80 | } 81 | } 82 | 83 | // wait for all in-flight reqs to finish 84 | for concur > 0 { 85 | <-d.doneCh 86 | concur-- 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /pkg/maelstrom/resolver_test.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 5 | "github.com/stretchr/testify/assert" 6 | "math/rand" 7 | "sort" 8 | "testing" 9 | ) 10 | 11 | func TestHttpEventSourceMatches(t *testing.T) { 12 | var tests = []struct { 13 | hostname string 14 | pathprefix string 15 | reqhost string 16 | reqpath string 17 | matches bool 18 | }{ 19 | {"", "", "", "", false}, 20 | {"foo.com", "", "foo.com", "", true}, 21 | {"foo.com", "", "foo2.com", "", false}, 22 | {"foo.com", "/foo", "foo.com", "", false}, 23 | {"foo.com", "/foo", "foo.com", "/foo", true}, 24 | {"foo.com", "/foo", "foo.com", "/foo/bar", true}, 25 | {"foo.com", "/foo", "foo2.com", "/foo/bar", false}, 26 | } 27 | for i, test := range tests { 28 | es := v1.EventSource{Http: &v1.HttpEventSource{ 29 | Hostname: test.hostname, 30 | PathPrefix: test.pathprefix, 31 | StripPrefix: false, 32 | }} 33 | assert.Equal(t, test.matches, httpEventSourceMatches(es, test.reqhost, test.reqpath), "failed %d", i) 34 | } 35 | } 36 | 37 | func TestHttpEventSourceSorting(t *testing.T) { 38 | sources := []v1.EventSource{ 39 | httpSource("foo.com", "/aaaa"), 40 | httpSource("bar.com", "/aaa"), 41 | httpSource("bar.com", "/aa"), 42 | httpSource("foo.com", "/"), 43 | httpSource("bar.com", "/"), 44 | httpSource("foo.com", ""), 45 | httpSource("bar.com", ""), 46 | httpSource("", "/aaaa"), 47 | httpSource("", "/aaa"), 48 | httpSource("", "/a"), 49 | httpSource("", ""), 50 | } 51 | expected := make([]v1.EventSource, len(sources)) 52 | for i := 0; i < 20; i++ { 53 | copy(expected, sources) 54 | rand.Shuffle(len(sources), func(i, j int) { sources[i], sources[j] = sources[j], sources[i] }) 55 | sort.Sort(httpEventSourcesForResolver(sources)) 56 | assert.Equal(t, expected, sources) 57 | } 58 | } 59 | 60 | func httpSource(hostname string, pathPrefix string) v1.EventSource { 61 | return v1.EventSource{ 62 | Http: &v1.HttpEventSource{ 63 | Hostname: hostname, 64 | PathPrefix: pathPrefix, 65 | StripPrefix: false, 66 | }, 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/coopernurse/maelstrom 2 | 3 | require ( 4 | github.com/GuiaBolso/darwin v0.0.0-20170210191649-86919dfcf808 5 | github.com/Masterminds/squirrel v0.0.0-20181211162353-a8c1880ebb4d 6 | github.com/aws/aws-sdk-go v1.23.18 7 | github.com/c9s/goprocinfo v0.0.0-20190309065803-0b2ad9ac246b 8 | github.com/coopernurse/barrister-go v0.0.0-20180602004421-efbfefb2c4a3 9 | github.com/coopernurse/envconfig v0.0.2 10 | 
github.com/digitalocean/godo v1.14.0 11 | github.com/docker/distribution v2.7.0+incompatible // indirect 12 | github.com/docker/docker v1.13.1 13 | github.com/docker/go-connections v0.4.0 14 | github.com/docker/go-units v0.3.3 15 | github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 16 | github.com/dustin/go-humanize v1.0.0 17 | github.com/go-acme/lego v2.5.0+incompatible 18 | github.com/go-sql-driver/mysql v1.4.1 19 | github.com/go-yaml/yaml v2.1.0+incompatible 20 | github.com/google/go-querystring v1.0.0 // indirect 21 | github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf 22 | github.com/kr/pretty v0.2.0 // indirect 23 | github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect 24 | github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect 25 | github.com/lib/pq v1.2.0 26 | github.com/mattn/go-colorable v0.1.1 // indirect 27 | github.com/mattn/go-isatty v0.0.6 // indirect 28 | github.com/mattn/go-sqlite3 v1.10.0 29 | github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b // indirect 30 | github.com/mgutz/logxi v0.0.0-20161027140823-aebf8a7d67ab 31 | github.com/mholt/certmagic v0.5.1 32 | github.com/namedotcom/go v0.0.0-20180403034216-08470befbe04 // indirect 33 | github.com/opencontainers/go-digest v1.0.0-rc1 // indirect 34 | github.com/pkg/errors v0.8.0 35 | github.com/robfig/cron/v3 v3.0.0 36 | github.com/stretchr/testify v1.3.0 37 | golang.org/x/crypto v0.0.0-20190123085648-057139ce5d2b 38 | golang.org/x/net v0.0.0-20190125091013-d26f9f9a57f3 39 | golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 40 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect 41 | gopkg.in/yaml.v2 v2.2.8 // indirect 42 | ) 43 | 44 | go 1.13 45 | -------------------------------------------------------------------------------- /pkg/revproxy/revproxy.go: -------------------------------------------------------------------------------- 1 | package revproxy 2 | 3 | import ( 4 | "context" 5 | log "github.com/mgutz/logxi/v1" 6 | "net/http/httputil" 7 | "sync" 8 | "time" 9 | ) 10 | 11 | func LocalRevProxy(reqCh <-chan *Request, statCh chan<- time.Duration, proxy *httputil.ReverseProxy, 12 | ctx context.Context, wg *sync.WaitGroup) { 13 | RevProxyLoop(reqCh, statCh, proxy, ctx, wg, "", "") 14 | } 15 | 16 | func RevProxyLoop(reqCh <-chan *Request, statCh chan<- time.Duration, 17 | proxy *httputil.ReverseProxy, ctx context.Context, wg *sync.WaitGroup, myNodeId string, componentName string) { 18 | 19 | if wg != nil { 20 | defer wg.Done() 21 | } 22 | 23 | for { 24 | select { 25 | case mr := <-reqCh: 26 | if mr == nil { 27 | // reqCh closed and drained - all rev proxy loops for this component can exit 28 | return 29 | } 30 | handleReq(mr, myNodeId, componentName, proxy, statCh, ctx) 31 | case <-ctx.Done(): 32 | return 33 | } 34 | } 35 | } 36 | 37 | func handleReq(req *Request, myNodeId string, componentName string, proxy *httputil.ReverseProxy, 38 | statCh chan<- time.Duration, ctx context.Context) { 39 | 40 | defer func() { 41 | if r := recover(); r != nil { 42 | log.Warn("revproxy: recovered panic", "r", r) 43 | } 44 | }() 45 | 46 | if myNodeId != "" { 47 | relayPath := req.Req.Header.Get("MAELSTROM-RELAY-PATH") 48 | if relayPath == "" { 49 | relayPath = myNodeId 50 | } else if len(relayPath) < 1024 { 51 | relayPath = relayPath + "|" + myNodeId 52 | } else { 53 | log.Warn("revproxy: relay path too long to append to", "component", componentName) 54 | } 55 | req.Req.Header.Set("MAELSTROM-COMPONENT", componentName) 56 | req.Req.Header.Set("MAELSTROM-RELAY-PATH", 
relayPath) 57 | 58 | // TODO: need to set a header with time of request deadline 59 | // so that receiving node can set the request deadline appropriately to account for time already spent 60 | } 61 | 62 | proxy.ServeHTTP(req.Rw, req.Req) 63 | req.Done <- true 64 | if statCh != nil { 65 | select { 66 | case statCh <- time.Now().Sub(req.StartTime): 67 | // ok - sent 68 | case <-ctx.Done(): 69 | } 70 | 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /pkg/common/string_test.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "github.com/stretchr/testify/assert" 5 | "testing" 6 | ) 7 | 8 | func TestInterpolate(t *testing.T) { 9 | m := map[string]string{ 10 | "A": "aval", 11 | "B": "bval", 12 | "b": "bvallower", 13 | "E": "", 14 | } 15 | 16 | assert.Equal(t, "", InterpolateWithMap("", m)) 17 | assert.Equal(t, "aval", InterpolateWithMap("${A}", m)) 18 | assert.Equal(t, "${A}", InterpolateWithMap("$${A}", m)) 19 | assert.Equal(t, " hello ${A} ", InterpolateWithMap(" hello $${A} ", m)) 20 | assert.Equal(t, "aval bval\n bvallower ${} ${", InterpolateWithMap("${A} ${B}\n ${b} ${} ${", m)) 21 | assert.Equal(t, "", InterpolateWithMap("${E}", m)) 22 | assert.Equal(t, "defaultVal", InterpolateWithMap("${E:-defaultVal}", m)) 23 | assert.Equal(t, "", InterpolateWithMap("${E-defaultVal}", m)) 24 | assert.Equal(t, "test defaultVal", InterpolateWithMap("test ${D:-defaultVal}", m)) 25 | assert.Equal(t, "test defaultVal", InterpolateWithMap("test ${D:defaultVal}", m)) 26 | assert.Equal(t, "test ", InterpolateWithMap("test ${D:}", m)) 27 | } 28 | 29 | func TestGlobMatches(t *testing.T) { 30 | // glob = "*" - matches everything 31 | assert.True(t, GlobMatches("*", "foo")) 32 | assert.True(t, GlobMatches("*", "")) 33 | 34 | // glob is string - only exact match 35 | assert.True(t, GlobMatches("cat", "cat")) 36 | assert.False(t, GlobMatches("cat", "Cat")) 37 | assert.False(t, GlobMatches("cat", "cat1")) 38 | assert.False(t, GlobMatches("cat", "1cat")) 39 | 40 | // glob on both sides - substring match 41 | assert.True(t, GlobMatches("*cat*", "cat")) 42 | assert.True(t, GlobMatches("*cat*", "1cat")) 43 | assert.True(t, GlobMatches("*cat*", "1cat1")) 44 | assert.True(t, GlobMatches("*cat*", "cat1")) 45 | assert.False(t, GlobMatches("*cat*", "cdat1")) 46 | 47 | // prefix glob 48 | assert.True(t, GlobMatches("*cat", "cat")) 49 | assert.True(t, GlobMatches("*cat", "foocat")) 50 | assert.False(t, GlobMatches("*cat", "catfoo")) 51 | 52 | // suffix glob 53 | assert.True(t, GlobMatches("cat*", "cat")) 54 | assert.False(t, GlobMatches("cat*", "foocat")) 55 | assert.True(t, GlobMatches("cat*", "catfoo")) 56 | } 57 | -------------------------------------------------------------------------------- /roadmap.md: -------------------------------------------------------------------------------- 1 | # maelstrom roadmap 2 | 3 | ## m1 4 | 5 | * Stub maelstromd 6 | * Docker container running in privileged mode 7 | * Listens on 80 externally (reverse proxy for components) 8 | * Listens on ?? 
for management requests 9 | * Stub CLI to manage cluster - maelctl 10 | * Talks to maelstromd on the management port 11 | * Implement commands: 12 | * PutComponent 13 | * GetComponents 14 | * Persistent state backend 15 | * SQL db only (gorp) 16 | * Activate component when HTTP request received 17 | * Docker components only (no zip functions) 18 | * Pull image, start container, tag container 19 | * Stop component after x seconds of inactivity 20 | * Load state of existing components from docker if maelstromd restarted 21 | * Project web site (hugo?) 22 | * Home page 23 | * Getting started page 24 | * Release history 25 | * maelctl reference 26 | * Contributor guidelines 27 | * Issue template (?) 28 | * CI 29 | * Run tests on merge 30 | * Deploy web site on merge 31 | 32 | ## m2 33 | 34 | * Security 35 | * SSL cert support for external port 36 | * LetsEncrypt support 37 | * Mandatory SSL cert for management port 38 | * If no real cert configured, use self-signed 39 | * maelctl: add insecure flag for this case 40 | * Process limits 41 | * RAM 42 | * CPU share 43 | * Max request time 44 | * Function support 45 | * API to pull task from maelstromd 46 | * API to invoke task 47 | * Function ZIP file storage implementations 48 | * Filesystem 49 | * S3 50 | * DigitalOcean spaces 51 | * Function API bindings 52 | * Go 53 | * Python 54 | * NodeJS 55 | 56 | ## m3 57 | 58 | * Cluster support 59 | * Discover peers via persistent store 60 | * Component autoscaling 61 | * Request rate per second (?) 62 | * Response time (?) 63 | * Terraform templates 64 | * AWS 65 | * DigitalOcean 66 | 67 | ## m4 68 | 69 | * Queue event sources 70 | * SQS 71 | * Scheduled job support 72 | * Deploy hooks (?) 73 | * Some solution for schema migrations 74 | * Persistent state backend 75 | * Add Redis (?) 76 | * Add etcd (?) 
77 | 78 | -------------------------------------------------------------------------------- /pkg/converge/complock.go: -------------------------------------------------------------------------------- 1 | package converge 2 | 3 | import ( 4 | "context" 5 | "github.com/coopernurse/maelstrom/pkg/db" 6 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 7 | "github.com/pkg/errors" 8 | "time" 9 | ) 10 | 11 | type CompLocker struct { 12 | db db.Db 13 | nodeId string 14 | } 15 | 16 | func NewCompLocker(db db.Db, nodeId string) *CompLocker { 17 | return &CompLocker{ 18 | db: db, 19 | nodeId: nodeId, 20 | } 21 | } 22 | 23 | func (c *CompLocker) StartLockAcquire(ctx context.Context, comp *v1.Component) (bool, error) { 24 | if comp.StartParallelism == v1.StartParallelismParallel { 25 | // no lock required 26 | return false, nil 27 | } 28 | 29 | roleId := compLockerRoleId(comp) 30 | lockDur := time.Second * time.Duration(v1.HealthCheckSeconds(comp.Docker)+1) 31 | for { 32 | 33 | if comp.StartParallelism == v1.StartParallelismSeriesfirst { 34 | deployCount, err := c.db.GetComponentDeployCount(comp.Name, comp.Version) 35 | if err != nil { 36 | return false, errors.Wrap(err, "complock: unable to get component deploy count") 37 | } 38 | if deployCount > 0 { 39 | // no lock required 40 | return false, nil 41 | } 42 | } 43 | 44 | // try to lock 45 | ok, _, err := c.db.AcquireOrRenewRole(roleId, c.nodeId, lockDur) 46 | if err != nil { 47 | return false, err 48 | } 49 | if ok { 50 | // lock acquired 51 | return true, nil 52 | } 53 | 54 | // wait to retry, aborting if context canceled 55 | ticker := time.NewTicker(5 * time.Second) 56 | select { 57 | case <-ctx.Done(): 58 | return false, ErrConvergeContextCanceled 59 | case <-ticker.C: 60 | // try again 61 | } 62 | } 63 | } 64 | 65 | func (c *CompLocker) PostStartContainer(comp *v1.Component, releaseLock bool, success bool) error { 66 | if success { 67 | err := c.db.IncrementComponentDeployCount(comp.Name, comp.Version) 68 | if err != nil { 69 | return err 70 | } 71 | } 72 | if releaseLock { 73 | err := c.db.ReleaseRole(compLockerRoleId(comp), c.nodeId) 74 | if err != nil { 75 | return err 76 | } 77 | } 78 | return nil 79 | } 80 | 81 | func compLockerRoleId(comp *v1.Component) string { 82 | return "start_container_" + comp.Name 83 | } 84 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/what_happened.md: -------------------------------------------------------------------------------- 1 | 2 | # What Happened?? 3 | 4 | ## Container placement and activation 5 | 6 | Let's take a moment to dig into what happened when you made this `curl` request. 7 | 8 | 1. You run `curl http://hello.localhost:8008/env` 9 | 1. `maelstromd` tries to resolve the request to a _component_ 10 | 1. The list of `http` event sources is loaded 11 | 1. Each event source's hostname is compared with the request hostname 12 | 1. If a match is found the component is returned 13 | 1. `maelstromd` asks the internal router for a reverse proxy endpoint for the component 14 | 1. If a component is running somewhere in the cluster, the endpoint is returned. 15 | 1. If no component is running, a _placement request_ is made and the request is internally queued 16 | until a container for that component is started somewhere in the cluster. 17 | 1. In this case no component was running so a placement request was made: 18 | 1. The node attempts to acquire the `placement` role lock 19 | 1. If it acquires the lock it runs the placement sequence locally. 
Otherwise it makes a RPC call to the node 20 | that owns the lock. 21 | 1. The placement node decides which node in the cluster should run the component and notifies them. 22 | 1. The target node pulls the docker image and starts the container 23 | 1. The target node writes updated state to the db and broadcasts its state to its cluster peers 24 | 1. The original node receives the updated state and proxies the request to that node 25 | 1. The node proxies the request to the local container 26 | 27 | Note that the above steps are performed regardless of cluster size. In our case our cluster has a single node, so 28 | all operations were performed locally, but the exact same steps occur in clusters with multiple nodes. 29 | 30 | ## Unsuccessful request 31 | 32 | Try making a request with a different hostname. No event source will map to the hostname and an error will be 33 | returned: 34 | 35 | ``` 36 | $ curl http://bogus.localhost:8008/ 37 | No component matches the request 38 | ``` 39 | 40 | Congratulations, you've successfully run your first maelstrom project. Let's try updating it and adding a cron 41 | event source. 42 | -------------------------------------------------------------------------------- /pkg/common/util.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "net" 7 | "os" 8 | "sort" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | func ToIntOrDefault(s string, defaultVal int) int { 14 | v, err := strconv.Atoi(s) 15 | if err == nil { 16 | return v 17 | } 18 | return defaultVal 19 | } 20 | 21 | func CheckClose(c io.Closer, err *error) { 22 | cerr := c.Close() 23 | if *err == nil { 24 | *err = cerr 25 | } 26 | } 27 | 28 | func EnvVarMap() map[string]string { 29 | return ParseEnvVarMap(os.Environ()) 30 | } 31 | 32 | func ParseEnvVarMap(nvpairs []string) map[string]string { 33 | envVars := make(map[string]string) 34 | for _, s := range nvpairs { 35 | pos := strings.Index(s, "=") 36 | if pos > -1 { 37 | key := s[0:pos] 38 | val := s[pos+1:] 39 | envVars[key] = val 40 | } 41 | } 42 | return envVars 43 | } 44 | 45 | func SortedMapKeys(m map[string]string) []string { 46 | keys := make([]string, 0) 47 | for key := range m { 48 | keys = append(keys, key) 49 | } 50 | sort.Strings(keys) 51 | return keys 52 | } 53 | 54 | // dropCR drops a terminal \r from the data. 55 | func dropCR(data []byte) []byte { 56 | if len(data) > 0 && data[len(data)-1] == '\r' { 57 | return data[0 : len(data)-1] 58 | } 59 | return data 60 | } 61 | 62 | // From: https://stackoverflow.com/questions/37530451/golang-bufio-read-multiline-until-crlf-r-n-delimiter 63 | func ScanCRLF(data []byte, atEOF bool) (advance int, token []byte, err error) { 64 | if atEOF && len(data) == 0 { 65 | return 0, nil, nil 66 | } 67 | if i := bytes.Index(data, []byte{'\r', '\n'}); i >= 0 { 68 | // We have a full newline-terminated line. 69 | return i + 2, dropCR(data[0:i]), nil 70 | } 71 | // If we're at EOF, we have a final, non-terminated line. Return it. 72 | if atEOF { 73 | return len(data), dropCR(data), nil 74 | } 75 | // Request more data. 
76 | return 0, nil, nil 77 | } 78 | 79 | // Get preferred outbound IP of this machine 80 | // From: https://stackoverflow.com/questions/23558425/how-do-i-get-the-local-ip-address-in-go 81 | func GetOutboundIP() (net.IP, error) { 82 | conn, err := net.Dial("udp", "8.8.8.8:53") 83 | if err != nil { 84 | return net.IP{}, err 85 | } 86 | err = conn.Close() 87 | 88 | localAddr := conn.LocalAddr().(*net.UDPAddr) 89 | return localAddr.IP, err 90 | } 91 | -------------------------------------------------------------------------------- /docs/gitbook/getting_started/activate_component.md: -------------------------------------------------------------------------------- 1 | 2 | # Activate Component 3 | 4 | ## Nothing running yet 5 | 6 | When you ran `maelctl project put` you told **maelstrom** that: 7 | 8 | * You want to register a component named `hello` 9 | * This component maps to a docker image named: `docker.io/coopernurse/go-hello-http` 10 | * This component should use a max of 128MiB of RAM 11 | * This component should be started when a request is received via the hostname: `hello.localhost` 12 | 13 | But we haven't requested it yet. Try running `docker ps`. You shouldn't see any containers running yet. 14 | 15 | ``` 16 | $ docker ps 17 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 18 | ``` 19 | 20 | ## Setup hostname 21 | 22 | In our `maelstrom.yml` we used the hostname `hello.localhost`. Let's add that to `/etc/hosts` so our 23 | computer can resolve it. 24 | 25 | ```bash 26 | sudo bash -c "echo '127.0.0.1 hello.localhost' >> /etc/hosts" 27 | ``` 28 | 29 | Note: If you're running `dnsmasq` you may not need to perform the above step, as it sets up a wildcard 30 | DNS entry for `*.localhost` automatically. Trying pinging `hello.localhost` first and if it doesn't resolve, 31 | run the command above. 32 | 33 | ## Make a request 34 | 35 | `maelstromd` is running on port 8008 (because of `MAEL_PUBLIC_PORT=8008` in `mael.env`). 36 | If we make a request to `hello.localhost:8008` then we should get a container. 37 | 38 | Let's try it: 39 | 40 | ```bash 41 | $ curl http://hello.localhost:8008/env 42 | ``` 43 | 44 | After a couple of seconds you should see output like this: 45 | 46 | ``` 47 | $ curl http://hello.localhost:8008/env 48 | MAELSTROM_COMPONENT_VERSION=1 49 | HOSTNAME=4ab1be0128d2 50 | SHLVL=1 51 | HOME=/root 52 | MAELSTROM_PRIVATE_URL=http://172.18.0.1:8374 53 | MAELSTROM_COMPONENT_NAME=hello-mael_hello 54 | PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 55 | PWD=/app 56 | ``` 57 | 58 | And `docker ps` will show a running container: 59 | 60 | ``` 61 | $ docker ps 62 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 63 | 4ab1be0128d2 coopernurse/go-hello-http "/bin/sh -c ./goapp" 57 seconds ago Up 56 seconds 8080/tcp confident_mayer 64 | ``` -------------------------------------------------------------------------------- /docs/gitbook/getting_started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Download 4 | 5 | Pre-built binaries are available for Linux x86_64. On other platforms you'll need to use a pre-compiled docker image. 
6 | 7 | ``` 8 | # Use any directory you wish that's in your PATH 9 | cd /usr/local/bin 10 | # download binaries 11 | curl -LO https://download.maelstromapp.com/latest/linux_x86_64/maelstromd 12 | curl -LO https://download.maelstromapp.com/latest/linux_x86_64/maelctl 13 | chmod 755 maelstromd maelctl 14 | ``` 15 | 16 | ## Configure Environment 17 | 18 | `maelstromd` may be configured by setting environment variables manually, or via a configuration file with `name=value` 19 | lines. 20 | 21 | Create a file called `mael.env` with the following content: 22 | 23 | ``` 24 | MAEL_SQL_DRIVER=sqlite3 25 | MAEL_SQL_DSN=maelstrom.db?cache=shared&_journal_mode=MEMORY 26 | MAEL_PUBLIC_PORT=8008 27 | ``` 28 | 29 | ## Start maelstromd 30 | 31 | Run `maelstromd` - if using the Linux binaries: 32 | 33 | ``` 34 | $ /usr/local/bin/maelstromd -f mael.env 35 | ``` 36 | 37 | Or if using the docker image: 38 | 39 | ``` 40 | # add a -d switch if you want to background this process 41 | # otherwise it will run in the foreground 42 | docker run --name maelstromd -p 8374:8374 -p 8008:8008 -v `pwd`:/app --privileged \ 43 | -v /var/run/docker.sock:/var/run/docker.sock --env-file mael.env \ 44 | coopernurse/maelstrom maelstromd 45 | ``` 46 | 47 | You should see output that looks like this: 48 | ``` 49 | 13:02:29.778603 INF ~ maelstromd: starting 50 | 13:02:29.791538 INF ~ handler: creating DockerHandlerFactory maelstromUrl: http://172.18.0.1:8374 51 | 13:02:29.819370 INF ~ cluster: added node nodeId: H5ML:TQZ7:7TKL:DLUP:JB7A:C2VA:XYCW:TEOM:FE6L:65PS:FISY:YWOO 52 | remoteNode: H5ML:TQZ7:7TKL:DLUP:JB7A:C2VA:XYCW:TEOM:FE6L:65PS:FISY:YWOO 53 | 13:02:29.819441 INF ~ maelstromd: created NodeService 54 | nodeId: H5ML:TQZ7:7TKL:DLUP:JB7A:C2VA:XYCW:TEOM:FE6L:65PS:FISY:YWOO peerUrl: http://192.168.1.76:8374 numCPUs: 8 55 | 13:02:29.823934 INF ~ maelstromd: aws session initialized 56 | 13:02:29.823982 INF ~ maelstromd: starting HTTP servers publicPort: 8008 privatePort: 8374 57 | 13:02:29.824123 INF ~ cron: starting cron service refreshRate: 1m0s 58 | 13:02:29.825061 INF ~ cron: acquired role lock, starting cron 59 | ``` 60 | 61 | You may stop `maelstromd` at any time by pressing `control-c`, but let's keep it running and create our first project. 
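
Aside: if you're curious how a `name=value` file like the `mael.env` created in the Configure Environment section above can be consumed, the sketch below shows one simple way to load such a file into the process environment. This is only an illustration, not the actual `maelstromd` implementation — the helper name `loadEnvFile` is hypothetical, although the comment and blank-line handling mirrors the env file format used throughout these docs.

```go
package main

import (
	"bufio"
	"os"
	"strings"
)

// loadEnvFile is a hypothetical helper that reads name=value lines
// (skipping blank lines and # comments) and exports them as environment
// variables, similar in spirit to running `maelstromd -f mael.env`.
func loadEnvFile(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer func() { _ = f.Close() }()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue // ignore blank lines and comments
		}
		if pos := strings.Index(line, "="); pos > 0 {
			if err := os.Setenv(line[:pos], line[pos+1:]); err != nil {
				return err
			}
		}
	}
	return scanner.Err()
}

func main() {
	if err := loadEnvFile("mael.env"); err != nil {
		panic(err)
	}
	// MAEL_PUBLIC_PORT and friends are now visible via os.Getenv
}
```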
62 | 63 | 64 | -------------------------------------------------------------------------------- /pkg/common/string.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "bytes" 5 | "regexp" 6 | "strings" 7 | ) 8 | 9 | type StringPtr []*string 10 | 11 | func (s StringPtr) Len() int { return len(s) } 12 | func (s StringPtr) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 13 | func (s StringPtr) Less(i, j int) bool { return *s[i] < *s[j] } 14 | 15 | var interpolateRE = regexp.MustCompile(`(?m)(\$?\${[^}]+})`) 16 | 17 | func parseToken(tok string) (key string, defaultVal string, emptyToDefault bool) { 18 | // strip ${ and } 19 | tok = tok[2 : len(tok)-1] 20 | key = tok 21 | pos := strings.Index(tok, ":") 22 | if pos > -1 { 23 | key = tok[0:pos] 24 | if pos < len(tok)-1 { 25 | if tok[pos+1] == '-' { 26 | emptyToDefault = true 27 | pos++ 28 | } 29 | if pos < len(tok)-1 { 30 | defaultVal = tok[pos+1:] 31 | } 32 | } 33 | } 34 | return 35 | } 36 | 37 | func InterpolateWithMap(input string, vars map[string]string) string { 38 | out := bytes.NewBufferString("") 39 | matches := interpolateRE.FindAllStringIndex(input, -1) 40 | pos := 0 41 | for _, m := range matches { 42 | tok := input[m[0]:m[1]] 43 | var val string 44 | var ok bool 45 | if strings.HasPrefix(tok, "$${") { 46 | val = tok[1:] 47 | } else { 48 | key, defaultVal, emptyToDefault := parseToken(tok) 49 | val, ok = vars[key] 50 | if !ok || (val == "" && emptyToDefault) { 51 | val = defaultVal 52 | } 53 | } 54 | out.WriteString(input[pos:m[0]]) 55 | out.WriteString(val) 56 | pos = m[1] 57 | } 58 | out.WriteString(input[pos:]) 59 | return out.String() 60 | } 61 | 62 | func StrTruncate(s string, maxlen int) string { 63 | if len(s) > maxlen { 64 | return s[0:maxlen] 65 | } 66 | return s 67 | } 68 | 69 | func TruncNodeId(id string) string { 70 | return StrTruncate(id, 14) 71 | } 72 | 73 | func TruncContainerId(id string) string { 74 | return StrTruncate(id, 8) 75 | } 76 | 77 | func GlobMatches(globOrStr string, target string) bool { 78 | if globOrStr == "*" || globOrStr == target { 79 | return true 80 | } 81 | 82 | if strings.HasPrefix(globOrStr, "*") && strings.HasSuffix(globOrStr, "*") { 83 | return strings.Contains(target, globOrStr[1:len(globOrStr)-1]) 84 | } 85 | if strings.HasPrefix(globOrStr, "*") { 86 | return strings.HasSuffix(target, globOrStr[1:]) 87 | } 88 | if strings.HasSuffix(globOrStr, "*") { 89 | return strings.HasPrefix(target, globOrStr[:len(globOrStr)-1]) 90 | } 91 | return false 92 | } 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maelstrom 2 | 3 | [![GitHub Actions](https://github.com/maelstromapp/maelstrom/workflows/test/badge.svg)](https://github.com/maelstromapp/maelstrom/actions) 4 | 5 | ## Overview 6 | 7 | Maelstrom is a HTTP reverse proxy and container orchestrator that starts and scales containers 8 | automatically as needed. 9 | 10 | A Maelstrom cluster is composed of nodes each running `maelstromd` pointed at a shared database which 11 | stores configuration state about the components and event sources in the system. 12 | 13 | * A component is a docker image with a related run configuration and environment variables. 14 | * An event source is something that generates requests for a component. HTTP requests, scheduled jobs, and 15 | messages in queues are all types of events. 
Maelstrom currently supports HTTP, cron, and AWS SQS events. 16 | 17 | ![screencast](https://maelstromapp.com/images/demo_screencast.svg) 18 | 19 | ## Documentation 20 | 21 | Full docs including a getting started guide are available at: 22 | [https://maelstromapp.com/docs/](https://maelstromapp.com/docs/) 23 | 24 | ## Support 25 | 26 | I'm available on a contract basis to help your team begin using Maelstrom. 27 | 28 | Please contact James Cooper at: james@bitmechanic.com. 29 | 30 | ## Development 31 | 32 | Maelstrom is written in Go and uses the [Go 1.11 module system](https://github.com/golang/go/wiki/Modules). 33 | 34 | ### Build and Test 35 | 36 | Maelstrom uses [Barrister RPC](http://barrister.bitmechanic.com/) for communication between `maelctl` and `maelstromd` 37 | and for communication between cluster peers. 38 | 39 | ``` 40 | # install Barrister and Go bindings 41 | pip install --pre --user barrister 42 | go get github.com/coopernurse/barrister-go 43 | go install github.com/coopernurse/barrister-go/idl2go 44 | 45 | # compile IDL to go 46 | make idl 47 | ``` 48 | 49 | ``` 50 | # run tests 51 | make test 52 | 53 | # build CLI 54 | make maelctl 55 | 56 | # build daemon 57 | make maelstromd 58 | ``` 59 | 60 | ### Contributions 61 | 62 | Pull requests are very welcome. I'd suggest opening an issue first so we can all discuss the 63 | solution before major work is done. But if you have an itch to scratch feel free to open a PR 64 | directly. 65 | 66 | * Keep documentation in `docs/gitbook` up to date if you add a new feature 67 | * Make sure all code has been formatted with `gofmt` 68 | 69 | ### Dev notes 70 | 71 | * [Docker Go API](https://docs.docker.com/develop/sdk/examples/#list-and-manage-containers) 72 | -------------------------------------------------------------------------------- /pkg/config/config_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "bytes" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | func TestConfigFromEnv(t *testing.T) { 10 | env := ` 11 | # comment here 12 | MAEL_PUBLIC_PORT=1 13 | MAEL_PUBLIC_HTTPS_PORT=2 14 | MAEL_PRIVATE_PORT=3 15 | 16 | # ignore blank lines 17 | MAEL_SQL_DRIVER=driver 18 | MAEL_SQL_DSN=dsn Here 19 | 20 | MAEL_CRON_REFRESH_SECONDS=50 21 | MAEL_TOTAL_MEMORY=-200 22 | MAEL_INSTANCE_ID=instid 23 | MAEL_SHUTDOWN_PAUSE_SECONDS=3 24 | MAEL_TERMINATE_COMMAND=term cmd 25 | MAEL_AWS_TERMINATE_QUEUE_URL=q1 26 | MAEL_AWS_TERMINATE_MAX_AGE_SECONDS=44 27 | MAEL_AWS_SPOT_TERMINATE_POLL_SECONDS=55 28 | 29 | MAEL_HTTP_READ_TIMEOUT=100 30 | MAEL_HTTP_WRITE_TIMEOUT=200 31 | MAEL_HTTP_IDLE_TIMEOUT=400 32 | 33 | MAEL_LOG_GC_SECONDS=66 34 | MAEL_CPU_PROFILE_FILENAME=somefile 35 | MAEL_PPROF=true 36 | 37 | MAEL_DOCKER_PRUNE_MINUTES=123 38 | MAEL_DOCKER_PRUNE_UNREG_IMAGES=true 39 | MAEL_DOCKER_PRUNE_UNREG_KEEP=acme.org/*,hello-world 40 | 41 | MAEL_NODE_LIVENESS_SECONDS=200 42 | ` 43 | 44 | expected := Config{ 45 | InstanceId: "instid", 46 | PublicPort: 1, 47 | PublicHTTPSPort: 2, 48 | PrivatePort: 3, 49 | HTTPReadTimeout: 100, 50 | HTTPWriteTimeout: 200, 51 | HTTPIdleTimeout: 400, 52 | SqlDriver: "driver", 53 | SqlDsn: "dsn Here", 54 | CronRefreshSeconds: 50, 55 | LogGcSeconds: 66, 56 | CpuProfileFilename: "somefile", 57 | Pprof: true, 58 | TotalMemory: -200, 59 | TerminateCommand: "term cmd", 60 | ShutdownPauseSeconds: 3, 61 | AwsTerminateQueueUrl: "q1", 62 | AwsTerminateMaxAgeSeconds: 44, 63 | AwsSpotTerminatePollSeconds: 55, 64 | NodeLivenessSeconds: 200, 
65 | DockerPruneUnregImages: true, 66 | DockerPruneMinutes: 123, 67 | DockerPruneUnregKeep: "acme.org/*,hello-world", 68 | Cluster: ClusterOptions{ 69 | Name: "", 70 | MinSize: 1, 71 | MaxSize: 20, 72 | }, 73 | DigitalOcean: &DigitalOceanOptions{ 74 | AccessToken: "", 75 | Region: "nyc3", 76 | SSHFingerprint: "", 77 | DropletSize: "s-1vcpu-1gb", 78 | ImageSlug: "debian-9-x64", 79 | Backups: true, 80 | IPV6: false, 81 | }, 82 | } 83 | 84 | assert.Nil(t, ReaderToEnv(bytes.NewBufferString(env))) 85 | conf, err := FromEnv() 86 | assert.Nil(t, err) 87 | assert.Equal(t, expected, conf) 88 | } 89 | -------------------------------------------------------------------------------- /docs/gitbook/production/prune.md: -------------------------------------------------------------------------------- 1 | # Pruning Images 2 | 3 | Over time Maelstrom nodes may accumulate unused and unwanted Docker images. 4 | Eventually this could consume all disk space on a node. 5 | 6 | Maelstrom can be optionally configured to periodically remove untagged images and 7 | images that are not associated with any components. This feature is disabled by 8 | default, but can be easily enabled by setting a single environment variable. 9 | 10 | ## Examples 11 | 12 | ### Example 1: Remove exited containers and untagged images 13 | 14 | This is similar in behavior to `docker system prune`. To enable this simply set 15 | `MAEL_DOCKER_PRUNE_MINUTES` to specify the interval this job should run on each 16 | Maelstrom node. 17 | 18 | ``` 19 | # run daily 20 | MAEL_DOCKER_PRUNE_MINUTES=1440 21 | ``` 22 | 23 | ### Example 2: Same as #1 plus all images not associated with a component 24 | 25 | ``` 26 | # run every 12 hours 27 | MAEL_DOCKER_PRUNE_MINUTES=720 28 | # remove all images NOT associated with a Maelstrom component 29 | MAEL_DOCKER_PRUNE_UNREG_IMAGES=true 30 | ``` 31 | 32 | This example is a superset of Example 1. In addition to removing exited containers and untagged images 33 | Maelstrom will remove all images not associated with a Maelstrom component. For example, if your system 34 | has 5 components defined with these image names: 35 | 36 | ``` 37 | mycorp/web:v1 38 | mycorp/accounting:v2 39 | mycorp/intranet:v2 40 | mycorp/hr 41 | mycorp/api 42 | ``` 43 | 44 | And your maelstrom node has these images stored locally: 45 | 46 | ``` 47 | mycorp/web:v1 48 | mycorp/accounting:v1 49 | mycorp/accounting:v2 50 | mycorp/intranet:v1 51 | mycorp/intranet:v2 52 | mycorp/hr 53 | mycorp/api 54 | othercorp/api 55 | redis:3.2 56 | ``` 57 | 58 | Then these images would be removed: 59 | 60 | ``` 61 | mycorp/accounting:v1 62 | mycorp/intranet:v1 63 | othercorp/api 64 | redis:3.2 65 | ``` 66 | 67 | ### Example 3: Same as #2, but keep some images 68 | 69 | What if you need the redis image and don't want it deleted? Maelstrom allows you to specify a list 70 | of image name patterns to keep. 71 | 72 | ``` 73 | # run every 12 hours 74 | MAEL_DOCKER_PRUNE_MINUTES=720 75 | # remove all images NOT associated with a Maelstrom component 76 | MAEL_DOCKER_PRUNE_UNREG_IMAGES=true 77 | # comma separated list of names to retain. 
prefix and suffix * globs 78 | # are supported (but not full regexps) 79 | MAEL_DOCKER_PRUNE_UNREG_KEEP=othercorp/*,redis* 80 | ``` 81 | 82 | In this configuration only these images would be removed: 83 | 84 | ``` 85 | mycorp/accounting:v1 86 | mycorp/intranet:v1 87 | ``` 88 | 89 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test watch-test maelctl maelstromd idl run-maelstromd cover 2 | .EXPORT_ALL_VARIABLES: 3 | 4 | GO111MODULE = on 5 | MAEL_SQL_DRIVER = sqlite3 6 | MAEL_SQL_DSN = ./tmp/maelstrom.db?cache=shared&_journal_mode=MEMORY 7 | #MAEL_SQL_DRIVER = postgres 8 | #MAEL_SQL_DSN = postgres://postgres:test@localhost:5432/mael?sslmode=disable 9 | MAEL_PUBLIC_PORT = 8008 10 | 11 | BUILD_VER = 0.0.0 12 | BUILD_DATE := $(shell date +%FT%T%z) 13 | BUILD_GITSHA := $(shell git rev-parse --short HEAD) 14 | LD_FLAGS = -ldflags "-X main.version=$(BUILD_VER) -X main.builddate=$(BUILD_DATE) -X main.gitsha=$(BUILD_GITSHA)" 15 | 16 | test: 17 | scripts/gofmt_check.sh 18 | rm -f pkg/v1/test.db pkg/gateway/test.db 19 | go test -timeout 4m ./... 20 | errcheck -ignore 'fmt:[FS]?[Pp]rint*' ./... 21 | 22 | watch-test: 23 | find . -name *.go | entr -c make test 24 | 25 | cover: 26 | mkdir -p tmp 27 | go test -coverprofile=tmp/cover.out github.com/coopernurse/maelstrom/pkg/maelstrom \ 28 | github.com/coopernurse/maelstrom/pkg/maelstrom/component \ 29 | github.com/coopernurse/maelstrom/pkg/common 30 | go tool cover -html=tmp/cover.out 31 | 32 | maelctl: 33 | go build ${LD_FLAGS} -o dist/maelctl --tags "libsqlite3 linux" cmd/maelctl/*.go 34 | 35 | maelstromd: 36 | go build ${LD_FLAGS} -o dist/maelstromd --tags "libsqlite3 linux" cmd/maelstromd/*.go 37 | 38 | idl: 39 | barrister idl/maelstrom.idl | idl2go -i -p v1 -d pkg 40 | gofmt -w pkg/v1/*.go 41 | 42 | docker-image: 43 | docker build -t coopernurse/maelstrom . 
44 | 45 | docker-run-maelstromd: 46 | mkdir -p tmp 47 | docker run -d --name maelstromd -p 8374:8374 -p 8008:8008 -v `pwd`/tmp:/data --privileged \ 48 | -v /var/run/docker.sock:/var/run/docker.sock \ 49 | -e MAEL_SQL_DRIVER="sqlite3" \ 50 | -e MAEL_PUBLIC_PORT=8008 \ 51 | -e MAEL_SQL_DSN="/data/maelstrom.db?cache=shared&_journal_mode=MEMORY" \ 52 | coopernurse/maelstrom maelstromd 53 | 54 | docker-push-image: 55 | docker tag coopernurse/maelstrom docker.io/coopernurse/maelstrom 56 | docker push docker.io/coopernurse/maelstrom 57 | 58 | run-maelstromd: 59 | mkdir -p tmp 60 | ./dist/maelstromd & 61 | 62 | profile-maelstromd: 63 | mkdir -p tmp 64 | ./dist/maelstromd & 65 | 66 | copy-to-server: 67 | scp ./dist/maelstromd root@maelstromapp.com:/opt/web/sites/download.maelstromapp.com/latest/linux_x86_64/ 68 | scp ./dist/maelctl root@maelstromapp.com:/opt/web/sites/download.maelstromapp.com/latest/linux_x86_64/ 69 | 70 | copy-aws-scripts-to-server: 71 | scp ./cloud/aws/mael-init-node.sh root@maelstromapp.com:/opt/web/sites/download.maelstromapp.com/latest/ 72 | 73 | gitbook: 74 | cd docs/gitbook && gitbook build 75 | 76 | publish-web: 77 | make gitbook 78 | rm -rf docs/maelstromapp.com/docs/ 79 | cp -r docs/gitbook/_book docs/maelstromapp.com/docs 80 | rsync -avz docs/maelstromapp.com/ root@maelstromapp.com:/opt/web/sites/maelstromapp.com/ 81 | -------------------------------------------------------------------------------- /docs/gitbook/event_sources/cron.md: -------------------------------------------------------------------------------- 1 | 2 | # Cron 3 | 4 | A cron event source activates your component at a timed interval specified in the `schedule` field. 5 | When a cron event occurs, a HTTP request is made to your component on the path specified. 6 | 7 | This allows you to run periodic jobs without an additional scheduler. 8 | 9 | In a cluster with more than one node, a single node is responsible for triggering cron events at any 10 | time. 11 | 12 | The cron system is implementing using [robfig/cron](https://godoc.org/github.com/robfig/cron), so 13 | full documentation on the different schedule string formats is available in that package's documentation. 14 | 15 | This library is invoked with default options which enables the "standard" cron parser, which expects the 16 | date patterns to have 5 fields starting with minute. Quartz style cron rules (which support second granularity) 17 | are not currently supported. 18 | 19 | NOTE: Make sure to use quotes around the `schedule` value to avoid YAML parsing issues with `*` characters. 20 | 21 | ## Shorthand "crontab" support 22 | 23 | Since some projects have a large number of scheduled jobs, a more compact "crontab" format is 24 | supported in `maelstrom.yml` files. Each line is parsed and added to the event source list. 25 | You may combine the `crontab` field with standard `eventsources` fields. 
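
Whether a rule is declared under `eventsources` or in a `crontab` block, the schedule string is handled by the same standard parser described above. As a rough illustration of how that parser interprets schedules, here is a small sketch using the same robfig/cron library; it is purely illustrative — you do not write this code to use cron event sources.

```go
package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
)

func main() {
	// Standard 5-field spec: minute hour day-of-month month day-of-week
	sched, err := cron.ParseStandard("30 * * * *")
	if err != nil {
		panic(err)
	}
	fmt.Println("next run at minute 30 of the hour:", sched.Next(time.Now()))

	// Descriptor form, as used in Example 1 below
	every, err := cron.ParseStandard("@every 1h")
	if err != nil {
		panic(err)
	}
	fmt.Println("next hourly run:", every.Next(time.Now()))
}
```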
26 | 27 | Each cron entry created from the `crontab` block has these attributes: 28 | 29 | * Event source name: `-cron-` (zero based) 30 | * HTTP method: GET 31 | * No HTTP data or headers 32 | 33 | ## Examples 34 | 35 | ### Example 1: Run every hour - make GET request 36 | 37 | ```yaml 38 | components: 39 | mywebapp: 40 | image: coopernurse/go-hello-http 41 | eventsources: 42 | backup_db_hourly: 43 | cron: 44 | schedule: "@every 1h" 45 | http: 46 | method: GET 47 | path: /jobs/backup_db 48 | ``` 49 | 50 | ### Example 2: Make a POST request nightly with custom headers 51 | 52 | ```yaml 53 | components: 54 | mywebapp: 55 | image: coopernurse/go-hello-http 56 | eventsources: 57 | some_job_nightly: 58 | cron: 59 | schedule: "30 * * * *" 60 | http: 61 | method: POST 62 | path: /jobs/some_job 63 | data: '{"key": "value", "key2", "value2}' 64 | headers: 65 | - name: Content-Type 66 | value: application/json 67 | ``` 68 | 69 | ### Example 3: Define 3 cron rules via the crontab block 70 | 71 | ```yaml 72 | components: 73 | mywebapp: 74 | image: coopernurse/go-hello-http 75 | # Important: use a | (not >) to define the crontab multi-line block 76 | crontab: | 77 | # lines starting with hash are ignored 78 | @every 30m /cron/job1 79 | # second job: 80 | 30 * * * * /cron/job2 81 | * */2 * * * /cron/job3 82 | ``` -------------------------------------------------------------------------------- /docs/gitbook/how_works/architecture.md: -------------------------------------------------------------------------------- 1 | 2 | # Architecture 3 | 4 | ## The Basics 5 | 6 | * A maelstrom **node** is a machine running the `maelstromd` process 7 | * A maelstrom **cluster** is a collection of nodes that share a common database. 8 | * Nodes in a cluster should be able to connect to each other on their private port (8374 by default) 9 | * A maelstrom **component** is a name mapped to configuration that specifies how to start a docker container that 10 | runs a HTTP server. Requests for the component will be reverse proxied to that container on demand. 11 | * Nodes periodically write their current state to the database (default: once per minute) 12 | * All nodes are peers and should be identically configured. 13 | * Nodes acquire locks from the database before performing certain tasks. This ensures that only 14 | one node is performing critical operations such as autoscaling or cron triggering. 15 | * `maelctl` makes JSON-RPC requests to a maelstrom node. 16 | * The `MAELSTROM_PRIVATE_URL` env var tells `maelctl` where to connect. (default=`http://127.0.0.1:8374`) 17 | * New nodes that join the cluster will be automatically assigned components to run within 90 seconds of joining. 18 | 19 | ## Routing 20 | 21 | * All requests and responses in maelstrom are HTTP 22 | * If multiple instances of a component are running, requests are distributed across using round-robin scheduling. 23 | * Each node maintains a routing table and can route requests to its peers if the component is not running locally. 
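
To make the round-robin routing concrete, here is a minimal sketch of distributing requests across the known instances of a component. The `roundRobin` type and its fields are hypothetical and are not maelstrom's actual router implementation; the sketch only illustrates the selection strategy, using the node addresses from the diagram below.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// roundRobin illustrates round-robin selection across the endpoints
// currently running a component (hypothetical type, for illustration only).
type roundRobin struct {
	endpoints []string // e.g. peer node URLs or local container addresses
	next      uint64
}

func (r *roundRobin) pick() string {
	n := atomic.AddUint64(&r.next, 1)
	return r.endpoints[(n-1)%uint64(len(r.endpoints))]
}

func main() {
	rr := &roundRobin{endpoints: []string{"http://10.0.0.2:8374", "http://10.0.0.3:8374"}}
	for i := 0; i < 4; i++ {
		fmt.Println("route request to:", rr.pick())
	}
	// Output alternates between the two endpoints.
}
```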
24 | 25 | ``` 26 | +-----------------+ 27 | +------------------+ Load Balancer +-------------------+ 28 | | | (optional) | | 29 | | +--------+--------+ | 30 | | | | 31 | | | | 32 | | 80/443 | 80/443 | 80/443 33 | +--------v--------+ +--------v--------+ +---------v-------+ 34 | |node-a | |node-b | |node-c | 35 | | +---------+ +---------+ | 36 | |10.0.0.2 | 8374 |10.0.0.3 | 8374 |10.0.0.4 | 37 | +--------+--------+ +--------+--------+ +---------+-------+ 38 | | | | 39 | | | | 40 | | | | 41 | | +--------v--------+ | 42 | +----------------->+ MySQL +<------------------+ 43 | | or Postgres | 44 | +-----------------+ 45 | ``` -------------------------------------------------------------------------------- /docs/gitbook/event_sources/http.md: -------------------------------------------------------------------------------- 1 | 2 | # HTTP 3 | 4 | A HTTP event source activates your component when a request is received by `maelstromd` on the public HTTP port 5 | with a `Host` header that matches the `hostname` and `pathprefix` value. 6 | 7 | ## Fields 8 | 9 | | Field | Description | Default 10 | |--------------|----------------------------------------------------------------------------------------|------------- 11 | | hostname | Must match hostname on request (or Host header) | None 12 | | pathprefix | Must match the beginning of the request path | None 13 | | stripprefix | If true and pathprefix is set, pathprefix is removed from path before proxying request | false 14 | 15 | ## Rule sorting 16 | 17 | Rules are sorted from most to least specific. Rules without hostnames sort below all rules with hostnames. 18 | Rules with hostnames and paths sort above rules with only hostnames. Rules with both hostname and path are 19 | sorted by path length (longest first). Here's an example of a sorted list of rules. 20 | 21 | ``` 22 | hostname path 23 | ------------------------ 24 | foo.com /aaaa 25 | bar.com /aaa 26 | bar.com /aa 27 | foo.com /a 28 | foo.com 29 | bar.com 30 | /aaaa 31 | /aaa 32 | /a 33 | ``` 34 | 35 | ## Examples 36 | 37 | **Example 1: Match all requests to hello.example.org** 38 | 39 | ```yaml 40 | components: 41 | mywebapp: 42 | image: coopernurse/go-hello-http 43 | eventsources: 44 | # "myhttp_source" is the name of the event source 45 | # this name must be unique within a project yaml file 46 | myhttp_source: 47 | # http field indicates this is a HTTP event source 48 | http: 49 | # Either hostname or pathprefix must be provided 50 | # if both are provided, then request must match both 51 | hostname: hello.example.org 52 | ``` 53 | 54 | **Example 2: Match all requests to hello.example.org/foo/** 55 | 56 | This doesn't strip the prefix, so requests to: 57 | http://hello.example.org/foo/ goes to http://x.y.z:port/foo/ 58 | 59 | ```yaml 60 | components: 61 | mywebapp: 62 | image: coopernurse/go-hello-http 63 | eventsources: 64 | myhttp_source: 65 | http: 66 | hostname: hello.example.org 67 | pathprefix: /foo/ 68 | ``` 69 | 70 | **Example 3: Same as 2 but strip prefix** 71 | 72 | http://hello.example.org/foo/ goes to http://x.y.z:port/ 73 | 74 | ```yaml 75 | components: 76 | mywebapp: 77 | image: coopernurse/go-hello-http 78 | eventsources: 79 | myhttp_source: 80 | http: 81 | hostname: hello.example.org 82 | pathprefix: /foo/ 83 | stripprefix: true 84 | ``` 85 | 86 | **Example 4: Match all requests to /someapi** 87 | 88 | Be careful with this as it will match any hostname. 
89 | 90 | ```yaml 91 | components: 92 | mywebapp: 93 | image: coopernurse/go-hello-http 94 | eventsources: 95 | myhttp_source: 96 | http: 97 | pathprefix: /someapi 98 | ``` -------------------------------------------------------------------------------- /design/overview.md: -------------------------------------------------------------------------------- 1 | 2 | # Maelstrom Design 3 | 4 | ## Overview 5 | 6 | ``` 7 | +----------------------+ +--------------------+ +--------------------------+ 8 | | | | | | | 9 | | http-event-source | | sqs-event-source | | scheduled-event-source | 10 | | | | | | | 11 | +----------+-----------+ +---------+----------+ +------------+-------------+ 12 | | | | 13 | | events activate | | 14 | | a component | | 15 | | | | 16 | | | | 17 | | +-------v-------+ | 18 | +--------------------> <----------------------+ 19 | | component | 20 | | +---------------+ 21 | +---------------+ | 22 | | container is an 23 | | instance of a component 24 | | 25 | +----------+ +------+--------+ 26 | | | | | 27 | | node +----------+ container | 28 | | | | | 29 | +----------+ +---------------+ 30 | 31 | nodes start/stop containers 32 | nodes proxy http requests to containers 33 | a collection of nodes forms a cluster 34 | ``` 35 | 36 | | Entity | Description 37 | | ----------------------- | ---------------- 38 | | component | HTTP service or function 39 | | http-event-source | Config specifying HTTP hostname/path to reverse proxy to component 40 | | scheduled-event-source | Cron-like scheduling rule that triggers a component 41 | | sqs-event-source | SQS queue(s) to poll to trigger a component 42 | | node | Machine instance running maelstromd 43 | | container | Instance of component running on a node 44 | 45 | ## CLI commands 46 | 47 | `maelctl` command list 48 | 49 | | Command | Description 50 | | ----------------------- | ---------------- 51 | | comp put | Registers a component by name, replacing the previous version with the same name 52 | | comp ls | Lists all registered components 53 | | comp rm | Removes a component. This will also remove any related event sources. 54 | | es put | Registers an event source for a component 55 | | es ls | Lists event sources 56 | | es rm | Removes an event source 57 | | cfg put | ?? is config a file? a single name/value? 58 | | node ls | Lists all nodes in the cluster 59 | -------------------------------------------------------------------------------- /docs/gitbook/event_sources/aws_stepfunc.md: -------------------------------------------------------------------------------- 1 | 2 | # AWS Step Functions 3 | 4 | A step function event source tells `maelstromd` to poll AWS for [activity tasks](https://docs.aws.amazon.com/step-functions/latest/dg/concepts-activities.html) using the `GetActivityTask` API. If a task is received, the related 5 | component is activated and the task input is sent to the component via HTTP POST. The task input is sent as the 6 | POST body. If the component responds with a 200 status code, `SendTaskSuccess` is called using the response body as 7 | the `output` field. If a non-200 response is returned, or if the request times out, `SendTaskFailure` is called. 8 | 9 | ## Errors 10 | 11 | If a non-200 response is returned, `SendTaskFailure` is called. You may optionally 12 | set response HTTP headers for the "cause" and "error" to be sent back to AWS. These values will be 13 | displayed in the step function UI. 
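
For example, a component's task endpoint might report a failure like this. This is a hedged sketch — the handler path, port, and error strings are made up — but the `step-func-error` / `step-func-cause` response headers and the 200/non-200 behavior are the ones described above.

```go
package main

import "net/http"

func main() {
	// Hypothetical task endpoint for a step function activity.
	http.HandleFunc("/step/sum", func(w http.ResponseWriter, r *http.Request) {
		output, err := doWork(r)
		if err != nil {
			// Non-200 response => maelstromd calls SendTaskFailure,
			// using these headers as the error and cause fields.
			w.Header().Set("step-func-error", "SumFailed")
			w.Header().Set("step-func-cause", err.Error())
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		// 200 response => maelstromd calls SendTaskSuccess with this body as the output.
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write(output)
	})
	panic(http.ListenAndServe(":8080", nil))
}

// doWork is a placeholder for the component's actual task logic.
func doWork(r *http.Request) ([]byte, error) {
	return []byte(`{"sum": 42}`), nil
}
```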
14 | 15 | See the [SendTaskFailure docs](https://docs.aws.amazon.com/step-functions/latest/apireference/API_SendTaskFailure.html) for more info on these fields. 16 | 17 | `taskToken` is automatically set on the `SendTaskFailure` call and cannot be 18 | overridden. 19 | 20 | | HTTP Header | SendTaskFailure Field | Max Length 21 | |------------------|--------------------------|---------------------- 22 | | step-func-error | error | 256 23 | | step-func-cause | cause | 32768 24 | 25 | ## Activity creation 26 | 27 | `maelstromd` will call `CreateActivity` to resolve the ARN associated with the activity name, but you must 28 | create the state machine and reference the appropriate ARNs in the activity states. 29 | 30 | Activity state ARNs have the following naming convention: 31 | 32 | `arn:aws:states:::activity:` 33 | 34 | ## Concurrency 35 | 36 | You can control the number of tasks `maelstromd` will dequeue at a time per activity. 37 | For example, if you want your system to process no more than 10 tasks concurrently, set 38 | `maxconcurrency: 10` on the step function event source. 39 | 40 | This is different than the `maxconcurrency` setting on the component. The component setting specifies max concurrency 41 | per instance of a component across all event sources, where the step function setting specifies the max concurrency 42 | across ALL instances of the component that will originate from this event source. 43 | 44 | ## Properties 45 | 46 | | Property | Description | Default 47 | |----------------------|-------------------------------------------------------------------------------|----------------- 48 | | activityname | Name of step function activity to poll | None (required) 49 | | path | Request path to POST to on component | None (required) 50 | | maxconcurrency | Max tasks to process at a time across all maelstromd instances | 1 51 | | concurrencyperpoller | Concurrent messages to process per polling process | 1 52 | 53 | ## Example 54 | 55 | In the example below we register two activities: `split` and `sum`. `split` uses the default concurrency settings, so 56 | it will only process a single task at a time across the cluster. `sum` specifies higher concurrency limits and will 57 | process up to 10 tasks concurrently (5 per poller, for a max of 2 pollers). 58 | 59 | ```yaml 60 | components: 61 | mywebapp: 62 | image: example/myapp 63 | eventsources: 64 | split: 65 | awsstepfunc: 66 | activityname: split 67 | path: /step/split 68 | sum: 69 | awsstepfunc: 70 | activityname: sum 71 | path: /step/sum 72 | maxconcurrency: 10 73 | concurrencyperpoller: 5 74 | ``` 75 | -------------------------------------------------------------------------------- /docs/gitbook/event_sources/aws_sqs.md: -------------------------------------------------------------------------------- 1 | 2 | # AWS SQS 3 | 4 | A SQS event source tells `maelstromd` to poll a SQS queue for messages. If a message is received, the related 5 | component is activated and the message body is sent to the component via HTTP POST. The message body is sent as the 6 | POST body. If the component responds with a 200 status code, the SQS message is deleted from the queue. Otherwise 7 | the message will stay in the queue and become eligible for redelivery when the visibility timeout elapses. 8 | 9 | While SQS has no native notion of priority queueing, this event source optionally supports polling a set of queues 10 | that share a common name prefix. 
For example, given an event source configured with `nameasprefix: true` and
11 | a set of queues named:
12 |
13 | ```
14 | resize-image-0
15 | resize-image-1
16 | resize-image-2
17 | ```
18 |
19 | `maelstromd` will poll the queues in name order, draining the "0" queue completely, then "1", then "2".
20 |
21 | While draining lower priority queues, `maelstromd` will periodically reset and poll from the head of the list
22 | in order to re-check the higher priority queues.
23 |
24 | ## Concurrency
25 |
26 | You can control the number of messages `maelstromd` will dequeue at a time. For example, if you want your system
27 | to process no more than 10 messages concurrently, set `maxconcurrency: 10` on the SQS event source.
28 |
29 | This is different than the `maxconcurrency` setting on the component. The component setting specifies max concurrency
30 | per instance of a component across all event sources, where the SQS setting specifies the max concurrency across ALL
31 | instances of the component that will originate from this event source.
32 |
33 | If you are using a SQS FIFO queue you may also wish to set `concurrencyperpoller: 1` to ensure that messages are
34 | processed in the strict order they are dequeued.
35 |
36 | The max number of pollers = `ceil(sqs-maxconcurrency / concurrencyperpoller)`. For example, `maxconcurrency: 50` with `concurrencyperpoller: 10` (as in Example 2 below) yields at most 5 pollers.
37 |
38 | ## Back pressure
39 |
40 | If you return a HTTP header `pause-seconds`, `maelstromd` will sleep for that number of seconds before polling SQS again.
41 | The header value must be an integer (e.g.: `pause-seconds: 30`).
42 |
43 | The poller will pause once for the specified number of seconds, then clear this value.
44 |
45 | ## Properties
46 |
47 | | Property             | Description                                                                     | Default
48 | |----------------------|-------------------------------------------------------------------------------|-----------------
49 | | queuename            | Name of queue (or queue prefix) to poll                                        | None (required)
50 | | nameasprefix         | If true, treat queuename as prefix. Poll all queues starting with that name    | false
51 | | path                 | Request path to POST to on component                                           | None (required)
52 | | maxconcurrency       | Max concurrent messages to process via SQS across all instances                | 10
53 | | messagesperpoll      | Messages to receive per polling attempt (1..10)                                | 1
54 | | concurrencyperpoller | Concurrent messages to process per polling process                             | messagesperpoll
55 | | visibilitytimeout    | Seconds to mark message invisible before it is eligible to dequeue again       | 300
56 |
57 |
58 | ## Examples
59 |
60 | ### Example 1: Poll a single queue and process up to 15 messages concurrently
61 |
62 | ```yaml
63 | components:
64 |   mywebapp:
65 |     image: example/myapp
66 |     eventsources:
67 |       process_order_sqs:
68 |         sqs:
69 |           queuename: pending-orders
70 |           path: /queues/pendingorder
71 |           maxconcurrency: 15
72 |           visibilitytimeout: 600
73 | ```
74 |
75 | ### Example 2: Poll a set of queues with a name prefix. Pull 10 messages at a time per poller.
76 | 77 | ```yaml 78 | components: 79 | mywebapp: 80 | image: example/myapp 81 | eventsources: 82 | process_order_sqs: 83 | sqs: 84 | queuename: resize-image- 85 | nameasprefix: true 86 | path: /queues/resizeimage 87 | maxconcurrency: 50 88 | messagesperpoll: 10 89 | concurrencyperpoller: 10 90 | ``` 91 | -------------------------------------------------------------------------------- /pkg/maelstrom/gateway.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/coopernurse/maelstrom/pkg/db" 10 | "github.com/coopernurse/maelstrom/pkg/revproxy" 11 | "github.com/coopernurse/maelstrom/pkg/router" 12 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 13 | log "github.com/mgutz/logxi/v1" 14 | ) 15 | 16 | func NewGateway(r ComponentResolver, routerReg *router.Registry, public bool, 17 | myIpAddr string) *Gateway { 18 | return &Gateway{ 19 | compResolver: r, 20 | routerReg: routerReg, 21 | public: public, 22 | myIpAddr: myIpAddr, 23 | } 24 | } 25 | 26 | type Gateway struct { 27 | compResolver ComponentResolver 28 | routerReg *router.Registry 29 | public bool 30 | myIpAddr string 31 | } 32 | 33 | func (g *Gateway) ServeHTTP(rw http.ResponseWriter, req *http.Request) { 34 | // Handle health check URL 35 | if req.RequestURI == "/_mael_health_check" { 36 | respondText(rw, http.StatusOK, "OK") 37 | return 38 | } 39 | 40 | // Resolve Component based on hostname/path 41 | comp, err := g.compResolver.ByHTTPRequest(req, g.public) 42 | if err != nil { 43 | if err == db.NotFound { 44 | respondText(rw, http.StatusNotFound, "No component matches the request") 45 | } else { 46 | log.Error("gateway: compResolver.ByHTTPRequest", "err", err) 47 | respondText(rw, http.StatusInternalServerError, "Server Error") 48 | } 49 | return 50 | } 51 | 52 | // Set X-Forwarded-For header 53 | xForward := req.Header.Get("X-Forwarded-For") 54 | if xForward != "" { 55 | xForward += ", " 56 | } 57 | xForward += g.myIpAddr 58 | req.Header.Set("X-Forwarded-For", xForward) 59 | 60 | g.Route(rw, req, &comp, g.public) 61 | } 62 | 63 | func (g *Gateway) Route(rw http.ResponseWriter, req *http.Request, comp *v1.Component, publicGateway bool) { 64 | // Set Deadline 65 | var reqStartNano int64 66 | var deadlineNano int64 67 | if !publicGateway { 68 | deadlineStr := req.Header.Get("MAELSTROM-DEADLINE-NANO") 69 | if deadlineStr != "" { 70 | deadlineNano, _ = strconv.ParseInt(deadlineStr, 10, 64) 71 | } 72 | reqStartStr := req.Header.Get("MAELSTROM-START-NANO") 73 | if reqStartStr != "" { 74 | reqStartNano, _ = strconv.ParseInt(reqStartStr, 10, 64) 75 | } 76 | } 77 | 78 | deadline := componentReqDeadline(deadlineNano, comp) 79 | ctx, ctxCancel := context.WithDeadline(context.Background(), deadline) 80 | defer ctxCancel() 81 | 82 | if req.Header.Get("MAELSTROM-DEADLINE-NANO") == "" { 83 | req.Header.Set("MAELSTROM-DEADLINE-NANO", strconv.FormatInt(deadline.UnixNano(), 10)) 84 | } 85 | req = req.WithContext(ctx) 86 | 87 | // Send request to dispatcher 88 | preferLocal := req.Header.Get("MAELSTROM-RELAY-PATH") != "" 89 | compReq := revproxy.NewRequest(req, rw, comp, preferLocal) 90 | if reqStartNano != 0 && reqStartNano < compReq.StartTime.UnixNano() { 91 | compReq.StartTime = time.Unix(0, reqStartNano) 92 | } 93 | if req.Header.Get("MAELSTROM-START-NANO") == "" { 94 | req.Header.Set("MAELSTROM-START-NANO", strconv.FormatInt(compReq.StartTime.UnixNano(), 10)) 95 | } 96 | 
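	// Hand the request to this component's router. If no container is running yet, the request is
	// queued until placement starts one somewhere in the cluster (see the placement flow in
	// docs/gitbook/getting_started/what_happened.md); completion is signaled on compReq.Done,
	// which the select below waits on.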
g.routerReg.ByComponent(comp.Name).Route(ctx, compReq) 97 | 98 | // Block on result, or timeout 99 | select { 100 | case <-compReq.Done: 101 | return 102 | case <-ctx.Done(): 103 | msg := "gateway: Timeout proxying component: " + comp.Name 104 | log.Warn(msg, "component", comp.Name, "version", comp.Version) 105 | respondText(rw, http.StatusGatewayTimeout, msg) 106 | return 107 | } 108 | } 109 | 110 | func componentReqDeadline(deadlineNanoFromHeader int64, comp *v1.Component) time.Time { 111 | if deadlineNanoFromHeader == 0 { 112 | maxDur := comp.MaxDurationSeconds 113 | if maxDur <= 0 { 114 | maxDur = 300 115 | } 116 | startTime := time.Now() 117 | return startTime.Add(time.Duration(maxDur) * time.Second) 118 | } else { 119 | return time.Unix(0, deadlineNanoFromHeader) 120 | } 121 | } 122 | 123 | func respondText(rw http.ResponseWriter, statusCode int, body string) { 124 | rw.Header().Add("content-type", "text/plain") 125 | rw.WriteHeader(statusCode) 126 | _, err := rw.Write([]byte(body)) 127 | if err != nil { 128 | log.Warn("gateway: respondText.Write error", "err", err.Error()) 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /pkg/maelstrom/sort.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "github.com/coopernurse/maelstrom/pkg/v1" 5 | "time" 6 | ) 7 | 8 | type nameValueByName []v1.NameValue 9 | 10 | func (s nameValueByName) Len() int { return len(s) } 11 | func (s nameValueByName) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 12 | func (s nameValueByName) Less(i, j int) bool { return s[i].Name < s[j].Name } 13 | 14 | type componentWithEventSourcesByName []v1.ComponentWithEventSources 15 | 16 | func (s componentWithEventSourcesByName) Len() int { return len(s) } 17 | func (s componentWithEventSourcesByName) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 18 | func (s componentWithEventSourcesByName) Less(i, j int) bool { 19 | return s[i].Component.Name < s[j].Component.Name 20 | } 21 | 22 | type NodeStatusByStartedAt []v1.NodeStatus 23 | 24 | func (s NodeStatusByStartedAt) Len() int { return len(s) } 25 | func (s NodeStatusByStartedAt) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 26 | func (s NodeStatusByStartedAt) Less(i, j int) bool { return s[i].StartedAt < s[j].StartedAt } 27 | 28 | type NodeStatusByEmptyThenLoadAvg []v1.NodeStatus 29 | 30 | func (s NodeStatusByEmptyThenLoadAvg) Len() int { return len(s) } 31 | func (s NodeStatusByEmptyThenLoadAvg) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 32 | func (s NodeStatusByEmptyThenLoadAvg) Less(i, j int) bool { 33 | if len(s[i].RunningComponents) == 0 && len(s[j].RunningComponents) > 0 { 34 | return true 35 | } 36 | if len(s[j].RunningComponents) == 0 && len(s[i].RunningComponents) > 0 { 37 | return false 38 | } 39 | return s[i].LoadAvg1m < s[j].LoadAvg1m 40 | } 41 | 42 | type ComponentInfoByRunningCountAndReqTime struct { 43 | Components []v1.ComponentInfo 44 | InstanceCounts map[string]int 45 | } 46 | 47 | func (s ComponentInfoByRunningCountAndReqTime) Len() int { return len(s.Components) } 48 | func (s ComponentInfoByRunningCountAndReqTime) Swap(i, j int) { 49 | s.Components[i], s.Components[j] = s.Components[j], s.Components[i] 50 | } 51 | func (s ComponentInfoByRunningCountAndReqTime) Less(i, j int) bool { 52 | iCount := s.InstanceCounts[s.Components[i].ComponentName] 53 | jCount := s.InstanceCounts[s.Components[j].ComponentName] 54 | 55 | if iCount > jCount { 56 | return true 57 | } 58 | if jCount > iCount { 59 
| return false 60 | } 61 | return s.Components[i].LastRequestTime > s.Components[j].LastRequestTime 62 | } 63 | 64 | type ComponentTargetByCompName []v1.ComponentTarget 65 | 66 | func (s ComponentTargetByCompName) Len() int { return len(s) } 67 | func (s ComponentTargetByCompName) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 68 | func (s ComponentTargetByCompName) Less(i, j int) bool { 69 | return s[i].ComponentName < s[j].ComponentName 70 | } 71 | 72 | type projectInfoByName []v1.ProjectInfo 73 | 74 | func (s projectInfoByName) Len() int { return len(s) } 75 | func (s projectInfoByName) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 76 | func (s projectInfoByName) Less(i, j int) bool { return s[i].ProjectName < s[j].ProjectName } 77 | 78 | type httpEventSourcesForResolver []v1.EventSource 79 | 80 | func (s httpEventSourcesForResolver) Len() int { return len(s) } 81 | func (s httpEventSourcesForResolver) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 82 | func (s httpEventSourcesForResolver) Less(i, j int) bool { 83 | // want most specific to least specific 84 | if s[i].Http != nil && s[j].Http != nil { 85 | // if hostname empty, sort to bottom 86 | if s[i].Http.Hostname == "" && s[j].Http.Hostname != "" { 87 | return false 88 | } 89 | if s[j].Http.Hostname == "" && s[i].Http.Hostname != "" { 90 | return true 91 | } 92 | 93 | // if paths are same length, sort by hostname desc 94 | if len(s[i].Http.PathPrefix) == len(s[j].Http.PathPrefix) { 95 | return s[i].Http.Hostname > s[j].Http.Hostname 96 | } 97 | // sort by path length descending so most specific paths are considered first 98 | return len(s[i].Http.PathPrefix) > len(s[j].Http.PathPrefix) 99 | } 100 | return s[i].Name < s[j].Name 101 | } 102 | 103 | type DurationAscend []time.Duration 104 | 105 | func (s DurationAscend) Len() int { return len(s) } 106 | func (s DurationAscend) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 107 | func (s DurationAscend) Less(i, j int) bool { return s[i] < s[j] } 108 | -------------------------------------------------------------------------------- /pkg/maelstrom/integration_test.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestHandlerStartsContainerOnFirstRequest(t *testing.T) { 10 | wrapTest(t, func() { 11 | GivenNoMaelstromContainers(t). 12 | WhenHTTPRequestReceived(). 13 | ThenContainerIsStarted() 14 | }) 15 | } 16 | 17 | func TestRemovesExistingContainersAtStartup(t *testing.T) { 18 | wrapTest(t, func() { 19 | GivenExistingContainer(t). 20 | WhenSystemIsStarted(). 21 | ThenContainerIsStopped() 22 | }) 23 | } 24 | 25 | func TestHealthCheckStopsContainerOnFailure(t *testing.T) { 26 | wrapTest(t, func() { 27 | GivenExistingContainerWithBadHealthCheckPath(t). 28 | WhenHealthCheckTimeoutElapses(). 29 | ThenContainerIsStopped() 30 | }) 31 | } 32 | 33 | func TestHealthCheckKeepsContainerOnSuccess(t *testing.T) { 34 | wrapTest(t, func() { 35 | GivenExistingContainer(t). 36 | WhenHTTPRequestReceived(). 37 | WhenHealthCheckTimeoutElapses(). 38 | ThenContainerIsStarted() 39 | }) 40 | } 41 | 42 | func TestStopDrainsRequestsBeforeStoppingContainers(t *testing.T) { 43 | wrapTest(t, func() { 44 | GivenExistingContainer(t). 45 | WhenContainerIsHealthy(). 46 | WhenNLongRunningRequestsMade(5). 47 | WhenStopRequestReceived(). 48 | ThenContainerIsStopped(). 
49 | ThenSuccessfulRequestCountEquals(5) 50 | }) 51 | } 52 | 53 | func TestRestartsContainerIfRequestArrivesAfterStopping(t *testing.T) { 54 | wrapTest(t, func() { 55 | GivenExistingContainer(t). 56 | WhenStopRequestReceived(). 57 | ThenContainerIsStopped(). 58 | WhenHTTPRequestReceived(). 59 | ThenContainerIsStarted() 60 | }) 61 | } 62 | 63 | func TestStopsContainerIfIdle(t *testing.T) { 64 | wrapTest(t, func() { 65 | GivenExistingContainerWithIdleTimeout(t, 1). 66 | WhenIdleTimeoutElapses(). 67 | WhenAutoscaleRuns(). 68 | ThenContainerIsStopped() 69 | }) 70 | } 71 | 72 | func TestRestartsContainerWhenComponentUpdated(t *testing.T) { 73 | wrapTest(t, func() { 74 | GivenExistingContainer(t). 75 | WhenHTTPRequestReceived(). 76 | ThenContainerIsStarted(). 77 | WhenComponentIsUpdated(). 78 | ThenContainerIsStartedWithNewVersionWithin(10 * time.Second) 79 | }) 80 | } 81 | 82 | //func TestOptionallyStartsThenStopsWhenComponentUpdated(t *testing.T) { 83 | // wrapTest(t, func() { 84 | // GivenExistingContainerWith(t, func(c *v1.Component) { 85 | // c.RestartOrder = v1.RestartOrderStartstop 86 | // }). 87 | // WhenComponentIsUpdated(). 88 | // WhenHTTPRequestReceived(). 89 | // ThenContainerIsStartedBeforeTheOlderContainerIsStopped() 90 | // }) 91 | //} 92 | 93 | func TestOptionallyStopsThenStartsWhenComponentUpdated(t *testing.T) { 94 | wrapTest(t, func() { 95 | GivenExistingContainerWith(t, func(c *v1.Component) { 96 | c.RestartOrder = v1.RestartOrderStopstart 97 | }). 98 | WhenComponentIsUpdated(). 99 | WhenHTTPRequestReceived(). 100 | ThenContainerIsStoppedBeforeTheOlderContainerIsStarted() 101 | }) 102 | } 103 | 104 | //func TestOptionallyLockWhenComponentUpdated(t *testing.T) { 105 | // wrapTest(t, func() { 106 | // GivenExistingContainerWith(t, func(c *v1.Component) { 107 | // c.StartParallelism = v1.StartParallelismSeriesfirst 108 | // }). 109 | // WhenAnotherInstanceIsStarted(). 110 | // AndDockerEventsAreReset(). 111 | // WhenComponentIsUpdated(). 112 | // WhenHTTPRequestReceived(). 113 | // AndTimePasses(2 * time.Second). 114 | // ThenContainersAreRestartedInSeries() 115 | // }) 116 | //} 117 | 118 | func TestRoutesRequestsToOldComponentDuringUpdates(t *testing.T) { 119 | wrapTest(t, func() { 120 | GivenExistingContainerWith(t, func(c *v1.Component) { 121 | c.RestartOrder = v1.RestartOrderStartstop 122 | }). 123 | WhenHTTPRequestReceived(). 124 | ThenContainerIsStarted(). 125 | WhenHTTPRequestsAreMadeContinuously(). 126 | WhenComponentIsUpdated(). 127 | AndTimePasses(2 * time.Second). 128 | ThenAllHTTPRequestsCompletedWithoutDelay() 129 | }) 130 | } 131 | 132 | func TestCronStartsContainerWhenTriggered(t *testing.T) { 133 | wrapTest(t, func() { 134 | GivenNoMaelstromContainers(t). 135 | WhenCronEventSourceRegistered("* * * * * *"). 136 | WhenCronServiceStartedWithSeconds(). 
137 | ThenContainerStartsWithin(15 * time.Second) 138 | }) 139 | } 140 | -------------------------------------------------------------------------------- /pkg/cert/letsencrypt.go: -------------------------------------------------------------------------------- 1 | package cert 2 | 3 | import ( 4 | "crypto/tls" 5 | "fmt" 6 | log "github.com/mgutz/logxi/v1" 7 | "github.com/mholt/certmagic" 8 | "net" 9 | "net/http" 10 | "time" 11 | ) 12 | 13 | type CertMagicOptions struct { 14 | Email string 15 | } 16 | 17 | func NewCertMagicWrapper(options CertMagicOptions) *CertMagicWrapper { 18 | cache := certmagic.NewCache(certmagic.CacheOptions{ 19 | GetConfigForCert: func(cert certmagic.Certificate) (certmagic.Config, error) { 20 | return certmagic.Config{}, nil 21 | }, 22 | }) 23 | 24 | magic := certmagic.New(cache, certmagic.Config{ 25 | Email: options.Email, 26 | Agreed: true, 27 | }) 28 | 29 | hostCh := make(chan string) 30 | go manageHosts(hostCh, magic) 31 | 32 | wrapper := &CertMagicWrapper{ 33 | magic: magic, 34 | hosts: make(map[string]bool), 35 | hostCh: hostCh, 36 | } 37 | wrapper.initOnDemand() 38 | return wrapper 39 | } 40 | 41 | type CertMagicWrapper struct { 42 | magic *certmagic.Config 43 | hosts map[string]bool 44 | hostCh chan string 45 | } 46 | 47 | func (c *CertMagicWrapper) initOnDemand() { 48 | onDemand := c.magic.OnDemand 49 | if onDemand == nil { 50 | c.magic.OnDemand = &certmagic.OnDemandConfig{ 51 | DecisionFunc: func(name string) error { 52 | c.AddHost(name) 53 | return nil 54 | }, 55 | } 56 | } else { 57 | decisionFunc := onDemand.DecisionFunc 58 | onDemand.DecisionFunc = func(name string) error { 59 | c.AddHost(name) 60 | return decisionFunc(name) 61 | } 62 | } 63 | } 64 | 65 | func (c *CertMagicWrapper) Start(mux http.Handler, httpPort int, httpsPort int) ([]*http.Server, error) { 66 | httpListener, err := net.Listen("tcp", fmt.Sprintf(":%d", httpPort)) 67 | if err != nil { 68 | return nil, fmt.Errorf("letsencrypt: unable to start HTTP listener: %v", err) 69 | } 70 | httpsListener, err := tls.Listen("tcp", fmt.Sprintf(":%d", httpsPort), c.magic.TLSConfig()) 71 | if err != nil { 72 | return nil, fmt.Errorf("letsencrypt: unable to start HTTPS listener: %v", err) 73 | } 74 | 75 | httpServer := &http.Server{ 76 | ReadHeaderTimeout: 5 * time.Second, 77 | ReadTimeout: 5 * time.Second, 78 | WriteTimeout: 5 * time.Second, 79 | IdleTimeout: 5 * time.Second, 80 | Handler: c.magic.HTTPChallengeHandler(http.HandlerFunc(c.httpRedirectHandler)), 81 | } 82 | httpsServer := &http.Server{ 83 | ReadHeaderTimeout: 10 * time.Second, 84 | ReadTimeout: 30 * time.Second, 85 | WriteTimeout: 2 * time.Minute, 86 | IdleTimeout: 5 * time.Minute, 87 | Handler: mux, 88 | } 89 | 90 | go logErr("httpServer.Serve", httpServer.Serve(httpListener)) 91 | go logErr("httpsServer.Serve", httpsServer.Serve(httpsListener)) 92 | 93 | return []*http.Server{httpServer, httpsServer}, nil 94 | } 95 | 96 | func (c *CertMagicWrapper) AddHost(host string) { 97 | go func() { 98 | c.hostCh <- host 99 | }() 100 | } 101 | 102 | // adapted from Matt Holt's certmagic: 103 | // https://github.com/mholt/certmagic/blob/master/certmagic.go#L141 104 | func (c *CertMagicWrapper) httpRedirectHandler(w http.ResponseWriter, r *http.Request) { 105 | toURL := "https://" 106 | 107 | // since we redirect to the standard HTTPS port, we 108 | // do not need to include it in the redirect URL 109 | requestHost, _, err := net.SplitHostPort(r.Host) 110 | if err != nil { 111 | requestHost = r.Host // host probably did not contain a port 112 | } 
113 | 
114 |     // make sure certmagic knows about this domain
115 |     c.AddHost(requestHost)
116 | 
117 |     toURL += requestHost
118 |     toURL += r.URL.RequestURI()
119 | 
120 |     // get rid of this disgusting unencrypted HTTP connection
121 |     w.Header().Set("Connection", "close")
122 | 
123 |     http.Redirect(w, r, toURL, http.StatusMovedPermanently)
124 | }
125 | 
126 | func manageHosts(hostCh <-chan string, magic *certmagic.Config) {
127 |     hosts := make(map[string]bool, 0)
128 | 
129 |     for h := range hostCh {
130 |         _, ok := hosts[h]
131 |         if !ok {
132 |             hosts[h] = true
133 |             hostSlice := make([]string, len(hosts))
134 |             i := 0
135 |             for h, _ := range hosts {
136 |                 hostSlice[i] = h
137 |                 i++
138 |             }
139 |             err := magic.Manage(hostSlice)
140 |             if err != nil {
141 |                 log.Error("cert: error in magic.Manage", "err", err, "hosts", hostSlice)
142 |             }
143 |         }
144 |     }
145 | }
146 | 
147 | func logErr(msg string, err error) {
148 |     if err != nil {
149 |         log.Error(msg, "err", err)
150 |     }
151 | }
152 | 
-------------------------------------------------------------------------------- /pkg/maelstrom/pruner.go: --------------------------------------------------------------------------------
 1 | package maelstrom
 2 | 
 3 | import (
 4 |     "context"
 5 |     "github.com/coopernurse/maelstrom/pkg/common"
 6 |     "github.com/coopernurse/maelstrom/pkg/db"
 7 |     v1 "github.com/coopernurse/maelstrom/pkg/v1"
 8 |     "github.com/docker/docker/api/types"
 9 |     "github.com/docker/docker/api/types/filters"
10 |     docker "github.com/docker/docker/client"
11 |     "github.com/docker/go-units"
12 |     log "github.com/mgutz/logxi/v1"
13 |     "strings"
14 |     "sync"
15 |     "time"
16 | )
17 | 
18 | func NewDockerPruner(dockerClient *docker.Client, db db.Db, ctx context.Context,
19 |     pruneUnregistered bool, pruneKeep []string) *DockerPruner {
20 |     return &DockerPruner{
21 |         dockerClient:      dockerClient,
22 |         db:                db,
23 |         ctx:               ctx,
24 |         pruneUnregistered: pruneUnregistered,
25 |         pruneKeep:         pruneKeep,
26 |     }
27 | }
28 | 
29 | type DockerPruner struct {
30 |     dockerClient      *docker.Client
31 |     db                db.Db
32 |     ctx               context.Context
33 |     pruneUnregistered bool
34 |     pruneKeep         []string
35 | }
36 | 
37 | func (d *DockerPruner) Run(interval time.Duration, wg *sync.WaitGroup) {
38 |     log.Info("maelstrom: docker pruner loop starting", "interval", interval.String())
39 |     defer wg.Done()
40 |     ticker := time.Tick(interval)
41 |     for {
42 |         select {
43 |         case <-ticker:
44 |             d.runOnce()
45 |         case <-d.ctx.Done():
46 |             log.Info("maelstrom: docker pruner loop shutdown gracefully")
47 |             return
48 |         }
49 |     }
50 | }
51 | 
52 | func (d *DockerPruner) runOnce() {
53 |     if d.pruneUnregistered {
54 |         toDelete, err := d.imageIdsToDelete()
55 |         if err == nil {
56 |             // Delete images not in use and not registered against a component
57 |             for _, id := range toDelete {
58 |                 _, err = d.dockerClient.ImageRemove(d.ctx, id, types.ImageRemoveOptions{})
59 |                 if err != nil {
60 |                     log.Error("maelstrom: unable to remove image", "image", id, "err", err)
61 |                 } else {
62 |                     log.Info("maelstrom: pruner removed image", "image", id)
63 |                 }
64 |             }
65 |         } else {
66 |             log.Error("maelstrom: unable to load image ids to delete", "err", err)
67 |         }
68 |     }
69 | 
70 |     // Docker prune containers and images
71 |     _, err := d.dockerClient.ContainersPrune(d.ctx, filters.Args{})
72 |     if err != nil {
73 |         log.Error("maelstrom: unable to prune containers", "err", err)
74 |     }
75 |     imagePruneReport, err := d.dockerClient.ImagesPrune(d.ctx, filters.Args{})
76 |     if err != nil {
77 |         log.Error("maelstrom: unable to prune images", "err", err)
78 |     } else {
79 | 
log.Info("maelstrom: pruner reclaimed space", "size", 80 | units.HumanSize(float64(imagePruneReport.SpaceReclaimed))) 81 | } 82 | } 83 | 84 | func (d *DockerPruner) imageIdsToDelete() ([]string, error) { 85 | imageNames, err := d.componentImages() 86 | if err != nil { 87 | return nil, err 88 | } 89 | imageNameMap := make(map[string]bool) 90 | for _, name := range imageNames { 91 | imageNameMap[name] = true 92 | } 93 | 94 | containers, err := d.dockerClient.ContainerList(d.ctx, types.ContainerListOptions{ 95 | All: false, 96 | }) 97 | if err != nil { 98 | return nil, err 99 | } 100 | 101 | runningImageIds := make(map[string]bool) 102 | for _, cont := range containers { 103 | runningImageIds[cont.ImageID] = true 104 | } 105 | 106 | toDelete := make([]string, 0) 107 | images, err := d.dockerClient.ImageList(d.ctx, types.ImageListOptions{}) 108 | if err == nil { 109 | for _, img := range images { 110 | if !runningImageIds[img.ID] { 111 | deleteImg := true 112 | for _, tag := range img.RepoTags { 113 | if d.keepImage(tag, imageNameMap) { 114 | deleteImg = false 115 | } 116 | } 117 | if deleteImg { 118 | toDelete = append(toDelete, img.ID) 119 | } 120 | } 121 | } 122 | return toDelete, nil 123 | } else { 124 | return nil, err 125 | } 126 | } 127 | 128 | func (d *DockerPruner) componentImages() ([]string, error) { 129 | nextToken := "" 130 | imageNames := make([]string, 0) 131 | for { 132 | out, err := d.db.ListComponents(v1.ListComponentsInput{ 133 | Limit: 1000, 134 | NextToken: nextToken, 135 | }) 136 | 137 | if err == nil { 138 | for _, c := range out.Components { 139 | imageNames = append(imageNames, c.Docker.Image) 140 | } 141 | } else { 142 | return nil, err 143 | } 144 | 145 | nextToken = out.NextToken 146 | if nextToken == "" { 147 | return imageNames, nil 148 | } 149 | } 150 | } 151 | 152 | func (d *DockerPruner) keepImage(tag string, imageNameMap map[string]bool) bool { 153 | if imageNameMap[tag] { 154 | return true 155 | } 156 | for _, keepRule := range d.pruneKeep { 157 | if common.GlobMatches(strings.TrimSpace(keepRule), tag) { 158 | return true 159 | } 160 | } 161 | return false 162 | } 163 | -------------------------------------------------------------------------------- /pkg/converge/converger_test.go: -------------------------------------------------------------------------------- 1 | package converge 2 | 3 | import ( 4 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | func TestConvergePlanNoContainers(t *testing.T) { 10 | conv := NewConverger(ComponentTarget{}) 11 | 12 | target := newTarget(0, 1, v1.StartParallelismParallel, v1.RestartOrderStartstop) 13 | expected := newConvergePlan(false) 14 | assert.Equal(t, expected, conv.plan(target)) 15 | 16 | target.Count = 1 17 | expected = newConvergePlan(true, newStartStep(target, false)) 18 | assert.Equal(t, expected, conv.plan(target)) 19 | } 20 | 21 | func TestConvergePlanScaleDown(t *testing.T) { 22 | conv := NewConverger(ComponentTarget{}) 23 | target := newTarget(0, 1, v1.StartParallelismParallel, v1.RestartOrderStartstop) 24 | 25 | // 2 running containers 26 | conv.containers = newContainers(target, 2) 27 | 28 | // target=0 -> stop both 29 | expected := newConvergePlan(false, newStopStep(1, reasonScaleDown), newStopStep(2, reasonScaleDown)) 30 | assert.Equal(t, expected, conv.plan(target)) 31 | 32 | // target=1 -> stop 1 33 | target = newTarget(1, 1, v1.StartParallelismParallel, v1.RestartOrderStartstop) 34 | expected = newConvergePlan(false, newStopStep(2, 
reasonScaleDown)) 35 | assert.Equal(t, expected, conv.plan(target)) 36 | } 37 | 38 | func TestConvergePlanRollingDeploy(t *testing.T) { 39 | conv := NewConverger(ComponentTarget{}) 40 | 41 | // v1 - 2 running containers 42 | target := newTarget(2, 1, v1.StartParallelismParallel, v1.RestartOrderStartstop) 43 | conv.containers = newContainers(target, 2) 44 | 45 | // v2 - parallel - startstop 46 | target = newTarget(2, 2, v1.StartParallelismParallel, v1.RestartOrderStartstop) 47 | expected := newConvergePlan(true, 48 | newStartStep(target, false), newStopStep(1, reasonVersionChanged), 49 | newStartStep(target, false), newStopStep(2, reasonVersionChanged)) 50 | assert.Equal(t, expected, conv.plan(target)) 51 | 52 | // v3 - series - stopstart 53 | target = newTarget(2, 3, v1.StartParallelismSeries, v1.RestartOrderStopstart) 54 | expected = newConvergePlan(true, 55 | newStopStep(1, reasonVersionChanged), newStartStep(target, true), 56 | newStopStep(2, reasonVersionChanged), newStartStep(target, true)) 57 | assert.Equal(t, expected, conv.plan(target)) 58 | 59 | // v4 - seriesfirst - stopstart 60 | target = newTarget(2, 4, v1.StartParallelismSeriesfirst, v1.RestartOrderStopstart) 61 | expected = newConvergePlan(true, 62 | newStopStep(1, reasonVersionChanged), newStartStep(target, true), 63 | newStopStep(2, reasonVersionChanged), newStartStep(target, false)) 64 | assert.Equal(t, expected, conv.plan(target)) 65 | 66 | // v5 - hybrid - seriesfirst - startstop - with a scale up to 4 67 | target = newTarget(4, 4, v1.StartParallelismSeriesfirst, v1.RestartOrderStartstop) 68 | expected = newConvergePlan(true, 69 | newStartStep(target, true), newStopStep(1, reasonVersionChanged), 70 | newStartStep(target, false), newStopStep(2, reasonVersionChanged), 71 | newStartStep(target, false), newStartStep(target, false)) 72 | assert.Equal(t, expected, conv.plan(target)) 73 | 74 | // v5 - hybrid - seriesfirst - startstop - with a scale down to 1 75 | // 76 | // note: this is a little odd - we'd probably expect the 2nd stop to have a reason: "scale down" 77 | // but due to how the loop is written we apply the "did the component change" test first 78 | // in theory, both reasons are valid. 
79 | target = newTarget(1, 4, v1.StartParallelismSeriesfirst, v1.RestartOrderStartstop) 80 | expected = newConvergePlan(true, 81 | newStartStep(target, true), newStopStep(1, reasonVersionChanged), 82 | newStopStep(2, reasonVersionChanged)) 83 | assert.Equal(t, expected, conv.plan(target)) 84 | } 85 | 86 | ///////////////////////////////////////////////// 87 | 88 | func newTarget(count int, version int64, parallelism v1.StartParallelism, 89 | restartOrder v1.RestartOrder) ComponentTarget { 90 | return ComponentTarget{ 91 | Component: &v1.Component{Name: "foo", Version: version, StartParallelism: parallelism, 92 | RestartOrder: restartOrder, Docker: &v1.DockerComponent{Image: "image/foo"}}, 93 | Count: count, 94 | } 95 | } 96 | 97 | func newConvergePlan(pull bool, steps ...convergeStep) convergePlan { 98 | if steps == nil { 99 | steps = []convergeStep{} 100 | } 101 | return convergePlan{ 102 | pull: pull, 103 | steps: steps, 104 | } 105 | } 106 | 107 | func newStartStep(target ComponentTarget, lock bool) convergeStep { 108 | return convergeStep{ 109 | start: &startStep{ 110 | component: target.Component, 111 | lock: lock, 112 | }, 113 | } 114 | } 115 | 116 | func newStopStep(id maelContainerId, reason string) convergeStep { 117 | return convergeStep{ 118 | stop: &stopStep{ 119 | containerId: id, 120 | reason: reason, 121 | }, 122 | } 123 | } 124 | 125 | func newContainers(target ComponentTarget, count int) []*Container { 126 | containers := make([]*Container, count) 127 | for i := 0; i < count; i++ { 128 | containers[i] = &Container{id: maelContainerId(i + 1), component: target.Component} 129 | } 130 | return containers 131 | } 132 | -------------------------------------------------------------------------------- /pkg/maelstrom/resolver.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "fmt" 5 | "github.com/coopernurse/maelstrom/pkg/cert" 6 | "github.com/coopernurse/maelstrom/pkg/db" 7 | "github.com/coopernurse/maelstrom/pkg/v1" 8 | "net/http" 9 | "sort" 10 | "strings" 11 | "sync" 12 | "time" 13 | ) 14 | 15 | type ComponentResolver interface { 16 | ByName(componentName string) (v1.Component, error) 17 | ByHTTPRequest(req *http.Request, public bool) (v1.Component, error) 18 | } 19 | 20 | func NewDbResolver(db db.Db, certWrapper *cert.CertMagicWrapper, cacheDuration time.Duration) *DbComponentResolver { 21 | return &DbComponentResolver{ 22 | db: db, 23 | certWrapper: certWrapper, 24 | cacheDuration: cacheDuration, 25 | lock: &sync.Mutex{}, 26 | } 27 | } 28 | 29 | type DbComponentResolver struct { 30 | db db.Db 31 | certWrapper *cert.CertMagicWrapper 32 | cacheDuration time.Duration 33 | lock *sync.Mutex 34 | 35 | // caches 36 | eventSources []v1.EventSource 37 | eventSourcesExpires time.Time 38 | componentsByName map[string]v1.Component 39 | componentsExpires time.Time 40 | } 41 | 42 | func (r *DbComponentResolver) OnComponentNotification(cn v1.DataChangedUnion) { 43 | r.lock.Lock() 44 | 45 | // remove component by name to force reload 46 | if cn.PutComponent != nil { 47 | delete(r.componentsByName, cn.PutComponent.Name) 48 | } 49 | if cn.RemoveComponent != nil { 50 | delete(r.componentsByName, cn.RemoveComponent.Name) 51 | } 52 | 53 | // force reload of event sources 54 | r.eventSources = nil 55 | 56 | r.lock.Unlock() 57 | } 58 | 59 | func (r *DbComponentResolver) ByName(componentName string) (comp v1.Component, err error) { 60 | ok := false 61 | r.lock.Lock() 62 | now := time.Now() 63 | if 
now.After(r.componentsExpires) { 64 | r.componentsByName = make(map[string]v1.Component) 65 | r.componentsExpires = now.Add(r.cacheDuration) 66 | } else { 67 | comp, ok = r.componentsByName[componentName] 68 | } 69 | 70 | if !ok { 71 | comp, err = r.db.GetComponent(componentName) 72 | if err == nil { 73 | r.componentsByName[componentName] = comp 74 | } 75 | } 76 | 77 | r.lock.Unlock() 78 | return 79 | } 80 | 81 | func (r *DbComponentResolver) allEnabledHttpEventSources() (sources []v1.EventSource, err error) { 82 | r.lock.Lock() 83 | now := time.Now() 84 | if now.After(r.eventSourcesExpires) { 85 | r.eventSources = nil 86 | r.eventSourcesExpires = now.Add(r.cacheDuration) 87 | } else { 88 | sources = r.eventSources 89 | } 90 | 91 | if r.eventSources == nil { 92 | sources, err = allEnabledHttpEventSources(r.db, r.certWrapper) 93 | if err == nil { 94 | r.eventSources = sources 95 | } 96 | } 97 | 98 | r.lock.Unlock() 99 | return 100 | } 101 | 102 | func (r *DbComponentResolver) ByHTTPRequest(req *http.Request, public bool) (v1.Component, error) { 103 | // private gateway allows component resolution by name or HTTP event source config 104 | // public gateway only routes by HTTP event source 105 | if !public { 106 | compName := req.Header.Get("MAELSTROM-COMPONENT") 107 | if compName != "" { 108 | return r.ByName(compName) 109 | } 110 | } 111 | 112 | httpEventSources, err := r.allEnabledHttpEventSources() 113 | if err != nil { 114 | return v1.Component{}, err 115 | } 116 | 117 | hostname := req.Host 118 | pos := strings.Index(hostname, ":") 119 | if pos > -1 { 120 | hostname = hostname[0:pos] 121 | } 122 | path := req.URL.Path 123 | 124 | for _, es := range httpEventSources { 125 | if httpEventSourceMatches(es, hostname, path) { 126 | 127 | if es.Http.StripPrefix && es.Http.PathPrefix != "" { 128 | req.URL.Path = path[len(es.Http.PathPrefix):] 129 | } 130 | 131 | return r.ByName(es.ComponentName) 132 | } 133 | } 134 | 135 | return v1.Component{}, db.NotFound 136 | } 137 | 138 | func httpEventSourceMatches(es v1.EventSource, hostname string, path string) bool { 139 | if es.Http == nil || (es.Http.Hostname == "" && es.Http.PathPrefix == "") { 140 | return false 141 | } 142 | if es.Http.Hostname == "" && strings.HasPrefix(path, es.Http.PathPrefix) { 143 | return true 144 | } 145 | return hostname == es.Http.Hostname && 146 | (es.Http.PathPrefix == "" || strings.HasPrefix(path, es.Http.PathPrefix)) 147 | } 148 | 149 | func allEnabledHttpEventSources(db db.Db, certWrapper *cert.CertMagicWrapper) ([]v1.EventSource, error) { 150 | nextToken := "" 151 | input := v1.ListEventSourcesInput{EventSourceType: v1.EventSourceTypeHttp} 152 | allSources := make([]v1.EventSource, 0) 153 | for { 154 | input.NextToken = nextToken 155 | output, err := db.ListEventSources(input) 156 | if err != nil { 157 | return nil, fmt.Errorf("resolver ListEventSources error: %v", err) 158 | } 159 | for _, es := range output.EventSources { 160 | if es.Enabled { 161 | allSources = append(allSources, es.EventSource) 162 | if certWrapper != nil && es.EventSource.Http != nil && es.EventSource.Http.Hostname != "" { 163 | certWrapper.AddHost(es.EventSource.Http.Hostname) 164 | } 165 | } 166 | } 167 | nextToken = output.NextToken 168 | if nextToken == "" { 169 | break 170 | } 171 | } 172 | sort.Sort(httpEventSourcesForResolver(allSources)) 173 | return allSources, nil 174 | } 175 | -------------------------------------------------------------------------------- /docs/gitbook/production/shutdown.md: 
--------------------------------------------------------------------------------
 1 | 
 2 | # Graceful Shutdown
 3 | 
 4 | Maelstrom attempts to shut down gracefully when a SIGTERM or SIGINT is received.
 5 | The basic shutdown sequence is:
 6 | 
 7 | * Stop all background jobs
 8 |     * Cron service
 9 |     * Autoscaling loop
10 |     * Event source pollers (including SQS)
11 |     * Docker event monitor
12 | * Remove node from `nodestate` table in database
13 | * Notify cluster peers that node is leaving
14 | * Stop HTTP listeners gracefully, draining any in-flight requests
15 | * Drain internal queues of any in-flight requests
16 | * Stop running containers
17 | 
18 | ## AWS Auto Scale Lifecycle Hooks
19 | 
20 | In AWS you'll probably run Maelstrom using an Auto Scaling Group.
21 | Auto Scaling Groups support a feature called Lifecycle Hooks, which
22 | allows systems to receive a notification when a machine is added to or
23 | removed from the group.
24 | 
25 | Maelstrom has native support for Lifecycle Hook termination events.
26 | We highly recommend configuring this feature to provide nodes with
27 | ample time to shut down.
28 | 
29 | When this feature is enabled, Maelstrom will poll the given SQS queue for
30 | termination messages and broadcast them to all nodes in the cluster. The
31 | matching node will perform the graceful shutdown steps listed above, then
32 | acknowledge the message by making the `autoscaling:CompleteLifecycleAction`
33 | call back to AWS.
34 | 
35 | If your ASG is associated with a load balancer, AWS will automatically remove
36 | the instance from the load balancer when the SQS message is queued, so external
37 | traffic to the host will stop before the shutdown sequence begins.
38 | 
39 | See the [EC2 Lifecycle Hooks docs](https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html)
40 | for complete information on how this feature works.
41 | 
42 | Briefly, the steps required to support this feature are:
43 | 
44 | 1. Create an SQS queue for the termination event messages
45 | 2. Create an IAM role that grants the ASG service permission to send messages to the queue
46 | 3. Register a lifecycle hook specification with the ASG, which will cause termination events to be written to SQS
47 | 4. Configure `maelstromd` with the EC2 instance id and SQS queue URL
48 | 5. Ensure that Maelstrom nodes have proper IAM permissions
49 | 
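If you'd rather not manage these resources with CloudFormation, the same setup can be done imperatively. The following AWS CLI sketch covers steps 1-3; the queue, role, policy, and hook names, the account and region in the ARNs, and the JSON policy file paths are illustrative placeholders (not values from this guide), so adapt them and supply your own trust and queue policy documents.

```bash
# 1. Create the termination queue (name is a placeholder)
aws sqs create-queue --queue-name my-stack-MaelASG-terminate

# 2. Create the role the Auto Scaling service assumes, and attach a policy
#    allowing sqs:SendMessage / sqs:GetQueueUrl on the queue
aws iam create-role --role-name MaelASGTerminateRole \
    --assume-role-policy-document file://asg-trust-policy.json
aws iam put-role-policy --role-name MaelASGTerminateRole \
    --policy-name MaelASGTerminatePolicy \
    --policy-document file://terminate-queue-policy.json

# 3. Register the termination lifecycle hook on the existing ASG
aws autoscaling put-lifecycle-hook \
    --lifecycle-hook-name MaelASGTerminateHook \
    --auto-scaling-group-name my-mael-asg \
    --lifecycle-transition autoscaling:EC2_INSTANCE_TERMINATING \
    --notification-target-arn arn:aws:sqs:us-east-1:123456789012:my-stack-MaelASG-terminate \
    --role-arn arn:aws:iam::123456789012:role/MaelASGTerminateRole \
    --heartbeat-timeout 600 \
    --default-result CONTINUE

# For reference, the acknowledgement maelstromd sends after completing the
# shutdown steps is the equivalent of:
#   aws autoscaling complete-lifecycle-action \
#       --lifecycle-hook-name MaelASGTerminateHook \
#       --auto-scaling-group-name my-mael-asg \
#       --instance-id i-0123456789abcdef0 \
#       --lifecycle-action-result CONTINUE
```

50 | ### CloudFormation Example
51 | 
52 | Here's a snippet of CloudFormation YAML that creates the queue and role (steps 1 and 2).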
53 | 54 | ```yaml 55 | MaelASGTerminateQueue: 56 | Type: AWS::SQS::Queue 57 | Properties: 58 | QueueName: !Sub "${AWS::StackName}-MaelASG-terminate" 59 | 60 | MaelASGTerminateRole: 61 | Type: AWS::IAM::Role 62 | Properties: 63 | RoleName: MaelASGTerminateRole 64 | AssumeRolePolicyDocument: 65 | Version: "2012-10-17" 66 | Statement: 67 | - Effect: "Allow" 68 | Principal: 69 | Service: 70 | - "autoscaling.amazonaws.com" 71 | Action: 72 | - "sts:AssumeRole" 73 | Policies: 74 | - PolicyName: "MaelASGTerminatePolicy" 75 | PolicyDocument: 76 | Version: "2012-10-17" 77 | Statement: 78 | - Effect: "Allow" 79 | Action: 80 | - sqs:SendMessage 81 | - sqs:GetQueueUrl 82 | Resource: !Sub ${MaelASGTerminateQueue.Arn} 83 | ``` 84 | 85 | Here's an example of how to integrate that with your ASG: 86 | 87 | ```yaml 88 | MaelASG: 89 | Type: AWS::AutoScaling::AutoScalingGroup 90 | Properties: 91 | # 92 | LifecycleHookSpecificationList: 93 | - DefaultResult: CONTINUE 94 | HeartbeatTimeout: 600 95 | LifecycleHookName: "MaelASGTerminateHook" 96 | LifecycleTransition: "autoscaling:EC2_INSTANCE_TERMINATING" 97 | NotificationTargetARN: !Sub ${MaelASGTerminateQueue.Arn} 98 | RoleARN: !Sub ${MaelASGTerminateRole.Arn} 99 | ``` 100 | 101 | And the IAM permissions your Maelstrom nodes need in order to dequeue messages 102 | and send the acknowledgement that the hook has completed. 103 | 104 | ```yaml 105 | MaelASGRole: 106 | Type: AWS::IAM::Role 107 | Properties: 108 | RoleName: MaelASGRole 109 | AssumeRolePolicyDocument: 110 | Version: "2012-10-17" 111 | Statement: 112 | - Effect: "Allow" 113 | Principal: 114 | Service: 115 | - "ec2.amazonaws.com" 116 | Action: 117 | - "sts:AssumeRole" 118 | Policies: 119 | - PolicyName: "MaelASGSQS" 120 | PolicyDocument: 121 | Version: "2012-10-17" 122 | Statement: 123 | - Effect: "Allow" 124 | Action: 125 | - autoscaling:CompleteLifecycleAction 126 | Resource: "*" 127 | - Effect: "Allow" 128 | Action: 129 | - sqs:ReceiveMessage 130 | - sqs:DeleteMessage 131 | Resource: !Sub ${MaelASGTerminateQueue.Arn} 132 | ``` 133 | 134 | Finally, when starting `maelstromd` make sure to set these variables. 
135 | 136 | ```bash 137 | # required - if set, Maelstrom will internally poll this queue for termination messages 138 | export MAEL_INSTANCEID=`curl -s http://169.254.169.254/latest/meta-data/instance-id` 139 | export MAEL_AWSTERMINATEQUEUEURL="${MaelASGTerminateQueue}" 140 | # optional, but recommended - this provides time for cluster members to notify each other 141 | export MAEL_SHUTDOWNPAUSESECONDS=10 142 | ``` 143 | -------------------------------------------------------------------------------- /pkg/evsource/cron/cron.go: -------------------------------------------------------------------------------- 1 | package evcron 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "github.com/coopernurse/maelstrom/pkg/db" 8 | "github.com/coopernurse/maelstrom/pkg/v1" 9 | "github.com/mgutz/logxi/v1" 10 | "github.com/robfig/cron/v3" 11 | "io" 12 | "net/http" 13 | "net/http/httptest" 14 | "reflect" 15 | "strings" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | func NewCronService(db db.Db, gateway http.Handler, ctx context.Context, nodeId string, refreshRate time.Duration) *CronService { 21 | return &CronService{ 22 | db: db, 23 | gateway: gateway, 24 | ctx: ctx, 25 | nodeId: nodeId, 26 | refreshRate: refreshRate, 27 | acquiredRole: false, 28 | } 29 | } 30 | 31 | type CronService struct { 32 | db db.Db 33 | gateway http.Handler 34 | ctx context.Context 35 | nodeId string 36 | acquiredRole bool 37 | refreshRate time.Duration 38 | cron *cron.Cron 39 | eventSources []v1.EventSourceWithStatus 40 | } 41 | 42 | func (c *CronService) Run(wg *sync.WaitGroup, withSeconds bool) { 43 | defer wg.Done() 44 | log.Info("cron: starting cron service", "refreshRate", c.refreshRate.String()) 45 | lockTicker := time.Tick(15 * time.Second) 46 | reloadTicker := time.Tick(c.refreshRate) 47 | c.reloadRulesAndStartCron(c.acquireRoleOrStop(), withSeconds) 48 | for { 49 | select { 50 | case <-c.ctx.Done(): 51 | if c.cron != nil { 52 | c.cron.Stop() 53 | } 54 | log.Info("cron: shutdown gracefully") 55 | return 56 | 57 | case <-lockTicker: 58 | c.acquireRoleOrStop() 59 | 60 | case <-reloadTicker: 61 | c.reloadRulesAndStartCron(c.acquireRoleOrStop(), withSeconds) 62 | } 63 | } 64 | } 65 | 66 | func (c *CronService) createCronInvoker(es v1.EventSource) func() { 67 | url := fmt.Sprintf("http://127.0.0.1%s", es.Cron.Http.Path) 68 | includeBody := strings.ToLower(es.Cron.Http.Method) != "get" 69 | return func() { 70 | var body io.Reader 71 | if includeBody && es.Cron.Http.Data != "" { 72 | body = bytes.NewBufferString(es.Cron.Http.Data) 73 | } 74 | log.Info("cron: invoking component", "name", es.Name, "component", es.ComponentName, "url", url) 75 | rw := httptest.NewRecorder() 76 | req, err := http.NewRequest(es.Cron.Http.Method, url, body) 77 | if err == nil { 78 | for _, nv := range es.Cron.Http.Headers { 79 | req.Header.Add(nv.Name, nv.Value) 80 | } 81 | req.Header.Set("Maelstrom-Component", es.ComponentName) 82 | c.gateway.ServeHTTP(rw, req) 83 | if rw.Code < 200 || rw.Code > 299 { 84 | log.Warn("cron: invoke returned non 2xx status", "name", es.Name, "component", es.ComponentName, 85 | "status", rw.Code) 86 | } 87 | } else { 88 | log.Error("cron: http.NewRequest failed", "err", err, "name", es.Name, "component", es.ComponentName) 89 | } 90 | } 91 | } 92 | 93 | func (c *CronService) acquireRoleOrStop() bool { 94 | previous := c.acquiredRole 95 | c.acquiredRole = false 96 | roleOk, roleNode, err := c.db.AcquireOrRenewRole(db.RoleCron, c.nodeId, time.Minute) 97 | if err == nil { 98 | c.acquiredRole = roleOk 99 | 100 | if previous 
&& !roleOk { 101 | log.Info("cron: lost role lock", "node", c.nodeId, "newCronNode", roleNode) 102 | } else if !previous && roleOk { 103 | log.Info("cron: acquired role lock, starting cron") 104 | } 105 | 106 | if !roleOk { 107 | c.stopCron() 108 | } 109 | } else { 110 | log.Error("cron: db.AcquireOrRenewRole error", "err", err, "node", c.nodeId) 111 | } 112 | return c.acquiredRole 113 | } 114 | 115 | func (c *CronService) stopCron() { 116 | if c.cron != nil { 117 | log.Info("cron: stopping old cron scheduler") 118 | c.cron.Stop() 119 | c.cron = nil 120 | } 121 | } 122 | 123 | func (c *CronService) reloadRulesAndStartCron(hasRoleLock bool, withSeconds bool) { 124 | if !hasRoleLock { 125 | return 126 | } 127 | 128 | eventSources, err := c.loadAllCronEventSources() 129 | if err == nil { 130 | if c.cron == nil || c.eventSources == nil || !reflect.DeepEqual(eventSources, c.eventSources) { 131 | var newCron *cron.Cron 132 | if len(eventSources) > 0 { 133 | log.Info("cron: creating new cron scheduler", "eventSourceCount", len(eventSources)) 134 | if withSeconds { 135 | newCron = cron.New(cron.WithSeconds()) 136 | } else { 137 | newCron = cron.New() 138 | } 139 | for _, ess := range eventSources { 140 | es := ess.EventSource 141 | if es.Cron != nil && ess.Enabled { 142 | _, err = newCron.AddFunc(es.Cron.Schedule, c.createCronInvoker(es)) 143 | if err != nil { 144 | log.Error("cron: error adding cron", "err", err, "schedule", es.Cron.Schedule) 145 | } 146 | } 147 | } 148 | } 149 | 150 | c.stopCron() 151 | 152 | if newCron != nil { 153 | newCron.Start() 154 | } 155 | c.cron = newCron 156 | c.eventSources = eventSources 157 | } 158 | } else { 159 | log.Error("cron: error loading event sources", "err", err) 160 | } 161 | } 162 | 163 | func (c *CronService) loadAllCronEventSources() ([]v1.EventSourceWithStatus, error) { 164 | eventSources := make([]v1.EventSourceWithStatus, 0) 165 | nextToken := "" 166 | for { 167 | out, err := c.db.ListEventSources(v1.ListEventSourcesInput{ 168 | EventSourceType: v1.EventSourceTypeCron, 169 | Limit: 1000, 170 | NextToken: nextToken, 171 | }) 172 | if err != nil { 173 | return nil, err 174 | } 175 | if len(out.EventSources) > 0 { 176 | eventSources = append(eventSources, out.EventSources...) 
177 | } 178 | if out.NextToken == "" { 179 | return eventSources, nil 180 | } 181 | nextToken = out.NextToken 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /pkg/router/router_test.go: -------------------------------------------------------------------------------- 1 | package router 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "net/http/httptest" 7 | "sync" 8 | "sync/atomic" 9 | "testing" 10 | "time" 11 | 12 | "github.com/coopernurse/maelstrom/pkg/revproxy" 13 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 14 | "github.com/stretchr/testify/assert" 15 | ) 16 | 17 | func TestInitialState(t *testing.T) { 18 | r := newRouter() 19 | assert.Equal(t, "foo", r.componentName) 20 | assert.Equal(t, int64(0), r.inflightReqs) 21 | assert.Equal(t, int64(0), r.activeHandlers) 22 | assert.Equal(t, StateOff, r.state) 23 | } 24 | 25 | func TestHandlerStart(t *testing.T) { 26 | r := newRouter() 27 | ch := r.HandlerStartRemote() 28 | assert.NotNil(t, ch) 29 | assert.Equal(t, int64(0), r.inflightReqs) 30 | assert.Equal(t, int64(1), r.activeHandlers) 31 | assert.Equal(t, StateOn, r.state) 32 | } 33 | 34 | func TestHandlerStop(t *testing.T) { 35 | r := newRouter() 36 | 37 | // start 2 handlers 38 | r.HandlerStartRemote() 39 | r.HandlerStartLocal() 40 | assert.Equal(t, int64(2), r.activeHandlers) 41 | 42 | // stop 1 - still on 43 | r.HandlerStop() 44 | assert.Equal(t, int64(1), r.activeHandlers) 45 | assert.Equal(t, StateOn, r.state) 46 | 47 | // stop 1 - off 48 | r.HandlerStop() 49 | assert.Equal(t, int64(0), r.activeHandlers) 50 | assert.Equal(t, StateOff, r.state) 51 | } 52 | 53 | func TestStateLabels(t *testing.T) { 54 | assert.Equal(t, "on", StateOn.String()) 55 | assert.Equal(t, "off", StateOff.String()) 56 | assert.Equal(t, "pending", StatePending.String()) 57 | } 58 | 59 | func TestRouteIncrementsInflightCount(t *testing.T) { 60 | r := newRouter() 61 | go r.Route(context.Background(), newReq()) 62 | go r.Route(context.Background(), newReq()) 63 | time.Sleep(10 * time.Millisecond) 64 | assert.Equal(t, int64(2), r.GetInflightReqs()) 65 | runNoOpRemoteHandler(r) 66 | time.Sleep(10 * time.Millisecond) 67 | assert.Equal(t, int64(0), r.GetInflightReqs()) 68 | } 69 | 70 | func TestRouteBalancesInflightCount(t *testing.T) { 71 | reqCount := 500 72 | r := newRouter() 73 | assert.Equal(t, int64(0), r.GetInflightReqs()) 74 | 75 | // run x requests through router 76 | wg := &sync.WaitGroup{} 77 | for i := 0; i < reqCount; i++ { 78 | wg.Add(1) 79 | go func() { 80 | r.Route(context.Background(), newReq()) 81 | wg.Done() 82 | }() 83 | } 84 | 85 | // add no-op handler 86 | runNoOpRemoteHandler(r) 87 | 88 | // block until all are sent to chan 89 | wg.Wait() 90 | r.HandlerStop() 91 | assert.Equal(t, true, r.DestroyIfIdle()) 92 | 93 | // verify inflight is still zero after completion 94 | assert.Equal(t, int64(0), r.GetInflightReqs()) 95 | } 96 | 97 | func TestCallsPlacementFuncWhenPending(t *testing.T) { 98 | startCalls := make(map[string]int) 99 | startComponentFx := func(componentName string) { 100 | startCalls[componentName] = startCalls[componentName] + 1 101 | } 102 | r := newRouter().WithStartComponentFunc(startComponentFx) 103 | go r.Route(context.Background(), newReq()) 104 | time.Sleep(10 * time.Millisecond) 105 | assert.Equal(t, StatePending, r.state) 106 | assert.Equal(t, int64(1), r.inflightReqs) 107 | assert.Equal(t, map[string]int{r.componentName: 1}, startCalls) 108 | } 109 | 110 | func TestRouteLocal(t *testing.T) { 111 | r := newRouter() 112 | 
113 | var localReqs int64 114 | var remoteReqs int64 115 | 116 | localProxyFx := func(req *revproxy.Request) { 117 | atomic.AddInt64(&localReqs, 1) 118 | req.Rw.WriteHeader(200) 119 | } 120 | 121 | remoteProxyFx := func(req *revproxy.Request) { 122 | atomic.AddInt64(&remoteReqs, 1) 123 | req.Rw.WriteHeader(200) 124 | } 125 | 126 | localCh := r.HandlerStartLocal() 127 | go func() { 128 | for req := range localCh { 129 | req.Proxy <- localProxyFx 130 | } 131 | }() 132 | remoteCh := r.HandlerStartRemote() 133 | go func() { 134 | for req := range remoteCh { 135 | req.Proxy <- remoteProxyFx 136 | } 137 | }() 138 | 139 | // send 20 requests - preferring local 140 | for i := 0; i < 20; i++ { 141 | time.Sleep(3 * time.Millisecond) 142 | req := newReq() 143 | req.PreferLocal = true 144 | r.Route(context.Background(), req) 145 | } 146 | time.Sleep(3 * time.Millisecond) 147 | 148 | // all 20 should have been handled locally 149 | assert.Equal(t, int64(20), localReqs) 150 | assert.Equal(t, int64(0), remoteReqs) 151 | 152 | // send another 20 without prefer local, and no sleep (so local handler will be busy at points) 153 | for i := 0; i < 20; i++ { 154 | req := newReq() 155 | req.PreferLocal = false 156 | r.Route(context.Background(), req) 157 | } 158 | time.Sleep(3 * time.Millisecond) 159 | 160 | // should have some remote reqs 161 | assert.True(t, remoteReqs > 0) 162 | } 163 | 164 | /////////////////////////////////////////////////////// 165 | 166 | var bufferPool = revproxy.NewProxyBufferPool() 167 | 168 | func newRouter() *Router { 169 | return NewRouter("foo", "node1", bufferPool, func(componentName string) {}) 170 | } 171 | 172 | func newReq() *revproxy.Request { 173 | comp := &v1.Component{ 174 | Name: "foo", 175 | Docker: &v1.DockerComponent{HttpStartHealthCheckSeconds: 5}, 176 | } 177 | rw := httptest.NewRecorder() 178 | req, err := http.NewRequest("GET", "/", nil) 179 | if err != nil { 180 | panic(err) 181 | } 182 | return revproxy.NewRequest(req, rw, comp, false) 183 | } 184 | 185 | func runNoOpRemoteHandler(r *Router) { 186 | runNoOpHandler(r.HandlerStartRemote()) 187 | } 188 | 189 | func runNoOpLocalHandler(r *Router) { 190 | runNoOpHandler(r.HandlerStartLocal()) 191 | } 192 | 193 | func runNoOpHandler(reqCh <-chan *revproxy.GetProxyRequest) { 194 | proxyFx := func(req *revproxy.Request) { 195 | req.Rw.WriteHeader(200) 196 | } 197 | go func() { 198 | for req := range reqCh { 199 | req.Proxy <- proxyFx 200 | } 201 | }() 202 | } 203 | -------------------------------------------------------------------------------- /pkg/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "os" 8 | "strings" 9 | 10 | "github.com/coopernurse/envconfig" 11 | "github.com/coopernurse/maelstrom/pkg/common" 12 | ) 13 | 14 | func FileToEnv(fname string) error { 15 | file, err := os.Open(fname) 16 | if err != nil { 17 | return fmt.Errorf("config: error opening env file: %s - %v", fname, err) 18 | } 19 | defer common.CheckClose(file, &err) 20 | err = ReaderToEnv(file) 21 | if err != nil { 22 | return fmt.Errorf("config: error scanning env file: %s - %v", fname, err) 23 | } 24 | return nil 25 | } 26 | 27 | func ReaderToEnv(r io.Reader) error { 28 | scanner := bufio.NewScanner(r) 29 | for scanner.Scan() { 30 | line := strings.TrimSpace(scanner.Text()) 31 | pos := strings.Index(line, "=") 32 | if pos > 0 && !strings.HasPrefix(line, "#") { 33 | key := strings.TrimSpace(line[0:pos]) 34 | val := 
strings.TrimSpace(line[pos+1:]) 35 | if key != "" { 36 | err := os.Setenv(key, val) 37 | if err != nil { 38 | return fmt.Errorf("config: unable to set env var: %s - %v", key, err) 39 | } 40 | } 41 | } 42 | } 43 | return scanner.Err() 44 | } 45 | 46 | func FromEnvFile(fname string) (Config, error) { 47 | err := FileToEnv(fname) 48 | if err != nil { 49 | return Config{}, err 50 | } 51 | return FromEnv() 52 | } 53 | 54 | func FromEnv() (Config, error) { 55 | var c Config 56 | err := envconfig.ProcessX(&c, envconfig.Options{Prefix: "mael", SplitWords: true}) 57 | if err != nil { 58 | return Config{}, err 59 | } 60 | return c, nil 61 | } 62 | 63 | type Config struct { 64 | // Server options 65 | 66 | // Identifier for the host machine 67 | // This should typically be set to the ID issued by the cloud provider creating the machine 68 | // (e.g. the EC2 instance id if running in AWS) 69 | InstanceId string 70 | 71 | // Port used for public reverse proxying 72 | PublicPort int `default:"80"` 73 | // HTTPS Port used for public reverse proxying 74 | PublicHTTPSPort int `default:"443" envconfig:"PUBLIC_HTTPS_PORT"` 75 | // Port used for private routing and management operations 76 | PrivatePort int `default:"8374"` 77 | 78 | // HTTP Server timeouts (in seconds) 79 | HTTPReadTimeout int `default:"300"` 80 | HTTPWriteTimeout int `default:"310"` 81 | HTTPIdleTimeout int `default:"310"` 82 | 83 | // database/sql driver to use 84 | SqlDriver string 85 | // DSN for sql database - format is specific to each particular database driver 86 | SqlDsn string 87 | // Interval to refresh cron rules from db 88 | CronRefreshSeconds int `default:"60"` 89 | // If > 0, print gc stats every x seconds 90 | LogGcSeconds int 91 | 92 | // If set, log profile data to this filename 93 | CpuProfileFilename string 94 | // If true, bind go pprof endpoints to private gateway /_mael/pprof/ 95 | Pprof bool 96 | 97 | // Memory (MiB) to make available to containers (if set to zero, maelstromd will simply act as a relay) 98 | TotalMemory int64 `default:"-1"` 99 | 100 | // Terminate command - if instance is told to terminate, run this command 101 | TerminateCommand string 102 | 103 | // If > 0, will sleep for this number of seconds after stopping background jobs 104 | // before shutting down HTTP listeners. In clustered environments this can be used 105 | // to give peers time to remove node from routing table, minimizing the chance of 106 | // dropping a request during shutdown 107 | ShutdownPauseSeconds int 108 | 109 | // If > 0, will prune exited containers and untagged images every x minutes 110 | // Similar to the "docker system prune" command 111 | DockerPruneMinutes int 112 | 113 | // If DockerPruneMinutes > 0 and this is true, when prune operation runs 114 | // maelstrom will load the list of components and remove any image that is 115 | // not registered to a maelstrom component. This is useful if your system 116 | // uses version tags. Set this option to true to remove old versions of images 117 | // no longer referenced by a component. 118 | DockerPruneUnregImages bool 119 | 120 | // Comma separated list of image tags to keep. 
Only relevant if 121 | // DockerPruneUnregImages=true and DockerPruneMinutes > 0 122 | // Supports * globs, so "myorg/*" would match "myorg/image1" and "myorg/image2" but 123 | // not "otherorg/myorg" 124 | DockerPruneUnregKeep string 125 | 126 | // AWS Options 127 | // 128 | // SQS Queue to poll for EC2 Auto Scaling Lifecycle Hooks 129 | // See: https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html 130 | // If this is non-empty, a SQS poller will listen for messages and will notify 131 | // cluster peers that a node is scheduled for termination 132 | AwsTerminateQueueUrl string 133 | 134 | // AWS lifecycle hook messages older than this many seconds will be deleted immediately 135 | // This avoids stuck messages in the queue 136 | AwsTerminateMaxAgeSeconds int `default:"600"` 137 | 138 | // If > 0, poll the EC2 spot/instance-action metadata endpoint every x seconds looking 139 | // for a 'stop' or 'terminate' message. If found, initiate a graceful shutdown 140 | // This setting should be enabled on any server running via a spot instance request, 141 | // although it's safe to use on any EC2 host 142 | AwsSpotTerminatePollSeconds int 143 | 144 | // If > 0, node status rows older than this many seconds will be removed from the database, 145 | // effectively removing the node from the cluster until it reports in again 146 | NodeLivenessSeconds int `default:"300"` 147 | 148 | // Currently unsupported - will dust these off in the future 149 | Cluster ClusterOptions 150 | DigitalOcean *DigitalOceanOptions `envconfig:"DO"` 151 | } 152 | 153 | type ClusterOptions struct { 154 | Name string 155 | MinSize int `default:"1"` 156 | MaxSize int `default:"20"` 157 | } 158 | 159 | type DigitalOceanOptions struct { 160 | AccessToken string 161 | Region string `default:"nyc3"` 162 | SSHFingerprint string 163 | DropletSize string `default:"s-1vcpu-1gb"` 164 | ImageSlug string `default:"debian-9-x64"` 165 | Backups bool `default:"true"` 166 | IPV6 bool 167 | } 168 | -------------------------------------------------------------------------------- /pkg/maelstrom/logs.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "context" 5 | "encoding/binary" 6 | "encoding/json" 7 | "fmt" 8 | "github.com/coopernurse/maelstrom/pkg/common" 9 | "github.com/docker/docker/api/types" 10 | "github.com/docker/docker/api/types/filters" 11 | docker "github.com/docker/docker/client" 12 | "github.com/mgutz/logxi/v1" 13 | "io" 14 | "net/http" 15 | "strings" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | func NewLogsHandler(dockerClient *docker.Client) *LogsHandler { 21 | return &LogsHandler{ 22 | dockerClient: dockerClient, 23 | } 24 | } 25 | 26 | type LogsHandler struct { 27 | dockerClient *docker.Client 28 | } 29 | 30 | func (h *LogsHandler) ServeHTTP(rw http.ResponseWriter, req *http.Request) { 31 | ctx, cancel := context.WithCancel(context.Background()) 32 | defer cancel() 33 | 34 | sinceArg := req.FormValue("since") 35 | if sinceArg == "" { 36 | sinceArg = "1m" 37 | } 38 | 39 | componentsArg := req.FormValue("components") 40 | componentNames := make(map[string]bool) 41 | for _, s := range strings.Split(componentsArg, ",") { 42 | s = strings.TrimSpace(s) 43 | if s != "" && s != "*" { 44 | componentNames[s] = true 45 | } 46 | } 47 | 48 | filterArgs := filters.NewArgs() 49 | filterArgs.Add("event", "start") 50 | msgCh, errCh := h.dockerClient.Events(ctx, types.EventsOptions{ 51 | Filters: filterArgs, 52 | }) 53 | 54 | logReadCtx, logCancel := 
context.WithCancel(context.Background()) 55 | wg := &sync.WaitGroup{} 56 | 57 | logCh := make(chan common.LogMsg, 10) 58 | pingTicker := time.NewTicker(time.Second) 59 | 60 | containers, err := common.ListMaelstromContainers(h.dockerClient) 61 | if err != nil { 62 | log.Error("logs: listContainers failed", "err", err) 63 | rw.WriteHeader(http.StatusInternalServerError) 64 | _, _ = fmt.Fprintf(rw, "Can't list docker containers") 65 | return 66 | } 67 | 68 | for _, c := range containers { 69 | component := c.Labels["maelstrom_component"] 70 | if followComponent(componentNames, component) { 71 | h.startStreamLogs(logReadCtx, wg, logCh, c.ID, component, sinceArg) 72 | } 73 | } 74 | 75 | rw.WriteHeader(http.StatusOK) 76 | for { 77 | var line string 78 | select { 79 | case <-pingTicker.C: 80 | line = common.PingLogMsg 81 | case m := <-logCh: 82 | msgBytes, err := json.Marshal(m) 83 | if err == nil { 84 | line = string(msgBytes) 85 | } else { 86 | log.Error("logs: json.Marshal failed", "err", err) 87 | } 88 | case m := <-msgCh: 89 | if log.IsDebug() { 90 | log.Debug("logs: docker event", "from", m.From, "actor", m.Actor, "type", m.Type, "status", m.Status) 91 | } 92 | if m.Type == "container" && m.Status == "start" { 93 | cont, err := h.dockerClient.ContainerInspect(context.Background(), m.Actor.ID) 94 | if err == nil { 95 | if cont.Config != nil && cont.Config.Labels != nil { 96 | component := cont.Config.Labels["maelstrom_component"] 97 | if followComponent(componentNames, component) { 98 | h.startStreamLogs(logReadCtx, wg, logCh, m.Actor.ID, component, sinceArg) 99 | } 100 | } 101 | } else { 102 | log.Error("logs: ContainerInspect failed", "containerId", m.Actor.ID, "err", err) 103 | } 104 | } 105 | case m := <-errCh: 106 | log.Warn("logs: docker error", "err", m.Error()) 107 | } 108 | 109 | if line != "" { 110 | _, err := rw.Write([]byte(line + "\r\n")) 111 | if err == nil { 112 | if f, ok := rw.(http.Flusher); ok { 113 | f.Flush() 114 | } 115 | } else { 116 | logErr(err, "Unable to write line to log client") 117 | break 118 | } 119 | } 120 | } 121 | logCancel() 122 | wg.Wait() 123 | } 124 | 125 | func (h *LogsHandler) startStreamLogs(ctx context.Context, wg *sync.WaitGroup, logCh chan common.LogMsg, 126 | containerId string, component string, since string) { 127 | if component == "" || containerId == "" { 128 | return 129 | } 130 | reader, err := h.dockerClient.ContainerLogs(ctx, containerId, 131 | types.ContainerLogsOptions{ 132 | ShowStdout: true, 133 | ShowStderr: true, 134 | Follow: true, 135 | Since: since, 136 | }) 137 | if err == nil { 138 | go streamLogs(component, wg, reader, logCh) 139 | } else { 140 | log.Error("logs: dockerClient.ContainerLogs failed", "containerId", containerId, "err", err) 141 | } 142 | } 143 | 144 | func logErr(err error, msg string) { 145 | if err != nil && !strings.Contains(err.Error(), "broken pipe") { 146 | log.Error("logs: "+msg, "err", err) 147 | } 148 | } 149 | 150 | func streamLogs(component string, wg *sync.WaitGroup, reader io.ReadCloser, out chan<- common.LogMsg) { 151 | var err error 152 | wg.Add(1) 153 | defer wg.Done() 154 | defer common.CheckClose(reader, &err) 155 | 156 | if log.IsDebug() { 157 | log.Debug("streamLogs start", "component", component) 158 | } 159 | 160 | var header = make([]byte, 8) 161 | for { 162 | _, err := io.ReadFull(reader, header) 163 | if err == nil { 164 | size := int(binary.BigEndian.Uint32(header[4:8])) 165 | var body = make([]byte, size) 166 | _, err = io.ReadFull(reader, body) 167 | stream := "stdout" 168 | if 
header[0] == 2 { 169 | stream = "stderr" 170 | } 171 | if err == nil { 172 | out <- common.LogMsg{ 173 | Component: component, 174 | Stream: stream, 175 | Data: string(body), 176 | } 177 | } else { 178 | break 179 | } 180 | } else { 181 | break 182 | } 183 | } 184 | if log.IsDebug() { 185 | log.Debug("streamLogs done", "component", component) 186 | } 187 | } 188 | 189 | func followComponent(componentNames map[string]bool, component string) bool { 190 | if component == "" { 191 | return false 192 | } 193 | if len(componentNames) == 0 { 194 | return true 195 | } 196 | 197 | for k := range componentNames { 198 | if strings.HasPrefix(k, "*") && strings.HasSuffix(k, "*") && len(k) > 2 { 199 | if strings.Contains(component, k[1:len(k)-1]) { 200 | return true 201 | } 202 | } else if strings.HasPrefix(k, "*") { 203 | if strings.HasSuffix(component, k[1:]) { 204 | return true 205 | } 206 | } else if strings.HasSuffix(k, "*") { 207 | if strings.HasPrefix(component, k[0:len(k)-1]) { 208 | return true 209 | } 210 | } else if k == component { 211 | return true 212 | } 213 | } 214 | 215 | return false 216 | } 217 | -------------------------------------------------------------------------------- /pkg/converge/registry.go: -------------------------------------------------------------------------------- 1 | package converge 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "github.com/coopernurse/maelstrom/pkg/common" 9 | "github.com/coopernurse/maelstrom/pkg/revproxy" 10 | "github.com/coopernurse/maelstrom/pkg/router" 11 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 12 | docker "github.com/docker/docker/client" 13 | log "github.com/mgutz/logxi/v1" 14 | "github.com/pkg/errors" 15 | ) 16 | 17 | func NewRegistry(dockerClient *docker.Client, routerReg *router.Registry, maelstromUrl string, 18 | pullImage ConvergePullImage, 19 | startLockAcquire ConvergeStartLockAcquire, 20 | postStartContainer ConvergePostStartContainer, 21 | notifyContainersChanged ConvergeNotifyContainersChanged) *Registry { 22 | bufferPool := revproxy.NewProxyBufferPool() 23 | return &Registry{ 24 | dockerClient: dockerClient, 25 | routerReg: routerReg, 26 | maelstromUrl: maelstromUrl, 27 | pullImage: pullImage, 28 | startLockAcquire: startLockAcquire, 29 | postStartContainer: postStartContainer, 30 | notifyContainersChanged: notifyContainersChanged, 31 | version: common.NowMillis(), 32 | byCompName: make(map[string]*Converger), 33 | containerCounterId: 0, 34 | bufferPool: bufferPool, 35 | lock: &sync.Mutex{}, 36 | } 37 | } 38 | 39 | type Registry struct { 40 | dockerClient *docker.Client 41 | routerReg *router.Registry 42 | maelstromUrl string 43 | pullImage ConvergePullImage 44 | startLockAcquire ConvergeStartLockAcquire 45 | postStartContainer ConvergePostStartContainer 46 | notifyContainersChanged ConvergeNotifyContainersChanged 47 | version int64 48 | byCompName map[string]*Converger 49 | containerCounterId maelContainerId 50 | bufferPool *revproxy.ProxyBufferPool 51 | lock *sync.Mutex 52 | } 53 | 54 | func (r *Registry) RemoveStaleContainers() error { 55 | rmCount, err := common.RemoveMaelstromContainers(r.dockerClient, "removing stale containers") 56 | if err != nil { 57 | return errors.Wrap(err, "converge: remove containers failed") 58 | } 59 | if rmCount > 0 { 60 | log.Info("converge: removed stale containers", "count", rmCount) 61 | } 62 | return nil 63 | } 64 | 65 | func (r *Registry) GetRouterRegistry() *router.Registry { 66 | return r.routerReg 67 | } 68 | 69 | func (r *Registry) Shutdown() { 70 | r.lock.Lock() 71 
| defer r.lock.Unlock() 72 | 73 | log.Info("converge: shutdown starting") 74 | wg := &sync.WaitGroup{} 75 | for _, c := range r.byCompName { 76 | wg.Add(1) 77 | go func(conv *Converger) { 78 | defer wg.Done() 79 | conv.Stop() 80 | }(c) 81 | } 82 | wg.Wait() 83 | r.byCompName = make(map[string]*Converger) 84 | } 85 | 86 | func (r *Registry) OnDockerEvent(msg common.DockerEvent) { 87 | r.lock.Lock() 88 | convergers := r.byCompName 89 | r.lock.Unlock() 90 | 91 | for _, c := range convergers { 92 | c.OnDockerEvent(&msg) 93 | } 94 | } 95 | 96 | func (r *Registry) OnComponentNotification(change v1.DataChangedUnion) { 97 | r.lock.Lock() 98 | defer r.lock.Unlock() 99 | 100 | if change.RemoveComponent != nil { 101 | cn, ok := r.byCompName[change.RemoveComponent.Name] 102 | if ok { 103 | log.Info("converge: shutting down component - component removed from db", 104 | "component", change.RemoveComponent.Name) 105 | delete(r.byCompName, change.RemoveComponent.Name) 106 | go cn.Stop() 107 | } 108 | } 109 | if change.PutComponent != nil { 110 | cn, ok := r.byCompName[change.PutComponent.Name] 111 | if ok { 112 | cn.SetComponent(change.PutComponent) 113 | } 114 | } 115 | } 116 | 117 | func (r *Registry) ByComponent(comp *v1.Component) (c *Converger) { 118 | r.lock.Lock() 119 | defer r.lock.Unlock() 120 | 121 | c = r.byCompName[comp.Name] 122 | if c == nil { 123 | c = NewConverger(ComponentTarget{ 124 | Component: comp, 125 | Count: 0, 126 | }). 127 | WithPullImage(r.pullImage). 128 | WithCreateContainer(r.createContainer). 129 | WithStopContainer(r.stopContainer). 130 | WithStartLockAcquire(r.startLockAcquire). 131 | WithPostStartContainer(r.postStartContainer). 132 | WithNotifyContainersChanged(r.onContainersChanged) 133 | c.Start() 134 | r.byCompName[comp.Name] = c 135 | } 136 | return 137 | } 138 | 139 | func (r *Registry) GetState() (version int64, compInfo []v1.ComponentInfo) { 140 | r.lock.Lock() 141 | defer r.lock.Unlock() 142 | 143 | version = r.version 144 | compInfo = make([]v1.ComponentInfo, 0) 145 | for _, conv := range r.byCompName { 146 | compInfo = append(compInfo, conv.GetComponentInfo()...) 
147 | } 148 | 149 | return 150 | } 151 | 152 | func (r *Registry) SetTargets(version int64, targets []ComponentTarget, block bool) bool { 153 | r.lock.Lock() 154 | versionMatch := version == r.version 155 | if versionMatch { 156 | r.version++ 157 | } 158 | r.lock.Unlock() 159 | 160 | if versionMatch { 161 | startTime := time.Now() 162 | wg := &sync.WaitGroup{} 163 | for _, t := range targets { 164 | wg.Add(1) 165 | go func(t ComponentTarget) { 166 | defer wg.Done() 167 | conv := r.ByComponent(t.Component) 168 | doneCh := conv.SetTarget(t) 169 | <-doneCh 170 | }(t) 171 | } 172 | if block { 173 | log.Info("converge: blocking until converge completes") 174 | wg.Wait() 175 | log.Info("converge: converge completed", "elapsed", time.Now().Sub(startTime).String()) 176 | } 177 | } 178 | return versionMatch 179 | } 180 | 181 | func (r *Registry) incrContainerIdCounter() (c maelContainerId) { 182 | r.lock.Lock() 183 | r.containerCounterId++ 184 | c = r.containerCounterId 185 | r.lock.Unlock() 186 | return 187 | } 188 | 189 | func (r *Registry) onContainersChanged() { 190 | r.incrContainerIdCounter() 191 | r.notifyContainersChanged() 192 | } 193 | 194 | func (r *Registry) createContainer(ctx context.Context, comp *v1.Component) *Container { 195 | containerId := r.incrContainerIdCounter() 196 | router := r.routerReg.ByComponent(comp.Name) 197 | return NewContainer(r.dockerClient, comp, r.maelstromUrl, router, containerId, 198 | r.bufferPool, ctx) 199 | } 200 | 201 | func (r *Registry) stopContainer(cn *Container, reason string) { 202 | cn.JoinAndStop(reason) 203 | } 204 | -------------------------------------------------------------------------------- /docs/gitbook/appendix/maelstromd_env_vars.md: -------------------------------------------------------------------------------- 1 | 2 | # maelstromd Environment Variables 3 | 4 | * `maelstromd` configuration is done via environment variables. 5 | * All variables (except LOGXI vars) are prefixed with `MAEL_`. 6 | * All variables are upper case 7 | * Variables are bound to the `Config` struct in [config.go](https://github.com/coopernurse/maelstrom/blob/master/pkg/config/config.go#L59) using [envconfig](https://github.com/kelseyhightower/envconfig) 8 | 9 | ## Logging 10 | 11 | `maelstromd` uses [mgutz/logxi](https://github.com/mgutz/logxi) for logging, which has a set of environment variables 12 | that control the logging format. Please read the logxi docs for more details. 13 | 14 | | Variable | Description | Example 15 | |------------------|----------------------------------------------|-----------------------------------| 16 | | LOGXI | Sets log levels | `LOGXI=*=DBG` 17 | | LOGXI_FORMAT | Sets format for logger | `LOGXI_FORMAT=text` 18 | | LOGXI_COLORS | Color schema for log levels | `LOGXI_COLORS=TRC,DBG,WRN=yellow,INF=green,ERR=red` 19 | 20 | ## HTTP 21 | 22 | | Variable | Description | Required? 
| Default | 23 | |-----------------------------|---------------------------------------------------------------------------|-----------|---------| 24 | | MAEL_PUBLIC_PORT | HTTP port to bind to for external HTTP reqs | No | 80 | 25 | | MAEL_PUBLIC_HTTPS_PORT | HTTP port to bind to for external HTTPS reqs | No | 443 | 26 | | MAEL_PRIVATE_PORT | HTTP port to bind to for internal HTTP reqs (node to node and RPC calls) | No | 8374 | 27 | | MAEL_HTTP_READ_TIMEOUT | Max duration (seconds) for reading the request (including body) | No | 300 | 28 | | MAEL_HTTP_WRITE_TIMEOUT | Duration (seconds) before timing out writes of the response | No | 310 | 29 | | MAEL_HTTP_IDLE_TIMEOUT | Max time to wait (seconds) for next req when keep-alives are enabled | No | 310 | 30 | 31 | ## Database 32 | 33 | | Variable | Description | Required? | Default | 34 | |---------------------------------|----------------------------------------------|-----------|---------| 35 | | MAEL_SQL_DRIVER | sql db driver to use (sqlite3, mysql) | Yes | None | 36 | | MAEL_SQL_DSN | DSN for maelstrom sql db | Yes | None | 37 | 38 | #### Example DSNs: 39 | 40 | | Driver | Project | Example DSN 41 | |----------|---------------------------------------------------------------|---------------------- 42 | | sqlite3 | [go-sqlite3](https://github.com/mattn/go-sqlite3) | `file:test.db?cache=shared&mode=memory` 43 | | mysql | [go-sql-driver/mysql](https://github.com/go-sql-driver/mysql) | `user:passwd@(hostname:3306)/mael` 44 | | postgres | [lib/pq](https://godoc.org/github.com/lib/pq) | `postgres://user:passwd@host:port/mael` 45 | 46 | ## Refresh Intervals 47 | 48 | | Variable | Description | Required? | Default | 49 | |---------------------------------|----------------------------------------------|-----------|---------| 50 | | MAEL_CRON_REFRESH_SECONDS | Interval to reload cron rules from db | No | 60 | 51 | 52 | ## System Resources 53 | 54 | | Variable | Description | Required? | Default 55 | |---------------------------------|-------------------------------------------------------|-----------|--------------------- 56 | | MAEL_TOTAL_MEMORY | Memory (MiB) to make available to containers | No | System total memory 57 | | MAEL_DOCKER_PRUNE_MINUTES | Interval to run the docker image pruner | No | 0 (off) 58 | | MAEL_DOCKER_PRUNE_UNREG_IMAGES | If true, remove images not associated with components | No | false 59 | | MAEL_DOCKER_PRUNE_UNREG_KEEP | Comma separated list of image tags to never delete | No | None 60 | 61 | Read more about [Docker image pruning](../production/prune.html) 62 | 63 | ## System Management 64 | 65 | | Variable | Description | Required? | Default 66 | |--------------------------------------|---------------------------------------------------------|-----------|----------- 67 | | MAEL_INSTANCE_ID | ID of instance with VM provider (e.g. EC2 instance id) | No [1](#awslifecycle) | None 68 | | MAEL_NODE_LIVENESS_SECONDS | If a node doesn't report status within this interval it will be removed from the cluster | No | 300 69 | | MAEL_SHUTDOWN_PAUSE_SECONDS | Seconds to pause before stopping containers at shutdown | No | 0 70 | | MAEL_TERMINATE_COMMAND | Command to run if instance terminated. Only invoked if AWS lifecycle termination runs, not if SIGTERM/SIGINT received. | No | None 71 | | MAEL_AWS_TERMINATE_QUEUE_URL | SQS queue URL for lifecycle hook termination queue | No [1](#awslifecycle) | None 72 | | MAEL_AWS_TERMINATE_MAX_AGE_SECONDS | SQS messages older than this many seconds will be automatically deleted. 
This prevents stale messages from getting stuck in the queue. | No | 600 73 | | MAEL_AWS_SPOT_TERMINATE_POLL_SECONDS | If > 0, maelstromd will poll EC2 metadata endpoint checking for spot termination requests. If action=stop or terminate, maelstromd will shutdown gracefully. Value of setting sets the polling interval in seconds. | No | 0 74 | 75 | 1: Required for AWS Auto Scale Lifecycle Hook support 76 | 77 | ## Debugging 78 | 79 | | Variable | Description | Required? | Default | 80 | |---------------------------------|-----------------------------------------------|-----------|---------| 81 | | MAEL_LOG_GC_SECONDS | If set, print GC stats every x seconds | No | None | 82 | | MAEL_CPU_PROFILE_FILENAME | If set, write Go profiling info to this file | No | None | 83 | | MAEL_PPROF | If true, expose pprof HTTP routes | No | false | 84 | 85 | `pprof` routes are bound to the internal gateway port. For example, to get a heap profile, set `MAEL_PPROF=true` and 86 | request the `/_mael/pprof/heap` endpoint: 87 | 88 | ``` 89 | curl -sK -v http://localhost:8374/_mael/pprof/heap > heap.out 90 | ``` 91 | -------------------------------------------------------------------------------- /pkg/evsource/poller/evpoller.go: -------------------------------------------------------------------------------- 1 | package poller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "github.com/aws/aws-sdk-go/aws/session" 7 | "github.com/coopernurse/maelstrom/pkg/db" 8 | "github.com/coopernurse/maelstrom/pkg/evsource" 9 | evsqs "github.com/coopernurse/maelstrom/pkg/evsource/aws/sqs" 10 | evstepfunc "github.com/coopernurse/maelstrom/pkg/evsource/aws/stepfunc" 11 | "github.com/coopernurse/maelstrom/pkg/router" 12 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 13 | log "github.com/mgutz/logxi/v1" 14 | "math" 15 | "net/http" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | func NewEvPoller(myNodeId string, ctx context.Context, db db.Db, gateway http.Handler, routerReg *router.Registry, 21 | awsSession *session.Session) *EvPoller { 22 | return &EvPoller{ 23 | myNodeId: myNodeId, 24 | ctx: ctx, 25 | db: db, 26 | gateway: gateway, 27 | routerReg: routerReg, 28 | awsSession: awsSession, 29 | activeRoles: make(map[string]context.CancelFunc), 30 | pollerWg: &sync.WaitGroup{}, 31 | activeRoleLock: &sync.Mutex{}, 32 | } 33 | } 34 | 35 | type EvPoller struct { 36 | myNodeId string 37 | ctx context.Context 38 | db db.Db 39 | gateway http.Handler 40 | routerReg *router.Registry 41 | awsSession *session.Session 42 | activeRoles map[string]context.CancelFunc 43 | pollerWg *sync.WaitGroup 44 | activeRoleLock *sync.Mutex 45 | } 46 | 47 | func (e *EvPoller) Run(daemonWG *sync.WaitGroup) { 48 | defer daemonWG.Done() 49 | e.reload() 50 | ticker := time.Tick(time.Minute) 51 | for { 52 | select { 53 | case <-ticker: 54 | e.reload() 55 | case <-e.ctx.Done(): 56 | log.Info("evpoller: shutting down pollers") 57 | for _, cancelFx := range e.getActiveRoles() { 58 | cancelFx() 59 | } 60 | e.pollerWg.Wait() 61 | log.Info("evpoller: shutdown gracefully") 62 | return 63 | } 64 | } 65 | } 66 | 67 | func (e *EvPoller) getActiveRoles() map[string]context.CancelFunc { 68 | e.activeRoleLock.Lock() 69 | defer e.activeRoleLock.Unlock() 70 | m := map[string]context.CancelFunc{} 71 | for k, v := range e.activeRoles { 72 | m[k] = v 73 | } 74 | return m 75 | } 76 | 77 | func (e *EvPoller) putActiveRole(roleId string, cancelFunc context.CancelFunc) { 78 | e.activeRoleLock.Lock() 79 | defer e.activeRoleLock.Unlock() 80 | e.activeRoles[roleId] = cancelFunc 81 | } 82 | 83 | func 
(e *EvPoller) deleteActiveRole(roleId string) { 84 | e.activeRoleLock.Lock() 85 | defer e.activeRoleLock.Unlock() 86 | delete(e.activeRoles, roleId) 87 | log.Info("evpoller: removed active role", "roleId", roleId) 88 | } 89 | 90 | func (e *EvPoller) reload() { 91 | nextToken := "" 92 | input := v1.ListEventSourcesInput{} 93 | validRoleIds := map[string]bool{} 94 | running := true 95 | for running { 96 | input.NextToken = nextToken 97 | output, err := e.db.ListEventSources(input) 98 | if err != nil { 99 | log.Error("evpoller: ListEventSources error", "err", err) 100 | return 101 | } 102 | 103 | // init pollers for event sources found 104 | for _, ess := range output.EventSources { 105 | if ess.Enabled { 106 | var pollCreator evsource.PollCreator 107 | var err error 108 | es := ess.EventSource 109 | if es.Sqs != nil && es.Sqs.QueueName != "" { 110 | pollCreator, err = evsqs.NewPollCreator(es, e.awsSession, e.gateway) 111 | } else if es.Awsstepfunc != nil && es.Awsstepfunc.ActivityName != "" { 112 | pollCreator, err = evstepfunc.NewPollCreator(es, e.awsSession, e.gateway) 113 | } 114 | if err != nil { 115 | log.Error("evpoller: create poller error", "err", err) 116 | } 117 | if pollCreator != nil { 118 | e.startPollerGroup(pollCreator, validRoleIds) 119 | } 120 | } 121 | } 122 | 123 | nextToken = output.NextToken 124 | running = nextToken != "" 125 | } 126 | 127 | // cancel any pollers that are no longer valid roles 128 | for roleId, cancelFx := range e.getActiveRoles() { 129 | _, ok := validRoleIds[roleId] 130 | if !ok { 131 | go cancelFx() 132 | } 133 | } 134 | } 135 | 136 | func (e *EvPoller) startPollerGroup(pollCreator evsource.PollCreator, validRoleIds map[string]bool) { 137 | componentName := pollCreator.ComponentName() 138 | activeHandlers := e.routerReg.ByComponent(componentName).GetHandlerCount() 139 | 140 | maxConcurrency := toMaxConcurrency(pollCreator.MaxConcurrency(), int(activeHandlers)) 141 | roleIdConcurs := toRoleIdConcurrency(pollCreator, maxConcurrency) 142 | 143 | for _, rc := range roleIdConcurs { 144 | roleId := rc.roleId 145 | validRoleIds[roleId] = true 146 | ok, _, err := e.db.AcquireOrRenewRole(roleId, e.myNodeId, 2*time.Minute) 147 | pollerOk := false 148 | if err != nil { 149 | log.Error("evpoller: AcquireOrRenewRole error", "err", err, "roleId", roleId) 150 | } else if ok { 151 | // acquired lock - start poller 152 | cancelFx := e.getActiveRoles()[roleId] 153 | if cancelFx == nil { 154 | poller := pollCreator.NewPoller() 155 | ctx, cancelFunc := context.WithCancel(e.ctx) 156 | e.putActiveRole(roleId, cancelFunc) 157 | e.pollerWg.Add(1) 158 | go func(roleId string) { 159 | defer e.pollerWg.Done() 160 | defer e.deleteActiveRole(roleId) 161 | poller(ctx, rc.concurrency, roleId) 162 | }(roleId) 163 | pollerOk = true 164 | } else { 165 | pollerOk = true 166 | } 167 | } 168 | 169 | if !pollerOk { 170 | // lost lock or no queues defined - cancel poller 171 | cancelFx := e.getActiveRoles()[roleId] 172 | if cancelFx != nil { 173 | go cancelFx() 174 | } 175 | } 176 | } 177 | } 178 | 179 | func toRoleIdConcurrency(pollCreator evsource.PollCreator, maxConcurrency int) []roleIdConcurrency { 180 | roleIdConcur := make([]roleIdConcurrency, 0) 181 | 182 | num := int(math.Round(math.Ceil(float64(maxConcurrency) / float64(pollCreator.MaxConcurrencyPerPoller())))) 183 | concurRemain := pollCreator.MaxConcurrencyPerPoller() * num 184 | if concurRemain > maxConcurrency { 185 | concurRemain = maxConcurrency 186 | } 187 | for i := 0; i < num; i++ { 188 | c := 
pollCreator.MaxConcurrencyPerPoller() 189 | if c > concurRemain { 190 | c = concurRemain 191 | } 192 | concurRemain -= c 193 | if c > 0 { 194 | roleIdConcur = append(roleIdConcur, roleIdConcurrency{ 195 | // store the concurrency value on the roleId 196 | // if maxConcurrency changes (due to event source modification or change in component instances) 197 | // then that will invalidate the roleId and we'll start a new one and turn off the old one 198 | roleId: fmt.Sprintf("%s-%d-%d", pollCreator.RoleIdPrefix(), i, c), 199 | concurrency: c, 200 | }) 201 | } 202 | } 203 | return roleIdConcur 204 | } 205 | 206 | func toMaxConcurrency(pollerMaxConcurrency int, activeHandlers int) int { 207 | if pollerMaxConcurrency <= 0 { 208 | pollerMaxConcurrency = 1 209 | } 210 | if activeHandlers <= 0 { 211 | activeHandlers = 1 212 | } 213 | maxConcur := activeHandlers 214 | if activeHandlers > pollerMaxConcurrency { 215 | maxConcur = pollerMaxConcurrency 216 | } 217 | return maxConcur 218 | } 219 | 220 | type roleIdConcurrency struct { 221 | roleId string 222 | concurrency int 223 | } 224 | -------------------------------------------------------------------------------- /pkg/evsource/aws/stepfunc/stepfunc.go: -------------------------------------------------------------------------------- 1 | package evstepfunc 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "github.com/aws/aws-sdk-go/aws" 8 | "github.com/aws/aws-sdk-go/aws/awserr" 9 | "github.com/aws/aws-sdk-go/aws/request" 10 | "github.com/aws/aws-sdk-go/aws/session" 11 | "github.com/aws/aws-sdk-go/service/sfn" 12 | "github.com/coopernurse/maelstrom/pkg/common" 13 | "github.com/coopernurse/maelstrom/pkg/evsource" 14 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 15 | log "github.com/mgutz/logxi/v1" 16 | "github.com/pkg/errors" 17 | "net/http" 18 | "net/http/httptest" 19 | "sync" 20 | "time" 21 | ) 22 | 23 | func activityArn(sfnClient *sfn.SFN, activityName string) (*string, error) { 24 | input := &sfn.ListActivitiesInput{} 25 | for { 26 | out, err := sfnClient.ListActivities(input) 27 | if err != nil { 28 | return nil, err 29 | } 30 | 31 | for _, act := range out.Activities { 32 | if act.Name != nil && *act.Name == activityName { 33 | return act.ActivityArn, nil 34 | } 35 | } 36 | 37 | if out.NextToken == nil || *out.NextToken == "" { 38 | return nil, nil 39 | } 40 | input.NextToken = out.NextToken 41 | } 42 | } 43 | 44 | func NewPollCreator(es v1.EventSource, awsSession *session.Session, gateway http.Handler) (evsource.PollCreator, error) { 45 | sfnClient := sfn.New(awsSession) 46 | arn, err := activityArn(sfnClient, es.Awsstepfunc.ActivityName) 47 | if err != nil { 48 | return nil, errors.Wrap(err, "evstepfunc: unable to lookup existing activity: "+es.Awsstepfunc.ActivityName) 49 | } 50 | if arn == nil { 51 | log.Info("evstepfunc: creating activity", "activityName", es.Awsstepfunc.ActivityName) 52 | createOut, err := sfnClient.CreateActivity(&sfn.CreateActivityInput{Name: aws.String(es.Awsstepfunc.ActivityName)}) 53 | if err != nil { 54 | return nil, errors.Wrap(err, "evstepfunc: unable to create activity: "+es.Awsstepfunc.ActivityName) 55 | } 56 | arn = createOut.ActivityArn 57 | } else { 58 | if log.IsDebug() { 59 | log.Info("evstepfunc: found existing activity", "activityName", es.Awsstepfunc.ActivityName, "arn", *arn) 60 | } 61 | } 62 | 63 | return &StepFuncPollCreator{ 64 | es: setDefaults(es), 65 | arn: arn, 66 | gateway: gateway, 67 | sfnClient: sfnClient, 68 | }, nil 69 | } 70 | 71 | type StepFuncPollCreator struct { 72 | es 
v1.EventSource 73 | arn *string 74 | gateway http.Handler 75 | sfnClient *sfn.SFN 76 | } 77 | 78 | func (s *StepFuncPollCreator) NewPoller() evsource.Poller { 79 | poller := &StepFuncPoller{ 80 | arn: s.arn, 81 | es: s.es, 82 | errSleep: 5 * time.Second, 83 | gateway: s.gateway, 84 | sfnClient: s.sfnClient, 85 | getTaskLock: &sync.Mutex{}, 86 | } 87 | return poller.Run 88 | } 89 | 90 | func (s *StepFuncPollCreator) ComponentName() string { 91 | return s.es.ComponentName 92 | } 93 | 94 | func (s *StepFuncPollCreator) RoleIdPrefix() string { 95 | return fmt.Sprintf("aws-stepfunc-%s", s.es.Name) 96 | } 97 | 98 | func (s *StepFuncPollCreator) MaxConcurrency() int { 99 | return int(s.es.Awsstepfunc.MaxConcurrency) 100 | } 101 | 102 | func (s *StepFuncPollCreator) MaxConcurrencyPerPoller() int { 103 | return int(s.es.Awsstepfunc.ConcurrencyPerPoller) 104 | } 105 | 106 | type StepFuncPoller struct { 107 | arn *string 108 | es v1.EventSource 109 | errSleep time.Duration 110 | gateway http.Handler 111 | sfnClient *sfn.SFN 112 | getTaskLock *sync.Mutex 113 | } 114 | 115 | func (s *StepFuncPoller) Run(ctx context.Context, concurrency int, roleId string) { 116 | log.Info("evstepfunc: starting poller", "component", s.es.ComponentName, "arn", s.arn, 117 | "concurrency", concurrency, "roleId", roleId) 118 | 119 | wg := &sync.WaitGroup{} 120 | for i := 0; i < concurrency; i++ { 121 | wg.Add(1) 122 | go s.workerLoop(ctx, wg) 123 | } 124 | 125 | wg.Wait() 126 | log.Info("evstepfunc: poller exiting gracefully", "component", s.es.ComponentName, "roleId", roleId) 127 | } 128 | 129 | func (s *StepFuncPoller) getTask(ctx context.Context) *sfn.GetActivityTaskOutput { 130 | s.getTaskLock.Lock() 131 | defer s.getTaskLock.Unlock() 132 | 133 | // double check if we're still running - skip poll if we aren't 134 | select { 135 | case <-ctx.Done(): 136 | return nil 137 | default: 138 | break 139 | } 140 | 141 | reqCtx, reqCtxCancel := context.WithTimeout(context.Background(), 100*time.Second) 142 | out, err := s.sfnClient.GetActivityTaskWithContext(reqCtx, &sfn.GetActivityTaskInput{ 143 | ActivityArn: s.arn, 144 | WorkerName: nil, 145 | }) 146 | reqCtxCancel() 147 | if err != nil { 148 | logerr := true 149 | if aerr, ok := err.(awserr.Error); ok { 150 | if aerr.Code() == request.CanceledErrorCode { 151 | logerr = false 152 | log.Warn("evstepfunc: GetActivityTask context canceled", "arn", s.arn) 153 | } 154 | } 155 | if logerr { 156 | log.Error("evstepfunc: error calling GetActivityTask", "err", err, "arn", s.arn) 157 | time.Sleep(s.errSleep) 158 | } 159 | } 160 | return out 161 | } 162 | 163 | func (s *StepFuncPoller) workerLoop(ctx context.Context, wg *sync.WaitGroup) { 164 | defer wg.Done() 165 | for { 166 | select { 167 | case <-ctx.Done(): 168 | return 169 | default: 170 | getTaskOut := s.getTask(ctx) 171 | if getTaskOut != nil && getTaskOut.Input != nil { 172 | s.invokeComponent(getTaskOut) 173 | } 174 | } 175 | } 176 | } 177 | 178 | func (s *StepFuncPoller) invokeComponent(out *sfn.GetActivityTaskOutput) { 179 | req, err := http.NewRequest("POST", s.es.Awsstepfunc.Path, bytes.NewBufferString(*out.Input)) 180 | if err != nil { 181 | log.Error("evstepfunc: http.NewRequest", "err", err, "arn", s.arn) 182 | } else { 183 | startTime := common.NowMillis() 184 | rw := httptest.NewRecorder() 185 | req.Header.Set("Maelstrom-Component", s.es.ComponentName) 186 | s.gateway.ServeHTTP(rw, req) 187 | if log.IsDebug() { 188 | log.Debug("evstepfunc: req done", "component", s.es.ComponentName, "path", s.es.Awsstepfunc.Path, 189 | 
"millis", common.NowMillis()-startTime, "respcode", rw.Code) 190 | } 191 | if rw.Code == http.StatusOK { 192 | _, err = s.sfnClient.SendTaskSuccess(&sfn.SendTaskSuccessInput{ 193 | Output: aws.String(rw.Body.String()), 194 | TaskToken: out.TaskToken, 195 | }) 196 | if err != nil { 197 | log.Error("evstepfunc: SendTaskSuccess", "err", err, "arn", s.arn) 198 | } 199 | } else { 200 | errStr := common.StrTruncate(rw.Header().Get("step-func-error"), 256) 201 | causeStr := common.StrTruncate(rw.Header().Get("step-func-cause"), 32768) 202 | 203 | if errStr == "" { 204 | errStr = fmt.Sprintf("maelstrom_%d", rw.Code) 205 | } 206 | if causeStr == "" { 207 | causeStr = fmt.Sprintf("maelstrom error: %s", err.Error()) 208 | } 209 | 210 | _, err = s.sfnClient.SendTaskFailure(&sfn.SendTaskFailureInput{ 211 | TaskToken: out.TaskToken, 212 | Error: aws.String(errStr), 213 | Cause: aws.String(causeStr), 214 | }) 215 | if err != nil { 216 | log.Error("evstepfunc: SendTaskFailure", "err", err, "arn", s.arn) 217 | } 218 | } 219 | } 220 | } 221 | 222 | func setDefaults(es v1.EventSource) v1.EventSource { 223 | es.Awsstepfunc.MaxConcurrency = common.DefaultInt64(es.Awsstepfunc.MaxConcurrency, 1) 224 | es.Awsstepfunc.ConcurrencyPerPoller = common.DefaultInt64(es.Awsstepfunc.ConcurrencyPerPoller, 1) 225 | return es 226 | } 227 | -------------------------------------------------------------------------------- /pkg/evsource/aws/sqs/sqs.go: -------------------------------------------------------------------------------- 1 | package evsqs 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "github.com/aws/aws-sdk-go/aws" 8 | "github.com/aws/aws-sdk-go/aws/awserr" 9 | "github.com/aws/aws-sdk-go/aws/request" 10 | "github.com/aws/aws-sdk-go/aws/session" 11 | "github.com/aws/aws-sdk-go/service/sqs" 12 | "github.com/coopernurse/maelstrom/pkg/common" 13 | "github.com/coopernurse/maelstrom/pkg/evsource" 14 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 15 | log "github.com/mgutz/logxi/v1" 16 | "github.com/pkg/errors" 17 | "net/http" 18 | "net/http/httptest" 19 | "sort" 20 | "sync" 21 | "time" 22 | ) 23 | 24 | type sqsMessage struct { 25 | msg *sqs.Message 26 | queueUrl *string 27 | } 28 | 29 | func NewPollCreator(es v1.EventSource, awsSession *session.Session, gateway http.Handler) (evsource.PollCreator, error) { 30 | es = setDefaults(es) 31 | sqsClient, queueUrls, err := sqsQueueUrlsForPrefix(awsSession, es.Sqs.QueueName, es.Sqs.NameAsPrefix) 32 | if err != nil { 33 | return nil, errors.Wrapf(err, "evsqs: error loading queue urls for: %s", es.Sqs.QueueName) 34 | } else if len(queueUrls) <= 0 { 35 | if es.Sqs.NameAsPrefix { 36 | log.Warn("evpoller: No SQS queues found with name prefix", "queueNamePrefix", es.Sqs.QueueName) 37 | } else { 38 | log.Warn("evpoller: SQS queue not found", "queueName", es.Sqs.QueueName) 39 | } 40 | return nil, nil 41 | } else { 42 | return &SqsPollCreator{ 43 | es: es, 44 | sqsClient: sqsClient, 45 | gateway: gateway, 46 | queueUrls: queueUrls, 47 | }, nil 48 | } 49 | } 50 | 51 | type SqsPollCreator struct { 52 | es v1.EventSource 53 | sqsClient *sqs.SQS 54 | gateway http.Handler 55 | queueUrls []*string 56 | } 57 | 58 | func (s *SqsPollCreator) NewPoller() evsource.Poller { 59 | return newPoller(s.es, s.queueUrls, s.sqsClient, s.gateway) 60 | } 61 | 62 | func (s *SqsPollCreator) ComponentName() string { 63 | return s.es.ComponentName 64 | } 65 | 66 | func (s *SqsPollCreator) RoleIdPrefix() string { 67 | return fmt.Sprintf("aws-sqs-%s", s.es.Name) 68 | } 69 | 70 | func (s *SqsPollCreator) 
MaxConcurrency() int { 71 | return int(s.es.Sqs.MaxConcurrency) 72 | } 73 | 74 | func (s *SqsPollCreator) MaxConcurrencyPerPoller() int { 75 | return int(s.es.Sqs.ConcurrencyPerPoller) 76 | } 77 | 78 | func newPoller(es v1.EventSource, queueUrls []*string, sqsClient *sqs.SQS, gateway http.Handler) evsource.Poller { 79 | return func(ctx context.Context, concurrency int, roleId string) { 80 | resetIdxTicker := time.Tick(15 * time.Second) 81 | idx := 0 82 | 83 | log.Info("sqs: poller starting", "roleId", roleId, "concurrency", concurrency, 84 | "component", es.ComponentName) 85 | 86 | pauseSecLock := &sync.Mutex{} 87 | pauseSecs := 0 88 | setPauseSecs := func(secs int) { 89 | pauseSecLock.Lock() 90 | if secs > 0 { 91 | pauseSecs = secs 92 | } 93 | pauseSecLock.Unlock() 94 | } 95 | sleepPauseSecs := func() bool { 96 | pauseSecLock.Lock() 97 | pauseCopy := pauseSecs 98 | pauseSecs = 0 99 | pauseSecLock.Unlock() 100 | 101 | if pauseCopy > 0 { 102 | log.Info("sqs: pausing poller", "seconds", pauseCopy, "component", es.ComponentName, "roleId", roleId) 103 | time.Sleep(time.Duration(pauseCopy) * time.Second) 104 | return true 105 | } 106 | return false 107 | } 108 | 109 | reqCh := make(chan *sqsMessage) 110 | wg := &sync.WaitGroup{} 111 | for i := 0; i < concurrency; i++ { 112 | wg.Add(1) 113 | go func() { 114 | defer wg.Done() 115 | for m := range reqCh { 116 | req, err := http.NewRequest("POST", es.Sqs.Path, bytes.NewBufferString(*m.msg.Body)) 117 | if err != nil { 118 | log.Error("sqs: error creating http req", "err", err, "queueUrl", m.queueUrl) 119 | } else { 120 | rw := httptest.NewRecorder() 121 | req.Header.Set("Maelstrom-Component", es.ComponentName) 122 | gateway.ServeHTTP(rw, req) 123 | if rw.Code == 200 { 124 | if log.IsDebug() { 125 | log.Debug("sqs: deleting message", "queueUrl", m.queueUrl) 126 | } 127 | _, err := sqsClient.DeleteMessage(&sqs.DeleteMessageInput{ 128 | QueueUrl: m.queueUrl, 129 | ReceiptHandle: m.msg.ReceiptHandle, 130 | }) 131 | if err != nil { 132 | log.Error("sqs: error deleting message", "err", err, "queueUrl", m.queueUrl) 133 | } 134 | } else { 135 | log.Warn("sqs: non-200 status", "component", es.ComponentName, "queueUrl", m.queueUrl) 136 | } 137 | // server has asked us to pause 138 | pauseSecs := common.ToIntOrDefault(rw.Result().Header.Get("pause-seconds"), 0) 139 | if pauseSecs > 0 { 140 | setPauseSecs(pauseSecs) 141 | } 142 | } 143 | } 144 | }() 145 | } 146 | 147 | for { 148 | select { 149 | case <-resetIdxTicker: 150 | // start polling from the front of the queue list again 151 | idx = 0 152 | case <-ctx.Done(): 153 | close(reqCh) 154 | wg.Wait() 155 | return 156 | default: 157 | if sleepPauseSecs() { 158 | // component requested pause - reset to front of queue list 159 | idx = 0 160 | } 161 | 162 | queueUrl := queueUrls[idx] 163 | reqCtx, reqCancel := context.WithTimeout(ctx, 15*time.Second) 164 | msgs, err := poll(es, sqsClient, queueUrl, reqCtx) 165 | reqCancel() 166 | if err != nil { 167 | logerr := true 168 | if aerr, ok := err.(awserr.Error); ok { 169 | if aerr.Code() == request.CanceledErrorCode { 170 | logerr = false 171 | log.Warn("sqs: poll canceled", "component", es.ComponentName, "queueUrl", *queueUrl) 172 | } 173 | } 174 | if logerr { 175 | log.Error("sqs: poll err", "err", err, "component", es.ComponentName, "queueUrl", *queueUrl) 176 | } 177 | } else { 178 | if len(msgs) == 0 { 179 | idx++ 180 | if idx >= len(queueUrls) { 181 | idx = 0 182 | } 183 | } else { 184 | for _, msg := range msgs { 185 | reqCh <- &sqsMessage{ 186 | msg: msg, 187 | 
queueUrl: queueUrl, 188 | } 189 | } 190 | } 191 | } 192 | } 193 | } 194 | } 195 | } 196 | 197 | func poll(es v1.EventSource, sqsClient *sqs.SQS, queueUrl *string, ctx context.Context) ([]*sqs.Message, error) { 198 | if log.IsDebug() { 199 | log.Debug("sqs: polling queue", "queueUrl", *queueUrl) 200 | } 201 | maxMsgs := es.Sqs.MessagesPerPoll 202 | if maxMsgs < 1 || maxMsgs > 10 { 203 | maxMsgs = 1 204 | } 205 | visibilityTimeout := es.Sqs.VisibilityTimeout 206 | if visibilityTimeout <= 0 { 207 | visibilityTimeout = 300 208 | } 209 | out, err := sqsClient.ReceiveMessageWithContext(ctx, &sqs.ReceiveMessageInput{ 210 | QueueUrl: queueUrl, 211 | MaxNumberOfMessages: aws.Int64(maxMsgs), 212 | VisibilityTimeout: aws.Int64(visibilityTimeout), 213 | WaitTimeSeconds: aws.Int64(1), 214 | }) 215 | if err == nil { 216 | return out.Messages, nil 217 | } 218 | return nil, err 219 | } 220 | 221 | func sqsQueueUrlsForPrefix(awsSession *session.Session, 222 | queueNameOrPrefix string, prefix bool) (*sqs.SQS, []*string, error) { 223 | if awsSession == nil { 224 | return nil, nil, fmt.Errorf("evpoller: cannot create sqs client - awsSession is nil") 225 | } 226 | 227 | sqsClient := sqs.New(awsSession) 228 | queueUrls := []*string{} 229 | if prefix { 230 | out, err := sqsClient.ListQueues(&sqs.ListQueuesInput{QueueNamePrefix: &queueNameOrPrefix}) 231 | if err != nil { 232 | return nil, nil, err 233 | } 234 | sort.Sort(common.StringPtr(out.QueueUrls)) 235 | queueUrls = out.QueueUrls 236 | } else { 237 | out, err := sqsClient.GetQueueUrl(&sqs.GetQueueUrlInput{QueueName: &queueNameOrPrefix}) 238 | if err != nil { 239 | returnErr := true 240 | if aerr, ok := err.(awserr.Error); ok { 241 | switch aerr.Code() { 242 | case sqs.ErrCodeQueueDoesNotExist: 243 | returnErr = false 244 | } 245 | } 246 | if returnErr { 247 | return nil, nil, err 248 | } 249 | } else { 250 | queueUrls = []*string{out.QueueUrl} 251 | } 252 | } 253 | return sqsClient, queueUrls, nil 254 | } 255 | 256 | func setDefaults(es v1.EventSource) v1.EventSource { 257 | es.Sqs.MaxConcurrency = common.DefaultInt64(es.Sqs.MaxConcurrency, 10) 258 | es.Sqs.VisibilityTimeout = common.DefaultInt64(es.Sqs.VisibilityTimeout, 300) 259 | es.Sqs.MessagesPerPoll = common.DefaultInt64(es.Sqs.MessagesPerPoll, 1) 260 | es.Sqs.ConcurrencyPerPoller = common.DefaultInt64(es.Sqs.ConcurrencyPerPoller, es.Sqs.MessagesPerPoll) 261 | return es 262 | } 263 | -------------------------------------------------------------------------------- /pkg/maelstrom/cluster.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | "fmt" 5 | "github.com/coopernurse/barrister-go" 6 | "github.com/coopernurse/maelstrom/pkg/common" 7 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 8 | log "github.com/mgutz/logxi/v1" 9 | "net/http" 10 | "sync" 11 | "time" 12 | ) 13 | 14 | type ClusterObserver interface { 15 | OnClusterUpdated(nodes map[string]v1.NodeStatus) 16 | } 17 | 18 | func NewCluster(myNodeId string, localNodeService v1.NodeService) *Cluster { 19 | return &Cluster{ 20 | myNodeId: myNodeId, 21 | localNodeService: localNodeService, 22 | observers: []ClusterObserver{}, 23 | nodesById: map[string]v1.NodeStatus{}, 24 | lock: &sync.Mutex{}, 25 | barristerLock: &sync.Mutex{}, 26 | barristerClients: make(map[string]barrister.Client), 27 | } 28 | } 29 | 30 | type Cluster struct { 31 | myNodeId string 32 | localNodeService v1.NodeService 33 | localMaelstromService v1.MaelstromService 34 | observers []ClusterObserver 35 | nodesById 
map[string]v1.NodeStatus 36 | lock *sync.Mutex 37 | barristerLock *sync.Mutex 38 | barristerClients map[string]barrister.Client 39 | } 40 | 41 | func (c *Cluster) AddObserver(observer ClusterObserver) { 42 | c.lock.Lock() 43 | c.observers = append(c.observers, observer) 44 | c.lock.Unlock() 45 | } 46 | 47 | func (c *Cluster) SetLocalMaelstromService(svc v1.MaelstromService) { 48 | c.localMaelstromService = svc 49 | } 50 | 51 | func (c *Cluster) SetNode(node v1.NodeStatus) bool { 52 | modified := false 53 | c.lock.Lock() 54 | oldNode, ok := c.nodesById[node.NodeId] 55 | if !ok || node.Version > oldNode.Version || node.ObservedAt > oldNode.ObservedAt { 56 | c.nodesById[node.NodeId] = node 57 | modified = true 58 | } 59 | c.lock.Unlock() 60 | if !ok { 61 | log.Info("cluster: added node", "nodeId", common.TruncNodeId(c.myNodeId), 62 | "remoteNode", common.TruncNodeId(node.NodeId)) 63 | } 64 | if modified { 65 | if log.IsDebug() { 66 | log.Debug("cluster: SetNode modified", "myNode", common.TruncNodeId(c.myNodeId), 67 | "peerNode", common.TruncNodeId(node.NodeId), "version", node.Version) 68 | } 69 | c.notifyAll() 70 | } else { 71 | if log.IsDebug() { 72 | log.Debug("cluster: SetNode NOT modified", "peerNode", common.TruncNodeId(node.NodeId), 73 | "version", node.Version) 74 | } 75 | } 76 | return modified 77 | } 78 | 79 | func (c *Cluster) SetAllNodes(nodes []v1.NodeStatus) { 80 | newNodesById := map[string]v1.NodeStatus{} 81 | for _, node := range nodes { 82 | newNodesById[node.NodeId] = node 83 | } 84 | c.lock.Lock() 85 | c.nodesById = newNodesById 86 | c.lock.Unlock() 87 | c.notifyAll() 88 | } 89 | 90 | func (c *Cluster) SetAndBroadcastStatus(node v1.NodeStatus) { 91 | c.SetNode(node) 92 | input := v1.StatusChangedInput{ 93 | NodeId: c.myNodeId, 94 | Exiting: false, 95 | Status: &node, 96 | } 97 | c.broadcastStatusChangeAsync(input, "SetAndBroadcastStatus") 98 | } 99 | 100 | func (c *Cluster) RemoveAndBroadcast() { 101 | c.RemoveNode(c.myNodeId) 102 | input := v1.StatusChangedInput{ 103 | NodeId: c.myNodeId, 104 | Exiting: true, 105 | } 106 | c.broadcastStatusChangeAsync(input, "RemoveAndBroadcast") 107 | } 108 | 109 | func (c *Cluster) broadcastStatusChangeAsync(input v1.StatusChangedInput, logPrefix string) { 110 | for _, svc := range c.GetRemoteNodeServices() { 111 | go func(s v1.NodeService) { 112 | if log.IsDebug() { 113 | var version int64 114 | if input.Status != nil { 115 | version = input.Status.Version 116 | } 117 | log.Debug("cluster: broadcastStatusChangeAsync", "version", version, 118 | "exiting", input.Exiting) 119 | } 120 | _, err := s.StatusChanged(input) 121 | if err != nil { 122 | log.Error("cluster: "+logPrefix+" error calling StatusChanged", "err", err) 123 | } 124 | }(svc) 125 | } 126 | } 127 | 128 | func (c *Cluster) RemoveNode(nodeId string) bool { 129 | c.lock.Lock() 130 | oldNode, found := c.nodesById[nodeId] 131 | if found { 132 | delete(c.nodesById, nodeId) 133 | } 134 | c.lock.Unlock() 135 | if found { 136 | log.Info("cluster: removed node", "nodeId", common.TruncNodeId(c.myNodeId), 137 | "remoteNode", common.TruncNodeId(nodeId), "peerUrl", oldNode.PeerUrl) 138 | c.notifyAll() 139 | } 140 | return found 141 | } 142 | 143 | func (c *Cluster) GetNodes() []v1.NodeStatus { 144 | c.lock.Lock() 145 | nodes := make([]v1.NodeStatus, len(c.nodesById)) 146 | i := 0 147 | for _, n := range c.nodesById { 148 | nodes[i] = n 149 | i++ 150 | } 151 | c.lock.Unlock() 152 | return nodes 153 | } 154 | 155 | func (c *Cluster) GetNodeById(nodeId string) *v1.NodeStatus { 156 | var 
node *v1.NodeStatus 157 | c.lock.Lock() 158 | n, ok := c.nodesById[nodeId] 159 | c.lock.Unlock() 160 | if ok { 161 | node = &n 162 | } 163 | return node 164 | } 165 | 166 | func (c *Cluster) GetNodeServiceById(nodeId string) v1.NodeService { 167 | node := c.GetNodeById(nodeId) 168 | if node == nil { 169 | return nil 170 | } else { 171 | return c.GetNodeService(*node) 172 | } 173 | } 174 | 175 | func (c *Cluster) GetNodeServiceWithTimeout(node v1.NodeStatus, timeout time.Duration) v1.NodeService { 176 | if node.NodeId == c.myNodeId { 177 | return c.localNodeService 178 | } 179 | return v1.NewNodeServiceProxy(c.getBarristerClient(node.PeerUrl, timeout)) 180 | } 181 | 182 | func (c *Cluster) GetNodeService(node v1.NodeStatus) v1.NodeService { 183 | return c.GetNodeServiceWithTimeout(node, 5*time.Minute) 184 | } 185 | 186 | func (c *Cluster) GetMaelstromServiceWithTimeout(node v1.NodeStatus, timeout time.Duration) v1.MaelstromService { 187 | if node.NodeId == c.myNodeId { 188 | return c.localMaelstromService 189 | } 190 | return v1.NewMaelstromServiceProxy(c.getBarristerClient(node.PeerUrl, timeout)) 191 | } 192 | 193 | func (c *Cluster) getBarristerClient(peerUrl string, timeout time.Duration) barrister.Client { 194 | cacheKey := fmt.Sprintf("%s|%d", peerUrl, timeout.Nanoseconds()) 195 | 196 | c.barristerLock.Lock() 197 | defer c.barristerLock.Unlock() 198 | 199 | client, ok := c.barristerClients[cacheKey] 200 | if ok { 201 | return client 202 | } 203 | transport := &barrister.HttpTransport{Url: peerUrl + "/_mael/v1"} 204 | if timeout > 0 { 205 | transport.Client = &http.Client{Timeout: timeout} 206 | } 207 | client = barrister.NewRemoteClient(transport, false) 208 | c.barristerClients[cacheKey] = client 209 | return client 210 | } 211 | 212 | func (c *Cluster) GetMaelstromService(node v1.NodeStatus) v1.MaelstromService { 213 | return c.GetMaelstromServiceWithTimeout(node, time.Minute) 214 | } 215 | 216 | func (c *Cluster) GetRemoteNodeServices() []v1.NodeService { 217 | svcs := make([]v1.NodeService, 0) 218 | c.lock.Lock() 219 | for nodeId, nodeStatus := range c.nodesById { 220 | if nodeId != c.myNodeId { 221 | svcs = append(svcs, c.GetNodeService(nodeStatus)) 222 | } 223 | } 224 | c.lock.Unlock() 225 | return svcs 226 | } 227 | 228 | func (c *Cluster) GetRemoteMaelstromServices() []v1.MaelstromService { 229 | svcs := make([]v1.MaelstromService, 0) 230 | c.lock.Lock() 231 | for nodeId, nodeStatus := range c.nodesById { 232 | if nodeId != c.myNodeId { 233 | svcs = append(svcs, c.GetMaelstromService(nodeStatus)) 234 | } 235 | } 236 | c.lock.Unlock() 237 | return svcs 238 | } 239 | 240 | func (c *Cluster) BroadcastDataChanged(input v1.NotifyDataChangedInput) { 241 | for _, svc := range c.GetRemoteMaelstromServices() { 242 | go func(s v1.MaelstromService) { 243 | _, err := s.NotifyDataChanged(input) 244 | if err != nil { 245 | log.Warn("cluster: error broadcasting data change", "err", err.Error()) 246 | } 247 | }(svc) 248 | } 249 | } 250 | 251 | func (c *Cluster) BroadcastTerminationEvent(input v1.TerminateNodeInput) { 252 | for _, svc := range c.GetRemoteNodeServices() { 253 | go func(s v1.NodeService) { 254 | out, err := s.TerminateNode(input) 255 | if err != nil { 256 | log.Warn("cluster: error broadcasting termination event", "err", err.Error()) 257 | } else if out.AcceptedMessage { 258 | log.Info("cluster: node accepted termination event", "instanceId", out.InstanceId, 259 | "peerNodeId", common.TruncNodeId(out.NodeId)) 260 | } 261 | }(svc) 262 | } 263 | } 264 | 265 | func (c *Cluster) 
notifyAll() { 266 | nodesCopy := make(map[string]v1.NodeStatus) 267 | c.lock.Lock() 268 | for k, v := range c.nodesById { 269 | nodesCopy[k] = v 270 | } 271 | c.lock.Unlock() 272 | for _, o := range c.observers { 273 | go o.OnClusterUpdated(nodesCopy) 274 | } 275 | } 276 | -------------------------------------------------------------------------------- /pkg/router/router.go: -------------------------------------------------------------------------------- 1 | package router 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | "net/http/httputil" 9 | "net/url" 10 | "sync" 11 | "time" 12 | 13 | "github.com/coopernurse/maelstrom/pkg/revproxy" 14 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 15 | log "github.com/mgutz/logxi/v1" 16 | ) 17 | 18 | type StartComponentFunc func(componentName string) 19 | 20 | ///////////////////////////////// 21 | 22 | type State int 23 | 24 | var stateLabels = []string{"off", "pending", "on"} 25 | 26 | func (s State) String() string { 27 | if s < 0 || s >= State(len(stateLabels)) { 28 | return fmt.Sprintf("unknown state: %d", s) 29 | } 30 | return stateLabels[s] 31 | } 32 | 33 | const ( 34 | StateOff State = iota 35 | StatePending 36 | StateOn 37 | ) 38 | 39 | ///////////////////////////////// 40 | 41 | func NewRouter(componentName string, nodeId string, bufferPool *revproxy.ProxyBufferPool, 42 | startCompFunc StartComponentFunc) *Router { 43 | return &Router{ 44 | componentName: componentName, 45 | nodeId: nodeId, 46 | startComponentFunc: startCompFunc, 47 | state: StateOff, 48 | remoteReqCh: nil, 49 | localReqCh: nil, 50 | inflightReqs: 0, 51 | bufferPool: bufferPool, 52 | remoteHandlersByUrl: make(map[string][]context.CancelFunc), 53 | lock: &sync.Mutex{}, 54 | } 55 | } 56 | 57 | type Router struct { 58 | componentName string 59 | nodeId string 60 | startComponentFunc StartComponentFunc 61 | state State 62 | remoteReqCh chan *revproxy.GetProxyRequest 63 | localReqCh chan *revproxy.GetProxyRequest 64 | inflightReqs int64 65 | activeHandlers int64 66 | bufferPool *revproxy.ProxyBufferPool 67 | 68 | // key=peerUrl value=slice of cancel funcs (one per rev proxy goroutine) 69 | remoteHandlersByUrl map[string][]context.CancelFunc 70 | 71 | lock *sync.Mutex 72 | } 73 | 74 | func (r *Router) WithStartComponentFunc(f StartComponentFunc) *Router { 75 | r.startComponentFunc = f 76 | return r 77 | } 78 | 79 | func (r *Router) GetState() (state State) { 80 | r.lock.Lock() 81 | state = r.state 82 | r.lock.Unlock() 83 | return 84 | } 85 | 86 | func (r *Router) GetHandlerCount() (count int64) { 87 | r.lock.Lock() 88 | count = r.activeHandlers 89 | r.lock.Unlock() 90 | return 91 | } 92 | 93 | func (r *Router) GetInflightReqs() (count int64) { 94 | r.lock.Lock() 95 | count = r.inflightReqs 96 | r.lock.Unlock() 97 | return 98 | } 99 | 100 | func (r *Router) SetRemoteHandlerCounts(urlToHandlerCount map[string]int) { 101 | r.lock.Lock() 102 | defer r.lock.Unlock() 103 | 104 | for targetUrl, targetCount := range urlToHandlerCount { 105 | cancelFuncs := r.remoteHandlersByUrl[targetUrl] 106 | if cancelFuncs == nil { 107 | cancelFuncs = make([]context.CancelFunc, 0) 108 | } 109 | 110 | var cancelFunc context.CancelFunc 111 | change := false 112 | 113 | if log.IsDebug() { 114 | log.Debug("router: setRemoteNodes", "component", r.componentName, "targetUrl", targetUrl, 115 | "targetCount", targetCount, "oldCount", len(cancelFuncs)) 116 | } 117 | 118 | u, err := url.Parse(targetUrl) 119 | if err == nil { 120 | for targetCount > len(cancelFuncs) { 121 | cancelFunc = 
r.startRemoteHandler(u, targetCount) 122 | cancelFuncs = append(cancelFuncs, cancelFunc) 123 | r.remoteHandlersByUrl[targetUrl] = cancelFuncs 124 | change = true 125 | } 126 | } else { 127 | log.Error("router: cannot create remote url", "err", err, "url", targetUrl) 128 | } 129 | 130 | for targetCount < len(cancelFuncs) { 131 | cancelFunc, cancelFuncs = cancelFuncs[0], cancelFuncs[1:] 132 | cancelFunc() 133 | r.remoteHandlersByUrl[targetUrl] = cancelFuncs 134 | change = true 135 | } 136 | 137 | if change { 138 | log.Info("router: updated remoteHandlersByUrl", "component", r.componentName, 139 | "handlerCount", urlToHandlerCount) 140 | } 141 | } 142 | for targetUrl, cancelFuncs := range r.remoteHandlersByUrl { 143 | _, ok := urlToHandlerCount[targetUrl] 144 | if !ok { 145 | for _, fx := range cancelFuncs { 146 | fx() 147 | } 148 | delete(r.remoteHandlersByUrl, targetUrl) 149 | log.Info("router: removed remoteHandlersByUrl", "component", r.componentName, "url", targetUrl) 150 | } 151 | } 152 | } 153 | 154 | func (r *Router) startRemoteHandler(targetUrl *url.URL, targetCount int) context.CancelFunc { 155 | proxy := httputil.NewSingleHostReverseProxy(targetUrl) 156 | proxy.BufferPool = r.bufferPool 157 | proxy.Transport = &http.Transport{ 158 | Proxy: http.ProxyFromEnvironment, 159 | MaxIdleConnsPerHost: targetCount, 160 | DialContext: (&net.Dialer{ 161 | Timeout: 30 * time.Second, 162 | KeepAlive: 30 * time.Second, 163 | }).DialContext, 164 | TLSHandshakeTimeout: 10 * time.Second, 165 | } 166 | 167 | ctx, cancelFunc := context.WithCancel(context.Background()) 168 | go func() { 169 | reqCh := r.HandlerStartRemote() 170 | defer r.HandlerStop() 171 | dispenser := revproxy.NewDispenser(targetCount, reqCh, r.nodeId, 172 | r.componentName, proxy, nil, ctx) 173 | dispenser.Run(ctx, nil) 174 | }() 175 | return cancelFunc 176 | } 177 | 178 | func (r *Router) HandlerStartRemote() (ch <-chan *revproxy.GetProxyRequest) { 179 | return r.handlerStart(true) 180 | } 181 | 182 | func (r *Router) HandlerStartLocal() (ch <-chan *revproxy.GetProxyRequest) { 183 | return r.handlerStart(false) 184 | } 185 | 186 | func (r *Router) handlerStart(remote bool) (ch <-chan *revproxy.GetProxyRequest) { 187 | r.lock.Lock() 188 | r.activeHandlers++ 189 | if r.state != StateOn { 190 | log.Info("router: state changed to on", "component", r.componentName) 191 | r.state = StateOn 192 | } 193 | r.ensureReqChan() 194 | if remote { 195 | ch = r.remoteReqCh 196 | } else { 197 | ch = r.localReqCh 198 | } 199 | r.lock.Unlock() 200 | return 201 | } 202 | 203 | func (r *Router) HandlerStop() { 204 | r.lock.Lock() 205 | defer r.lock.Unlock() 206 | 207 | if r.activeHandlers <= 0 { 208 | panic("router: HandlerStop called when activeHandlers <= 0: componentName=" + r.componentName) 209 | } 210 | 211 | r.activeHandlers-- 212 | if r.activeHandlers == 0 { 213 | if r.inflightReqs > 0 { 214 | r.setPendingState(0) 215 | } else { 216 | log.Info("router: state changed to off", "component", r.componentName) 217 | r.state = StateOff 218 | } 219 | } 220 | } 221 | 222 | func (r *Router) DestroyIfIdle() bool { 223 | r.lock.Lock() 224 | defer r.lock.Unlock() 225 | 226 | if r.state != StateOff || r.inflightReqs != 0 { 227 | return false 228 | } 229 | if r.remoteReqCh != nil { 230 | log.Info("router: closing channels", "component", r.componentName) 231 | close(r.remoteReqCh) 232 | close(r.localReqCh) 233 | r.remoteReqCh = nil 234 | r.localReqCh = nil 235 | } 236 | return true 237 | } 238 | 239 | func (r *Router) Route(ctx context.Context, req 
*revproxy.Request) { 240 | r.routeStart(req.Component) 241 | defer r.routeDone() 242 | 243 | haveProxy := false 244 | getProxyReq := revproxy.NewGetProxyRequest() 245 | 246 | if req.PreferLocal { 247 | localSecs := req.Component.MaxDurationSeconds / 10 248 | if localSecs < 1 { 249 | localSecs = 1 250 | } 251 | timeout := time.After(time.Duration(localSecs) * time.Second) 252 | select { 253 | case r.localReqCh <- getProxyReq: 254 | // ok - done 255 | haveProxy = true 256 | case <-ctx.Done(): 257 | // timeout or shutdown 258 | return 259 | case <-timeout: 260 | // fall through - give remote handlers a chance 261 | } 262 | } 263 | 264 | if !haveProxy { 265 | select { 266 | case r.localReqCh <- getProxyReq: 267 | // ok 268 | case r.remoteReqCh <- getProxyReq: 269 | // ok 270 | case <-ctx.Done(): 271 | // timeout or shutdown 272 | return 273 | } 274 | } 275 | 276 | // handle request 277 | proxyFx := <-getProxyReq.Proxy 278 | proxyFx(req) 279 | } 280 | 281 | func (r *Router) routeStart(comp *v1.Component) { 282 | r.lock.Lock() 283 | r.inflightReqs++ 284 | r.ensureReqChan() 285 | if r.state == StateOff { 286 | r.setPendingState(comp.Docker.HttpStartHealthCheckSeconds) 287 | } 288 | r.lock.Unlock() 289 | } 290 | 291 | func (r *Router) routeDone() { 292 | r.lock.Lock() 293 | r.inflightReqs-- 294 | if r.inflightReqs == 0 && r.activeHandlers == 0 { 295 | r.state = StateOff 296 | } 297 | r.lock.Unlock() 298 | } 299 | 300 | func (r *Router) ensureReqChan() { 301 | if r.remoteReqCh == nil { 302 | r.remoteReqCh = make(chan *revproxy.GetProxyRequest) 303 | } 304 | if r.localReqCh == nil { 305 | r.localReqCh = make(chan *revproxy.GetProxyRequest) 306 | } 307 | } 308 | 309 | func (r *Router) setPendingState(retrySeconds int64) { 310 | if r.state != StatePending { 311 | log.Info("router: state changed to pending", "component", r.componentName) 312 | r.state = StatePending 313 | if retrySeconds <= 0 { 314 | retrySeconds = 30 315 | } 316 | go r.runPendingLoop(time.Duration(retrySeconds) * time.Second) 317 | } 318 | } 319 | 320 | func (r *Router) runPendingLoop(retryDur time.Duration) { 321 | for r.GetState() == StatePending { 322 | r.startComponentFunc(r.componentName) 323 | time.Sleep(retryDur) 324 | } 325 | } 326 | -------------------------------------------------------------------------------- /pkg/maelstrom/placement.go: -------------------------------------------------------------------------------- 1 | package maelstrom 2 | 3 | import ( 4 | v1 "github.com/coopernurse/maelstrom/pkg/v1" 5 | "sort" 6 | ) 7 | 8 | type PlacementOption struct { 9 | TargetNode *v1.NodeStatus 10 | Input *v1.StartStopComponentsInput 11 | } 12 | 13 | func (p *PlacementOption) cloneWithTargetDelta(componentName string, delta int64, requiredRam int64) *PlacementOption { 14 | currentCount := int64(-1) 15 | targetCounts := make([]v1.ComponentTarget, 0) 16 | if p.Input != nil { 17 | for _, tc := range p.Input.TargetCounts { 18 | if tc.ComponentName == componentName { 19 | currentCount = tc.TargetCount 20 | } else { 21 | targetCounts = append(targetCounts, tc) 22 | } 23 | } 24 | } 25 | if currentCount < 0 { 26 | byComp, _ := p.ContainerCountByComponent() 27 | currentCount = int64(byComp[componentName]) 28 | } 29 | 30 | targetCounts = append(targetCounts, v1.ComponentTarget{ 31 | ComponentName: componentName, 32 | RequiredMemoryMiB: requiredRam, 33 | TargetCount: currentCount + delta, 34 | }) 35 | 36 | return &PlacementOption{ 37 | TargetNode: p.TargetNode, 38 | Input: &v1.StartStopComponentsInput{ 39 | ClientNodeId: 
p.Input.ClientNodeId, 40 | TargetVersion: p.TargetNode.Version, 41 | ReturnStatus: p.Input.ReturnStatus, 42 | TargetCounts: targetCounts, 43 | }, 44 | } 45 | } 46 | 47 | func (p *PlacementOption) RamUsed() int64 { 48 | byComp := map[string]int{} 49 | ramUsed := int64(0) 50 | if p.Input != nil { 51 | for _, tc := range p.Input.TargetCounts { 52 | byComp[tc.ComponentName] = int(tc.TargetCount) 53 | ramUsed += tc.TargetCount * tc.RequiredMemoryMiB 54 | } 55 | } 56 | if p.TargetNode != nil { 57 | for _, ci := range p.TargetNode.RunningComponents { 58 | _, ok := byComp[ci.ComponentName] 59 | if !ok { 60 | ramUsed += ci.MemoryReservedMiB 61 | } 62 | } 63 | } 64 | return ramUsed 65 | } 66 | 67 | func (p *PlacementOption) ContainerCountByComponent() (byComp map[string]int, total int) { 68 | byComp = map[string]int{} 69 | total = 0 70 | 71 | // first add counts that are currently running 72 | if p.TargetNode != nil { 73 | for _, ci := range p.TargetNode.RunningComponents { 74 | byComp[ci.ComponentName] += 1 75 | total++ 76 | } 77 | } 78 | 79 | // then adjust for any components that will be scaled up/down 80 | if p.Input != nil { 81 | for _, tc := range p.Input.TargetCounts { 82 | oldCount, ok := byComp[tc.ComponentName] 83 | if ok { 84 | total -= oldCount 85 | } 86 | byComp[tc.ComponentName] = int(tc.TargetCount) 87 | total += int(tc.TargetCount) 88 | } 89 | } 90 | 91 | return 92 | } 93 | 94 | func (p *PlacementOption) ramForComponent(componentName string) int64 { 95 | maxRam := int64(0) 96 | if p.TargetNode != nil { 97 | for _, ci := range p.TargetNode.RunningComponents { 98 | if componentName == ci.ComponentName && ci.MemoryReservedMiB > maxRam { 99 | maxRam = ci.MemoryReservedMiB 100 | } 101 | } 102 | } 103 | if p.Input != nil { 104 | for _, tc := range p.Input.TargetCounts { 105 | if componentName == tc.ComponentName && tc.RequiredMemoryMiB > maxRam { 106 | maxRam = tc.RequiredMemoryMiB 107 | } 108 | } 109 | } 110 | if maxRam == 0 { 111 | maxRam = 128 112 | } 113 | return maxRam 114 | } 115 | 116 | func (p *PlacementOption) scaleDownCount() int { 117 | _, scaleDown := p.scaleUpDownCounts() 118 | return scaleDown 119 | } 120 | 121 | func (p *PlacementOption) scaleUpDownCounts() (int, int) { 122 | scaleUp := 0 123 | scaleDown := 0 124 | byComp := map[string]int{} 125 | for _, ci := range p.TargetNode.RunningComponents { 126 | byComp[ci.ComponentName] += 1 127 | } 128 | for _, tc := range p.Input.TargetCounts { 129 | if tc.TargetCount < int64(byComp[tc.ComponentName]) { 130 | scaleDown++ 131 | } else if tc.TargetCount > int64(byComp[tc.ComponentName]) { 132 | scaleUp++ 133 | } 134 | } 135 | return scaleUp, scaleDown 136 | } 137 | 138 | type PlacementOptionByNode []*PlacementOption 139 | 140 | func (s PlacementOptionByNode) Len() int { return len(s) } 141 | func (s PlacementOptionByNode) Swap(i, j int) { 142 | s[i], s[j] = s[j], s[i] 143 | } 144 | func (s PlacementOptionByNode) Less(i, j int) bool { 145 | return s[i].TargetNode.NodeId < s[j].TargetNode.NodeId 146 | } 147 | 148 | type PlacementOptionByCostDesc struct { 149 | Options []*PlacementOption 150 | ComponentName string 151 | } 152 | 153 | func (s PlacementOptionByCostDesc) Len() int { return len(s.Options) } 154 | func (s PlacementOptionByCostDesc) Swap(i, j int) { 155 | s.Options[i], s.Options[j] = s.Options[j], s.Options[i] 156 | } 157 | func (s PlacementOptionByCostDesc) Less(i, j int) bool { 158 | iContainersByComp, iContainers := s.Options[i].ContainerCountByComponent() 159 | jContainersByComp, jContainers := 
s.Options[j].ContainerCountByComponent() 160 | 161 | // sort by # of instances of this component we're already running - lower is better (anti-affinity) 162 | iCompCount := iContainersByComp[s.ComponentName] 163 | jCompCount := jContainersByComp[s.ComponentName] 164 | if iCompCount != jCompCount { 165 | return iCompCount < jCompCount 166 | } 167 | 168 | // prefer options with minimal scale down (displacement) 169 | iScaleDown := s.Options[i].scaleDownCount() 170 | jScaleDown := s.Options[j].scaleDownCount() 171 | if iScaleDown != jScaleDown { 172 | return iScaleDown < jScaleDown 173 | } 174 | 175 | // if one node is empty, prefer it 176 | if iContainers <= 1 && jContainers > 1 { 177 | return true 178 | } 179 | if jContainers <= 1 && iContainers > 1 { 180 | return false 181 | } 182 | 183 | // finally sort by 1min load average - less is better 184 | return s.Options[i].TargetNode.LoadAvg1m < s.Options[j].TargetNode.LoadAvg1m 185 | } 186 | 187 | func BestStartComponentOption(placementByNode map[string]*PlacementOption, componentName string, 188 | requiredMemoryMiB int64, maxInstPerNode int64, displaceOK bool) *PlacementOption { 189 | 190 | options := make([]*PlacementOption, 0) 191 | 192 | // key: componentName, value: # of containers for that component 193 | componentInstanceCounts := map[string]int{} 194 | 195 | for _, placementOption := range placementByNode { 196 | 197 | // Calc memory available 198 | memoryAvailableMiB := placementOption.TargetNode.TotalMemoryMiB - placementOption.RamUsed() 199 | 200 | // Calc # of containers for each component this node is running 201 | countByComp, _ := placementOption.ContainerCountByComponent() 202 | underPerNodeLimit := maxInstPerNode <= 0 || countByComp[componentName] < int(maxInstPerNode) 203 | 204 | // Only consider node if it's under the maxInstPerNode limit AND total ram >= required ram for component 205 | if underPerNodeLimit && placementOption.TargetNode.TotalMemoryMiB >= requiredMemoryMiB { 206 | 207 | // If insufficient memory available and displacement is allowed, displace other 208 | // components on that node to free ram 209 | if (memoryAvailableMiB < requiredMemoryMiB) && displaceOK { 210 | 211 | // build list of components running - ignoring components already marked to scale down 212 | runningComps := make([]v1.ComponentInfo, 0) 213 | 214 | for _, ci := range placementOption.TargetNode.RunningComponents { 215 | if countByComp[ci.ComponentName] > 0 && ci.ComponentName != componentName { 216 | runningComps = append(runningComps, ci) 217 | countByComp[ci.ComponentName] -= 1 218 | } 219 | } 220 | 221 | // sort runningComps so that ones with largest cluster-side instance counts are first 222 | sort.Sort(ComponentInfoByRunningCountAndReqTime{ 223 | Components: runningComps, 224 | InstanceCounts: componentInstanceCounts, 225 | }) 226 | 227 | // stop components in order until sufficient memory is available 228 | for i := 0; memoryAvailableMiB < requiredMemoryMiB && i < len(runningComps); i++ { 229 | memoryAvailableMiB += runningComps[i].MemoryReservedMiB 230 | placementOption = placementOption.cloneWithTargetDelta(runningComps[i].ComponentName, -1, 231 | runningComps[i].MemoryReservedMiB) 232 | } 233 | } 234 | 235 | // if sufficient memory available, add as option 236 | if memoryAvailableMiB >= requiredMemoryMiB { 237 | modifiedOption := placementOption.cloneWithTargetDelta(componentName, 1, requiredMemoryMiB) 238 | options = append(options, modifiedOption) 239 | } 240 | } 241 | } 242 | 243 | if len(options) > 0 { 244 | 
sort.Sort(PlacementOptionByCostDesc{Options: options, ComponentName: componentName}) 245 | sort.Sort(ComponentTargetByCompName(options[0].Input.TargetCounts)) 246 | return options[0] 247 | } 248 | return nil 249 | } 250 | 251 | func BestStopComponentOption(placementByNode map[string]*PlacementOption, componentName string) *PlacementOption { 252 | 253 | options := make([]*PlacementOption, 0) 254 | 255 | for _, placementOption := range placementByNode { 256 | countByComp, _ := placementOption.ContainerCountByComponent() 257 | if countByComp[componentName] > 0 { 258 | requiredRam := placementOption.ramForComponent(componentName) 259 | modifiedOption := placementOption.cloneWithTargetDelta(componentName, -1, requiredRam) 260 | options = append(options, modifiedOption) 261 | } 262 | } 263 | 264 | if len(options) > 0 { 265 | sort.Sort(PlacementOptionByCostDesc{Options: options, ComponentName: componentName}) 266 | return options[len(options)-1] 267 | } 268 | return nil 269 | } 270 | --------------------------------------------------------------------------------
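
The appendix above notes that `maelstromd` settings are bound to the `Config` struct via [envconfig](https://github.com/kelseyhightower/envconfig) using the `MAEL_` prefix. The sketch below shows that binding pattern only; the field names, tags, and defaults are illustrative, and the authoritative struct lives in `pkg/config/config.go`.

```go
package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

// Config is a stand-in for the real struct in pkg/config/config.go; the
// fields, tags, and defaults below are illustrative only.
type Config struct {
	PublicPort         int    `envconfig:"PUBLIC_PORT" default:"80"`
	PublicHttpsPort    int    `envconfig:"PUBLIC_HTTPS_PORT" default:"443"`
	PrivatePort        int    `envconfig:"PRIVATE_PORT" default:"8374"`
	SqlDriver          string `envconfig:"SQL_DRIVER" required:"true"`
	SqlDsn             string `envconfig:"SQL_DSN" required:"true"`
	CronRefreshSeconds int    `envconfig:"CRON_REFRESH_SECONDS" default:"60"`
}

func main() {
	var cfg Config
	// The "mael" prefix means MAEL_PUBLIC_PORT binds to PublicPort, and so on.
	if err := envconfig.Process("mael", &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", cfg)
}
```

With this pattern, unset `required` variables cause `envconfig.Process` to return an error, while the `default` tags supply the fallback values listed in the tables above.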
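
To sanity-check a driver/DSN pair from the database table outside of `maelstromd`, a short standalone program can open the same connection by hand. The sqlite3 driver is chosen arbitrarily here and reuses the example DSN from the table; swap in the mysql or postgres driver and DSN as needed.

```go
package main

import (
	"database/sql"
	"log"

	_ "github.com/mattn/go-sqlite3" // registers the "sqlite3" driver
)

func main() {
	// Driver name and DSN correspond to MAEL_SQL_DRIVER / MAEL_SQL_DSN;
	// the DSN below is the sqlite3 example from the table above.
	db, err := sql.Open("sqlite3", "file:test.db?cache=shared&mode=memory")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	if err := db.Ping(); err != nil {
		log.Fatal(err)
	}
	log.Println("database reachable")
}
```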
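
The poller logic in `pkg/evsource/poller/evpoller.go` caps the event source's `MaxConcurrency` by the component's active handler count, then splits the result into per-poller chunks of at most `MaxConcurrencyPerPoller`, encoding each chunk size into the roleId. The helpers involved (`toMaxConcurrency`, `toRoleIdConcurrency`) are unexported, so the sketch below re-derives the same arithmetic with hypothetical inputs purely for illustration.

```go
package main

import (
	"fmt"
	"math"
)

// split mirrors the partitioning done by toMaxConcurrency/toRoleIdConcurrency
// for illustration; it is not the library code itself.
func split(rolePrefix string, maxConcurrency, perPoller, activeHandlers int) []string {
	if maxConcurrency <= 0 {
		maxConcurrency = 1
	}
	if perPoller <= 0 {
		perPoller = 1
	}
	if activeHandlers <= 0 {
		activeHandlers = 1
	}
	// overall concurrency never exceeds the number of active handlers
	if activeHandlers < maxConcurrency {
		maxConcurrency = activeHandlers
	}
	num := int(math.Ceil(float64(maxConcurrency) / float64(perPoller)))
	remain := maxConcurrency
	roleIds := make([]string, 0, num)
	for i := 0; i < num; i++ {
		c := perPoller
		if c > remain {
			c = remain
		}
		remain -= c
		if c > 0 {
			// the chunk size is baked into the roleId, so a concurrency change
			// produces new roleIds and the old pollers are cancelled
			roleIds = append(roleIds, fmt.Sprintf("%s-%d-%d", rolePrefix, i, c))
		}
	}
	return roleIds
}

func main() {
	// MaxConcurrency=10, ConcurrencyPerPoller=4, 7 active handlers
	// -> effective concurrency 7 -> two pollers with concurrency 4 and 3
	fmt.Println(split("aws-sqs-orders", 10, 4, 7))
	// Output: [aws-sqs-orders-0-4 aws-sqs-orders-1-3]
}
```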
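
On the component side of the Step Functions event source, the activity task input arrives as the POST body at the configured path. A 200 response body is returned through `SendTaskSuccess`, while any other status is reported through `SendTaskFailure` using the `step-func-error` and `step-func-cause` response headers (truncated to 256 and 32768 characters respectively). The handler below is hypothetical: the `/task` path and the assumption of JSON task input are illustrative only.

```go
package main

import (
	"encoding/json"
	"log"
	"net/http"
)

// handleTask is a hypothetical handler for an awsstepfunc event source path.
// On HTTP 200 the response body becomes the task output; otherwise the
// step-func-error / step-func-cause headers are forwarded to SendTaskFailure.
func handleTask(w http.ResponseWriter, r *http.Request) {
	var input map[string]interface{}
	if err := json.NewDecoder(r.Body).Decode(&input); err != nil {
		w.Header().Set("step-func-error", "BadInput")
		w.Header().Set("step-func-cause", err.Error())
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	// ... perform the work, then return the task output as JSON
	out := map[string]interface{}{"ok": true, "echo": input}
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(out); err != nil {
		log.Println("encode error:", err)
	}
}

func main() {
	http.HandleFunc("/task", handleTask)
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```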
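
For the SQS event source, a 200 response deletes the message, any other status leaves it on the queue to reappear after the visibility timeout (default 300 seconds), and a `pause-seconds` response header asks the poller to back off before its next receive. The handler below is hypothetical; the `/message` path, port, and backpressure check are illustrative only.

```go
package main

import (
	"io"
	"log"
	"net/http"
	"strconv"
)

// handleMessage is a hypothetical handler for an SQS event source path.
// A 200 deletes the message; any other status leaves it on the queue until
// the visibility timeout expires. The pause-seconds header throttles polling.
func handleMessage(w http.ResponseWriter, r *http.Request) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	if overloaded() {
		// ask the poller to pause for 30 seconds and retry the message later
		w.Header().Set("pause-seconds", strconv.Itoa(30))
		w.WriteHeader(http.StatusServiceUnavailable)
		return
	}
	log.Printf("processed %d byte message", len(body))
	w.WriteHeader(http.StatusOK)
}

// overloaded is a placeholder for real backpressure logic.
func overloaded() bool { return false }

func main() {
	http.HandleFunc("/message", handleMessage)
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```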
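
Code inside `maelstromd` that needs to react to membership changes implements `ClusterObserver` and registers via `Cluster.AddObserver`; `notifyAll` invokes each observer on its own goroutine with a copy of the node map. A minimal sketch of an observer follows; passing a nil `NodeService` to `NewCluster` is tolerable here only because nothing else is exercised.

```go
package main

import (
	"log"

	"github.com/coopernurse/maelstrom/pkg/maelstrom"
	v1 "github.com/coopernurse/maelstrom/pkg/v1"
)

// nodeLogger is a hypothetical observer that logs cluster membership changes.
// OnClusterUpdated receives a copy of the node map and runs on its own
// goroutine, so implementations should be safe for concurrent use.
type nodeLogger struct{}

func (n *nodeLogger) OnClusterUpdated(nodes map[string]v1.NodeStatus) {
	log.Printf("cluster now has %d node(s)", len(nodes))
	for id, st := range nodes {
		log.Printf("  node=%s version=%d", id, st.Version)
	}
}

func main() {
	// In maelstromd the Cluster is constructed with the local NodeService;
	// nil is acceptable in this sketch because no node RPCs are made.
	c := maelstrom.NewCluster("node-1", nil)
	c.AddObserver(&nodeLogger{})
}
```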
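
`BestStartComponentOption` prefers nodes running the fewest instances of the component (anti-affinity), then the least displacement, then the lowest load average. The call below is a small illustration with two hypothetical nodes: only the fields the placement logic reads are populated, and each option carries an empty `StartStopComponentsInput` as its starting point.

```go
package main

import (
	"fmt"

	"github.com/coopernurse/maelstrom/pkg/maelstrom"
	v1 "github.com/coopernurse/maelstrom/pkg/v1"
)

func main() {
	// Two hypothetical nodes: node-a already runs one "web" container,
	// node-b runs nothing.
	placements := map[string]*maelstrom.PlacementOption{
		"node-a": {
			TargetNode: &v1.NodeStatus{
				NodeId:         "node-a",
				TotalMemoryMiB: 4096,
				RunningComponents: []v1.ComponentInfo{
					{ComponentName: "web", MemoryReservedMiB: 512},
				},
			},
			Input: &v1.StartStopComponentsInput{},
		},
		"node-b": {
			TargetNode: &v1.NodeStatus{
				NodeId:         "node-b",
				TotalMemoryMiB: 4096,
			},
			Input: &v1.StartStopComponentsInput{},
		},
	}

	// Place one more "web" container needing 512 MiB, max 2 per node, no displacement.
	best := maelstrom.BestStartComponentOption(placements, "web", 512, 2, false)
	if best != nil {
		// anti-affinity prefers the node with fewer "web" containers: node-b
		fmt.Println("start on:", best.TargetNode.NodeId)
	}
}
```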