├── LICENSE
├── README.md
├── batchit.go
├── cmd
│   └── batchit
│       └── batchit.go
├── ddv
│   └── ddv.go
├── exsmount
│   └── exsmount.go
├── logof
│   └── logof.go
├── s3upload
│   └── s3upload.go
└── submit
    └── submit.go

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | batchit is a collection of utilities for working with AWS Batch.
2 | 
3 | 
4 | usage
5 | =====
6 | 
7 | ```
8 | batchit Version: $version
9 | 
10 | ddv        : detach and delete a volume by id
11 | ebsmount   : create and mount an EBS volume from an EC2 instance
12 | efsmount   : mount an EFS drive from an EC2 instance
13 | localmount : RAID and mount local storage
14 | logof      : get the log of a given job id
15 | s3upload   : upload local files to matching s3 paths in parallel
16 | submit     : run a batch command
17 | ```
18 | 
19 | submit
20 | ------
21 | 
22 | example:
23 | 
24 | ```
25 | batchit submit \
26 |     --image worker:latest \
27 |     --role worker-role \
28 |     --queue big-queue \
29 |     --jobname my-work \
30 |     --cpus 32 \
31 |     --mem 32000 \
32 |     --envvars "sample=SS-1234" "reference=hg38" "bucket=my-s3-bucket" \
33 |     --ebs /mnt/my-ebs:500:st1:ext4 \
34 |     align.sh
35 | ```
36 | 
37 | ### Interactive
38 | 
39 | To get an interactive job, use the `submit` command, but instead of a script (`align.sh` above),
40 | pass, for example, `interactive:20` to get an interactive job that will run for 20 minutes.
41 | 
42 | This command will start a job that sleeps for 20 minutes and then output an ssh command that will drop
43 | the user into the Docker container running that job.
44 | 
45 | This is useful for debugging as it quickly drops a user into the same environment that the jobs
46 | will be run in.
47 | 
48 | ### batchit requirements
49 | 
50 | #### AWS
51 | AWS Batch itself requires the `AWSBatchServiceRole` and `ecsInstanceRole` generated by running the [first-run wizard](https://console.aws.amazon.com/batch/home#/wizard). Because batchit containers use EC2 and S3 services, batchit requires an [ecsTaskRole](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_IAM_role.html) with `AmazonEC2FullAccess` and `AmazonS3FullAccess`. This is the `worker-role` above.
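
For reference, one way to create such a task role with the AWS CLI is sketched below; the role name and the trust-policy file name are placeholders (the first-run wizard can also create the role for you):

```
# trust policy letting ECS tasks assume the role; save as ecs-tasks-trust.json:
# {"Version": "2012-10-17", "Statement": [{"Effect": "Allow",
#   "Principal": {"Service": "ecs-tasks.amazonaws.com"}, "Action": "sts:AssumeRole"}]}
aws iam create-role --role-name worker-role \
    --assume-role-policy-document file://ecs-tasks-trust.json
aws iam attach-role-policy --role-name worker-role \
    --policy-arn arn:aws:iam::aws:policy/AmazonEC2FullAccess
aws iam attach-role-policy --role-name worker-role \
    --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
```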
52 | 
53 | 
54 | #### Docker
55 | The `image` must be present in your Elastic Container Registry, and the container will itself need batchit as a dependency if `--ebs` is used. A typical Dockerfile entry for this will look like:
56 | 
57 | ```
58 | RUN apt-get install -y wget
59 | RUN wget -qO /usr/bin/batchit https://github.com/base2genomics/batchit/releases/latest/download/batchit
60 | RUN chmod +x /usr/bin/batchit
61 | ```
62 | 
63 | In this example `align.sh` contains the commands to be run. It will have access to a 500GB
64 | `st1` volume created with ext4 and mounted to `/mnt/my-ebs`. This automatically sets docker to run in privileged
65 | mode so that the container has access to the EBS volume that is attached under /dev on the instance.
66 | (We stole this idea from [aegea](https://github.com/kislyuk/aegea))
67 | 
68 | The volume will be cleaned up automatically when the **container** exits.
69 | 
70 | Note that array jobs are also supported with the `--arraysize INT` parameter. Currently, the user is responsible for specifying
71 | the dependency mode (`N_TO_N` or `SEQUENTIAL`) to the `--dependson` parameter.
72 | 
73 | For this example, a simplified `align.sh` might look like (always include the first two lines):
74 | 
75 | ```
76 | set -euo pipefail
77 | cd $TMPDIR
78 | aws s3 cp s3://${bucket}/${sample}_r1.fq .
79 | aws s3 cp s3://${bucket}/${sample}_r2.fq .
80 | aws s3 sync s3://${bucket}/assets/${reference} .
81 | bwa mem -t ${cpus} ${reference}.fa ${sample}_r1.fq ${sample}_r2.fq \
82 |     | samtools sort -o ${sample}.bam
83 | samtools index ${sample}.bam
84 | aws s3 cp ${sample}.bam s3://${bucket}/
85 | aws s3 cp ${sample}.bam.bai s3://${bucket}/
86 | ```
87 | 
88 | ebsmount
89 | --------
90 | 
91 | Create, attach, format, and mount an EBS volume of the specified size and type to the specified mount-point.
92 | If `-n` is greater than 1, the drives are automatically RAID0'd (for performance, not reliability).
93 | This is used (in shorthand) by the `--ebs` argument to `batchit submit` above.
94 | 
95 | ```
96 | Usage: batchit [--size SIZE] --mountpoint MOUNTPOINT [--volumetype VOLUMETYPE] [--fstype FSTYPE] [--iops IOPS] [--n N] [--keep]
97 | 
98 | Options:
99 |   --size SIZE, -s SIZE   size in GB of desired EBS volume [default: 200]
100 |   --mountpoint MOUNTPOINT, -m MOUNTPOINT
101 |                          directory on which to mount the EBS volume
102 |   --volumetype VOLUMETYPE, -v VOLUMETYPE
103 |                          desired volume type; gp2 for General Purpose SSD; io1 for Provisioned IOPS SSD; st1 for Throughput Optimized HDD; sc1 for HDD or Magnetic volumes; standard for infrequent access [default: gp2]
104 |   --fstype FSTYPE, -t FSTYPE
105 |                          file system type to create (argument must be accepted by mkfs) [default: ext4]
106 |   --iops IOPS, -i IOPS   Provisioned IOPS. Only valid for volume type io1. Range is 100 to 20000 and <= 50*size of volume.
107 |   --n N, -n N            number of volumes to request. These will be RAID0'd into a single volume for better write speed and available as a single drive at the specified mount point. [default: 1]
108 |   --keep, -k             don't delete the volume(s) on termination (default is to delete)
109 |   --help, -h             display this help and exit
110 |   --version              display version and exit
111 | 
112 | ```
113 | 
114 | efsmount
115 | --------
116 | 
117 | This is a trivial wrapper around mounting an EFS volume.
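For example (the file-system ID below is a placeholder; the trailing `:/` selects the root of the file system):

```
batchit efsmount fs-XXXXXX.efs.us-east-1.amazonaws.com:/ /mnt/efs
```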
118 | 
119 | ```
120 | Usage: batchit [--mountoptions MOUNTOPTIONS] EFS MOUNTPOINT
121 | 
122 | Positional arguments:
123 |   EFS                    efs DNS and mount path (e.g. fs-XXXXXX.efs.us-east-1.amazonaws.com:/mnt/efs/)
124 |   MOUNTPOINT             local directory on which to mount the EFS volume
125 | 
126 | Options:
127 |   --mountoptions MOUNTOPTIONS, -o MOUNTOPTIONS
128 |                          options to send to mount command
129 |   --help, -h             display this help and exit
130 | ```
--------------------------------------------------------------------------------
/batchit.go:
--------------------------------------------------------------------------------
1 | package batchit
2 | 
3 | const Version = "0.4.3"
--------------------------------------------------------------------------------
/cmd/batchit/batchit.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import (
4 | 	"fmt"
5 | 	"io"
6 | 	"os"
7 | 	"sort"
8 | 	"strconv"
9 | 
10 | 	"github.com/base2genomics/batchit"
11 | 	"github.com/base2genomics/batchit/ddv"
12 | 	"github.com/base2genomics/batchit/exsmount"
13 | 	"github.com/base2genomics/batchit/logof"
14 | 	"github.com/base2genomics/batchit/s3upload"
15 | 	"github.com/base2genomics/batchit/submit"
16 | )
17 | 
18 | type progPair struct {
19 | 	help string
20 | 	main func()
21 | }
22 | 
23 | var progs = map[string]progPair{
24 | 	"ebsmount":   progPair{"create and mount an EBS volume from an EC2 instance", exsmount.Main},
25 | 	"efsmount":   progPair{"mount an EFS drive from an EC2 instance", exsmount.EFSMain},
26 | 	"localmount": progPair{"RAID and mount local storage", exsmount.LocalMain},
27 | 	"logof":      progPair{"get the log of a given job id", logof.Main},
28 | 	"submit":     progPair{"run a batch command", submit.Main},
29 | 	"ddv":        progPair{"detach and delete a volume by id", ddv.Main},
30 | 	"s3upload":   progPair{"upload local files to matching s3 paths in parallel", s3upload.Main},
31 | }
32 | 
33 | func printProgs() {
34 | 
35 | 	var wtr io.Writer = os.Stdout
36 | 
37 | 	fmt.Fprintf(wtr, "batchit Version: %s\n\n", batchit.Version)
38 | 
39 | 	wtr.Write([]byte(`batchit is a collection of programs most likely to be of use with AWS Batch.
40 | It includes convenience operations so that user scripts can stay simple.
41 | 
42 | `))
43 | 	var keys []string
44 | 	l := 5
45 | 	for k := range progs {
46 | 		keys = append(keys, k)
47 | 		if len(k) > l {
48 | 			l = len(k)
49 | 		}
50 | 	}
51 | 	fmtr := "%-" + strconv.Itoa(l) + "s : %s\n"
52 | 	sort.Strings(keys)
53 | 	for _, k := range keys {
54 | 		fmt.Fprintf(wtr, fmtr, k, progs[k].help)
55 | 
56 | 	}
57 | 	os.Exit(1)
58 | 
59 | }
60 | 
61 | func main() {
62 | 
63 | 	if len(os.Args) < 2 {
64 | 		printProgs()
65 | 	}
66 | 	var p progPair
67 | 	var ok bool
68 | 	if p, ok = progs[os.Args[1]]; !ok {
69 | 		printProgs()
70 | 	}
71 | 	// remove the prog name from the call
72 | 	os.Args = append(os.Args[:1], os.Args[2:]...)
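	// e.g. ["batchit", "submit", "--queue", "q"] becomes ["batchit", "--queue", "q"],
	// so the chosen sub-command's flag parser sees a conventional argv.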
73 | 	p.main()
74 | }
75 | 
--------------------------------------------------------------------------------
/ddv/ddv.go:
--------------------------------------------------------------------------------
1 | package ddv
2 | 
3 | import (
4 | 	"fmt"
5 | 	"log"
6 | 	"os"
7 | 	"strings"
8 | 	"sync"
9 | 	"time"
10 | 
11 | 	"github.com/base2genomics/batchit/exsmount"
12 | 
13 | 	"github.com/aws/aws-sdk-go/aws"
14 | 	"github.com/aws/aws-sdk-go/aws/session"
15 | 	"github.com/aws/aws-sdk-go/service/ec2"
16 | )
17 | 
18 | func DetachAndDelete(vid string) error {
19 | 	var svc *ec2.EC2
20 | 	var drsp *ec2.DescribeVolumesOutput
21 | 	var err error
22 | 
23 | 	for _, region := range []string{"us-east-1", "us-east-2", "us-west-1", "us-west-2", "ap-south-1",
24 | 		"ap-northeast-2",
25 | 		"ap-northeast-1",
26 | 		"ca-central-1",
27 | 		"cn-north-1",
28 | 		"eu-west-1",
29 | 		"eu-west-2",
30 | 		"sa-east-1",
31 | 		"us-gov-west-1",
32 | 		"ap-southeast-1",
33 | 		"ap-southeast-2",
34 | 	} {
35 | 		svc = ec2.New(session.Must(session.NewSession()), &aws.Config{Region: &region})
36 | 		drsp, err = svc.DescribeVolumes(
37 | 			&ec2.DescribeVolumesInput{
38 | 				VolumeIds: []*string{&vid},
39 | 			})
40 | 		if err != nil {
41 | 			continue
42 | 		}
43 | 		break
44 | 	}
45 | 	if drsp == nil {
46 | 		return fmt.Errorf("ddv: volume: %s not found", vid)
47 | 	}
48 | 	if err != nil {
49 | 		return err
50 | 	}
51 | 
52 | 	log.Printf("ddv: found volume for deletion in region: %s", *svc.Config.Region)
53 | 
54 | 	dtvi := &ec2.DetachVolumeInput{
55 | 		VolumeId: aws.String(vid),
56 | 		Force:    aws.Bool(true),
57 | 	}
58 | 
59 | 	var v *ec2.VolumeAttachment
60 | 
61 | 	for i := 0; i < 10; i++ {
62 | 		v, err = svc.DetachVolume(dtvi)
63 | 		if err == nil {
64 | 			if err := exsmount.WaitForVolumeStatus(svc, &vid, "available"); err != nil {
65 | 				return err
66 | 			}
67 | 			break
68 | 		}
69 | 		if strings.Contains(err.Error(), "is in the 'available' state") {
70 | 			break
71 | 		}
72 | 		if v != nil && *v.State == "available" {
73 | 			break
74 | 		}
75 | 		if err != nil {
76 | 			return err
77 | 		}
78 | 		time.Sleep(1 * time.Second)
79 | 	}
80 | 
81 | 	if _, err := svc.DeleteVolume(&ec2.DeleteVolumeInput{VolumeId: aws.String(vid)}); err != nil {
82 | 		return err
83 | 	}
84 | 	return nil
85 | }
86 | 
87 | func Main() {
88 | 	if len(os.Args) < 2 {
89 | 		fmt.Println("usage: ddv <volume-id> [<volume-id> ...]")
90 | 		os.Exit(1)
91 | 	}
92 | 	wg := &sync.WaitGroup{}
93 | 	for _, vid := range os.Args[1:] {
94 | 		wg.Add(1)
95 | 		go func(vid string) {
96 | 
97 | 			if err := DetachAndDelete(vid); err != nil {
98 | 				log.Println(err)
99 | 			} else {
100 | 				log.Printf("volume %s has been deleted", vid)
101 | 			}
102 | 			wg.Done()
103 | 		}(vid)
104 | 	}
105 | 	wg.Wait()
106 | 	// note: the exit status is zero even if some deletions fail; errors are only logged.
107 | }
--------------------------------------------------------------------------------
/exsmount/exsmount.go:
--------------------------------------------------------------------------------
1 | package exsmount
2 | 
3 | import (
4 | 	"bufio"
5 | 	"bytes"
6 | 	"encoding/json"
7 | 	"fmt"
8 | 	"io"
9 | 	"log"
10 | 	"math/rand"
11 | 	"net/http"
12 | 	"os"
13 | 	"os/exec"
14 | 	"strconv"
15 | 	"strings"
16 | 	"time"
17 | 
18 | 	"github.com/base2genomics/batchit"
19 | 
20 | 	arg "github.com/alexflint/go-arg"
21 | 	"github.com/aws/aws-sdk-go/aws"
22 | 	"github.com/aws/aws-sdk-go/aws/session"
23 | 	"github.com/aws/aws-sdk-go/service/ec2"
24 | 	"github.com/pkg/errors"
25 | )
26 | 
27 | // IID holds the attributes from the instance identity document
28 | type IID struct {
29 | 	AvailabilityZone string `json:"availabilityZone"`
30 | 	InstanceId       string `json:"instanceId"`
31 | 	InstanceType     string `json:"instanceType"`
32 | 	ImageId          string `json:"imageId"`
33 | 	Region           string `json:"region"`
34 | }
35 | 
36 | func init() {
37 | 	rand.Seed(time.Now().Unix())
38 | }
39 | func (i *IID) Get() error {
40 | 	rsp, err := http.Get("http://169.254.169.254/latest/dynamic/instance-identity/document")
41 | 	if err != nil {
42 | 		return err
43 | 	}
44 | 	defer rsp.Body.Close() // close the metadata response body to avoid leaking the connection
45 | 	d := json.NewDecoder(rsp.Body)
46 | 	return d.Decode(i)
47 | }
48 | 
49 | type Args struct {
50 | 	Size       int64  `arg:"-s,help:size in GB of desired EBS volume"`
51 | 	MountPoint string `arg:"-m,required,help:directory on which to mount the EBS volume"`
52 | 	VolumeType string `arg:"-v,help:desired volume type; gp2 for General Purpose SSD; io1 for Provisioned IOPS SSD; st1 for Throughput Optimized HDD; sc1 for HDD or Magnetic volumes; standard for infrequent access"`
53 | 	FSType     string `arg:"-t,help:file system type to create (argument must be accepted by mkfs)"`
54 | 	Iops       int64  `arg:"-i,help:Provisioned IOPS. Only valid for volume type io1. Range is 100 to 20000 and <= 50*size of volume."`
55 | 	N          int    `arg:"-n,help:number of volumes to request. These will be RAID0'd into a single volume for better write speed and available as a single drive at the specified mount point."`
56 | 	Keep       bool   `arg:"-k,help:don't delete the volume(s) on termination (default is to delete)"`
57 | }
58 | 
59 | func (a Args) Version() string {
60 | 	return batchit.Version
61 | }
62 | 
63 | type LocalArgs struct {
64 | 	MountPrefix string   `arg:"positional,required,help:local path to mount devices."`
65 | 	Devices     []string `arg:"positional,help:devices to mount. e.g. (/dev/xvd*). Devices that are already mounted will be skipped."`
66 | }
67 | 
68 | func (l LocalArgs) Version() string {
69 | 	return fmt.Sprintf("localmount %s", batchit.Version)
70 | }
71 | 
72 | func (l LocalArgs) Description() string {
73 | 	return "RAID-0, mkfs and mount a series of drives."
74 | }
75 | 
76 | func mountedDevices() map[string]bool {
77 | 	devices := make(map[string]bool)
78 | 	f, err := os.Open("/proc/mounts")
79 | 	if err != nil {
80 | 		return devices
81 | 	}
82 | 	r := bufio.NewReader(f)
83 | 	for {
84 | 		line, err := r.ReadString('\n')
85 | 		if err == io.EOF {
86 | 			break
87 | 		}
88 | 		if err != nil {
89 | 			panic(err)
90 | 		}
91 | 		dev := strings.Fields(line)[0]
92 | 		devices[dev] = true
93 | 		for i := len(dev) - 1; i > 1; i-- { // also record the parent device: strip trailing partition digits (e.g. xvdb1 -> xvdb)
94 | 			v := dev[i]
95 | 			if '0' <= v && v <= '9' {
96 | 				dev = dev[:len(dev)-1]
97 | 			} else {
98 | 				break
99 | 			}
100 | 		}
101 | 		devices[dev] = true
102 | 
103 | 	}
104 | 
105 | 	return devices
106 | }
107 | 
108 | func contains(haystack []string, needle string) bool {
109 | 	for _, h := range haystack {
110 | 		if h == needle {
111 | 			return true
112 | 		}
113 | 	}
114 | 	return false
115 | }
116 | 
117 | // MountLocal RAID-0's all devices onto a single mount-point.
118 | func MountLocal(deviceCandidates []string, mountBase string) ([]string, error) {
119 | 	inUse := mountedDevices()
120 | 	var devices []string
121 | 	for _, dev := range deviceCandidates {
122 | 		sub := dev[:len(dev)-1]
123 | 		// skip e.g. xvdc1 when we also have xvdc among the candidates
124 | 		if contains(deviceCandidates, sub) {
125 | 			continue
126 | 		}
127 | 
128 | 		if _, err := os.Stat(dev); err != nil {
129 | 			if os.IsNotExist(err) {
130 | 				break
131 | 			}
132 | 			return nil, err
133 | 		}
134 | 		if _, ok := inUse[dev]; ok {
135 | 			continue
136 | 		}
137 | 		devices = append(devices, dev)
138 | 	}
139 | 	if len(devices) == 0 {
140 | 		log.Printf("localmount: no unused local storage found for %s", deviceCandidates)
141 | 		return nil, fmt.Errorf("exsmount: no unused local storage found")
142 | 	}
143 | 	if _, err := exec.LookPath("mdadm"); err != nil || len(devices) == 1 {
144 | 		if len(devices) > 1 {
145 | 			log.Println("mdadm not found; mounting each device to its own path")
146 | 		}
147 | 		for i, dev := range devices {
148 | 			log.Printf("making fs for %s", dev)
149 | 			if err := mkfs("ext4", dev); err != nil {
150 | 				if err == MountedError {
151 | 					continue
152 | 				}
153 | 				log.Println(err)
154 | 				return nil, err
155 | 			}
156 | 			base := mountBase
157 | 			if i > 0 {
158 | 				base = fmt.Sprintf("%s_%d", mountBase, i)
159 | 			}
160 | 			log.Printf("mounting: %s to %s", dev, base)
161 | 			if err := makeAndMount(dev, base); err != nil {
162 | 				return nil, err
163 | 			}
164 | 		}
165 | 		return devices, nil
166 | 	}
167 | 	// RAID0
168 | 	var raidDev string
169 | 	for i := 0; i < 20; i++ {
170 | 		rd := fmt.Sprintf("/dev/md%d", i)
171 | 		if _, err := os.Stat(rd); err != nil {
172 | 			if os.IsNotExist(err) {
173 | 				raidDev = rd
174 | 				break
175 | 			}
176 | 		}
177 | 	}
178 | 	if raidDev == "" {
179 | 		return nil, fmt.Errorf("no available /dev/md path found")
180 | 	}
181 | 
182 | 	args := []string{"--create", "--verbose", raidDev, "-R", "--level=stripe", fmt.Sprintf("--raid-devices=%d", len(devices))}
183 | 	args = append(args, devices...)
184 | 	log.Println("creating RAID0 array with:", strings.Join(append([]string{"mdadm"}, args...), " "))
185 | 
186 | 	cmd := exec.Command("mdadm", args...)
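	// mdadm flags: -R (--run) starts the array without asking for confirmation and
	// --level=stripe requests RAID0; the member devices then appear as the single
	// block device raidDev.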
187 | 	cmd.Stderr, cmd.Stdout = os.Stderr, os.Stderr
188 | 	if err := cmd.Run(); err != nil {
189 | 		return nil, err
190 | 	}
191 | 	if err := mkfs("ext4", raidDev); err != nil {
192 | 		return []string{raidDev}, err
193 | 	}
194 | 	return []string{raidDev}, makeAndMount(raidDev, mountBase)
195 | }
196 | 
197 | var MountedError = errors.New("drive is already mounted")
198 | 
199 | func mkfs(fstype, attachDevice string) error {
200 | 
201 | 	cmd := exec.Command("mkfs", "-t", fstype, attachDevice)
202 | 	var b bytes.Buffer
203 | 	cmd.Stderr, cmd.Stdout = &b, os.Stderr
204 | 	if err := cmd.Run(); err != nil {
205 | 		stderr := b.String()
206 | 		if strings.Contains(stderr, "is mounted") {
207 | 			return MountedError
208 | 		}
209 | 		os.Stderr.WriteString(stderr)
210 | 		return err
211 | 	}
212 | 	return nil
213 | }
214 | 
215 | func Create(svc *ec2.EC2, iid *IID, size int64, typ string, iops int64, is ...int) (*ec2.Volume, error) {
216 | 	suf := ""
217 | 	if len(is) > 0 {
218 | 		suf = fmt.Sprintf("-%d", is[0])
219 | 	}
220 | 
221 | 	cvi := &ec2.CreateVolumeInput{
222 | 		AvailabilityZone: aws.String(iid.AvailabilityZone),
223 | 		Size:             aws.Int64(size), // GB
224 | 		VolumeType:       aws.String(typ),
225 | 		TagSpecifications: []*ec2.TagSpecification{
226 | 			&ec2.TagSpecification{
227 | 				ResourceType: aws.String("volume"),
228 | 				Tags:         []*ec2.Tag{&ec2.Tag{Key: aws.String("Name"), Value: aws.String(fmt.Sprintf("batchit-%s%s", iid.InstanceId, suf))}},
229 | 			},
230 | 		},
231 | 	}
232 | 	if typ == "io1" {
233 | 		cvi.Iops = aws.Int64(iops)
234 | 	}
235 | 
236 | 	rsp, err := svc.CreateVolume(cvi)
237 | 	if err != nil {
238 | 		return nil, err
239 | 	}
240 | 	if err := WaitForVolumeStatus(svc, rsp.VolumeId, "available"); err != nil {
241 | 		return nil, err
242 | 	}
243 | 	return rsp, nil
244 | }
245 | 
246 | type EFSArgs struct {
247 | 	MountOptions string `arg:"-o,help:options to send to mount command"`
248 | 	EFS          string `arg:"positional,required,help:efs DNS and mount path (e.g. fs-XXXXXX.efs.us-east-1.amazonaws.com:/mnt/efs/)"`
249 | 	MountPoint   string `arg:"positional,required,help:local directory on which to mount the EFS volume"`
250 | }
251 | 
252 | // EFSMain mounts an EFS drive.
253 | func EFSMain() {
254 | 	cli := &EFSArgs{MountPoint: "/mount/efs/"}
255 | 	arg.MustParse(cli)
256 | 
257 | 	if err := EFSMount(cli.EFS, cli.MountPoint, cli.MountOptions); err != nil {
258 | 		panic(err)
259 | 	}
260 | }
261 | 
262 | // EFSMount will mount the EFS drive to the requested mount-point.
263 | // the efs argument looks like: fs-XXXXXX.efs.us-east-1.amazonaws.com:/mnt/efs/
264 | func EFSMount(efs string, mountPoint string, mountOpts string) error {
265 | 	if err := makeDir(mountPoint); err != nil {
266 | 		return err
267 | 	}
268 | 	opts := "rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2"
269 | 	if mountOpts != "" {
270 | 		opts += "," + mountOpts
271 | 	}
272 | 	if !strings.Contains(efs, ":") {
273 | 		return fmt.Errorf("EFS string must include the path within the mount, e.g. fs-XXXXXX.efs.us-east-1.amazonaws.com:/")
274 | 	}
275 | 	// https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-mount-cmd-general.html
276 | 	cmd := exec.Command("mount", "-t", "nfs4", "-o", opts, efs, mountPoint)
277 | 	cmd.Stderr, cmd.Stdout = os.Stderr, os.Stderr
278 | 	return cmd.Run()
279 | }
280 | 
281 | // http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html
282 | const letters = "bcdefghijklmnopqrstuvwxyz"
283 | 
284 | func CreateAttach(cli *Args) ([]string, error) {
285 | 	iid := &IID{}
286 | 	if err := iid.Get(); err != nil {
287 | 		return nil, err
288 | 	}
289 | 	sess, err := session.NewSession()
290 | 	if err != nil {
291 | 		return nil, errors.Wrap(err, "error creating session")
292 | 	}
293 | 	if cli.VolumeType == "io1" {
294 | 		if cli.Iops == 0 {
295 | 			cli.Iops = 45 * cli.Size
296 | 		}
297 | 		if cli.Iops < 100 || cli.Iops > 20000 {
298 | 			return nil, fmt.Errorf("ebsmount: Iops must be between 100 and 20000")
299 | 		}
300 | 		if cli.Iops > 50*cli.Size {
301 | 			log.Printf("ebsmount: IOPs must be <= 50*size of volume; resetting")
302 | 			cli.Iops = 45 * cli.Size
303 | 			if cli.Iops > 20000 { // cap at the io1 maximum
304 | 				cli.Iops = 20000
305 | 			}
306 | 		}
307 | 	}
308 | 
309 | 	var devices []string
310 | 	var volumes []string
311 | 	svc := ec2.New(sess, &aws.Config{Region: aws.String(iid.Region)})
312 | 
313 | 	cli.Size = int64(float64(cli.Size)/float64(cli.N) + 0.5) // split the requested size across the N volumes
314 | 	for i := 0; i < cli.N; i++ {
315 | 		log.Println("batchit: creating EBS volume:", i)
316 | 
317 | 		var rsp *ec2.Volume
318 | 		if rsp, err = Create(svc, iid, cli.Size, cli.VolumeType, cli.Iops, i); err != nil {
319 | 			if strings.Contains(err.Error(), "RequestLimitExceeded") {
320 | 				time.Sleep(time.Duration(10+rand.Intn(90)) * time.Second)
321 | 				var err2 error
322 | 				if rsp, err2 = Create(svc, iid, cli.Size, cli.VolumeType, cli.Iops, i); err2 != nil {
323 | 					log.Println("WARNING: this usually means you need to space out job submissions")
324 | 					return nil, errors.Wrap(err, "error creating volume")
325 | 				}
326 | 
327 | 			} else {
328 | 				return nil, errors.Wrap(err, "error creating volume")
329 | 			}
330 | 		}
331 | 		attached := false
332 | 
333 | 		defer func() {
334 | 			if !attached {
335 | 				log.Println("batchit: unsuccessful EBS volume attachment, deleting volume")
336 | 				_, err := svc.DeleteVolume(&ec2.DeleteVolumeInput{VolumeId: rsp.VolumeId})
337 | 				if err != nil {
338 | 					log.Println(err)
339 | 				}
340 | 			}
341 | 		}()
342 | 		time.Sleep(3 * time.Second) // sleep to avoid doing too many requests.
343 | 
344 | 		// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html
345 | 		// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
346 | 		var attachDevice string
347 | 		for pi, prefix := range []string{"/dev/sd", "/dev/sd", "/dev/xvd"} {
348 | 			if attached {
349 | 				break
350 | 			}
351 | 
352 | 			var koff, off int // these help so we don't retry the same dev multiple times
353 | 			for k := int64(0); k < 7 && int(k)+koff < len(letters); k++ {
354 | 				off, attachDevice = findNextDevNode(prefix, pi, letters[int(k)+koff:len(letters)])
355 | 				if off == -1 {
356 | 					break
357 | 				}
358 | 				koff += off
359 | 				if k > 3 {
360 | 					// if we get high enough, we are probably racing with other jobs
361 | 					// so introduce some randomness.
362 | 					koff += rand.Intn(5)
363 | 				}
364 | 
365 | 				if _, err := svc.AttachVolume(&ec2.AttachVolumeInput{
366 | 					InstanceId: aws.String(iid.InstanceId),
367 | 					VolumeId:   rsp.VolumeId,
368 | 					Device:     aws.String(attachDevice),
369 | 				}); err != nil {
370 | 					// there is a race attaching devices from multiple containers to the same host /dev address,
371 | 					// so retry up to 7 times (k) with a randomized wait.
372 | 					log.Printf("retrying EBS attach because of difficulty getting volume. error was: %T. %s", err, err)
373 | 					if strings.Contains(err.Error(), "Invalid value") && strings.Contains(err.Error(), "for unixDevice") {
374 | 						break
375 | 					}
376 | 					if strings.Contains(err.Error(), "is already in use") {
377 | 						time.Sleep((time.Duration(3 * (k + rand.Int63n(2*k+1)))) * time.Second)
378 | 						continue
379 | 					}
380 | 
381 | 					return nil, errors.Wrap(err, "error attaching device")
382 | 				}
383 | 
384 | 				volumes = append(volumes, *rsp.VolumeId)
385 | 
386 | 				if err := WaitForVolumeStatus(svc, rsp.VolumeId, "in-use"); err != nil {
387 | 					return nil, err
388 | 				}
389 | 
390 | 				if !waitForDevice(attachDevice) {
391 | 					return nil, fmt.Errorf("ebsmount: device %s never appeared after attach", attachDevice)
392 | 				}
393 | 				devices = append(devices, attachDevice)
394 | 				attached = true
395 | 				break
396 | 			}
397 | 		}
398 | 		if !attached {
399 | 			return nil, fmt.Errorf("ebsmount: unable to attach device")
400 | 		}
401 | 
402 | 		if !cli.Keep {
403 | 			if err := DeleteOnTermination(svc, iid.InstanceId, *rsp.VolumeId, attachDevice); err != nil {
404 | 				return nil, errors.Wrap(err, "error setting delete on termination")
405 | 			}
406 | 		}
407 | 
408 | 	}
409 | 
410 | 	fmt.Println(strings.Join(volumes, " "))
411 | 	if err = makeDir(cli.MountPoint); err != nil {
412 | 		return nil, err
413 | 	}
414 | 
415 | 	return devices, nil
416 | }
417 | 
418 | func DeleteOnTermination(svc *ec2.EC2, instanceId string, volumeId string, attachDevice string) error {
419 | 	// set delete on termination
420 | 	// take the address once so the SDK mapping below can reference it.
421 | 	ad := &attachDevice
422 | 	log.Println("ebsmount: setting to delete on termination")
423 | 	moi := &ec2.ModifyInstanceAttributeInput{
424 | 		InstanceId: aws.String(instanceId),
425 | 		BlockDeviceMappings: []*ec2.InstanceBlockDeviceMappingSpecification{
426 | 			&ec2.InstanceBlockDeviceMappingSpecification{
427 | 				// TODO: see if attachDevice is required
428 | 				DeviceName: ad,
429 | 				Ebs: &ec2.EbsInstanceBlockDeviceSpecification{
430 | 					DeleteOnTermination: aws.Bool(true),
431 | 					VolumeId:            aws.String(volumeId),
432 | 				},
433 | 			}},
434 | 	}
435 | 	_, err := svc.ModifyInstanceAttribute(moi)
436 | 	return errors.Wrap(err, "error setting delete on termination")
437 | }
438 | 
439 | func makeAndMount(attachDevice, mountPoint string) error {
440 | 	var err error
441 | 
442 | 	if err = makeDir(mountPoint); err != nil {
443 | 		return err
444 | 	}
445 | 
446 | 	opts := []string{"mount", "-o", "noatime", attachDevice, mountPoint}
447 | 	cmd := exec.Command("mount", opts[1:]...)
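	// opts[0] ("mount") is just a label for readability: exec.Command takes the program
	// name as its own argument, so only opts[1:] is passed as the actual argv.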
448 | 	cmd.Stderr, cmd.Stdout = os.Stderr, os.Stderr
449 | 	if err := cmd.Run(); err != nil {
450 | 		return err
451 | 	}
452 | 	return nil
453 | }
454 | 
455 | func makeDir(path string) error {
456 | 	// create the directory (and any parents) if it does not already exist.
457 | 	if _, err := os.Stat(path); err != nil {
458 | 		if os.IsNotExist(err) {
459 | 			if err := os.MkdirAll(path, os.FileMode(0777)); err != nil {
460 | 				return err
461 | 			}
462 | 		} else {
463 | 			return err
464 | 		}
465 | 
466 | 	}
467 | 	return nil
468 | }
469 | 
470 | func LocalMain() {
471 | 	cli := &LocalArgs{MountPrefix: "/mount/local/"}
472 | 	arg.MustParse(cli)
473 | 
474 | 	if _, err := MountLocal(cli.Devices, cli.MountPrefix); err != nil {
475 | 		panic(err)
476 | 	}
477 | }
478 | 
479 | func Main() {
480 | 	cli := &Args{
481 | 		Size:       200,
482 | 		VolumeType: "gp2",
483 | 		FSType:     "ext4",
484 | 		N:          1,
485 | 	}
486 | 	if p := arg.MustParse(cli); cli.VolumeType != "st1" && cli.VolumeType != "gp2" && cli.VolumeType != "sc1" && cli.VolumeType != "io1" && cli.VolumeType != "standard" {
487 | 		p.Fail("volume type must be one of st1/gp2/sc1/io1/standard")
488 | 	} else if cli.N > 16 || cli.N < 1 {
489 | 		p.Fail("number of volumes should be between 1 and 16")
490 | 	}
491 | 
492 | 	devices, err := CreateAttach(cli)
493 | 	if err != nil {
494 | 		panic(err)
495 | 	}
496 | 
497 | 	if devices, err := MountLocal(devices, cli.MountPoint); err != nil {
498 | 		panic(err)
499 | 	} else if cli.VolumeType == "st1" || cli.VolumeType == "sc1" {
500 | 		// https://aws.amazon.com/blogs/aws/amazon-ebs-update-new-cold-storage-and-throughput-options/
501 | 		for _, d := range devices {
502 | 			cmd := exec.Command("blockdev", "--setra", "2048", d)
503 | 			cmd.Stderr, cmd.Stdout = os.Stderr, os.Stderr
504 | 			if err := cmd.Run(); err != nil {
505 | 				log.Println("warning: error setting read-ahead", err)
506 | 			}
507 | 		}
508 | 	}
509 | 	fmt.Fprintf(os.Stderr, "mounted %d EBS drives to %s\n", len(devices), cli.MountPoint)
510 | }
511 | // findNextDevNode probes for an unused device node. Note: this assumes Xen-style naming (/dev/sd*, /dev/xvd*).
512 | func findNextDevNode(prefix string, pi int, suffixChars string) (int, string) {
513 | 	if prefix == "/dev/sd" {
514 | 		if pi == 0 {
515 | 			for i, s := range suffixChars {
516 | 				if _, err := os.Stat(prefix + string(s)); err == nil {
517 | 					continue
518 | 				} else if os.IsNotExist(err) {
519 | 					return i, prefix + string(s)
520 | 				}
521 | 			}
522 | 			return -1, ""
523 | 		}
524 | 		if pi != 0 {
525 | 			for i, s := range suffixChars {
526 | 				for j := 1; j < 15; j++ {
527 | 					// can't use sdb1 if sdb exists.
528 | if _, err := os.Stat(prefix + string(s)); err == nil { 529 | continue 530 | } 531 | if _, err := os.Stat(prefix + string(s) + strconv.Itoa(j)); err == nil { 532 | continue 533 | } else if os.IsNotExist(err) { 534 | return i, prefix + string(s) + strconv.Itoa(j) 535 | } 536 | } 537 | } 538 | return -1, "" 539 | } 540 | 541 | } else { 542 | // /dev/xd 543 | for _, a := range "bc" { 544 | for i, b := range "abcdefghijklmnopqrstuvwxyz" { 545 | if _, err := os.Stat(prefix + string(a) + string(b)); err == nil { 546 | continue 547 | } else if os.IsNotExist(err) { 548 | return i, prefix + string(a) + string(b) 549 | } 550 | } 551 | } 552 | } 553 | panic(fmt.Errorf("no available device found with prefix: %s", prefix)) 554 | } 555 | 556 | func waitForDevice(device string) bool { 557 | for i := 0; i < 30; i++ { 558 | if _, err := os.Stat(device); err != nil { 559 | time.Sleep(1 * time.Second) 560 | } else { 561 | return true 562 | } 563 | 564 | } 565 | return false 566 | } 567 | 568 | func WaitForVolumeStatus(svc *ec2.EC2, volumeId *string, status string) error { 569 | var xstatus string 570 | time.Sleep(5 * time.Second) 571 | 572 | for i := 0; i < 30; i++ { 573 | drsp, err := svc.DescribeVolumes( 574 | &ec2.DescribeVolumesInput{ 575 | VolumeIds: []*string{volumeId}, 576 | }) 577 | if err != nil { 578 | return errors.Wrapf(err, "error waiting for volume: %s status: %s", *volumeId, status) 579 | } 580 | if len(drsp.Volumes) == 0 { 581 | panic(fmt.Sprintf("volume: %s not found", *volumeId)) 582 | } 583 | xstatus = *drsp.Volumes[0].State 584 | if xstatus == status { 585 | return nil 586 | } 587 | time.Sleep(4 * time.Second) 588 | if i > 10 { 589 | time.Sleep(time.Duration(i) * time.Second) 590 | } 591 | } 592 | return fmt.Errorf("never found volume: %s with status: %s. last was: %s", *volumeId, status, xstatus) 593 | } 594 | -------------------------------------------------------------------------------- /logof/logof.go: -------------------------------------------------------------------------------- 1 | package logof 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "sort" 8 | "time" 9 | 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/aws/session" 12 | "github.com/aws/aws-sdk-go/service/batch" 13 | "github.com/aws/aws-sdk-go/service/cloudwatchlogs" 14 | ) 15 | 16 | func LogOf(jobId string, region string) int { 17 | input := batch.DescribeJobsInput{Jobs: []*string{aws.String(jobId)}} 18 | cfg := aws.NewConfig().WithRegion(region) 19 | sess := session.Must(session.NewSession(cfg)) 20 | b := batch.New(sess, cfg) 21 | output, err := b.DescribeJobs(&input) 22 | if err != nil { 23 | log.Printf("[batchit] error finding jobs: %s in %s", jobId, region) 24 | log.Println(err) 25 | os.Exit(1) 26 | } 27 | if len(output.Jobs) == 0 { 28 | return 0 29 | } 30 | sort.Slice(output.Jobs, func(i, j int) bool { return *output.Jobs[i].StartedAt < *output.Jobs[j].StartedAt }) 31 | j := output.Jobs[len(output.Jobs)-1] 32 | stream := j.Container.LogStreamName 33 | if stream == nil { 34 | log.Fatalf("job %s not found. 
has it started?", jobId) 35 | } 36 | 37 | gli := &cloudwatchlogs.GetLogEventsInput{ 38 | LogGroupName: aws.String("/aws/batch/job"), 39 | LogStreamName: stream, 40 | StartFromHead: aws.Bool(true), 41 | } 42 | 43 | cloud := cloudwatchlogs.New(sess, cfg) 44 | 45 | for { 46 | ev, err := cloud.GetLogEvents(gli) 47 | if err != nil { 48 | panic(err) 49 | } 50 | for _, event := range ev.Events { 51 | t := time.Unix(*event.Timestamp/1000, 0) 52 | fmt.Println("[" + t.Format(time.ANSIC) + "] " + *event.Message) 53 | } 54 | if ev.NextForwardToken == nil || (gli.NextToken != nil && *ev.NextForwardToken == *gli.NextToken) { 55 | break 56 | } 57 | gli.NextToken = ev.NextForwardToken 58 | } 59 | return 0 60 | } 61 | 62 | func Main() { 63 | if len(os.Args) < 3 { 64 | fmt.Println("usage: batchit logof JobId region") 65 | os.Exit(1) 66 | } 67 | os.Exit(LogOf(os.Args[1], os.Args[2])) 68 | } 69 | -------------------------------------------------------------------------------- /s3upload/s3upload.go: -------------------------------------------------------------------------------- 1 | package s3upload 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "sync" 10 | "time" 11 | 12 | "github.com/base2genomics/batchit/submit" 13 | 14 | arg "github.com/alexflint/go-arg" 15 | "github.com/aws/aws-sdk-go/aws" 16 | "github.com/aws/aws-sdk-go/aws/session" 17 | "github.com/aws/aws-sdk-go/service/s3" 18 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 19 | ) 20 | 21 | type cliargs struct { 22 | Region string `arg:"env:AWS_DEFAULT_REGION,help:region for batch setup"` 23 | Check bool `arg:"-c,help:check if file exists before uploading and don't upload if it is same size."` 24 | NoFail bool `arg:"help:don't fail if one of the local paths corresponding to an S3 path is not found."` 25 | Processes int `arg:"-p,help:number of parallel uploads."` 26 | S3Paths []string `arg:"required,positional,help:S3 destination paths. The final entry in the Key will be used to look for the local file."` 27 | } 28 | 29 | func (c cliargs) Description() string { 30 | return `Upload files to S3 in parallel using convention (file-naming) 31 | This program requires that if you want to upload to s3://bucket/where/to/send.txt 32 | a local file named 'send.txt' will exist. This program will upload the first 'send.txt' it finds. 33 | 34 | To upload only files that are not already present, use '-c'. To not fail even if a local file is not found, use --nofail. 35 | With '-c', if the local size does not match the size in S3, the file will be uploaded. 
36 | `
37 | }
38 | 
39 | func findIn(haystack []string, needle string) int {
40 | 	for i, h := range haystack {
41 | 		if needle == h {
42 | 			return i
43 | 		}
44 | 	}
45 | 	return -1
46 | }
47 | 
48 | func getupload(s3paths []string, svc *s3.S3, check bool, nofail bool) ([]*s3manager.UploadInput, error) {
49 | 	uploads := make([]*s3manager.UploadInput, 0, len(s3paths))
50 | 	localpaths := make([]string, len(s3paths))
51 | 	founds := make([]bool, len(s3paths))
52 | 
53 | 	for i, s3path := range s3paths {
54 | 		if strings.HasPrefix(s3path, "s3://") {
55 | 			s3path = s3path[5:]
56 | 		}
57 | 
58 | 		tmp := strings.Split(s3path, "/")
59 | 		localpaths[i] = tmp[len(tmp)-1]
60 | 	}
61 | 
62 | 	err := filepath.Walk(".", func(path string, f os.FileInfo, err error) error {
63 | 		if err != nil {
64 | 			return err
65 | 		}
66 | 		if f.IsDir() {
67 | 			return nil
68 | 		}
69 | 		// f.Name() is already the base name; match it against the expected local names.
70 | 		idx := findIn(localpaths, f.Name())
71 | 
72 | 		if idx == -1 {
73 | 			return nil
74 | 		}
75 | 		founds[idx] = true
76 | 		s3path := s3paths[idx]
77 | 		if check {
78 | 			// check if file exists in s3
79 | 			exists, size, err := submit.OutputExists(svc, s3path)
80 | 			if err != nil && err != submit.NotFound {
81 | 				return err
82 | 			}
83 | 			if err == nil && exists && size == f.Size() {
84 | 				fmt.Fprintf(os.Stderr, "[batchit s3upload] %s already in s3, skipping\n", f.Name())
85 | 				return nil
86 | 			}
87 | 
88 | 		}
89 | 
90 | 		fp, err := os.Open(path) // open via the walk path; f.Name() is only the base name
91 | 		if err != nil {
92 | 			return err
93 | 		}
94 | 		if strings.HasPrefix(s3path, "s3://") {
95 | 			s3path = s3path[5:]
96 | 		}
97 | 		bk := strings.SplitN(s3path, "/", 2)
98 | 		uploads = append(uploads, &s3manager.UploadInput{
99 | 			Bucket: aws.String(bk[0]),
100 | 			Key:    aws.String(bk[1]),
101 | 			Body:   fp,
102 | 		})
103 | 		return nil
104 | 	})
105 | 	for i, found := range founds {
106 | 		if found {
107 | 			continue
108 | 		}
109 | 		if nofail {
110 | 			log.Println("local file not found for " + s3paths[i])
111 | 		} else {
112 | 			log.Fatal("local file not found for " + s3paths[i])
113 | 		}
114 | 
115 | 	}
116 | 	return uploads, err
117 | }
118 | 
119 | func Main() {
120 | 
121 | 	// TODO: check Region with iid.
122 | 	cli := &cliargs{Processes: 2, Region: "us-east-1"}
123 | 	arg.MustParse(cli)
124 | 	cfg := aws.NewConfig().WithRegion(cli.Region)
125 | 	sess := session.Must(session.NewSession(cfg))
126 | 	svc := s3.New(sess)
127 | 
128 | 	uploads, err := getupload(cli.S3Paths, svc, cli.Check, cli.NoFail)
129 | 	if err != nil {
130 | 		log.Fatal(err)
131 | 	}
132 | 
133 | 	iter := make(chan *s3manager.UploadInput, len(uploads))
134 | 	for _, u := range uploads {
135 | 		iter <- u
136 | 	}
137 | 	close(iter)
138 | 
139 | 	var wg sync.WaitGroup
140 | 	wg.Add(cli.Processes)
141 | 
142 | 	for i := 0; i < cli.Processes; i++ {
143 | 		go func() {
144 | 			// NOTE: using multiple uploaders, each of which has concurrency. Might want to tune this later.
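			// each worker constructs its own uploader below; with Concurrency 5 per
			// uploader, up to Processes*5 part uploads can be in flight at once.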
145 | 			uploader := s3manager.NewUploaderWithClient(svc, func(u *s3manager.Uploader) {
146 | 				u.PartSize = 16 * 1024 * 1024 // 16MB per part
147 | 				u.LeavePartsOnError = false
148 | 				u.Concurrency = 5
149 | 			})
150 | 			for u := range iter {
151 | 
152 | 				t := time.Now()
153 | 				fmt.Fprintf(os.Stderr, "[batchit s3upload] starting upload of %s\n", u.Body.(*os.File).Name())
154 | 
155 | 				_, err := uploader.Upload(u, func(u *s3manager.Uploader) {
156 | 					u.PartSize = 24 * 1024 * 1024 // 24MB per part (overrides the 16MB set above for this call)
157 | 					u.LeavePartsOnError = false
158 | 				})
159 | 				if err != nil {
160 | 					log.Fatal(err)
161 | 				}
162 | 				fmt.Fprintf(os.Stderr, "[batchit s3upload] uploaded %s in %s\n", u.Body.(*os.File).Name(), time.Since(t))
163 | 
164 | 			}
165 | 			wg.Done()
166 | 		}()
167 | 	}
168 | 	wg.Wait()
169 | 
170 | }
171 | 
--------------------------------------------------------------------------------
/submit/submit.go:
--------------------------------------------------------------------------------
1 | package submit
2 | 
3 | import (
4 | 	"bytes"
5 | 	"compress/gzip"
6 | 	"encoding/base64"
7 | 	"fmt"
8 | 	"io"
9 | 	"log"
10 | 	"os"
11 | 	"strconv"
12 | 	"strings"
13 | 	"time"
14 | 
15 | 	"github.com/base2genomics/batchit"
16 | 
17 | 	arg "github.com/alexflint/go-arg"
18 | 	"github.com/aws/aws-sdk-go/aws"
19 | 	"github.com/aws/aws-sdk-go/aws/awserr"
20 | 	"github.com/aws/aws-sdk-go/aws/session"
21 | 	"github.com/aws/aws-sdk-go/service/batch"
22 | 	"github.com/aws/aws-sdk-go/service/ec2"
23 | 	"github.com/aws/aws-sdk-go/service/ecs"
24 | 	"github.com/aws/aws-sdk-go/service/iam"
25 | 	"github.com/aws/aws-sdk-go/service/s3"
26 | 	"github.com/aws/aws-sdk-go/service/sts"
27 | 	"github.com/brentp/xopen"
28 | 	"github.com/pkg/errors"
29 | )
30 | 
31 | type cliargs struct {
32 | 	Image     string   `arg:"-i,required,help:image like $acct.dkr.ecr.$region.amazonaws.com/$image:$tag or $image:$tag"`
33 | 	Registry  string   `arg:"env" help:"Docker image registry. [default: $acct.dkr.ecr.$region.amazonaws.com]"`
34 | 	Role      string   `arg:"-r,required,help:existing role name"`
35 | 	Region    string   `arg:"env:AWS_DEFAULT_REGION,help:region for batch setup"`
36 | 	Queue     string   `arg:"-q,required,help:job queue"`
37 | 	ArraySize int64    `arg:"-a,help:optional size of array job"`
38 | 	DependsOn []string `arg:"-d,help:jobId(s) that this job depends on"`
39 | 	Retries   int64    `arg:"help:number of times to retry this job on failure"` // long flag only; -r is taken by --role
40 | 	EnvVars   []string `arg:"-v,help:key-value environment pairs of the form NAME=value"`
41 | 	CPUs      int      `arg:"-c,help:number of cpus reserved by the job"`
42 | 	Volumes   []string `arg:"-o,help:HOST_PATH=CONTAINER_PATH"`
43 | 	S3Outputs string   `arg:"help:comma-delimited list of s3 paths indicating the output of this run. If all present job will *not* be run."`
44 | 	Mem       int      `arg:"-m,help:memory (MiB) reserved by the job"`
45 | 	Ebs       string   `arg:"-e,help:args for ebs mount. format mount-point:size:volume-type:fstype eg /mnt/xx:500:gp2:ext4 where last 2 arguments are optional and default as shown. This assumes that batchit is installed on the host. If type==io1 the 5th argument must specify the IOPs (between 100 and 20000)"`
46 | 	JobName   string   `arg:"-j,required,help:name of job"`
47 | 	Path      string   `arg:"required,positional,help:path of bash script to run. With '-' it will be read from STDIN. Prefix with 'script:' to send a string."`
48 | }
49 | 
50 | func (c cliargs) Version() string {
51 | 	return batchit.Version
52 | }
53 | 
54 | func getRole(svc *iam.IAM, role string) *iam.Role {
55 | 	inp := &iam.GetRoleInput{RoleName: &role}
56 | 	op, err := svc.GetRole(inp)
57 | 	if err != nil {
58 | 		panic(err)
59 | 	}
60 | 	return op.Role
61 | }
62 | 
63 | const scriptPrefix = "script:"
64 | const interactivePrefix = "interactive:"
65 | 
66 | // gzip and then base64 encode a shell script.
67 | func shellEncode(path string) string {
68 | 	var b bytes.Buffer
69 | 	enc := base64.NewEncoder(base64.StdEncoding, &b)
70 | 	z := gzip.NewWriter(enc)
71 | 	if strings.HasPrefix(path, scriptPrefix) {
72 | 		if _, err := z.Write([]byte(path[len(scriptPrefix):])); err != nil {
73 | 			panic(err)
74 | 		}
75 | 	} else if strings.HasPrefix(path, interactivePrefix) {
76 | 		tmp := strings.Split(path, ":")
77 | 		minutes := 20
78 | 		if len(tmp) == 2 {
79 | 			m, err := strconv.Atoi(tmp[1])
80 | 			if err == nil {
81 | 				minutes = m
82 | 			} else {
83 | 				log.Printf("couldn't parse minutes from %s", tmp[1])
84 | 			}
85 | 		}
86 | 		if _, err := z.Write([]byte(fmt.Sprintf("sleep %d", minutes*60))); err != nil {
87 | 			panic(err)
88 | 		}
89 | 	} else {
90 | 		rdr, err := xopen.Ropen(path)
91 | 		if err != nil {
92 | 			panic(err)
93 | 		}
94 | 		_, err = io.Copy(z, rdr)
95 | 		if err != nil {
96 | 			panic(err)
97 | 		}
98 | 	}
99 | 	if err := z.Close(); err != nil {
100 | 		panic(err)
101 | 	}
102 | 	if err := enc.Close(); err != nil {
103 | 		panic(err)
104 | 	}
105 | 	return b.String()
106 | }
107 | 
108 | func getTmp(cli *cliargs) string {
109 | 	if len(cli.Volumes) == 0 {
110 | 		return ""
111 | 	}
112 | 	mnt := strings.Split(cli.Volumes[0], "=")[1]
113 | 	tmp := fmt.Sprintf(`# thanks Hao
114 | export TMPDIR="$(mktemp -d -p %s)"
115 | cleanup() { echo "batchit: deleting temp dir ${TMPDIR}"; umount -l /tmp/; rm -rf ${TMPDIR}; }
116 | trap "cleanup_volume EXIT; cleanup;" EXIT
117 | mkdir -p ${TMPDIR}/tmp/
118 | mount --bind ${TMPDIR}/tmp/ /tmp/
119 | cd $TMPDIR`, mnt)
120 | 	return tmp
121 | }
122 | 
123 | var NotFound = errors.New("not found")
124 | 
125 | // OutputExists reports whether the object at path exists, its size, and any error.
126 | func OutputExists(s3o *s3.S3, path string) (bool, int64, error) {
127 | 	if strings.HasPrefix(path, "s3://") {
128 | 		path = path[5:]
129 | 	}
130 | 	bk := strings.SplitN(path, "/", 2)
131 | 	ho, err := s3o.HeadObject(&s3.HeadObjectInput{Bucket: aws.String(bk[0]), Key: aws.String(bk[1])})
132 | 	if err != nil {
133 | 		if aerr, ok := err.(awserr.Error); ok {
134 | 			switch aerr.Code() {
135 | 			case "Forbidden":
136 | 				return false, 0, fmt.Errorf("you do not have permissions to access %s", path)
137 | 			case "NotFound":
138 | 				return false, 0, NotFound
139 | 			default:
140 | 				return false, 0, aerr
141 | 			}
142 | 
143 | 		}
144 | 		return false, 0, err
145 | 	}
146 | 
147 | 	return ho.ContentLength != nil && *ho.ContentLength > 0, *ho.ContentLength, nil
148 | }
149 | 
150 | func outputsExist(sess *session.Session, paths []string) bool {
151 | 	svc := s3.New(sess)
152 | 	for _, p := range paths {
153 | 		found, _, err := OutputExists(svc, p)
154 | 		if err != nil && err != NotFound {
155 | 			log.Fatal(err)
156 | 		}
157 | 		if !found {
158 | 			return false
159 | 		}
160 | 	}
161 | 	return true
162 | }
163 | 
164 | func Main() {
165 | 	cli := &cliargs{CPUs: 1, Mem: 1048, Retries: 1, Region: "us-east-1"}
166 | 	p := arg.MustParse(cli)
167 | 
168 | 	cfg := aws.NewConfig().WithRegion(cli.Region)
169 | 	sess := session.Must(session.NewSession(cfg))
170 | 
171 | 	if cli.S3Outputs != "" {
172 | 		if outputsExist(sess, strings.Split(cli.S3Outputs, ",")) {
173 | 			max := 100
174 | 			if max > len(cli.S3Outputs) {
175 | 				max = len(cli.S3Outputs)
176 | 			}
177 | 			fmt.Fprintln(os.Stderr, "[batchit submit] all output found for "+cli.S3Outputs[0:max]+"... not re-running")
178 | 			return
179 | 		}
180 | 	}
181 | 	cleanupDefault := `cleanup_volume() { true; }`
182 | 	var ebsCmd [3]string
183 | 	if len(cli.Ebs) > 0 {
184 | 		ebs := strings.Split(cli.Ebs, ":")
185 | 		if len(ebs) == 3 {
186 | 			ebs = append(ebs, "ext4")
187 | 		}
188 | 		if len(ebs) == 2 {
189 | 			_, err := strconv.Atoi(ebs[1])
190 | 			if err != nil {
191 | 				panic(fmt.Sprintf("error with specified ebs drive size: %s, %s", ebs[1], err))
192 | 			}
193 | 			ebs = append(ebs, []string{"gp2", "ext4"}...)
194 | 		}
195 | 		if len(ebs) != 4 && len(ebs) != 5 {
196 | 			p.Fail("expected Ebs argument to have between 2 and 5 colon-delimited fields")
197 | 		}
198 | 		sz, err := strconv.Atoi(ebs[1])
199 | 		if err != nil {
200 | 			panic(fmt.Sprintf("error with specified ebs drive size: %s, %s", ebs[1], err))
201 | 		}
202 | 		// Ebs /mnt/local:500:gp2:ext4
203 | 		// if possible, we raid-0 2 or 3 drives for better performance.
204 | 		// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
205 | 		// gp2/st1 bandwidth maxes out at 3,334GB / 12.5TB, so we RAID0 above that.
206 | 		n := 1
207 | 		if (ebs[2] == "gp2" && sz > 3400) || (ebs[2] == "st1" && sz >= 12500) {
208 | 			n = 2
209 | 		}
210 | 		if len(ebs) == 4 {
211 | 			ebsCmd[0] = fmt.Sprintf("export vid=$(batchit ebsmount -n %d -m %s -s %s -v %s -t %s)", n, ebs[0], ebs[1], ebs[2], ebs[3])
212 | 		} else {
213 | 			ebsCmd[0] = fmt.Sprintf("export vid=$(batchit ebsmount -n %d -m %s -s %s -v %s -t %s -i %s)", n, ebs[0], ebs[1], ebs[2], ebs[3], ebs[4])
214 | 		}
215 | 		// mount the ebs volume and set trap to delete and detach the volume upon exit.
216 | 		ebsCmd[1] = `echo "vid: $vid"`
217 | 		// volumes get deleted at instance termination, but this will delete when the container exits.
218 | 		// unsets the trap for exit if it was already set to avoid a loop.
219 | 		ebsCmd[2] = fmt.Sprintf(`cleanup_volume() { set +e; sig="$1"; echo "batchit: cleaning up volume $vid on signal $sig"; cd /; umount %s || umount -l %s; batchit ddv $vid; if [[ $sig != EXIT ]]; then trap - $sig EXIT; kill -s $sig $$; fi }; for sig in INT TERM EXIT; do trap "cleanup_volume $sig" $sig; done; cd %s;`, ebs[0], ebs[0], ebs[0])
220 | 	}
221 | 
222 | 	role := getRole(iam.New(sess, cfg), cli.Role)
223 | 	if role == nil {
224 | 		panic(fmt.Sprintf("role: %s not found for your account in region: %s", cli.Role, cli.Region))
225 | 	}
226 | 	b := batch.New(sess, cfg)
227 | 	tmpMnt := getTmp(cli)
228 | 
229 | 	payload := shellEncode(cli.Path)
230 | 	var commands []*string
231 | 	// prelude copied from aegea.
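	// The resulting Command array is ["/bin/bash" "-c" <eval loop> "batchit" <line> ...]:
	// with bash -c, "batchit" becomes $0 and the later elements become $@, so the eval
	// loop executes each surviving line of the prelude (and the script bootstrap) in order.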
232 | 	for _, line := range strings.Split(strings.TrimSpace(fmt.Sprintf(`
233 | /bin/bash
234 | -c
235 | for i in "$@"; do eval "$i"; done
236 | batchit
237 | set -a
238 | if [ -f /etc/default/locale ]; then source /etc/default/locale; fi
239 | set +a
240 | if [ -f /etc/profile ]; then source /etc/profile; fi
241 | set -Eeuo pipefail
242 | %s
243 | %s
244 | %s
245 | %s
246 | %s
247 | export BATCH_SCRIPT=$(mktemp)
248 | echo "$B64GZ" | base64 -d | gzip -dc > $BATCH_SCRIPT
249 | chmod +x $BATCH_SCRIPT
250 | $BATCH_SCRIPT
251 | `, cleanupDefault, ebsCmd[0], ebsCmd[1], ebsCmd[2], tmpMnt)), "\n") {
252 | 		tmp := strings.TrimSpace(line)
253 | 		if len(tmp) != 0 {
254 | 			commands = append(commands, &tmp)
255 | 		}
256 | 	}
257 | 
258 | 	if cli.S3Outputs != "" {
259 | 		cmd := fmt.Sprintf("batchit s3upload -c --region %s --nofail %s", cli.Region, strings.Join(strings.Split(cli.S3Outputs, ","), " "))
260 | 		commands = append(commands, &cmd)
261 | 	}
262 | 
263 | 	if cli.Registry == "" {
264 | 		if !strings.Contains(cli.Image, "/") {
265 | 			stsvc := sts.New(sess)
266 | 			user, err := stsvc.GetCallerIdentity(&sts.GetCallerIdentityInput{})
267 | 			if err != nil {
268 | 				panic(err)
269 | 			}
270 | 			cli.Image = fmt.Sprintf("%s.dkr.ecr.%s.amazonaws.com/%s", *user.Account, *sess.Config.Region, cli.Image)
271 | 		}
272 | 	} else {
273 | 		if cli.Registry == "hub.docker.com" || cli.Registry == "docker.com" {
274 | 			cli.Registry = "registry.hub.docker.com"
275 | 		}
276 | 		if cli.Registry == "registry.hub.docker.com" {
277 | 			if !strings.Contains(cli.Image, "/") {
278 | 				cli.Image = fmt.Sprintf("library/%s", cli.Image)
279 | 			}
280 | 		}
281 | 		cli.Image = fmt.Sprintf("%s/%s", cli.Registry, cli.Image)
282 | 	}
283 | 	var arrayProp *batch.ArrayProperties
284 | 	if cli.ArraySize != 0 {
285 | 		arrayProp = &batch.ArrayProperties{Size: aws.Int64(cli.ArraySize)}
286 | 	}
287 | 
288 | 	jdef := &batch.RegisterJobDefinitionInput{
289 | 		JobDefinitionName: &cli.JobName,
290 | 		RetryStrategy:     &batch.RetryStrategy{Attempts: aws.Int64(cli.Retries)},
291 | 		ContainerProperties: &batch.ContainerProperties{Image: &cli.Image, JobRoleArn: role.Arn,
292 | 			Memory:  aws.Int64(int64(cli.Mem)),
293 | 			Command: commands,
294 | 			Ulimits: []*batch.Ulimit{&batch.Ulimit{HardLimit: aws.Int64(40000), SoftLimit: aws.Int64(40000), Name: aws.String("nofile")}},
295 | 			Environment: []*batch.KeyValuePair{&batch.KeyValuePair{Name: aws.String("B64GZ"),
296 | 				Value: aws.String(payload)}},
297 | 			Privileged: aws.Bool(true),
298 | 			Vcpus:      aws.Int64(int64(cli.CPUs))},
299 | 		Type: aws.String("container"),
300 | 	}
301 | 	if cli.Ebs != "" {
302 | 		// see: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/using_data_volumes.html
303 | 		// without cloud-init, we must mount /dev by name. This means that the EBS volume won't get
304 | 		// cleaned up by default.
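		// Bind-mounting the host's /dev (together with Privileged above) is what lets
		// `batchit ebsmount` inside the container see the newly attached block device node.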
305 | jdef.ContainerProperties.Volumes = []*batch.Volume{ 306 | &batch.Volume{Name: aws.String("vol00"), Host: &batch.Host{SourcePath: aws.String("/dev")}}, 307 | } 308 | jdef.ContainerProperties.MountPoints = []*batch.MountPoint{&batch.MountPoint{ 309 | SourceVolume: aws.String("vol00"), 310 | ContainerPath: aws.String("/dev"), 311 | }} 312 | } 313 | if len(cli.Volumes) > 0 { 314 | for k, v := range cli.Volumes { 315 | split := strings.Split(v, "=") 316 | if len(split) != 2 { 317 | panic("expected Volumes in the form: HOST_PATH=CONTAINER_PATH") 318 | } 319 | name := fmt.Sprintf("volxx%d", k) 320 | jdef.ContainerProperties.Volumes = append(jdef.ContainerProperties.Volumes, 321 | &batch.Volume{Host: &batch.Host{SourcePath: aws.String(split[0])}, Name: aws.String(name)}) 322 | jdef.ContainerProperties.MountPoints = append(jdef.ContainerProperties.MountPoints, 323 | &batch.MountPoint{SourceVolume: aws.String(name), ContainerPath: aws.String(split[1])}) 324 | } 325 | } 326 | 327 | ro, err := b.RegisterJobDefinition(jdef) 328 | if err != nil { 329 | panic(errors.Wrap(err, "error registering job definition")) 330 | } 331 | // Ignore return value; there's not much we can do if it fails 332 | // (and we're no worse off than before.) 333 | defer deleteJobDefinition(b, ro) 334 | var deps []*batch.JobDependency 335 | for _, dep := range cli.DependsOn { 336 | deps = append(deps, &batch.JobDependency{JobId: aws.String(dep)}) 337 | } 338 | 339 | submit := &batch.SubmitJobInput{ 340 | DependsOn: deps, 341 | JobDefinition: ro.JobDefinitionName, 342 | JobName: aws.String(cli.JobName), 343 | ArrayProperties: arrayProp, 344 | JobQueue: aws.String(cli.Queue), 345 | ContainerOverrides: &batch.ContainerOverrides{ 346 | Command: commands, 347 | Environment: []*batch.KeyValuePair{ 348 | &batch.KeyValuePair{Name: aws.String("B64GZ"), 349 | Value: aws.String(payload)}, 350 | &batch.KeyValuePair{Name: aws.String("cpus"), 351 | Value: aws.String(strconv.Itoa(cli.CPUs))}, 352 | }, 353 | }, 354 | } 355 | if cli.Ebs != "" { 356 | // set TMPDIR to the EBS mount. 357 | ebs := strings.Split(cli.Ebs, ":") 358 | submit.ContainerOverrides.Environment = append(submit.ContainerOverrides.Environment, 359 | &batch.KeyValuePair{Name: aws.String("TMPDIR"), Value: aws.String(ebs[0])}) 360 | } 361 | 362 | for _, e := range cli.EnvVars { 363 | pair := strings.SplitN(e, "=", 2) 364 | if len(pair) != 2 { 365 | panic(fmt.Sprintf("expecting EnvVars of format key=value. 
got %s", e)) 366 | } 367 | submit.ContainerOverrides.Environment = append(submit.ContainerOverrides.Environment, 368 | &batch.KeyValuePair{Name: aws.String(pair[0]), Value: aws.String(pair[1])}) 369 | } 370 | 371 | resp, err := b.SubmitJob(submit) 372 | if err != nil { 373 | if resp != nil { 374 | fmt.Fprintln(os.Stderr, resp) 375 | } 376 | panic(errors.Wrap(err, "error submitting job")) 377 | } 378 | 379 | if strings.HasPrefix(cli.Path, interactivePrefix) { 380 | showConnectionInfo(b, *resp.JobId, sess, cli.Queue) 381 | } 382 | fmt.Println(*resp.JobId) 383 | } 384 | 385 | func getCluster(b *batch.Batch, q string, keyPair *string) string { 386 | 387 | qi := &batch.DescribeJobQueuesInput{JobQueues: []*string{&q}} 388 | qr, err := b.DescribeJobQueues(qi) 389 | if err != nil { 390 | log.Println(err) 391 | os.Exit(0) 392 | } 393 | if len(qr.JobQueues) > 1 { 394 | log.Println("instance info only supported for queues with a single compute env") 395 | } 396 | ce := qr.JobQueues[0].ComputeEnvironmentOrder[0].ComputeEnvironment 397 | 398 | ci := &batch.DescribeComputeEnvironmentsInput{ 399 | ComputeEnvironments: []*string{ce}, 400 | } 401 | cr, err := b.DescribeComputeEnvironments(ci) 402 | if err != nil { 403 | log.Println(err) 404 | os.Exit(0) 405 | } 406 | *keyPair = *cr.ComputeEnvironments[0].ComputeResources.Ec2KeyPair 407 | return *cr.ComputeEnvironments[0].EcsClusterArn 408 | } 409 | 410 | func showConnectionInfo(b *batch.Batch, jobid string, sess *session.Session, queue string) { 411 | log.Println("waiting for job to start to get connection info") 412 | 413 | dji := &batch.DescribeJobsInput{ 414 | Jobs: []*string{&jobid}, 415 | } 416 | for i := 0; i < 100; i++ { 417 | time.Sleep(20 * time.Second) 418 | djo, err := b.DescribeJobs(dji) 419 | if err != nil { 420 | log.Println(err) 421 | os.Exit(0) 422 | } 423 | if djo == nil { 424 | break 425 | } 426 | var j = djo.Jobs[0] 427 | if *j.Status != "RUNNING" { 428 | log.Println("job status is ", *j.Status, " waiting") 429 | continue 430 | } 431 | 432 | var ec = ecs.New(sess) 433 | var keyPair = "" 434 | var cluster = getCluster(b, queue, &keyPair) 435 | 436 | tmp := strings.Split(*j.Container.ContainerInstanceArn, "/") 437 | ei := &ecs.DescribeContainerInstancesInput{ 438 | Cluster: aws.String(cluster), 439 | ContainerInstances: []*string{&tmp[1]}, 440 | } 441 | 442 | eo, err := ec.DescribeContainerInstances(ei) 443 | if err != nil { 444 | log.Fatal(err) 445 | } 446 | 447 | instanceId := *eo.ContainerInstances[0].Ec2InstanceId 448 | ec2s := ec2.New(sess) 449 | log.Println("instance-id:", instanceId) 450 | 451 | di := &ec2.DescribeInstancesInput{InstanceIds: []*string{&instanceId}} 452 | 453 | do, err := ec2s.DescribeInstances(di) 454 | if err != nil { 455 | log.Fatal(err) 456 | } 457 | 458 | ti := &ecs.DescribeTasksInput{Cluster: aws.String(cluster), Tasks: []*string{j.Container.TaskArn}} 459 | to, err := ec.DescribeTasks(ti) 460 | if err != nil { 461 | log.Fatal(err) 462 | } 463 | 464 | if len(to.Tasks) != 1 { 465 | log.Println("couldn't find container id") 466 | } 467 | 468 | c := to.Tasks[0].Containers[0] 469 | _ = c 470 | //log.Println(to) 471 | //log.Println(j.Container) 472 | 473 | dockerCmd := fmt.Sprintf(`docker exec -it $(curl -s "http://127.0.0.1:51678/v1/tasks?taskarn=%s" | grep -oP "DockerId..\"[^\"]+" | cut -d\" -f 3) bash`, *j.Container.TaskArn) 474 | 475 | log.Printf("ssh -ti ~/.ssh/%s.pem ec2-user@%s '%s'", keyPair, *do.Reservations[0].Instances[0].PublicIpAddress, dockerCmd) 476 | //log.Println("TODO: get container from Task:", 
*j.Container.TaskArn, " https://docs.aws.amazon.com/sdk-for-go/api/service/ecs/#Task") 477 | // ssh -ti ~/.ssh/istore.pem ec2-user@34.203.245.158 'docker exec -it $(curl -s "http://127.0.0.1:51678/v1/tasks?taskarn=arn:aws:ecs:us-east-1:321620740768:task/c8fcafec-2f0b-4129-8b21-7fae81ae8be9" | grep -oP "DockerId..\"[^\"]+" | cut -d\" -f 3) bash' 478 | break 479 | /* 480 | 481 | di := &ec2.DescribeAddressesInput{ 482 | //Filters: []*ec2.Filter{ 483 | // &ec2.Filter{Name: aws.String("instance-id"), Values: []*string{&instanceId}}}, 484 | Filters: []*ec2.Filter{ 485 | { 486 | Name: aws.String("domain"), 487 | Values: aws.StringSlice([]string{"vpc"}), 488 | }, 489 | }, 490 | } 491 | do, err := ec2s.DescribeAddresses(di) 492 | if err != nil { 493 | log.Fatal(err) 494 | } 495 | log.Println(do) 496 | log.Println(*do.Addresses[0].PublicIp) 497 | */ 498 | 499 | } 500 | 501 | } 502 | 503 | func deleteJobDefinition(b *batch.Batch, jdef *batch.RegisterJobDefinitionOutput) error { 504 | jobDefToDelete := fmt.Sprintf("%s:%d", *jdef.JobDefinitionName, *jdef.Revision) 505 | input := &batch.DeregisterJobDefinitionInput{ 506 | JobDefinition: aws.String(jobDefToDelete), 507 | } 508 | _, err := b.DeregisterJobDefinition(input) 509 | return err 510 | } 511 | --------------------------------------------------------------------------------