├── docs ├── comparisons │ ├── commoncrawl │ │ ├── languages │ │ │ ├── .gitignore │ │ │ ├── python │ │ │ │ ├── cdx-00210.gz │ │ │ │ ├── run.sh │ │ │ │ ├── languages2.py │ │ │ │ └── languages.py │ │ │ ├── ray │ │ │ │ ├── cdx-00210.gz │ │ │ │ ├── run.sh │ │ │ │ └── languages2.py │ │ │ ├── super │ │ │ │ ├── cdx-00210.gz │ │ │ │ ├── histo │ │ │ │ ├── run.sh │ │ │ │ ├── lang.sh │ │ │ │ ├── plainbash.sh │ │ │ │ └── parallelbash.sh │ │ │ └── init.sh │ │ ├── webservers │ │ │ ├── ray │ │ │ │ ├── run.sh │ │ │ │ ├── CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz │ │ │ │ ├── README.md │ │ │ │ └── webservers.py │ │ │ ├── super │ │ │ │ ├── histo │ │ │ │ ├── run.sh │ │ │ │ ├── wat3 │ │ │ │ ├── CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz │ │ │ │ ├── wat2 │ │ │ │ ├── wat │ │ │ │ ├── README.md │ │ │ │ ├── parallelbash2.sh │ │ │ │ ├── parallelbash1.sh │ │ │ │ ├── plainbash2.sh │ │ │ │ ├── plainbash1.sh │ │ │ │ ├── parallelbash3a.sh │ │ │ │ ├── plainbash3.sh │ │ │ │ └── parallelbash3.sh │ │ │ ├── python │ │ │ │ ├── run.sh │ │ │ │ ├── CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz │ │ │ │ ├── README.md │ │ │ │ └── webservers.py │ │ │ ├── .gitignore │ │ │ └── init.sh │ │ ├── wordcount │ │ │ ├── ray │ │ │ │ ├── run.sh │ │ │ │ ├── CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz │ │ │ │ ├── wordcount.py │ │ │ │ ├── README.md │ │ │ │ ├── wordcount2.py │ │ │ │ ├── wordcount2-with-filtering.py │ │ │ │ ├── wordcount2-with-filtering-defaultdict.py │ │ │ │ └── wordcount2-with-filtering-and-minio.py │ │ │ ├── super │ │ │ │ ├── histo │ │ │ │ ├── run.sh │ │ │ │ ├── CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz │ │ │ │ ├── plainbash.sh │ │ │ │ ├── parallelbash.sh │ │ │ │ └── README.md │ │ │ ├── python │ │ │ │ ├── run.sh │ │ │ │ ├── CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz │ │ │ │ ├── wordcount3.py │ │ │ │ ├── wordcount2.py │ │ │ │ ├── wordcount1.py │ │ │ │ └── README.md │ │ │ ├── .gitignore │ │ │ └── README.md │ │ └── README.md │ ├── util │ │ ├── histo │ │ ├── timeit.sh │ │ └── histo.cc │ └── README.md ├── super.png ├── commands │ ├── super-up.gif │ ├── super-run.md │ └── super-up.md ├── examples │ ├── images │ │ ├── runvis1.png │ │ ├── runvis2.png │ │ ├── runvis3.png │ │ ├── runvis4.png │ │ ├── runvis5.png │ │ └── runvis6.png │ ├── README.md │ ├── example5.md │ ├── example3.md │ ├── example4.md │ ├── example1.md │ ├── example6.md │ └── example2.md ├── blogs │ ├── 2-Super-CommonCrawl │ │ ├── wet.png │ │ ├── two-stage-tally.png │ │ ├── commoncrawl-comparo.png │ │ └── README.md │ ├── 1-Super-Overview │ │ ├── fork-join.png │ │ ├── pipelines.png │ │ ├── super-architecture.png │ │ ├── super-cp-5-with-progress.gif │ │ ├── super-lscpu-100-with-progress.gif │ │ ├── super-lscpu-100-sequence-diagram.png │ │ └── README.md │ └── backup.md └── tutorial │ └── basics │ ├── commoncrawling.gif │ ├── super-cloudbin.md │ ├── super-browse.md │ ├── super-cos.md │ ├── super-every.md │ ├── README.md │ └── super-parallelism.md ├── Casks └── super.rb ├── .gitignore └── README.md /docs/comparisons/commoncrawl/languages/.gitignore: -------------------------------------------------------------------------------- 1 | /cdx-00210.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/python/cdx-00210.gz: -------------------------------------------------------------------------------- 1 | ../cdx-00210.gz -------------------------------------------------------------------------------- 
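A note on the data-file entries above and below: each per-implementation copy of `cdx-00210.gz` (and, in the other studies, of the WAT/WET files) is a one-line relative path such as `../cdx-00210.gz`, i.e. a symlink back to a single shared download produced by that study's `init.sh`. A minimal sketch of how that layout could be recreated by hand (assuming a POSIX shell and network access; this loop is illustrative, not part of the repository):

```sh
cd docs/comparisons/commoncrawl/languages
./init.sh                                      # fetches the shared cdx-00210.gz once
for impl in python ray super; do
  ln -sf ../cdx-00210.gz "$impl/cdx-00210.gz"  # every implementation reads the same input file
done
```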
/docs/comparisons/commoncrawl/languages/ray/cdx-00210.gz: -------------------------------------------------------------------------------- 1 | ../cdx-00210.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/ray/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/cdx-00210.gz: -------------------------------------------------------------------------------- 1 | ../cdx-00210.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/histo: -------------------------------------------------------------------------------- 1 | ../../../util/histo -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/ray/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/histo: -------------------------------------------------------------------------------- 1 | ../../../util/histo -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/histo: -------------------------------------------------------------------------------- 1 | ../../../util/histo -------------------------------------------------------------------------------- /docs/super.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/super.png -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/python/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/python/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/run.sh: -------------------------------------------------------------------------------- 1 | ../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/run.sh: -------------------------------------------------------------------------------- 1 | 
../../../util/timeit.sh -------------------------------------------------------------------------------- /docs/commands/super-up.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/commands/super-up.gif -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/lang.sh: -------------------------------------------------------------------------------- 1 | awk -F '"languages": "' '{ print $2 }' 2 | -------------------------------------------------------------------------------- /docs/comparisons/util/histo: -------------------------------------------------------------------------------- 1 | awk '{n[$1]++} END {for (x in n) print n[x], x}' 2 | # | sort -k1 -n -r 3 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/.gitignore: -------------------------------------------------------------------------------- 1 | /CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz -------------------------------------------------------------------------------- /docs/examples/images/runvis1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis1.png -------------------------------------------------------------------------------- /docs/examples/images/runvis2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis2.png -------------------------------------------------------------------------------- /docs/examples/images/runvis3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis3.png -------------------------------------------------------------------------------- /docs/examples/images/runvis4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis4.png -------------------------------------------------------------------------------- /docs/examples/images/runvis5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis5.png -------------------------------------------------------------------------------- /docs/examples/images/runvis6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/examples/images/runvis6.png -------------------------------------------------------------------------------- /docs/blogs/2-Super-CommonCrawl/wet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/2-Super-CommonCrawl/wet.png -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/wat3: -------------------------------------------------------------------------------- 1 | awk -F '"languages": "' '{ print substr($2, 1, length($2) - 2) }' 2 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/.gitignore: 
-------------------------------------------------------------------------------- 1 | /CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz -------------------------------------------------------------------------------- /docs/tutorial/basics/commoncrawling.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/tutorial/basics/commoncrawling.gif -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/fork-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/fork-join.png -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/pipelines.png -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/super-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/super-architecture.png -------------------------------------------------------------------------------- /docs/blogs/2-Super-CommonCrawl/two-stage-tally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/2-Super-CommonCrawl/two-stage-tally.png -------------------------------------------------------------------------------- /docs/blogs/2-Super-CommonCrawl/commoncrawl-comparo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/2-Super-CommonCrawl/commoncrawl-comparo.png -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/super-cp-5-with-progress.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/super-cp-5-with-progress.gif -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/ray/CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/python/CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/super-lscpu-100-with-progress.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/super-lscpu-100-with-progress.gif -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/super-lscpu-100-sequence-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/super/HEAD/docs/blogs/1-Super-Overview/super-lscpu-100-sequence-diagram.png -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz: -------------------------------------------------------------------------------- 1 | ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/README.md: -------------------------------------------------------------------------------- 1 | # wordcount Comparative Study 2 | 3 | - [python](python) Plain sequential Python implementation 4 | - [ray](ray) Ray Python implementation 5 | - [super](super) Bash-based implementation 6 | -------------------------------------------------------------------------------- /docs/comparisons/README.md: -------------------------------------------------------------------------------- 1 | # Super Comparative Analysis 2 | 3 | This directory will hold the source artifacts for comparative analyses 4 | of Super versus other approaches. 5 | 6 | - [commoncrawl](commoncrawl) Three classification studies against 7 | [CommonCrawl](https://commoncrawl.org) data. 8 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/wat2: -------------------------------------------------------------------------------- 1 | awk -v M=$1 -F"[,:}]" '{ 2 | for(i=1; i<=NF; i++) { 3 | if($i~/^[{ ]?\042'$1'\042/) { 4 | # if($i == "\042'$1'\042" || $i == "{\042'$1'\042" || $i == " \042'$1'\042") { 5 | print $(i+1) 6 | break 7 | } 8 | } 9 | }' 10 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | F=cdx-00210.gz 4 | 5 | SCRIPTDIR=$(cd $(dirname "$0") && pwd) 6 | cd "$SCRIPTDIR" 7 | 8 | if [ ! 
-f $F ]; then 9 | curl -L https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2021-10/indexes/$F | gunzip -c - | gzip -c > $F 10 | fi 11 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | F=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 4 | 5 | SCRIPTDIR=$(cd $(dirname "$0") && pwd) 6 | cd "$SCRIPTDIR" 7 | 8 | if [ ! -f $F ]; then 9 | curl -L https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2021-10/segments/1614178369553.75/wat/$F | gunzip -c - | gzip -c > $F 10 | fi 11 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/wat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | while getopts "r:" OPTION; do 4 | case "$1" in 5 | -r) 6 | break 7 | ;; 8 | esac 9 | shift 10 | done 11 | 12 | if [ "$OPTARG" ]; then 13 | # e.g. grep -Eo '"Server":"[^"]+"' } | sed -E -e s/"Server":|"//g 14 | grep -Eo '"'$OPTARG'":"[^"]+"' | sed -E -e 's/"'$OPTARG'":|"//g' 15 | fi 16 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/plainbash.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | KEY=languages 4 | 5 | IN=cdx-00210.gz 6 | #IN=yo.gz 7 | 8 | # grep -Eo "\"$KEY\": \"[^\"]+\"" | grep -Eo ' "[^"]+"' | 9 | # grep -Eo ' \{.+\}$' | jq .languages | 10 | cat $IN | \ 11 | gunzip -c - | \ 12 | ./lang.sh | \ 13 | tr -d ' "}' | \ 14 | tr ',' '\012' | \ 15 | ./histo | \ 16 | sort -n -r | \ 17 | head 18 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/python/README.md: -------------------------------------------------------------------------------- 1 | # Plain Python Web Server Classification 2 | 3 | All implementations read in a local compressed WAT file. 4 | 5 | - [**webservers.py**](webservers.py) Uses Python nested loops to classify by serving web server 6 | 7 | TODO: Surely can we do filtering in Python in a loop-free way? 8 | 9 | ## Usage 10 | 11 | ```sh 12 | ../init.sh 13 | ./run.sh ./webservers.py 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/ray/README.md: -------------------------------------------------------------------------------- 1 | # Ray Python Web Server Classification 2 | 3 | All implementations read in a local compressed WAT file. 4 | 5 | - [**webservers.py**](webservers.py) Uses Ray and Python nested loops to classify by serving web server 6 | 7 | TODO: Surely can we do filtering in Python in a loop-free way? 8 | 9 | ## Usage 10 | 11 | ```sh 12 | ../init.sh 13 | ./run.sh ./webservers.py 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/README.md: -------------------------------------------------------------------------------- 1 | # Super/Bash Web Server Classification 2 | 3 | All implementations read in a local compressed WAT file. 
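The WAT records of interest are the lines beginning with `{"Container"`, each carrying its metadata as one JSON object; "projection" in the variants listed below simply means pulling the `Server` response header out of that line. As a quick illustration (the sample record here is made up; the `grep`/`sed` stage is the one used by plainbash2.sh):

```sh
echo '{"Container":{},"Envelope":{"Payload-Metadata":{"HTTP-Response-Metadata":{"Headers":{"Server":"nginx/1.18.0"}}}}}' \
  | grep -Eo '"Server":"[^"]+"' | sed -E -e 's/"Server":|"//g'
# prints: nginx/1.18.0
```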
4 | 5 | - [**plainbash1**](plainbash1.sh) Uses jq for projection (inefficient) 6 | - [**plainbash2**](plainbash2.sh) Uses grep for projection 7 | - [**plainbash3**](plainbash3.sh) Uses awk for projection 8 | 9 | ## Usage 10 | 11 | ```sh 12 | ../init.sh 13 | ./run.sh ./plainbash1.sh 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/plainbash.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | IN=CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz 4 | 5 | cat $IN | \ 6 | gunzip -c - | \ 7 | grep --binary-files=text -Ev '^WARC|Content-' | \ 8 | tr ' ' '\012' | \ 9 | grep --binary-files=text ..... | \ 10 | grep --binary-files=text -v '[^a-zA-Z]' | \ 11 | tr [:upper:] [:lower:] | \ 12 | ./histo | \ 13 | sort -n -r | \ 14 | head 15 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/parallelbash2.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 3 | 4 | cat $IN | \ 5 | unpigz -c - | \ 6 | parallel -j 150% --pipe --linebuffer --block 10M grep \'^{\"Container\' \| grep -Eo \'\"Server\":\"[^\"]+\"\' \| sed -E -e \'s/\"Server\":\|\"//g\' \| \ 7 | grep -o \'^[^/-]\\+\' \| \ 8 | tr [:upper:] [:lower:] | \ 9 | ../histo | \ 10 | ../histo2 | \ 11 | sort -n -r | \ 12 | head 13 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/parallelbash.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | IN=CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz 4 | 5 | cat $IN | \ 6 | gunzip -c - | \ 7 | parallel -j 150% --pipe --linebuffer --block 10M grep --binary-files=text -Ev \'^WARC\|Content-\' \| ./wordsplit \| grep --binary-files=text ..... 
\| grep --binary-files=text -v \'[^a-zA-Z]\' \| tr [:upper:] [:lower:] \| ./histo | \ 8 | ./histo2 | \ 9 | sort -n -r | \ 10 | head 11 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/parallelbash1.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 3 | 4 | cat CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz | \ 5 | gunzip -c - | \ 6 | grep -F \"Envelope | \ 7 | grep -F \"Server | \ 8 | parallel --pipe --linebuffer --block 10M -q jq -r '.Envelope."Payload-Metadata"."HTTP-Response-Metadata"."Headers".Server' | \ 9 | grep -o '^[^/]\+' | \ 10 | tr [:upper:] [:lower:] | \ 11 | ../a.out | \ 12 | head 13 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/plainbash2.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | IN=CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz 4 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 5 | IN=yo.wat.gz 6 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 7 | 8 | cat $IN | \ 9 | unpigz -c - | \ 10 | grep '^{\"Container' | \ 11 | grep -Eo '"Server":"[^"]+"' | sed -E -e 's/"Server":|"//g' | \ 12 | grep -o '^[^/-]\+' | \ 13 | tr [:upper:] [:lower:] | \ 14 | ./histo | \ 15 | sort -n -r | \ 16 | head 17 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/super/parallelbash.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | KEY=languages 4 | 5 | IN=cdx-00210.gz 6 | #IN=yo.gz 7 | 8 | #export PATH=/home/nickm/rust-parallel/parallel-master/target/x86_64-unknown-linux-musl/release:$PATH 9 | 10 | cat $IN | \ 11 | gunzip -c - | \ 12 | parallel -j 150% --pipe --linebuffer --block 20M grep -F \'\"languages\":\' \| \ 13 | ./lang.sh \| \ 14 | tr -d \' \"}\' \| \ 15 | tr -d \' \"\' \| \ 16 | tr \',\' \'\\012\' \| \ 17 | ../histo | \ 18 | ../histo2 | \ 19 | sort -n -r | \ 20 | head 21 | 22 | -------------------------------------------------------------------------------- /docs/comparisons/util/timeit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script requires GNU date. On macOS you may `brew install 4 | # coreutils`, which provides `gdate`. This script assumes you have 5 | # done so. 6 | 7 | if [[ $(uname) = Darwin ]]; then 8 | DATE=gdate 9 | else 10 | # this also assumes that you have installed coredutils for linux 11 | # and other platforms; it's just that, on macOS, the installed 12 | # utility is called `gdate`. 
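    # (Specifically, GNU date is needed for the %N nanosecond format used in the
    # timing lines below; the BSD date shipped with stock macOS does not support %N.)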
13 | DATE=/usr/bin/date 14 | fi 15 | 16 | ts=$($DATE +%s%N) 17 | $@ 18 | tt=$((($($DATE +%s%N) - $ts)/1000000)) 19 | echo $tt 20 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/wordcount3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | from io import BufferedReader 7 | 8 | file = r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz" 9 | with gzip.open(file, "r") as gz, BufferedReader(gz, buffer_size=40000096) as f: 10 | wordcounts = Counter(f.read().split()) 11 | 12 | #most_frequent_words = heapq.nlargest( 13 | # 10, wordcounts, key=wordcounts.get) 14 | for word in wordcounts.most_common(10): 15 | print(" ", word, wordcounts[word]) 16 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/plainbash1.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | IN=CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz 4 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 5 | IN=yo.wat.gz 6 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 7 | 8 | cat $IN | \ 9 | gunzip -c - | \ 10 | grep -F \"Envelope | \ 11 | grep -F \"Server | \ 12 | jq -r '.Envelope."Payload-Metadata"."HTTP-Response-Metadata"."Headers".Server' | \ 13 | grep -o '^[^/]\+' | \ 14 | tr '[:upper:]' '[:lower:]' | \ 15 | ./histo | \ 16 | head 17 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/parallelbash3a.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | # sed -En 's/.*"Server":"?([^,"]*)"?.*/\1/p' 4 | # grep -zoP '"Server":"\K[^"]+' | \ 5 | # sed -n 's|.*"Server":"\([^"]*\)".*|\1|p' | \ 6 | 7 | KEY=Server 8 | IN=CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz 9 | IN=yo.wat 10 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 11 | 12 | cat $IN | \ 13 | grep -F '"Server' | \ 14 | parallel -j ${1-6} --pipe --linebuffer --block 10M ./wat2 $KEY \| \ 15 | tr -d \'\"\' \| \ 16 | tr [:upper:] [:lower:] | \ 17 | ../a.out | \ 18 | head 19 | 20 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/super/README.md: -------------------------------------------------------------------------------- 1 | # Bash wordcount 2 | 3 | All implementations read in a local compressed WET file. 4 | 5 | - [**plainbash.sh**](plainbash) filters out trivial words 6 | - [**parallelbash.sh**](parallelbash.sh) same, but uses GNU Parallel 7 | 8 | ## Usage 9 | 10 | ```sh 11 | if [ ! -f ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz ]; then (cd .. 
&& curl -LO https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz); fi 12 | ./run.sh ./plainbash.sh 13 | ./run.sh ./parallelbash.sh 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ray 4 | import glob 5 | import gzip 6 | import heapq 7 | from collections import Counter 8 | 9 | ray.init() 10 | 11 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as f: 12 | it = ( 13 | ray.util.iter.from_items(f, num_shards=4) 14 | .for_each(lambda line: Counter(line.split())) 15 | .batch(1024) 16 | ) 17 | 18 | wordcounts = Counter() 19 | for counts in it.gather_async(): 20 | for count in counts: 21 | wordcounts.update(count) 22 | 23 | for word in wordcounts.most_common(10): 24 | print(" ", word, wordcounts[word]) 25 | 26 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/plainbash3.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | # sed -En 's/.*"Server":"?([^,"]*)"?.*/\1/p' 4 | # grep -zoP '"Server":"\K[^"]+' | \ 5 | # sed -n 's|.*"Server":"\([^"]*\)".*|\1|p' | \ 6 | 7 | KEY=Server 8 | 9 | IN=CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz 10 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 11 | IN=yo.wat.gz 12 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 13 | 14 | cat $IN | \ 15 | unpigz -c - | \ 16 | grep '^{\"Container' | \ 17 | ./wat2 $KEY | \ 18 | tr -d '"' | \ 19 | grep -o '^[^/-]\+' | \ 20 | tr [:upper:] [:lower:] | \ 21 | ./histo | \ 22 | sort -n -r | \ 23 | head 24 | 25 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/wordcount2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | #import io 7 | 8 | wordcounts = Counter() 9 | file = r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz" 10 | #with gzip.open(file, "r") as gz, io.BufferedReader(gz, buffer_size=40000096) as f: 11 | with gzip.open(file, "r") as f: 12 | for line in f: 13 | wordcounts.update(line.split()) 14 | 15 | # wordcount = Counter(file.read().split()) 16 | #most_frequent_words = heapq.nlargest( 17 | # 10, wordcounts, key=wordcounts.get) 18 | #for word in most_frequent_words: 19 | for word in wordcounts.most_common(10): 20 | print(" ", word, wordcounts[word]) 21 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/super/parallelbash3.sh: -------------------------------------------------------------------------------- 1 | export LC_ALL=C 2 | 3 | # sed -En 's/.*"Server":"?([^,"]*)"?.*/\1/p' 4 | # grep -zoP '"Server":"\K[^"]+' | \ 5 | # sed -n 's|.*"Server":"\([^"]*\)".*|\1|p' | \ 6 | 7 | KEY=Server 8 | IN=CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz 9 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 10 | IN=yo.wat.gz 11 | IN=CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz 12 | 13 | cat $IN | \ 14 | unpigz -c - | \ 15 | parallel 
-j 150% --pipe --linebuffer --block 10M grep \'^{\"Container\' \| \ 16 | ./wat2 $KEY \| \ 17 | tr -d \'\"\' \| \ 18 | grep -o \'^[^/-]\\+\' \| \ 19 | tr [:upper:] [:lower:] \| \ 20 | ../a.out | \ 21 | ../histo2 | \ 22 | sort -n -r | \ 23 | head 24 | 25 | -------------------------------------------------------------------------------- /docs/examples/README.md: -------------------------------------------------------------------------------- 1 | # What Kinds of Pipelines Can Super Run? 2 | 3 | Click on an image for more detail on that use case. 4 | 5 | |Example|Visually| 6 | |-------|--------| 7 | |Fixed fan-out
Output to stdout|[](./example1.md)| 8 | |Fan-out across S3 input files
Output to console|[](./example3.md)| 9 | |Fan-out across S3 input files
Output to S3|[](./example2.md)| 10 | |Fan-out across S3 input files
Post-process via streaming join|[](./example4.md)| 11 | |Super supports dropping in custom binaries|[](./example5.md)| 12 | |Periodic pipeline execution|[](./example6.md)| 13 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/python/languages2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | import json 8 | 9 | file = r"cdx-00210.gz" 10 | 11 | language_counts = Counter() 12 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f: 13 | for line in f: 14 | text = line.decode("utf8") 15 | try: 16 | idx = text.rindex('"languages"') 17 | if idx >= 0: 18 | idx2 = idx + 14 19 | idx3 = text.index('"', idx2) 20 | languages = text[idx2:idx3] 21 | language_counts.update(Counter(languages.split(","))) 22 | except ValueError as e: 23 | continue 24 | 25 | 26 | for word in language_counts.most_common(10): 27 | print(" ", word) 28 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/README.md: -------------------------------------------------------------------------------- 1 | # Python-Ray-Super Comparative Study against CommonCrawl 2 | 3 | These directories contain the **source scripts**, so that others may 4 | replicate the findings summarized 5 | [here](https://github.com/starpit/super/tree/comparisons/docs/blogs/2-Super-CommonCrawl#performance-comparisons). There 6 | are three separate sub-studies, all against 7 | [CommonCrawl](https://commoncrawl.org) data: 8 | 9 | - [**wordcount**](wordcount) classifies crawled web pages by contained 10 | words. This is a traditional word count against the WET files. 11 | 12 | - [**webservers**](webservers) classifies crawled web pages by the 13 | serving web server. This classification operates against the WAT 14 | files. 15 | 16 | - [**languages**](languages) classifies crawled web pages by supported 17 | languages. This classification operates against the CDX files. -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/wordcount1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | 8 | wordcounts = Counter() 9 | file = r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz" 10 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f: 11 | for line in f: 12 | text = line.decode("utf8") 13 | if not re.search("^WARC|Content-", text): 14 | for word in text.split(): 15 | if len(word) >= 5 and not re.search('[^a-zA-Z]', word): 16 | #wordcounts.update([word]) 17 | wordcounts[word.lower()] += 1 18 | 19 | #most_frequent_words = heapq.nlargest( 20 | # 10, wordcounts, key=wordcounts.get) 21 | #for word in most_frequent_words: 22 | for word in wordcounts.most_common(10): 23 | print(" ", word, wordcounts[word]) 24 | -------------------------------------------------------------------------------- /Casks/super.rb: -------------------------------------------------------------------------------- 1 | cask "super" do 2 | version "1.7.0" 3 | 4 | name "Super" 5 | desc "CLI for the Serverless Supercomputer" 6 | homepage "https://github.com/IBM/super" 7 | 8 | if Hardware::CPU.intel? 
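    # Intel Macs get the darwin-x64 build; Apple Silicon machines fall through to the darwin-arm64 build in the else branch.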
9 | url "https://github.com/IBM/super/releases/download/v#{version}/Super-darwin-x64.tar.bz2" 10 | sha256 "78902a5b2f81f9f657bf86c469fff1e218815bdd69d61c31f66fa22f4fc36a7e" 11 | app "Super-darwin-x64/Super.app" 12 | else 13 | url "https://github.com/IBM/super/releases/download/v#{version}/Super-darwin-arm64.tar.bz2" 14 | sha256 "ba1b090e82fb9401adabd36ed3c157421fbd67d8a21086bb742c4d5bf33e46f4" 15 | app "Super-darwin-arm64/Super.app" 16 | end 17 | 18 | livecheck do 19 | url :url 20 | strategy :git 21 | regex(/^v(\d+(?:\.\d+)*)$/) 22 | end 23 | 24 | binary "#{appdir}/Super.app/Contents/Resources/super" 25 | 26 | zap trash: "~/Library/Application\ Support/Super" 27 | end 28 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/python/languages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | import json 8 | 9 | file = r"cdx-00210.gz" 10 | 11 | language_counts = Counter() 12 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f: 13 | for line in f: 14 | text = line.decode("utf8") 15 | idx = text.index('{') 16 | if idx >= 0: 17 | try: 18 | record = json.loads(text[idx:]) 19 | if "languages" in record: 20 | languages = record["languages"] 21 | if languages: 22 | language_counts.update(Counter(languages.split(","))) 23 | except ValueError as e: 24 | continue 25 | 26 | 27 | for word in language_counts.most_common(10): 28 | print(" ", word) 29 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/python/README.md: -------------------------------------------------------------------------------- 1 | # Plain Python wordcount 2 | 3 | All implementations read in a local compressed WET file. 4 | 5 | - [**wordcount1.py**](wordcount1.py) filters out trivial words, using inefficient parsing of one line at a time in a loop 6 | - [**wordcount2.py**](wordcount2.py) no filtering, and inefficiently reading one line at a time in a loop 7 | - [**wordcount3.py**](wordcount3.py) no filtering, and more efficiently using Python loop-free parsing 8 | 9 | TODO: Surely can we do filtering in Python in a loop-free way? 10 | 11 | ## Usage 12 | 13 | ```sh 14 | if [ ! -f ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz ]; then (cd .. && curl -LO https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz); fi 15 | ./run.sh ./wordcount1.py 16 | ./run.sh ./wordcount2.py 17 | ./run.sh ./wordcount3.py 18 | ``` 19 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/README.md: -------------------------------------------------------------------------------- 1 | # Ray wordcount 2 | 3 | Unless otherwise stated, all implementations read in a local 4 | compressed WET file. 5 | 6 | - [**wordcount.py**](wordcount.py) No filtering. Uses Ray iterators. 7 | - [**wordcount2.py**](wordcount2.py) No filtering. Uses ray.put. 8 | - [**wordcount2-with-filtering.py**](wordcount2-with-filtering.py) Ibid, plus filters out trivial words. 9 | - [**wordcount2-with-filtering-defaultdict.py**](wordcount2-with-filtering-defaultdict.py) Ibid, using defaultdict instead of Counter. 
10 | - [**wordcount2-with-filtering-and-minio.py**](wordcount2-with-filtering-defaultdict.py) As with wordcount2-with-filtering, but including the fetch of a remote input file. 11 | 12 | ## Usage 13 | 14 | ```sh 15 | if [ ! -f ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz ]; then (cd .. && curl -LO https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz); fi 16 | ./run.sh ./wordcount2.py 17 | ``` 18 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/wordcount2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter, defaultdict 4 | import gzip 5 | import io 6 | import ray 7 | import pandas 8 | import datetime 9 | #file = open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet", "r", encoding="utf-8-sig") 10 | ray.init() 11 | wordcount=0 12 | @ray.remote 13 | def count_words(lines): 14 | count=0 15 | for line in lines: 16 | count = count + len(line.split()) 17 | return count 18 | 19 | begin=begin = datetime.datetime.now() 20 | bufsize = 10000000 21 | results = [] 22 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile: 23 | while True: 24 | lines = infile.readlines(bufsize) 25 | lines_id = ray.put(lines) 26 | if not lines: 27 | break 28 | results.append(count_words.remote(lines_id)) 29 | numbers=ray.get(results) 30 | for number in numbers: 31 | wordcount=wordcount+number 32 | end=datetime.datetime.now() 33 | duration=end-begin 34 | print(wordcount) 35 | print(duration) 36 | -------------------------------------------------------------------------------- /docs/examples/example5.md: -------------------------------------------------------------------------------- 1 | # Super Example 5: Injecting Cloud Binaries 2 | 3 | 4 | 5 | You may also inject custom scripts into the running jobs. You may use 6 | any Cloud bucket to store your binaries. 7 | 8 |
9 |
10 | 11 | ## Example 12 | 13 | Here, we use the convenience path `/s3/ibm/bin` that Super 14 | provides. Just for fun, we use a [here 15 | document](https://tldp.org/LDP/abs/html/here-docs.html), via `cat 16 | < /s3/ibm/tmp/dst/out-$j.txt 25 | EOF 26 | ``` 27 | 28 | ## Other Super Powers 29 | 30 | [](example1.md) 31 | [](example2.md) 32 | [](example3.md) 33 | [](example4.md) 34 | [](example6.md) 35 | -------------------------------------------------------------------------------- /docs/examples/example3.md: -------------------------------------------------------------------------------- 1 | # Super Power 3: High-performance Data Pipelines in the Cloud 2 | 3 | 4 | 5 | UNIX pipelines perform incredibly well, and let you mix and match 6 | off-the-shelf tools in flexible ways. With Super, you can leverage 7 | all of this power. Use high-performance UNIX pipes `|`, but against 8 | Cloud data and compute. 9 | 10 |
11 | 12 | ## Example 13 | 14 | Following on from [our previous `cp` example](example2.md#example), 15 | this `super run` will spawn 3 jobs and produce the word count output 16 | on your console. Here we are using off-the-shelf tools (`grep` and 17 | `wc`), but in the Cloud, to compute a partial sum of matches. 18 | 19 | ```sh 20 | super run -- 'cat /s3/ibm/tmp/*.wet.gz | gunzip -c - | grep "WARC-Type: conversion" | wc -l' 21 | [Job 1] 40711 22 | [Job 2] 40880 23 | [Job 3] 40681 24 | ``` 25 | 26 | ## Other Super Powers 27 | 28 | [](example1.md) 29 | [](example2.md) 30 | [](example4.md) 31 | [](example5.md) 32 | [](example6.md) 33 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | import ray 8 | import pandas 9 | import datetime 10 | 11 | ray.init() 12 | wordcount=0 13 | @ray.remote 14 | def count_words(lines): 15 | count=Counter() 16 | for line in lines: 17 | text = line.decode("utf8") 18 | if not re.search("^WARC|Content-", text): 19 | for word in text.split(): 20 | if len(word) >= 5 and not re.search('[^a-zA-Z]', word): 21 | count[word.lower()] += 1 22 | return count 23 | 24 | begin=begin = datetime.datetime.now() 25 | bufsize = 10000000 26 | results = [] 27 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile: 28 | while True: 29 | lines = infile.readlines(bufsize) 30 | lines_id = ray.put(lines) 31 | if not lines: 32 | break 33 | results.append(count_words.remote(lines_id)) 34 | counts=ray.get(results) 35 | 36 | wordcounts = Counter() 37 | for count in counts: 38 | wordcounts.update(count) 39 | 40 | for word in wordcounts.most_common(10): 41 | print(" ", word, wordcounts[word]) 42 | end=datetime.datetime.now() 43 | duration=end-begin 44 | 45 | print(duration) 46 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering-defaultdict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import defaultdict, Counter 4 | import gzip 5 | import re 6 | import io 7 | import ray 8 | import pandas 9 | import datetime 10 | import heapq 11 | 12 | ray.init() 13 | wordcount=0 14 | @ray.remote 15 | def count_words(lines): 16 | count=defaultdict(int) 17 | for line in lines: 18 | text = line.decode("utf8") 19 | if not re.search("^WARC|Content-", text): 20 | for word in text.split(): 21 | if len(word) >= 5 and not re.search('[^a-zA-Z]', word): 22 | count[word] += 1 23 | return count 24 | 25 | begin=begin = datetime.datetime.now() 26 | bufsize = 10000000 27 | results = [] 28 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile: 29 | while True: 30 | lines = infile.readlines(bufsize) 31 | lines_id = ray.put(lines) 32 | if not lines: 33 | break 34 | results.append(count_words.remote(lines_id)) 35 | counts=ray.get(results) 36 | 37 | wordcounts = Counter() 38 | for count in counts: 39 | wordcounts.update(count) 40 | end=datetime.datetime.now() 41 | 42 | for word in wordcounts.most_common(10): 43 | print(" ", word) 44 | duration=end-begin 45 | 46 | print(duration) 47 | 
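# Design note on this variant: the per-shard tally uses defaultdict(int), so the
# hot loop is a plain `count[word] += 1` dict increment; the shard dictionaries
# are still merged into a single Counter above, so most_common(10) can be used
# for the final top-10 report, mirroring the Counter-based variant.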
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # electron builds 2 | /dist 3 | 4 | # warning for future reference: see https://github.com/IBM/kui/issues/209 5 | # /package-lock.json 6 | 7 | # in case we have some leftover temporary build output 8 | /kui 9 | 10 | # webpack and electron staging directories 11 | kui-*-tmp 12 | 13 | packages/*/package-lock.json 14 | plugins/*/package-lock.json 15 | 16 | *~ 17 | #* 18 | dist/app 19 | dist/build 20 | dist/plugins 21 | *.log 22 | *.bak 23 | node_modules 24 | wskng.iml 25 | .pre-scanned.json 26 | app/.version 27 | app/build/webpack-stats.html 28 | app/content/**/*.js.map 29 | app/src/@kui-plugin 30 | .openwhisk-shell 31 | dump.rdb 32 | #openwhisk 33 | .idea 34 | .DS_Store 35 | app.inst 36 | .travis.yml.orig 37 | *_BASE* 38 | *_BACKUP* 39 | *_LOCAL* 40 | *_REMOTE* 41 | keys 42 | *flycheck*.ts 43 | *flycheck*.js 44 | *flycheck*.js.map 45 | /build 46 | *.bak.json 47 | 48 | # these seem to come from node-pty or xterm.js 49 | .swp 50 | 51 | # we will copy this file out of packages/kui-builder for local dev 52 | .npmrc 53 | 54 | nohup.out 55 | 56 | # tsc composite build files 57 | *.tsbuildinfo 58 | 59 | # any temporary npm packs 60 | kui-shell-*.tgz 61 | 62 | # mkclient.sh stage 63 | kui-stage 64 | 65 | # es6 compiled modules 66 | packages/*/mdist 67 | plugins/*/mdist 68 | clients/**/mdist 69 | 70 | # webpack-dev-server report 71 | report.*.json 72 | 73 | # packages/builder/dist/electron currently generates this and does not remove it 74 | /kubectl-kui -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/languages/ray/languages2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | import ray 8 | import json 9 | import datetime 10 | 11 | ray.init() 12 | @ray.remote 13 | def count_languages(lines): 14 | language_counts=Counter() 15 | for line in lines: 16 | text = line.decode("utf8") 17 | try: 18 | idx = text.rindex('"languages"') 19 | if idx >= 0: 20 | idx2 = idx + 14 21 | idx3 = text.index('"', idx2) 22 | languages = text[idx2:idx3] 23 | language_counts.update(Counter(languages.split(","))) 24 | except ValueError as e: 25 | continue 26 | return language_counts 27 | 28 | begin=begin = datetime.datetime.now() 29 | bufsize = 10000000 30 | results = [] 31 | file = r"cdx-00210.gz" 32 | #file = r"yo.gz" 33 | with gzip.open(file, "r") as infile: 34 | while True: 35 | lines = infile.readlines(bufsize) 36 | lines_id = ray.put(lines) 37 | if not lines: 38 | break 39 | results.append(count_languages.remote(lines_id)) 40 | counts=ray.get(results) 41 | 42 | language_counts = Counter() 43 | for count in counts: 44 | language_counts.update(count) 45 | end=datetime.datetime.now() 46 | 47 | for word in language_counts.most_common(10): 48 | print(" ", word) 49 | duration=end-begin 50 | 51 | print(duration) 52 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/python/webservers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | #import io 7 | import json 8 | 9 | file = 
r"CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz" 10 | file = r"CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz" 11 | file = r"yo.wat.gz" 12 | file=r"CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz" 13 | 14 | server_counts = Counter() 15 | #with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f: 16 | with gzip.open(file, "r") as f: 17 | for line in f: 18 | text = line.decode("utf8") 19 | if re.search("^{\"Container", text): 20 | try: 21 | record = json.loads(text) 22 | meta = record["Envelope"]["Payload-Metadata"] 23 | if "HTTP-Response-Metadata" in meta: 24 | response_meta = meta["HTTP-Response-Metadata"] 25 | if "Headers" in response_meta: 26 | headers = response_meta["Headers"] 27 | if "Server" in headers: 28 | server = headers["Server"] 29 | if server: 30 | server_key = re.sub('[/-].+$', '', server).lower() 31 | server_counts[server_key] = server_counts[server_key] + 1 32 | except ValueError as e: 33 | continue 34 | 35 | 36 | for word in server_counts.most_common(10): 37 | print(" ", word) 38 | -------------------------------------------------------------------------------- /docs/examples/example4.md: -------------------------------------------------------------------------------- 1 | # Super Power 4: Joining Output from Jobs 2 | 3 | 4 | 5 | You can also pipe the output of a Cloud job to a local pipeline. 6 | 7 |
8 |
9 |
10 | 11 | ## Example 12 | 13 | To generate a histogram of the CPU models running in your Cloud, note 14 | how we can do part of the computation in the Cloud, and then use a 15 | local pipeline to assemble the results: 16 | 17 | ```sh 18 | super run -p3 -- `lscpu | grep "Model name" | cut -f2 -d ":"' | sort | uniq -c 19 | ``` 20 | 21 | # Another Example 22 | 23 | Super auto-scales normal UNIX command lines 24 | 25 | Following on from [our previous `cp` example](example2.md#example), 26 | this `super run`, this pipeline uses `wc -l` to generate partial sum 27 | in the Cloud, and then uses a local `awk` to sum the partial sums 28 | generated by the 3 jobs. 29 | 30 | ```sh 31 | super run -- \ 32 | 'cat /s3/ibm/tmp/*.gz | gunzip -c - | grep "WARC-Type: conversion" | wc -l' \ 33 | | awk '{N+=$1} END {print N}' 34 | 122272 35 | ``` 36 | 37 | ## Other Super Powers 38 | 39 | [](example1.md) 40 | [](example2.md) 41 | [](example3.md) 42 | [](example5.md) 43 | [](example6.md) 44 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering-and-minio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter, defaultdict 4 | import gzip 5 | import re 6 | import io 7 | import ray 8 | import pandas 9 | import datetime 10 | from minio import Minio 11 | 12 | client = Minio("s3.amazonaws.com", "", "") 13 | 14 | ray.init() 15 | #ray.init(address='auto') 16 | 17 | wordcount=0 18 | @ray.remote(num_cpus=1) 19 | def count_words(lines): 20 | count=Counter() 21 | for line in lines: 22 | text = line.decode("utf8") 23 | if not re.search("^WARC|Content-", text): 24 | for word in text.split(): 25 | if len(word) >= 5 and not re.search('[^a-zA-Z]', word): 26 | count[word] = count[word] + 1 27 | return count 28 | 29 | begin=begin = datetime.datetime.now() 30 | bufsize = 10000000 31 | results = [] 32 | response = client.get_object('commoncrawl', 'crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz') 33 | with gzip.GzipFile(fileobj=response) as infile: 34 | while True: 35 | lines = infile.readlines(bufsize) 36 | lines_id = ray.put(lines) 37 | if not lines: 38 | break 39 | results.append(count_words.remote(lines_id)) 40 | counts=ray.get(results) 41 | 42 | wordcounts = Counter() 43 | for count in counts: 44 | wordcounts.update(count) 45 | end=datetime.datetime.now() 46 | 47 | for word in wordcounts.most_common(10): 48 | print(" ", word, wordcounts[word]) 49 | duration=end-begin 50 | 51 | print(duration) 52 | -------------------------------------------------------------------------------- /docs/comparisons/util/histo.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | int main() { 10 | using namespace std; 11 | regex wordRgx("\\w+"); 12 | map freq; 13 | std::string line; 14 | 15 | auto fp_deletor = [](std::istream* is_ptr) { 16 | if (is_ptr && is_ptr != &std::cin) { 17 | static_cast(is_ptr)->close(); 18 | delete is_ptr; 19 | std::cerr << "destroy fp.\n"; 20 | } 21 | }; 22 | std::unique_ptr is_ptr{nullptr, fp_deletor}; 23 | is_ptr.reset(&std::cin); 24 | 25 | while (std::getline(*is_ptr, line)) { 26 | auto word = line; 27 | if (word.size() > 0) { 28 | auto entry = freq.find(word); 29 | if (entry != freq.end()) { 30 | entry->second++; 31 | } 
else { 32 | freq.insert(make_pair(word, 1)); 33 | } 34 | } 35 | } 36 | 37 | for (auto iter = freq.cbegin(); iter != freq.cend(); ++iter) { 38 | printf("%5d %4s\n", iter->second, iter->first.c_str()); 39 | } 40 | 41 | /*vector> pairs; 42 | for (auto iter = freq.cbegin(); iter != freq.cend(); ++iter) { 43 | pairs.push_back(*iter); 44 | } 45 | sort(pairs.begin(), pairs.end(), [=](pair& a, pair& b) { 46 | return a.second > b.second; 47 | }); 48 | 49 | // cout << "Rank Word Frequency\n"; 50 | // cout << "==== ==== =========\n"; 51 | int rank = 1; 52 | for (auto iter = pairs.cbegin(); iter != pairs.cend() && rank <= 10; ++iter) { 53 | //printf("%2d %4s %5d\n", rank++, iter->first.c_str(), iter->second); 54 | printf("%5d %4s\n", iter->second, iter->first.c_str()); 55 | rank++; 56 | }*/ 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /docs/tutorial/basics/super-cloudbin.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) **[You are here]** 8 | - [Examples of parallel analytics against CommonCrawl data](../../blogs/2-Super-CommonCrawl#readme) 9 | - [Automating Periodic Tasks](./super-every.md#readme) 10 | 11 | # The Super Way to Inject Custom Binaries into Cloud Executions 12 | 13 | So far, we have described how to execute a UNIX pipeline using 14 | built-in tools such as `grep` and `sed`, and using `|` and `>` and `;` 15 | to sequence your computations. If you find this restriction to the use 16 | of built-ins limiting, Super has a solution: a Cloud analog to 17 | `/usr/local/bin`. 18 | 19 | Locally, this would be a directory in which you install custom 20 | binaries, such as `kubectl`, that don't ship with your operating 21 | system. We would benefit from the same, but for Cloud-based 22 | executions. 23 | 24 | At the moment, this approach is limited to the injection of scripts 25 | and binaries that self-contain their dependencies --- no `pip` or 26 | `npm` management, yet. 27 | 28 | With that restriction, you may inject custom shell scripts and 29 | Linux-amd64 binaries by: 30 | 31 | 1. Copy your custom binaries to a Cloud Object Storage filepath: 32 | 33 | ```sh 34 | super mkdir /s3/ibm/default/cloudbin 35 | super cp analyze.sh /s3/ibm/default/cloudbin 36 | ``` 37 | 38 | > **Note**: you will need to choose your own bucket name (do not use 39 | > "cloudbin"), as bucket names are global, across all users. 40 | 41 | 2. Now you may use that binary in your Cloud pipelines: 42 | 43 | ```sh 44 | super run --- gunzip -c /s3/ibm/default/mydata/*.txt.gz | /s3/ibm/default/cloudbin/analyze.sh 45 | ``` 46 | 47 | And that's it! 48 | -------------------------------------------------------------------------------- /docs/commands/super-run.md: -------------------------------------------------------------------------------- 1 | # The Super Commands: `super run` 2 | 3 | **Super** allows you to specify a UNIX command line to execute *in 4 | parallel* against a set of Cloud data. 
Super takes care of 5 | provisioning the right amount of compute, memory, and disk capacity, 6 | scheduling your jobs, granting the needed data access authority to 7 | your work, and streaming out logs --- all in one command: `super run`. 8 | 9 | Super uses containers running in IBM Cloud [Code 10 | Engine](https://www.ibm.com/cloud/code-engine) as the compute layer, 11 | and gives your jobs access to data via IBM [Cloud Object 12 | Storage](https://www.ibm.com/cloud/object-storage). 13 | 14 | # Usage 15 | 16 | ```sh 17 | super run Fires off the bash pipeline a | b | c as a cloud job 18 | 19 | To parallelize across a set of files, specify a file glob e.g. '*.txt' or 'file-{1,2,3}.txt'. Use 20 | shell > syntax to capture output to cloud storage. 21 | 22 | Use 'single quotes' around pipelines to avoid local execution of | > $ etc. 23 | 24 | Usage: 25 | super run [options] -- 'a | b | c' 26 | super run [options] -- 'a | b | c > /s3/ibm/default/myBucket' 27 | 28 | Examples: 29 | super run -p 5 -- pwd to test your setup; you should see the pwd output / repeated 5 times 30 | super run -p 20 -- 'echo $JOB_INDEX' the JOB_INDEX env. var. will be in the range 1..20 in this case 31 | super examples provides more detailed examples 32 | 33 | Options: 34 | -f Input file; use - for stdin 35 | -i Use a custom docker image default: starpit/sh:0.0.5 36 | -p Repeat a given pipeline N times default: number of glob matches 37 | -g Emit more debugging output from the cloud-based jobs 38 | -q Emit only pipeline output, e.g. for piping into other tasks 39 | --cpu, -m Cores and memory for each task default: cpu=1, memory=1024Mi 40 | --as Execute the pipeline using the settings defined in the given `` 41 | 42 | Related: 43 | super logs, super show, super browse s3 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/examples/example1.md: -------------------------------------------------------------------------------- 1 | # Super Power 1: Running a Fixed Set of Jobs 2 | 3 | 4 | 5 | The core command for scheduling a set of Cloud jobs is **`super 6 | run`**. Using the **`-p`** option to `super run`, Super can run a 7 | fixed number of UNIX command lines, in the Cloud. The output of the 8 | `N` jobs will be joined and flowed to `stdout`. 9 | 10 |
11 |
12 | 13 | ## Example 14 | 15 | ```sh 16 | super run -p3 -- printenv JOB_INDEX 17 | [Job 1] 1 18 | [Job 2] 2 19 | [Job 3] 3 20 | ``` 21 | 22 | Here we specified `-p3`, and printed out a convenience environment 23 | variable `JOB_INDEX` inside of each Cloud job. The output flows to our 24 | terminal. To help distinguish the output flowing from many concurrent 25 | jobs, the log lines emitted by a job with job index `k` are prefixed 26 | by `[Job k]`. 27 | 28 | 29 | 30 | ## Options for `super run` 31 | 32 | - **`-p`**: Use a fixed number of jobs. 33 | - **`-q`**: Do not show the `[Job k]` prefix. If you pipe the output 34 | of `super run` to other commands on your laptop, Super will 35 | automatically operate in `-q` mode. 36 | - **`-g`**: Add extra debugging information to the output of `super 37 | run`. 38 | - **`-f`**: You may provide your pipeline as a file, and via `-f -` 39 | you may do so via `stdin`. 40 | - **`-i`**: Specify a custom base Docker image. It is recommended that 41 | you extend `starpit/sh:0.0.5`. 42 | - **`-c`**, **`-m`**: Specify a combination of CPU and memory 43 | allocation requests. Only [certain 44 | combinations](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) 45 | are supported. 46 | 47 | ## Other Super Powers 48 | 49 | [](example2.md) 50 | [](example3.md) 51 | [](example4.md) 52 | [](example5.md) 53 | [](example6.md) 54 | -------------------------------------------------------------------------------- /docs/comparisons/commoncrawl/webservers/ray/webservers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import Counter 4 | import gzip 5 | import re 6 | import io 7 | import ray 8 | import json 9 | import datetime 10 | 11 | ray.init() 12 | @ray.remote 13 | def count_servers(lines): 14 | server_counts=Counter() 15 | for line in lines: 16 | text = line.decode("utf8") 17 | if re.search("^{\"Container", text): 18 | try: 19 | record = json.loads(text) 20 | meta = record["Envelope"]["Payload-Metadata"] 21 | if "HTTP-Response-Metadata" in meta: 22 | response_meta = meta["HTTP-Response-Metadata"] 23 | if "Headers" in response_meta: 24 | headers = response_meta["Headers"] 25 | if "Server" in headers: 26 | server = headers["Server"] 27 | if server: 28 | server_key = re.sub('[/-].+$', '', server).lower() 29 | server_counts[server_key] = server_counts[server_key] + 1 30 | except ValueError as e: 31 | continue 32 | return server_counts 33 | 34 | begin=begin = datetime.datetime.now() 35 | bufsize = 10000000 36 | results = [] 37 | file = r"CC-MAIN-20170116095121-00570-ip-10-171-10-70.ec2.internal.warc.wat.gz" 38 | file = r"CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz" 39 | file = r"yo.wat.gz" 40 | file=r"CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz" 41 | with gzip.open(file, "r") as infile: 42 | while True: 43 | lines = infile.readlines(bufsize) 44 | lines_id = ray.put(lines) 45 | if not lines: 46 | break 47 | results.append(count_servers.remote(lines_id)) 48 | counts=ray.get(results) 49 | 50 | server_counts = Counter() 51 | for count in counts: 52 | server_counts.update(count) 53 | end=datetime.datetime.now() 54 | 55 | for word in server_counts.most_common(10): 56 | print(" ", word) 57 | duration=end-begin 58 | 59 | print(duration) 60 | -------------------------------------------------------------------------------- /docs/tutorial/basics/super-browse.md: -------------------------------------------------------------------------------- 1 | ### Table 
Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) **[You are here]** 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](../../blogs/2-Super-CommonCrawl#readme) 9 | - [Automating Periodic Tasks](./super-every.md#readme) 10 | 11 | # Visually Browsing Cloud Object Storage via `super browse` 12 | 13 | If you wish to analyze data that is stored in the Cloud, it can be 14 | helpful first to browse your Cloud objects. This exercise can yield 15 | the full filepaths that you can plug into your `super run` pipelines, 16 | e.g. `/s3/ibm/us/south/myBucket/myFolder/myData*.gz`. 17 | 18 | 19 | 20 | Super offers two commands to help you visually browse Cloud data. The 21 | first is: 22 | 23 | ```sh 24 | super browse cc 25 | ``` 26 | 27 | When you issue this command from your terminal, a window will pop 28 | up. This popup window directs you to the 29 | [CommonCrawl](https://commoncrawl.org/) public data set, hosted on 30 | AWS. This data set has many years worth of web crawls. It can be a 31 | helpful way to get started with rapid development of Cloud-based 32 | parallel analytics against large data sets. 33 | 34 | From the popup window, you can type normal UNIX `ls` and `cd` commands 35 | to navigate the directory structure of the CommonCrawl 36 | data. Alternatively, you may use your mouse to click-navigate. The GIF 37 | to the right shows this style of navigation. 38 | 39 | You may also wish to learn the schema of your input data. From the 40 | animated gif to the right, you can see that we ultimately find a file 41 | of interest. These are large compressed files. With `super browse`, we 42 | can easily study the schema of our candidate input file. Also, we can 43 | capture the full filepath, close the window, and return to our plain 44 | terminal with that filepath in our copy buffer. 45 | 46 | The second browsing command is: 47 | 48 | ```sh 49 | super browse s3 50 | ``` 51 | 52 | This command behaves identically to the first, except that the 53 | starting point for exploration are your own Cloud objects. 54 | 55 | -------------------------------------------------------------------------------- /docs/commands/super-up.md: -------------------------------------------------------------------------------- 1 | # Using `super up` to validate your setup 2 | 3 | The super up command helps you with prerequisites 4 | 5 | Before you can employ `super run`, you must ensure that you have 6 | provisioned a set of prerequisites. The `super up` command can check 7 | these for you. The gif to the right illustrates `super up` when you 8 | are all set. If you are missing some prerequisites, you can try `super 9 | up --fix.` 10 | 11 | ## Using `super up --fix` 12 | 13 | If some of the prerequisites are not satisfied, Super has a `--fix` 14 | capability. This is still alpha, but is not destructive (it only adds, 15 | such as creating credentials for access to your Cloud Object 16 | Storage). 17 | 18 | The most common situations are that you have not targeted an IBM Cloud 19 | resource group, or that you do not have an HMAC credential for your 20 | Cloud Object Storage instance. 
The `super up --fix` command can take 21 | care of both: 22 | 23 | ```sh 24 | super up --fix 25 | ``` 26 | 27 | ### Specifying Compute and Storage Options 28 | 29 | When auto-fixing your configuration, Super attempts to make some 30 | intelligent default choices. For example, if you have only a single 31 | way to schedule jobs (e.g. you have access to a single IBM CodeEngine 32 | project), it will assume that choice, without prompt. 33 | 34 | You may override these default choices with the following additional 35 | options for `super up`, which allow you to create your own custom 36 | "profile": 37 | 38 | - `--sso`: Log on to IBM Cloud using the Single Sign-On method 39 | - `--apikey `: Log on to IBM Cloud using the apikey method, with the given key 40 | - `--resource-group `: Target the specified Cloud resource group 41 | - `--region `: Target the specified Cloud region 42 | - `--cos-instance `: Use the specified Cloud Object Storage instance for `/s3/ibm/default` 43 | - `--code-engine-project `: Use the named project 44 | 45 | > **Current Limitations**: `super up --fix` does not automatically 46 | create a Cloud Object Storage instance. Please ensure that you have a 47 | Cloud Object Storage instance. Thanks! 48 | 49 | > **Alpha Warning**: It is also likely to have some corner cases that 50 | have fallen through the cracks. If so, please let us know! We think 51 | this auto-onboarding capability is critical to a compelling story, so 52 | appreciate your help. 53 | 54 | -------------------------------------------------------------------------------- /docs/tutorial/basics/super-cos.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) **[You are here]** 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](../blogs/2-Super-CommonCrawl/README.md#readme) 9 | - [Automating Periodic Tasks](./super-every.md) 10 | 11 | # The Super Way to Interact with Cloud Object Storage 12 | 13 | > **Note**: Make sure to verify you are good to go, by first running 14 | > [`super up`](../../commands/super-up.md). 15 | 16 | You may use filepaths of the form `/s3/ibm/...` to specify folders 17 | within your Cloud Object Storage. Most normal filesystem commands will 18 | work against these filepaths: `cat`, `ls`, `head`, `mkdir`, etc. You 19 | may then use `super run` to read data from and write data to storage 20 | that persists beyond the execution of your Cloud jobs. 21 | 22 | To list all buckets in your default IBM Cloud region: 23 | 24 | ```sh 25 | super run -- ls /s3/ibm/default 26 | ``` 27 | 28 | ## Local commands for interacting with Cloud Object Storage 29 | 30 | In some cases, there is no need to execute object storage requests in 31 | the Cloud. For example, if you only wish to list objects, or create 32 | buckets and folders, you may also choose to execute these simple 33 | operations on your laptop: 34 | 35 | ```sh 36 | ❯ super mkdir /s3/ibm/default/superfun 37 | ❯ super cp /usr/share/dict/words /s3/ibm/default/superfun 38 | ❯ super ls /s3/ibm/default/superfun 39 | words 40 | ``` 41 | 42 | > **Note:** You must choose your own name for the bucket (do not use 43 | > "superfun"). 
Bucket names are global, across all users. 44 | 45 | 48 | 49 | ## Redirecting output to Cloud Object Storage 50 | 51 | You may also leverage the standard UNIX `>` operator to redirect the 52 | output of your Cloud pipelines, and have this output persisted. In 53 | this way, the data created by your Cloud pipeline will survive the 54 | execution of the Cloud job. 55 | 56 | ```sh 57 | super run -p5 -- 'echo hello > /s3/ibm/default/superfun' 58 | ``` 59 | 60 | This command runs 5 jobs in the Cloud. The output of the `echo` 61 | command will be stored in a corresponding set of 5 files within 62 | "superfun", named by the `JOB_INDEX` of each parallel task. 63 | -------------------------------------------------------------------------------- /docs/tutorial/basics/super-every.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](../../blogs/2-Super-CommonCrawl#readme) 9 | - [Automating Periodic Tasks](./super-every.md#readme) **[You are here]** 10 | 11 | # Automating Periodic Tasks with `super every` 12 | 13 | You may execute a given pipeline periodically with the help of the 14 | `super every` command. This command has mostly the same structure as 15 | `super run`, but will execute the given pipeline according to a 16 | specified schedule, rather than once. For example to execute a 17 | pipeline every 30 minutes: 18 | 19 | ```sh 20 | super every 30m --as -- gunzip /s3/ibm/default/inputData/*.gz | analyze.py > /s3/ibm/default/outputData 21 | ``` 22 | 23 | The `--as` option allows you to choose a set of configuration options 24 | under which to run the task. For example, you may want to automate one 25 | task against CodeEngine `projectA` and another against `projectB`. See 26 | [below](#profiles-as-a-way-to-specify-configuration-options) for more 27 | detail on these configuration *profiles*. 28 | 29 | You may then list your active periodic tasks via 30 | 31 | ```sh 32 | super get every 33 | ``` 34 | 35 | And may see the low-level details of a specified periodic task via 36 | 37 | ```sh 38 | super get every -o yaml 39 | ``` 40 | 41 | ## Profiles as a way to Specify Configuration Options 42 | 43 | Generally speaking, an automated task needs to run against a 44 | particular instance of Cloud Object Storage, e.g. so that `> 45 | /s3/ibm/default/...` redirects the output of your automated pipelines 46 | to the right place. 47 | 48 | Which means that when automating a pipeline, you need to express a set 49 | of choices. 
50 | 51 | ```sh 52 | super create profile \ 53 | --apikey \ 54 | --cos-instance \ 55 | [--code-engine-project ] \ 56 | [--region ] \ 57 | [--resource-group ] \ 58 | ``` 59 | 60 | You may then list your profiles via 61 | 62 | ```sh 63 | super get profile 64 | ``` 65 | 66 | And you may see the details of a specified profile via 67 | 68 | ```sh 69 | super get profile -o 70 | ``` 71 | 72 | And may delete a specified profile via 73 | 74 | ```sh 75 | super delete profile 76 | ``` 77 | -------------------------------------------------------------------------------- /docs/examples/example6.md: -------------------------------------------------------------------------------- 1 | # Super Power 6: Periodic Pipelines 2 | 3 | 4 | 5 | To automate the execution of a pipeline, so that it executes 6 | periodically in an unattended fashion, use `super every`. This command 7 | accepts all of the [options for `super 8 | run`](example1.md#super-run-options), plus two additional required 9 | parameters: one for specifying the time period between executions of 10 | the given pipeline, and a second for specifying the *profile* against 11 | which to run the automated task. 12 | 13 | When automating a pipeline, you will likely be running the pipeline 14 | against a different set of Cloud resources, compared to what you use 15 | for development. In Super, the set of choices that pertain to these 16 | Cloud resources is called a *profile*. This is the second additional 17 | parameter to `super every`, the `--as` option. 18 | 19 | ## Specifying the Time Period for Your Periodic Automation 20 | 21 | The time period between executions of a given pipeline is indicated by 22 | the first positional paramter. When specifying your period, you may 23 | use the conventional shorthands for time units. For example `super 24 | every 5m --- a | b | c` will execute that pipeline every five minutes; 25 | `4h` denotes a four-hour period, and `1d` denotes a daily periodicity, 26 | and so on. 27 | 28 | > For a full list of shorthands, consult the [parse-duration 29 | > API](https://github.com/jkroso/parse-duration#available-unit-types-are). 30 | 31 | ## Profiles: Specifying the Cloud Resources to Host the Automation 32 | 33 | 1. Create a profile: 34 | 35 | ```sh 36 | super create profile \ 37 | --apikey=<...> \ 38 | --resource-group=<...> \ 39 | --code-engine-project=<...> \ 40 | --cos-instance=<...> \ 41 | [--region=<...>] 42 | ``` 43 | 44 | You may list your profiles via `super get profile`, and see the 45 | details of a given profile via `super get profile -o 46 | yaml`. 47 | 48 | 2. Now you may bind these choices to future automations via `super 49 | every --as ` 50 | 51 | ## Example 52 | 53 | To generate a histogram of the CPU models running in your Cloud, every hour: 54 | 55 | ```sh 56 | super every 60m -p3 -- 'lscpu | grep "Model name" | cut -f2 -d ":"' | sort | uniq -c 57 | ``` 58 | 59 | To list your current periodic tasks: 60 | 61 | ```sh 62 | super every list 63 | ``` 64 | 65 | ## Other Super Powers 66 | 67 | [](example1.md) 68 | [](example2.md) 69 | [](example3.md) 70 | [](example4.md) 71 | [](example5.md) 72 | -------------------------------------------------------------------------------- /docs/examples/example2.md: -------------------------------------------------------------------------------- 1 | # Super Power 2: Auto-scaling across a set of Cloud Data 2 | 3 | 4 | 5 | Super can auto-scale to **process a set of input files in 6 | parallel**. 
All you need to do is specify a normal UNIX glob pattern 7 | over Cloud file paths. Super takes care of determining how many jobs 8 | it should fire off, and grants your jobs the least privilege they will 9 | need to access the specified data. 10 | 11 | In Super, a Cloud file path begins **`/s3`**. For example, to process 12 | a set of [CommonCrawl](https://commoncrawl.org/) data in parallel, you 13 | may point Super to a file path such as 14 | 15 | ```sh 16 | /s3/aws/commoncrawl/crawl-data/CC-MAIN-2021-21/segments/1620243992721.31/wet/*-0000{1,2,3}.warc.wet.gz 17 | ``` 18 | 19 | In this case, the glob `*-0000{1,2,3}.warc.wet.gz` will expand to 3 20 | files. A `super run` pointed to this filepath will spawn no more than 21 | 3 Cloud jobs to host the computation. No need to set up VMs nor to 22 | transfer the data to and from your laptop. 23 | 24 | > **Hints**: Try `super ls` to explore `/s3` file paths in your 25 | > terminal. Try `super browse`, which lets you explore via point and 26 | > click. 27 | 28 | ## Example 29 | 30 | It is common to need to move data from place to place within the 31 | Cloud. You may use `super run` and `cp` to do so: 32 | 33 | ```sh 34 | super run -- cp \ 35 | '/s3/aws/commoncrawl/crawl-data/CC-MAIN-2021-21/segments/1620243992721.31/wet/*-0000{1,2,3}.warc.wet.gz' \ 36 | /s3/ibm/tmp 37 | ``` 38 | 39 | Animated GIF of super copy 40 | 41 | > **Hint**: Make sure either to `'quote'` or `\`-escape any glob 42 | > characters, such as `*` or `{`, to prevent premature expansion of 43 | > the patterns. 44 | 45 | ### A Cloud /tmp 46 | 47 | You may copy your files to any `/s3/...` path. Here, we use the 48 | convenience `/s3/ibm/tmp` path that Super provides for IBM Cloud 49 | Object Storage. This serves as your `/tmp` bucket, in the Cloud. 50 | 51 | ## Directing Output back to the Cloud via `>` 52 | 53 | This `cp` example is a special case of a more general pattern of 54 | redirecting the output of a Cloud computation back to Cloud 55 | storage. In UNIX, you would use `>` to "redirect" the output of a 56 | computation to a given file path. In Super, the same holds for Cloud 57 | file paths! 58 | 59 | This example will redirect the output to N output files in the given 60 | dst bucket. Have some fun! Fill in our own ideas for the ... part of 61 | this pipeline. 62 | 63 | ```sh 64 | super mkdir /s3/ibm/tmp/dst 65 | super run -- 'gunzip -c /s3/ibm/tmp/*.gz | ... 
> /s3/ibm/tmp/dst/out-$j.txt' 66 | ``` 67 | 68 | ## Other Super Powers 69 | 70 | [](example1.md) 71 | [](example3.md) 72 | [](example4.md) 73 | [](example5.md) 74 | [](example6.md) 75 | -------------------------------------------------------------------------------- /docs/tutorial/basics/README.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) **[You are here]** 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](../../blogs/2-Super-CommonCrawl#readme) 9 | - [Automating Periodic Tasks](./super-every.md#readme) 10 | 11 | # The Super Way to Run your UNIX Commands in the Cloud 12 | 13 | **Super** allows you to run a UNIX command line *in parallel*, using 14 | auto-scaling Cloud compute resources, against a set of Cloud data. 15 | Super takes care of provisioning the right amount of compute, memory, 16 | and disk capacity, scheduling your jobs, granting the needed data 17 | access authority to your work, and streaming out logs --- all in one 18 | command: `super run`. 19 | 20 | Super uses containers running in IBM Cloud [Code 21 | Engine](https://www.ibm.com/cloud/code-engine) as the compute layer, 22 | and gives your jobs access to data via IBM [Cloud Object 23 | Storage](https://www.ibm.com/cloud/object-storage). 24 | 25 | This document demonstrates how to use `super run` via examples ranging 26 | from data preparation tasks to more complex analytics. 29 | 30 | ## Getting Started with a Few Simple Examples 31 | 32 | > **Note**: Make sure to verify you are good to go, by first running 33 | > [`super up`](../../commands/super-up.md). 34 | 35 | The command `lscpu` provides information about the CPU your on which 36 | your command executes. Using `super run`, we can easily run this in 37 | the Cloud. Behind the scenes, CPU resources will automatically be spun 38 | up and torn down, without your having to worry about anything beyond 39 | the commands you wish to execute: 40 | 41 | ```sh 42 | ❯ super run -- lscpu 43 | Architecture: ... 44 | ``` 45 | 46 | You can also link together commands into pipelines, in the normal 47 | [UNIX way](https://en.wikipedia.org/wiki/Unix_philosophy). For 48 | example, to extract the CPU model, you can pipe through 49 | [`grep`](https://en.wikipedia.org/wiki/Grep) to filter the output: 50 | 51 | ```sh 52 | ❯ super run -- 'lscpu | grep "Model name"' 53 | Model name: Intel Core Processor (Broadwell, IBRS) 54 | ``` 55 | 56 | > **Note:** Surround your command with `'single quotes'` to ensure 57 | > that the `|` is executed in the Cloud, not your laptop. 58 | 59 | ### Specifying CPU and Memory Allocations 60 | 61 | Your Cloud pipelines are given a default allocation of CPU shares and 62 | phystical memory: 1 (whole) CPU and 4G of memory. If the default 63 | settings prove insufficient for your needs, then the `--cpu` and 64 | `--memory` options can help: 65 | 66 | ```sh 67 | super run --cpu 4 --memory=8G -- lscpu 68 | ``` 69 | 70 | > Notes: Your memory allocation request must be specified in "M/G" 71 | units, not "Mi/Gi" units. Only certain cpu-to-memory ratios are 72 | supported, due to the way IBM CodeEngine is designed. 
Consult [this 73 | page](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) 74 | for a list of supported combinations. 75 | 76 | 77 | 82 | -------------------------------------------------------------------------------- /docs/blogs/backup.md: -------------------------------------------------------------------------------- 1 | ## Tasks at Hand 2 | 3 | Many tasks these days revolve around making sense of large data 4 | sets. Training models, analyzing log data, constructing business plans 5 | based on usage characteristics. These tasks may require analyzing 6 | video feeds or processing semi-structured text. A critical part of any 7 | data analysis is the preparation phase. In this blog, we will show how 8 | using **normal bash pipelines** can give you an interactive, and 9 | surprisingly highly performant [1], way to process data in the 10 | cloud. With this technique, you can bash data in the cloud, using bash 11 | pipelines that are identical to what you would write when operating 12 | against local data. 13 | 14 | Before throwing your data at the model generator, you inevitably spend 15 | a good amount of time learning the structure of the data, and then 16 | massaging it into an agreeable form. The data sets can be giant, be 17 | stored under cryptic filepaths in a variety of cloud object stores, 18 | are often compressed, and encoded in domain-specific data formats. All 19 | of these features of the data slow you down, before you can even get 20 | started with processing it. I probably need to download giant files in 21 | order to run them through some local tooling, to help me learn the 22 | data formats, and to prototype a few passes of the code. Already, the 23 | way I develop differs from the way I operate at scale. 24 | 25 | finding and excluding outliers, projecting out 26 | fields of interest, splitting or aggregating the data sets to 27 | facilitate parallel processing, running the data through various 28 | classifiers in order to facilitate downstream analytics, and so on. 29 | 30 | # Outline notes: 31 | 32 | - data is remote and encoded 33 | - leads to divergent experiences while prototyping versus at scale 34 | - data needs to be processed into forms amenable for downstream analytics 35 | - to facilitate parallelism 36 | - to accommodate format constraints of other systems, such as model generators 37 | - to filter and classify in a way that maximizes the efficacy of those systems 38 | - data is large, but many processing tasks can be easily expressed as streaming data pipelines 39 | - streaming data pipelines are easy to parallelize 40 | - they run very efficiently in terms of memory consumption and data locality 41 | - streaming pipelines express 2-3 degrees of pipeline parallelism 42 | naturally, without having to think you are writing a parallel 43 | algorithm 44 | - UNIX pipelines offer a time-tested way to write this kind of code 45 | - focus on the data: regexps, accumulators, projections, not the parallel framework 46 | - pipeline flow control has been optimized over decades 47 | - grep, awk, sed are extremely fast 48 | - lack of meta-stability: `a | b | c | d` versus `a | bc | d` is not 49 | the kind of optimization you need to worry about 50 | - anti-viral: you can mix and match operators written in… whatever 51 | - then, a few impl details... 
52 | - then, a few numbers 53 | 54 | ## How it Works 55 | 56 | ![Architecture](super-architecture.png) 57 | 58 | ## How it Performs 59 | 60 | ![Performance Comparison](commoncrawl-comparo.png) 61 | 62 | Importantly, our experiments show that this parallel efficiency is 63 | possible, even after factoring in the latencies of fetching and 64 | emitting data, and also the load imbalances across pipeline stages. 65 | 66 | Furthermore, the solutions seem to be stable. When adding a fourth 67 | stage `d`, one does not spend much thought on `a | b | cd` versus `a | 68 | b | c | d`, because we know that operating systems manage processes 69 | and FIFOs with aplomb. For example, unless trying to squeeze out a few 70 | last percentage points, any performance differences `grep foo | grep 71 | bar` versus `grep 'foo|bar'` are outweighed by the elegance of keeping 72 | stages simple and orthogonal. 73 | 74 | This core count is aligns nicely with current 75 | price/performance sweet spots of Cloud offerings. 76 | 77 | ## Idea 1: Full Stream Ahead! 78 | 79 | UNIX offers us a concise language for expressing streaming 80 | computations. The utterance `cat in | a | b | c > out` expresses how 81 | to acquire the data, where it should settle, and the stages of 82 | analysis in-between. These magic `|` and `>` operators take little 83 | space on screen, and their cognitive load is low. We have faith that, 84 | as long as those `a`, `b`, and `c` operators consume and produce 85 | streams of bits, then the operating system will do us right. 86 | 87 | To extend the approach to new data stores requires only adding new 88 | interpretations for data ingress (what does it mean to `cat` an object 89 | from your Cloud storage?) and egress (what does it mean to `>` to your 90 | Cloud storage?). 91 | 92 | As an example of the power of this "mix and match" approach to 93 | composition, we compared using `awk` as a way to histogram a stream by 94 | its first column to custom C++ code to do the same. The compiled and 95 | gcc-optimized code was, even in the best case, only a few percent 96 | faster. If this were one-off code, certainly the effort expended to 97 | write and maintain bespoke code would not outweigh the few percentage 98 | points we got in return. 99 | 100 | At the same time, this strategy gives us the option to contribute 101 | generally beneficial hand-optimizations to the community. Others can 102 | now replace their custom awk logic, and with minimal effort: 103 | e.g. instead of `a | b | awk ...` it is now `a | b | histo`. 104 | -------------------------------------------------------------------------------- /docs/tutorial/basics/super-parallelism.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](./README.md#readme) 4 | - [Interacting with Cloud Object Storage](./super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](./super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](./super-parallelism.md#readme) **[You are here]** 7 | - [Injecting Custom Binaries](./super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](../blogs/2-Super-CommonCrawl/README.md#readme) 9 | - [Automating Periodic Tasks](./super-every.md) 10 | 11 | # The Super Way to Parallelize your UNIX Pipeline 12 | 13 | > **Note**: Make sure to verify you are good to go, by first running 14 | > [`super up`](../../commands/super-up.md). 
15 | 16 | You probably desire to run many commands in parallel. Here, you have 17 | two options: 18 | 19 | 1. run a given pipeline a fixed `N` number of times: [`super run -p -- lscpu`](#parallelism-option-1-running-a-given-unix-pipeline-a-fixed-number-of-times) 20 | 2. run a given pipeline once for each of a set of input files: [`super run -- gunzip -c *.txt.gz | ...`](#parallelism-option-2-running-a-given-pipeline-once-for-each-of-a-set-of-input-files) 21 | 22 | You may also need to [collect and 23 | combine](#joining-the-results-of-parallel-job-execution) the results 24 | of your parallel tasks. 25 | 26 | ## Parallelism Option 1: Running a given UNIX Pipeline a fixed number of times 27 | 28 | If you wish to run a given UNIX pipeline a fixed number of times, use 29 | the `-p` option. For example, to execute our "Model name" pipeline 5 30 | times in parallel, in the Cloud: 31 | 32 | ```sh 33 | ❯ super run -p5 -- 'lscpu | grep "Model name"' 34 | [Job 4] Model name: Intel Core Processor (Broadwell, IBRS) 35 | [Job 3] Model name: Intel Xeon Processor (Cascadelake) 36 | [Job 1] Model name: Intel Core Processor (Broadwell, IBRS) 37 | [Job 2] Model name: Intel Core Processor (Broadwell, IBRS) 38 | [Job 5] Model name: Intel Core Processor (Broadwell, IBRS) 39 | ``` 40 | 41 | In this case, `super run` has prefixed the output with the index of 42 | each of the five parallel jobs. Note how the job indices occur 43 | out-of-order. This is because your jobs are executed in parallel. 44 | 45 | > **Note:** Here we must surround our pipeline with 'single quotes', 46 | > to ensure that the `|` is executed in the Cloud. 47 | 48 | 58 | 59 | ### The JOB_INDEX environment variable 60 | 61 | Each of the `N` parallel tasks is assigned an index in the range 62 | `1..N`. This index is materialized in the environment variable 63 | `JOB_INDEX`: 64 | ``` 65 | ❯ super run -p5 -- 'touch test$JOB_INDEX.txt; ls test*' 66 | [Job 2] test2.txt 67 | [Job 4] test4.txt 68 | [Job 3] test3.txt 69 | [Job 1] test1.txt 70 | [Job 5] test5.txt 71 | ``` 72 | 73 | > **Note:** Here again we must surround our pipeline with 'single 74 | > quotes', to prevent expansion of the `$` environment variable 75 | > reference prior to Cloud execution, and also to ensure that the `;` 76 | > and `*` are executed in the Cloud. 77 | 78 | ## Joining the Results of Parallel Job Execution 79 | 80 | You may wish to combine the outputs of your parallel jobs into a 81 | single output. For example, we may want to combine the `lscpu | grep 82 | "Model name"` output into a histogram. This pattern is useful for 83 | *classification* tasks, where you want to group the input data into a 84 | number of *classes*, and present the results as a histogram. 
85 | 86 | First, we can add `sed` to our pipeline to clean up the output a bit: 87 | 88 | ```sh 89 | ❯ super run -p 5 -- 'lscpu | grep "Model name" | sed "s/Model name:[ ]*//"' 90 | [Job 4] Intel Core Processor (Broadwell, IBRS) 91 | [Job 3] Intel Xeon Processor (Cascadelake) 92 | [Job 1] Intel Core Processor (Broadwell, IBRS) 93 | [Job 2] Intel Core Processor (Broadwell, IBRS) 94 | [Job 5] Intel Core Processor (Broadwell, IBRS) 95 | ``` 96 | 97 | Then, we can feed the output of the Cloud jobs into a local histogrammer: 98 | 99 | ```sh 100 | ❯ super run -p 5 -- 'lscpu | grep "Model name" | sed "s/Model name:[ ]*//"' | sort | uniq -c 101 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ 102 | Executed in the Cloud on your laptop 103 | 4 Intel Core Processor (Broadwell, IBRS) 104 | 1 Intel Xeon Processor (Cascadelake) 105 | ``` 106 | 107 | ## Parallelism Option 2: Running a given pipeline once for each of a set of input files 108 | 109 | If you need to apply the same analytics to a set of input files, Super 110 | can help. If you specify a set of filepaths, either by explicitly 111 | enumerating the input files, or by using 112 | [glob](https://en.wikipedia.org/wiki/Glob_(programming)) patterns, 113 | Super will automatically apply your command to each file. 114 | 115 | ```sh 116 | ❯ export DIR=/s3/aws/commoncrawl/crawl-data/CC-MAIN-2021-10/segments/1614178369553.75/wat 117 | ❯ super run -- \ 118 | "gunzip -c $DIR/CC-MAIN-20210304235759-20210305025759-0000{1,2,3,4,5}.warc.wat.gz \ 119 | | head" 120 | [Job 5] WARC/1.0 121 | [Job 5] WARC-Type: warcinfo 122 | [Job 3] WARC/1.0 123 | [Job 3] WARC-Type: warcinfo 124 | [Job 1] WARC/1.0 125 | [Job 1] WARC-Type: warcinfo 126 | [Job 2] WARC/1.0 127 | [Job 2] WARC-Type: warcinfo 128 | ``` 129 | 130 | Here, we have uncompressed and extracted the first two lines of 5 131 | files from the [CommonCrawl](https://commoncrawl.org/) public data 132 | set. 133 | 134 | The "glob" pattern `foo{1,2,3,4,5}` expands to the tuple `foo1, foo2, 135 | foo3, foo4, foo5`, and Super will parallelize across the matched 136 | files. 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #### [Installation](#installation) | [Blogs](#blogs) | [Tutorials](#tutorials) 2 | 3 | # Super: A CLI for the Serverless Supercomputer 4 | 5 | **Super** offers a zero-config and zero-code entry to your Cloud. It 6 | does so by running **normal UNIX command lines** against Cloud data, 7 | using Cloud compute. Super takes care of hooking these complex and 8 | disparate resources together under one command: [`super 9 | run`](docs/commands/super-run.md). 
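As a quick taste of what that looks like (a minimal sketch: it assumes you have already run `super up`, and `myBucket` is a placeholder for one of your own buckets):

```sh
# Count the lines across a set of compressed Cloud objects, in parallel,
# without downloading anything to your laptop.
super run -- 'gunzip -c /s3/ibm/default/myBucket/*.txt.gz | wc -l' \
  | awk '{N+=$1} END {print N}'
```

The quoted part runs as Cloud jobs, one per matched file; the final `awk` runs locally and sums the partial counts streamed back from those jobs.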
10 | 11 | :rocket: **[Take me to the Installation Instructions](#installation)** 12 | 13 | ## Blogs 14 | 15 | - [Exploring Big Data with a CLI](https://medium.com/the-graphical-terminal/exploring-big-data-with-a-cli-59af31d38756) 16 | - [Bash the Cloud](https://medium.com/cloud-computer/bash-the-cloud-3d476b7e4d7c) 17 | - [Analyzing Big Data with `grep` and `awk`](https://medium.com/cloud-computer/analyzing-big-data-with-grep-and-awk-c07d362b6ab8) 18 | - [Using IBM Cloud Code Engine to Analyze Big Data without Writing a Single Line of Code](https://medium.com/cloud-computer/using-ibm-cloud-code-engine-to-analyze-big-data-without-writing-a-single-line-of-code-12e46a24471c) 19 | 20 | 21 | 22 | ## A Super Way to Copy 23 | 24 | [](docs/examples/example2.md) 26 | 27 | For example, Super can **copy a set of files** from one place in the 28 | Cloud to another. 29 | 30 | ```sh 31 | super run -- cp /s3/ibm/default/src/foo*.txt /s3/aws/dst 32 | ``` 33 | 34 | Behind the scenes, Super spawns Cloud Compute to mediate the 35 | Cloud-to-Cloud data transfer. It uses 36 | ["glob"](https://en.wikipedia.org/wiki/Glob_(programming)) patterns to 37 | determine how many concurrent jobs to run. For example, if `foo*.txt` 38 | matches 5 files, Super spawns a set of concurrent jobs, grants each 39 | job the least privilege to access its assigned files, and more! 40 | 41 | Because Super intelligently parses your command line, it can 42 | automatically inject progress trackers. Super **tracks the progress of 43 | any job** against your Cloud data. 44 | 45 | Animated GIF of super copy 49 | 50 | Super leverges **any Kubernetes cluster** for Compute and **any S3 51 | provider** for Data. If you wish to target a very large cluster, Super 52 | integrates with [IBM Cloud Code 53 | Engine](https://www.ibm.com/cloud/code-engine). It also can hook your 54 | Compute jobs up with [IBM Cloud Object 55 | Storage](https://www.ibm.com/cloud/object-storage). The [`super 56 | up`](docs/commands/super-up.md) command gives you an easy way to 57 | leverage both. 58 | 59 | **There is no need to code to the Cloud API of the week to make any of 60 | this happen.** 61 | 62 | ## What Other Kinds of Pipelines Can Super Run? 63 | 64 | Click on an image for more detail on that use case. 65 | 66 | [](docs/examples/example1.md) 67 | [](docs/examples/example2.md) 68 | [](docs/examples/example3.md) 69 | [](docs/examples/example4.md) 70 | [](docs/examples/example5.md) 71 | [](docs/examples/example6.md) 72 | 73 | ## Installation 74 | 75 | 76 | 77 | 78 | 79 | 80 | 89 | 92 | 93 |
macOSOthers
81 | 82 | ```sh 83 | brew tap IBM/super https://github.com/IBM/super 84 | brew install super 85 | super 86 | ``` 87 | 88 | 90 | Coming soon 91 |
94 | 95 | You should now see usage information for Super, including the main 96 | sub-command: [`super run`](docs/commands/super-run.md). 97 | 98 | ## Getting Started: Using `super run` to submit pipelines to Cloud Compute 99 | 100 | Out of the box, `super run` will use **your current Kubernetes 101 | context** as the target for Compute, and will have read-only access to 102 | public S3 buckets. 103 | 104 | If this works for you, then try `super run -p5 -- echo 105 | hello`. [Above](#super-copy), we used a glob pattern to specify the 106 | Cloud jobs we needed; here, since we are not pulling in Cloud data, we 107 | instead use `-p5` to specify that we want to execute the given command 108 | line as five Cloud jobs. 109 | 110 | Super auto-scales normal UNIX command lines 111 | 112 | ### Using `super browse` to explore input Cloud Data 113 | 114 | 115 | 116 | To browse for interesting [CommonCrawl](https://commoncrawl.org/) 117 | input data, you may use [`super browse 118 | cc`](docs/tutorial/basics/super-browse.md). Super pipelines can access 119 | S3 data via a pseudo `/s3` filepath; e.g. `/s3/aws/commoncrawl` is the 120 | prefix for CommonCrawl data. 121 | 122 | ### Using `super target` to configure your choice of Cloud Compute 123 | 124 | By default, `super run` will target jobs against your currently 125 | selected Kubernetes context. You may switch contexts using standard 126 | `kubectl` commands. Strictly for convenience, Super offers `super 127 | target` to smooth the enumeration and selection of a context. In 128 | particular, if you are using IBM Cloud Code Engine, the `super target` 129 | command seamlessly integrates with CodeEngine projects. 130 | 131 | ### Using `super up` to connect to your Cloud Provider 132 | 133 | The super up command helps you with prerequisites 134 | 135 | The [`super up`](docs/commands/super-up.md) command will attempt to 136 | connect `super run` to your AWS credentials and to IBM Cloud. The 137 | latter allows `super run` to scale to a large Kubernetes cluster with 138 | hundreds of nodes, via [IBM Cloud Code 139 | Engine](https://www.ibm.com/cloud/code-engine); `super up` can also 140 | connect you to your [IBM Cloud Object 141 | Storage](https://www.ibm.com/cloud/object-storage) instances. 142 | 143 | ## Tutorials 144 | 145 | - [Getting to Know Super](docs/tutorial/basics/#readme) 146 | -------------------------------------------------------------------------------- /docs/blogs/1-Super-Overview/README.md: -------------------------------------------------------------------------------- 1 | # Bash the Cloud 2 | 3 | Cloud is the anti-UNIX. It is a world where nothing really is a file, 4 | and computation must be allocated, planned, and utilized by careful 5 | employment of a hundred miserable CLIs and APIs. 6 | 7 | That is not to say we are anti-Cloud. We love that resources can be 8 | pooled, and that the costs of idling and management can be amortized 9 | across many and disparate consumers. The Cloud truly can be a modern 10 | day operating system, interpreting our desires and managing hardware 11 | with precision and transparency. 12 | 13 | Animated GIF of simple example 14 | 15 | What would ["the UNIX 16 | way"](https://en.wikipedia.org/wiki/Unix_philosophy) be for a Cloud 17 | computer? This animated GIF captures our desire: UNIX pipelines, but 18 | against Cloud data and compute resources. We have implemented this 19 | approach in a tool called Super. 
You can download it now: 20 | [https://supe.run](https://supe.run) 21 | 22 | ## UNIX Pipelines in the Cloud 23 | 24 | We believe that a rich subset of Cloud tasks are amenable to a 25 | lightweight and bash-like approach to analyzing data. 26 | 27 | When one crafts a UNIX pipeline for local operations, the focus is at 28 | a pretty high level: on the data. We are concerned with where it 29 | originates (so we can `cat` the right set of filepaths), where it must 30 | eventually settle (so we can `>` redirect to the proper destination 31 | filepath), and how the schema of the data can be manipulated with 32 | existing off-the-shelf tools. APIs are largely a secondary concern. 33 | 34 | We desire a Cloud CLI where typing `cat /s3/data.txt.gz | gunzip -c - | classify.py | sort | uniq -c > /s3/out.txt` 35 | frees us from the burdens of allocating and scheduling resources, 36 | keeping data and computation close, acquiring and propagating data 37 | authorization to the compute engines, flow control and chunking and 38 | caching, of when to scale up versus scale out --- that all of this, 39 | and the APIs needed to direct each of these disparate concerns, is 40 | hidden behind that simple utterance. 41 | 42 | 43 | 44 | With some careful thought put into the tooling story, we have found 45 | that this is indeed possible. We can *compile* high-level utterances 46 | into executions that have both scale-up and scale-out parallelism, 47 | without coding to a parallel API such as 48 | [Spark](https://spark.apache.org/) or [Ray](https://ray.io/). 49 | 50 | Surprisingly, we have found that the resulting executions also have 51 | high computational density. By always streaming data and leveraging 52 | fast C-coded utilities, the MiB/sec/core of such pipelines can often 53 | exceed most other approaches. This knock-on effect has been [observed 54 | by 55 | others](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html). 56 | 57 | ### Example: Classify the processors in the Cloud 58 | 59 | The animated GIF shown at the start of this blog (the one with the 60 | progress bar) illustrates a pipeline that classifies the CPU types of 61 | a Cloud provider. This example fires off a command line for execution 62 | in the Cloud, and presents the result on the console; it includes both 63 | fork/join parallelism and (a bit of) pipeline parallelism: 64 | 65 | ```sh 66 | ❯ super -p100 -- 'lscpu | grep "Model name" | cut -f 2 -d ":"' | sort | uniq -c 67 | 91 Intel Core Processor (Broadwell, IBRS) 68 | 9 Intel Xeon Processor (Cascadelake) 69 | ``` 70 | 71 | 72 | 73 | Our approach interprets this to mean: "fork off 100 pipelines and 74 | join the results into a histogram via `sort | uniq`. 75 | 76 | Users of this approach will need some way to express which portions of 77 | the pipeline are done in the Cloud (i.e. in the fork), and which are 78 | done on our laptops (after the join). In this example, the 'quoted 79 | part' is forked and executed in the Cloud, and the streaming output of 80 | those jobs is fed into a local pipeline (`| sort | uniq -c`). The 81 | final output is presented on the user's console, as per usual with 82 | UNIX pipelines. [GNU Parallel](https://www.gnu.org/software/parallel/) 83 | adopts a similar syntactic ploy. 84 | 85 | ### Example: Globbing the Cloud 86 | 87 | When operating on Cloud data, we can make the syntax more 88 | transparent. 
By leveraging the 89 | ["globbing"](https://en.wikipedia.org/wiki/Glob_(programming)) 90 | capability of UNIX shells, the set of matched files can represent an 91 | implicit fork. For example, `cp /s3/src/*.txt.gz /s3/dst` has a glob 92 | over the source files. This command line can be interpreted to mean 93 | "fork a copy job for every matched source file". Even simple copy 94 | tasks may benefit greatly from taking place entirely in the 95 | Cloud. Doing so avoids downloading and reuploading the 96 | data. Furthermore, Cloud providers often do not charge for in-Cloud 97 | data access. 98 | 99 | Animated GIF of super copy 100 | 101 | This GIF shows the transparent use Cloud compute resources to mediate 102 | copies between Cloud providers (from IBM to AWS, in this case). In 103 | the next blog, we will analyze [CommonCrawl](https://commoncrawl.org/) 104 | data using this globbing approach. We will also go into more detail on 105 | what this `/s3` filepath means. 106 | 107 | ### Big Idea 1: Bash helps us to be Optimal, without Optimizing 108 | 109 | Optimization and parallelization are hard, unforgiving jobs. A bash 110 | pipeline, in contrast, benefits from data prefetching and pipeline 111 | parallelism, without any extra coding. We have found that common 112 | pipelines against CommonCrawl data have a net concurrency of 2-3; 113 | i.e. these pipelines, when executed as Kubernetes jobs, utilize an 114 | average 2-3 processor cores, all without any explicit parallelism. For 115 | more details, stay tuned for our next blog. 116 | 117 | ### Big Idea 2: Bash is Anti-Viral 118 | 119 | If everything is a stream, then any utility, whether C, Python, Perl, 120 | or Bash --- as long as it accepts and then produces a stream --- can 121 | participate in a streaming pipeline. This allows us to leverage UNIX 122 | standards such as `grep` and `sed` and `awk`, which are versatile and 123 | perform amazingly well. They are also backed by a large corpus of 124 | examples and guidance on StackOverflow and the like. 125 | 126 | ### Big Idea 3: Bash has a simple Introspection Model 127 | 128 | It is easy to splice in debugging functionality at any point in a 129 | pipeline. For example, one can insert `tee` or tools like the 130 | [pipeline viewer](http://www.ivarch.com/programs/pv.shtml) `pv` where 131 | needed: `gunzip -c input.txt.gz | pv 132 | | ...` tells you the rate of decompression with only a minor syntactic change. This spliced pipeline has the same output, and has nearly indistinguishable performance. 133 | 134 | Better yet, this extra functionality is also anti-viral; a single `pv` 135 | utility works with any pipeline. There is no need to find the 136 | Go/NodeJS/Python variant of this functionality, code to its API, find 137 | a way to disable it when in production, etc. 138 | 139 | ## Further Reading 140 | 141 | 1. Download Super: [https://supe.run](https://supe.run) 142 | 2. ["Analyzing CommonCrawl Data with `grep` and 143 | `awk`"](../2-Super-Examples/#readme), which will go into greater 144 | detail with three examples that classify large data sets from 145 | CommonCrawl. 146 | 3. ["Browsing CommonCrawl with 147 | Ease"](https://medium.com/the-graphical-terminal/exploring-big-data-with-a-cli-59af31d38756), 148 | which will describe a tool that allows you to quickly browse all 149 | of CommonCrawl, using a shell in your browser. 
150 | -------------------------------------------------------------------------------- /docs/blogs/2-Super-CommonCrawl/README.md: -------------------------------------------------------------------------------- 1 | ### Table Of Contents 2 | 3 | - [Running UNIX Commands in the Cloud](../../tutorial/README.md#readme) 4 | - [Interacting with Cloud Object Storage](../../tutorial/super-cos.md#readme) 5 | - [Visually Browsing Cloud Object Storage](../../tutorial/super-browse.md#readme) 6 | - [Parallelizing your UNIX Pipeline](../../tutorial/super-parallelism.md#readme) 7 | - [Injecting Custom Binaries](../../tutorial/super-cloudbin.md#readme) 8 | - [Examples of parallel analytics against CommonCrawl data](./2-Super-CommonCrawl/README.md#readme) **[You are here]** 9 | - [Automating Periodic Tasks](../../tutorial/basics/super-every.md#readme) 10 | 11 | # Analyzing CommonCrawl Data with `grep` and `awk` 12 | 13 | The Cloud has many features of a large, distributed, and very hard to 14 | use computer. What would a simpler ["UNIX 15 | way"](https://en.wikipedia.org/wiki/Unix_philosophy) be for such a 16 | Cloud computer? We have been experimenting with adopting UNIX 17 | pipelines, an approach that has proven itself, over decades, as a 18 | viable way to handle large amounts of data. This approach was designed 19 | precisely for the situation where input size greatly exceeds available 20 | local memory. 21 | 22 | With UNIX pipelines, I can stream through data with `cat`, direct 23 | output flow with `>`, and pipe stages through standard utilities, 24 | written in any language. In our [previous 25 | blog](../1-Super-Overview/#readme), we introduced "Bash the Cloud", a 26 | scheme for compiling a bash command line into a Cloud execution. 27 | 28 | In this blog, we will show how to analyze a very large Cloud-based 29 | data set using this approach. We will use 30 | [CommonCrawl](https://commoncrawl.org/) as our corpus, and give 31 | example pipelines that **classify** web pages in three ways: 32 | 33 | 1. [the canonical word count](#example-1-word-count) 34 | 2. [classifying pages by hosting web server](#example-2-web-server-classification) 35 | 3. [classifying pages by language](#example-3-language-classification) 36 | 37 | For each, we will compare the performance of this bash approach to 38 | standalone Python code and to [Ray](https://ray.io/). 39 | 40 | ## The CommonCrawl Data Sets 41 | 42 | [CommonCrawl](https://commoncrawl.org/) is an archive of a decade's 43 | worth of web sites from around the world. The archive is stored in 44 | Amazon S3, and is available for public consumption. CommonCrawl scans 45 | the web bi-yearly. Each scan has been preprocessed into four formats, 46 | with the data for each format sharded into several hundred chunks. Our 47 | three examples operate against three of these four formats: WET files, 48 | WAT files, and CDX files. 49 | 50 | ### Mounting the Cloud on `/s3` 51 | 52 | When a filepath begins with `/s3`, we consider it to be hosted in an 53 | [S3](https://en.wikipedia.org/wiki/Amazon_S3) bucket; 54 | e.g. `/s3/aws/commoncrawl` contains the CommonCrawl data sets. In UNIX 55 | parlance, we must ensure that the Cloud storage relevant to your needs 56 | has been [*mounted*](https://en.wikipedia.org/wiki/Mount_(computing)), 57 | with proper authorization, on `/s3/`. This means one should 58 | be able to issue `ls`, `cd`, `cat`, `head`, and other UNIX 59 | file-oriented commands against those s3 filepaths. 60 | 61 | ### How do we know what input files to use? 
62 | 63 | To code an analysis pipeline requires candidate input files and some 64 | knowledge of their schema. Everything we do to the data depends on 65 | this: what we grep for to filter out irrelevant records, how we 66 | project out the needed fields, and so on. 67 | 68 | In our next blog, we will demonstrate a Cloud data discovery tool that 69 | we find particularly powerful. For now, we will assume this step has 70 | been done, and assert a set of useful `/s3` CommonCrawl filepaths. 71 | 72 | ## Using UNIX pipelines for Two-Stage Parallel Classification 73 | 74 | Our examples in this blog perform web page 75 | [*classification*](https://en.wikipedia.org/wiki/Data_classification_(business_intelligence)). Each 76 | analysis assigns a web page to one or more buckets, which can then be 77 | tallied up as the analysis runs. For example, the web server 78 | classification example maps each page to one bucket (identified by the 79 | name of the hosting web server). 80 | 81 | two-stage classification 82 | 83 | In order to scale the analysis across many nodes, each node can 84 | analyze one segment of the crawl; CommonCrawl breaks up the crawl 85 | results into segment files. The classification thus proceeds in an 86 | [embarrassingly 87 | parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) 88 | fashion across the segments. 89 | 90 | As nodes complete their individual tallies, this first stage of output 91 | can be streamed into a second stage tally that combines the individual 92 | tallies by vector addition. 93 | 94 | To implement the tallies, one could write a one-liner 95 | [`awk`](https://en.wikipedia.org/wiki/AWK) script, write custom Python 96 | code, use UNIX [`sort`](https://en.wikipedia.org/wiki/Sort_(Unix)) and 97 | [`uniq`](https://en.wikipedia.org/wiki/Uniq), or whatever other 98 | methods suit your fancy. The same goes for the other stages of your 99 | analysis pipeline. Below, we will make some choices that suit *our* 100 | fancies, but there is nothing sacrosact in these choices. 101 | 102 | ## Example 1: Word Count 103 | 104 | To classify the web by words used, we can analyze the CommonCrawl 105 | **WET** format files. This information houses the extracted text from 106 | each web page, and is stored in a collection of gzip-compressed files 107 | (these are the CommonCrawl "segment" files). The file format includes 108 | a mix of metadata lines and lines with the extracted text. 109 | 110 | If we filter out the metadata lines, then split the remaining 111 | text-only lines so that each line contains a single word, we now have 112 | set of classes: each line represents a classification (by words used) 113 | of the web. 114 | 115 | We can use UNIX utilities to do all of this. To read and uncompress 116 | the WET files, we can use `cat | gunzip`. To filter out irrelevant 117 | lines, `grep -Ev '^WARC|Content-'`. To have each line contain only one 118 | word, `tr ' ' '\012'` will replace whitespace with the octal 119 | representation for an ASCII newline. We can then pipe this into `sort 120 | | uniq -c` which serve, if not optimally at least competently, to tally up the word classes: 121 | 122 | 123 | 124 | ```sh 125 | cat /s3/.../segment1.wet.gz | \ 126 | gunzip -c - | \ 127 | grep -Ev '^WARC|Content-' | \ 128 | tr ' ' '\012' | \ 129 | sort | uniq -c' 130 | ``` 131 | 132 | The rest boils down to heuristics to eliminate uninteresting 133 | words. For example, most short words can be pretty safely excluded. 134 | This can be done via e.g. 

## Example 2: Web server classification

To classify the web by the web server that hosted the page, we can analyze the CommonCrawl **WAT** format files. WAT files, like the WET files, have a mix of two types of lines. The lines we care about are those that consist of a JSON-formatted string. This JSON structure stores extracted metadata for a given web page, such as the HTTP response headers from the crawl. We can pull out the `Server` response header field from these JSON lines.

Just as with the WET example, a straightforward pipeline of `grep` and `tr` can do most of the work we need. To filter down to the JSON lines, `grep '^{"Container'`. Next, if performance is not a major concern, one could pipe these lines to `jq` to project out the field of interest; if you are willing to invest a bit more effort, a combination of `grep` and `sed` achieves the same effect, roughly twice as fast: `grep -Eo '"Server":"[^"]+"' | sed -E -e 's/"Server":|"//g'`. The `grep` extracts the server field, both key and value, and the `sed` strips the key and the surrounding quotes.

We find this approach attractive: off-the-shelf tools make the job quick and easy, and later, when you are ready to invest the effort, substantial gains are possible.

You may also want to canonicalize the server name in various ways; e.g. by case-normalizing with `tr` as above, and by collapsing the version variants of each server (you may want "apache 2.x" and "apache 2.y" to belong to the same class). As always, straightforward use of standard UNIX utilities can get you there; but you can write your logic in any language you please. As long as it accepts a stream as input and produces a stream as output, it can be plugged into the two-stage classification scheme.

## Example 3: Language classification

To classify the web by the languages to which the pages have been translated, we can analyze the CommonCrawl CDX files. Each line in these files represents a web page, and contains a mix of metadata and a JSON fragment; the latter has a field that lists the languages to which this web page has been translated.

The rest follows the same approach as the prior two examples. For example, you may use `awk -F '"languages": "' '{ print $2 }'` to approximately extract the languages field of the JSON structure, without even parsing it fully. The languages field is a comma-separated list (not a JSON array); you may use `tr , '\012'` to split the languages so that there is one language per line, just as we did above for words.
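
As with the word count, these fragments compose into a single pipeline. The sketch below is ours, not a recipe from the text above: the input path is elided as before, and the `NF > 1` guard and the second `awk` (which trims everything after the closing quote of the languages value) are small additions for robustness.

```sh
# Tally languages for one CDX segment. CDX files are gzip-compressed text,
# one record per line, each record ending in a JSON fragment.
cat /s3/.../cdx-00210.gz | gunzip -c - \
  | awk -F '"languages": "' 'NF > 1 { print $2 }' \
  | awk -F '"' '{ print $1 }' \
  | tr , '\012' \
  | sort | uniq -c | sort -rn
```

From here, the stage-2 merge shown for the word count applies unchanged.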

## Performance Comparisons

Surprisingly to us, this approach not only gives you the flexibility of mixing and matching utilities written in whichever language you prefer, it also performs quite well.

The figure below summarizes the megabytes processed per second for the three examples, comparing our UNIX pipeline approach (the series labeled "Bash", in cyan-blue) to Ray (in yellow) and plain Python (in white). These experiments analyze a single file, so that we could tease out the computational density of each approach.

![CommonCrawl performance comparisons](commoncrawl-comparo.png)

For each, we ran the code on 1, 2, 4, and 8 core VMs. Observe that our Bash approach scales well from 1 to 2 cores, and, for the word count example, the scaling continues up to 3-4 cores. In other words, our approach extracts parallelism, even when analyzing a single file, without any explicit parallel programming. All of the scale-up comes from the inherent pipeline parallelism: UNIX runs the `cat`, `gunzip`, and `grep` stages as concurrent processes, and its flow control paces the pipeline at the speed of the slowest stage.

## Further Reading

1. Download Super: [https://supe.run](https://supe.run)
2. ["Bash the Cloud"](../1-Super-Overview/#readme), which introduces the technique of using UNIX pipelines to analyze large datasets in the Cloud.
3. ["Browsing CommonCrawl with Ease"](https://medium.com/the-graphical-terminal/exploring-big-data-with-a-cli-59af31d38756), which describes a tool that lets you quickly browse all of CommonCrawl, using a shell in your browser.
--------------------------------------------------------------------------------