](./example1.md)|
8 | |Fan-out across S3 input files|[example3](./example3.md)|
9 | |Fan-out across S3 input files|[example2](./example2.md)|
10 | |Fan-out across S3 input files|[example4](./example4.md)|
11 | |Super supports dropping in custom binaries|[example5](./example5.md)|
12 | |Periodic pipeline execution|[example6](./example6.md)|
13 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/languages/python/languages2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import io
6 |
7 | file = r"cdx-00210.gz"
8 |
9 | # Scan each CDX line for the "languages" field and tally the
10 | # comma-separated language codes, without a full JSON parse.
11 | language_counts = Counter()
12 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f:
13 |     for line in f:
14 |         text = line.decode("utf8")
15 |         try:
16 |             # skip past '"languages": "' to the start of the value
17 |             idx = text.rindex('"languages"') + 14
18 |             end = text.index('"', idx)
19 |             language_counts.update(text[idx:end].split(","))
20 |         except ValueError:
21 |             # no "languages" field on this line
22 |             continue
23 |
24 | for language in language_counts.most_common(10):
25 |     print(" ", language)
26 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/README.md:
--------------------------------------------------------------------------------
1 | # Python-Ray-Super Comparative Study against CommonCrawl
2 |
3 | These directories contain the **source scripts**, so that others may
4 | replicate the findings summarized
5 | [here](https://github.com/starpit/super/tree/comparisons/docs/blogs/2-Super-CommonCrawl#performance-comparisons). There
6 | are three separate sub-studies, all against
7 | [CommonCrawl](https://commoncrawl.org) data:
8 |
9 | - [**wordcount**](wordcount) classifies crawled web pages by contained
10 | words. This is a traditional word count against the WET files.
11 |
12 | - [**webservers**](webservers) classifies crawled web pages by the
13 | serving web server. This classification operates against the WAT
14 | files.
15 |
16 | - [**languages**](languages) classifies crawled web pages by supported
17 | languages. This classification operates against the CDX files; a sketch of one CDX line follows.
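18 |
19 | For reference, each line of a Common Crawl CDX index pairs a SURT key
20 | and timestamp with a JSON record; the `languages` field, when present,
21 | holds a comma-separated list of language codes. As a minimal sketch
22 | (the sample line below is illustrative, not taken from a real index):
23 |
24 | ```python
25 | import json
26 |
27 | # An illustrative CDX line: key, timestamp, then a JSON record.
28 | line = 'com,example)/ 20210305012345 {"url": "http://example.com/", "languages": "eng,fra"}'
29 |
30 | record = json.loads(line[line.index("{"):])
31 | print(record.get("languages", "").split(","))  # -> ['eng', 'fra']
32 | ```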
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/python/wordcount1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import re
6 | import io
7 |
8 | wordcounts = Counter()
9 | file = r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz"
10 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f:
11 |     for line in f:
12 |         text = line.decode("utf8")
13 |         # skip WARC and HTTP header lines
14 |         if not re.search("^WARC|Content-", text):
15 |             for word in text.split():
16 |                 # keep only purely alphabetic words of length >= 5
17 |                 if len(word) >= 5 and not re.search('[^a-zA-Z]', word):
18 |                     wordcounts[word.lower()] += 1
19 |
20 | for word, count in wordcounts.most_common(10):
21 |     print(" ", word, count)
22 |
--------------------------------------------------------------------------------
/Casks/super.rb:
--------------------------------------------------------------------------------
1 | cask "super" do
2 | version "1.7.0"
3 |
4 | name "Super"
5 | desc "CLI for the Serverless Supercomputer"
6 | homepage "https://github.com/IBM/super"
7 |
8 | if Hardware::CPU.intel?
9 | url "https://github.com/IBM/super/releases/download/v#{version}/Super-darwin-x64.tar.bz2"
10 | sha256 "78902a5b2f81f9f657bf86c469fff1e218815bdd69d61c31f66fa22f4fc36a7e"
11 | app "Super-darwin-x64/Super.app"
12 | else
13 | url "https://github.com/IBM/super/releases/download/v#{version}/Super-darwin-arm64.tar.bz2"
14 | sha256 "ba1b090e82fb9401adabd36ed3c157421fbd67d8a21086bb742c4d5bf33e46f4"
15 | app "Super-darwin-arm64/Super.app"
16 | end
17 |
18 | livecheck do
19 | url :url
20 | strategy :git
21 | regex(/^v(\d+(?:\.\d+)*)$/)
22 | end
23 |
24 | binary "#{appdir}/Super.app/Contents/Resources/super"
25 |
26 | zap trash: "~/Library/Application Support/Super"
27 | end
28 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/languages/python/languages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import io
6 | import json
7 |
8 | file = r"cdx-00210.gz"
9 |
10 | # Parse the JSON blob on each CDX line and tally the
11 | # comma-separated language codes.
12 | language_counts = Counter()
13 | with gzip.open(file, "r") as gz, io.BufferedReader(gz) as f:
14 |     for line in f:
15 |         text = line.decode("utf8")
16 |         idx = text.find('{')  # find() returns -1 rather than raising
17 |         if idx >= 0:
18 |             try:
19 |                 record = json.loads(text[idx:])
20 |                 languages = record.get("languages")
21 |                 if languages:
22 |                     language_counts.update(languages.split(","))
23 |             except ValueError:
24 |                 continue
25 |
26 | for language in language_counts.most_common(10):
27 |     print(" ", language)
28 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/python/README.md:
--------------------------------------------------------------------------------
1 | # Plain Python wordcount
2 |
3 | All implementations read in a local compressed WET file.
4 |
5 | - [**wordcount1.py**](wordcount1.py) filters out trivial words, inefficiently parsing one line at a time in a loop
6 | - [**wordcount2.py**](wordcount2.py) no filtering, inefficiently reading one line at a time in a loop
7 | - [**wordcount3.py**](wordcount3.py) no filtering, more efficiently using loop-free parsing
8 |
9 | TODO: Surely we can do filtering in Python in a loop-free way? One possible approach is sketched at the end of this file.
10 |
11 | ## Usage
12 |
13 | ```sh
14 | if [ ! -f ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz ]; then (cd .. && curl -LO https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz); fi
15 | ./run.sh ./wordcount1.py
16 | ./run.sh ./wordcount2.py
17 | ./run.sh ./wordcount3.py
18 | ```
19 |
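20 | ## A possible loop-free filtering approach
21 |
22 | A minimal, untested sketch: let a single regex pass tokenize and
23 | filter, and let `Counter` do the counting. (Header lines such as
24 | `WARC-...` and `Content-...` would still contribute matches unless
25 | stripped first, so this is an approximation of wordcount1.py.)
26 |
27 | ```python
28 | from collections import Counter
29 | import gzip
30 | import re
31 |
32 | file = "../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz"
33 | # reads the whole decompressed file into memory
34 | with gzip.open(file, "rt", encoding="utf8") as f:
35 |     text = f.read()
36 |
37 | # one pass: purely alphabetic words of length >= 5
38 | words = re.findall(r"\b[a-zA-Z]{5,}\b", text)
39 | wordcounts = Counter(w.lower() for w in words)
40 |
41 | for word, count in wordcounts.most_common(10):
42 |     print(" ", word, count)
43 | ```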
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/ray/README.md:
--------------------------------------------------------------------------------
1 | # Ray wordcount
2 |
3 | Unless otherwise stated, all implementations read in a local
4 | compressed WET file.
5 |
6 | - [**wordcount.py**](wordcount.py) No filtering. Uses Ray iterators.
7 | - [**wordcount2.py**](wordcount2.py) No filtering. Uses ray.put.
8 | - [**wordcount2-with-filtering.py**](wordcount2-with-filtering.py) Ibid, plus filters out trivial words.
9 | - [**wordcount2-with-filtering-defaultdict.py**](wordcount2-with-filtering-defaultdict.py) Ibid, using defaultdict instead of Counter.
10 | - [**wordcount2-with-filtering-and-minio.py**](wordcount2-with-filtering-and-minio.py) As with wordcount2-with-filtering, but including the fetch of a remote input file.
11 |
12 | ## Usage
13 |
14 | ```sh
15 | if [ ! -f ../CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz ]; then (cd .. && curl -LO https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz); fi
16 | ./run.sh ./wordcount2.py
17 | ```
18 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/ray/wordcount2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import gzip
4 | import ray
5 | import datetime
6 |
7 | ray.init()
8 |
9 | @ray.remote
10 | def count_words(lines):
11 |     count = 0
12 |     for line in lines:
13 |         count += len(line.split())
14 |     return count
15 |
16 | begin = datetime.datetime.now()
17 | bufsize = 10000000
18 | results = []
19 | # read the input in ~10MB chunks of lines, shipping each chunk
20 | # to a remote task via the object store
21 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile:
22 |     while True:
23 |         lines = infile.readlines(bufsize)
24 |         if not lines:
25 |             break
26 |         lines_id = ray.put(lines)
27 |         results.append(count_words.remote(lines_id))
28 |
29 | wordcount = sum(ray.get(results))
30 | duration = datetime.datetime.now() - begin
31 | print(wordcount)
32 | print(duration)
33 |
--------------------------------------------------------------------------------
/docs/examples/example5.md:
--------------------------------------------------------------------------------
1 | # Super Example 5: Injecting Cloud Binaries
2 |
3 |
4 |
5 | You may also inject custom scripts and binaries into the running
6 | jobs, using any Cloud bucket to store them.
7 |
8 |
30 | [example1](example1.md)
31 | [example2](example2.md)
32 | [example3](example3.md)
33 | [example4](example4.md)
34 | [example6](example6.md)
35 |
--------------------------------------------------------------------------------
/docs/examples/example3.md:
--------------------------------------------------------------------------------
1 | # Super Power 3: High-performance Data Pipelines in the Cloud
2 |
3 |
4 |
5 | UNIX pipelines perform incredibly well, and let you mix and match
6 | off-the-shelf tools in flexible ways. With Super, you can leverage
7 | all of this power. Use high-performance UNIX pipes `|`, but against
8 | Cloud data and compute.
9 |
10 |
28 | [example1](example1.md)
29 | [example2](example2.md)
30 | [example4](example4.md)
31 | [example5](example5.md)
32 | [example6](example6.md)
33 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import re
6 | import ray
7 | import datetime
8 |
9 | ray.init()
10 |
11 | @ray.remote
12 | def count_words(lines):
13 |     count = Counter()
14 |     for line in lines:
15 |         text = line.decode("utf8")
16 |         # skip WARC and HTTP header lines
17 |         if not re.search("^WARC|Content-", text):
18 |             for word in text.split():
19 |                 # keep only purely alphabetic words of length >= 5
20 |                 if len(word) >= 5 and not re.search('[^a-zA-Z]', word):
21 |                     count[word.lower()] += 1
22 |     return count
23 |
24 | begin = datetime.datetime.now()
25 | bufsize = 10000000
26 | results = []
27 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile:
28 |     while True:
29 |         lines = infile.readlines(bufsize)
30 |         if not lines:
31 |             break
32 |         # ship each ~10MB chunk of lines to a remote task
33 |         lines_id = ray.put(lines)
34 |         results.append(count_words.remote(lines_id))
35 |
36 | # merge the per-chunk counters
37 | wordcounts = Counter()
38 | for count in ray.get(results):
39 |     wordcounts.update(count)
40 |
41 | for word, count in wordcounts.most_common(10):
42 |     print(" ", word, count)
43 |
44 | duration = datetime.datetime.now() - begin
45 | print(duration)
46 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering-defaultdict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import defaultdict, Counter
4 | import gzip
5 | import re
6 | import ray
7 | import datetime
8 |
9 | ray.init()
10 |
11 | @ray.remote
12 | def count_words(lines):
13 |     # use a plain defaultdict rather than a Counter in the workers
14 |     count = defaultdict(int)
15 |     for line in lines:
16 |         text = line.decode("utf8")
17 |         # skip WARC and HTTP header lines
18 |         if not re.search("^WARC|Content-", text):
19 |             for word in text.split():
20 |                 if len(word) >= 5 and not re.search('[^a-zA-Z]', word):
21 |                     count[word] += 1
22 |     return count
23 |
24 | begin = datetime.datetime.now()
25 | bufsize = 10000000
26 | results = []
27 | with gzip.open(r"CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz", "r") as infile:
28 |     while True:
29 |         lines = infile.readlines(bufsize)
30 |         if not lines:
31 |             break
32 |         lines_id = ray.put(lines)
33 |         results.append(count_words.remote(lines_id))
34 |
35 | # merge the per-chunk dicts
36 | wordcounts = Counter()
37 | for count in ray.get(results):
38 |     wordcounts.update(count)
39 | end = datetime.datetime.now()
40 |
41 | for word in wordcounts.most_common(10):
42 |     print(" ", word)
43 |
44 | print(end - begin)
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # electron builds
2 | /dist
3 |
4 | # warning for future reference: see https://github.com/IBM/kui/issues/209
5 | # /package-lock.json
6 |
7 | # in case we have some leftover temporary build output
8 | /kui
9 |
10 | # webpack and electron staging directories
11 | kui-*-tmp
12 |
13 | packages/*/package-lock.json
14 | plugins/*/package-lock.json
15 |
16 | *~
17 | #*
18 | dist/app
19 | dist/build
20 | dist/plugins
21 | *.log
22 | *.bak
23 | node_modules
24 | wskng.iml
25 | .pre-scanned.json
26 | app/.version
27 | app/build/webpack-stats.html
28 | app/content/**/*.js.map
29 | app/src/@kui-plugin
30 | .openwhisk-shell
31 | dump.rdb
32 | #openwhisk
33 | .idea
34 | .DS_Store
35 | app.inst
36 | .travis.yml.orig
37 | *_BASE*
38 | *_BACKUP*
39 | *_LOCAL*
40 | *_REMOTE*
41 | keys
42 | *flycheck*.ts
43 | *flycheck*.js
44 | *flycheck*.js.map
45 | /build
46 | *.bak.json
47 |
48 | # these seem to come from node-pty or xterm.js
49 | .swp
50 |
51 | # we will copy this file out of packages/kui-builder for local dev
52 | .npmrc
53 |
54 | nohup.out
55 |
56 | # tsc composite build files
57 | *.tsbuildinfo
58 |
59 | # any temporary npm packs
60 | kui-shell-*.tgz
61 |
62 | # mkclient.sh stage
63 | kui-stage
64 |
65 | # es6 compiled modules
66 | packages/*/mdist
67 | plugins/*/mdist
68 | clients/**/mdist
69 |
70 | # webpack-dev-server report
71 | report.*.json
72 |
73 | # packages/builder/dist/electron currently generates this and does not remove it
74 | /kubectl-kui
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/languages/ray/languages2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import ray
6 | import datetime
7 |
8 | ray.init()
9 |
10 | @ray.remote
11 | def count_languages(lines):
12 |     language_counts = Counter()
13 |     for line in lines:
14 |         text = line.decode("utf8")
15 |         try:
16 |             # skip past '"languages": "' to the start of the value
17 |             idx = text.rindex('"languages"') + 14
18 |             end = text.index('"', idx)
19 |             language_counts.update(text[idx:end].split(","))
20 |         except ValueError:
21 |             # no "languages" field on this line
22 |             continue
23 |     return language_counts
24 |
25 | begin = datetime.datetime.now()
26 | bufsize = 10000000
27 | results = []
28 | file = r"cdx-00210.gz"
29 | with gzip.open(file, "r") as infile:
30 |     while True:
31 |         lines = infile.readlines(bufsize)
32 |         if not lines:
33 |             break
34 |         lines_id = ray.put(lines)
35 |         results.append(count_languages.remote(lines_id))
36 |
37 | # merge the per-chunk counters
38 | language_counts = Counter()
39 | for count in ray.get(results):
40 |     language_counts.update(count)
41 | end = datetime.datetime.now()
42 |
43 | for language in language_counts.most_common(10):
44 |     print(" ", language)
45 |
46 | print(end - begin)
47 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/webservers/python/webservers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import re
6 | import json
7 |
8 | file = r"CC-MAIN-20210304235759-20210305025759-00616.warc.wat.gz"
9 |
10 | server_counts = Counter()
11 | with gzip.open(file, "r") as f:
12 |     for line in f:
13 |         text = line.decode("utf8")
14 |         # only the metadata records are JSON objects starting with {"Container
15 |         if re.search(r'^{"Container', text):
16 |             try:
17 |                 record = json.loads(text)
18 |                 meta = record["Envelope"]["Payload-Metadata"]
19 |                 if "HTTP-Response-Metadata" in meta:
20 |                     headers = meta["HTTP-Response-Metadata"].get("Headers", {})
21 |                     server = headers.get("Server")
22 |                     if server:
23 |                         # normalize, e.g. "nginx/1.18.0" -> "nginx"
24 |                         server_key = re.sub('[/-].+$', '', server).lower()
25 |                         server_counts[server_key] += 1
26 |             except ValueError:
27 |                 continue
28 |
29 | for server in server_counts.most_common(10):
30 |     print(" ", server)
31 |
--------------------------------------------------------------------------------
/docs/examples/example4.md:
--------------------------------------------------------------------------------
1 | # Super Power 4: Joining Output from Jobs
2 |
3 |
4 |
5 | You can also pipe the output of a Cloud job to a local pipeline.
6 |
7 |
24 |
25 | Following on from [our previous `cp` example](example2.md#example),
26 | this `super run` pipeline uses `wc -l` to generate partial sums
27 | in the Cloud, and then uses a local `awk` to add up the partial sums
28 | generated by the 3 jobs.
29 |
30 | ```sh
31 | super run -- \
32 | 'cat /s3/ibm/tmp/*.gz | gunzip -c - | grep "WARC-Type: conversion" | wc -l' \
33 | | awk '{N+=$1} END {print N}'
34 | 122272
35 | ```
36 |
37 | ## Other Super Powers
38 |
39 | [example1](example1.md)
40 | [example2](example2.md)
41 | [example3](example3.md)
42 | [example5](example5.md)
43 | [example6](example6.md)
44 |
--------------------------------------------------------------------------------
/docs/comparisons/commoncrawl/wordcount/ray/wordcount2-with-filtering-and-minio.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from collections import Counter
4 | import gzip
5 | import re
6 | import ray
7 | import datetime
8 | from minio import Minio
9 |
10 | # public bucket; empty credentials for anonymous access
11 | client = Minio("s3.amazonaws.com", "", "")
12 |
13 | ray.init()
14 | #ray.init(address='auto')
15 |
16 | @ray.remote(num_cpus=1)
17 | def count_words(lines):
18 |     count = Counter()
19 |     for line in lines:
20 |         text = line.decode("utf8")
21 |         # skip WARC and HTTP header lines
22 |         if not re.search("^WARC|Content-", text):
23 |             for word in text.split():
24 |                 if len(word) >= 5 and not re.search('[^a-zA-Z]', word):
25 |                     count[word] += 1
26 |     return count
27 |
28 | begin = datetime.datetime.now()
29 | bufsize = 10000000
30 | results = []
31 | # stream the WET file straight from the bucket
32 | response = client.get_object('commoncrawl', 'crawl-data/CC-MAIN-2017-04/segments/1484560280292.50/wet/CC-MAIN-20170116095120-00055-ip-10-171-10-70.ec2.internal.warc.wet.gz')
33 | with gzip.GzipFile(fileobj=response) as infile:
34 |     while True:
35 |         lines = infile.readlines(bufsize)
36 |         if not lines:
37 |             break
38 |         lines_id = ray.put(lines)
39 |         results.append(count_words.remote(lines_id))
40 |
41 | # merge the per-chunk counters
42 | wordcounts = Counter()
43 | for count in ray.get(results):
44 |     wordcounts.update(count)
45 | end = datetime.datetime.now()
46 |
47 | for word, count in wordcounts.most_common(10):
48 |     print(" ", word, count)
49 |
50 | print(end - begin)
51 |
--------------------------------------------------------------------------------
/docs/comparisons/util/histo.cc:
--------------------------------------------------------------------------------
1 | #include