├── 2018 ├── daskvsspark │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── daskvsspark │ │ ├── __init__.py │ │ ├── aggregate_dask.py │ │ ├── aggregate_dask.sh │ │ ├── aggregate_dask_kube.sh │ │ ├── aggregate_dask_yarn.sh │ │ ├── aggregate_spark.py │ │ ├── aggregate_spark.sh │ │ ├── aggregate_spark_yarn.sh │ │ ├── common.py │ │ ├── context.py │ │ ├── notes.txt │ │ ├── prepare.py │ │ ├── prepare.sh │ │ ├── schema.py │ │ ├── show.py │ │ ├── start_dask.sh │ │ └── start_dask_yarn.py │ ├── deployment │ │ ├── bootstrap.sh │ │ ├── conf.json │ │ ├── config.yaml │ │ ├── create_cluster.sh │ │ ├── deploy_code.sh │ │ ├── deploy_data.sh │ │ ├── deploy_reqs.sh │ │ ├── instances.json │ │ ├── log4j.properties │ │ └── setup_dvss.sh │ ├── docker-compose.yml │ ├── dvss-helm │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── _helpers.tpl │ │ │ ├── scheduler-deployment.yaml │ │ │ ├── scheduler-service.yaml │ │ │ └── worker-deployment.yaml │ │ └── values.yaml │ ├── requirements-dask.txt │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── samples │ │ └── agg1hour.json │ ├── scala │ │ ├── README.md │ │ ├── build.sbt │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── jbennet │ │ │ │ └── daskvsspark │ │ │ │ └── udafs.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── jbennet │ │ │ └── daskvsspark │ │ │ └── AggregateCounterTest.scala │ ├── setup.py │ └── trials │ │ ├── Aggregate without index.ipynb │ │ ├── Custom aggregations.ipynb │ │ ├── aggregate1.py │ │ └── aggregate2.py ├── datetimes │ ├── 01_event-table.png │ ├── 02_event_table_utc.png │ ├── 03_event_table_la.png │ ├── 04_event_table_floor.png │ ├── 05_events_grouped.png │ ├── datetime-challenges.ipynb │ └── requirements.txt ├── sqlpandas │ ├── README.md │ ├── data-hp │ │ ├── houses.csv │ │ └── students.csv │ ├── data │ │ ├── airport-frequencies.csv │ │ ├── airports.csv │ │ ├── countries.csv │ │ ├── navaids.csv │ │ ├── regions.csv │ │ └── runways.csv │ ├── download_data.sh │ ├── explore.ipynb │ └── images │ │ ├── by_country.png │ │ ├── by_country_top10.png │ │ ├── having1.png │ │ ├── having2.png │ │ ├── notebook.png │ │ ├── runways.png │ │ ├── runways_agg1.png │ │ └── runways_agg2.png └── windows │ ├── README.md │ ├── Window functions.ipynb │ ├── social_deltas.csv │ ├── social_totals.csv │ └── social_totals_agg.csv ├── 2019 ├── pandasdb │ └── read_csv_file.py └── sparkstart │ ├── context.py │ ├── driver.py │ └── runner.sh ├── 2021 └── covid-travel │ ├── .gitignore │ ├── Covid and air travel.ipynb │ ├── README.md │ ├── all_by_age_race.csv │ ├── covid_and_air_travel.png │ ├── flight_infection_risk.csv │ ├── requirements.txt │ └── test.py ├── 2022 └── uk-covid-deaths │ ├── asmr │ ├── agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls │ └── espmortalityratesreport_tcm77-364912.pdf │ ├── output1.png │ ├── output2.png │ ├── output3.png │ ├── output4.png │ ├── output5.png │ ├── referencetable06072022accessible │ ├── Contents-Table 1.tsv │ ├── Cover-Table 1.tsv │ ├── Definitions-Table 1.tsv │ ├── Notes-Table 1.tsv │ ├── Table 1-Table 1.tsv │ ├── Table 2-Table 1.tsv │ ├── Table 3-Table 1.tsv │ ├── Table 4-Table 1.tsv │ ├── Table 5-Table 1.tsv │ ├── Table 6-Table 1.tsv │ ├── Table 7-Table 1.tsv │ ├── Table 8-Table 1.tsv │ └── Table 9-Table 1.tsv │ ├── requirements.txt │ ├── table1.tsv │ ├── table6.tsv │ ├── table8.tsv │ └── uk_deaths_by_vacc.ipynb ├── .gitignore └── README.md /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pycharm 104 | .idea/ 105 | 106 | *.parquet 107 | *.crc 108 | 109 | spark-warehouse/ 110 | metastore_db/ 111 | aggs*/ 112 | _SUCCESS 113 | 114 | .DS_Store 115 | /2020/covid-travel/COVID-19_Case_Surveillance_Public_Use_Data.csv 116 | /2022/uk-covid-deaths/owid-covid-data.csv 117 | -------------------------------------------------------------------------------- /2018/daskvsspark/.dockerignore: -------------------------------------------------------------------------------- 1 | build 2 | *.egg-info 3 | deployment 4 | dist 5 | images 6 | samples 7 | scala 8 | tmp 9 | trials 10 | 11 | **/aggs_* 12 | **/events 13 | **/*.log.* 14 | metastore_db 15 | dask-worker-space 16 | spark-warehouse 17 | -------------------------------------------------------------------------------- /2018/daskvsspark/.gitignore: -------------------------------------------------------------------------------- 1 | images/ 2 | aggs_*/ 3 | events/ 4 | dask-worker-space/ 5 | spark-warehouse/ 6 | out/ 7 | project/ 8 | target/ 9 | tmp* 10 | *.log.* 11 | tmp/ 12 | -------------------------------------------------------------------------------- /2018/daskvsspark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daskdev/dask:latest 2 | 3 | RUN pip install awscli 4 | 5 | ARG AWS_ACCESS_KEY_ID 6 | ARG AWS_SECRET_ACCESS_KEY 7 | ARG AWS_DEFAULT_REGION 8 | 9 | ENV AWS_ACCESS_KEY_ID $AWS_ACCESS_KEY_ID 10 | ENV AWS_SECRET_ACCESS_KEY $AWS_SECRET_ACCESS_KEY 11 | ENV AWS_DEFAULT_REGION $AWS_DEFAULT_REGION 12 | ENV CONDA_ROOT $(conda info --root) 13 | 14 | RUN echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" 15 | 16 | # add the reqs 17 | ADD ./requirements*.txt /assets/code/ 18 | 19 | # install the reqs 20 | WORKDIR /assets/code 21 | RUN conda install --copy -y -c conda-forge --file requirements.txt 
--file requirements-dask.txt --file requirements-dev.txt 22 | 23 | # add the code 24 | ADD ./daskvsspark/*.py /assets/code/daskvsspark/ 25 | ADD ./daskvsspark/aggregate_*.sh /assets/code/daskvsspark/ 26 | ADD ./setup.py /assets/code/ 27 | 28 | # install the code into conda root env 29 | RUN python setup.py install 30 | 31 | RUN apt-get install -y vim 32 | -------------------------------------------------------------------------------- /2018/daskvsspark/README.md: -------------------------------------------------------------------------------- 1 | What is this? 2 | ============= 3 | 4 | An example of data aggregation in Spark and in Dask. 5 | 6 | How do I use it? 7 | ================ 8 | 9 | To run this locally, you need an Apache Spark distribution 10 | (let's say it's in `$HOME/bin/`). Then, after setting some 11 | environment variables: 12 | 13 | ``` 14 | export SPARK_HOME="$HOME/bin/spark-2.1.1-bin-hadoop2.7" 15 | export PYTHONPATH="$SPARK_HOME/python/lib/pyspark.zip:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH" 16 | ``` 17 | 18 | you can run a Spark script as simply as: 19 | 20 | ``` 21 | python main.py 22 | ``` 23 | 24 | The above is good enough for testing. In real life, you'd use ``spark-submit``: 25 | 26 | ``` 27 | PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 28 | spark-submit \ 29 | --master "local[4]" \ 30 | --deploy-mode client \ 31 | main.py 32 | ``` 33 | 34 | Generate input data 35 | ------------------- 36 | 37 | A script is included to mock some input data. It writes Parquet to the `./events/` directory. 38 | To run it with Spark: 39 | 40 | ``` 41 | prepare.sh 42 | ``` 43 | 44 | By default, it'll generate 100 input records and assume 100k records per partition (one parquet 45 | file). You can provide different numbers: 46 | 47 | ``` 48 | prepare.sh [total-records] [records-per-partition] 49 | ``` 50 | 51 | The data is partitioned on disk by year, month, day, hour, and customer. The script generates 1 day 52 | of data. This means that at least 24 files (partitions) will be created, because we can't create 53 | fewer than one partition per hour. The script will write Parquet to 54 | ``./events/[number-of-records]-[number-of-partitions]``. 55 | 56 | Make sure that the spark-submit settings in ``prepare.sh`` (``driver-memory``, 57 | ``executor-memory``, ``num-executors``) will work for you. 58 | 59 | Aggregate with Spark 60 | --------------------- 61 | 62 | Run this: 63 | 64 | ``` 65 | aggregate_spark.sh [number-of-records] [number-of-partitions] 66 | ``` 67 | 68 | This will read the data from ``./events`` and write the aggregates as JSON 69 | to ``./aggs_spark/[number-of-records]-[number-of-partitions]``. 70 | 71 | Aggregate with Dask 72 | ------------------- 73 | 74 | Run this: 75 | 76 | ``` 77 | python aggregate_dask.py --count [number-of-records] --nfiles [number-of-partitions] 78 | ``` 79 | 80 | This will read the data from `./events` and write the aggregates as JSON 81 | to ``./aggs_dask/[number-of-records]-[number-of-partitions]``. 82 | 83 | Inspect the data 84 | ---------------- 85 | 86 | A script is included to pretty-print generated JSON records. For example, 87 | this: 88 | 89 | ``` 90 | python show.py ./aggs_dask/100-24 3 91 | ``` 92 | 93 | will pretty-print 3 JSON records from the ``./aggs_dask/100-24`` directory.
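Custom input and output ----------------------- Both ``aggregate_spark.py`` and ``aggregate_dask.py`` also accept ``--input`` and ``--output`` flags (see their argparse setup), so the same jobs can read and write locations other than ``./events`` and ``./aggs_*``, including S3 paths, as the YARN runner scripts do. A minimal sketch (the bucket name here is made up): ``` python aggregate_dask.py --count 100 --nfiles 24 --input s3://my-bucket/events --output s3://my-bucket/aggs_dask ```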
94 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/daskvsspark/daskvsspark/__init__.py -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # aggregate_dask.py 3 | import argparse 4 | import datetime as dt 5 | import os 6 | import shutil 7 | from collections import Counter 8 | 9 | import dask 10 | import dask.dataframe as dd 11 | import s3fs 12 | import simplejson as json 13 | import pandas as pd 14 | from dask.distributed import Client, LocalCluster 15 | 16 | from daskvsspark.common import * 17 | 18 | INPUT_ROOT = './events' 19 | OUTPUT_ROOT = './aggs_dask' 20 | 21 | INPUT_TEMPLATE = '{root}/{event_count}-{nfiles}/*/*/*/*/*/*.parquet' 22 | OUTPUT_TEMPLATE = '{root}/{event_count}-{nfiles}/*.json' 23 | 24 | 25 | def read_data(read_path): 26 | """Reads the original Parquet data. 27 | :returns: DataFrame 28 | """ 29 | df = dd.read_parquet(read_path).drop('hour', axis=1) 30 | return df 31 | 32 | 33 | def counter_chunk(ser): 34 | """Return counter of values in series.""" 35 | return list(Counter(ser.values).items()) 36 | 37 | 38 | def counter_agg(chunks): 39 | """Add all counters together and return dict items.""" 40 | total = Counter() 41 | for chunk in chunks: 42 | current = Counter(dict(chunk)) 43 | total += current 44 | return list(total.items()) 45 | 46 | 47 | def nunique_chunk(ser): 48 | """Get all unique values in series.""" 49 | return ser.unique() 50 | 51 | 52 | def nunique_agg(chunks): 53 | """Return number of unique values in all chunks.""" 54 | total = pd.Series() 55 | for chunk in chunks: 56 | current = pd.Series(chunk) 57 | total = total.append(current) 58 | total = total.drop_duplicates() 59 | res = total.nunique() 60 | return res 61 | 62 | 63 | def group_data(df): 64 | """Aggregate the DataFrame and return the grouped DataFrame. 65 | 66 | :param df: DataFrame 67 | :returns: DataFrame 68 | """ 69 | # round timestamps down to an hour 70 | df['ts'] = df['ts'].dt.floor('1H') 71 | 72 | # group on customer, timestamp (rounded) and url 73 | gb = df.groupby(['customer', 'url', 'ts']) 74 | 75 | counter = dd.Aggregation( 76 | 'counter', 77 | lambda s: s.apply(counter_chunk), 78 | lambda s: s.apply(counter_agg), 79 | ) 80 | 81 | count_unique = dd.Aggregation( 82 | 'count_unique', 83 | lambda s: s.apply(nunique_chunk), 84 | lambda s: s.apply(nunique_agg) 85 | ) 86 | 87 | ag = gb.agg({ 88 | 'session_id': [count_unique, 'count'], 89 | 'referrer': counter} 90 | ) 91 | 92 | ag = ag.reset_index() 93 | 94 | # get rid of multilevel columns 95 | ag.columns = ['customer', 'url', 'ts', 'visitors', 'page_views', 'referrers'] 96 | ag = ag.repartition(npartitions=df.npartitions) 97 | 98 | return ag 99 | 100 | 101 | def transform_one(ser): 102 | """Takes a Series object representing a grouped DataFrame row, 103 | and returns a dict ready to be stored as JSON. 
104 | 105 | :returns: pd.Series 106 | """ 107 | data = ser.to_dict() 108 | if not data: 109 | return pd.Series([], name='data') 110 | page_views = data.pop('page_views') 111 | visitors = data.pop('visitors') 112 | data.update({ 113 | '_id': format_id(data['customer'], data['url'], data['ts']), 114 | 'ts': data['ts'].strftime('%Y-%m-%dT%H:%M:%S'), 115 | 'metrics': format_metrics(page_views, visitors), 116 | 'referrers': dict(data['referrers']) 117 | }) 118 | return pd.Series([data], name='data') 119 | 120 | 121 | def transform_data(ag): 122 | """Accepts a Dask DataFrame and returns a DataFrame with a single 123 | "data" column, where each value is a dict representation of the 124 | document to be written (serialized to JSON in save_json). 125 | 126 | :param ag: DataFrame 127 | :returns: DataFrame with one column "data" containing a dict. 128 | """ 129 | tr = ag.apply(transform_one, axis=1, meta={'data': str}) 130 | tr = tr.repartition(npartitions=ag.npartitions) 131 | return tr 132 | 133 | 134 | def delete_path(path): 135 | """Recursively delete a path and everything under it.""" 136 | if path.startswith('s3://'): 137 | s3 = s3fs.S3FileSystem() 138 | if s3.exists(path): 139 | s3.rm(path) 140 | elif os.path.exists(path): 141 | shutil.rmtree(path) 142 | 143 | 144 | def create_path(path): 145 | """Create root dir.""" 146 | if not path.startswith('s3://') and not os.path.exists(path): 147 | os.makedirs(path) 148 | 149 | 150 | def save_json(tr, path): 151 | """Write records as json.""" 152 | root_dir = os.path.dirname(path) 153 | 154 | # cleanup before writing 155 | delete_path(root_dir) 156 | create_path(root_dir) 157 | 158 | (tr.to_bag() 159 | .map(lambda t: t[0]) 160 | .map(json.dumps) 161 | .to_textfiles(path)) 162 | 163 | 164 | if __name__ == '__main__': 165 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 166 | parser.add_argument('--count', type=int, default=100) 167 | parser.add_argument('--nfiles', type=int, default=24) 168 | parser.add_argument('--wait', action='store_true', default=False) 169 | parser.add_argument('--scheduler', choices=['thread', 'process', 'default', 'single'], 170 | default='default') 171 | parser.add_argument('--verbose', action='store_true', default=False) 172 | parser.add_argument('--address', help='Scheduler address') 173 | parser.add_argument('--input', default=INPUT_ROOT) 174 | parser.add_argument('--output', default=OUTPUT_ROOT) 175 | myargs = parser.parse_args() 176 | 177 | read_path = INPUT_TEMPLATE.format(root=myargs.input, event_count=myargs.count, 178 | nfiles=myargs.nfiles) 179 | write_path = OUTPUT_TEMPLATE.format(root=myargs.output, event_count=myargs.count, 180 | nfiles=myargs.nfiles) 181 | 182 | set_display_options() 183 | started = dt.datetime.utcnow() 184 | if myargs.scheduler != 'default': 185 | print('Scheduler: {}.'.format(myargs.scheduler)) 186 | getters = {'process': dask.multiprocessing.get, 187 | 'thread': dask.threaded.get, 188 | 'single': dask.get} 189 | dask.set_options(get=getters[myargs.scheduler]) 190 | 191 | try: 192 | if myargs.address: 193 | # explicit address is a workaround for "Worker failed to start": 194 | # scheduler and worker have to be started in console.
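# (e.g. as start_dask.sh in this repo does: run `dask-scheduler` and `dask-worker localhost:8786 --nprocs 4` in separate terminals, then pass --address tcp://localhost:8786 to this script)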
195 | # see https://github.com/dask/distributed/issues/1825 196 | cluster = myargs.address 197 | else: 198 | cluster = LocalCluster() 199 | 200 | if myargs.verbose: 201 | client = Client(address=cluster, silence_logs=False) 202 | else: 203 | client = Client(address=cluster) 204 | 205 | df = read_data(read_path) 206 | aggregated = group_data(df) 207 | prepared = transform_data(aggregated) 208 | save_json(prepared, write_path) 209 | elapsed = dt.datetime.utcnow() - started 210 | parts_per_hour = int(myargs.nfiles / 24) 211 | print('{:,} records, {} files ({} per hour): done in {}.'.format( 212 | myargs.count, myargs.nfiles, parts_per_hour, elapsed)) 213 | if myargs.wait: 214 | input('Press any key') 215 | except: 216 | elapsed = dt.datetime.utcnow() - started 217 | print('Failed in {}.'.format(elapsed)) 218 | raise 219 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z $1 ] 4 | then 5 | ADDRESS_ARG="--address $1" 6 | fi 7 | 8 | if [ ! -z $2 ] 9 | then 10 | COUNT=$2 11 | else 12 | COUNT=100 13 | fi 14 | 15 | if [ ! -z $3 ] 16 | then 17 | NFILES=$3 18 | else 19 | NFILES=24 20 | fi 21 | 22 | if [ ! -z $4 ] 23 | then 24 | SCHEDULER=$4 25 | else 26 | SCHEDULER="default" 27 | fi 28 | 29 | python aggregate_dask.py --count $COUNT --nfiles $NFILES --scheduler $SCHEDULER $ADDRESS_ARG 30 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask_kube.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | if [ ! -z $1 ] 7 | then 8 | ADDRESS=$1 9 | else 10 | echo "Usage: $0 <scheduler-address> [count] [nfiles] [scheduler]" 11 | exit 1 12 | fi 13 | 14 | if [ ! -z $2 ] 15 | then 16 | COUNT=$2 17 | else 18 | COUNT=100 19 | fi 20 | 21 | if [ ! -z $3 ] 22 | then 23 | NFILES=$3 24 | else 25 | NFILES=24 26 | fi 27 | 28 | 29 | if [ ! -z $4 ] 30 | then 31 | SCHEDULER=$4 32 | else 33 | SCHEDULER="default" 34 | fi 35 | 36 | cd /assets/code/daskvsspark 37 | 38 | python aggregate_dask.py \ 39 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 40 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_dask" \ 41 | --address $ADDRESS \ 42 | --count $COUNT \ 43 | --nfiles $NFILES \ 44 | --scheduler $SCHEDULER \ 45 | --verbose 46 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask_yarn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | if [ ! -z $1 ] 7 | then 8 | ADDRESS=$1 9 | else 10 | echo "Usage: $0 <scheduler-address> [count] [nfiles] [scheduler]" 11 | exit 1 12 | fi 13 | 14 | if [ ! -z $2 ] 15 | then 16 | COUNT=$2 17 | else 18 | COUNT=100 19 | fi 20 | 21 | if [ ! -z $3 ] 22 | then 23 | NFILES=$3 24 | else 25 | NFILES=24 26 | fi 27 | 28 | 29 | if [ !
-z $4 ] 30 | then 31 | SCHEDULER=$4 32 | else 33 | SCHEDULER="default" 34 | fi 35 | 36 | cd /home/hadoop/daskvsspark/daskvsspark 37 | 38 | latest_egg=$(ls -t /home/hadoop/reqs/daskvsspark-*.egg | head -n 1) 39 | 40 | PYTHONPATH=$latest_egg /home/hadoop/conda/envs/dvss/bin/python aggregate_dask.py \ 41 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 42 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_dask" \ 43 | --address $ADDRESS \ 44 | --count $COUNT \ 45 | --nfiles $NFILES \ 46 | --scheduler $SCHEDULER \ 47 | --verbose 48 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # aggregate_spark.py 3 | import argparse 4 | import os 5 | import datetime as dt 6 | 7 | from pyspark.sql.types import StringType, IntegerType, MapType 8 | from pyspark.sql.column import Column, _to_java_column, _to_seq 9 | 10 | from daskvsspark.context import initialize, INPUT_ROOT, OUTPUT_ROOT, PATH_TEMPLATE 11 | from daskvsspark.common import * 12 | 13 | if os.environ.get('TZ', '') != 'UTC': 14 | raise Exception('Please set TZ=UTC to run this.') 15 | 16 | 17 | def load_sql_user_functions(sc, sqlContext): 18 | """Load our custom UDAFs into a sql context.""" 19 | sqlContext.udf.register('format_id', 20 | format_id, 21 | StringType()) 22 | sqlContext.udf.register('format_metrics', 23 | format_metrics, 24 | MapType(StringType(), IntegerType())) 25 | 26 | # custom aggregation function. Needs a jar provided in runner script. 27 | agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter() 28 | sqlContext.sparkSession._jsparkSession.udf().register('count_values', agg_counter) 29 | 30 | 31 | def count_values(col): 32 | """Register UDAF for use in aggregations outside of Spark SQL.""" 33 | counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter().apply 34 | return Column(counter(_to_seq(sc, [col], _to_java_column))) 35 | 36 | 37 | def aggregate(df): 38 | """Group data by customer, url, and 1 hour bucket.""" 39 | df.createOrReplaceTempView("df") 40 | agg = sqlContext.sql(""" 41 | select 42 | customer, 43 | url, 44 | window(ts, '1 hour').start as ts, 45 | count(*) as page_views, 46 | count(distinct(session_id)) as visitors, 47 | count_values(referrer) as referrers 48 | from df 49 | group by 50 | customer, 51 | url, 52 | window(ts, '1 hour').start 53 | """) 54 | return agg 55 | 56 | 57 | def transform(df): 58 | """Format as needed.""" 59 | df.createOrReplaceTempView("df") 60 | agg = sqlContext.sql(""" 61 | select 62 | format_id(customer, url, ts) as _id, 63 | customer, 64 | url, 65 | ts, 66 | format_metrics(page_views, visitors) as metrics, 67 | referrers 68 | from df 69 | """) 70 | return agg 71 | 72 | 73 | def save_json(df, path): 74 | """Write aggregate rows as json.""" 75 | df.write.mode('overwrite').json(path) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | parser.add_argument("--count", type=int, default=100) 81 | parser.add_argument("--nfiles", type=int, default=24) 82 | parser.add_argument("--wait", action='store_true', default=False) 83 | parser.add_argument('--input', default=INPUT_ROOT) 84 | parser.add_argument('--output', default=OUTPUT_ROOT) 85 | myargs = parser.parse_args() 86 | 87 | read_path = PATH_TEMPLATE.format(root=myargs.input, event_count=myargs.count, 88 | nfiles=myargs.nfiles) 89 | 
write_path = PATH_TEMPLATE.format(root=myargs.output, event_count=myargs.count, 90 | nfiles=myargs.nfiles) 91 | target_partitions = myargs.nfiles 92 | 93 | started = dt.datetime.utcnow() 94 | 95 | sc, sqlContext = initialize(target_partitions=target_partitions) 96 | load_sql_user_functions(sc, sqlContext) 97 | 98 | df = sqlContext.read.parquet(read_path) 99 | agg = aggregate(df) 100 | agg = transform(agg) 101 | save_json(agg, write_path) 102 | elapsed = dt.datetime.utcnow() - started 103 | 104 | parts_per_hour = int(myargs.nfiles / 24) 105 | print('{:,} records, {} files ({} per hour): done in {}.'.format( 106 | myargs.count, myargs.nfiles, parts_per_hour, elapsed)) 107 | if myargs.wait: 108 | input('Press any key') 109 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z $1 ] 4 | then 5 | COUNT=$1 6 | else 7 | COUNT=100 8 | fi 9 | 10 | if [ ! -z $2 ] 11 | then 12 | NFILES=$2 13 | else 14 | NFILES=24 15 | fi 16 | 17 | TZ=UTC PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 18 | $SPARK_HOME/bin/spark-submit \ 19 | --master "local[4]" \ 20 | --deploy-mode client \ 21 | --driver-memory 6g \ 22 | --executor-memory 2g \ 23 | --num-executors 4 \ 24 | --conf "spark.yarn.executor.memoryOverhead=2g" \ 25 | --driver-class-path ../scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar \ 26 | --driver-java-options "-Droot.logger=ERROR,console" \ 27 | aggregate_spark.py --count $COUNT --nfiles $NFILES 28 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark_yarn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | build_egg() { 7 | cd /home/hadoop/daskvsspark/ 8 | python3 setup.py bdist_egg 9 | cp ./dist/*.egg /home/hadoop/reqs/ 10 | } 11 | 12 | if [ ! -z $1 ] 13 | then 14 | COUNT=$1 15 | else 16 | COUNT=100 17 | fi 18 | 19 | if [ ! 
-z $2 ] 20 | then 21 | NFILES=$2 22 | else 23 | NFILES=24 24 | fi 25 | 26 | build_egg &> /dev/null 27 | 28 | latest_egg=$(ls -t /home/hadoop/reqs/daskvsspark-*.egg | head -n 1) 29 | 30 | cd /home/hadoop/daskvsspark/daskvsspark/ 31 | 32 | TZ=UTC PYSPARK_DRIVER_PYTHON=python3 PYSPARK_PYTHON=python3 \ 33 | spark-submit \ 34 | --master yarn \ 35 | --deploy-mode client \ 36 | --driver-memory 8g \ 37 | --executor-memory 3g \ 38 | --num-executors 4 \ 39 | --executor-cores 4 \ 40 | --conf "spark.yarn.executor.memoryOverhead=2g" \ 41 | --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:///home/hadoop/reqs/log4j.properties" \ 42 | --py-files ${latest_egg} \ 43 | --jars /home/hadoop/reqs/daskvsspark-udafs_2.11-0.0.1.jar \ 44 | aggregate_spark.py \ 45 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 46 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_spark" \ 47 | --count $COUNT \ 48 | --nfiles $NFILES 49 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | 3 | 4 | def set_display_options(): 5 | import pandas as pd 6 | pd.set_option('display.max_colwidth', 1000) 7 | pd.set_option('display.expand_frame_repr', False) 8 | 9 | 10 | def format_id(customer, url, ts): 11 | """Create a unique id for the aggregated record.""" 12 | return "{}|{}|{:%Y-%m-%dT%H:%M:%S}".format(url, customer, ts) 13 | 14 | 15 | def format_metrics(page_views, visitors): 16 | """Create a dict of metrics.""" 17 | return { 18 | "page_views": page_views, 19 | "visitors": visitors 20 | } 21 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | from pyspark.context import SparkContext, SparkConf 3 | from pyspark.sql import SQLContext 4 | 5 | # template path. Event_count will be replaced by a number. 
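# (e.g. './events/100-24' for 100 events in 24 files; see the README)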
6 | PATH_TEMPLATE = '{root}/{event_count}-{nfiles}' 7 | INPUT_ROOT = './events' 8 | OUTPUT_ROOT = "./aggs_spark" 9 | 10 | 11 | def initialize(target_partitions=None): 12 | """Returns SparkContext and SQLContext.""" 13 | conf = SparkConf() 14 | extra_settings = { 15 | 'spark.serializer': 'org.apache.spark.serializer.KryoSerializer', 16 | 'spark.executor.extraJavaOptions': '-XX:+UseG1GC' 17 | } 18 | if target_partitions: 19 | extra_settings['spark.default.parallelism'] = target_partitions 20 | 21 | conf.setAll(extra_settings.items()) 22 | environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'} 23 | sc = SparkContext(conf=conf, environment=environment) 24 | 25 | sqlContext = SQLContext(sc) 26 | if target_partitions: 27 | sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions) 28 | 29 | jvm_logger = sc._jvm.org.apache.log4j 30 | jvm_logger.LogManager.getLogger("org").setLevel(jvm_logger.Level.ERROR) 31 | jvm_logger.LogManager.getLogger("akka").setLevel(jvm_logger.Level.ERROR) 32 | return sc, sqlContext 33 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/notes.txt: -------------------------------------------------------------------------------- 1 | ------------- 2 | Size of data: 3 | ------------- 4 | 5 | 100-24: 196K 6 | 10,000-24: 196K 7 | 1,000,000-24: 1.8M 8 | 10,000,000-24: 16M 9 | 10,000,000-96: 17M 10 | 100,000,000-24: 171M 11 | 100,000,000-48: 189M 12 | 100,000,000-96: 198M 13 | 100,000,000-192: 169M 14 | 100,000,000-384: 165M 15 | 100,000,000-984: 169M 16 | 1,000,000,000-500: 3.3G 17 | 18 | 19 | ---------------------------- 20 | Partitioning on 100,000,000: 21 | ---------------------------- 22 | 23 | 100k, 984 files, 41 per hr 24 | 250k, 384 files, 16 per hr 25 | 500k, 192 files, 8 per hr 26 | 1m, 96 files, 4 per hr 27 | 2m, 48 files, 2 per hr * 28 | 4m, 24 files, 1 per hr 29 | 30 | *: best for Spark and Dask 31 | 32 | --------------------------------- 33 | Spark with python3 and custom agg 34 | --------------------------------- 35 | 36 | (talks3) --- daskvsspark/daskvsspark ‹master*M› » ./tmp_run_all_spark.sh 126 ↵ 37 | 10 records, 24 files (1 per hour): done in 0:00:10.601392. 38 | 100 records, 24 files (1 per hour): done in 0:00:11.315226. 39 | 10,000 records, 24 files (1 per hour): done in 0:00:11.744349. 40 | 1,000,000 records, 24 files (1 per hour): done in 0:00:15.394712. 41 | 10,000,000 records, 24 files (1 per hour): done in 0:00:29.044079. 42 | 10,000,000 records, 96 files (4 per hour): done in 0:00:34.295349. 43 | 44 | 100,000,000 records, 984 files (41 per hour): done in 0:03:41.323534. 45 | 100,000,000 records, 384 files (16 per hour): done in 0:03:14.743094. 46 | 100,000,000 records, 192 files (8 per hour): done in 0:02:52.175157. 47 | 100,000,000 records, 96 files (4 per hour): done in 0:03:09.673154. 48 | 100,000,000 records, 48 files (2 per hour): done in 0:02:50.821578. * 49 | 100,000,000 records, 24 files (1 per hour): done in 0:02:57.805231. 50 | 51 | 1,000,000,000 records, 500 files (20 per hour): done in 0:45:08.288687. 52 | 1,000,000,000 records, 500 files (20 per hour): done in 0:41:14.634671. 53 | 1,000,000,000 records, 240 files (10 per hour): done in 0:51:39.638557. 54 | 55 | --------------------------------------- 56 | Dask with default scheduler and python3 57 | --------------------------------------- 58 | 59 | (talks3) --- daskvsspark/daskvsspark ‹master*AM› » ./tmp_run_all_dask.sh 1 ↵ 60 | 10 records, 24 files (1 per hour): done in 0:00:04.280264. 
61 | 100 records, 24 files (1 per hour): done in 0:00:01.490881. 62 | 10,000 records, 24 files (1 per hour): done in 0:00:02.811427. 63 | 1,000,000 records, 24 files (1 per hour): done in 0:00:03.013248. 64 | 10,000,000 records, 24 files (1 per hour): done in 0:00:06.194535. 65 | 10,000,000 records, 96 files (4 per hour): done in 0:00:08.708831. 66 | 67 | 100,000,000 records, 984 files (41 per hour): done in 0:01:10.351981. 68 | 100,000,000 records, 384 files (16 per hour): done in 0:00:49.119739. 69 | 100,000,000 records, 192 files (8 per hour): done in 0:00:41.575053. 70 | 100,000,000 records, 96 files (4 per hour): done in 0:00:38.806466. 71 | 100,000,000 records, 48 files (2 per hour): done in 0:00:37.713205. * 72 | 100,000,000 records, 24 files (1 per hour): done in 0:01:03.122334. 73 | 74 | 1,000,000,000 records, 500 files (20 per hour): done in 0:16:11.660423. 75 | 1,000,000,000 records, 240 files (10 per hour): done in 0:16:34.453926. 76 | 77 | --------------- 78 | Running on YARN 79 | --------------- 80 | 81 | Master: 82 | ------- 83 | m4.xlarge 84 | 8 vCore, 16 GiB memory, EBS only storage 85 | EBS Storage:32 GiB 86 | 87 | Core: 2 88 | ------- 89 | c4.2xlarge 90 | 8 vCore, 15 GiB memory, EBS only storage 91 | EBS Storage:64 GiB 92 | 93 | Settings used: 94 | 95 | --driver-memory 8g 96 | --executor-memory 3g 97 | --num-executors 4 98 | --executor-cores 4 99 | --conf "spark.yarn.executor.memoryOverhead=2g" 100 | 101 | In Dask, this would correspond to: 102 | 103 | Master: 104 | $ dask-scheduler 105 | $ PYTHONPATH=/home/hadoop/reqs/daskvsspark-0.1-py3.6.egg dask-worker --nthreads 4 --memory-limit 5G tcp://10.21.0.76:8786 106 | 107 | Core (2): 108 | $ PYTHONPATH=/home/hadoop/reqs/daskvsspark-0.1-py3.6.egg dask-worker --nprocs 2 --nthreads 4 --memory-limit 5G tcp://10.21.0.76:8786 109 | 110 | 111 | To run `aggregate_dask.sh`: 112 | 113 | $ export PATH="/home/hadoop/conda/bin:$PATH" 114 | $ source activate dvss 115 | 116 | 117 | ------------- 118 | YARN REST API 119 | ------------- 120 | 121 | curl -s $HOSTNAME:8088/ws/v1/cluster | jq 122 | curl -s $HOSTNAME:8088/ws/v1/cluster/metrics | jq 123 | curl -s $HOSTNAME:8088/ws/v1/cluster/scheduler | jq 124 | 125 | "totalMB": 23040, 126 | "totalVirtualCores": 16, 127 | "totalNodes": 2, 128 | 129 | ----- 130 | Spark 131 | ----- 132 | 133 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 134 | 1,000,000,000 records, 500 files (20 per hour): done in 0:12:15.945298. 135 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 136 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:59.845888. 137 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 138 | 1,000,000,000 records, 500 files (20 per hour): done in 0:12:14.694722. 139 | 140 | ---------- 141 | Dask Yarn: 142 | ---------- 143 | 144 | n_workers=4, memory=5*1024,4 cpus=3: only uses 12 cpus, but all the mem 145 | [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:36955 1000000000 500 146 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:20.381808. 147 | [hadoop@ip-10-21-0-63 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.63:46101 1000000000 500 148 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:15.664728. 149 | [hadoop@ip-10-21-0-63 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.63:46101 1000000000 500 150 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:12.667145. 
151 | 152 | (5 workers and 3 workers had worse results) 153 | 154 | ------------- 155 | Dask console: 156 | ------------- 157 | 158 | 5 workers (2 x core + 1 x master) with 4 cores and 5G memory each: 159 | 160 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 161 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:44.437957. 162 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 163 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:45.010486. 164 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 165 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:40.223227. 166 | 167 | 4 workers (2 x core) with 4 cores and 5G memory each: 168 | 169 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 170 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:29.925453. 171 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 172 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:37.121077. 173 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 174 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:43.173057. 175 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # prepare.py 3 | import argparse 4 | import datetime as dt 5 | import math 6 | import itertools 7 | import os 8 | import random 9 | import sys 10 | 11 | import pytz 12 | 13 | from daskvsspark.context import initialize, INPUT_ROOT, PATH_TEMPLATE 14 | from daskvsspark.schema import MY_SCHEMA, PARTITION_FIELDS 15 | 16 | 17 | DATE = dt.datetime(2017, 9, 17) 18 | 19 | 20 | def generate_row(total_articles, session_ids): 21 | """Create a page view event.""" 22 | # tuple fields: 23 | # customer, url, referrer, session_id, ts, year, month, day, hour 24 | minute = random.randint(0, 59) 25 | hour = random.randint(0, 23) 26 | article_number = random.randint(1, total_articles) 27 | referrer = random.choice(['http://google.com/', 'http://bing.com/', 'http://facebook.com/']) 28 | session_id = random.choice(session_ids) 29 | return ( 30 | 'a.com', 31 | 'http://a.com/articles/{}'.format(article_number), 32 | referrer, 33 | session_id, 34 | DATE.replace(hour=hour, minute=minute, tzinfo=pytz.UTC), 35 | '{:04}'.format(DATE.year), 36 | '{:02}'.format(DATE.month), 37 | '{:02}'.format(DATE.day), 38 | '{:02}'.format(hour) 39 | ) 40 | 41 | 42 | def nfiles(records, records_per_file): 43 | """How many files per hour, and total files to generate.""" 44 | parts_per_hour = max(1, int(records / records_per_file / 24)) 45 | total_files = parts_per_hour * 24 46 | return parts_per_hour, total_files 47 | 48 | 49 | def generate_rows(sc, records, records_per_file): 50 | """Generate data.""" 51 | random.seed(records) 52 | parts_per_hour, total_files = nfiles(records, records_per_file) 53 | part_size = int(records / parts_per_hour) 54 | actual_records_per_file = int(records / total_files) 55 | print('Generating {} file(s) ({:,} per hour) with {:,} ({:,} actual) records each...'.format( 56 | total_files, 57 | parts_per_hour, 58 | records_per_file, 59 | actual_records_per_file)) 60 | 61 | total_articles =
math.ceil(math.pow(records, 1.0/3)) 62 | session_ids = [''.join(t) for t in list(itertools.permutations(list('abcdefg'), 3))] 63 | data = (sc.parallelize([], parts_per_hour) 64 | .mapPartitions(lambda rs: (generate_row(total_articles, session_ids) 65 | for _ in range(part_size)))) 66 | return data 67 | 68 | 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument("--count", type=int, default=100) 72 | parser.add_argument("--chunk-size", type=int, default=100000) 73 | myargs = parser.parse_args() 74 | 75 | parts_per_hour, total_files = nfiles(myargs.count, myargs.chunk_size) 76 | write_path = PATH_TEMPLATE.format(root=INPUT_ROOT, event_count=myargs.count, nfiles=total_files) 77 | 78 | # cleanup before writing 79 | if os.path.exists(write_path): 80 | print('Path exists: {}. Exiting.'.format(write_path)) 81 | sys.exit(0) 82 | 83 | sc, sqlContext = initialize() 84 | 85 | # mock some data 86 | started = dt.datetime.now() 87 | print('Generating data...') 88 | data = generate_rows(sc, myargs.count, myargs.chunk_size) 89 | df = sqlContext.createDataFrame(data, MY_SCHEMA) 90 | 91 | print('Generated {:,} records with {:,} files per hour in {}.'.format( 92 | myargs.count, parts_per_hour, dt.datetime.now() - started)) 93 | 94 | # write parquet 95 | started = dt.datetime.now() 96 | print('Writing {:,} records...'.format(myargs.count)) 97 | (df.write 98 | .parquet(write_path, partitionBy=PARTITION_FIELDS, compression='gzip')) 99 | print('Wrote {:,} records in {}.'.format(myargs.count, dt.datetime.now() - started)) 100 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ ! -z $1 ] 3 | then 4 | COUNT=$1 5 | else 6 | COUNT=100 7 | fi 8 | 9 | if [ ! 
-z $2 ] 10 | then 11 | CHUNKSIZE=$2 12 | else 13 | CHUNKSIZE=100000 14 | fi 15 | 16 | TZ=UTC PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 17 | $SPARK_HOME/bin/spark-submit \ 18 | --master "local[4]" \ 19 | --deploy-mode client \ 20 | --driver-memory 8g \ 21 | --executor-memory 2g \ 22 | --num-executors 4 \ 23 | --driver-java-options "-Droot.logger=ERROR,console" \ 24 | prepare.py --count $COUNT --chunk-size $CHUNKSIZE 25 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # schema.py 3 | from pyspark.sql.types import * 4 | 5 | 6 | PARTITION_FIELDS = ['year', 'month', 'day', 'hour', 'customer'] 7 | 8 | MY_SCHEMA = StructType([ 9 | StructField('customer', StringType(), True), 10 | StructField('url', StringType(), True), 11 | StructField('referrer', StringType(), True), 12 | StructField('session_id', StringType(), True), 13 | StructField('ts', TimestampType(), True), 14 | # partitioning keys 15 | StructField('year', StringType(), nullable=False), 16 | StructField('month', StringType(), nullable=False), 17 | StructField('day', StringType(), nullable=False), 18 | StructField('hour', StringType(), nullable=False), 19 | ]) 20 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/show.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # show.py 3 | 4 | import glob 5 | import os 6 | import sys 7 | import simplejson as json 8 | from pprint import pprint 9 | 10 | 11 | if __name__ == '__main__': 12 | if len(sys.argv) != 3: 13 | print('Usage: {} <json-path> <how-many>'.format(sys.argv[0])) 14 | sys.exit(0) 15 | 16 | json_path = sys.argv[1] 17 | requested = int(sys.argv[2]) 18 | json_files = glob.glob1(json_path, '*.json') 19 | 20 | collected, total = 0, 0 21 | for file_name in json_files: 22 | full_name = os.path.join(json_path, file_name) 23 | with open(full_name, 'r') as f: 24 | for line in f: 25 | if collected < requested: 26 | data = json.loads(line) 27 | pprint(data) 28 | print("") 29 | collected += 1 30 | total += 1 31 | 32 | print('-' * 20) 33 | print('Total files: {}'.format(len(json_files))) 34 | print('Total lines: {}'.format(total)) 35 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/start_dask.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | tmux new-session -d -s scheduler "dask-scheduler" 4 | tmux split-window "dask-worker localhost:8786 --nprocs 4" 5 | tmux attach 6 | 7 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/start_dask_yarn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dask_yarn 3 | from time import sleep 4 | 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 8 | parser.add_argument('--verbose', dest='verbose', action='store_true', help='Print logs on exit', 9 | default=False) 10 | parser.add_argument('nworkers', help='Number of workers', type=int, default=4) 11 | parser.add_argument('ncores', help='Number of worker cores (threads)', type=int, default=3) 12 | parser.add_argument('memory', help='Worker memory (MiB)', type=int, default=5*1024)
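# note: argparse applies defaults to positional arguments only with nargs='?', so as written nworkers, ncores and memory must always be passed explicitly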
13 | myargs = parser.parse_args() 14 | 15 | cluster = dask_yarn.DaskYARNCluster(env='/home/hadoop/reqs/dvss.zip', lang='en_US.UTF-8') 16 | cluster.start(n_workers=myargs.nworkers, memory=myargs.memory, cpus=myargs.ncores) 17 | try: 18 | while True: 19 | print('-' * 20) 20 | print('Cluster scheduler: {}.'.format(cluster.scheduler_address)) 21 | bk = cluster.local_cluster.scheduler.services['bokeh'].server 22 | print('Bokeh: http://{}:{}'.format(bk.address, bk.port)) 23 | sleep(20) 24 | except KeyboardInterrupt: 25 | print('Interrupted, exiting.') 26 | 27 | print('-' * 20) 28 | if myargs.verbose: 29 | cluster.knit.print_logs() 30 | print('Cluster is done.') 31 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Are we running on a master node? 4 | cat /var/lib/info/instance.json | grep '"isMaster": true' 5 | IS_MASTER=$? 6 | 7 | unset PYTHON_INSTALL_LAYOUT 8 | export LC_ALL=C.UTF-8 9 | export LANG=C.UTF-8 10 | 11 | # Stop at any error, show all commands 12 | set -ex 13 | 14 | 15 | install_python_36() { 16 | # Ensure Python 3.6 is installed 17 | if [[ ! -x /usr/local/bin/python3.6 ]]; then 18 | echo "Python 3.6 not installed, installing" 19 | # Compilers and related tools: 20 | sudo yum groupinstall -y "development tools" 21 | # Libraries needed during compilation to enable all features of Python: 22 | sudo yum install -y zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel expat-devel 23 | # Download and install Python 3.6.1 24 | wget https://s3.amazonaws.com/parsely-public/chef-pkgs/python_3.6.1_x86_64.rpm 25 | sudo rpm -ivh python_3.6.1_x86_64.rpm 26 | # Make sure we have pip 27 | sudo /usr/local/bin/python3 -m ensurepip --upgrade 28 | fi 29 | 30 | echo "Note: Python 3.6 will be available as python3, not python" 31 | echo "Be sure to set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON in Configurations" 32 | } 33 | 34 | update_packages() { 35 | # Do this again just in case 36 | unset PYTHON_INSTALL_LAYOUT 37 | 38 | PIP="sudo /usr/local/bin/pip3" 39 | 40 | cd /home/hadoop/ 41 | 42 | # this includes a jar also 43 | aws s3 cp --recursive s3://parsely-public/jbennet/daskvsspark/reqs/ ./reqs 44 | chmod +x ./reqs/*.sh 45 | 46 | # needed to install python-snappy 47 | sudo yum install -y snappy-devel 48 | 49 | $PIP install -U pip 50 | $PIP install -r ./reqs/requirements.txt 51 | } 52 | 53 | install_conda() { 54 | if [[ ! -d /home/hadoop/conda ]]; then 55 | echo "Downloading conda" 56 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 57 | chmod +x ~/miniconda.sh 58 | 59 | echo "Installing conda" 60 | ~/miniconda.sh -b -p ~/conda 61 | 62 | export PATH="/home/hadoop/conda/bin:$PATH" 63 | echo 'export PATH="/home/hadoop/conda/bin:$PATH"' >> ~/.bashrc 64 | 65 | echo "Updating conda" 66 | conda update --yes conda 67 | conda info -a 68 | fi 69 | } 70 | 71 | create_conda_env() { 72 | if [[ ! 
-d /home/hadoop/conda/envs/dvss ]]; then 73 | echo "Creating venv dvss" 74 | conda create -n dvss --copy -y -q python=3 75 | echo "Installing requirements into venv" 76 | conda install -n dvss --copy -y -c conda-forge --file ~/reqs/requirements.txt --file ~/reqs/requirements-dask.txt 77 | fi 78 | } 79 | 80 | 81 | install_python_36 82 | update_packages 83 | install_conda 84 | create_conda_env 85 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/conf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "hdfs-site", 4 | "Properties": { 5 | "dfs.block.size": "134217728", 6 | "dfs.replication": "2" 7 | } 8 | }, 9 | { 10 | "Classification": "mapred-site", 11 | "Properties": { 12 | "mapreduce.job.reduces": "400", 13 | "mapreduce.reduce.memory.mb": "1408", 14 | "mapreduce.reduce.java.opts": "-Xmx1126m" 15 | } 16 | }, 17 | { 18 | "Classification": "spark-defaults", 19 | "Properties": { 20 | "spark.serializer": "org.apache.spark.serializer.KryoSerializer", 21 | "spark.dynamicAllocation.executorIdleTimeout": "30s", 22 | "spark.executor.heartbeatInterval" : "5s", 23 | "spark.default.parallelism": "500", 24 | "spark.sql.shuffle.partitions": "500", 25 | "spark.yarn.executor.memoryOverhead": "3g", 26 | "spark.executor.memory": "4g", 27 | "spark.executor.cores": "4", 28 | "spark.driver.memory": "15g" 29 | } 30 | }, 31 | { 32 | "Classification": "yarn-site", 33 | "Properties": { 34 | "yarn.nodemanager.vmem-check-enabled": "false" 35 | } 36 | }, 37 | { 38 | "Classification": "zeppelin-env", 39 | "Properties": {}, 40 | "Configurations": [ 41 | { 42 | "Classification": "export", 43 | "Properties": { 44 | "ZEPPELIN_NOTEBOOK_STORAGE" : "org.apache.zeppelin.notebook.repo.S3NotebookRepo", 45 | "ZEPPELIN_NOTEBOOK_S3_BUCKET": "parsely-zeppelin-notebooks", 46 | "ZEPPELIN_NOTEBOOK_S3_USER": "irina", 47 | "ZEPPELIN_MEM": "\"-Xms4096m -Xmx4096m -XX:MaxPermSize=2048m\"", 48 | "ZEPPELIN_INTP_MEM": "\"-Xms4096m -Xmx4096m -XX:MaxPermSize=2048m\"" 49 | }, 50 | "Configurations": [] 51 | } 52 | ] 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/config.yaml: -------------------------------------------------------------------------------- 1 | scheduler: 2 | serviceType: "NodePort" 3 | 4 | worker: 5 | replicas: 1 6 | resources: 7 | limits: 8 | cpu: 4 9 | memory: 8G 10 | requests: 11 | cpu: 4 12 | memory: 8G 13 | 14 | jupyter: 15 | enabled: false 16 | 17 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/create_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | aws emr create-cluster \ 4 | --applications Name=Hadoop Name=Spark Name=Ganglia Name=Zeppelin \ 5 | --bootstrap-actions '[{"Path":"s3://parsely-public/jbennet/daskvsspark/reqs/bootstrap.sh","Name":"Dask Bootstrap"}]' \ 6 | --ebs-root-volume-size 20 \ 7 | --ec2-attributes '{"KeyName":"emr_jobs","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-ca9b41bd","EmrManagedSlaveSecurityGroup":"sg-f6a19e93","EmrManagedMasterSecurityGroup":"sg-f7a19e92"}' \ 8 | --service-role EMR_DefaultRole \ 9 | --release-label emr-5.11.1 \ 10 | --log-uri 's3n://parsely-emr-logs/' \ 11 | --name 'IT Testing' \ 12 | --configurations file://./conf.json \ 13 | --instance-groups file://./instances.json \ 14 | --scale-down-behavior
TERMINATE_AT_TASK_COMPLETION \ 15 | --region us-east-1 16 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_code.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd .. 4 | 5 | rsync -azvr \ 6 | --include "README.md" \ 7 | --include "requirements.txt" \ 8 | --include "setup.py" \ 9 | --include "/daskvsspark/" \ 10 | --include "/daskvsspark/*.py" \ 11 | --include "/daskvsspark/aggregate_*_yarn.sh" \ 12 | --exclude "*" \ 13 | ./ hadoop@dvss:/home/hadoop/daskvsspark/ 14 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error, show all commands 4 | set -ex 5 | 6 | S3_PATH="s3://parsely-public/jbennet/daskvsspark/events/" 7 | 8 | # copy fake data to s3 9 | aws s3 sync ../daskvsspark/events/ ${S3_PATH} 10 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_reqs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | S3_PATH="s3://parsely-public/jbennet/daskvsspark/reqs/" 4 | 5 | # copy bootstrap script to s3 6 | aws s3 cp ../deployment/bootstrap.sh ${S3_PATH} 7 | aws s3 cp ../deployment/setup_dvss.sh ${S3_PATH} 8 | 9 | # copy log conf 10 | aws s3 cp ../deployment/log4j.properties ${S3_PATH} 11 | 12 | # copy reqs to s3 13 | aws s3 cp ../requirements.txt ${S3_PATH} 14 | aws s3 cp ../requirements-dask.txt ${S3_PATH} 15 | 16 | # copy jars to s3 17 | aws s3 cp ../scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar ${S3_PATH} 18 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/instances.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "InstanceCount": 1, 4 | "BidPrice": "2.50", 5 | "EbsConfiguration": { 6 | "EbsBlockDeviceConfigs": [ 7 | { 8 | "VolumeSpecification": { 9 | "SizeInGB": 32, 10 | "VolumeType": "gp2" 11 | }, 12 | "VolumesPerInstance": 1 13 | } 14 | ] 15 | }, 16 | "InstanceGroupType": "MASTER", 17 | "InstanceType": "m4.xlarge", 18 | "Name": "Master" 19 | }, 20 | { 21 | "InstanceCount": 2, 22 | "BidPrice": "0.15", 23 | "EbsConfiguration": { 24 | "EbsBlockDeviceConfigs": [ 25 | { 26 | "VolumeSpecification": { 27 | "SizeInGB": 64, 28 | "VolumeType": "gp2" 29 | }, 30 | "VolumesPerInstance": 1 31 | } 32 | ], 33 | "EbsOptimized": true 34 | }, 35 | "InstanceGroupType": "CORE", 36 | "InstanceType": "c4.2xlarge", 37 | "Name": "Core" 38 | } 39 | ] -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/log4j.properties: -------------------------------------------------------------------------------- 1 | # Error level only 2 | log4j.rootLogger=ERROR, console 3 | 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=ERROR 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # Quiet even more verbose loggers 22 | log4j.logger.org.apache.spark.sql.execution.datasources.parquet=ERROR 23 | log4j.logger.org.apache.spark.sql.execution.datasources.FileScanRDD=ERROR 24 | log4j.logger.org.apache.hadoop.io.compress.CodecPool=ERROR 25 | log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator=ERROR 26 | log4j.logger.org.apache.spark.ContextCleaner=ERROR 27 | 28 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 29 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 30 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 31 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/setup_dvss.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This should run on master first, and then on workers. 3 | 4 | # Are we running on a master node? 5 | cat /var/lib/info/instance.json | grep '"isMaster": true' 6 | IS_MASTER=$? 7 | 8 | set -e 9 | 10 | build_egg() { 11 | echo "Building the egg" 12 | cd /home/hadoop/daskvsspark/ 13 | python3 setup.py bdist_egg 14 | cp ./dist/*.egg /home/hadoop/reqs/ 15 | echo "Uploading the egg to s3" 16 | aws s3 cp ~/reqs/daskvsspark-0.1-py3.6.egg s3://parsely-public/jbennet/daskvsspark/reqs/ 17 | } 18 | 19 | package_env() { 20 | #echo "Installing daskvsspark into master's venv" 21 | #cd /home/hadoop/daskvsspark 22 | #~/conda/envs/dvss/bin/python setup.py install -q 23 | 24 | if [[ -f ~/reqs/dvss.zip ]]; then 25 | rm ~/reqs/dvss.zip 26 | fi 27 | 28 | echo "Zipping up venv" 29 | cd ~/conda/envs 30 | zip -qr dvss.zip dvss 31 | mv dvss.zip ~/reqs/ 32 | echo "Uploading zip to s3" 33 | aws s3 cp ~/reqs/dvss.zip s3://parsely-public/jbennet/daskvsspark/reqs/ 34 | } 35 | 36 | download_env() { 37 | echo "Downloading venv and egg" 38 | aws s3 cp s3://parsely-public/jbennet/daskvsspark/reqs/daskvsspark-0.1-py3.6.egg /home/hadoop/reqs/ 39 | aws s3 cp s3://parsely-public/jbennet/daskvsspark/reqs/dvss.zip /home/hadoop/reqs/ 40 | } 41 | 42 | if [[ $IS_MASTER -eq 0 ]]; then 43 | build_egg 44 | package_env 45 | else 46 | download_env 47 | fi -------------------------------------------------------------------------------- /2018/daskvsspark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.1" 2 | 3 | services: 4 | scheduler: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | args: 9 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 10 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 11 | AWS_DEFAULT_REGION: $AWS_DEFAULT_REGION 12 | image: jbennet/dvss-base 13 | hostname: dvss-scheduler 14 | ports: 15 | - "8786:8786" 16 | - "8787:8787" 17 | command: ["dask-scheduler"] 18 | 19 | worker: 20 | build: 21 | context: .
22 | dockerfile: Dockerfile 23 | args: 24 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 25 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 26 | AWS_DEFAULT_REGION: $AWS_DEFAULT_REGION 27 | image: jbennet/dvss-base 28 | hostname: dvss-worker 29 | command: ["dask-worker", "scheduler:8786", "--nprocs", "4", "--nthreads", "1"] 30 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | name: dvss 2 | fullname: dvss 3 | version: 1.0.0 4 | appVersion: 0.0.1 5 | description: A Helm chart for dvss 6 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name | upper }}, released at name: {{ .Release.Name }}. 2 | 3 | To learn more about the release, try: 4 | 5 | $ helm status {{ .Release.Name }} # information about running pods and this message 6 | $ helm get {{ .Release.Name }} # get full Kubernetes specification 7 | 8 | This release includes a Dask scheduler and {{ .Values.worker.replicas }} Dask worker(s), each 9 | with {{ .Values.worker.default_resources.cpu }} cores and {{ .Values.worker.default_resources.memory }} of memory. 10 | 11 | The Dask scheduler exposes external services to connect directly to the Dask cluster. You can get 12 | these addresses by running the following: 13 | 14 | {{- if contains "LoadBalancer" .Values.scheduler.serviceType }} 15 | export DASK_SCHEDULER=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') 16 | export DASK_SCHEDULER_UI_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') 17 | echo http://$DASK_SCHEDULER_UI_IP:{{ .Values.webUI.servicePort }} -- Dask dashboard 18 | echo http://$DASK_SCHEDULER:{{ .Values.scheduler.servicePort }} -- Dask Client connection 19 | {{- else if contains "NodePort" .Values.scheduler.serviceType }} 20 | export DASK_SCHEDULER_PORT=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.spec.ports[0].nodePort}') 21 | export DASK_SCHEDULER_UI_PORT=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" .
}}-scheduler -o jsonpath='{.spec.ports[1].nodePort}') 22 | echo http://localhost:$DASK_SCHEDULER_UI_PORT -- Dask dashboard 23 | echo http://localhost:$DASK_SCHEDULER_PORT -- Dask Client connection 24 | {{- end }} -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "dvss-helm.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "dvss-helm.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "dvss-helm.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/scheduler-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-scheduler 5 | labels: 6 | app: {{ template "dvss-helm.name" . }} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: scheduler 11 | spec: 12 | replicas: {{ .Values.scheduler.replicas }} 13 | selector: 14 | matchLabels: 15 | app: {{ template "dvss-helm.name" . }} 16 | release: {{ .Release.Name | quote }} 17 | component: scheduler 18 | strategy: 19 | type: RollingUpdate 20 | template: 21 | metadata: 22 | labels: 23 | app: {{ template "dvss-helm.name" . }} 24 | release: {{ .Release.Name | quote }} 25 | component: scheduler 26 | spec: 27 | containers: 28 | - name: {{ template "dvss-helm.fullname" . }}-scheduler 29 | image: "{{ .Values.scheduler.image.repository }}:{{ .Values.scheduler.image.tag }}" 30 | imagePullPolicy: {{ .Values.scheduler.image.pullPolicy }} 31 | args: 32 | - dask-scheduler 33 | - --port 34 | - "{{ .Values.scheduler.servicePort }}" 35 | - --bokeh-port 36 | - "8787" 37 | ports: 38 | - containerPort: 8786 39 | - containerPort: 8787 40 | resources: 41 | {{ toYaml .Values.scheduler.resources | indent 12 }} 42 | env: 43 | {{ toYaml .Values.scheduler.env | indent 12 }} 44 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/scheduler-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-scheduler 5 | labels: 6 | app: {{ template "dvss-helm.name" . 
}} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: scheduler 11 | spec: 12 | ports: 13 | - name: {{ template "dvss-helm.fullname" . }}-scheduler 14 | port: {{ .Values.scheduler.servicePort }} 15 | targetPort: 8786 16 | nodePort: {{ .Values.scheduler.nodePort }} 17 | - name: {{ template "dvss-helm.fullname" . }}-webui 18 | port: {{ .Values.webUI.servicePort }} 19 | targetPort: 8787 20 | nodePort: {{ .Values.webUI.nodePort }} 21 | selector: 22 | app: {{ template "dvss-helm.name" . }} 23 | release: {{ .Release.Name | quote }} 24 | component: scheduler 25 | type: {{ .Values.scheduler.serviceType }} 26 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/worker-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-worker 5 | labels: 6 | app: {{ template "dvss-helm.name" . }} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: worker 11 | spec: 12 | replicas: {{ .Values.worker.replicas }} 13 | selector: 14 | matchLabels: 15 | app: {{ template "dvss-helm.name" . }} 16 | release: {{ .Release.Name | quote }} 17 | component: worker 18 | strategy: 19 | type: RollingUpdate 20 | template: 21 | metadata: 22 | labels: 23 | app: {{ template "dvss-helm.name" . }} 24 | release: {{ .Release.Name | quote }} 25 | component: worker 26 | spec: 27 | containers: 28 | - name: {{ template "dvss-helm.fullname" . }}-worker 29 | image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}" 30 | imagePullPolicy: {{ .Values.worker.image.pullPolicy }} 31 | args: 32 | - dask-worker 33 | {{- if .Values.worker.resources }} 34 | - {{ template "dvss-helm.fullname" . }}-scheduler:{{ .Values.scheduler.servicePort }} 35 | - --nthreads 36 | - {{ default .Values.worker.resources.limits.cpu .Values.worker.default_resources.cpu | quote }} 37 | - --memory-limit 38 | - {{ default .Values.worker.resources.limits.memory .Values.worker.default_resources.memory | quote }} 39 | {{- else if .Values.worker.default_resources }} 40 | - {{ template "dvss-helm.fullname" . 
}}-scheduler:{{ .Values.scheduler.servicePort }} 41 | - --nthreads 42 | - {{ .Values.worker.default_resources.cpu | quote }} 43 | - --memory-limit 44 | - {{ .Values.worker.default_resources.memory | quote }} 45 | {{- end }} 46 | - --no-bokeh 47 | ports: 48 | - containerPort: 8789 49 | resources: 50 | {{ toYaml .Values.worker.resources | indent 12 }} 51 | env: 52 | {{ toYaml .Values.worker.env | indent 12 }} 53 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/values.yaml: -------------------------------------------------------------------------------- 1 | nameOverride: dvss 2 | fullnameOverride: dvss 3 | 4 | scheduler: 5 | name: scheduler 6 | image: 7 | repository: "jbennet/dvss-base" 8 | tag: "latest" 9 | pullPolicy: IfNotPresent 10 | replicas: 1 11 | serviceType: "NodePort" 12 | servicePort: 8786 13 | nodePort: 30786 14 | resources: 15 | limits: 16 | cpu: 1 17 | memory: 1G 18 | requests: 19 | cpu: 1 20 | memory: 1G 21 | 22 | webUI: 23 | name: webui 24 | servicePort: 80 25 | nodePort: 30787 26 | 27 | worker: 28 | name: worker 29 | image: 30 | repository: "jbennet/dvss-base" 31 | tag: "latest" 32 | pullPolicy: IfNotPresent 33 | replicas: 1 34 | aptPackages: >- 35 | default_resources: 36 | cpu: 4 37 | memory: 12GiB 38 | resources: {} 39 | # limits: 40 | # cpu: 1 41 | # memory: 3G 42 | # requests: 43 | # cpu: 1 44 | # memory: 3G 45 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements-dask.txt: -------------------------------------------------------------------------------- 1 | dask==0.17.2 2 | distributed==1.21.6 3 | bokeh==0.12.15 # for UI 4 | fastparquet==0.1.5 # for parquet 5 | python-snappy==0.5.2 # for snappy compression in parquet 6 | knit==0.2.4 7 | s3fs==0.1.4 8 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ipython==6.3.1 2 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements.txt: -------------------------------------------------------------------------------- 1 | pytz==2017.2 2 | simplejson>=3.11.1 3 | -------------------------------------------------------------------------------- /2018/daskvsspark/samples/agg1hour.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "http://a.com/articles/1|a.com|2017-09-15T01:01", 3 | "_index": "events", 4 | "customer": "a.com", 5 | "url": "http://a.com/articles/1", 6 | "freq": "1hour", 7 | "ts": "2017-09-15T01:01:00", 8 | "metrics": { 9 | "page_views": 3, 10 | "visitors": 3 11 | }, 12 | "referrers": { 13 | "http://google.com/": 1, 14 | "http://bing.com/": 1, 15 | "http://facebook.com/": 1 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/README.md: -------------------------------------------------------------------------------- 1 | # Scala UDAFs for Pyspark 2 | 3 | ## Huh? 4 | 5 | Spark has lots and lots of wonderful aggregations! But sometimes, standard 6 | aggregations (`min`, `max`, `avg` etc.) are not enough. 
For example, what if 7 | I have a table like this: 8 | 9 | | url | referrer | 10 | |:---------------------:|:-----------------:| 11 | | http://a.com/article1 | http://google.com | 12 | | http://a.com/article2 | http://google.com | 13 | | http://a.com/article2 | http://yahoo.com | 14 | 15 | 16 | and I want to group things by `url`, count `referrer` values in each group, and put 17 | those counts in a dict: 18 | 19 | ```sql 20 | select url, count_values(referrer) as referrers 21 | from df 22 | group by url 23 | ``` 24 | 25 | like this: 26 | 27 | | url | referrers | 28 | |:---------------------:|:-----------------------------------------------:| 29 | | http://a.com/article1 | {"http://google.com": 1} | 30 | | http://a.com/article2 | {"http://google.com": 1, "http://yahoo.com": 1} | 31 | 32 | There's one little problem. PySpark doesn't support UDAFs written in Python: 33 | 34 | https://issues.apache.org/jira/browse/SPARK-10915 35 | 36 | ## So I'm screwed? 37 | 38 | Not quite. It is possible to write a UDAF in Scala and call it from Python. 39 | 40 | ## You lost me at Scala. 41 | 42 | It's not so bad. Besides, I already wrote it. See the code in "udafs.scala". 43 | 44 | ## How do I build this? 45 | 46 | ``` 47 | $ cd scala 48 | $ sbt compile 49 | $ sbt package 50 | ``` 51 | 52 | If you have problems finding dependencies when you compile, try deleting the `~/.ivy2/` cache. 53 | 54 | ## How do I use this? 55 | 56 | Note: this will only work with Spark 2.1.0 and up. 57 | 58 | * Start PySpark with the jar on the classpath. 59 | * Get an instance of the class using the `sc._jvm` object. 60 | * Register it as a UDF to use in Spark SQL. 61 | * Or wrap it in a Python function to use in aggregations. 62 | 63 | ## Show me. 64 | 65 | ``` 66 | --- daskvsspark/scala $ ipyspark --driver-class-path target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar 67 | Using Python version 3.6.5 (default, Apr 2 2018 14:34:27) 68 | SparkSession available as 'spark'. 69 | 70 | In [1]: df = sqlContext.createDataFrame([('url1', 'ref1'), ('url2', 'ref1'), ('url2', 'ref2')], ['url', 'referrer']) 71 | 72 | In [2]: agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter() 73 | 74 | In [4]: sqlContext.sparkSession._jsparkSession.udf().register('count_values', agg_counter) 75 | Out[4]: JavaObject id=o45 76 | 77 | In [5]: df.createOrReplaceTempView('df') 78 | 79 | In [6]: sqlContext.sql('select url, count_values(referrer) as referrers from df group by url').show() 80 | +----+--------------------+ 81 | | url| referrers| 82 | +----+--------------------+ 83 | |url1| [ref1 -> 1]| 84 | |url2|[ref1 -> 1, ref2 ...| 85 | +----+--------------------+ 86 | ``` 87 | 88 | or: 89 | 90 | ``` 91 | In [7]: from pyspark.sql.column import Column, _to_java_column, _to_seq 92 | 93 | In [11]: def count_values(col): 94 | ...: counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter().apply 95 | ...: return Column(counter(_to_seq(sc, [col], _to_java_column))) 96 | ...: 97 | ...: 98 | 99 | In [12]: df.groupBy("url").agg(count_values("referrer").alias("referrer")).show() 100 | +----+--------------------+ 101 | | url| referrer| 102 | +----+--------------------+ 103 | |url1| [ref1 -> 1]| 104 | |url2|[ref1 -> 1, ref2 ...| 105 | +----+--------------------+ 106 | ``` 107 | 108 | You're welcome.
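## Can I use this from a script, too?

Yes. Here is a minimal sketch of a standalone driver, stitching together the snippets above. The app name is made up, and the jar path assumes you ran `sbt package` in the `scala/` directory; with `spark-submit`, passing the jar via `--driver-class-path` works as well.

```
# Sketch of a standalone PySpark driver that uses the Scala UDAF.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("count-values-demo")  # hypothetical app name
    # assumed location of the jar built by `sbt package`
    .config("spark.driver.extraClassPath",
            "scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar")
    .getOrCreate()
)
sc = spark.sparkContext

df = spark.createDataFrame(
    [("url1", "ref1"), ("url2", "ref1"), ("url2", "ref2")],
    ["url", "referrer"])

# Register the Scala UDAF under a name callable from Spark SQL.
agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter()
spark._jsparkSession.udf().register("count_values", agg_counter)

df.createOrReplaceTempView("df")
spark.sql("select url, count_values(referrer) as referrers "
          "from df group by url").show()
```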
-------------------------------------------------------------------------------- /2018/daskvsspark/scala/build.sbt: -------------------------------------------------------------------------------- 1 | name := "daskvsspark-udafs" 2 | version := "0.0.1" 3 | scalaVersion := "2.11.8" 4 | libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0" 5 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.0" 6 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "2.0.0" % "test" 7 | libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "2.0.0_0.4.4" % "test" 8 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/src/main/scala/com/jbennet/daskvsspark/udafs.scala: -------------------------------------------------------------------------------- 1 | package com.jbennet.daskvsspark.udafs 2 | 3 | import org.apache.spark.sql.expressions.MutableAggregationBuffer 4 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types._ 7 | 8 | 9 | /** 10 | * Aggregate Counter. Counts values and returns a Map with "value" -> count. 11 | */ 12 | class AggregateCounter extends UserDefinedAggregateFunction { 13 | 14 | // These are the input fields for your aggregate function. 15 | override def inputSchema: org.apache.spark.sql.types.StructType = 16 | StructType( 17 | StructField("value", StringType) :: Nil 18 | ) 19 | 20 | // These are the internal fields you keep for computing your aggregate. 21 | override def bufferSchema: StructType = 22 | StructType( 23 | StructField("counter", MapType(StringType, IntegerType)) :: Nil 24 | ) 25 | 26 | // This is the output type of your aggregation function. 27 | override def dataType: DataType = MapType(StringType, IntegerType) 28 | 29 | override def deterministic: Boolean = true 30 | 31 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 32 | buffer(0) = null 33 | } 34 | 35 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 36 | val map = buffer.getAs[Map[String, Integer]](0) 37 | val value = input.getAs[String](0) 38 | buffer(0) = addValue(map, value) 39 | } 40 | 41 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 42 | val m1 = buffer1.getAs[Map[String, Integer]](0) 43 | val m2 = buffer2.getAs[Map[String, Integer]](0) 44 | buffer1(0) = mergeMap(m1, m2) 45 | } 46 | 47 | override def evaluate(buffer: Row): Any = { 48 | buffer.getMap(0) 49 | } 50 | 51 | /** Add value to map. 52 | * 53 | * @param map map of counts so far (may be null) 54 | * @param value value to count (may be null) 55 | * @return updated map 56 | */ 57 | def addValue(map: Map[String, Integer], value: String): Map[String, Integer] = (map, value) match { 58 | case (null, null) => Map() 59 | case (null, v) => Map(v -> 1) 60 | case (m, null) => m 61 | case _ => 62 | val zero: Integer = 0 63 | if (map.contains(value)) map + (value -> (map.getOrElse(value, zero) + 1)) 64 | else map + (value -> 1) 65 | } 66 | 67 | /** Add two maps into one.
68 | * 69 | * @param a first map to merge 70 | * @param b second map to merge 71 | * @return merged map 72 | */ 73 | def mergeMap(a: Map[String, Integer], b: Map[String, Integer]): Map[String, Integer] = (a, b) match { 74 | case (null, null) => null 75 | case (null, y) => y 76 | case (x, null) => x 77 | case _ => 78 | val zero: Integer = 0 79 | (a.keySet ++ b.keySet).map( 80 | k => { 81 | val v1: Integer = a.getOrElse(k, 0) 82 | val v2: Integer = b.getOrElse(k, 0) 83 | k -> (v1 + v2:Integer) 84 | } 85 | ).toMap 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/src/test/scala/com/jbennet/daskvsspark/AggregateCounterTest.scala: -------------------------------------------------------------------------------- 1 | package com.jbennet.daskvsspark 2 | 3 | import com.holdenkarau.spark.testing._ 4 | import com.jbennet.daskvsspark.udafs.AggregateCounter 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.FunSuite 9 | 10 | 11 | /** 12 | * Tests for AggregateCounter 13 | */ 14 | class AggregateCounterTest extends FunSuite with DataFrameSuiteBase { 15 | 16 | private val schema = StructType(Array( 17 | StructField("url", StringType), 18 | StructField("referrer", StringType) 19 | )) 20 | 21 | private val aggcount = new AggregateCounter 22 | 23 | test("different keys should combine") { 24 | val data = Array( 25 | Row("url1", "ref2"), 26 | Row("url1", "ref1") 27 | ) 28 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 29 | val rows = df.groupBy("url") 30 | .agg(aggcount(col("referrer"))) 31 | .collect() 32 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 33 | assert(agg1.size == 2) 34 | assert(agg1 == Map("ref1" -> (1:Integer), "ref2" -> (1:Integer))) 35 | } 36 | 37 | test("same keys should add") { 38 | val data = Array( 39 | Row("url1", "ref1"), 40 | Row("url1", "ref1") 41 | ) 42 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 43 | val rows = df.groupBy("url") 44 | .agg(aggcount(col("referrer"))) 45 | .collect() 46 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 47 | assert(agg1.size == 1) 48 | assert(agg1 == Map("ref1" -> (2:Integer))) 49 | } 50 | 51 | test("null keys do not count") { 52 | val data = Array( 53 | Row("url1", null), 54 | Row("url1", "ref1") 55 | ) 56 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 57 | val rows = df.groupBy("url") 58 | .agg(aggcount(col("referrer"))) 59 | .collect() 60 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 61 | assert(agg1.size == 1) 62 | assert(agg1 == Map("ref1" -> (1:Integer))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /2018/daskvsspark/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup(name='daskvsspark', 5 | version='0.1', 6 | description='Dask and Spark example', 7 | author='Irina Truong', 8 | author_email='irinatruong@gmail.com', 9 | packages=find_packages() 10 | ) 11 | -------------------------------------------------------------------------------- /2018/daskvsspark/trials/Aggregate without index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 34, 6 | "metadata": {}, 7
| "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import dask.dataframe as dd\n", 12 | "import itertools as it\n", 13 | "from pprint import pprint\n", 14 | "\n", 15 | "\n", 16 | "pd.set_option('display.expand_frame_repr', False)\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 14, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " year name subject grade pass\n", 83 | "0 1 Mary Smith Math 4 1\n", 84 | "1 1 Mary Smith Computer Science 5 1\n", 85 | "2 1 Mary Smith English Literature 2 0" 86 | ] 87 | }, 88 | "execution_count": 14, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "pdf = pd.DataFrame.from_records([\n", 95 | " (1, \"Mary Smith\", \"Math\", 4, 1),\n", 96 | " (1, \"Mary Smith\", \"Computer Science\", 5, 1),\n", 97 | " (1, \"Mary Smith\", \"English Literature\", 2, 0),\n", 98 | " (2, \"Mary Smith\", \"Math\", 4, 1),\n", 99 | " (2, \"Mary Smith\", \"Computer Science\", 5, 1),\n", 100 | " (2, \"Mary Smith\", \"English Literature\", 4, 1),\n", 101 | " (1, \"John Brown\", \"Math\", 1, 0),\n", 102 | " (1, \"John Brown\", \"Computer Science\", 4, 1),\n", 103 | " (1, \"John Brown\", \"English Literature\", 5, 1),\n", 104 | " (2, \"John Brown\", \"Math\", 4, 1),\n", 105 | " (2, \"John Brown\", \"Computer Science\", 3, 0),\n", 106 | " (2, \"John Brown\", \"English Literature\", 5, 1),\n", 107 | "],\n", 108 | "columns=['year', 'name', 'subject', 'grade', 'pass'])\n", 109 | "\n", 110 | "df = dd.from_pandas(pdf, 2)\n", 111 | "\n", 112 | "df.head(3)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 35, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "collect_list = dd.Aggregation(\n", 122 | " 'collect_list',\n", 123 | " lambda s: s.apply(list),\n", 124 | " lambda s: s.apply(lambda chunks: list(it.chain.from_iterable(chunks))),\n", 125 | ")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 37, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " grade pass\n", 210 | " grades mean_grade passes\n", 211 | "year name \n", 212 | "1 John Brown [1, 4, 5] 3.333333 2\n", 213 | " Mary Smith [4, 5, 2] 3.666667 2\n", 214 | "2 John Brown [4, 3, 5] 4.000000 2\n", 215 | " Mary Smith [4, 5, 4] 4.333333 3" 216 | ] 217 | }, 218 | "execution_count": 37, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "ag = df.groupby(['year', 'name']).agg({\n", 225 | " 'grade': {'mean_grade': np.mean,\n", 226 | " 'grades': collect_list},\n", 227 | " 'pass': {'passes': 'sum'}\n", 228 | "})\n", 229 | "\n", 230 | "ag.compute()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 38, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "MultiIndex(levels=[[1, 2], [u'John Brown', u'Mary Smith']],\n", 242 | " labels=[[0, 0, 1, 1], [0, 1, 0, 1]],\n", 243 | " names=[u'year', u'name'])" 244 | ] 245 | }, 246 | "execution_count": 38, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "ag.index.compute()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 39, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "MultiIndex(levels=[[u'grade', u'pass'], [u'grades', u'mean_grade', u'passes']],\n", 264 | " labels=[[0, 0, 1], [0, 1, 2]])" 265 | ] 266 | }, 267 | "execution_count": 39, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "ag.columns" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 47, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/html": [ 284 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 351 | "
" 352 | ], 353 | "text/plain": [ 354 | " year name grade pass\n", 355 | " grades mean_grade passes\n", 356 | "0 1 John Brown [1, 4, 5] 3.333333 2\n", 357 | "1 1 Mary Smith [4, 5, 2] 3.666667 2\n", 358 | "2 2 John Brown [4, 3, 5] 4.000000 2\n", 359 | "3 2 Mary Smith [4, 5, 4] 4.333333 3" 360 | ] 361 | }, 362 | "execution_count": 47, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "ri = ag.reset_index()\n", 369 | "ri.compute()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 48, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "RangeIndex(start=0, stop=4, step=1)" 381 | ] 382 | }, 383 | "execution_count": 48, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "ri.index.compute()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 49, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "MultiIndex(levels=[[u'grade', u'pass', u'name', u'year'], [u'grades', u'mean_grade', u'passes', u'']],\n", 401 | " labels=[[3, 2, 0, 0, 1], [3, 3, 0, 1, 2]])" 402 | ] 403 | }, 404 | "execution_count": 49, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "ri.columns" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 50, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "pandas.core.indexes.multi.MultiIndex" 422 | ] 423 | }, 424 | "execution_count": 50, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "ri.columns.__class__" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 51, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "Index([u'year', u'name', u'grade', u'grade', u'pass'], dtype='object')" 442 | ] 443 | }, 444 | "execution_count": 51, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "ri.columns.get_level_values(0)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 52, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "Index([u'', u'', u'grades', u'mean_grade', u'passes'], dtype='object')" 462 | ] 463 | }, 464 | "execution_count": 52, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "ri.columns.get_level_values(1)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 55, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/html": [ 481 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 541 | "
" 542 | ], 543 | "text/plain": [ 544 | " year name grades mean_grade passes\n", 545 | "0 1 John Brown [1, 4, 5] 3.333333 2\n", 546 | "1 1 Mary Smith [4, 5, 2] 3.666667 2\n", 547 | "2 2 John Brown [4, 3, 5] 4.000000 2\n", 548 | "3 2 Mary Smith [4, 5, 4] 4.333333 3" 549 | ] 550 | }, 551 | "execution_count": 55, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "ri.columns = ['year', 'name', 'grades', 'mean_grade', 'passes']\n", 558 | "ri.compute()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 57, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "['to_bag',\n", 570 | " 'to_csv',\n", 571 | " 'to_delayed',\n", 572 | " 'to_hdf',\n", 573 | " 'to_html',\n", 574 | " 'to_parquet',\n", 575 | " 'to_records',\n", 576 | " 'to_string',\n", 577 | " 'to_timestamp']" 578 | ] 579 | }, 580 | "execution_count": 57, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [ 586 | "[_ for _ in dir(ri) if _.startswith('to')]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 60, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 597 | "rec.array([(0, 1, 'John Brown', list([1, 4, 5]), 3.33333333, 2),\n", 598 | " (1, 1, 'Mary Smith', list([4, 5, 2]), 3.66666667, 2),\n", 599 | " (2, 2, 'John Brown', list([4, 3, 5]), 4. , 2),\n", 600 | " (3, 2, 'Mary Smith', list([4, 5, 4]), 4.33333333, 3)],\n", 601 | " dtype=[(u'index', '\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 68 | "" 69 | ], 70 | "text/plain": [ 71 | " date request \\\n", 72 | "0 31/Aug/2015:23:49:01 +0000 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 73 | "\n", 74 | " referrer user_agent \n", 75 | "0 https://foo.com/some/url Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 76 | ] 77 | }, 78 | "execution_count": 56, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "log_lines = [\"/logger/ || 70.123.102.76 || - || 31/Aug/2015:23:49:01 +0000 || GET /logger/?action-view&site_id=123 HTTP/1.1 || 200 || 236 || https://foo.com/some/url || Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36 || - || - || - || 0.000\"]\n", 85 | "data = [extract_fields(l) for l in log_lines]\n", 86 | "df = pd.DataFrame(data)\n", 87 | "df.columns = ['date', 'request', 'referrer', 'user_agent']\n", 88 | "df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 57, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "0 31/Aug/2015:23:49:01 +0000\n", 100 | "Name: date, dtype: object" 101 | ] 102 | }, 103 | "execution_count": 57, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df.date" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 58, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " date request \\\n", 158 | "0 2015-08-31 23:49:01+00:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 159 | "\n", 160 | " referrer user_agent \n", 161 | "0 https://foo.com/some/url Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 162 | ] 163 | }, 164 | "execution_count": 58, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "df['date'] = pd.to_datetime(df['date'], format='%d/%b/%Y:%H:%M:%S +0000', utc=True)\n", 171 | "df" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 59, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/html": [ 182 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " request \\\n", 224 | "date \n", 225 | "2015-08-31 23:49:01+00:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 226 | "\n", 227 | " referrer \\\n", 228 | "date \n", 229 | "2015-08-31 23:49:01+00:00 https://foo.com/some/url \n", 230 | "\n", 231 | " user_agent \n", 232 | "date \n", 233 | "2015-08-31 23:49:01+00:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 234 | ] 235 | }, 236 | "execution_count": 59, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df.set_index('date', inplace=True)\n", 243 | "df" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 60, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "DatetimeIndex(['2015-08-31 23:49:01+00:00'], dtype='datetime64[ns, UTC]', name=u'date', freq=None)" 255 | ] 256 | }, 257 | "execution_count": 60, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "df.index" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 61, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 312 | "
" 313 | ], 314 | "text/plain": [ 315 | " request \\\n", 316 | "date \n", 317 | "2015-08-31 16:49:01-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 318 | "\n", 319 | " referrer \\\n", 320 | "date \n", 321 | "2015-08-31 16:49:01-07:00 https://foo.com/some/url \n", 322 | "\n", 323 | " user_agent \n", 324 | "date \n", 325 | "2015-08-31 16:49:01-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 326 | ] 327 | }, 328 | "execution_count": 61, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df.index = df.index.tz_convert('America/Los_Angeles')\n", 335 | "df" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 71, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | " request \\\n", 388 | "date \n", 389 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 390 | "\n", 391 | " referrer \\\n", 392 | "date \n", 393 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 394 | "\n", 395 | " user_agent \n", 396 | "date \n", 397 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 398 | ] 399 | }, 400 | "execution_count": 71, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "df.index = df.index.floor('1H')\n", 407 | "df" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 82, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 456 | "
" 457 | ], 458 | "text/plain": [ 459 | " referrer \\\n", 460 | "date request \n", 461 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 1 \n", 462 | "\n", 463 | " user_agent \n", 464 | "date request \n", 465 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 1 " 466 | ] 467 | }, 468 | "execution_count": 82, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "df.groupby(['date', 'request']).count()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 84, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "datetime.datetime(2017, 12, 27, 15, 17, 35, 311839)" 486 | ] 487 | }, 488 | "execution_count": 84, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "now = dt.datetime.now()\n", 495 | "now" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 124, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "datetime.datetime(2018, 1, 26, 15, 17, 35, 311839)" 507 | ] 508 | }, 509 | "execution_count": 124, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "next_month = now + dt.timedelta(days=30)\n", 516 | "next_month" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 102, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/html": [ 527 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 568 | "
" 569 | ], 570 | "text/plain": [ 571 | " request \\\n", 572 | "date \n", 573 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 574 | "\n", 575 | " referrer \\\n", 576 | "date \n", 577 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 578 | "\n", 579 | " user_agent \\\n", 580 | "date \n", 581 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... \n", 582 | "\n", 583 | " week_start \n", 584 | "date \n", 585 | "2015-08-31 16:00:00-07:00 2015-08-31 " 586 | ] 587 | }, 588 | "execution_count": 102, 589 | "metadata": {}, 590 | "output_type": "execute_result" 591 | } 592 | ], 593 | "source": [ 594 | "df['week_start'] = df.index.to_period('W').start_time\n", 595 | "df" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 120, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/html": [ 606 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 650 | "
" 651 | ], 652 | "text/plain": [ 653 | " request \\\n", 654 | "date \n", 655 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 656 | "\n", 657 | " referrer \\\n", 658 | "date \n", 659 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 660 | "\n", 661 | " user_agent \\\n", 662 | "date \n", 663 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... \n", 664 | "\n", 665 | " week_start next_week_start \n", 666 | "date \n", 667 | "2015-08-31 16:00:00-07:00 2015-08-31 2015-09-07 " 668 | ] 669 | }, 670 | "execution_count": 120, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "df['next_week_start'] = df['week_start'] + pd.DateOffset(weeks=1)\n", 677 | "df" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 121, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "DatetimeIndex(['2017-12-27 15:17:35.311839', '2017-12-28 15:17:35.311839',\n", 689 | " '2017-12-29 15:17:35.311839', '2017-12-30 15:17:35.311839',\n", 690 | " '2017-12-31 15:17:35.311839', '2018-01-01 15:17:35.311839',\n", 691 | " '2018-01-02 15:17:35.311839', '2018-01-03 15:17:35.311839',\n", 692 | " '2018-01-04 15:17:35.311839', '2018-01-05 15:17:35.311839'],\n", 693 | " dtype='datetime64[ns]', freq='D')" 694 | ] 695 | }, 696 | "execution_count": 121, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "pd.date_range(now, periods=10, freq='D')" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 125, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "text/plain": [ 713 | "DatetimeIndex(['2017-12-31 15:17:35.311839', '2018-01-07 15:17:35.311839',\n", 714 | " '2018-01-14 15:17:35.311839', '2018-01-21 15:17:35.311839'],\n", 715 | " dtype='datetime64[ns]', freq='W-SUN')" 716 | ] 717 | }, 718 | "execution_count": 125, 719 | "metadata": {}, 720 | "output_type": "execute_result" 721 | } 722 | ], 723 | "source": [ 724 | "pd.date_range(now, next_month, freq='W')" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [] 733 | } 734 | ], 735 | "metadata": { 736 | "kernelspec": { 737 | "display_name": "Python 2", 738 | "language": "python", 739 | "name": "python2" 740 | }, 741 | "language_info": { 742 | "codemirror_mode": { 743 | "name": "ipython", 744 | "version": 2 745 | }, 746 | "file_extension": ".py", 747 | "mimetype": "text/x-python", 748 | "name": "python", 749 | "nbconvert_exporter": "python", 750 | "pygments_lexer": "ipython2", 751 | "version": "2.7.13" 752 | } 753 | }, 754 | "nbformat": 4, 755 | "nbformat_minor": 2 756 | } 757 | -------------------------------------------------------------------------------- /2018/datetimes/requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==0.10.0 2 | Babel==2.4.0 3 | maya==0.3.2 4 | pendulum==1.2.4 5 | pytz==2017.2 6 | tzlocal==1.4 7 | -------------------------------------------------------------------------------- /2018/sqlpandas/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/README.md -------------------------------------------------------------------------------- /2018/sqlpandas/data-hp/houses.csv: 
-------------------------------------------------------------------------------- 1 | code,name,headmaster 2 | G,Gryffindor,Minerva McGonagall 3 | S,Slytherin,Severus Snape 4 | R,Ravenclaw,Filius Flitwick 5 | H,Hufflepuff,Pomona Sprout -------------------------------------------------------------------------------- /2018/sqlpandas/data-hp/students.csv: -------------------------------------------------------------------------------- 1 | name,house_code 2 | Harry Potter,G 3 | Ron Weasley,G 4 | Hermione Granger,G 5 | Draco Malfoy,S 6 | Vincent Crabbe,S 7 | Gregory Goyle,S 8 | Luna Lovegood,R 9 | Padma Patil,R 10 | Parvati Patil,R 11 | Hannah Abbott,H 12 | Susan Bones,H 13 | Cedric Diggory,H -------------------------------------------------------------------------------- /2018/sqlpandas/data/countries.csv: -------------------------------------------------------------------------------- 1 | "id","code","name","continent","wikipedia_link","keywords" 2 | 302672,"AD","Andorra","EU","http://en.wikipedia.org/wiki/Andorra", 3 | 302618,"AE","United Arab Emirates","AS","http://en.wikipedia.org/wiki/United_Arab_Emirates","UAE,مطارات في الإمارات العربية المتحدة" 4 | 302619,"AF","Afghanistan","AS","http://en.wikipedia.org/wiki/Afghanistan", 5 | 302722,"AG","Antigua and Barbuda","NA","http://en.wikipedia.org/wiki/Antigua_and_Barbuda", 6 | 302723,"AI","Anguilla","NA","http://en.wikipedia.org/wiki/Anguilla", 7 | 302673,"AL","Albania","EU","http://en.wikipedia.org/wiki/Albania", 8 | 302620,"AM","Armenia","AS","http://en.wikipedia.org/wiki/Armenia", 9 | 302556,"AO","Angola","AF","http://en.wikipedia.org/wiki/Angola", 10 | 302615,"AQ","Antarctica","AN","http://en.wikipedia.org/wiki/Antarctica", 11 | 302789,"AR","Argentina","SA","http://en.wikipedia.org/wiki/Argentina","Aeropuertos de Argentina" 12 | 302763,"AS","American Samoa","OC","http://en.wikipedia.org/wiki/American_Samoa", 13 | 302674,"AT","Austria","EU","http://en.wikipedia.org/wiki/Austria","Flughäfen in Österreich" 14 | 302764,"AU","Australia","OC","http://en.wikipedia.org/wiki/Australia", 15 | 302725,"AW","Aruba","NA","http://en.wikipedia.org/wiki/Aruba", 16 | 302621,"AZ","Azerbaijan","AS","http://en.wikipedia.org/wiki/Azerbaijan", 17 | 302675,"BA","Bosnia and Herzegovina","EU","http://en.wikipedia.org/wiki/Bosnia_and_Herzegovina", 18 | 302726,"BB","Barbados","NA","http://en.wikipedia.org/wiki/Barbados", 19 | 302622,"BD","Bangladesh","AS","http://en.wikipedia.org/wiki/Bangladesh", 20 | 302676,"BE","Belgium","EU","http://en.wikipedia.org/wiki/Belgium","Aéroports de Belgique,Luchthavens van België" 21 | 302557,"BF","Burkina Faso","AF","http://en.wikipedia.org/wiki/Burkina_Faso", 22 | 302677,"BG","Bulgaria","EU","http://en.wikipedia.org/wiki/Bulgaria", 23 | 302623,"BH","Bahrain","AS","http://en.wikipedia.org/wiki/Bahrain","مطارات البحرين" 24 | 302558,"BI","Burundi","AF","http://en.wikipedia.org/wiki/Burundi", 25 | 302559,"BJ","Benin","AF","http://en.wikipedia.org/wiki/Benin", 26 | 302760,"BL","Saint Barthélemy","NA","http://en.wikipedia.org/wiki/Saint_Barthélemy", 27 | 302727,"BM","Bermuda","NA","http://en.wikipedia.org/wiki/Bermuda", 28 | 302624,"BN","Brunei","AS","http://en.wikipedia.org/wiki/Brunei", 29 | 302790,"BO","Bolivia","SA","http://en.wikipedia.org/wiki/Bolivia","Aeropuertos de Bolivia" 30 | 302724,"BQ","Caribbean Netherlands","NA","http://en.wikipedia.org/wiki/Caribbean_Netherlands", 31 | 302791,"BR","Brazil","SA","http://en.wikipedia.org/wiki/Brazil","Brasil, Brasilian" 32 | 302728,"BS","Bahamas","NA","http://en.wikipedia.org/wiki/Bahamas", 
33 | 302625,"BT","Bhutan","AS","http://en.wikipedia.org/wiki/Bhutan", 34 | 302560,"BW","Botswana","AF","http://en.wikipedia.org/wiki/Botswana", 35 | 302678,"BY","Belarus","EU","http://en.wikipedia.org/wiki/Belarus","Belarussian, Беларусь" 36 | 302729,"BZ","Belize","NA","http://en.wikipedia.org/wiki/Belize", 37 | 302730,"CA","Canada","NA","http://en.wikipedia.org/wiki/Canada", 38 | 302626,"CC","Cocos (Keeling) Islands","AS","http://en.wikipedia.org/wiki/Cocos_(Keeling)_Islands", 39 | 302561,"CD","Congo (Kinshasa)","AF","http://en.wikipedia.org/wiki/Congo_(Kinshasa)", 40 | 302562,"CF","Central African Republic","AF","http://en.wikipedia.org/wiki/Central_African_Republic", 41 | 302563,"CG","Congo (Brazzaville)","AF","http://en.wikipedia.org/wiki/Congo_(Brazzaville)", 42 | 302679,"CH","Switzerland","EU","http://en.wikipedia.org/wiki/Switzerland","Aéroports de la Suisse,Flughäfen der Schweiz" 43 | 302564,"CI","Côte d'Ivoire","AF","http://en.wikipedia.org/wiki/Côte_d'Ivoire","Ivory Coast" 44 | 302765,"CK","Cook Islands","OC","http://en.wikipedia.org/wiki/Cook_Islands", 45 | 302792,"CL","Chile","SA","http://en.wikipedia.org/wiki/Chile","Aeropuertos de Chile" 46 | 302565,"CM","Cameroon","AF","http://en.wikipedia.org/wiki/Cameroon", 47 | 302627,"CN","China","AS","http://en.wikipedia.org/wiki/China","中国的机场" 48 | 302793,"CO","Colombia","SA","http://en.wikipedia.org/wiki/Colombia","Aeropuertos de Colombia" 49 | 302731,"CR","Costa Rica","NA","http://en.wikipedia.org/wiki/Costa_Rica","Aeropuertos de Costa Rica" 50 | 302732,"CU","Cuba","NA","http://en.wikipedia.org/wiki/Cuba","Aeropuertos de Cuba" 51 | 302566,"CV","Cape Verde","AF","http://en.wikipedia.org/wiki/Cape_Verde", 52 | 302762,"CW","Curaçao","NA","http://en.wikipedia.org/wiki/Cura%C3%A7ao", 53 | 302628,"CX","Christmas Island","AS","http://en.wikipedia.org/wiki/Christmas_Island", 54 | 302629,"CY","Cyprus","AS","http://en.wikipedia.org/wiki/Cyprus", 55 | 302680,"CZ","Czechia","EU","http://en.wikipedia.org/wiki/Czech_Republic","Letiště České republiky" 56 | 302681,"DE","Germany","EU","http://en.wikipedia.org/wiki/Germany","Flughäfen in Deutschland" 57 | 302567,"DJ","Djibouti","AF","http://en.wikipedia.org/wiki/Djibouti", 58 | 302682,"DK","Denmark","EU","http://en.wikipedia.org/wiki/Denmark","Lufthavnene i Danmark" 59 | 302733,"DM","Dominica","NA","http://en.wikipedia.org/wiki/Dominica", 60 | 302734,"DO","Dominican Republic","NA","http://en.wikipedia.org/wiki/Dominican_Republic", 61 | 302568,"DZ","Algeria","AF","http://en.wikipedia.org/wiki/Algeria","مطارات الجزائر" 62 | 302794,"EC","Ecuador","SA","http://en.wikipedia.org/wiki/Ecuador","Aeropuertos de Ecuador" 63 | 302683,"EE","Estonia","EU","http://en.wikipedia.org/wiki/Estonia", 64 | 302569,"EG","Egypt","AF","http://en.wikipedia.org/wiki/Egypt","مطارات مصر" 65 | 302570,"EH","Western Sahara","AF","http://en.wikipedia.org/wiki/Western_Sahara","Sahrawian, مطارات الصحراء الغربية" 66 | 302571,"ER","Eritrea","AF","http://en.wikipedia.org/wiki/Eritrea", 67 | 302684,"ES","Spain","EU","http://en.wikipedia.org/wiki/Spain","Aeropuertos de España" 68 | 302572,"ET","Ethiopia","AF","http://en.wikipedia.org/wiki/Ethiopia", 69 | 302685,"FI","Finland","EU","http://en.wikipedia.org/wiki/Finland","Lentokentät, Suomen" 70 | 302766,"FJ","Fiji","OC","http://en.wikipedia.org/wiki/Fiji", 71 | 302795,"FK","Falkland Islands","SA","http://en.wikipedia.org/wiki/Falkland_Islands", 72 | 302767,"FM","Micronesia","OC","https://en.wikipedia.org/wiki/Federated_States_of_Micronesia", 73 | 302686,"FO","Faroe 
Islands","EU","http://en.wikipedia.org/wiki/Faroe_Islands", 74 | 302687,"FR","France","EU","http://en.wikipedia.org/wiki/France","Aéroports de France" 75 | 302573,"GA","Gabon","AF","http://en.wikipedia.org/wiki/Gabon", 76 | 302688,"GB","United Kingdom","EU","http://en.wikipedia.org/wiki/United_Kingdom","Great Britain" 77 | 302735,"GD","Grenada","NA","http://en.wikipedia.org/wiki/Grenada", 78 | 302630,"GE","Georgia","AS","http://en.wikipedia.org/wiki/Georgia_(country)", 79 | 302796,"GF","French Guiana","SA","http://en.wikipedia.org/wiki/French_Guiana","French Guyana" 80 | 302689,"GG","Guernsey","EU","http://en.wikipedia.org/wiki/Guernsey", 81 | 302574,"GH","Ghana","AF","http://en.wikipedia.org/wiki/Ghana", 82 | 302690,"GI","Gibraltar","EU","http://en.wikipedia.org/wiki/Gibraltar", 83 | 302736,"GL","Greenland","NA","http://en.wikipedia.org/wiki/Greenland", 84 | 302575,"GM","Gambia","AF","http://en.wikipedia.org/wiki/Gambia", 85 | 302576,"GN","Guinea","AF","http://en.wikipedia.org/wiki/Guinea","Aéroports de la Guinée" 86 | 302737,"GP","Guadeloupe","NA","http://en.wikipedia.org/wiki/Guadeloupe", 87 | 302577,"GQ","Equatorial Guinea","AF","http://en.wikipedia.org/wiki/Equatorial_Guinea", 88 | 302691,"GR","Greece","EU","http://en.wikipedia.org/wiki/Greece","αεροδρόμια στην Ελλάδα" 89 | 302616,"GS","South Georgia and the South Sandwich Islands","AN","http://en.wikipedia.org/wiki/South_Georgia_and_the_South_Sandwich_Islands", 90 | 302738,"GT","Guatemala","NA","http://en.wikipedia.org/wiki/Guatemala","Aeropuertos de Guatemala" 91 | 302768,"GU","Guam","OC","http://en.wikipedia.org/wiki/Guam", 92 | 302578,"GW","Guinea-Bissau","AF","http://en.wikipedia.org/wiki/Guinea-Bissau", 93 | 302797,"GY","Guyana","SA","http://en.wikipedia.org/wiki/Guyana", 94 | 302631,"HK","Hong Kong","AS","http://en.wikipedia.org/wiki/Hong_Kong", 95 | 302739,"HN","Honduras","NA","http://en.wikipedia.org/wiki/Honduras","Aeropuertos de Honduras" 96 | 302692,"HR","Croatia","EU","http://en.wikipedia.org/wiki/Croatia", 97 | 302740,"HT","Haiti","NA","http://en.wikipedia.org/wiki/Haiti","Aéroports de Haïti" 98 | 302693,"HU","Hungary","EU","http://en.wikipedia.org/wiki/Hungary","Repülőterek Magyarország" 99 | 302632,"ID","Indonesia","AS","http://en.wikipedia.org/wiki/Indonesia","Bandara di Indonesia" 100 | 302694,"IE","Ireland","EU","http://en.wikipedia.org/wiki/Ireland","Eire" 101 | 302633,"IL","Israel","AS","http://en.wikipedia.org/wiki/Israel","שדות התעופה של ישראל" 102 | 302695,"IM","Isle of Man","EU","http://en.wikipedia.org/wiki/Isle_of_Man", 103 | 302634,"IN","India","AS","http://en.wikipedia.org/wiki/India", 104 | 302635,"IO","British Indian Ocean Territory","AS","http://en.wikipedia.org/wiki/British_Indian_Ocean_Territory", 105 | 302636,"IQ","Iraq","AS","http://en.wikipedia.org/wiki/Iraq","مطارات العراق" 106 | 302637,"IR","Iran","AS","http://en.wikipedia.org/wiki/Iran","فرودگاه های ایران" 107 | 302696,"IS","Iceland","EU","http://en.wikipedia.org/wiki/Iceland", 108 | 302697,"IT","Italy","EU","http://en.wikipedia.org/wiki/Italy","Aeroporti d'Italia" 109 | 302698,"JE","Jersey","EU","http://en.wikipedia.org/wiki/Jersey", 110 | 302741,"JM","Jamaica","NA","http://en.wikipedia.org/wiki/Jamaica", 111 | 302638,"JO","Jordan","AS","http://en.wikipedia.org/wiki/Jordan","مطارات في الأردن" 112 | 302639,"JP","Japan","AS","http://en.wikipedia.org/wiki/Japan","Nippon, 日本の空港" 113 | 302579,"KE","Kenya","AF","http://en.wikipedia.org/wiki/Kenya", 114 | 302640,"KG","Kyrgyzstan","AS","http://en.wikipedia.org/wiki/Kyrgyzstan", 115 | 
302641,"KH","Cambodia","AS","http://en.wikipedia.org/wiki/Cambodia", 116 | 302769,"KI","Kiribati","OC","http://en.wikipedia.org/wiki/Kiribati", 117 | 302580,"KM","Comoros","AF","http://en.wikipedia.org/wiki/Comoros","جزر القمر" 118 | 302742,"KN","Saint Kitts and Nevis","NA","http://en.wikipedia.org/wiki/Saint_Kitts_and_Nevis", 119 | 302642,"KP","North Korea","AS","http://en.wikipedia.org/wiki/North_Korea", 120 | 302643,"KR","South Korea","AS","http://en.wikipedia.org/wiki/South_Korea","한국의 공항" 121 | 302644,"KW","Kuwait","AS","http://en.wikipedia.org/wiki/Kuwait", 122 | 302743,"KY","Cayman Islands","NA","http://en.wikipedia.org/wiki/Cayman_Islands", 123 | 302645,"KZ","Kazakhstan","AS","http://en.wikipedia.org/wiki/Kazakhstan","Kazakh" 124 | 302646,"LA","Laos","AS","http://en.wikipedia.org/wiki/Laos", 125 | 302647,"LB","Lebanon","AS","http://en.wikipedia.org/wiki/Lebanon","المطارات في لبنان" 126 | 302744,"LC","Saint Lucia","NA","http://en.wikipedia.org/wiki/Saint_Lucia", 127 | 302699,"LI","Liechtenstein","EU","http://en.wikipedia.org/wiki/Liechtenstein", 128 | 302648,"LK","Sri Lanka","AS","http://en.wikipedia.org/wiki/Sri_Lanka", 129 | 302581,"LR","Liberia","AF","http://en.wikipedia.org/wiki/Liberia", 130 | 302582,"LS","Lesotho","AF","http://en.wikipedia.org/wiki/Lesotho", 131 | 302700,"LT","Lithuania","EU","http://en.wikipedia.org/wiki/Lithuania", 132 | 302701,"LU","Luxembourg","EU","http://en.wikipedia.org/wiki/Luxembourg", 133 | 302702,"LV","Latvia","EU","http://en.wikipedia.org/wiki/Latvia", 134 | 302583,"LY","Libya","AF","http://en.wikipedia.org/wiki/Libya","مطارات في ليبيا" 135 | 302584,"MA","Morocco","AF","http://en.wikipedia.org/wiki/Morocco","مطارات المغرب" 136 | 302703,"MC","Monaco","EU","http://en.wikipedia.org/wiki/Monaco", 137 | 302704,"MD","Moldova","EU","http://en.wikipedia.org/wiki/Moldova", 138 | 302705,"ME","Montenegro","EU","http://en.wikipedia.org/wiki/Montenegro", 139 | 302759,"MF","Saint Martin","NA","http://en.wikipedia.org/wiki/Saint_Martin_(France)", 140 | 302585,"MG","Madagascar","AF","http://en.wikipedia.org/wiki/Madagascar", 141 | 302770,"MH","Marshall Islands","OC","http://en.wikipedia.org/wiki/Marshall_Islands", 142 | 302706,"MK","Macedonia","EU","http://en.wikipedia.org/wiki/Macedonia", 143 | 302586,"ML","Mali","AF","http://en.wikipedia.org/wiki/Mali","Aéroports du Mali" 144 | 302649,"MM","Burma","AS","http://en.wikipedia.org/wiki/Burma","Myanmar" 145 | 302650,"MN","Mongolia","AS","http://en.wikipedia.org/wiki/Mongolia", 146 | 302651,"MO","Macau","AS","http://en.wikipedia.org/wiki/Macau","Macao" 147 | 302771,"MP","Northern Mariana Islands","OC","http://en.wikipedia.org/wiki/Northern_Mariana_Islands", 148 | 302745,"MQ","Martinique","NA","http://en.wikipedia.org/wiki/Martinique", 149 | 302587,"MR","Mauritania","AF","http://en.wikipedia.org/wiki/Mauritania","مطارات موريتانيا" 150 | 302746,"MS","Montserrat","NA","http://en.wikipedia.org/wiki/Montserrat", 151 | 302707,"MT","Malta","EU","http://en.wikipedia.org/wiki/Malta", 152 | 302588,"MU","Mauritius","AF","http://en.wikipedia.org/wiki/Mauritius", 153 | 302652,"MV","Maldives","AS","http://en.wikipedia.org/wiki/Maldives", 154 | 302589,"MW","Malawi","AF","http://en.wikipedia.org/wiki/Malawi", 155 | 302747,"MX","Mexico","NA","http://en.wikipedia.org/wiki/Mexico","Aeropuertos de México" 156 | 302653,"MY","Malaysia","AS","http://en.wikipedia.org/wiki/Malaysia","Lapangan Terbang Malaysia" 157 | 302590,"MZ","Mozambique","AF","http://en.wikipedia.org/wiki/Mozambique", 158 | 
302591,"NA","Namibia","AF","http://en.wikipedia.org/wiki/Namibia", 159 | 302772,"NC","New Caledonia","OC","http://en.wikipedia.org/wiki/New_Caledonia", 160 | 302592,"NE","Niger","AF","http://en.wikipedia.org/wiki/Niger", 161 | 302773,"NF","Norfolk Island","OC","http://en.wikipedia.org/wiki/Norfolk_Island", 162 | 302593,"NG","Nigeria","AF","http://en.wikipedia.org/wiki/Nigeria", 163 | 302748,"NI","Nicaragua","NA","http://en.wikipedia.org/wiki/Nicaragua","Aeropuertos de Nicaragua" 164 | 302708,"NL","Netherlands","EU","http://en.wikipedia.org/wiki/Netherlands","Holland,Luchthavens van Nederland" 165 | 302709,"NO","Norway","EU","http://en.wikipedia.org/wiki/Norway","Flyplasser i Norge" 166 | 302654,"NP","Nepal","AS","http://en.wikipedia.org/wiki/Nepal","नेपाल विमानस्थलको" 167 | 302774,"NR","Nauru","OC","http://en.wikipedia.org/wiki/Nauru", 168 | 302775,"NU","Niue","OC","http://en.wikipedia.org/wiki/Niue", 169 | 302776,"NZ","New Zealand","OC","http://en.wikipedia.org/wiki/New_Zealand", 170 | 302655,"OM","Oman","AS","http://en.wikipedia.org/wiki/Oman","مطارات عمان" 171 | 302749,"PA","Panama","NA","http://en.wikipedia.org/wiki/Panama","Aeropuertos de Panamá" 172 | 302798,"PE","Perú","SA","http://en.wikipedia.org/wiki/Perú","Aeropuertos de Perú" 173 | 302777,"PF","French Polynesia","OC","http://en.wikipedia.org/wiki/French_Polynesia", 174 | 302778,"PG","Papua New Guinea","OC","http://en.wikipedia.org/wiki/Papua_New_Guinea", 175 | 302656,"PH","Philippines","AS","http://en.wikipedia.org/wiki/Philippines","Mga alternatibong byahe mula sa Pilipinas" 176 | 302657,"PK","Pakistan","AS","http://en.wikipedia.org/wiki/Pakistan","پاکستان کے ہوائی اڈوں" 177 | 302710,"PL","Poland","EU","http://en.wikipedia.org/wiki/Poland","Lotniska Polski" 178 | 302750,"PM","Saint Pierre and Miquelon","NA","http://en.wikipedia.org/wiki/Saint_Pierre_and_Miquelon", 179 | 302779,"PN","Pitcairn","OC","http://en.wikipedia.org/wiki/Pitcairn", 180 | 302751,"PR","Puerto Rico","NA","http://en.wikipedia.org/wiki/Puerto_Rico", 181 | 302658,"PS","Palestinian Territory","AS","http://en.wikipedia.org/wiki/Palestinian_Territory", 182 | 302711,"PT","Portugal","EU","http://en.wikipedia.org/wiki/Portugal","Aeroportos do Brasil" 183 | 302780,"PW","Palau","OC","http://en.wikipedia.org/wiki/Palau", 184 | 302799,"PY","Paraguay","SA","http://en.wikipedia.org/wiki/Paraguay","Aeropuertos de Paraguay" 185 | 302659,"QA","Qatar","AS","http://en.wikipedia.org/wiki/Qatar","مطارات قطر" 186 | 302594,"RE","Réunion","AF","http://en.wikipedia.org/wiki/Réunion","Île Bourbon, La Réunion" 187 | 302712,"RO","Romania","EU","http://en.wikipedia.org/wiki/Romania","Aeroporturi din România" 188 | 302713,"RS","Serbia","EU","http://en.wikipedia.org/wiki/Serbia","Serb" 189 | 302714,"RU","Russia","EU","http://en.wikipedia.org/wiki/Russia","Soviet, Sovietskaya, Sovetskaya, Аэропорты России" 190 | 302595,"RW","Rwanda","AF","http://en.wikipedia.org/wiki/Rwanda", 191 | 302660,"SA","Saudi Arabia","AS","http://en.wikipedia.org/wiki/Saudi_Arabia","مطارات المملكة العربية السعودية,المطارات لموسم الحج" 192 | 302781,"SB","Solomon Islands","OC","http://en.wikipedia.org/wiki/Solomon_Islands", 193 | 302596,"SC","Seychelles","AF","http://en.wikipedia.org/wiki/Seychelles", 194 | 302597,"SD","Sudan","AF","http://en.wikipedia.org/wiki/Sudan","مطارات السودان" 195 | 302715,"SE","Sweden","EU","http://en.wikipedia.org/wiki/Sweden","Flygplatserna i Sverige" 196 | 302661,"SG","Singapore","AS","http://en.wikipedia.org/wiki/Singapore", 197 | 302598,"SH","Saint 
Helena","AF","http://en.wikipedia.org/wiki/Saint_Helena", 198 | 302716,"SI","Slovenia","EU","http://en.wikipedia.org/wiki/Slovenia", 199 | 302717,"SK","Slovakia","EU","http://en.wikipedia.org/wiki/Slovakia","letisko Slovenska" 200 | 302599,"SL","Sierra Leone","AF","http://en.wikipedia.org/wiki/Sierra_Leone", 201 | 302718,"SM","San Marino","EU","http://en.wikipedia.org/wiki/San_Marino", 202 | 302600,"SN","Senegal","AF","http://en.wikipedia.org/wiki/Senegal","Aéroports du Sénégal" 203 | 302601,"SO","Somalia","AF","http://en.wikipedia.org/wiki/Somalia", 204 | 302800,"SR","Suriname","SA","http://en.wikipedia.org/wiki/Suriname", 205 | 302614,"SS","South Sudan","AF","http://en.wikipedia.org/wiki/South_Sudan", 206 | 302602,"ST","São Tomé and Principe","AF","http://en.wikipedia.org/wiki/São_Tomé_and_Principe", 207 | 302752,"SV","El Salvador","NA","http://en.wikipedia.org/wiki/El_Salvador","Salvadorian, Salvadorean" 208 | 302761,"SX","Sint Maarten","NA","http://en.wikipedia.org/wiki/Sint_Maarten", 209 | 302662,"SY","Syria","AS","http://en.wikipedia.org/wiki/Syria","مطارات سوريا" 210 | 302603,"SZ","Swaziland","AF","http://en.wikipedia.org/wiki/Swaziland", 211 | 302753,"TC","Turks and Caicos Islands","NA","http://en.wikipedia.org/wiki/Turks_and_Caicos_Islands", 212 | 302604,"TD","Chad","AF","http://en.wikipedia.org/wiki/Chad", 213 | 302617,"TF","French Southern Territories","AN","http://en.wikipedia.org/wiki/French_Southern_Territories", 214 | 302605,"TG","Togo","AF","http://en.wikipedia.org/wiki/Togo", 215 | 302663,"TH","Thailand","AS","http://en.wikipedia.org/wiki/Thailand","Siam, Siamese" 216 | 302664,"TJ","Tajikistan","AS","http://en.wikipedia.org/wiki/Tajikistan","Tajik" 217 | 302782,"TK","Tokelau","OC","http://en.wikipedia.org/wiki/Tokelau", 218 | 302665,"TL","Timor-Leste","AS","http://en.wikipedia.org/wiki/Timor-Leste","East Timor" 219 | 302666,"TM","Turkmenistan","AS","http://en.wikipedia.org/wiki/Turkmenistan", 220 | 302606,"TN","Tunisia","AF","http://en.wikipedia.org/wiki/Tunisia","مطارات تونس" 221 | 302783,"TO","Tonga","OC","http://en.wikipedia.org/wiki/Tonga", 222 | 302667,"TR","Turkey","AS","http://en.wikipedia.org/wiki/Turkey","Türkiye havaalanları" 223 | 302754,"TT","Trinidad and Tobago","NA","http://en.wikipedia.org/wiki/Trinidad_and_Tobago", 224 | 302784,"TV","Tuvalu","OC","http://en.wikipedia.org/wiki/Tuvalu", 225 | 302668,"TW","Taiwan","AS","http://en.wikipedia.org/wiki/Taiwan", 226 | 302607,"TZ","Tanzania","AF","http://en.wikipedia.org/wiki/Tanzania", 227 | 302719,"UA","Ukraine","EU","http://en.wikipedia.org/wiki/Ukraine","Аеропорти України" 228 | 302608,"UG","Uganda","AF","http://en.wikipedia.org/wiki/Uganda", 229 | 302785,"UM","United States Minor Outlying Islands","OC","http://en.wikipedia.org/wiki/United_States_Minor_Outlying_Islands", 230 | 302755,"US","United States","NA","http://en.wikipedia.org/wiki/United_States","America" 231 | 302801,"UY","Uruguay","SA","http://en.wikipedia.org/wiki/Uruguay","Aeropuertos de Uruguay" 232 | 302669,"UZ","Uzbekistan","AS","http://en.wikipedia.org/wiki/Uzbekistan","Uzbek" 233 | 302721,"VA","Vatican City","EU","http://en.wikipedia.org/wiki/Vatican_City","The Holy See" 234 | 302756,"VC","Saint Vincent and the Grenadines","NA","http://en.wikipedia.org/wiki/Saint_Vincent_and_the_Grenadines", 235 | 302802,"VE","Venezuela","SA","http://en.wikipedia.org/wiki/Venezuela","Aeropuertos de Venezuela" 236 | 302757,"VG","British Virgin Islands","NA","http://en.wikipedia.org/wiki/British_Virgin_Islands", 237 | 302758,"VI","U.S. 
Virgin Islands","NA","http://en.wikipedia.org/wiki/U.S._Virgin_Islands", 238 | 302670,"VN","Vietnam","AS","http://en.wikipedia.org/wiki/Vietnam","Các sân bay của Việt Nam" 239 | 302786,"VU","Vanuatu","OC","http://en.wikipedia.org/wiki/Vanuatu", 240 | 302787,"WF","Wallis and Futuna","OC","http://en.wikipedia.org/wiki/Wallis_and_Futuna", 241 | 302788,"WS","Samoa","OC","http://en.wikipedia.org/wiki/Samoa", 242 | 302720,"XK","Kosovo","EU","http://en.wikipedia.org/wiki/Kosovo","Kosova" 243 | 302671,"YE","Yemen","AS","http://en.wikipedia.org/wiki/Yemen","مطارات اليمن" 244 | 302609,"YT","Mayotte","AF","http://en.wikipedia.org/wiki/Mayotte", 245 | 302610,"ZA","South Africa","AF","http://en.wikipedia.org/wiki/South_Africa", 246 | 302611,"ZM","Zambia","AF","http://en.wikipedia.org/wiki/Zambia", 247 | 302612,"ZW","Zimbabwe","AF","http://en.wikipedia.org/wiki/Zimbabwe", 248 | 302613,"ZZ","Unknown or unassigned country","AF","http://en.wikipedia.org/wiki/Unknown_or_unassigned_country", 249 | -------------------------------------------------------------------------------- /2018/sqlpandas/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Data from http://ourairports.com/data/ 4 | 5 | wget -r -A "*.csv" -I "data" -nH -e robots=off http://ourairports.com/ 6 | -------------------------------------------------------------------------------- /2018/sqlpandas/images/by_country.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/by_country.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/by_country_top10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/by_country_top10.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/having1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/having1.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/having2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/having2.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/notebook.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/runways.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/runways_agg1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways_agg1.png
--------------------------------------------------------------------------------
/2018/sqlpandas/images/runways_agg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways_agg2.png
--------------------------------------------------------------------------------
/2018/windows/README.md:
--------------------------------------------------------------------------------
1 | # Window functions
2 | 
3 | Sample "social shares" data to use with window functions.
4 | 
5 | The IPython notebook has examples of Pandas and Spark SQL
6 | queries (PySpark). A minimal pandas sketch follows the data files below.
7 | 
--------------------------------------------------------------------------------
/2018/windows/social_deltas.csv:
--------------------------------------------------------------------------------
1 | url,ts,service,delta
2 | url1,2018-08-15 00:00:00,tw,1
3 | url1,2018-08-15 00:05:00,tw,3
4 | url1,2018-08-15 00:11:00,tw,1
5 | url1,2018-08-15 00:18:00,tw,3
6 | url1,2018-08-15 00:21:00,tw,4
7 | url1,2018-08-15 00:30:00,tw,13
8 | url1,2018-08-15 00:35:00,tw,16
9 | url1,2018-08-15 00:38:00,tw,4
10 | url1,2018-08-15 00:41:00,tw,14
11 | url1,2018-08-15 00:00:00,fb,5
12 | url1,2018-08-15 00:05:00,fb,15
13 | url1,2018-08-15 00:11:00,fb,11
14 | url1,2018-08-15 00:18:00,fb,14
15 | url1,2018-08-15 00:21:00,fb,14
16 | url1,2018-08-15 00:30:00,fb,8
17 | url1,2018-08-15 00:35:00,fb,43
18 | url1,2018-08-15 00:38:00,fb,120
19 | url1,2018-08-15 00:41:00,fb,130
20 | url2,2018-08-15 00:00:00,tw,1
21 | url2,2018-08-15 00:05:00,tw,6
22 | url2,2018-08-15 00:07:00,tw,13
23 | url2,2018-08-15 00:15:00,tw,80
24 | url2,2018-08-15 00:19:00,tw,455
25 | url2,2018-08-15 00:26:00,tw,645
26 | url2,2018-08-15 00:00:00,fb,1
27 | url2,2018-08-15 00:05:00,fb,2
28 | url2,2018-08-15 00:07:00,fb,7
29 | url2,2018-08-15 00:15:00,fb,6
30 | url2,2018-08-15 00:19:00,fb,9
31 | url2,2018-08-15 00:26:00,fb,13
32 | 
--------------------------------------------------------------------------------
/2018/windows/social_totals.csv:
--------------------------------------------------------------------------------
1 | url,ts,service,total
2 | url1,2018-08-15 00:00:00,tw,1
3 | url1,2018-08-15 00:05:00,tw,4
4 | url1,2018-08-15 00:11:00,tw,5
5 | url1,2018-08-15 00:18:00,tw,8
6 | url1,2018-08-15 00:21:00,tw,12
7 | url1,2018-08-15 00:30:00,tw,25
8 | url1,2018-08-15 00:35:00,tw,41
9 | url1,2018-08-15 00:38:00,tw,45
10 | url1,2018-08-15 00:41:00,tw,59
11 | url1,2018-08-15 00:00:00,fb,5
12 | url1,2018-08-15 00:05:00,fb,20
13 | url1,2018-08-15 00:11:00,fb,31
14 | url1,2018-08-15 00:18:00,fb,45
15 | url1,2018-08-15 00:21:00,fb,59
16 | url1,2018-08-15 00:30:00,fb,67
17 | url1,2018-08-15 00:35:00,fb,110
18 | url1,2018-08-15 00:38:00,fb,230
19 | url1,2018-08-15 00:41:00,fb,360
20 | url2,2018-08-15 00:00:00,tw,1
21 | url2,2018-08-15 00:05:00,tw,7
22 | url2,2018-08-15 00:07:00,tw,20
23 | url2,2018-08-15 00:15:00,tw,100
24 | url2,2018-08-15 00:19:00,tw,555
25 | url2,2018-08-15 00:26:00,tw,1200
26 | url2,2018-08-15 00:00:00,fb,1
27 | url2,2018-08-15 00:05:00,fb,3
28 | url2,2018-08-15 00:07:00,fb,10
29 | url2,2018-08-15 00:15:00,fb,16
30 | url2,2018-08-15 00:19:00,fb,25
31 | url2,2018-08-15 00:26:00,fb,38
32 | 
--------------------------------------------------------------------------------
/2018/windows/social_totals_agg.csv:
--------------------------------------------------------------------------------
1 | url,service,total
2 | url1,tw,5
3 | url2,tw,8
4 | url3,tw,12
5 | url4,tw,100
6 | url5,tw,175
7 | url6,tw,25
8 | url7,tw,80
9 | url8,tw,35
10 | url9,tw,150
11 | url10,tw,260
12 | url1,fb,210
13 | url2,fb,370
14 | url3,fb,500
15 | url4,fb,20
16 | url5,fb,300
17 | url6,fb,95
18 | url7,fb,150
19 | url8,fb,47
20 | url9,fb,28
21 | url10,fb,5
22 | 
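The first two CSVs are related: social_deltas.csv holds the per-interval changes of the running totals in social_totals.csv, within each (url, service) series. A minimal pandas sketch of that window-style calculation (illustrative, not taken from the notebook; it assumes social_totals.csv is in the working directory):

```python
# Sketch: derive the deltas from the running totals with a partitioned
# lag -- the pandas analogue of SQL's
#   total - LAG(total) OVER (PARTITION BY url, service ORDER BY ts)
import pandas as pd

totals = pd.read_csv("social_totals.csv", parse_dates=["ts"])
totals = totals.sort_values(["url", "service", "ts"])

# diff() runs within each (url, service) group; the first row of a group
# has no previous total, so its delta is the total itself.
totals["delta"] = (
    totals.groupby(["url", "service"])["total"]
    .diff()
    .fillna(totals["total"])
)
print(totals)
```

Run against the file above, the computed delta column matches social_deltas.csv value for value.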
--------------------------------------------------------------------------------
/2019/pandasdb/read_csv_file.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | df = pd.read_csv("sample_file.csv")
5 | 
--------------------------------------------------------------------------------
/2019/sparkstart/context.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext, SparkConf
2 | from pyspark.sql import SQLContext
3 | 
4 | 
5 | def get_spark_context():
6 |     conf = SparkConf()
7 |     extra_settings = {
8 |         "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
9 |         "spark.executor.extraJavaOptions": "-XX:+UseG1GC",
10 |         "spark.default.parallelism": 200,
11 |     }
12 |     conf.setAll(extra_settings.items())
13 |     environment = {"PYTHON_EGG_CACHE": "/tmp/python-eggs"}
14 |     sc = SparkContext(conf=conf, environment=environment)
15 |     return sc
16 | 
--------------------------------------------------------------------------------
/2019/sparkstart/driver.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext, SparkConf
2 | from pyspark.sql import SQLContext
3 | 
4 | 
5 | def get_spark_context():
6 |     pass  # build and return a SparkContext here; see context.py for an example
7 | 
8 | 
9 | def read_input_data(sc, sqlContext, **kwargs):
10 |     pass
11 | 
12 | 
13 | def transform_or_aggregate(sc, sqlContext, df, **kwargs):
14 |     pass
15 | 
16 | 
17 | def save_output_data(df_out, **kwargs):
18 |     pass
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     with get_spark_context() as sc:
23 |         sqlContext = SQLContext(sc)
24 |         df_in = read_input_data(sc, sqlContext)
25 |         df_out = transform_or_aggregate(sc, sqlContext, df_in)
26 |         save_output_data(df_out)
27 | 
--------------------------------------------------------------------------------
/2019/sparkstart/runner.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | spark-submit \
4 |     --master yarn \
5 |     --deploy-mode client \
6 |     --driver-memory 8g \
7 |     --executor-memory 3g \
8 |     --num-executors 4 \
9 |     --executor-cores 4 \
10 |     --conf "spark.yarn.executor.memoryOverhead=2g" \
11 |     --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:///home/hadoop/log4j.properties" \
12 |     --py-files my_custom_package.egg \
13 |     --jars my_extra_java_lib.jar \
14 |     driver.py
15 | 
--------------------------------------------------------------------------------
/2021/covid-travel/.gitignore:
--------------------------------------------------------------------------------
1 | COVID-19_Case_Surveillance_Public_Use_Data.csv
2 | 
3 | 
--------------------------------------------------------------------------------
/2021/covid-travel/README.md:
--------------------------------------------------------------------------------
1 | https://www.washingtonpost.com/opinions/2021/03/15/flying-safer-than-driving-pandemic/
2 | 
3 | https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf
4 | 
5 | 
https://www.medrxiv.org/content/10.1101/2020.07.02.20143826v4.full-text 6 | 7 | -------------------------------------------------------------------------------- /2021/covid-travel/all_by_age_race.csv: -------------------------------------------------------------------------------- 1 | ,age_group,race_ethnicity,cases,deaths,prob_death,prob_death_full,prob_death_middle,odds_full,odds_middle 2 | 0,0 - 9 Years,"American Indian/Alaska Native, Non-Hispanic",8926,2,4.870968056191488e-05,1.2400631507613324e-08,7.583633903457413e-09,"1 in 80,641,054","1 in 131,862,905" 3 | 1,0 - 9 Years,"Black, Non-Hispanic",64642,21,7.062308392847295e-05,1.7979400185452858e-08,1.099534235224373e-08,"1 in 55,619,208","1 in 90,947,600" 4 | 2,0 - 9 Years,Hispanic/Latino,136842,29,4.607026955238126e-05,1.1728683694595735e-08,7.172702717168578e-09,"1 in 85,261,060","1 in 139,417,461" 5 | 3,0 - 9 Years,"Multiple/Other, Non-Hispanic",46751,6,2.7899891469422186e-05,7.102823693843331e-09,4.343747698803931e-09,"1 in 140,789,078","1 in 230,215,949" 6 | 4,0 - 9 Years,"White, Non-Hispanic",221859,44,4.31139480088901e-05,1.0976056010407707e-08,6.71243157541384e-09,"1 in 91,107,407","1 in 148,977,310" 7 | 5,0 - 9 Years,,397171,23,1.25890359568045e-05,3.2049480541756573e-09,1.9599931428931117e-09,"1 in 312,017,538","1 in 510,205,867" 8 | 6,10 - 19 Years,"American Indian/Alaska Native, Non-Hispanic",18740,4,4.6401559092385505e-05,1.1813024208853417e-08,7.224281347092942e-09,"1 in 84,652,328","1 in 138,422,073" 9 | 7,10 - 19 Years,"Asian, Non-Hispanic",35733,9,5.4753917642807345e-05,1.3939388401933908e-08,8.524664120005986e-09,"1 in 71,739,159","1 in 117,306,675" 10 | 8,10 - 19 Years,"Black, Non-Hispanic",129927,48,8.031265717437987e-05,2.0446195818320723e-08,1.2503916732736544e-08,"1 in 48,908,854","1 in 79,974,941" 11 | 9,10 - 19 Years,Hispanic/Latino,282124,56,4.315093024158973e-05,1.0985471039099057e-08,6.718189357244702e-09,"1 in 91,029,324","1 in 148,849,630" 12 | 10,10 - 19 Years,"Multiple/Other, Non-Hispanic",94194,16,3.6926565063222896e-05,9.40085668615466e-09,5.749114909421905e-09,"1 in 106,373,284","1 in 173,939,818" 13 | 11,10 - 19 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",4484,2,9.696311523096615e-05,2.4685110802175854e-08,1.5096234661521158e-08,"1 in 40,510,250","1 in 66,241,684" 14 | 12,10 - 19 Years,"White, Non-Hispanic",649931,67,2.241040570661247e-05,5.705296768484722e-09,3.4890869853041925e-09,"1 in 175,275,720","1 in 286,607,930" 15 | 13,10 - 19 Years,,957080,59,1.3401269440926296e-05,3.411728472740228e-09,2.0864501698465076e-09,"1 in 293,106,561","1 in 479,282,954" 16 | 14,20 - 29 Years,"American Indian/Alaska Native, Non-Hispanic",26042,41,0.00034225648868216227,8.713250730195871e-08,5.328607950834605e-08,"1 in 11,476,773","1 in 18,766,627" 17 | 15,20 - 29 Years,"Asian, Non-Hispanic",83841,36,9.334438945768465e-05,2.3763846603275454e-08,1.4532833482433362e-08,"1 in 42,080,729","1 in 68,809,706" 18 | 16,20 - 29 Years,"Black, Non-Hispanic",253353,332,0.00028487490988256806,7.252416239371464e-08,4.43523135423512e-08,"1 in 13,788,508","1 in 22,546,738" 19 | 17,20 - 29 Years,Hispanic/Latino,463305,315,0.0001478038459968384,3.7628270365786365e-08,2.3011652809717327e-08,"1 in 26,575,763","1 in 43,456,244" 20 | 18,20 - 29 Years,"Multiple/Other, Non-Hispanic",159161,63,8.604904577071671e-05,2.1906579880524313e-08,1.3397017868706637e-08,"1 in 45,648,385","1 in 74,643,477" 21 | 19,20 - 29 Years,"Native Hawaiian/Other Pacific Islander, 
Non-Hispanic",8069,7,0.0001885907956915086,4.801191336340908e-08,2.9361792883617926e-08,"1 in 20,828,164","1 in 34,057,866" 22 | 20,20 - 29 Years,"White, Non-Hispanic",1095381,352,6.985855983482896e-05,1.778476574206084e-08,1.0876313223543143e-08,"1 in 56,227,898","1 in 91,942,920" 23 | 21,20 - 29 Years,,1745942,451,5.615506028314204e-05,1.4296094776764462e-08,8.742808700472196e-09,"1 in 69,949,173","1 in 114,379,719" 24 | 22,30 - 39 Years,"American Indian/Alaska Native, Non-Hispanic",25721,110,0.000929708933488623,2.366876103585634e-07,1.4474683691241974e-07,"1 in 4,224,978","1 in 6,908,614" 25 | 23,30 - 39 Years,"Asian, Non-Hispanic",80384,139,0.00037591300886181115,9.570086783649083e-08,5.852607953631482e-08,"1 in 10,449,226","1 in 17,086,400" 26 | 24,30 - 39 Years,"Black, Non-Hispanic",248562,778,0.0006804356047288351,1.732269869472248e-07,1.059373508841234e-07,"1 in 5,772,773","1 in 9,439,541" 27 | 25,30 - 39 Years,Hispanic/Latino,427640,910,0.00046259958599878813,1.177697520363278e-07,7.202235497411069e-08,"1 in 8,491,145","1 in 13,884,578" 28 | 26,30 - 39 Years,"Multiple/Other, Non-Hispanic",139344,154,0.00024025620672268072,6.11650220780633e-08,3.740560590419456e-08,"1 in 16,349,213","1 in 26,733,961" 29 | 27,30 - 39 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",7662,23,0.0006525711302531977,1.6613317979966068e-07,1.015991172743409e-07,"1 in 6,019,267","1 in 9,842,605" 30 | 28,30 - 39 Years,"White, Non-Hispanic",912079,831,0.0001980663669627779,5.042422784183765e-08,3.083704919239385e-08,"1 in 19,831,737","1 in 32,428,524" 31 | 29,30 - 39 Years,,1497278,1260,0.0001829406719916147,4.6573490832886485e-08,2.848212237141289e-08,"1 in 21,471,442","1 in 35,109,743" 32 | 30,40 - 49 Years,"American Indian/Alaska Native, Non-Hispanic",21048,237,0.002447821129344108,6.231723852707787e-07,3.811024644782361e-07,"1 in 1,604,692","1 in 2,623,966" 33 | 31,40 - 49 Years,"Asian, Non-Hispanic",66604,336,0.00109668305598567,2.7919629734863815e-07,1.70743119412345e-07,"1 in 3,581,709","1 in 5,856,751" 34 | 32,40 - 49 Years,"Black, Non-Hispanic",215371,1773,0.001789631763834015,4.5560890117964793e-07,2.7862864141892015e-07,"1 in 2,194,865","1 in 3,589,006" 35 | 33,40 - 49 Years,Hispanic/Latino,392976,2231,0.0012341720613981517,3.141985899689167e-07,1.9214884966494648e-07,"1 in 3,182,700","1 in 5,204,299" 36 | 34,40 - 49 Years,"Multiple/Other, Non-Hispanic",121013,350,0.0006287502708117238,1.600688062147653e-07,9.789043605972624e-08,"1 in 6,247,313","1 in 10,215,503" 37 | 35,40 - 49 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",6215,42,0.001469096505649026,3.74006238708944e-07,2.287243508716779e-07,"1 in 2,673,752","1 in 4,372,075" 38 | 36,40 - 49 Years,"White, Non-Hispanic",874428,2208,0.0005489302721321824,1.3974803262018143e-07,8.546322156812911e-08,"1 in 7,155,736","1 in 11,700,940" 39 | 37,40 - 49 Years,,1353455,3216,0.0005165524046108728,1.3150519465650954e-07,8.042229559563893e-08,"1 in 7,604,262","1 in 12,434,363" 40 | 38,50 - 59 Years,"American Indian/Alaska Native, Non-Hispanic",18692,432,0.005024237292866514,1.2790828138659645e-06,7.822259524934918e-07,"1 in 781,810","1 in 1,278,403" 41 | 39,50 - 59 Years,"Asian, Non-Hispanic",61310,851,0.003017452291632686,7.681905019429945e-07,4.6978861772259003e-07,"1 in 1,301,760","1 in 2,128,617" 42 | 40,50 - 59 Years,"Black, Non-Hispanic",213598,4624,0.004706117994102697,1.1980952123476975e-06,7.326978038458559e-07,"1 in 834,658","1 in 1,364,819" 43 | 41,50 - 59 
Years,Hispanic/Latino,291746,4805,0.0035803925928420766,9.115052425767211e-07,5.574330675449937e-07,"1 in 1,097,086","1 in 1,793,937" 44 | 42,50 - 59 Years,"Multiple/Other, Non-Hispanic",114298,993,0.0018886556651681685,4.8081865202846e-07,2.940457208730907e-07,"1 in 2,079,786","1 in 3,400,832" 45 | 43,50 - 59 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",4955,94,0.004124073180362392,1.049916797444391e-06,6.420789631576713e-07,"1 in 952,456","1 in 1,557,441" 46 | 44,50 - 59 Years,"White, Non-Hispanic",980669,7753,0.0017186581635686413,4.3754026567421016e-07,2.6757872700737385e-07,"1 in 2,285,504","1 in 3,737,218" 47 | 45,50 - 59 Years,,1288655,7977,0.0013456902233589352,3.4258916073285796e-07,2.0951116664467018e-07,"1 in 2,918,948","1 in 4,773,015" 48 | 46,60 - 69 Years,"American Indian/Alaska Native, Non-Hispanic",13145,709,0.011725403939338814,2.9850824692811784e-06,1.825533853236324e-06,"1 in 334,999","1 in 547,785" 49 | 47,60 - 69 Years,"Asian, Non-Hispanic",44214,2108,0.01036461006842216,2.638648184424666e-06,1.6136711923432159e-06,"1 in 378,982","1 in 619,705" 50 | 48,60 - 69 Years,"Black, Non-Hispanic",158846,9960,0.013630921718547196,3.4701939201997124e-06,2.1222048448614325e-06,"1 in 288,168","1 in 471,208" 51 | 49,60 - 69 Years,Hispanic/Latino,155256,7261,0.01016693886786704,2.5883245590287696e-06,1.5828956668013995e-06,"1 in 386,350","1 in 631,754" 52 | 50,60 - 69 Years,"Multiple/Other, Non-Hispanic",79797,2287,0.006230483765598685,1.5861720380848877e-06,9.700270536505796e-07,"1 in 630,449","1 in 1,030,899" 53 | 51,60 - 69 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",3140,197,0.013638881196344505,3.4722202638344646e-06,2.1234440598384382e-06,"1 in 288,000","1 in 470,933" 54 | 52,60 - 69 Years,"White, Non-Hispanic",809271,23005,0.00617974319668163,1.5732543779736237e-06,9.621272297494358e-07,"1 in 635,625","1 in 1,039,364" 55 | 53,60 - 69 Years,,849835,15493,0.003963173413969618,1.0089545351244462e-06,6.170284001197218e-07,"1 in 991,125","1 in 1,620,671" 56 | 54,70 - 79 Years,"American Indian/Alaska Native, Non-Hispanic",6257,716,0.024876486161585984,6.333117658243637e-06,3.873032253087645e-06,"1 in 157,900","1 in 258,196" 57 | 55,70 - 79 Years,"Asian, Non-Hispanic",21528,3025,0.030546669251773226,7.776646958189184e-06,4.755825821542622e-06,"1 in 128,590","1 in 210,268" 58 | 56,70 - 79 Years,"Black, Non-Hispanic",81625,11500,0.030627871362940276,7.797319593415007e-06,4.768468217800918e-06,"1 in 128,249","1 in 209,711" 59 | 57,70 - 79 Years,Hispanic/Latino,70704,8297,0.02551051782323367,6.494531014060225e-06,3.971744951460303e-06,"1 in 153,976","1 in 251,779" 60 | 58,70 - 79 Years,"Multiple/Other, Non-Hispanic",42299,3384,0.01739171549949274,4.427626145491156e-06,2.7077246612937747e-06,"1 in 225,855","1 in 369,314" 61 | 59,70 - 79 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",1471,163,0.024088907279874682,6.132613869620637e-06,3.750413713198002e-06,"1 in 163,063","1 in 266,637" 62 | 60,70 - 79 Years,"White, Non-Hispanic",526749,46483,0.019183709888390865,4.883836529630099e-06,2.98672114096026e-06,"1 in 204,757","1 in 334,815" 63 | 61,70 - 79 Years,,421857,20629,0.01063053408475219,2.706347781250016e-06,1.6550730320333899e-06,"1 in 369,502","1 in 604,203" 64 | 62,80+ Years,"American Indian/Alaska Native, Non-Hispanic",2964,605,0.04437305638678637,1.1296602949790604e-05,6.908462772346125e-06,"1 in 88,522","1 in 144,750" 65 | 63,80+ Years,"Asian, Non-Hispanic",15805,5680,0.07812607457739021,1.9889530187722652e-05,1.2163486622665034e-05,"1 
in 50,278","1 in 82,213" 66 | 64,80+ Years,"Black, Non-Hispanic",49054,13183,0.05842274972922476,1.487340879053286e-05,9.09586637540328e-06,"1 in 67,234","1 in 109,940" 67 | 65,80+ Years,Hispanic/Latino,38416,9838,0.055672002607609156,1.4173116753461818e-05,8.667601215569066e-06,"1 in 70,556","1 in 115,372" 68 | 66,80+ Years,"Multiple/Other, Non-Hispanic",30084,6323,0.04569090604279034,1.163210438971008e-05,7.113639427492279e-06,"1 in 85,969","1 in 140,575" 69 | 67,80+ Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",593,136,0.04985702764132268,1.2692725977930933e-05,7.762264929365495e-06,"1 in 78,785","1 in 128,828" 70 | 68,80+ Years,"White, Non-Hispanic",413173,100420,0.052836063301834095,1.3451136278468311e-05,8.22607244306796e-06,"1 in 74,343","1 in 121,565" 71 | 69,80+ Years,,298756,37030,0.026945065538432705,6.859741735852168e-06,4.195090384310809e-06,"1 in 145,778","1 in 238,374" 72 | 70,,"Black, Non-Hispanic",14580,3,4.473072105922347e-05,1.1387658110797995e-08,6.964147759491858e-09,"1 in 87,814,368","1 in 143,592,588" 73 | 71,,Hispanic/Latino,1916,2,0.00022692202959063269,5.777037413202324e-08,3.5329601368612143e-08,"1 in 17,309,910","1 in 28,304,876" 74 | 72,,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",66,1,0.003293807641633729,8.38545733613307e-07,5.128145168353096e-07,"1 in 1,192,541","1 in 1,950,023" 75 | 73,,"White, Non-Hispanic",43405,11,5.50928314209443e-05,1.402566991367955e-08,8.577429771280608e-09,"1 in 71,297,842","1 in 116,585,041" 76 | 74,,,96345,19,4.2871293607438855e-05,1.0914280449956792e-08,6.674652593403688e-09,"1 in 91,623,081","1 in 149,820,532" 77 | -------------------------------------------------------------------------------- /2021/covid-travel/covid_and_air_travel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2021/covid-travel/covid_and_air_travel.png -------------------------------------------------------------------------------- /2021/covid-travel/flight_infection_risk.csv: -------------------------------------------------------------------------------- 1 | seat,full_flight,middle_empty 2 | "Window (A/F)",0.0017699115044247787,0.0010869565217391304 3 | "Middle (B/E)",0.002061855670103093,N/A 4 | "Aisle (C/F)",0.0022172949002217295,0.0013774104683195593 5 | "Any seat",0.0002545824847250509,0.00015569048731122528 6 | -------------------------------------------------------------------------------- /2021/covid-travel/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2021/covid-travel/requirements.txt -------------------------------------------------------------------------------- /2021/covid-travel/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | pd.set_option("display.max_rows", None) 6 | sns.set_theme(style="ticks") 7 | 8 | df = pd.read_csv("all_by_age_race.csv") 9 | print(df.head()) 10 | print(df.dtypes) 11 | 12 | dff = ( 13 | df[["age_group", "race_ethnicity", "prob_death_full", "odds_full"]] 14 | .copy() 15 | .dropna() 16 | ) 17 | dff.columns = ["age_group", "race_ethnicity", "prob_death", "odds"] 18 | dff["full"] = True 19 | 20 | dfm = ( 21 | df[["age_group", "race_ethnicity", "prob_death_middle", "odds_middle"]] 22 | .copy() 23 | 
.dropna()
24 | )
25 | dfm.columns = ["age_group", "race_ethnicity", "prob_death", "odds"]
26 | dfm["full"] = False
27 | 
28 | dfg = pd.concat([dff, dfm], ignore_index=True)
29 | print(dfg)
30 | 
31 | 
32 | g = sns.catplot(
33 |     x="age_group",
34 |     y="prob_death",
35 |     col="race_ethnicity",
36 |     col_wrap=3,
37 |     hue="full",
38 |     marker="o",
39 |     palette="husl",
40 |     kind="swarm",
41 |     data=dfg,
42 | )
43 | g.set_xticklabels(rotation=30)
44 | g.set_xlabels("Age group")
45 | g.set_ylabels("Probability of dying")
46 | 
47 | g.savefig("covid_and_air_travel.png")
48 | 
49 | plt.show()
50 | 
--------------------------------------------------------------------------------
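A note on how the derived columns in all_by_age_race.csv fit together (consistent with the numbers in the two data files above, though the generating script is not shown here): prob_death_full multiplies each group's probability of dying given a case by the "Any seat" full-flight infection risk from flight_infection_risk.csv, prob_death_middle uses the middle-seats-empty risk, and the odds columns are the reciprocals. An illustrative check for the first row, not part of the repo's code:

```python
# Illustrative check (not part of test.py): recompute the derived columns
# of all_by_age_race.csv for its first row from the two inputs above.
prob_death = 4.870968056191488e-05    # 0 - 9 Years, AI/AN: P(death | case)
risk_full = 0.0002545824847250509     # flight_infection_risk.csv: Any seat, full flight
risk_middle = 0.00015569048731122528  # flight_infection_risk.csv: Any seat, middle empty

prob_death_full = prob_death * risk_full      # ~1.2401e-08, as in the table
prob_death_middle = prob_death * risk_middle  # ~7.5836e-09, as in the table

print(f"odds_full: 1 in {1 / prob_death_full:,.0f}")      # 1 in 80,641,054
print(f"odds_middle: 1 in {1 / prob_death_middle:,.0f}")  # 1 in 131,862,905
```

--------------------------------------------------------------------------------
/2022/uk-covid-deaths/asmr/agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/asmr/agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/asmr/espmortalityratesreport_tcm77-364912.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/asmr/espmortalityratesreport_tcm77-364912.pdf
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output1.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output2.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output3.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output4.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output5.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Contents-Table 1.tsv:
--------------------------------------------------------------------------------
1 | Contents
2 | This worksheet contains 1 table.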
3 | Worksheet Name Worksheet Title 4 | Definitions "Definitions used in 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 March 2022: 6 July 2022'" 5 | Notes "Notes for 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 6 | Table 1 "Monthly age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 7 | Table 2 "Monthly age-standardised mortality rates by vaccination status by age group for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 8 | Table 3 "Whole period age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 9 | Table 4 "Monthly age-standardised mortality rates by vaccination status by sex for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 10 | Table 5 "Monthly age-standardised mortality rates by vaccination status by age group by sex for all cause deaths and deaths involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 11 | Table 6 "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022" 12 | Table 7 "Whole period counts of all registered deaths by vaccination status by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" 13 | Table 8 "Monthly counts of all registered deaths for 'unvaccinated' and 'ever vaccinated' by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" 14 | Table 9 "Whole period counts of all registered deaths grouped by how many weeks after vaccination the deaths occurred; for deaths involving COVID-19 and deaths not involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Cover-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "All data relating to 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 2 | "Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022" 3 | Publication date: 6 July 2022 4 | Contact e-mail: Health.Data@ons.gov.uk 5 | Office for National Statistics 6 | Crown Copyright 2020 7 | -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Definitions-Table 1.tsv: -------------------------------------------------------------------------------- 1 | Definitions of COVID-19 
deaths used in this dataset and accompanying publication 2 | We use the term “involving COVID-19” when referring only to deaths where COVID-19 is mentioned anywhere on the death certificate. Information on cause of death coding is available in the User Guide to Mortality Statistics. 3 | "For this analysis we define a death as involving COVID-19 if either of the ICD10 codes U07.1 (COVID-19, virus identified) or U07.2 (COVID-19, virus not identified) is mentioned on the death certificate. In contrast to the definition used in the weekly deaths released, deaths where the ICD10 code U09.9 (Post-COVID condition, where the acute COVID had ended before the condition immediately causing death occurred) is mentioned on the death certificate and neither of the other two COVID-19 codes are mentioned are not included, as they are likely to be the result of an infection caught a long time previously, and therefore not linked to the vaccination status of the person at date of death. Deaths involving U10.9 (Multisystem inflammatory syndrome associated with COVID-19) where neither U07.1 nor U07.2 are mentioned are also excluded." 4 | ICD-10 code Description 5 | U07.1  "COVID-19, virus identified" 6 | U07.2 "COVID-19, virus not identified" 7 | Vaccination status and age 8 | Vaccination status is defined on each day for each person and is one of: Category Name 9 | unvaccinated (Unvaccinated) 10 | " vaccinated with first dose only, less than 21 days after first vaccination" "(First dose, less than 21 days ago)" 11 | " vaccinated with first dose only, at least 21 days after first vaccination" "(First dose, at least 21 days ago)" 12 | " vaccinated with first and second doses, less than 21 days after second vaccination" "(Second dose, less than 21 days ago)" 13 | " vaccinated with first and second doses, at least 21 days but less than 6 months after second vaccination" "(Second dose, between 21 days and 6 months ago)" 14 | " vaccinated with first and second doses, at least 6 months after second vaccination" "(Second dose, at least 6 months ago)" 15 | " vaccinated with at least first, second and third dose and/or booster, less than 21 days after third or booster vaccination" "(Third dose or booster, less than 21 days ago)" 16 | " vaccinated with at least first, second and third dose and/or booster, at least 21 days after third or booster vaccination" "(Third dose or booster, at least 21 days ago)" 17 | "For the age-breakdowns, due to low numbers, the “vaccinated with first and second doses, at least 21 days but less than 6 months after second vaccination” and “vaccinated with first and second doses, at least 6 months after second vaccination” are combined into:" 18 | " vaccinated with first and second doses, at least 21 days after second vaccination" "(Second dose, at least 21 days ago)" 19 | "We also include an ""Ever vaccinated"" category, which includes anyone who has had at least 1 dose of the vaccine, regardless of further doses." 20 | Age in years is defined on the first day of each month. Children aged <10 each week are not included when calculating the ASMR as they would not be included in our dataset due to the linkage to the 2011 census. 21 | "The Joint Committee on Vaccination and Immunisation (JCVI) advised in February 2022 a spring booster for the most vulnerable. This spring booster may be present in the NIMS dataset if it is the person’s third dose or booster, but it is not being differentiated from a normal third dose or booster in our analysis. 
Further developments to the handling of spring boosters will be available in future publications." -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Notes-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Notes for 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 2 | This worksheet contains 1 table. 3 | Note Number Note Text Applies to Tables 4 | Note 1 "Age-standardised mortality rates per 100,000 person-years, standardised to the 2013 European Standard Population using five-year age groups from those aged 10 years and over. 'Person-years' take into account both the number of people and the amount of time spent in each vaccination status. For more information, see our methodology article." "1,2,3,4,5" 5 | Note 2 "Office for National Statistics (ONS) figures based on deaths that occurred between 1 January 2021 and 31 May 2022 and were registered by 8 June 2022. These figures represent death occurrences, there can be a delay between the date a death occurred and the date a death was registered. More information can be found in our Impact of registration delays release." "1,2,3,4,5,6,7,8,9" 6 | Note 3 "ASMRs are calculated using the Public Health Data Asset, a linked dataset of people resident in England, who could be linked to the 2011 Census and GP Patient Register. This dataset covers approximately 79% of the population in England aged 10+." "1,2,3,4,5,6" 7 | Note 4 "Deaths were defined using the International Classification of Diseases, tenth revision (ICD-10). Deaths involving the coronavirus (COVID-19) are defined as those with an underlying cause, or any mention of, ICD-10 codes U07.1 (COVID-19 virus identified) or U07.2 (COVID-19, virus not identified). Please note, this differs from the definition used in the majority of mortality outputs." "1,2,3,4,5,6,7,8,9" 8 | Note 5 "95% confidence intervals are indicated by the shaded regions. Where the total number of deaths is less than 100, Dobson’s method is used, otherwise the normal approximation is used. Non-overlapping confidence intervals denote a statistically significant difference in ASMR." "1,2,3,4,5" 9 | Note 6 "Rates marked with u in 'Noted as Unreliable' column are unreliable due to small numbers of deaths. Otherwise, column left blank." "1,2,4,5" 10 | Note 7 x denotes data are not available; age-standardised rates are not provided for categories with fewer than 10 deaths. "1,2,3,4,5" 11 | Note 8 Age is defined on the first day of the month. "1,2,3,4,5,6" 12 | Note 9 "Caution must be taken when comparing mortality rates and counts as the characteristics of people in the different vaccination status groups, such as health, may differ, particularly due to the prioritisation of the vaccine to more clinically vulnerable people. While differences in the ages of people in the vaccination status groups are accounted for, other differences, such as ethnicity or level of deprivation, may remain, which can affect the mortality rates and counts." "1,2,3,4,5,6" 13 | Note 10 Totals of person-years may not exactly equal the sum of totals from breakdowns due to rounding. "1,2,3,4,5,6" 14 | Note 11 "Third dose and booster vaccinations are defined as a third or booster dose received after 16 September 2021, the date from which booster doses were first administered. 
Due to our definition of a third dose or booster only including data from 16th September, there are 0 Person-years and consequently 0 counts of death before September for 'Within 21 days of third dose or booster' and '21 days or more after third dose or booster', and within September for '21 days or more after third dose or booster'. Spring boosters have not yet been distinguished in this analysis." "1,2,3,4,5,6,7,8" 15 | Note 12 '21 days or more after a second dose' is separated into '21 days or more but less than 6 months after a second dose' and '6 months or more after a second dose'. "1,3,4,7" 16 | Note 13 '21 days or more but less than 6 months after a second dose' and '6 months or more after a second dose' are combined into '21 days or more after a second dose' due to low counts. "2,5" 17 | Note 14 "These counts are for all deaths that have been registered, not solely those in the Public Health Data Asset which are used to calculate the mortality rates." "7,8,9" 18 | Note 15 Age is defined on the date of death. "7,8,9" 19 | Note 16 "When the category of 'Ever Vaccinated' is included, the total counts and person-years of those in all the vaccination categories will add up to the 'Ever Vaccinated' totals (differences in person-years may occur due to rounding)." "1,3,4,8" 20 | Note 17 "There were some people who were vaccinated but not included in the NIMS data as they died soon after vaccination. Of these, 1,436 linked to our Public health Data Asset dataset. We included the latest vaccination records for these people in our dataset. This data is provisional and extends up to the 25 May 2022. This will be updated in future releases." "1,2,3,4,5,6,7,8,9" 21 | Note 18 Primary and secondary suppression are applied to counts less than 3. "2,5,8,9" -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Table 3-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Whole period age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 2 | This worksheet contains 1 table. 3 | "Source: Source: Office for National Statistics, National Immunisation Management Service." 
4 | Cause of Death Vaccination status Count of deaths Person-years "Age-standardised mortality rate / 100,000 person-years" Lower confidence limit Upper confidence limit 5 | All causes Unvaccinated 109891 16375484 2337.5 2322.6 2352.4 6 | All causes "First dose, less than 21 days ago" 17699 1925587 826.8 814.6 839.1 7 | All causes "First dose, at least 21 days ago" 77200 5536696 1289.2 1280.1 1298.3 8 | All causes "Second dose, less than 21 days ago" 11986 1878686 512.6 503.3 521.8 9 | All causes "Second dose, between 21 days and 6 months ago" 156537 13454401 868.4 864.1 872.7 10 | All causes "Second dose, at least 6 months ago" 71790 2664983 2106.5 2086.6 2126.4 11 | All causes "Third dose or booster, less than 21 days ago" 12868 1529103 569.4 550.1 588.7 12 | All causes "Third dose or booster, at least 21 days ago" 183038 11871491 883.2 869.5 897 13 | All causes Ever vaccinated 531118 38860947 957.4 954.8 960 14 | Deaths involving COVID-19 Unvaccinated 38285 16375484 863.2 854.1 872.4 15 | Deaths involving COVID-19 "First dose, less than 21 days ago" 4037 1925587 190.1 184.2 195.9 16 | Deaths involving COVID-19 "First dose, at least 21 days ago" 7270 5536696 122 119.2 124.8 17 | Deaths involving COVID-19 "Second dose, less than 21 days ago" 200 1878686 8.4 7.2 9.5 18 | Deaths involving COVID-19 "Second dose, between 21 days and 6 months ago" 5462 13454401 30.4 29.6 31.2 19 | Deaths involving COVID-19 "Second dose, at least 6 months ago" 6664 2664983 197.5 192.5 202.4 20 | Deaths involving COVID-19 "Third dose or booster, less than 21 days ago" 494 1529103 21.6 19.6 23.5 21 | Deaths involving COVID-19 "Third dose or booster, at least 21 days ago" 12048 11871491 58.5 53.4 63.5 22 | Deaths involving COVID-19 Ever vaccinated 36175 38860947 64.5 63.8 65.1 23 | Non-COVID-19 deaths Unvaccinated 71606 16375484 1474.3 1462.5 1486 24 | Non-COVID-19 deaths "First dose, less than 21 days ago" 13662 1925587 636.8 626.1 647.5 25 | Non-COVID-19 deaths "First dose, at least 21 days ago" 69930 5536696 1167.2 1158.5 1175.9 26 | Non-COVID-19 deaths "Second dose, less than 21 days ago" 11786 1878686 504.2 495.1 513.3 27 | Non-COVID-19 deaths "Second dose, between 21 days and 6 months ago" 151075 13454401 838 833.7 842.2 28 | Non-COVID-19 deaths "Second dose, at least 6 months ago" 65126 2664983 1909 1889.8 1928.2 29 | Non-COVID-19 deaths "Third dose or booster, less than 21 days ago" 12374 1529103 547.9 528.6 567.1 30 | Non-COVID-19 deaths "Third dose or booster, at least 21 days ago" 170990 11871491 824.8 812 837.6 31 | Non-COVID-19 deaths Ever vaccinated 494943 38860947 892.9 890.4 895.4 -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Table 6-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022" 2 | This worksheet contains 1 table. 3 | "Source: Source: Office for National Statistics, National Immunisation Management Service." 
4 | Age group Vaccination status Person-years Count of deaths involving COVID-19 Count of deaths non-COVID-19 deaths Count of all cause deaths
5 | 10-14 Unvaccinated 2881265 9 175 184
6 | 10-14 "First dose, less than 21 days ago" 61754 2 2 4
7 | 10-14 "First dose, at least 21 days ago" 280645 0 14 14
8 | 10-14 "Second dose, less than 21 days ago" 36646 0 0 0
9 | 10-14 "Second dose, between 21 days and 6 months ago" 135989 0 13 13
10 | 10-14 "Second dose, at least 6 months ago" 1028 0 1 1
11 | 10-14 "Third dose or booster, less than 21 days ago" 723 0 1 1
12 | 10-14 "Third dose or booster, at least 21 days ago" 2422 1 6 7
13 | 15-19 Unvaccinated 1991761 24 265 289
14 | 15-19 "First dose, less than 21 days ago" 115758 0 13 13
15 | 15-19 "First dose, at least 21 days ago" 465610 2 79 81
16 | 15-19 "Second dose, less than 21 days ago" 97554 1 3 4
17 | 15-19 "Second dose, between 21 days and 6 months ago" 520292 2 74 76
18 | 15-19 "Second dose, at least 6 months ago" 63581 0 23 23
19 | 15-19 "Third dose or booster, less than 21 days ago" 35398 1 2 3
20 | 15-19 "Third dose or booster, at least 21 days ago" 160272 1 31 32
21 | 20-24 Unvaccinated 1531301 43 335 378
22 | 20-24 "First dose, less than 21 days ago" 116923 1 21 22
23 | 20-24 "First dose, at least 21 days ago" 342619 5 104 109
24 | 20-24 "Second dose, less than 21 days ago" 110074 0 20 20
25 | 20-24 "Second dose, between 21 days and 6 months ago" 710759 4 151 155
26 | 20-24 "Second dose, at least 6 months ago" 209425 2 41 43
27 | 20-24 "Third dose or booster, less than 21 days ago" 67390 0 12 12
28 | 20-24 "Third dose or booster, at least 21 days ago" 414003 4 48 52
29 | 25-29 Unvaccinated 1567892 68 525 593
30 | 25-29 "First dose, less than 21 days ago" 117976 2 30 32
31 | 25-29 "First dose, at least 21 days ago" 339758 5 145 150
32 | 25-29 "Second dose, less than 21 days ago" 112913 0 19 19
33 | 25-29 "Second dose, between 21 days and 6 months ago" 748987 11 189 200
34 | 25-29 "Second dose, at least 6 months ago" 228001 3 92 95
35 | 25-29 "Third dose or booster, less than 21 days ago" 73969 0 7 7
36 | 25-29 "Third dose or booster, at least 21 days ago" 475006 5 95 100
37 | 30-34 Unvaccinated 1432230 129 649 778
38 | 30-34 "First dose, less than 21 days ago" 116485 3 46 49
39 | 30-34 "First dose, at least 21 days ago" 330087 9 222 231
40 | 30-34 "Second dose, less than 21 days ago" 112049 0 22 22
41 | 30-34 "Second dose, between 21 days and 6 months ago" 757164 10 297 307
42 | 30-34 "Second dose, at least 6 months ago" 219824 13 116 129
43 | 30-34 "Third dose or booster, less than 21 days ago" 77326 1 15 16
44 | 30-34 "Third dose or booster, at least 21 days ago" 510682 4 168 172
45 | 35-39 Unvaccinated 1351742 238 903 1141
46 | 35-39 "First dose, less than 21 days ago" 126946 9 62 71
47 | 35-39 "First dose, at least 21 days ago" 341664 15 364 379
48 | 35-39 "Second dose, less than 21 days ago" 123317 1 45 46
49 | 35-39 "Second dose, between 21 days and 6 months ago" 853238 24 463 487
50 | 35-39 "Second dose, at least 6 months ago" 225736 11 183 194
51 | 35-39 "Third dose or booster, less than 21 days ago" 91720 1 28 29
52 | 35-39 "Third dose or booster, at least 21 days ago" 620364 10 265 275
53 | 40-44 Unvaccinated 1158559 299 1225 1524
54 | 40-44 "First dose, less than 21 days ago" 133705 3 79 82
55 | 40-44 "First dose, at least 21 days ago" 354401 13 557 570
56 | 40-44 "Second dose, less than 21 days ago" 131852 0 75 75
57 | 40-44 "Second dose, between 21 days and 6 months ago" 951898 40 865 905
58 | 40-44 "Second dose, at least 6 months ago" 226923 39 344 383
59 | 40-44 "Third dose or booster, less than 21 days ago" 105778 1 44 45
60 | 40-44 "Third dose or booster, at least 21 days ago" 745195 25 516 541
61 | 45-49 Unvaccinated 1015441 597 1965 2562
62 | 45-49 "First dose, less than 21 days ago" 147654 13 166 179
63 | 45-49 "First dose, at least 21 days ago" 390029 39 926 965
64 | 45-49 "Second dose, less than 21 days ago" 144704 1 145 146
65 | 45-49 "Second dose, between 21 days and 6 months ago" 1056962 65 1556 1621
66 | 45-49 "Second dose, at least 6 months ago" 222412 60 563 623
67 | 45-49 "Third dose or booster, less than 21 days ago" 119884 6 85 91
68 | 45-49 "Third dose or booster, at least 21 days ago" 854328 39 975 1014
69 | 50-54 Unvaccinated 892001 1069 3014 4083
70 | 50-54 "First dose, less than 21 days ago" 170638 21 250 271
71 | 50-54 "First dose, at least 21 days ago" 461101 66 1640 1706
72 | 50-54 "Second dose, less than 21 days ago" 169954 0 225 225
73 | 50-54 "Second dose, between 21 days and 6 months ago" 1281727 120 2926 3046
74 | 50-54 "Second dose, at least 6 months ago" 239007 110 1161 1271
75 | 50-54 "Third dose or booster, less than 21 days ago" 149814 5 160 165
76 | 50-54 "Third dose or booster, at least 21 days ago" 1134549 85 2109 2194
77 | 55-59 Unvaccinated 811218 1626 4084 5710
78 | 55-59 "First dose, less than 21 days ago" 173094 44 391 435
79 | 55-59 "First dose, at least 21 days ago" 469286 113 2494 2607
80 | 55-59 "Second dose, less than 21 days ago" 173062 2 341 343
81 | 55-59 "Second dose, between 21 days and 6 months ago" 1318673 239 4690 4929
82 | 55-59 "Second dose, at least 6 months ago" 220215 181 1965 2146
83 | 55-59 "Third dose or booster, less than 21 days ago" 159483 7 314 321
84 | 55-59 "Third dose or booster, at least 21 days ago" 1244053 168 3708 3876
85 | 60-64 Unvaccinated 619004 2425 5135 7560
86 | 60-64 "First dose, less than 21 days ago" 153232 67 544 611
87 | 60-64 "First dose, at least 21 days ago" 417699 176 3463 3639
88 | 60-64 "Second dose, less than 21 days ago" 153930 2 514 516
89 | 60-64 "Second dose, between 21 days and 6 months ago" 1178767 345 6761 7106
90 | 60-64 "Second dose, at least 6 months ago" 175218 290 2774 3064
91 | 60-64 "Third dose or booster, less than 21 days ago" 146505 17 475 492
92 | 60-64 "Third dose or booster, at least 21 days ago" 1180409 265 5866 6131
93 | 65-69 Unvaccinated 429644 3051 6303 9354
94 | 65-69 "First dose, less than 21 days ago" 134086 122 796 918
95 | 65-69 "First dose, at least 21 days ago" 361884 242 4686 4928
96 | 65-69 "Second dose, less than 21 days ago" 134189 9 681 690
97 | 65-69 "Second dose, between 21 days and 6 months ago" 1026322 445 9597 10042
98 | 65-69 "Second dose, at least 6 months ago" 142579 436 3778 4214
99 | 65-69 "Third dose or booster, less than 21 days ago" 129293 32 698 730
100 | 65-69 "Third dose or booster, at least 21 days ago" 1089793 490 9186 9676
101 | 70-74 Unvaccinated 322630 4194 8090 12284
102 | 70-74 "First dose, less than 21 days ago" 137755 235 1355 1590
103 | 70-74 "First dose, at least 21 days ago" 372505 413 7651 8064
104 | 70-74 "Second dose, less than 21 days ago" 137947 12 1193 1205
105 | 70-74 "Second dose, between 21 days and 6 months ago" 1049233 708 16101 16809
106 | 70-74 "Second dose, at least 6 months ago" 136845 673 6201 6874
107 | 70-74 "Third dose or booster, less than 21 days ago" 131781 44 1196 1240
108 | 70-74 "Third dose or booster, at least 21 days ago" 1149162 946 15934 16880
109 | 75-79 Unvaccinated 181758 5044 8515 13559
110 | 75-79 "First dose, less than 21 days ago" 99957 467 1738 2205
111 | 75-79 "First dose, at least 21 days ago" 272819 810 9579 10389
112 | 75-79 "Second dose, less than 21 days ago" 101660 16 1556 1572
113 | 75-79 "Second dose, between 21 days and 6 months ago" 793830 825 20720 21545
114 | 75-79 "Second dose, at least 6 months ago" 109217 928 8302 9230
115 | 75-79 "Third dose or booster, less than 21 days ago" 103909 74 1599 1673
116 | 75-79 "Third dose or booster, at least 21 days ago" 986552 1592 23794 25386
117 | 80-84 Unvaccinated 89993 5841 8887 14728
118 | 80-84 "First dose, less than 21 days ago" 60797 803 2153 2956
119 | 80-84 "First dose, at least 21 days ago" 169997 1414 11035 12449
120 | 80-84 "Second dose, less than 21 days ago" 71225 44 1937 1981
121 | 80-84 "Second dose, between 21 days and 6 months ago" 545087 852 25161 26013
122 | 80-84 "Second dose, at least 6 months ago" 114217 1228 11737 12965
123 | 80-84 "Third dose or booster, less than 21 days ago" 68864 81 2041 2122
124 | 80-84 "Third dose or booster, at least 21 days ago" 662376 2198 30106 32304
125 | 85-89 Unvaccinated 58281 6437 9707 16144
126 | 85-89 "First dose, less than 21 days ago" 37103 961 2595 3556
127 | 85-89 "First dose, at least 21 days ago" 104886 1797 12234 14031
128 | 85-89 "Second dose, less than 21 days ago" 43168 62 2156 2218
129 | 85-89 "Second dose, between 21 days and 6 months ago" 334788 835 27721 28556
130 | 85-89 "Second dose, at least 6 months ago" 81233 1326 12935 14261
131 | 85-89 "Third dose or booster, less than 21 days ago" 42875 106 2414 2520
132 | 85-89 "Third dose or booster, at least 21 days ago" 413276 2783 34908 37691
133 | 90+ Unvaccinated 40762 7191 11829 19020
134 | 90+ "First dose, less than 21 days ago" 21725 1284 3421 4705
135 | 90+ "First dose, at least 21 days ago" 61706 2151 14737 16888
136 | 90+ "Second dose, less than 21 days ago" 24443 50 2854 2904
137 | 90+ "Second dose, between 21 days and 6 months ago" 190688 937 33790 34727
138 | 90+ "Second dose, at least 6 months ago" 49520 1364 14910 16274
139 | 90+ "Third dose or booster, less than 21 days ago" 24392 118 3283 3401
140 | 90+ "Third dose or booster, at least 21 days ago" 229050 3432 43275 46707
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Table 7-Table 1.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all registered deaths by vaccination status by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Cause of Death Age group Vaccination status Count of Deaths
5 | All causes 10-39 Unvaccinated 5678
6 | All causes 10-39 "First dose, less than 21 days ago" 243
7 | All causes 10-39 "First dose, at least 21 days ago" 1316
8 | All causes 10-39 "Second dose, less than 21 days ago" 159
9 | All causes 10-39 "Second dose, between 21 days and 6 months ago" 1607
10 | All causes 10-39 "Second dose, at least 6 months ago" 630
11 | All causes 10-39 "Third dose or booster, less than 21 days ago" 78
12 | All causes 10-39 "Third dose or booster, at least 21 days ago" 746
13 | All causes 40-49 Unvaccinated 6908
14 | All causes 40-49 "First dose, less than 21 days ago" 360
15 | All causes 40-49 "First dose, at least 21 days ago" 2225
16 | All causes 40-49 "Second dose, less than 21 days ago" 298
17 | All causes 40-49 "Second dose, between 21 days and 6 months ago" 3281
18 | All causes 40-49 "Second dose, at least 6 months ago" 1332
19 | All causes 40-49 "Third dose or booster, less than 21 days ago" 161
20 | All causes 40-49 "Third dose or booster, at least 21 days ago" 1900
21 | All causes 50-59 Unvaccinated 14466
22 | All causes 50-59 "First dose, less than 21 days ago" 882
23 | All causes 50-59 "First dose, at least 21 days ago" 5699
24 | All causes 50-59 "Second dose, less than 21 days ago" 709
25 | All causes 50-59 "Second dose, between 21 days and 6 months ago" 9923
26 | All causes 50-59 "Second dose, at least 6 months ago" 4381
27 | All causes 50-59 "Third dose or booster, less than 21 days ago" 600
28 | All causes 50-59 "Third dose or booster, at least 21 days ago" 7333
29 | All causes 60-69 Unvaccinated 22133
30 | All causes 60-69 "First dose, less than 21 days ago" 1794
31 | All causes 60-69 "First dose, at least 21 days ago" 10333
32 | All causes 60-69 "Second dose, less than 21 days ago" 1436
33 | All causes 60-69 "Second dose, between 21 days and 6 months ago" 20247
34 | All causes 60-69 "Second dose, at least 6 months ago" 8795
35 | All causes 60-69 "Third dose or booster, less than 21 days ago" 1410
36 | All causes 60-69 "Third dose or booster, at least 21 days ago" 18413
37 | All causes 70-79 Unvaccinated 31333
38 | All causes 70-79 "First dose, less than 21 days ago" 4312
39 | All causes 70-79 "First dose, at least 21 days ago" 21214
40 | All causes 70-79 "Second dose, less than 21 days ago" 3128
41 | All causes 70-79 "Second dose, between 21 days and 6 months ago" 43738
42 | All causes 70-79 "Second dose, at least 6 months ago" 18453
43 | All causes 70-79 "Third dose or booster, less than 21 days ago" 3279
44 | All causes 70-79 "Third dose or booster, at least 21 days ago" 47630
45 | All causes 80-89 Unvaccinated 36259
46 | All causes 80-89 "First dose, less than 21 days ago" 7362
47 | All causes 80-89 "First dose, at least 21 days ago" 29945
48 | All causes 80-89 "Second dose, less than 21 days ago" 4730
49 | All causes 80-89 "Second dose, between 21 days and 6 months ago" 61294
50 | All causes 80-89 "Second dose, at least 6 months ago" 30668
51 | All causes 80-89 "Third dose or booster, less than 21 days ago" 5187
52 | All causes 80-89 "Third dose or booster, at least 21 days ago" 78216
53 | All causes 90+ Unvaccinated 21927
54 | All causes 90+ "First dose, less than 21 days ago" 5286
55 | All causes 90+ "First dose, at least 21 days ago" 19056
56 | All causes 90+ "Second dose, less than 21 days ago" 3242
57 | All causes 90+ "Second dose, between 21 days and 6 months ago" 39061
58 | All causes 90+ "Second dose, at least 6 months ago" 18277
59 | All causes 90+ "Third dose or booster, less than 21 days ago" 3814
60 | All causes 90+ "Third dose or booster, at least 21 days ago" 52353
61 | Deaths involving COVID-19 10-39 Unvaccinated 795
62 | Deaths involving COVID-19 10-39 "First dose, less than 21 days ago" 14
63 | Deaths involving COVID-19 10-39 "First dose, at least 21 days ago" 47
64 | Deaths involving COVID-19 10-39 "Second dose, less than 21 days ago" 3
65 | Deaths involving COVID-19 10-39 "Second dose, between 21 days and 6 months ago" 62
66 | Deaths involving COVID-19 10-39 "Second dose, at least 6 months ago" 39
67 | Deaths involving COVID-19 10-39 "Third dose or booster, less than 21 days ago" 4
68 | Deaths involving COVID-19 10-39 "Third dose or booster, at least 21 days ago" 33
69 | Deaths involving COVID-19 40-49 Unvaccinated 1441
70 | Deaths involving COVID-19 40-49 "First dose, less than 21 days ago" 19
71 | Deaths involving COVID-19 40-49 "First dose, at least 21 days ago" 86
72 | Deaths involving COVID-19 40-49 "Second dose, less than 21 days ago" 4
73 | Deaths involving COVID-19 40-49 "Second dose, between 21 days and 6 months ago" 134
74 | Deaths involving COVID-19 40-49 "Second dose, at least 6 months ago" 126
75 | Deaths involving COVID-19 40-49 "Third dose or booster, less than 21 days ago" 10
76 | Deaths involving COVID-19 40-49 "Third dose or booster, at least 21 days ago" 76
77 | Deaths involving COVID-19 50-59 Unvaccinated 3743
78 | Deaths involving COVID-19 50-59 "First dose, less than 21 days ago" 79
79 | Deaths involving COVID-19 50-59 "First dose, at least 21 days ago" 239
80 | Deaths involving COVID-19 50-59 "Second dose, less than 21 days ago" 3
81 | Deaths involving COVID-19 50-59 "Second dose, between 21 days and 6 months ago" 418
82 | Deaths involving COVID-19 50-59 "Second dose, at least 6 months ago" 350
83 | Deaths involving COVID-19 50-59 "Third dose or booster, less than 21 days ago" 17
84 | Deaths involving COVID-19 50-59 "Third dose or booster, at least 21 days ago" 297
85 | Deaths involving COVID-19 60-69 Unvaccinated 6937
86 | Deaths involving COVID-19 60-69 "First dose, less than 21 days ago" 218
87 | Deaths involving COVID-19 60-69 "First dose, at least 21 days ago" 506
88 | Deaths involving COVID-19 60-69 "Second dose, less than 21 days ago" 14
89 | Deaths involving COVID-19 60-69 "Second dose, between 21 days and 6 months ago" 933
90 | Deaths involving COVID-19 60-69 "Second dose, at least 6 months ago" 865
91 | Deaths involving COVID-19 60-69 "Third dose or booster, less than 21 days ago" 59
92 | Deaths involving COVID-19 60-69 "Third dose or booster, at least 21 days ago" 862
93 | Deaths involving COVID-19 70-79 Unvaccinated 11011
94 | Deaths involving COVID-19 70-79 "First dose, less than 21 days ago" 787
95 | Deaths involving COVID-19 70-79 "First dose, at least 21 days ago" 1410
96 | Deaths involving COVID-19 70-79 "Second dose, less than 21 days ago" 31
97 | Deaths involving COVID-19 70-79 "Second dose, between 21 days and 6 months ago" 1744
98 | Deaths involving COVID-19 70-79 "Second dose, at least 6 months ago" 1804
99 | Deaths involving COVID-19 70-79 "Third dose or booster, less than 21 days ago" 141
100 | Deaths involving COVID-19 70-79 "Third dose or booster, at least 21 days ago" 2830
101 | Deaths involving COVID-19 80-89 Unvaccinated 14248
102 | Deaths involving COVID-19 80-89 "First dose, less than 21 days ago" 2007
103 | Deaths involving COVID-19 80-89 "First dose, at least 21 days ago" 3637
104 | Deaths involving COVID-19 80-89 "Second dose, less than 21 days ago" 121
105 | Deaths involving COVID-19 80-89 "Second dose, between 21 days and 6 months ago" 1923
106 | Deaths involving COVID-19 80-89 "Second dose, at least 6 months ago" 2878
107 | Deaths involving COVID-19 80-89 "Third dose or booster, less than 21 days ago" 203
108 | Deaths involving COVID-19 80-89 "Third dose or booster, at least 21 days ago" 5566
109 | Deaths involving COVID-19 90+ Unvaccinated 8187
110 | Deaths involving COVID-19 90+ "First dose, less than 21 days ago" 1459
111 | Deaths involving COVID-19 90+ "First dose, at least 21 days ago" 2447
112 | Deaths involving COVID-19 90+ "Second dose, less than 21 days ago" 58
113 | Deaths involving COVID-19 90+ "Second dose, between 21 days and 6 months ago" 1058
114 | Deaths involving COVID-19 90+ "Second dose, at least 6 months ago" 1534
115 | Deaths involving COVID-19 90+ "Third dose or booster, less than 21 days ago" 131
116 | Deaths involving COVID-19 90+ "Third dose or booster, at least 21 days ago" 3830
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Table 9-Table 1.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all registered deaths grouped by how many weeks after vaccination the deaths occurred; for deaths involving COVID-19 and deaths not involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Week after vaccination Age group Count of Deaths involving COVID-19 Count of Non-COVID-19 Deaths
5 | 1 10-39 <3 115
6 | 2 10-39 11 168
7 | 3 10-39 8 176
8 | 4 10-39 7 210
9 | 5 10-39 6 175
10 | 6 10-39 6 190
11 | 7 10-39 4 207
12 | 8 10-39 6 185
13 | 9 10-39 7 210
14 | 10 10-39 3 182
15 | 11 10-39 8 182
16 | 12+ 10-39 134 2577
17 | 1 40-49 5 175
18 | 2 40-49 14 275
19 | 3 40-49 14 336
20 | 4 40-49 10 330
21 | 5 40-49 14 353
22 | 6 40-49 10 349
23 | 7 40-49 12 386
24 | 8 40-49 10 373
25 | 9 40-49 12 374
26 | 10 40-49 13 358
27 | 11 40-49 13 353
28 | 12+ 40-49 328 5440
29 | 1 50-59 8 431
30 | 2 50-59 34 809
31 | 3 50-59 57 852
32 | 4 50-59 53 915
33 | 5 50-59 46 1034
34 | 6 50-59 26 1058
35 | 7 50-59 30 1063
36 | 8 50-59 27 1054
37 | 9 50-59 31 1091
38 | 10 50-59 26 1181
39 | 11 50-59 34 1038
40 | 12+ 50-59 1031 17598
41 | 1 60-69 35 950
42 | 2 60-69 120 1557
43 | 3 60-69 136 1842
44 | 4 60-69 120 2009
45 | 5 60-69 123 2073
46 | 6 60-69 95 2130
47 | 7 60-69 78 2227
48 | 8 60-69 67 2249
49 | 9 60-69 54 2255
50 | 10 60-69 58 2388
51 | 11 60-69 73 2278
52 | 12+ 60-69 2498 37013
53 | 1 70-79 104 2118
54 | 2 70-79 347 3600
55 | 3 70-79 508 4042
56 | 4 70-79 449 4333
57 | 5 70-79 345 4696
58 | 6 70-79 257 4929
59 | 7 70-79 198 5032
60 | 8 70-79 155 5120
61 | 9 70-79 151 5030
62 | 10 70-79 156 5250
63 | 11 70-79 170 5098
64 | 12+ 70-79 5907 83759
65 | 1 80-89 225 3395
66 | 2 80-89 843 5415
67 | 3 80-89 1263 6138
68 | 4 80-89 1148 6622
69 | 5 80-89 807 6949
70 | 6 80-89 635 7110
71 | 7 80-89 475 7270
72 | 8 80-89 380 7372
73 | 9 80-89 329 7400
74 | 10 80-89 266 7364
75 | 11 80-89 294 7225
76 | 12+ 80-89 9670 128807
77 | 1 90+ 184 2403
78 | 2 90+ 549 3865
79 | 3 90+ 915 4426
80 | 4 90+ 795 4670
81 | 5 90+ 585 4798
82 | 6 90+ 402 4853
83 | 7 90+ 319 4863
84 | 8 90+ 233 4951
85 | 9 90+ 206 4970
86 | 10 90+ 179 4750
87 | 11 90+ 167 4640
88 | 12+ 90+ 5983 81383
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | seaborn
4 | dask
5 |
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/table6.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Age group Vaccination status Person-years Count of deaths involving COVID-19 Count of deaths non-COVID-19 deaths Count of all cause deaths
5 | 10-14 Unvaccinated 2881265 9 175 184
6 | 10-14 "First dose, less than 21 days ago" 61754 2 2 4
7 | 10-14 "First dose, at least 21 days ago" 280645 0 14 14
8 | 10-14 "Second dose, less than 21 days ago" 36646 0 0 0
9 | 10-14 "Second dose, between 21 days and 6 months ago" 135989 0 13 13
10 | 10-14 "Second dose, at least 6 months ago" 1028 0 1 1
11 | 10-14 "Third dose or booster, less than 21 days ago" 723 0 1 1
12 | 10-14 "Third dose or booster, at least 21 days ago" 2422 1 6 7
13 | 15-19 Unvaccinated 1991761 24 265 289
14 | 15-19 "First dose, less than 21 days ago" 115758 0 13 13
15 | 15-19 "First dose, at least 21 days ago" 465610 2 79 81
16 | 15-19 "Second dose, less than 21 days ago" 97554 1 3 4
17 | 15-19 "Second dose, between 21 days and 6 months ago" 520292 2 74 76
18 | 15-19 "Second dose, at least 6 months ago" 63581 0 23 23
19 | 15-19 "Third dose or booster, less than 21 days ago" 35398 1 2 3
20 | 15-19 "Third dose or booster, at least 21 days ago" 160272 1 31 32
21 | 20-24 Unvaccinated 1531301 43 335 378
22 | 20-24 "First dose, less than 21 days ago" 116923 1 21 22
23 | 20-24 "First dose, at least 21 days ago" 342619 5 104 109
24 | 20-24 "Second dose, less than 21 days ago" 110074 0 20 20
25 | 20-24 "Second dose, between 21 days and 6 months ago" 710759 4 151 155
26 | 20-24 "Second dose, at least 6 months ago" 209425 2 41 43
27 | 20-24 "Third dose or booster, less than 21 days ago" 67390 0 12 12
28 | 20-24 "Third dose or booster, at least 21 days ago" 414003 4 48 52
29 | 25-29 Unvaccinated 1567892 68 525 593
30 | 25-29 "First dose, less than 21 days ago" 117976 2 30 32
31 | 25-29 "First dose, at least 21 days ago" 339758 5 145 150
32 | 25-29 "Second dose, less than 21 days ago" 112913 0 19 19
33 | 25-29 "Second dose, between 21 days and 6 months ago" 748987 11 189 200
34 | 25-29 "Second dose, at least 6 months ago" 228001 3 92 95
35 | 25-29 "Third dose or booster, less than 21 days ago" 73969 0 7 7
36 | 25-29 "Third dose or booster, at least 21 days ago" 475006 5 95 100
37 | 30-34 Unvaccinated 1432230 129 649 778
38 | 30-34 "First dose, less than 21 days ago" 116485 3 46 49
39 | 30-34 "First dose, at least 21 days ago" 330087 9 222 231
40 | 30-34 "Second dose, less than 21 days ago" 112049 0 22 22
41 | 30-34 "Second dose, between 21 days and 6 months ago" 757164 10 297 307
42 | 30-34 "Second dose, at least 6 months ago" 219824 13 116 129
43 | 30-34 "Third dose or booster, less than 21 days ago" 77326 1 15 16
44 | 30-34 "Third dose or booster, at least 21 days ago" 510682 4 168 172
45 | 35-39 Unvaccinated 1351742 238 903 1141
46 | 35-39 "First dose, less than 21 days ago" 126946 9 62 71
47 | 35-39 "First dose, at least 21 days ago" 341664 15 364 379
48 | 35-39 "Second dose, less than 21 days ago" 123317 1 45 46
49 | 35-39 "Second dose, between 21 days and 6 months ago" 853238 24 463 487
50 | 35-39 "Second dose, at least 6 months ago" 225736 11 183 194
51 | 35-39 "Third dose or booster, less than 21 days ago" 91720 1 28 29
52 | 35-39 "Third dose or booster, at least 21 days ago" 620364 10 265 275
53 | 40-44 Unvaccinated 1158559 299 1225 1524
54 | 40-44 "First dose, less than 21 days ago" 133705 3 79 82
55 | 40-44 "First dose, at least 21 days ago" 354401 13 557 570
56 | 40-44 "Second dose, less than 21 days ago" 131852 0 75 75
57 | 40-44 "Second dose, between 21 days and 6 months ago" 951898 40 865 905
58 | 40-44 "Second dose, at least 6 months ago" 226923 39 344 383
59 | 40-44 "Third dose or booster, less than 21 days ago" 105778 1 44 45
60 | 40-44 "Third dose or booster, at least 21 days ago" 745195 25 516 541
61 | 45-49 Unvaccinated 1015441 597 1965 2562
62 | 45-49 "First dose, less than 21 days ago" 147654 13 166 179
63 | 45-49 "First dose, at least 21 days ago" 390029 39 926 965
64 | 45-49 "Second dose, less than 21 days ago" 144704 1 145 146
65 | 45-49 "Second dose, between 21 days and 6 months ago" 1056962 65 1556 1621
66 | 45-49 "Second dose, at least 6 months ago" 222412 60 563 623
67 | 45-49 "Third dose or booster, less than 21 days ago" 119884 6 85 91
68 | 45-49 "Third dose or booster, at least 21 days ago" 854328 39 975 1014
69 | 50-54 Unvaccinated 892001 1069 3014 4083
70 | 50-54 "First dose, less than 21 days ago" 170638 21 250 271
71 | 50-54 "First dose, at least 21 days ago" 461101 66 1640 1706
72 | 50-54 "Second dose, less than 21 days ago" 169954 0 225 225
73 | 50-54 "Second dose, between 21 days and 6 months ago" 1281727 120 2926 3046
74 | 50-54 "Second dose, at least 6 months ago" 239007 110 1161 1271
75 | 50-54 "Third dose or booster, less than 21 days ago" 149814 5 160 165
76 | 50-54 "Third dose or booster, at least 21 days ago" 1134549 85 2109 2194
77 | 55-59 Unvaccinated 811218 1626 4084 5710
78 | 55-59 "First dose, less than 21 days ago" 173094 44 391 435
79 | 55-59 "First dose, at least 21 days ago" 469286 113 2494 2607
80 | 55-59 "Second dose, less than 21 days ago" 173062 2 341 343
81 | 55-59 "Second dose, between 21 days and 6 months ago" 1318673 239 4690 4929
82 | 55-59 "Second dose, at least 6 months ago" 220215 181 1965 2146
83 | 55-59 "Third dose or booster, less than 21 days ago" 159483 7 314 321
84 | 55-59 "Third dose or booster, at least 21 days ago" 1244053 168 3708 3876
85 | 60-64 Unvaccinated 619004 2425 5135 7560
86 | 60-64 "First dose, less than 21 days ago" 153232 67 544 611
87 | 60-64 "First dose, at least 21 days ago" 417699 176 3463 3639
88 | 60-64 "Second dose, less than 21 days ago" 153930 2 514 516
89 | 60-64 "Second dose, between 21 days and 6 months ago" 1178767 345 6761 7106
90 | 60-64 "Second dose, at least 6 months ago" 175218 290 2774 3064
91 | 60-64 "Third dose or booster, less than 21 days ago" 146505 17 475 492
92 | 60-64 "Third dose or booster, at least 21 days ago" 1180409 265 5866 6131
93 | 65-69 Unvaccinated 429644 3051 6303 9354
94 | 65-69 "First dose, less than 21 days ago" 134086 122 796 918
95 | 65-69 "First dose, at least 21 days ago" 361884 242 4686 4928
96 | 65-69 "Second dose, less than 21 days ago" 134189 9 681 690
97 | 65-69 "Second dose, between 21 days and 6 months ago" 1026322 445 9597 10042
98 | 65-69 "Second dose, at least 6 months ago" 142579 436 3778 4214
99 | 65-69 "Third dose or booster, less than 21 days ago" 129293 32 698 730
100 | 65-69 "Third dose or booster, at least 21 days ago" 1089793 490 9186 9676
101 | 70-74 Unvaccinated 322630 4194 8090 12284
102 | 70-74 "First dose, less than 21 days ago" 137755 235 1355 1590
103 | 70-74 "First dose, at least 21 days ago" 372505 413 7651 8064
104 | 70-74 "Second dose, less than 21 days ago" 137947 12 1193 1205
105 | 70-74 "Second dose, between 21 days and 6 months ago" 1049233 708 16101 16809
106 | 70-74 "Second dose, at least 6 months ago" 136845 673 6201 6874
107 | 70-74 "Third dose or booster, less than 21 days ago" 131781 44 1196 1240
108 | 70-74 "Third dose or booster, at least 21 days ago" 1149162 946 15934 16880
109 | 75-79 Unvaccinated 181758 5044 8515 13559
110 | 75-79 "First dose, less than 21 days ago" 99957 467 1738 2205
111 | 75-79 "First dose, at least 21 days ago" 272819 810 9579 10389
112 | 75-79 "Second dose, less than 21 days ago" 101660 16 1556 1572
113 | 75-79 "Second dose, between 21 days and 6 months ago" 793830 825 20720 21545
114 | 75-79 "Second dose, at least 6 months ago" 109217 928 8302 9230
115 | 75-79 "Third dose or booster, less than 21 days ago" 103909 74 1599 1673
116 | 75-79 "Third dose or booster, at least 21 days ago" 986552 1592 23794 25386
117 | 80-84 Unvaccinated 89993 5841 8887 14728
118 | 80-84 "First dose, less than 21 days ago" 60797 803 2153 2956
119 | 80-84 "First dose, at least 21 days ago" 169997 1414 11035 12449
120 | 80-84 "Second dose, less than 21 days ago" 71225 44 1937 1981
121 | 80-84 "Second dose, between 21 days and 6 months ago" 545087 852 25161 26013
122 | 80-84 "Second dose, at least 6 months ago" 114217 1228 11737 12965
123 | 80-84 "Third dose or booster, less than 21 days ago" 68864 81 2041 2122
124 | 80-84 "Third dose or booster, at least 21 days ago" 662376 2198 30106 32304
125 | 85-89 Unvaccinated 58281 6437 9707 16144
126 | 85-89 "First dose, less than 21 days ago" 37103 961 2595 3556
127 | 85-89 "First dose, at least 21 days ago" 104886 1797 12234 14031
128 | 85-89 "Second dose, less than 21 days ago" 43168 62 2156 2218
129 | 85-89 "Second dose, between 21 days and 6 months ago" 334788 835 27721 28556
130 | 85-89 "Second dose, at least 6 months ago" 81233 1326 12935 14261
131 | 85-89 "Third dose or booster, less than 21 days ago" 42875 106 2414 2520
132 | 85-89 "Third dose or booster, at least 21 days ago" 413276 2783 34908 37691
133 | 90+ Unvaccinated 40762 7191 11829 19020
134 | 90+ "First dose, less than 21 days ago" 21725 1284 3421 4705
135 | 90+ "First dose, at least 21 days ago" 61706 2151 14737 16888
136 | 90+ "Second dose, less than 21 days ago" 24443 50 2854 2904
137 | 90+ "Second dose, between 21 days and 6 months ago" 190688 937 33790 34727
138 | 90+ "Second dose, at least 6 months ago" 49520 1364 14910 16274
139 | 90+ "Third dose or booster, less than 21 days ago" 24392 118 3283 3401
140 | 90+ "Third dose or booster, at least 21 days ago" 229050 3432 43275 46707
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # talks
2 | Code snippets to use in talks
--------------------------------------------------------------------------------
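table6.tsv above is a root-level copy of Table 6, and the asmr folder in this project holds the ONS calculation template built on the 2013 European Standard Population. As a hedged sketch of an age-standardised comparison across vaccination statuses — the weights below are the published ESP 2013 values for the age bands present in the table (with 90+ combining the 90-94 and 95+ weights), and the code is illustrative rather than the notebook's actual method:

```python
import pandas as pd

# ESP 2013 weights for the 5-year bands in table6.tsv; 90+ = 800 + 200.
ESP_2013 = {
    "10-14": 5500, "15-19": 5500, "20-24": 6000, "25-29": 6000,
    "30-34": 6500, "35-39": 7000, "40-44": 7000, "45-49": 7000,
    "50-54": 7000, "55-59": 6500, "60-64": 6000, "65-69": 5500,
    "70-74": 5000, "75-79": 4000, "80-84": 2500, "85-89": 1500,
    "90+": 1000,
}

# Same loading assumptions as the earlier sketch: tab-separated, 3 metadata rows.
t6 = pd.read_csv("table6.tsv", sep="\t", skiprows=3)
t6["weight"] = t6["Age group"].map(ESP_2013)
t6["crude_rate"] = t6["Count of all cause deaths"] / t6["Person-years"] * 100_000

# ASMR = weighted average of the age-specific crude rates.
asmr = (
    t6.assign(weighted_rate=t6["crude_rate"] * t6["weight"])
      .groupby("Vaccination status")[["weighted_rate", "weight"]]
      .sum()
)
asmr["asmr_per_100k"] = asmr["weighted_rate"] / asmr["weight"]
print(asmr["asmr_per_100k"].sort_values())
```

Because the table starts at age 10, the weights are normalised over the bands actually present; the resulting rates are comparable to each other but not to ASMRs standardised over the full ESP age range.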