├── 2018 ├── daskvsspark │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── daskvsspark │ │ ├── __init__.py │ │ ├── aggregate_dask.py │ │ ├── aggregate_dask.sh │ │ ├── aggregate_dask_kube.sh │ │ ├── aggregate_dask_yarn.sh │ │ ├── aggregate_spark.py │ │ ├── aggregate_spark.sh │ │ ├── aggregate_spark_yarn.sh │ │ ├── common.py │ │ ├── context.py │ │ ├── notes.txt │ │ ├── prepare.py │ │ ├── prepare.sh │ │ ├── schema.py │ │ ├── show.py │ │ ├── start_dask.sh │ │ └── start_dask_yarn.py │ ├── deployment │ │ ├── bootstrap.sh │ │ ├── conf.json │ │ ├── config.yaml │ │ ├── create_cluster.sh │ │ ├── deploy_code.sh │ │ ├── deploy_data.sh │ │ ├── deploy_reqs.sh │ │ ├── instances.json │ │ ├── log4j.properties │ │ └── setup_dvss.sh │ ├── docker-compose.yml │ ├── dvss-helm │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── _helpers.tpl │ │ │ ├── scheduler-deployment.yaml │ │ │ ├── scheduler-service.yaml │ │ │ └── worker-deployment.yaml │ │ └── values.yaml │ ├── requirements-dask.txt │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── samples │ │ └── agg1hour.json │ ├── scala │ │ ├── README.md │ │ ├── build.sbt │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── jbennet │ │ │ │ └── daskvsspark │ │ │ │ └── udafs.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── jbennet │ │ │ └── daskvsspark │ │ │ └── AggregateCounterTest.scala │ ├── setup.py │ └── trials │ │ ├── Aggregate without index.ipynb │ │ ├── Custom aggregations.ipynb │ │ ├── aggregate1.py │ │ └── aggregate2.py ├── datetimes │ ├── 01_event-table.png │ ├── 02_event_table_utc.png │ ├── 03_event_table_la.png │ ├── 04_event_table_floor.png │ ├── 05_events_grouped.png │ ├── datetime-challenges.ipynb │ └── requirements.txt ├── sqlpandas │ ├── README.md │ ├── data-hp │ │ ├── houses.csv │ │ └── students.csv │ ├── data │ │ ├── airport-frequencies.csv │ │ ├── airports.csv │ │ ├── countries.csv │ │ ├── navaids.csv │ │ ├── regions.csv │ │ └── runways.csv │ ├── download_data.sh │ ├── explore.ipynb │ └── images │ │ ├── by_country.png │ │ ├── by_country_top10.png │ │ ├── having1.png │ │ ├── having2.png │ │ ├── notebook.png │ │ ├── runways.png │ │ ├── runways_agg1.png │ │ └── runways_agg2.png └── windows │ ├── README.md │ ├── Window functions.ipynb │ ├── social_deltas.csv │ ├── social_totals.csv │ └── social_totals_agg.csv ├── 2019 ├── pandasdb │ └── read_csv_file.py └── sparkstart │ ├── context.py │ ├── driver.py │ └── runner.sh ├── 2021 └── covid-travel │ ├── .gitignore │ ├── Covid and air travel.ipynb │ ├── README.md │ ├── all_by_age_race.csv │ ├── covid_and_air_travel.png │ ├── flight_infection_risk.csv │ ├── requirements.txt │ └── test.py ├── 2022 └── uk-covid-deaths │ ├── asmr │ ├── agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls │ └── espmortalityratesreport_tcm77-364912.pdf │ ├── output1.png │ ├── output2.png │ ├── output3.png │ ├── output4.png │ ├── output5.png │ ├── referencetable06072022accessible │ ├── Contents-Table 1.tsv │ ├── Cover-Table 1.tsv │ ├── Definitions-Table 1.tsv │ ├── Notes-Table 1.tsv │ ├── Table 1-Table 1.tsv │ ├── Table 2-Table 1.tsv │ ├── Table 3-Table 1.tsv │ ├── Table 4-Table 1.tsv │ ├── Table 5-Table 1.tsv │ ├── Table 6-Table 1.tsv │ ├── Table 7-Table 1.tsv │ ├── Table 8-Table 1.tsv │ └── Table 9-Table 1.tsv │ ├── requirements.txt │ ├── table1.tsv │ ├── table6.tsv │ ├── table8.tsv │ └── uk_deaths_by_vacc.ipynb ├── .gitignore └── README.md /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pycharm 104 | .idea/ 105 | 106 | *.parquet 107 | *.crc 108 | 109 | spark-warehouse/ 110 | metastore_db/ 111 | aggs*/ 112 | _SUCCESS 113 | 114 | .DS_Store 115 | /2020/covid-travel/COVID-19_Case_Surveillance_Public_Use_Data.csv 116 | /2022/uk-covid-deaths/owid-covid-data.csv 117 | -------------------------------------------------------------------------------- /2018/daskvsspark/.dockerignore: -------------------------------------------------------------------------------- 1 | build 2 | *.egg-info 3 | deployment 4 | dist 5 | images 6 | samples 7 | scala 8 | tmp 9 | trials 10 | 11 | **/aggs_* 12 | **/events 13 | **/*.log.* 14 | metastore_db 15 | dask-worker-space 16 | spark-warehouse 17 | -------------------------------------------------------------------------------- /2018/daskvsspark/.gitignore: -------------------------------------------------------------------------------- 1 | images/ 2 | aggs_*/ 3 | events/ 4 | dask-worker-space/ 5 | spark-warehouse/ 6 | out/ 7 | project/ 8 | target/ 9 | tmp* 10 | *.log.* 11 | tmp/ 12 | -------------------------------------------------------------------------------- /2018/daskvsspark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daskdev/dask:latest 2 | 3 | RUN pip install awscli 4 | 5 | ARG AWS_ACCESS_KEY_ID 6 | ARG AWS_SECRET_ACCESS_KEY 7 | ARG AWS_DEFAULT_REGION 8 | 9 | ENV AWS_ACCESS_KEY_ID $AWS_ACCESS_KEY_ID 10 | ENV AWS_SECRET_ACCESS_KEY $AWS_SECRET_ACCESS_KEY 11 | ENV AWS_DEFAULT_REGION $AWS_DEFAULT_REGION 12 | ENV CONDA_ROOT $(conda info --root) 13 | 14 | RUN echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" 15 | 16 | # add the reqs 17 | ADD ./requirements*.txt /assets/code/ 18 | 19 | # install the reqs 20 | WORKDIR /assets/code 21 | RUN conda install --copy -y -c conda-forge --file requirements.txt 
--file requirements-dask.txt --file requirements-dev.txt 22 | 23 | # add the code 24 | ADD ./daskvsspark/*.py /assets/code/daskvsspark/ 25 | ADD ./daskvsspark/aggregate_*.sh /assets/code/daskvsspark/ 26 | ADD ./setup.py /assets/code/ 27 | 28 | # install the code into conda root env 29 | RUN python setup.py install 30 | 31 | RUN apt-get install -y vim 32 | -------------------------------------------------------------------------------- /2018/daskvsspark/README.md: -------------------------------------------------------------------------------- 1 | What is this? 2 | ============= 3 | 4 | An example of data aggregation in Spark and in Dask. 5 | 6 | How do I use it? 7 | ================ 8 | 9 | To run this locally, you need an Apache Spark distribution 10 | (let's say it's in `$HOME/bin/`). Then, after setting some 11 | environment variables: 12 | 13 | ``` 14 | export SPARK_HOME="$HOME/bin/spark-2.1.1-bin-hadoop2.7" 15 | export PYTHONPATH="$SPARK_HOME/python/lib/pyspark.zip:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH" 16 | ``` 17 | 18 | you can run a Spark script as simply as: 19 | 20 | ``` 21 | python main.py 22 | ``` 23 | 24 | The above is good enough for testing. In real life, you'd use ``spark-submit``: 25 | 26 | ``` 27 | PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 28 | spark-submit \ 29 | --master "local[4]" \ 30 | --deploy-mode client \ 31 | main.py 32 | ``` 33 | 34 | Generate input data 35 | ------------------- 36 | 37 | A script is included to mock some input data. It writes Parquet to the `./events/` directory. 38 | To run it with Spark: 39 | 40 | ``` 41 | prepare.sh 42 | ``` 43 | 44 | By default, it'll generate 100 input records and assume 100k records per partition (one parquet 45 | file). You can provide different numbers: 46 | 47 | ``` 48 | prepare.sh [total-records] [records-per-partition] 49 | ``` 50 | 51 | The data is partitioned on disk by year, month, day, hour, and customer. The script generates 1 day 52 | of data. This means that at least 24 files (partitions) will be created, because we can't create 53 | fewer than one partition per hour. The script will write Parquet to 54 | ``./events/[number-of-records]-[number-of-partitions]``. 55 | 56 | Make sure that the spark-submit settings in ``prepare.sh`` (``driver-memory``, 57 | ``executor-memory``, ``num-executors``) will work for you. 58 | 59 | Aggregate with Spark 60 | --------------------- 61 | 62 | Run this: 63 | 64 | ``` 65 | aggregate_spark.sh [number-of-records] [number-of-partitions] 66 | ``` 67 | 68 | This will read the data from ``./events`` and write the aggregates as JSON 69 | to ``./aggs_spark/[number-of-records]-[number-of-partitions]``. 70 | 71 | Aggregate with Dask 72 | ------------------- 73 | 74 | Run this: 75 | 76 | ``` 77 | python aggregate_dask.py --count [number-of-records] --nfiles [number-of-partitions] 78 | ``` 79 | 80 | This will read the data from `./events` and write the aggregates as JSON 81 | to ``./aggs_dask/[number-of-records]-[number-of-partitions]``. 82 | 83 | Inspect the data 84 | ---------------- 85 | 86 | A script is included to pretty-print generated JSON records. For example, 87 | this: 88 | 89 | ``` 90 | python show.py ./aggs_dask/100-24 3 91 | ``` 92 | 93 | will pretty-print 3 JSON records from the ``./aggs_dask/100-24`` directory.
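Custom input and output ----------------------- Both ``aggregate_spark.py`` and ``aggregate_dask.py`` also accept ``--input`` and ``--output`` flags (see their argparse setup), so the same jobs can read and write locations other than ``./events`` and ``./aggs_*``, including S3 paths, as the YARN runner scripts do. A minimal sketch (the bucket name here is made up): ``` python aggregate_dask.py --count 100 --nfiles 24 --input s3://my-bucket/events --output s3://my-bucket/aggs_dask ```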
94 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/daskvsspark/daskvsspark/__init__.py -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # aggregate_dask.py 3 | import argparse 4 | import datetime as dt 5 | import os 6 | import shutil 7 | from collections import Counter 8 | 9 | import dask 10 | import dask.dataframe as dd 11 | import s3fs 12 | import simplejson as json 13 | import pandas as pd 14 | from dask.distributed import Client, LocalCluster 15 | 16 | from daskvsspark.common import * 17 | 18 | INPUT_ROOT = './events' 19 | OUTPUT_ROOT = './aggs_dask' 20 | 21 | INPUT_TEMPLATE = '{root}/{event_count}-{nfiles}/*/*/*/*/*/*.parquet' 22 | OUTPUT_TEMPLATE = '{root}/{event_count}-{nfiles}/*.json' 23 | 24 | 25 | def read_data(read_path): 26 | """Reads the original Parquet data. 27 | :returns: DataFrame 28 | """ 29 | df = dd.read_parquet(read_path).drop('hour', axis=1) 30 | return df 31 | 32 | 33 | def counter_chunk(ser): 34 | """Return counter of values in series.""" 35 | return list(Counter(ser.values).items()) 36 | 37 | 38 | def counter_agg(chunks): 39 | """Add all counters together and return dict items.""" 40 | total = Counter() 41 | for chunk in chunks: 42 | current = Counter(dict(chunk)) 43 | total += current 44 | return list(total.items()) 45 | 46 | 47 | def nunique_chunk(ser): 48 | """Get all unique values in series.""" 49 | return ser.unique() 50 | 51 | 52 | def nunique_agg(chunks): 53 | """Return number of unique values in all chunks.""" 54 | total = pd.Series() 55 | for chunk in chunks: 56 | current = pd.Series(chunk) 57 | total = total.append(current) 58 | total = total.drop_duplicates() 59 | res = total.nunique() 60 | return res 61 | 62 | 63 | def group_data(df): 64 | """Aggregate the DataFrame and return the grouped DataFrame. 65 | 66 | :param df: DataFrame 67 | :returns: DataFrame 68 | """ 69 | # round timestamps down to an hour 70 | df['ts'] = df['ts'].dt.floor('1H') 71 | 72 | # group on customer, timestamp (rounded) and url 73 | gb = df.groupby(['customer', 'url', 'ts']) 74 | 75 | counter = dd.Aggregation( 76 | 'counter', 77 | lambda s: s.apply(counter_chunk), 78 | lambda s: s.apply(counter_agg), 79 | ) 80 | 81 | count_unique = dd.Aggregation( 82 | 'count_unique', 83 | lambda s: s.apply(nunique_chunk), 84 | lambda s: s.apply(nunique_agg) 85 | ) 86 | 87 | ag = gb.agg({ 88 | 'session_id': [count_unique, 'count'], 89 | 'referrer': counter} 90 | ) 91 | 92 | ag = ag.reset_index() 93 | 94 | # get rid of multilevel columns 95 | ag.columns = ['customer', 'url', 'ts', 'visitors', 'page_views', 'referrers'] 96 | ag = ag.repartition(npartitions=df.npartitions) 97 | 98 | return ag 99 | 100 | 101 | def transform_one(ser): 102 | """Takes a Series object representing a grouped DataFrame row, 103 | and returns a dict ready to be stored as JSON. 
104 | 105 | :returns: pd.Series 106 | """ 107 | data = ser.to_dict() 108 | if not data: 109 | return pd.Series([], name='data') 110 | page_views = data.pop('page_views') 111 | visitors = data.pop('visitors') 112 | data.update({ 113 | '_id': format_id(data['customer'], data['url'], data['ts']), 114 | 'ts': data['ts'].strftime('%Y-%m-%dT%H:%M:%S'), 115 | 'metrics': format_metrics(page_views, visitors), 116 | 'referrers': dict(data['referrers']) 117 | }) 118 | return pd.Series([data], name='data') 119 | 120 | 121 | def transform_data(ag): 122 | """Accepts a Dask DataFrame and returns a DataFrame with a single 123 | "data" column, where each value is a dict representation of the 124 | document to be written (serialized to JSON in save_json). 125 | 126 | :param ag: DataFrame 127 | :returns: DataFrame with one column "data" containing a dict. 128 | """ 129 | tr = ag.apply(transform_one, axis=1, meta={'data': str}) 130 | tr = tr.repartition(npartitions=ag.npartitions) 131 | return tr 132 | 133 | 134 | def delete_path(path): 135 | """Recursively delete a path and everything under it.""" 136 | if path.startswith('s3://'): 137 | s3 = s3fs.S3FileSystem() 138 | if s3.exists(path): 139 | s3.rm(path) 140 | elif os.path.exists(path): 141 | shutil.rmtree(path) 142 | 143 | 144 | def create_path(path): 145 | """Create root dir.""" 146 | if not path.startswith('s3://') and not os.path.exists(path): 147 | os.makedirs(path) 148 | 149 | 150 | def save_json(tr, path): 151 | """Write records as json.""" 152 | root_dir = os.path.dirname(path) 153 | 154 | # cleanup before writing 155 | delete_path(root_dir) 156 | create_path(root_dir) 157 | 158 | (tr.to_bag() 159 | .map(lambda t: t[0]) 160 | .map(json.dumps) 161 | .to_textfiles(path)) 162 | 163 | 164 | if __name__ == '__main__': 165 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 166 | parser.add_argument('--count', type=int, default=100) 167 | parser.add_argument('--nfiles', type=int, default=24) 168 | parser.add_argument('--wait', action='store_true', default=False) 169 | parser.add_argument('--scheduler', choices=['thread', 'process', 'default', 'single'], 170 | default='default') 171 | parser.add_argument('--verbose', action='store_true', default=False) 172 | parser.add_argument('--address', help='Scheduler address') 173 | parser.add_argument('--input', default=INPUT_ROOT) 174 | parser.add_argument('--output', default=OUTPUT_ROOT) 175 | myargs = parser.parse_args() 176 | 177 | read_path = INPUT_TEMPLATE.format(root=myargs.input, event_count=myargs.count, 178 | nfiles=myargs.nfiles) 179 | write_path = OUTPUT_TEMPLATE.format(root=myargs.output, event_count=myargs.count, 180 | nfiles=myargs.nfiles) 181 | 182 | set_display_options() 183 | started = dt.datetime.utcnow() 184 | if myargs.scheduler != 'default': 185 | print('Scheduler: {}.'.format(myargs.scheduler)) 186 | getters = {'process': dask.multiprocessing.get, 187 | 'thread': dask.threaded.get, 188 | 'single': dask.get} 189 | dask.set_options(get=getters[myargs.scheduler]) 190 | 191 | try: 192 | if myargs.address: 193 | # explicit address is a workaround for "Worker failed to start": 194 | # scheduler and worker have to be started in console.
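# (e.g. as start_dask.sh in this repo does: run `dask-scheduler` and `dask-worker localhost:8786 --nprocs 4` in separate terminals, then pass --address tcp://localhost:8786 to this script)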
195 | # see https://github.com/dask/distributed/issues/1825 196 | cluster = myargs.address 197 | else: 198 | cluster = LocalCluster() 199 | 200 | if myargs.verbose: 201 | client = Client(address=cluster, silence_logs=False) 202 | else: 203 | client = Client(address=cluster) 204 | 205 | df = read_data(read_path) 206 | aggregated = group_data(df) 207 | prepared = transform_data(aggregated) 208 | save_json(prepared, write_path) 209 | elapsed = dt.datetime.utcnow() - started 210 | parts_per_hour = int(myargs.nfiles / 24) 211 | print('{:,} records, {} files ({} per hour): done in {}.'.format( 212 | myargs.count, myargs.nfiles, parts_per_hour, elapsed)) 213 | if myargs.wait: 214 | input('Press any key') 215 | except: 216 | elapsed = dt.datetime.utcnow() - started 217 | print('Failed in {}.'.format(elapsed)) 218 | raise 219 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z $1 ] 4 | then 5 | ADDRESS_ARG="--address $1" 6 | fi 7 | 8 | if [ ! -z $2 ] 9 | then 10 | COUNT=$2 11 | else 12 | COUNT=100 13 | fi 14 | 15 | if [ ! -z $3 ] 16 | then 17 | NFILES=$3 18 | else 19 | NFILES=24 20 | fi 21 | 22 | if [ ! -z $4 ] 23 | then 24 | SCHEDULER=$4 25 | else 26 | SCHEDULER="default" 27 | fi 28 | 29 | python aggregate_dask.py --count $COUNT --nfiles $NFILES --scheduler $SCHEDULER $ADDRESS_ARG 30 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask_kube.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | if [ ! -z $1 ] 7 | then 8 | ADDRESS=$1 9 | else 10 | echo "Usage: $0 <scheduler-address> [count] [nfiles] [scheduler]" 11 | exit 1 12 | fi 13 | 14 | if [ ! -z $2 ] 15 | then 16 | COUNT=$2 17 | else 18 | COUNT=100 19 | fi 20 | 21 | if [ ! -z $3 ] 22 | then 23 | NFILES=$3 24 | else 25 | NFILES=24 26 | fi 27 | 28 | 29 | if [ ! -z $4 ] 30 | then 31 | SCHEDULER=$4 32 | else 33 | SCHEDULER="default" 34 | fi 35 | 36 | cd /assets/code/daskvsspark 37 | 38 | python aggregate_dask.py \ 39 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 40 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_dask" \ 41 | --address $ADDRESS \ 42 | --count $COUNT \ 43 | --nfiles $NFILES \ 44 | --scheduler $SCHEDULER \ 45 | --verbose 46 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_dask_yarn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | if [ ! -z $1 ] 7 | then 8 | ADDRESS=$1 9 | else 10 | echo "Usage: $0 <scheduler-address> [count] [nfiles] [scheduler]" 11 | exit 1 12 | fi 13 | 14 | if [ ! -z $2 ] 15 | then 16 | COUNT=$2 17 | else 18 | COUNT=100 19 | fi 20 | 21 | if [ ! -z $3 ] 22 | then 23 | NFILES=$3 24 | else 25 | NFILES=24 26 | fi 27 | 28 | 29 | if [ !
-z $4 ] 30 | then 31 | SCHEDULER=$4 32 | else 33 | SCHEDULER="default" 34 | fi 35 | 36 | cd /home/hadoop/daskvsspark/daskvsspark 37 | 38 | latest_egg=$(ls -t /home/hadoop/reqs/daskvsspark-*.egg | head -n 1) 39 | 40 | PYTHONPATH=$latest_egg /home/hadoop/conda/envs/dvss/bin/python aggregate_dask.py \ 41 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 42 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_dask" \ 43 | --address $ADDRESS \ 44 | --count $COUNT \ 45 | --nfiles $NFILES \ 46 | --scheduler $SCHEDULER \ 47 | --verbose 48 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # aggregate_spark.py 3 | import argparse 4 | import os 5 | import datetime as dt 6 | 7 | from pyspark.sql.types import StringType, IntegerType, MapType 8 | from pyspark.sql.column import Column, _to_java_column, _to_seq 9 | 10 | from daskvsspark.context import initialize, INPUT_ROOT, OUTPUT_ROOT, PATH_TEMPLATE 11 | from daskvsspark.common import * 12 | 13 | if os.environ.get('TZ', '') != 'UTC': 14 | raise Exception('Please set TZ=UTC to run this.') 15 | 16 | 17 | def load_sql_user_functions(sc, sqlContext): 18 | """Load our custom UDAFs into a sql context.""" 19 | sqlContext.udf.register('format_id', 20 | format_id, 21 | StringType()) 22 | sqlContext.udf.register('format_metrics', 23 | format_metrics, 24 | MapType(StringType(), IntegerType())) 25 | 26 | # custom aggregation function. Needs a jar provided in runner script. 27 | agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter() 28 | sqlContext.sparkSession._jsparkSession.udf().register('count_values', agg_counter) 29 | 30 | 31 | def count_values(col): 32 | """Register UDAF for use in aggregations outside of Spark SQL.""" 33 | counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter().apply 34 | return Column(counter(_to_seq(sc, [col], _to_java_column))) 35 | 36 | 37 | def aggregate(df): 38 | """Group data by customer, url, and 1 hour bucket.""" 39 | df.createOrReplaceTempView("df") 40 | agg = sqlContext.sql(""" 41 | select 42 | customer, 43 | url, 44 | window(ts, '1 hour').start as ts, 45 | count(*) as page_views, 46 | count(distinct(session_id)) as visitors, 47 | count_values(referrer) as referrers 48 | from df 49 | group by 50 | customer, 51 | url, 52 | window(ts, '1 hour').start 53 | """) 54 | return agg 55 | 56 | 57 | def transform(df): 58 | """Format as needed.""" 59 | df.createOrReplaceTempView("df") 60 | agg = sqlContext.sql(""" 61 | select 62 | format_id(customer, url, ts) as _id, 63 | customer, 64 | url, 65 | ts, 66 | format_metrics(page_views, visitors) as metrics, 67 | referrers 68 | from df 69 | """) 70 | return agg 71 | 72 | 73 | def save_json(df, path): 74 | """Write aggregate rows as json.""" 75 | df.write.mode('overwrite').json(path) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | parser.add_argument("--count", type=int, default=100) 81 | parser.add_argument("--nfiles", type=int, default=24) 82 | parser.add_argument("--wait", action='store_true', default=False) 83 | parser.add_argument('--input', default=INPUT_ROOT) 84 | parser.add_argument('--output', default=OUTPUT_ROOT) 85 | myargs = parser.parse_args() 86 | 87 | read_path = PATH_TEMPLATE.format(root=myargs.input, event_count=myargs.count, 88 | nfiles=myargs.nfiles) 89 | 
write_path = PATH_TEMPLATE.format(root=myargs.output, event_count=myargs.count, 90 | nfiles=myargs.nfiles) 91 | target_partitions = myargs.nfiles 92 | 93 | started = dt.datetime.utcnow() 94 | 95 | sc, sqlContext = initialize(target_partitions=target_partitions) 96 | load_sql_user_functions(sc, sqlContext) 97 | 98 | df = sqlContext.read.parquet(read_path) 99 | agg = aggregate(df) 100 | agg = transform(agg) 101 | save_json(agg, write_path) 102 | elapsed = dt.datetime.utcnow() - started 103 | 104 | parts_per_hour = int(myargs.nfiles / 24) 105 | print('{:,} records, {} files ({} per hour): done in {}.'.format( 106 | myargs.count, myargs.nfiles, parts_per_hour, elapsed)) 107 | if myargs.wait: 108 | input('Press any key') 109 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z $1 ] 4 | then 5 | COUNT=$1 6 | else 7 | COUNT=100 8 | fi 9 | 10 | if [ ! -z $2 ] 11 | then 12 | NFILES=$2 13 | else 14 | NFILES=24 15 | fi 16 | 17 | TZ=UTC PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 18 | $SPARK_HOME/bin/spark-submit \ 19 | --master "local[4]" \ 20 | --deploy-mode client \ 21 | --driver-memory 6g \ 22 | --executor-memory 2g \ 23 | --num-executors 4 \ 24 | --conf "spark.yarn.executor.memoryOverhead=2g" \ 25 | --driver-class-path ../scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar \ 26 | --driver-java-options "-Droot.logger=ERROR,console" \ 27 | aggregate_spark.py --count $COUNT --nfiles $NFILES 28 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/aggregate_spark_yarn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error 4 | set -e 5 | 6 | build_egg() { 7 | cd /home/hadoop/daskvsspark/ 8 | python3 setup.py bdist_egg 9 | cp ./dist/*.egg /home/hadoop/reqs/ 10 | } 11 | 12 | if [ ! -z $1 ] 13 | then 14 | COUNT=$1 15 | else 16 | COUNT=100 17 | fi 18 | 19 | if [ ! 
-z $2 ] 20 | then 21 | NFILES=$2 22 | else 23 | NFILES=24 24 | fi 25 | 26 | build_egg &> /dev/null 27 | 28 | latest_egg=$(ls -t /home/hadoop/reqs/daskvsspark-*.egg | head -n 1) 29 | 30 | cd /home/hadoop/daskvsspark/daskvsspark/ 31 | 32 | TZ=UTC PYSPARK_DRIVER_PYTHON=python3 PYSPARK_PYTHON=python3 \ 33 | spark-submit \ 34 | --master yarn \ 35 | --deploy-mode client \ 36 | --driver-memory 8g \ 37 | --executor-memory 3g \ 38 | --num-executors 4 \ 39 | --executor-cores 4 \ 40 | --conf "spark.yarn.executor.memoryOverhead=2g" \ 41 | --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:///home/hadoop/reqs/log4j.properties" \ 42 | --py-files ${latest_egg} \ 43 | --jars /home/hadoop/reqs/daskvsspark-udafs_2.11-0.0.1.jar \ 44 | aggregate_spark.py \ 45 | --input "s3://parsely-public/jbennet/daskvsspark/events" \ 46 | --output "s3://parsely-public/jbennet/daskvsspark/aggs_spark" \ 47 | --count $COUNT \ 48 | --nfiles $NFILES 49 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | 3 | 4 | def set_display_options(): 5 | import pandas as pd 6 | pd.set_option('display.max_colwidth', 1000) 7 | pd.set_option('display.expand_frame_repr', False) 8 | 9 | 10 | def format_id(customer, url, ts): 11 | """Create a unique id for the aggregated record.""" 12 | return "{}|{}|{:%Y-%m-%dT%H:%M:%S}".format(url, customer, ts) 13 | 14 | 15 | def format_metrics(page_views, visitors): 16 | """Create a dict of metrics.""" 17 | return { 18 | "page_views": page_views, 19 | "visitors": visitors 20 | } 21 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | from pyspark.context import SparkContext, SparkConf 3 | from pyspark.sql import SQLContext 4 | 5 | # template path. Event_count will be replaced by a number. 
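# (e.g. './events/100-24' for 100 events in 24 files; see the README)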
6 | PATH_TEMPLATE = '{root}/{event_count}-{nfiles}' 7 | INPUT_ROOT = './events' 8 | OUTPUT_ROOT = "./aggs_spark" 9 | 10 | 11 | def initialize(target_partitions=None): 12 | """Returns SparkContext and SQLContext.""" 13 | conf = SparkConf() 14 | extra_settings = { 15 | 'spark.serializer': 'org.apache.spark.serializer.KryoSerializer', 16 | 'spark.executor.extraJavaOptions': '-XX:+UseG1GC' 17 | } 18 | if target_partitions: 19 | extra_settings['spark.default.parallelism'] = target_partitions 20 | 21 | conf.setAll(extra_settings.items()) 22 | environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'} 23 | sc = SparkContext(conf=conf, environment=environment) 24 | 25 | sqlContext = SQLContext(sc) 26 | if target_partitions: 27 | sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions) 28 | 29 | jvm_logger = sc._jvm.org.apache.log4j 30 | jvm_logger.LogManager.getLogger("org").setLevel(jvm_logger.Level.ERROR) 31 | jvm_logger.LogManager.getLogger("akka").setLevel(jvm_logger.Level.ERROR) 32 | return sc, sqlContext 33 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/notes.txt: -------------------------------------------------------------------------------- 1 | ------------- 2 | Size of data: 3 | ------------- 4 | 5 | 100-24: 196K 6 | 10,000-24: 196K 7 | 1,000,000-24: 1.8M 8 | 10,000,000-24: 16M 9 | 10,000,000-96: 17M 10 | 100,000,000-24: 171M 11 | 100,000,000-48: 189M 12 | 100,000,000-96: 198M 13 | 100,000,000-192: 169M 14 | 100,000,000-384: 165M 15 | 100,000,000-984: 169M 16 | 1,000,000,000-500: 3.3G 17 | 18 | 19 | ---------------------------- 20 | Partitioning on 100,000,000: 21 | ---------------------------- 22 | 23 | 100k, 984 files, 41 per hr 24 | 250k, 384 files, 16 per hr 25 | 500k, 192 files, 8 per hr 26 | 1m, 96 files, 4 per hr 27 | 2m, 48 files, 2 per hr * 28 | 4m, 24 files, 1 per hr 29 | 30 | *: best for Spark and Dask 31 | 32 | --------------------------------- 33 | Spark with python3 and custom agg 34 | --------------------------------- 35 | 36 | (talks3) --- daskvsspark/daskvsspark ‹master*M› » ./tmp_run_all_spark.sh 126 ↵ 37 | 10 records, 24 files (1 per hour): done in 0:00:10.601392. 38 | 100 records, 24 files (1 per hour): done in 0:00:11.315226. 39 | 10,000 records, 24 files (1 per hour): done in 0:00:11.744349. 40 | 1,000,000 records, 24 files (1 per hour): done in 0:00:15.394712. 41 | 10,000,000 records, 24 files (1 per hour): done in 0:00:29.044079. 42 | 10,000,000 records, 96 files (4 per hour): done in 0:00:34.295349. 43 | 44 | 100,000,000 records, 984 files (41 per hour): done in 0:03:41.323534. 45 | 100,000,000 records, 384 files (16 per hour): done in 0:03:14.743094. 46 | 100,000,000 records, 192 files (8 per hour): done in 0:02:52.175157. 47 | 100,000,000 records, 96 files (4 per hour): done in 0:03:09.673154. 48 | 100,000,000 records, 48 files (2 per hour): done in 0:02:50.821578. * 49 | 100,000,000 records, 24 files (1 per hour): done in 0:02:57.805231. 50 | 51 | 1,000,000,000 records, 500 files (20 per hour): done in 0:45:08.288687. 52 | 1,000,000,000 records, 500 files (20 per hour): done in 0:41:14.634671. 53 | 1,000,000,000 records, 240 files (10 per hour): done in 0:51:39.638557. 54 | 55 | --------------------------------------- 56 | Dask with default scheduler and python3 57 | --------------------------------------- 58 | 59 | (talks3) --- daskvsspark/daskvsspark ‹master*AM› » ./tmp_run_all_dask.sh 1 ↵ 60 | 10 records, 24 files (1 per hour): done in 0:00:04.280264. 
61 | 100 records, 24 files (1 per hour): done in 0:00:01.490881. 62 | 10,000 records, 24 files (1 per hour): done in 0:00:02.811427. 63 | 1,000,000 records, 24 files (1 per hour): done in 0:00:03.013248. 64 | 10,000,000 records, 24 files (1 per hour): done in 0:00:06.194535. 65 | 10,000,000 records, 96 files (4 per hour): done in 0:00:08.708831. 66 | 67 | 100,000,000 records, 984 files (41 per hour): done in 0:01:10.351981. 68 | 100,000,000 records, 384 files (16 per hour): done in 0:00:49.119739. 69 | 100,000,000 records, 192 files (8 per hour): done in 0:00:41.575053. 70 | 100,000,000 records, 96 files (4 per hour): done in 0:00:38.806466. 71 | 100,000,000 records, 48 files (2 per hour): done in 0:00:37.713205. * 72 | 100,000,000 records, 24 files (1 per hour): done in 0:01:03.122334. 73 | 74 | 1,000,000,000 records, 500 files (20 per hour): done in 0:16:11.660423. 75 | 1,000,000,000 records, 240 files (10 per hour): done in 0:16:34.453926. 76 | 77 | --------------- 78 | Running on YARN 79 | --------------- 80 | 81 | Master: 82 | ------- 83 | m4.xlarge 84 | 8 vCore, 16 GiB memory, EBS only storage 85 | EBS Storage:32 GiB 86 | 87 | Core: 2 88 | ------- 89 | c4.2xlarge 90 | 8 vCore, 15 GiB memory, EBS only storage 91 | EBS Storage:64 GiB 92 | 93 | Settings used: 94 | 95 | --driver-memory 8g 96 | --executor-memory 3g 97 | --num-executors 4 98 | --executor-cores 4 99 | --conf "spark.yarn.executor.memoryOverhead=2g" 100 | 101 | In Dask, this would correspond to: 102 | 103 | Master: 104 | $ dask-scheduler 105 | $ PYTHONPATH=/home/hadoop/reqs/daskvsspark-0.1-py3.6.egg dask-worker --nthreads 4 --memory-limit 5G tcp://10.21.0.76:8786 106 | 107 | Core (2): 108 | $ PYTHONPATH=/home/hadoop/reqs/daskvsspark-0.1-py3.6.egg dask-worker --nprocs 2 --nthreads 4 --memory-limit 5G tcp://10.21.0.76:8786 109 | 110 | 111 | To run `aggregate_dask.sh`: 112 | 113 | $ export PATH="/home/hadoop/conda/bin:$PATH" 114 | $ source activate dvss 115 | 116 | 117 | ------------- 118 | YARN REST API 119 | ------------- 120 | 121 | curl -s $HOSTNAME:8088/ws/v1/cluster | jq 122 | curl -s $HOSTNAME:8088/ws/v1/cluster/metrics | jq 123 | curl -s $HOSTNAME:8088/ws/v1/cluster/scheduler | jq 124 | 125 | "totalMB": 23040, 126 | "totalVirtualCores": 16, 127 | "totalNodes": 2, 128 | 129 | ----- 130 | Spark 131 | ----- 132 | 133 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 134 | 1,000,000,000 records, 500 files (20 per hour): done in 0:12:15.945298. 135 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 136 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:59.845888. 137 | [hadoop@ip-10-21-0-173 daskvsspark]$ ./aggregate_spark_yarn.sh 1000000000 500 138 | 1,000,000,000 records, 500 files (20 per hour): done in 0:12:14.694722. 139 | 140 | ---------- 141 | Dask Yarn: 142 | ---------- 143 | 144 | n_workers=4, memory=5*1024,4 cpus=3: only uses 12 cpus, but all the mem 145 | [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:36955 1000000000 500 146 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:20.381808. 147 | [hadoop@ip-10-21-0-63 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.63:46101 1000000000 500 148 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:15.664728. 149 | [hadoop@ip-10-21-0-63 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.63:46101 1000000000 500 150 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:12.667145. 
151 | 152 | (5 workers and 3 workers had worse results) 153 | 154 | ------------- 155 | Dask console: 156 | ------------- 157 | 158 | 5 workers (2 x core + 1 x master) with 4 cores and 5G memory each: 159 | 160 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 161 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:44.437957. 162 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 163 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:45.010486. 164 | (dvss) [hadoop@ip-10-21-0-229 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.229:8786 1000000000 500 165 | 1,000,000,000 records, 500 files (20 per hour): done in 0:09:40.223227. 166 | 167 | 4 workers (2 x core) with 4 cores and 5G memory each: 168 | 169 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 170 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:29.925453. 171 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 172 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:37.121077. 173 | (dvss) [hadoop@ip-10-21-0-76 daskvsspark]$ ./aggregate_dask_yarn.sh tcp://10.21.0.76:8786 1000000000 500 174 | 1,000,000,000 records, 500 files (20 per hour): done in 0:11:43.173057. 175 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # prepare.py 3 | import argparse 4 | import datetime as dt 5 | import math 6 | import itertools 7 | import os 8 | import random 9 | import sys 10 | 11 | import pytz 12 | 13 | from daskvsspark.context import initialize, INPUT_ROOT, PATH_TEMPLATE 14 | from daskvsspark.schema import MY_SCHEMA, PARTITION_FIELDS 15 | 16 | 17 | DATE = dt.datetime(2017, 9, 17) 18 | 19 | 20 | def generate_row(total_articles, session_ids): 21 | """Create a page view event.""" 22 | # tuple fields: 23 | # customer, url, referrer, session_id, ts, year, month, day, hour 24 | minute = random.randint(0, 59) 25 | hour = random.randint(0, 23) 26 | article_number = random.randint(1, total_articles) 27 | referrer = random.choice(['http://google.com/', 'http://bing.com/', 'http://facebook.com/']) 28 | session_id = random.choice(session_ids) 29 | return ( 30 | 'a.com', 31 | 'http://a.com/articles/{}'.format(article_number), 32 | referrer, 33 | session_id, 34 | DATE.replace(hour=hour, minute=minute, tzinfo=pytz.UTC), 35 | '{:04}'.format(DATE.year), 36 | '{:02}'.format(DATE.month), 37 | '{:02}'.format(DATE.day), 38 | '{:02}'.format(hour) 39 | ) 40 | 41 | 42 | def nfiles(records, records_per_file): 43 | """How many files per hour, and total files to generate.""" 44 | parts_per_hour = max(1, int(records / records_per_file / 24)) 45 | total_files = parts_per_hour * 24 46 | return parts_per_hour, total_files 47 | 48 | 49 | def generate_rows(sc, records, records_per_file): 50 | """Generate data.""" 51 | random.seed(records) 52 | parts_per_hour, total_files = nfiles(records, records_per_file) 53 | part_size = int(records / parts_per_hour) 54 | actual_records_per_file = int(records / total_files) 55 | print('Generating {} file(s) ({:,} per hour) with {:,} ({:,} actual) records each...'.format( 56 | total_files, 57 | parts_per_hour, 58 | records_per_file, 59 | actual_records_per_file)) 60 | 61 | total_articles =
math.ceil(math.pow(records, 1.0/3)) 62 | session_ids = [''.join(t) for t in list(itertools.permutations(list('abcdefg'), 3))] 63 | data = (sc.parallelize([], parts_per_hour) 64 | .mapPartitions(lambda rs: (generate_row(total_articles, session_ids) 65 | for _ in range(part_size)))) 66 | return data 67 | 68 | 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument("--count", type=int, default=100) 72 | parser.add_argument("--chunk-size", type=int, default=100000) 73 | myargs = parser.parse_args() 74 | 75 | parts_per_hour, total_files = nfiles(myargs.count, myargs.chunk_size) 76 | write_path = PATH_TEMPLATE.format(root=INPUT_ROOT, event_count=myargs.count, nfiles=total_files) 77 | 78 | # cleanup before writing 79 | if os.path.exists(write_path): 80 | print('Path exists: {}. Exiting.'.format(write_path)) 81 | sys.exit(0) 82 | 83 | sc, sqlContext = initialize() 84 | 85 | # mock some data 86 | started = dt.datetime.now() 87 | print('Generating data...') 88 | data = generate_rows(sc, myargs.count, myargs.chunk_size) 89 | df = sqlContext.createDataFrame(data, MY_SCHEMA) 90 | 91 | print('Generated {:,} records with {:,} files per hour in {}.'.format( 92 | myargs.count, parts_per_hour, dt.datetime.now() - started)) 93 | 94 | # write parquet 95 | started = dt.datetime.now() 96 | print('Writing {:,} records...'.format(myargs.count)) 97 | (df.write 98 | .parquet(write_path, partitionBy=PARTITION_FIELDS, compression='gzip')) 99 | print('Wrote {:,} records in {}.'.format(myargs.count, dt.datetime.now() - started)) 100 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ ! -z $1 ] 3 | then 4 | COUNT=$1 5 | else 6 | COUNT=100 7 | fi 8 | 9 | if [ ! 
-z $2 ] 10 | then 11 | CHUNKSIZE=$2 12 | else 13 | CHUNKSIZE=100000 14 | fi 15 | 16 | TZ=UTC PYSPARK_DRIVER_PYTHON=`which python` PYSPARK_PYTHON=`which python` \ 17 | $SPARK_HOME/bin/spark-submit \ 18 | --master "local[4]" \ 19 | --deploy-mode client \ 20 | --driver-memory 8g \ 21 | --executor-memory 2g \ 22 | --num-executors 4 \ 23 | --driver-java-options "-Droot.logger=ERROR,console" \ 24 | prepare.py --count $COUNT --chunk-size $CHUNKSIZE 25 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # schema.py 3 | from pyspark.sql.types import * 4 | 5 | 6 | PARTITION_FIELDS = ['year', 'month', 'day', 'hour', 'customer'] 7 | 8 | MY_SCHEMA = StructType([ 9 | StructField('customer', StringType(), True), 10 | StructField('url', StringType(), True), 11 | StructField('referrer', StringType(), True), 12 | StructField('session_id', StringType(), True), 13 | StructField('ts', TimestampType(), True), 14 | # partitioning keys 15 | StructField('year', StringType(), nullable=False), 16 | StructField('month', StringType(), nullable=False), 17 | StructField('day', StringType(), nullable=False), 18 | StructField('hour', StringType(), nullable=False), 19 | ]) 20 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/show.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 2 | # show.py 3 | 4 | import glob 5 | import os 6 | import sys 7 | import simplejson as json 8 | from pprint import pprint 9 | 10 | 11 | if __name__ == '__main__': 12 | if len(sys.argv) != 3: 13 | print('Usage: {} <json-path> <how-many>'.format(sys.argv[0])) 14 | sys.exit(0) 15 | 16 | json_path = sys.argv[1] 17 | requested = int(sys.argv[2]) 18 | json_files = glob.glob1(json_path, '*.json') 19 | 20 | collected, total = 0, 0 21 | for file_name in json_files: 22 | full_name = os.path.join(json_path, file_name) 23 | with open(full_name, 'r') as f: 24 | for line in f: 25 | if collected < requested: 26 | data = json.loads(line) 27 | pprint(data) 28 | print("") 29 | collected += 1 30 | total += 1 31 | 32 | print('-' * 20) 33 | print('Total files: {}'.format(len(json_files))) 34 | print('Total lines: {}'.format(total)) 35 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/start_dask.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | tmux new-session -d -s scheduler "dask-scheduler" 4 | tmux split-window "dask-worker localhost:8786 --nprocs 4" 5 | tmux attach 6 | 7 | -------------------------------------------------------------------------------- /2018/daskvsspark/daskvsspark/start_dask_yarn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dask_yarn 3 | from time import sleep 4 | 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 8 | parser.add_argument('--verbose', dest='verbose', action='store_true', help='Print logs on exit', 9 | default=False) 10 | parser.add_argument('nworkers', help='Number of workers', type=int, default=4) 11 | parser.add_argument('ncores', help='Number of worker cores (threads)', type=int, default=3) 12 | parser.add_argument('memory', help='Worker memory (MiB)', type=int, default=5*1024)
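# note: argparse applies defaults to positional arguments only with nargs='?', so as written nworkers, ncores and memory must always be passed explicitly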
13 | myargs = parser.parse_args() 14 | 15 | cluster = dask_yarn.DaskYARNCluster(env='/home/hadoop/reqs/dvss.zip', lang='en_US.UTF-8') 16 | cluster.start(n_workers=myargs.nworkers, memory=myargs.memory, cpus=myargs.ncores) 17 | try: 18 | while True: 19 | print('-' * 20) 20 | print('Cluster scheduler: {}.'.format(cluster.scheduler_address)) 21 | bk = cluster.local_cluster.scheduler.services['bokeh'].server 22 | print('Bokeh: http://{}:{}'.format(bk.address, bk.port)) 23 | sleep(20) 24 | except KeyboardInterrupt: 25 | print('Interrupted, exiting.') 26 | 27 | print('-' * 20) 28 | if myargs.verbose: 29 | cluster.knit.print_logs() 30 | print('Cluster is done.') 31 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Are we running on a master node? 4 | cat /var/lib/info/instance.json | grep '"isMaster": true' 5 | IS_MASTER=$? 6 | 7 | unset PYTHON_INSTALL_LAYOUT 8 | export LC_ALL=C.UTF-8 9 | export LANG=C.UTF-8 10 | 11 | # Stop at any error, show all commands 12 | set -ex 13 | 14 | 15 | install_python_36() { 16 | # Ensure Python 3.6 is installed 17 | if [[ ! -x /usr/local/bin/python3.6 ]]; then 18 | echo "Python 3.6 not installed, installing" 19 | # Compilers and related tools: 20 | sudo yum groupinstall -y "development tools" 21 | # Libraries needed during compilation to enable all features of Python: 22 | sudo yum install -y zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel expat-devel 23 | # Download and install Python 3.6.1 24 | wget https://s3.amazonaws.com/parsely-public/chef-pkgs/python_3.6.1_x86_64.rpm 25 | sudo rpm -ivh python_3.6.1_x86_64.rpm 26 | # Make sure we have pip 27 | sudo /usr/local/bin/python3 -m ensurepip --upgrade 28 | fi 29 | 30 | echo "Note: Python 3.6 will be available as python3, not python" 31 | echo "Be sure to set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON in Configurations" 32 | } 33 | 34 | update_packages() { 35 | # Do this again just in case 36 | unset PYTHON_INSTALL_LAYOUT 37 | 38 | PIP="sudo /usr/local/bin/pip3" 39 | 40 | cd /home/hadoop/ 41 | 42 | # this includes a jar also 43 | aws s3 cp --recursive s3://parsely-public/jbennet/daskvsspark/reqs/ ./reqs 44 | chmod +x ./reqs/*.sh 45 | 46 | # needed to install python-snappy 47 | sudo yum install -y snappy-devel 48 | 49 | $PIP install -U pip 50 | $PIP install -r ./reqs/requirements.txt 51 | } 52 | 53 | install_conda() { 54 | if [[ ! -d /home/hadoop/conda ]]; then 55 | echo "Downloading conda" 56 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 57 | chmod +x ~/miniconda.sh 58 | 59 | echo "Installing conda" 60 | ~/miniconda.sh -b -p ~/conda 61 | 62 | export PATH="/home/hadoop/conda/bin:$PATH" 63 | echo 'export PATH="/home/hadoop/conda/bin:$PATH"' >> ~/.bashrc 64 | 65 | echo "Updating conda" 66 | conda update --yes conda 67 | conda info -a 68 | fi 69 | } 70 | 71 | create_conda_env() { 72 | if [[ ! 
-d /home/hadoop/conda/envs/dvss ]]; then 73 | echo "Creating venv dvss" 74 | conda create -n dvss --copy -y -q python=3 75 | echo "Installing requirements into venv" 76 | conda install -n dvss --copy -y -c conda-forge --file ~/reqs/requirements.txt --file ~/reqs/requirements-dask.txt 77 | fi 78 | } 79 | 80 | 81 | install_python_36 82 | update_packages 83 | install_conda 84 | create_conda_env 85 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/conf.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "hdfs-site", 4 | "Properties": { 5 | "dfs.block.size": "134217728", 6 | "dfs.replication": "2" 7 | } 8 | }, 9 | { 10 | "Classification": "mapred-site", 11 | "Properties": { 12 | "mapreduce.job.reduces": "400", 13 | "mapreduce.reduce.memory.mb": "1408", 14 | "mapreduce.reduce.java.opts": "-Xmx1126m" 15 | } 16 | }, 17 | { 18 | "Classification": "spark-defaults", 19 | "Properties": { 20 | "spark.serializer": "org.apache.spark.serializer.KryoSerializer", 21 | "spark.dynamicAllocation.executorIdleTimeout": "30s", 22 | "spark.executor.heartbeatInterval" : "5s", 23 | "spark.default.parallelism": "500", 24 | "spark.sql.shuffle.partitions": "500", 25 | "spark.yarn.executor.memoryOverhead": "3g", 26 | "spark.executor.memory": "4g", 27 | "spark.executor.cores": "4", 28 | "spark.driver.memory": "15g" 29 | } 30 | }, 31 | { 32 | "Classification": "yarn-site", 33 | "Properties": { 34 | "yarn.nodemanager.vmem-check-enabled": "false" 35 | } 36 | }, 37 | { 38 | "Classification": "zeppelin-env", 39 | "Properties": {}, 40 | "Configurations": [ 41 | { 42 | "Classification": "export", 43 | "Properties": { 44 | "ZEPPELIN_NOTEBOOK_STORAGE" : "org.apache.zeppelin.notebook.repo.S3NotebookRepo", 45 | "ZEPPELIN_NOTEBOOK_S3_BUCKET": "parsely-zeppelin-notebooks", 46 | "ZEPPELIN_NOTEBOOK_S3_USER": "irina", 47 | "ZEPPELIN_MEM": "\"-Xms4096m -Xmx4096m -XX:MaxPermSize=2048m\"", 48 | "ZEPPELIN_INTP_MEM": "\"-Xms4096m -Xmx4096m -XX:MaxPermSize=2048m\"" 49 | }, 50 | "Configurations": [] 51 | } 52 | ] 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/config.yaml: -------------------------------------------------------------------------------- 1 | scheduler: 2 | serviceType: "NodePort" 3 | 4 | worker: 5 | replicas: 1 6 | resources: 7 | limits: 8 | cpu: 4 9 | memory: 8G 10 | requests: 11 | cpu: 4 12 | memory: 8G 13 | 14 | jupyter: 15 | enabled: false 16 | 17 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/create_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | aws emr create-cluster \ 4 | --applications Name=Hadoop Name=Spark Name=Ganglia Name=Zeppelin \ 5 | --bootstrap-actions '[{"Path":"s3://parsely-public/jbennet/daskvsspark/reqs/bootstrap.sh","Name":"Dask Bootstrap"}]' \ 6 | --ebs-root-volume-size 20 \ 7 | --ec2-attributes '{"KeyName":"emr_jobs","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-ca9b41bd","EmrManagedSlaveSecurityGroup":"sg-f6a19e93","EmrManagedMasterSecurityGroup":"sg-f7a19e92"}' \ 8 | --service-role EMR_DefaultRole \ 9 | --release-label emr-5.11.1 \ 10 | --log-uri 's3n://parsely-emr-logs/' \ 11 | --name 'IT Testing' \ 12 | --configurations file://./conf.json \ 13 | --instance-groups file://./instances.json \ 14 | --scale-down-behavior
TERMINATE_AT_TASK_COMPLETION \ 15 | --region us-east-1 16 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_code.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd .. 4 | 5 | rsync -azvr \ 6 | --include "README.md" \ 7 | --include "requirements.txt" \ 8 | --include "setup.py" \ 9 | --include "/daskvsspark/" \ 10 | --include "/daskvsspark/*.py" \ 11 | --include "/daskvsspark/aggregate_*_yarn.sh" \ 12 | --exclude "*" \ 13 | ./ hadoop@dvss:/home/hadoop/daskvsspark/ 14 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop at any error, show all commands 4 | set -ex 5 | 6 | S3_PATH="s3://parsely-public/jbennet/daskvsspark/events/" 7 | 8 | # copy fake data to s3 9 | aws s3 sync ../daskvsspark/events/ ${S3_PATH} 10 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/deploy_reqs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | S3_PATH="s3://parsely-public/jbennet/daskvsspark/reqs/" 4 | 5 | # copy bootstrap script to s3 6 | aws s3 cp ../deployment/bootstrap.sh ${S3_PATH} 7 | aws s3 cp ../deployment/setup_dvss.sh ${S3_PATH} 8 | 9 | # copy log conf 10 | aws s3 cp ../deployment/log4j.properties ${S3_PATH} 11 | 12 | # copy reqs to s3 13 | aws s3 cp ../requirements.txt ${S3_PATH} 14 | aws s3 cp ../requirements-dask.txt ${S3_PATH} 15 | 16 | # copy jars to s3 17 | aws s3 cp ../scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar ${S3_PATH} 18 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/instances.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "InstanceCount": 1, 4 | "BidPrice": "2.50", 5 | "EbsConfiguration": { 6 | "EbsBlockDeviceConfigs": [ 7 | { 8 | "VolumeSpecification": { 9 | "SizeInGB": 32, 10 | "VolumeType": "gp2" 11 | }, 12 | "VolumesPerInstance": 1 13 | } 14 | ] 15 | }, 16 | "InstanceGroupType": "MASTER", 17 | "InstanceType": "m4.xlarge", 18 | "Name": "Master" 19 | }, 20 | { 21 | "InstanceCount": 2, 22 | "BidPrice": "0.15", 23 | "EbsConfiguration": { 24 | "EbsBlockDeviceConfigs": [ 25 | { 26 | "VolumeSpecification": { 27 | "SizeInGB": 64, 28 | "VolumeType": "gp2" 29 | }, 30 | "VolumesPerInstance": 1 31 | } 32 | ], 33 | "EbsOptimized": true 34 | }, 35 | "InstanceGroupType": "CORE", 36 | "InstanceType": "c4.2xlarge", 37 | "Name": "Core" 38 | } 39 | ] -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/log4j.properties: -------------------------------------------------------------------------------- 1 | # Error level only 2 | log4j.rootLogger=ERROR, console 3 | 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=ERROR 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # Quiet even more verbose loggers 22 | log4j.logger.org.apache.spark.sql.execution.datasources.parquet=ERROR 23 | log4j.logger.org.apache.spark.sql.execution.datasources.FileScanRDD=ERROR 24 | log4j.logger.org.apache.hadoop.io.compress.CodecPool=ERROR 25 | log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator=ERROR 26 | log4j.logger.org.apache.spark.ContextCleaner=ERROR 27 | 28 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 29 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 30 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 31 | -------------------------------------------------------------------------------- /2018/daskvsspark/deployment/setup_dvss.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This should run on master first, and then on workers. 3 | 4 | # Are we running on a master node? 5 | cat /var/lib/info/instance.json | grep '"isMaster": true' 6 | IS_MASTER=$? 7 | 8 | set -e 9 | 10 | build_egg() { 11 | echo "Building the egg" 12 | cd /home/hadoop/daskvsspark/ 13 | python3 setup.py bdist_egg 14 | cp ./dist/*.egg /home/hadoop/reqs/ 15 | echo "Uploading the egg to s3" 16 | aws s3 cp ~/reqs/daskvsspark-0.1-py3.6.egg s3://parsely-public/jbennet/daskvsspark/reqs/ 17 | } 18 | 19 | package_env() { 20 | #echo "Installing daskvsspark into master's venv" 21 | #cd /home/hadoop/daskvsspark 22 | #~/conda/envs/dvss/bin/python setup.py install -q 23 | 24 | if [[ -f ~/reqs/dvss.zip ]]; then 25 | rm ~/reqs/dvss.zip 26 | fi 27 | 28 | echo "Zipping up venv" 29 | cd ~/conda/envs 30 | zip -qr dvss.zip dvss 31 | mv dvss.zip ~/reqs/ 32 | echo "Uploading zip to s3" 33 | aws s3 cp ~/reqs/dvss.zip s3://parsely-public/jbennet/daskvsspark/reqs/ 34 | } 35 | 36 | download_env() { 37 | echo "Downloading venv and egg" 38 | aws s3 cp s3://parsely-public/jbennet/daskvsspark/reqs/daskvsspark-0.1-py3.6.egg /home/hadoop/reqs/ 39 | aws s3 cp s3://parsely-public/jbennet/daskvsspark/reqs/dvss.zip /home/hadoop/reqs/ 40 | } 41 | 42 | if [[ $IS_MASTER -eq 0 ]]; then 43 | build_egg 44 | package_env 45 | else 46 | download_env 47 | fi -------------------------------------------------------------------------------- /2018/daskvsspark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.1" 2 | 3 | services: 4 | scheduler: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | args: 9 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 10 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 11 | AWS_DEFAULT_REGION: $AWS_DEFAULT_REGION 12 | image: jbennet/dvss-base 13 | hostname: dvss-scheduler 14 | ports: 15 | - "8786:8786" 16 | - "8787:8787" 17 | command: ["dask-scheduler"] 18 | 19 | worker: 20 | build: 21 | context: .
22 | dockerfile: Dockerfile 23 | args: 24 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 25 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 26 | AWS_DEFAULT_REGION: $AWS_DEFAULT_REGION 27 | image: jbennet/dvss-base 28 | hostname: dvss-worker 29 | command: ["dask-worker", "scheduler:8786", "--nprocs", "4", "--nthreads", "1"] 30 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | name: dvss 2 | fullname: dvss 3 | version: 1.0.0 4 | appVersion: 0.0.1 5 | description: A Helm chart for dvss 6 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name | upper }}, released at name: {{ .Release.Name }}. 2 | 3 | To learn more about the release, try: 4 | 5 | $ helm status {{ .Release.Name }} # information about running pods and this message 6 | $ helm get {{ .Release.Name }} # get full Kubernetes specification 7 | 8 | This release includes a Dask scheduler and {{ .Values.worker.replicas }} Dask worker(s), each 9 | with {{ .Values.worker.default_resources.cpu }} cores and {{ .Values.worker.default_resources.memory }} of memory. 10 | 11 | The Dask scheduler exposes external services to connect directly to the Dask cluster. You can get 12 | these addresses by running the following: 13 | 14 | {{- if contains "LoadBalancer" .Values.scheduler.serviceType }} 15 | export DASK_SCHEDULER=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') 16 | export DASK_SCHEDULER_UI_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') 17 | echo http://$DASK_SCHEDULER_UI_IP:{{ .Values.webUI.servicePort }} -- Dask dashboard 18 | echo http://$DASK_SCHEDULER:{{ .Values.scheduler.servicePort }} -- Dask Client connection 19 | {{- else if contains "NodePort" .Values.scheduler.serviceType }} 20 | export DASK_SCHEDULER_PORT=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" . }}-scheduler -o jsonpath='{.spec.ports[0].nodePort}') 21 | export DASK_SCHEDULER_UI_PORT=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "dvss-helm.fullname" .
}}-scheduler -o jsonpath='{.spec.ports[1].nodePort}') 22 | echo http://localhost:$DASK_SCHEDULER_UI_PORT -- Dask dashboard 23 | echo http://localhost:$DASK_SCHEDULER_PORT -- Dask Client connection 24 | {{- end }} -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "dvss-helm.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "dvss-helm.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "dvss-helm.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/scheduler-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-scheduler 5 | labels: 6 | app: {{ template "dvss-helm.name" . }} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: scheduler 11 | spec: 12 | replicas: {{ .Values.scheduler.replicas }} 13 | selector: 14 | matchLabels: 15 | app: {{ template "dvss-helm.name" . }} 16 | release: {{ .Release.Name | quote }} 17 | component: scheduler 18 | strategy: 19 | type: RollingUpdate 20 | template: 21 | metadata: 22 | labels: 23 | app: {{ template "dvss-helm.name" . }} 24 | release: {{ .Release.Name | quote }} 25 | component: scheduler 26 | spec: 27 | containers: 28 | - name: {{ template "dvss-helm.fullname" . }}-scheduler 29 | image: "{{ .Values.scheduler.image.repository }}:{{ .Values.scheduler.image.tag }}" 30 | imagePullPolicy: {{ .Values.scheduler.image.pullPolicy }} 31 | args: 32 | - dask-scheduler 33 | - --port 34 | - "{{ .Values.scheduler.servicePort }}" 35 | - --bokeh-port 36 | - "8787" 37 | ports: 38 | - containerPort: 8786 39 | - containerPort: 8787 40 | resources: 41 | {{ toYaml .Values.scheduler.resources | indent 12 }} 42 | env: 43 | {{ toYaml .Values.scheduler.env | indent 12 }} 44 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/scheduler-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-scheduler 5 | labels: 6 | app: {{ template "dvss-helm.name" . 
}} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: scheduler 11 | spec: 12 | ports: 13 | - name: {{ template "dvss-helm.fullname" . }}-scheduler 14 | port: {{ .Values.scheduler.servicePort }} 15 | targetPort: 8786 16 | nodePort: {{ .Values.scheduler.nodePort }} 17 | - name: {{ template "dvss-helm.fullname" . }}-webui 18 | port: {{ .Values.webUI.servicePort }} 19 | targetPort: 8787 20 | nodePort: {{ .Values.webUI.nodePort }} 21 | selector: 22 | app: {{ template "dvss-helm.name" . }} 23 | release: {{ .Release.Name | quote }} 24 | component: scheduler 25 | type: {{ .Values.scheduler.serviceType }} 26 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/templates/worker-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "dvss-helm.fullname" . }}-worker 5 | labels: 6 | app: {{ template "dvss-helm.name" . }} 7 | heritage: {{ .Release.Service | quote }} 8 | release: {{ .Release.Name | quote }} 9 | chart: {{ template "dvss-helm.chart" . }} 10 | component: worker 11 | spec: 12 | replicas: {{ .Values.worker.replicas }} 13 | selector: 14 | matchLabels: 15 | app: {{ template "dvss-helm.name" . }} 16 | release: {{ .Release.Name | quote }} 17 | component: worker 18 | strategy: 19 | type: RollingUpdate 20 | template: 21 | metadata: 22 | labels: 23 | app: {{ template "dvss-helm.name" . }} 24 | release: {{ .Release.Name | quote }} 25 | component: worker 26 | spec: 27 | containers: 28 | - name: {{ template "dvss-helm.fullname" . }}-worker 29 | image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}" 30 | imagePullPolicy: {{ .Values.worker.image.pullPolicy }} 31 | args: 32 | - dask-worker 33 | {{- if .Values.worker.resources }} 34 | - {{ template "dvss-helm.fullname" . }}-scheduler:{{ .Values.scheduler.servicePort }} 35 | - --nthreads 36 | - {{ default .Values.worker.resources.limits.cpu .Values.worker.default_resources.cpu | quote }} 37 | - --memory-limit 38 | - {{ default .Values.worker.resources.limits.memory .Values.worker.default_resources.memory | quote }} 39 | {{- else if .Values.worker.default_resources }} 40 | - {{ template "dvss-helm.fullname" . 
}}-scheduler:{{ .Values.scheduler.servicePort }} 41 | - --nthreads 42 | - {{ .Values.worker.default_resources.cpu | quote }} 43 | - --memory-limit 44 | - {{ .Values.worker.default_resources.memory | quote }} 45 | {{- end }} 46 | - --no-bokeh 47 | ports: 48 | - containerPort: 8789 49 | resources: 50 | {{ toYaml .Values.worker.resources | indent 12 }} 51 | env: 52 | {{ toYaml .Values.worker.env | indent 12 }} 53 | -------------------------------------------------------------------------------- /2018/daskvsspark/dvss-helm/values.yaml: -------------------------------------------------------------------------------- 1 | nameOverride: dvss 2 | fullnameOverride: dvss 3 | 4 | scheduler: 5 | name: scheduler 6 | image: 7 | repository: "jbennet/dvss-base" 8 | tag: "latest" 9 | pullPolicy: IfNotPresent 10 | replicas: 1 11 | serviceType: "NodePort" 12 | servicePort: 8786 13 | nodePort: 30786 14 | resources: 15 | limits: 16 | cpu: 1 17 | memory: 1G 18 | requests: 19 | cpu: 1 20 | memory: 1G 21 | 22 | webUI: 23 | name: webui 24 | servicePort: 80 25 | nodePort: 30787 26 | 27 | worker: 28 | name: worker 29 | image: 30 | repository: "jbennet/dvss-base" 31 | tag: "latest" 32 | pullPolicy: IfNotPresent 33 | replicas: 1 34 | aptPackages: >- 35 | default_resources: 36 | cpu: 4 37 | memory: 12GiB 38 | resources: {} 39 | # limits: 40 | # cpu: 1 41 | # memory: 3G 42 | # requests: 43 | # cpu: 1 44 | # memory: 3G 45 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements-dask.txt: -------------------------------------------------------------------------------- 1 | dask==0.17.2 2 | distributed==1.21.6 3 | bokeh==0.12.15 # for UI 4 | fastparquet==0.1.5 # for parquet 5 | python-snappy==0.5.2 # for snappy compression in parquet 6 | knit==0.2.4 7 | s3fs==0.1.4 8 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ipython==6.3.1 2 | -------------------------------------------------------------------------------- /2018/daskvsspark/requirements.txt: -------------------------------------------------------------------------------- 1 | pytz==2017.2 2 | simplejson>=3.11.1 3 | -------------------------------------------------------------------------------- /2018/daskvsspark/samples/agg1hour.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "http://a.com/articles/1|a.com|2017-09-15T01:01", 3 | "_index": "events", 4 | "customer": "a.com", 5 | "url": "http://a.com/articles/1", 6 | "freq": "1hour", 7 | "ts": "2017-09-15T01:01:00", 8 | "metrics": { 9 | "page_views": 3, 10 | "visitors": 3 11 | }, 12 | "referrers": { 13 | "http://google.com/": 1, 14 | "http://bing.com/": 1, 15 | "http://facebook.com/": 1 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/README.md: -------------------------------------------------------------------------------- 1 | # Scala UDAFs for Pyspark 2 | 3 | ## Huh? 4 | 5 | Spark has lots and lots of wonderful aggregations! But sometimes, standard 6 | aggregations (`min`, `max`, `avg` etc.) are not enough. 
For example, what if 7 | I have a table like this: 8 | 9 | | url | referrer | 10 | |:---------------------:|:-----------------:| 11 | | http://a.com/article1 | http://google.com | 12 | | http://a.com/article2 | http://google.com | 13 | | http://a.com/article2 | http://yahoo.com | 14 | 15 | 16 | and I want to group things by `url`, count `referrer` values in each group, and put 17 | those counts in a dict: 18 | 19 | ```sql 20 | select url, count_values(referrer) as referrers 21 | from df 22 | group by url 23 | ``` 24 | 25 | like this: 26 | 27 | | url | referrers | 28 | |:---------------------:|:-----------------------------------------------:| 29 | | http://a.com/article1 | {"http://google.com": 1} | 30 | | http://a.com/article2 | {"http://google.com": 1, "http://yahoo.com": 1} | 31 | 32 | There's one little problem. PySpark doesn't support UDAFs written in Python: 33 | 34 | https://issues.apache.org/jira/browse/SPARK-10915 35 | 36 | ## So I'm screwed? 37 | 38 | Not quite. It is possible to write a UDAF in Scala and call it from Python. 39 | 40 | ## You lost me at Scala. 41 | 42 | It's not so bad. Besides, I already wrote it. See the code in "udafs.scala". 43 | 44 | ## How do I build this? 45 | 46 | ``` 47 | $ cd scala 48 | $ sbt compile 49 | $ sbt package 50 | ``` 51 | 52 | If you have problems finding dependencies when you compile, try deleting the `~/.ivy2/` cache. 53 | 54 | ## How do I use this? 55 | 56 | Note: this will only work with Spark 2.1.0 and up. 57 | 58 | * Start PySpark with the jar on the classpath. 59 | * Get an instance of the class using the `sc._jvm` object. 60 | * Register it as a UDF to use in Spark SQL. 61 | * Or wrap it in a Python function to use in aggregations. 62 | 63 | ## Show me. 64 | 65 | ``` 66 | --- daskvsspark/scala $ ipyspark --driver-class-path target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar 67 | Using Python version 3.6.5 (default, Apr 2 2018 14:34:27) 68 | SparkSession available as 'spark'. 69 | 70 | In [1]: df = sqlContext.createDataFrame([('url1', 'ref1'), ('url2', 'ref1'), ('url2', 'ref2')], ['url', 'referrer']) 71 | 72 | In [2]: agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter() 73 | 74 | In [4]: sqlContext.sparkSession._jsparkSession.udf().register('count_values', agg_counter) 75 | Out[4]: JavaObject id=o45 76 | 77 | In [5]: df.createOrReplaceTempView('df') 78 | 79 | In [6]: sqlContext.sql('select url, count_values(referrer) as referrers from df group by url').show() 80 | +----+--------------------+ 81 | | url| referrers| 82 | +----+--------------------+ 83 | |url1| [ref1 -> 1]| 84 | |url2|[ref1 -> 1, ref2 ...| 85 | +----+--------------------+ 86 | ``` 87 | 88 | or: 89 | 90 | ``` 91 | In [7]: from pyspark.sql.column import Column, _to_java_column, _to_seq 92 | 93 | In [11]: def count_values(col): 94 | ...: counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter().apply 95 | ...: return Column(counter(_to_seq(sc, [col], _to_java_column))) 96 | ...: 97 | ...: 98 | 99 | In [12]: df.groupBy("url").agg(count_values("referrer").alias("referrer")).show() 100 | +----+--------------------+ 101 | | url| referrer| 102 | +----+--------------------+ 103 | |url1| [ref1 -> 1]| 104 | |url2|[ref1 -> 1, ref2 ...| 105 | +----+--------------------+ 106 | ``` 107 | 108 | You're welcome.
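## Can I use this from a script, too?

Yes. Here is a minimal sketch of a standalone driver, stitching together the snippets above. The app name is made up, and the jar path assumes you ran `sbt package` in the `scala/` directory; with `spark-submit`, passing the jar via `--driver-class-path` works as well.

```
# Sketch of a standalone PySpark driver that uses the Scala UDAF.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("count-values-demo")  # hypothetical app name
    # assumed location of the jar built by `sbt package`
    .config("spark.driver.extraClassPath",
            "scala/target/scala-2.11/daskvsspark-udafs_2.11-0.0.1.jar")
    .getOrCreate()
)
sc = spark.sparkContext

df = spark.createDataFrame(
    [("url1", "ref1"), ("url2", "ref1"), ("url2", "ref2")],
    ["url", "referrer"])

# Register the Scala UDAF under a name callable from Spark SQL.
agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter()
spark._jsparkSession.udf().register("count_values", agg_counter)

df.createOrReplaceTempView("df")
spark.sql("select url, count_values(referrer) as referrers "
          "from df group by url").show()
```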
-------------------------------------------------------------------------------- /2018/daskvsspark/scala/build.sbt: -------------------------------------------------------------------------------- 1 | name := "daskvsspark-udafs" 2 | version := "0.0.1" 3 | scalaVersion := "2.11.8" 4 | libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0" 5 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.0" 6 | libraryDependencies += "org.apache.spark" %% "spark-hive" % "2.0.0" % "test" 7 | libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "2.0.0_0.4.4" % "test" 8 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/src/main/scala/com/jbennet/daskvsspark/udafs.scala: -------------------------------------------------------------------------------- 1 | package com.jbennet.daskvsspark.udafs 2 | 3 | import org.apache.spark.sql.expressions.MutableAggregationBuffer 4 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types._ 7 | 8 | 9 | /** 10 | * Aggregate Counter. Counts values and returns a Map with "value" -> count. 11 | */ 12 | class AggregateCounter extends UserDefinedAggregateFunction { 13 | 14 | // These are the input fields for your aggregate function. 15 | override def inputSchema: org.apache.spark.sql.types.StructType = 16 | StructType( 17 | StructField("value", StringType) :: Nil 18 | ) 19 | 20 | // These are the internal fields you keep for computing your aggregate. 21 | override def bufferSchema: StructType = 22 | StructType( 23 | StructField("counter", MapType(StringType, IntegerType)) :: Nil 24 | ) 25 | 26 | // This is the output type of your aggregation function. 27 | override def dataType: DataType = MapType(StringType, IntegerType) 28 | 29 | override def deterministic: Boolean = true 30 | 31 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 32 | buffer(0) = null 33 | } 34 | 35 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 36 | val map = buffer.getAs[Map[String, Integer]](0) 37 | val value = input.getAs[String](0) 38 | buffer(0) = addValue(map, value) 39 | } 40 | 41 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 42 | val m1 = buffer1.getAs[Map[String, Integer]](0) 43 | val m2 = buffer2.getAs[Map[String, Integer]](0) 44 | buffer1(0) = mergeMap(m1, m2) 45 | } 46 | 47 | override def evaluate(buffer: Row): Any = { 48 | buffer.getMap(0) 49 | } 50 | 51 | /** Add value to map. 52 | * 53 | * @param map map of counts so far (may be null) 54 | * @param value value to count (may be null) 55 | * @return updated map 56 | */ 57 | def addValue(map: Map[String, Integer], value: String): Map[String, Integer] = (map, value) match { 58 | case (null, null) => Map() 59 | case (null, v) => Map(v -> 1) 60 | case (m, null) => m 61 | case _ => 62 | val zero: Integer = 0 63 | if (map.contains(value)) map + (value -> (map.getOrElse(value, zero) + 1)) 64 | else map + (value -> 1) 65 | } 66 | 67 | /** Add two maps into one.
68 | * 69 | * @param a first map to merge 70 | * @param b second map to merge 71 | * @return merged map 72 | */ 73 | def mergeMap(a: Map[String, Integer], b: Map[String, Integer]): Map[String, Integer] = (a, b) match { 74 | case (null, null) => null 75 | case (null, y) => y 76 | case (x, null) => x 77 | case _ => 78 | val zero: Integer = 0 79 | (a.keySet ++ b.keySet).map( 80 | k => { 81 | val v1: Integer = a.getOrElse(k, 0) 82 | val v2: Integer = b.getOrElse(k, 0) 83 | k -> (v1 + v2:Integer) 84 | } 85 | ).toMap 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /2018/daskvsspark/scala/src/test/scala/com/jbennet/daskvsspark/AggregateCounterTest.scala: -------------------------------------------------------------------------------- 1 | package com.jbennet.daskvsspark 2 | 3 | import com.holdenkarau.spark.testing._ 4 | import com.jbennet.daskvsspark.udafs.AggregateCounter 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest.FunSuite 9 | 10 | 11 | /** 12 | * Tests for AggregateCounter 13 | */ 14 | class AggregateCounterTest extends FunSuite with DataFrameSuiteBase { 15 | 16 | private val schema = StructType(Array( 17 | StructField("url", StringType), 18 | StructField("referrer", StringType) 19 | )) 20 | 21 | private val aggcount = new AggregateCounter 22 | 23 | test("different keys should combine") { 24 | val data = Array( 25 | Row("url1", "ref2"), 26 | Row("url1", "ref1") 27 | ) 28 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 29 | val rows = df.groupBy("url") 30 | .agg(aggcount(col("referrer"))) 31 | .collect() 32 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 33 | assert(agg1.size == 2) 34 | assert(agg1 == Map("ref1" -> (1:Integer), "ref2" -> (1:Integer))) 35 | } 36 | 37 | test("same keys should add") { 38 | val data = Array( 39 | Row("url1", "ref1"), 40 | Row("url1", "ref1") 41 | ) 42 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 43 | val rows = df.groupBy("url") 44 | .agg(aggcount(col("referrer"))) 45 | .collect() 46 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 47 | assert(agg1.size == 1) 48 | assert(agg1 == Map("ref1" -> (2:Integer))) 49 | } 50 | 51 | test("null keys do not count") { 52 | val data = Array( 53 | Row("url1", null), 54 | Row("url1", "ref1") 55 | ) 56 | val df = sqlContext.createDataFrame(sc.parallelize(data), schema) 57 | val rows = df.groupBy("url") 58 | .agg(aggcount(col("referrer"))) 59 | .collect() 60 | val agg1: Map[String, Integer] = rows(0)(1).asInstanceOf[Map[String, Integer]] 61 | assert(agg1.size == 1) 62 | assert(agg1 == Map("ref1" -> (1:Integer))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /2018/daskvsspark/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup(name='daskvsspark', 5 | version='0.1', 6 | description='Dask and Spark example', 7 | author='Irina Truong', 8 | author_email='irinatruong@gmail.com', 9 | packages=find_packages() 10 | ) 11 | -------------------------------------------------------------------------------- /2018/daskvsspark/trials/Aggregate without index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 34, 6 | "metadata": {}, 7
| "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import dask.dataframe as dd\n", 12 | "import itertools as it\n", 13 | "from pprint import pprint\n", 14 | "\n", 15 | "\n", 16 | "pd.set_option('display.expand_frame_repr', False)\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 14, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " year name subject grade pass\n", 83 | "0 1 Mary Smith Math 4 1\n", 84 | "1 1 Mary Smith Computer Science 5 1\n", 85 | "2 1 Mary Smith English Literature 2 0" 86 | ] 87 | }, 88 | "execution_count": 14, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "pdf = pd.DataFrame.from_records([\n", 95 | " (1, \"Mary Smith\", \"Math\", 4, 1),\n", 96 | " (1, \"Mary Smith\", \"Computer Science\", 5, 1),\n", 97 | " (1, \"Mary Smith\", \"English Literature\", 2, 0),\n", 98 | " (2, \"Mary Smith\", \"Math\", 4, 1),\n", 99 | " (2, \"Mary Smith\", \"Computer Science\", 5, 1),\n", 100 | " (2, \"Mary Smith\", \"English Literature\", 4, 1),\n", 101 | " (1, \"John Brown\", \"Math\", 1, 0),\n", 102 | " (1, \"John Brown\", \"Computer Science\", 4, 1),\n", 103 | " (1, \"John Brown\", \"English Literature\", 5, 1),\n", 104 | " (2, \"John Brown\", \"Math\", 4, 1),\n", 105 | " (2, \"John Brown\", \"Computer Science\", 3, 0),\n", 106 | " (2, \"John Brown\", \"English Literature\", 5, 1),\n", 107 | "],\n", 108 | "columns=['year', 'name', 'subject', 'grade', 'pass'])\n", 109 | "\n", 110 | "df = dd.from_pandas(pdf, 2)\n", 111 | "\n", 112 | "df.head(3)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 35, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "collect_list = dd.Aggregation(\n", 122 | " 'collect_list',\n", 123 | " lambda s: s.apply(list),\n", 124 | " lambda s: s.apply(lambda chunks: list(it.chain.from_iterable(chunks))),\n", 125 | ")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 37, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " grade pass\n", 210 | " grades mean_grade passes\n", 211 | "year name \n", 212 | "1 John Brown [1, 4, 5] 3.333333 2\n", 213 | " Mary Smith [4, 5, 2] 3.666667 2\n", 214 | "2 John Brown [4, 3, 5] 4.000000 2\n", 215 | " Mary Smith [4, 5, 4] 4.333333 3" 216 | ] 217 | }, 218 | "execution_count": 37, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "ag = df.groupby(['year', 'name']).agg({\n", 225 | " 'grade': {'mean_grade': np.mean,\n", 226 | " 'grades': collect_list},\n", 227 | " 'pass': {'passes': 'sum'}\n", 228 | "})\n", 229 | "\n", 230 | "ag.compute()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 38, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "MultiIndex(levels=[[1, 2], [u'John Brown', u'Mary Smith']],\n", 242 | " labels=[[0, 0, 1, 1], [0, 1, 0, 1]],\n", 243 | " names=[u'year', u'name'])" 244 | ] 245 | }, 246 | "execution_count": 38, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "ag.index.compute()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 39, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "MultiIndex(levels=[[u'grade', u'pass'], [u'grades', u'mean_grade', u'passes']],\n", 264 | " labels=[[0, 0, 1], [0, 1, 2]])" 265 | ] 266 | }, 267 | "execution_count": 39, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "ag.columns" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 47, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/html": [ 284 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 351 | "
" 352 | ], 353 | "text/plain": [ 354 | " year name grade pass\n", 355 | " grades mean_grade passes\n", 356 | "0 1 John Brown [1, 4, 5] 3.333333 2\n", 357 | "1 1 Mary Smith [4, 5, 2] 3.666667 2\n", 358 | "2 2 John Brown [4, 3, 5] 4.000000 2\n", 359 | "3 2 Mary Smith [4, 5, 4] 4.333333 3" 360 | ] 361 | }, 362 | "execution_count": 47, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "ri = ag.reset_index()\n", 369 | "ri.compute()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 48, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "RangeIndex(start=0, stop=4, step=1)" 381 | ] 382 | }, 383 | "execution_count": 48, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "ri.index.compute()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 49, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "MultiIndex(levels=[[u'grade', u'pass', u'name', u'year'], [u'grades', u'mean_grade', u'passes', u'']],\n", 401 | " labels=[[3, 2, 0, 0, 1], [3, 3, 0, 1, 2]])" 402 | ] 403 | }, 404 | "execution_count": 49, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "ri.columns" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 50, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "pandas.core.indexes.multi.MultiIndex" 422 | ] 423 | }, 424 | "execution_count": 50, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "ri.columns.__class__" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 51, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "Index([u'year', u'name', u'grade', u'grade', u'pass'], dtype='object')" 442 | ] 443 | }, 444 | "execution_count": 51, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "ri.columns.get_level_values(0)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 52, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "Index([u'', u'', u'grades', u'mean_grade', u'passes'], dtype='object')" 462 | ] 463 | }, 464 | "execution_count": 52, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "ri.columns.get_level_values(1)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 55, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/html": [ 481 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 541 | "
" 542 | ], 543 | "text/plain": [ 544 | " year name grades mean_grade passes\n", 545 | "0 1 John Brown [1, 4, 5] 3.333333 2\n", 546 | "1 1 Mary Smith [4, 5, 2] 3.666667 2\n", 547 | "2 2 John Brown [4, 3, 5] 4.000000 2\n", 548 | "3 2 Mary Smith [4, 5, 4] 4.333333 3" 549 | ] 550 | }, 551 | "execution_count": 55, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "ri.columns = ['year', 'name', 'grades', 'mean_grade', 'passes']\n", 558 | "ri.compute()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 57, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "['to_bag',\n", 570 | " 'to_csv',\n", 571 | " 'to_delayed',\n", 572 | " 'to_hdf',\n", 573 | " 'to_html',\n", 574 | " 'to_parquet',\n", 575 | " 'to_records',\n", 576 | " 'to_string',\n", 577 | " 'to_timestamp']" 578 | ] 579 | }, 580 | "execution_count": 57, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [ 586 | "[_ for _ in dir(ri) if _.startswith('to')]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 60, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 597 | "rec.array([(0, 1, 'John Brown', list([1, 4, 5]), 3.33333333, 2),\n", 598 | " (1, 1, 'Mary Smith', list([4, 5, 2]), 3.66666667, 2),\n", 599 | " (2, 2, 'John Brown', list([4, 3, 5]), 4. , 2),\n", 600 | " (3, 2, 'Mary Smith', list([4, 5, 4]), 4.33333333, 3)],\n", 601 | " dtype=[(u'index', '\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 68 | "" 69 | ], 70 | "text/plain": [ 71 | " date request \\\n", 72 | "0 31/Aug/2015:23:49:01 +0000 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 73 | "\n", 74 | " referrer user_agent \n", 75 | "0 https://foo.com/some/url Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 76 | ] 77 | }, 78 | "execution_count": 56, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "log_lines = [\"/logger/ || 70.123.102.76 || - || 31/Aug/2015:23:49:01 +0000 || GET /logger/?action-view&site_id=123 HTTP/1.1 || 200 || 236 || https://foo.com/some/url || Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36 || - || - || - || 0.000\"]\n", 85 | "data = [extract_fields(l) for l in log_lines]\n", 86 | "df = pd.DataFrame(data)\n", 87 | "df.columns = ['date', 'request', 'referrer', 'user_agent']\n", 88 | "df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 57, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "0 31/Aug/2015:23:49:01 +0000\n", 100 | "Name: date, dtype: object" 101 | ] 102 | }, 103 | "execution_count": 57, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df.date" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 58, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " date request \\\n", 158 | "0 2015-08-31 23:49:01+00:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 159 | "\n", 160 | " referrer user_agent \n", 161 | "0 https://foo.com/some/url Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 162 | ] 163 | }, 164 | "execution_count": 58, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "df['date'] = pd.to_datetime(df['date'], format='%d/%b/%Y:%H:%M:%S +0000', utc=True)\n", 171 | "df" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 59, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/html": [ 182 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " request \\\n", 224 | "date \n", 225 | "2015-08-31 23:49:01+00:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 226 | "\n", 227 | " referrer \\\n", 228 | "date \n", 229 | "2015-08-31 23:49:01+00:00 https://foo.com/some/url \n", 230 | "\n", 231 | " user_agent \n", 232 | "date \n", 233 | "2015-08-31 23:49:01+00:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 234 | ] 235 | }, 236 | "execution_count": 59, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df.set_index('date', inplace=True)\n", 243 | "df" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 60, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "DatetimeIndex(['2015-08-31 23:49:01+00:00'], dtype='datetime64[ns, UTC]', name=u'date', freq=None)" 255 | ] 256 | }, 257 | "execution_count": 60, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "df.index" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 61, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 312 | "
" 313 | ], 314 | "text/plain": [ 315 | " request \\\n", 316 | "date \n", 317 | "2015-08-31 16:49:01-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 318 | "\n", 319 | " referrer \\\n", 320 | "date \n", 321 | "2015-08-31 16:49:01-07:00 https://foo.com/some/url \n", 322 | "\n", 323 | " user_agent \n", 324 | "date \n", 325 | "2015-08-31 16:49:01-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 326 | ] 327 | }, 328 | "execution_count": 61, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df.index = df.index.tz_convert('America/Los_Angeles')\n", 335 | "df" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 71, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | " request \\\n", 388 | "date \n", 389 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 390 | "\n", 391 | " referrer \\\n", 392 | "date \n", 393 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 394 | "\n", 395 | " user_agent \n", 396 | "date \n", 397 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... " 398 | ] 399 | }, 400 | "execution_count": 71, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "df.index = df.index.floor('1H')\n", 407 | "df" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 82, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 456 | "
" 457 | ], 458 | "text/plain": [ 459 | " referrer \\\n", 460 | "date request \n", 461 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 1 \n", 462 | "\n", 463 | " user_agent \n", 464 | "date request \n", 465 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 1 " 466 | ] 467 | }, 468 | "execution_count": 82, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "df.groupby(['date', 'request']).count()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 84, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "datetime.datetime(2017, 12, 27, 15, 17, 35, 311839)" 486 | ] 487 | }, 488 | "execution_count": 84, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "now = dt.datetime.now()\n", 495 | "now" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 124, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "datetime.datetime(2018, 1, 26, 15, 17, 35, 311839)" 507 | ] 508 | }, 509 | "execution_count": 124, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "next_month = now + dt.timedelta(days=30)\n", 516 | "next_month" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 102, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/html": [ 527 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 568 | "
" 569 | ], 570 | "text/plain": [ 571 | " request \\\n", 572 | "date \n", 573 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 574 | "\n", 575 | " referrer \\\n", 576 | "date \n", 577 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 578 | "\n", 579 | " user_agent \\\n", 580 | "date \n", 581 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... \n", 582 | "\n", 583 | " week_start \n", 584 | "date \n", 585 | "2015-08-31 16:00:00-07:00 2015-08-31 " 586 | ] 587 | }, 588 | "execution_count": 102, 589 | "metadata": {}, 590 | "output_type": "execute_result" 591 | } 592 | ], 593 | "source": [ 594 | "df['week_start'] = df.index.to_period('W').start_time\n", 595 | "df" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 120, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/html": [ 606 | "
(HTML table markup lost in extraction; the text/plain rendering below shows the same table)
\n", 650 | "
" 651 | ], 652 | "text/plain": [ 653 | " request \\\n", 654 | "date \n", 655 | "2015-08-31 16:00:00-07:00 GET /logger/?action-view&site_id=123 HTTP/1.1 \n", 656 | "\n", 657 | " referrer \\\n", 658 | "date \n", 659 | "2015-08-31 16:00:00-07:00 https://foo.com/some/url \n", 660 | "\n", 661 | " user_agent \\\n", 662 | "date \n", 663 | "2015-08-31 16:00:00-07:00 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3... \n", 664 | "\n", 665 | " week_start next_week_start \n", 666 | "date \n", 667 | "2015-08-31 16:00:00-07:00 2015-08-31 2015-09-07 " 668 | ] 669 | }, 670 | "execution_count": 120, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "df['next_week_start'] = df['week_start'] + pd.DateOffset(weeks=1)\n", 677 | "df" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 121, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "DatetimeIndex(['2017-12-27 15:17:35.311839', '2017-12-28 15:17:35.311839',\n", 689 | " '2017-12-29 15:17:35.311839', '2017-12-30 15:17:35.311839',\n", 690 | " '2017-12-31 15:17:35.311839', '2018-01-01 15:17:35.311839',\n", 691 | " '2018-01-02 15:17:35.311839', '2018-01-03 15:17:35.311839',\n", 692 | " '2018-01-04 15:17:35.311839', '2018-01-05 15:17:35.311839'],\n", 693 | " dtype='datetime64[ns]', freq='D')" 694 | ] 695 | }, 696 | "execution_count": 121, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "pd.date_range(now, periods=10, freq='D')" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 125, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "text/plain": [ 713 | "DatetimeIndex(['2017-12-31 15:17:35.311839', '2018-01-07 15:17:35.311839',\n", 714 | " '2018-01-14 15:17:35.311839', '2018-01-21 15:17:35.311839'],\n", 715 | " dtype='datetime64[ns]', freq='W-SUN')" 716 | ] 717 | }, 718 | "execution_count": 125, 719 | "metadata": {}, 720 | "output_type": "execute_result" 721 | } 722 | ], 723 | "source": [ 724 | "pd.date_range(now, next_month, freq='W')" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [] 733 | } 734 | ], 735 | "metadata": { 736 | "kernelspec": { 737 | "display_name": "Python 2", 738 | "language": "python", 739 | "name": "python2" 740 | }, 741 | "language_info": { 742 | "codemirror_mode": { 743 | "name": "ipython", 744 | "version": 2 745 | }, 746 | "file_extension": ".py", 747 | "mimetype": "text/x-python", 748 | "name": "python", 749 | "nbconvert_exporter": "python", 750 | "pygments_lexer": "ipython2", 751 | "version": "2.7.13" 752 | } 753 | }, 754 | "nbformat": 4, 755 | "nbformat_minor": 2 756 | } 757 | -------------------------------------------------------------------------------- /2018/datetimes/requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==0.10.0 2 | Babel==2.4.0 3 | maya==0.3.2 4 | pendulum==1.2.4 5 | pytz==2017.2 6 | tzlocal==1.4 7 | -------------------------------------------------------------------------------- /2018/sqlpandas/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/README.md -------------------------------------------------------------------------------- /2018/sqlpandas/data-hp/houses.csv: 
-------------------------------------------------------------------------------- 1 | code,name,headmaster 2 | G,Gryffindor,Minerva McGonagall 3 | S,Slytherin,Severus Snape 4 | R,Ravenclaw,Filius Flitwick 5 | H,Hufflepuff,Pomona Sprout -------------------------------------------------------------------------------- /2018/sqlpandas/data-hp/students.csv: -------------------------------------------------------------------------------- 1 | name,house_code 2 | Harry Potter,G 3 | Ron Weasley,G 4 | Hermione Granger,G 5 | Draco Malfoy,S 6 | Vincent Crabbe,S 7 | Gregory Goyle,S 8 | Luna Lovegood,R 9 | Padma Patil,R 10 | Parvati Patil,R 11 | Hannah Abbott,H 12 | Susan Bones,H 13 | Cedric Diggory,H -------------------------------------------------------------------------------- /2018/sqlpandas/data/countries.csv: -------------------------------------------------------------------------------- 1 | "id","code","name","continent","wikipedia_link","keywords" 2 | 302672,"AD","Andorra","EU","http://en.wikipedia.org/wiki/Andorra", 3 | 302618,"AE","United Arab Emirates","AS","http://en.wikipedia.org/wiki/United_Arab_Emirates","UAE,مطارات في الإمارات العربية المتحدة" 4 | 302619,"AF","Afghanistan","AS","http://en.wikipedia.org/wiki/Afghanistan", 5 | 302722,"AG","Antigua and Barbuda","NA","http://en.wikipedia.org/wiki/Antigua_and_Barbuda", 6 | 302723,"AI","Anguilla","NA","http://en.wikipedia.org/wiki/Anguilla", 7 | 302673,"AL","Albania","EU","http://en.wikipedia.org/wiki/Albania", 8 | 302620,"AM","Armenia","AS","http://en.wikipedia.org/wiki/Armenia", 9 | 302556,"AO","Angola","AF","http://en.wikipedia.org/wiki/Angola", 10 | 302615,"AQ","Antarctica","AN","http://en.wikipedia.org/wiki/Antarctica", 11 | 302789,"AR","Argentina","SA","http://en.wikipedia.org/wiki/Argentina","Aeropuertos de Argentina" 12 | 302763,"AS","American Samoa","OC","http://en.wikipedia.org/wiki/American_Samoa", 13 | 302674,"AT","Austria","EU","http://en.wikipedia.org/wiki/Austria","Flughäfen in Österreich" 14 | 302764,"AU","Australia","OC","http://en.wikipedia.org/wiki/Australia", 15 | 302725,"AW","Aruba","NA","http://en.wikipedia.org/wiki/Aruba", 16 | 302621,"AZ","Azerbaijan","AS","http://en.wikipedia.org/wiki/Azerbaijan", 17 | 302675,"BA","Bosnia and Herzegovina","EU","http://en.wikipedia.org/wiki/Bosnia_and_Herzegovina", 18 | 302726,"BB","Barbados","NA","http://en.wikipedia.org/wiki/Barbados", 19 | 302622,"BD","Bangladesh","AS","http://en.wikipedia.org/wiki/Bangladesh", 20 | 302676,"BE","Belgium","EU","http://en.wikipedia.org/wiki/Belgium","Aéroports de Belgique,Luchthavens van België" 21 | 302557,"BF","Burkina Faso","AF","http://en.wikipedia.org/wiki/Burkina_Faso", 22 | 302677,"BG","Bulgaria","EU","http://en.wikipedia.org/wiki/Bulgaria", 23 | 302623,"BH","Bahrain","AS","http://en.wikipedia.org/wiki/Bahrain","مطارات البحرين" 24 | 302558,"BI","Burundi","AF","http://en.wikipedia.org/wiki/Burundi", 25 | 302559,"BJ","Benin","AF","http://en.wikipedia.org/wiki/Benin", 26 | 302760,"BL","Saint Barthélemy","NA","http://en.wikipedia.org/wiki/Saint_Barthélemy", 27 | 302727,"BM","Bermuda","NA","http://en.wikipedia.org/wiki/Bermuda", 28 | 302624,"BN","Brunei","AS","http://en.wikipedia.org/wiki/Brunei", 29 | 302790,"BO","Bolivia","SA","http://en.wikipedia.org/wiki/Bolivia","Aeropuertos de Bolivia" 30 | 302724,"BQ","Caribbean Netherlands","NA","http://en.wikipedia.org/wiki/Caribbean_Netherlands", 31 | 302791,"BR","Brazil","SA","http://en.wikipedia.org/wiki/Brazil","Brasil, Brasilian" 32 | 302728,"BS","Bahamas","NA","http://en.wikipedia.org/wiki/Bahamas", 
33 | 302625,"BT","Bhutan","AS","http://en.wikipedia.org/wiki/Bhutan", 34 | 302560,"BW","Botswana","AF","http://en.wikipedia.org/wiki/Botswana", 35 | 302678,"BY","Belarus","EU","http://en.wikipedia.org/wiki/Belarus","Belarussian, Беларусь" 36 | 302729,"BZ","Belize","NA","http://en.wikipedia.org/wiki/Belize", 37 | 302730,"CA","Canada","NA","http://en.wikipedia.org/wiki/Canada", 38 | 302626,"CC","Cocos (Keeling) Islands","AS","http://en.wikipedia.org/wiki/Cocos_(Keeling)_Islands", 39 | 302561,"CD","Congo (Kinshasa)","AF","http://en.wikipedia.org/wiki/Congo_(Kinshasa)", 40 | 302562,"CF","Central African Republic","AF","http://en.wikipedia.org/wiki/Central_African_Republic", 41 | 302563,"CG","Congo (Brazzaville)","AF","http://en.wikipedia.org/wiki/Congo_(Brazzaville)", 42 | 302679,"CH","Switzerland","EU","http://en.wikipedia.org/wiki/Switzerland","Aéroports de la Suisse,Flughäfen der Schweiz" 43 | 302564,"CI","Côte d'Ivoire","AF","http://en.wikipedia.org/wiki/Côte_d'Ivoire","Ivory Coast" 44 | 302765,"CK","Cook Islands","OC","http://en.wikipedia.org/wiki/Cook_Islands", 45 | 302792,"CL","Chile","SA","http://en.wikipedia.org/wiki/Chile","Aeropuertos de Chile" 46 | 302565,"CM","Cameroon","AF","http://en.wikipedia.org/wiki/Cameroon", 47 | 302627,"CN","China","AS","http://en.wikipedia.org/wiki/China","中国的机场" 48 | 302793,"CO","Colombia","SA","http://en.wikipedia.org/wiki/Colombia","Aeropuertos de Colombia" 49 | 302731,"CR","Costa Rica","NA","http://en.wikipedia.org/wiki/Costa_Rica","Aeropuertos de Costa Rica" 50 | 302732,"CU","Cuba","NA","http://en.wikipedia.org/wiki/Cuba","Aeropuertos de Cuba" 51 | 302566,"CV","Cape Verde","AF","http://en.wikipedia.org/wiki/Cape_Verde", 52 | 302762,"CW","Curaçao","NA","http://en.wikipedia.org/wiki/Cura%C3%A7ao", 53 | 302628,"CX","Christmas Island","AS","http://en.wikipedia.org/wiki/Christmas_Island", 54 | 302629,"CY","Cyprus","AS","http://en.wikipedia.org/wiki/Cyprus", 55 | 302680,"CZ","Czechia","EU","http://en.wikipedia.org/wiki/Czech_Republic","Letiště České republiky" 56 | 302681,"DE","Germany","EU","http://en.wikipedia.org/wiki/Germany","Flughäfen in Deutschland" 57 | 302567,"DJ","Djibouti","AF","http://en.wikipedia.org/wiki/Djibouti", 58 | 302682,"DK","Denmark","EU","http://en.wikipedia.org/wiki/Denmark","Lufthavnene i Danmark" 59 | 302733,"DM","Dominica","NA","http://en.wikipedia.org/wiki/Dominica", 60 | 302734,"DO","Dominican Republic","NA","http://en.wikipedia.org/wiki/Dominican_Republic", 61 | 302568,"DZ","Algeria","AF","http://en.wikipedia.org/wiki/Algeria","مطارات الجزائر" 62 | 302794,"EC","Ecuador","SA","http://en.wikipedia.org/wiki/Ecuador","Aeropuertos de Ecuador" 63 | 302683,"EE","Estonia","EU","http://en.wikipedia.org/wiki/Estonia", 64 | 302569,"EG","Egypt","AF","http://en.wikipedia.org/wiki/Egypt","مطارات مصر" 65 | 302570,"EH","Western Sahara","AF","http://en.wikipedia.org/wiki/Western_Sahara","Sahrawian, مطارات الصحراء الغربية" 66 | 302571,"ER","Eritrea","AF","http://en.wikipedia.org/wiki/Eritrea", 67 | 302684,"ES","Spain","EU","http://en.wikipedia.org/wiki/Spain","Aeropuertos de España" 68 | 302572,"ET","Ethiopia","AF","http://en.wikipedia.org/wiki/Ethiopia", 69 | 302685,"FI","Finland","EU","http://en.wikipedia.org/wiki/Finland","Lentokentät, Suomen" 70 | 302766,"FJ","Fiji","OC","http://en.wikipedia.org/wiki/Fiji", 71 | 302795,"FK","Falkland Islands","SA","http://en.wikipedia.org/wiki/Falkland_Islands", 72 | 302767,"FM","Micronesia","OC","https://en.wikipedia.org/wiki/Federated_States_of_Micronesia", 73 | 302686,"FO","Faroe 
Islands","EU","http://en.wikipedia.org/wiki/Faroe_Islands", 74 | 302687,"FR","France","EU","http://en.wikipedia.org/wiki/France","Aéroports de France" 75 | 302573,"GA","Gabon","AF","http://en.wikipedia.org/wiki/Gabon", 76 | 302688,"GB","United Kingdom","EU","http://en.wikipedia.org/wiki/United_Kingdom","Great Britain" 77 | 302735,"GD","Grenada","NA","http://en.wikipedia.org/wiki/Grenada", 78 | 302630,"GE","Georgia","AS","http://en.wikipedia.org/wiki/Georgia_(country)", 79 | 302796,"GF","French Guiana","SA","http://en.wikipedia.org/wiki/French_Guiana","French Guyana" 80 | 302689,"GG","Guernsey","EU","http://en.wikipedia.org/wiki/Guernsey", 81 | 302574,"GH","Ghana","AF","http://en.wikipedia.org/wiki/Ghana", 82 | 302690,"GI","Gibraltar","EU","http://en.wikipedia.org/wiki/Gibraltar", 83 | 302736,"GL","Greenland","NA","http://en.wikipedia.org/wiki/Greenland", 84 | 302575,"GM","Gambia","AF","http://en.wikipedia.org/wiki/Gambia", 85 | 302576,"GN","Guinea","AF","http://en.wikipedia.org/wiki/Guinea","Aéroports de la Guinée" 86 | 302737,"GP","Guadeloupe","NA","http://en.wikipedia.org/wiki/Guadeloupe", 87 | 302577,"GQ","Equatorial Guinea","AF","http://en.wikipedia.org/wiki/Equatorial_Guinea", 88 | 302691,"GR","Greece","EU","http://en.wikipedia.org/wiki/Greece","αεροδρόμια στην Ελλάδα" 89 | 302616,"GS","South Georgia and the South Sandwich Islands","AN","http://en.wikipedia.org/wiki/South_Georgia_and_the_South_Sandwich_Islands", 90 | 302738,"GT","Guatemala","NA","http://en.wikipedia.org/wiki/Guatemala","Aeropuertos de Guatemala" 91 | 302768,"GU","Guam","OC","http://en.wikipedia.org/wiki/Guam", 92 | 302578,"GW","Guinea-Bissau","AF","http://en.wikipedia.org/wiki/Guinea-Bissau", 93 | 302797,"GY","Guyana","SA","http://en.wikipedia.org/wiki/Guyana", 94 | 302631,"HK","Hong Kong","AS","http://en.wikipedia.org/wiki/Hong_Kong", 95 | 302739,"HN","Honduras","NA","http://en.wikipedia.org/wiki/Honduras","Aeropuertos de Honduras" 96 | 302692,"HR","Croatia","EU","http://en.wikipedia.org/wiki/Croatia", 97 | 302740,"HT","Haiti","NA","http://en.wikipedia.org/wiki/Haiti","Aéroports de Haïti" 98 | 302693,"HU","Hungary","EU","http://en.wikipedia.org/wiki/Hungary","Repülőterek Magyarország" 99 | 302632,"ID","Indonesia","AS","http://en.wikipedia.org/wiki/Indonesia","Bandara di Indonesia" 100 | 302694,"IE","Ireland","EU","http://en.wikipedia.org/wiki/Ireland","Eire" 101 | 302633,"IL","Israel","AS","http://en.wikipedia.org/wiki/Israel","שדות התעופה של ישראל" 102 | 302695,"IM","Isle of Man","EU","http://en.wikipedia.org/wiki/Isle_of_Man", 103 | 302634,"IN","India","AS","http://en.wikipedia.org/wiki/India", 104 | 302635,"IO","British Indian Ocean Territory","AS","http://en.wikipedia.org/wiki/British_Indian_Ocean_Territory", 105 | 302636,"IQ","Iraq","AS","http://en.wikipedia.org/wiki/Iraq","مطارات العراق" 106 | 302637,"IR","Iran","AS","http://en.wikipedia.org/wiki/Iran","فرودگاه های ایران" 107 | 302696,"IS","Iceland","EU","http://en.wikipedia.org/wiki/Iceland", 108 | 302697,"IT","Italy","EU","http://en.wikipedia.org/wiki/Italy","Aeroporti d'Italia" 109 | 302698,"JE","Jersey","EU","http://en.wikipedia.org/wiki/Jersey", 110 | 302741,"JM","Jamaica","NA","http://en.wikipedia.org/wiki/Jamaica", 111 | 302638,"JO","Jordan","AS","http://en.wikipedia.org/wiki/Jordan","مطارات في الأردن" 112 | 302639,"JP","Japan","AS","http://en.wikipedia.org/wiki/Japan","Nippon, 日本の空港" 113 | 302579,"KE","Kenya","AF","http://en.wikipedia.org/wiki/Kenya", 114 | 302640,"KG","Kyrgyzstan","AS","http://en.wikipedia.org/wiki/Kyrgyzstan", 115 | 
302641,"KH","Cambodia","AS","http://en.wikipedia.org/wiki/Cambodia", 116 | 302769,"KI","Kiribati","OC","http://en.wikipedia.org/wiki/Kiribati", 117 | 302580,"KM","Comoros","AF","http://en.wikipedia.org/wiki/Comoros","جزر القمر" 118 | 302742,"KN","Saint Kitts and Nevis","NA","http://en.wikipedia.org/wiki/Saint_Kitts_and_Nevis", 119 | 302642,"KP","North Korea","AS","http://en.wikipedia.org/wiki/North_Korea", 120 | 302643,"KR","South Korea","AS","http://en.wikipedia.org/wiki/South_Korea","한국의 공항" 121 | 302644,"KW","Kuwait","AS","http://en.wikipedia.org/wiki/Kuwait", 122 | 302743,"KY","Cayman Islands","NA","http://en.wikipedia.org/wiki/Cayman_Islands", 123 | 302645,"KZ","Kazakhstan","AS","http://en.wikipedia.org/wiki/Kazakhstan","Kazakh" 124 | 302646,"LA","Laos","AS","http://en.wikipedia.org/wiki/Laos", 125 | 302647,"LB","Lebanon","AS","http://en.wikipedia.org/wiki/Lebanon","المطارات في لبنان" 126 | 302744,"LC","Saint Lucia","NA","http://en.wikipedia.org/wiki/Saint_Lucia", 127 | 302699,"LI","Liechtenstein","EU","http://en.wikipedia.org/wiki/Liechtenstein", 128 | 302648,"LK","Sri Lanka","AS","http://en.wikipedia.org/wiki/Sri_Lanka", 129 | 302581,"LR","Liberia","AF","http://en.wikipedia.org/wiki/Liberia", 130 | 302582,"LS","Lesotho","AF","http://en.wikipedia.org/wiki/Lesotho", 131 | 302700,"LT","Lithuania","EU","http://en.wikipedia.org/wiki/Lithuania", 132 | 302701,"LU","Luxembourg","EU","http://en.wikipedia.org/wiki/Luxembourg", 133 | 302702,"LV","Latvia","EU","http://en.wikipedia.org/wiki/Latvia", 134 | 302583,"LY","Libya","AF","http://en.wikipedia.org/wiki/Libya","مطارات في ليبيا" 135 | 302584,"MA","Morocco","AF","http://en.wikipedia.org/wiki/Morocco","مطارات المغرب" 136 | 302703,"MC","Monaco","EU","http://en.wikipedia.org/wiki/Monaco", 137 | 302704,"MD","Moldova","EU","http://en.wikipedia.org/wiki/Moldova", 138 | 302705,"ME","Montenegro","EU","http://en.wikipedia.org/wiki/Montenegro", 139 | 302759,"MF","Saint Martin","NA","http://en.wikipedia.org/wiki/Saint_Martin_(France)", 140 | 302585,"MG","Madagascar","AF","http://en.wikipedia.org/wiki/Madagascar", 141 | 302770,"MH","Marshall Islands","OC","http://en.wikipedia.org/wiki/Marshall_Islands", 142 | 302706,"MK","Macedonia","EU","http://en.wikipedia.org/wiki/Macedonia", 143 | 302586,"ML","Mali","AF","http://en.wikipedia.org/wiki/Mali","Aéroports du Mali" 144 | 302649,"MM","Burma","AS","http://en.wikipedia.org/wiki/Burma","Myanmar" 145 | 302650,"MN","Mongolia","AS","http://en.wikipedia.org/wiki/Mongolia", 146 | 302651,"MO","Macau","AS","http://en.wikipedia.org/wiki/Macau","Macao" 147 | 302771,"MP","Northern Mariana Islands","OC","http://en.wikipedia.org/wiki/Northern_Mariana_Islands", 148 | 302745,"MQ","Martinique","NA","http://en.wikipedia.org/wiki/Martinique", 149 | 302587,"MR","Mauritania","AF","http://en.wikipedia.org/wiki/Mauritania","مطارات موريتانيا" 150 | 302746,"MS","Montserrat","NA","http://en.wikipedia.org/wiki/Montserrat", 151 | 302707,"MT","Malta","EU","http://en.wikipedia.org/wiki/Malta", 152 | 302588,"MU","Mauritius","AF","http://en.wikipedia.org/wiki/Mauritius", 153 | 302652,"MV","Maldives","AS","http://en.wikipedia.org/wiki/Maldives", 154 | 302589,"MW","Malawi","AF","http://en.wikipedia.org/wiki/Malawi", 155 | 302747,"MX","Mexico","NA","http://en.wikipedia.org/wiki/Mexico","Aeropuertos de México" 156 | 302653,"MY","Malaysia","AS","http://en.wikipedia.org/wiki/Malaysia","Lapangan Terbang Malaysia" 157 | 302590,"MZ","Mozambique","AF","http://en.wikipedia.org/wiki/Mozambique", 158 | 
302591,"NA","Namibia","AF","http://en.wikipedia.org/wiki/Namibia", 159 | 302772,"NC","New Caledonia","OC","http://en.wikipedia.org/wiki/New_Caledonia", 160 | 302592,"NE","Niger","AF","http://en.wikipedia.org/wiki/Niger", 161 | 302773,"NF","Norfolk Island","OC","http://en.wikipedia.org/wiki/Norfolk_Island", 162 | 302593,"NG","Nigeria","AF","http://en.wikipedia.org/wiki/Nigeria", 163 | 302748,"NI","Nicaragua","NA","http://en.wikipedia.org/wiki/Nicaragua","Aeropuertos de Nicaragua" 164 | 302708,"NL","Netherlands","EU","http://en.wikipedia.org/wiki/Netherlands","Holland,Luchthavens van Nederland" 165 | 302709,"NO","Norway","EU","http://en.wikipedia.org/wiki/Norway","Flyplasser i Norge" 166 | 302654,"NP","Nepal","AS","http://en.wikipedia.org/wiki/Nepal","नेपाल विमानस्थलको" 167 | 302774,"NR","Nauru","OC","http://en.wikipedia.org/wiki/Nauru", 168 | 302775,"NU","Niue","OC","http://en.wikipedia.org/wiki/Niue", 169 | 302776,"NZ","New Zealand","OC","http://en.wikipedia.org/wiki/New_Zealand", 170 | 302655,"OM","Oman","AS","http://en.wikipedia.org/wiki/Oman","مطارات عمان" 171 | 302749,"PA","Panama","NA","http://en.wikipedia.org/wiki/Panama","Aeropuertos de Panamá" 172 | 302798,"PE","Perú","SA","http://en.wikipedia.org/wiki/Perú","Aeropuertos de Perú" 173 | 302777,"PF","French Polynesia","OC","http://en.wikipedia.org/wiki/French_Polynesia", 174 | 302778,"PG","Papua New Guinea","OC","http://en.wikipedia.org/wiki/Papua_New_Guinea", 175 | 302656,"PH","Philippines","AS","http://en.wikipedia.org/wiki/Philippines","Mga alternatibong byahe mula sa Pilipinas" 176 | 302657,"PK","Pakistan","AS","http://en.wikipedia.org/wiki/Pakistan","پاکستان کے ہوائی اڈوں" 177 | 302710,"PL","Poland","EU","http://en.wikipedia.org/wiki/Poland","Lotniska Polski" 178 | 302750,"PM","Saint Pierre and Miquelon","NA","http://en.wikipedia.org/wiki/Saint_Pierre_and_Miquelon", 179 | 302779,"PN","Pitcairn","OC","http://en.wikipedia.org/wiki/Pitcairn", 180 | 302751,"PR","Puerto Rico","NA","http://en.wikipedia.org/wiki/Puerto_Rico", 181 | 302658,"PS","Palestinian Territory","AS","http://en.wikipedia.org/wiki/Palestinian_Territory", 182 | 302711,"PT","Portugal","EU","http://en.wikipedia.org/wiki/Portugal","Aeroportos do Brasil" 183 | 302780,"PW","Palau","OC","http://en.wikipedia.org/wiki/Palau", 184 | 302799,"PY","Paraguay","SA","http://en.wikipedia.org/wiki/Paraguay","Aeropuertos de Paraguay" 185 | 302659,"QA","Qatar","AS","http://en.wikipedia.org/wiki/Qatar","مطارات قطر" 186 | 302594,"RE","Réunion","AF","http://en.wikipedia.org/wiki/Réunion","Île Bourbon, La Réunion" 187 | 302712,"RO","Romania","EU","http://en.wikipedia.org/wiki/Romania","Aeroporturi din România" 188 | 302713,"RS","Serbia","EU","http://en.wikipedia.org/wiki/Serbia","Serb" 189 | 302714,"RU","Russia","EU","http://en.wikipedia.org/wiki/Russia","Soviet, Sovietskaya, Sovetskaya, Аэропорты России" 190 | 302595,"RW","Rwanda","AF","http://en.wikipedia.org/wiki/Rwanda", 191 | 302660,"SA","Saudi Arabia","AS","http://en.wikipedia.org/wiki/Saudi_Arabia","مطارات المملكة العربية السعودية,المطارات لموسم الحج" 192 | 302781,"SB","Solomon Islands","OC","http://en.wikipedia.org/wiki/Solomon_Islands", 193 | 302596,"SC","Seychelles","AF","http://en.wikipedia.org/wiki/Seychelles", 194 | 302597,"SD","Sudan","AF","http://en.wikipedia.org/wiki/Sudan","مطارات السودان" 195 | 302715,"SE","Sweden","EU","http://en.wikipedia.org/wiki/Sweden","Flygplatserna i Sverige" 196 | 302661,"SG","Singapore","AS","http://en.wikipedia.org/wiki/Singapore", 197 | 302598,"SH","Saint 
Helena","AF","http://en.wikipedia.org/wiki/Saint_Helena", 198 | 302716,"SI","Slovenia","EU","http://en.wikipedia.org/wiki/Slovenia", 199 | 302717,"SK","Slovakia","EU","http://en.wikipedia.org/wiki/Slovakia","letisko Slovenska" 200 | 302599,"SL","Sierra Leone","AF","http://en.wikipedia.org/wiki/Sierra_Leone", 201 | 302718,"SM","San Marino","EU","http://en.wikipedia.org/wiki/San_Marino", 202 | 302600,"SN","Senegal","AF","http://en.wikipedia.org/wiki/Senegal","Aéroports du Sénégal" 203 | 302601,"SO","Somalia","AF","http://en.wikipedia.org/wiki/Somalia", 204 | 302800,"SR","Suriname","SA","http://en.wikipedia.org/wiki/Suriname", 205 | 302614,"SS","South Sudan","AF","http://en.wikipedia.org/wiki/South_Sudan", 206 | 302602,"ST","São Tomé and Principe","AF","http://en.wikipedia.org/wiki/São_Tomé_and_Principe", 207 | 302752,"SV","El Salvador","NA","http://en.wikipedia.org/wiki/El_Salvador","Salvadorian, Salvadorean" 208 | 302761,"SX","Sint Maarten","NA","http://en.wikipedia.org/wiki/Sint_Maarten", 209 | 302662,"SY","Syria","AS","http://en.wikipedia.org/wiki/Syria","مطارات سوريا" 210 | 302603,"SZ","Swaziland","AF","http://en.wikipedia.org/wiki/Swaziland", 211 | 302753,"TC","Turks and Caicos Islands","NA","http://en.wikipedia.org/wiki/Turks_and_Caicos_Islands", 212 | 302604,"TD","Chad","AF","http://en.wikipedia.org/wiki/Chad", 213 | 302617,"TF","French Southern Territories","AN","http://en.wikipedia.org/wiki/French_Southern_Territories", 214 | 302605,"TG","Togo","AF","http://en.wikipedia.org/wiki/Togo", 215 | 302663,"TH","Thailand","AS","http://en.wikipedia.org/wiki/Thailand","Siam, Siamese" 216 | 302664,"TJ","Tajikistan","AS","http://en.wikipedia.org/wiki/Tajikistan","Tajik" 217 | 302782,"TK","Tokelau","OC","http://en.wikipedia.org/wiki/Tokelau", 218 | 302665,"TL","Timor-Leste","AS","http://en.wikipedia.org/wiki/Timor-Leste","East Timor" 219 | 302666,"TM","Turkmenistan","AS","http://en.wikipedia.org/wiki/Turkmenistan", 220 | 302606,"TN","Tunisia","AF","http://en.wikipedia.org/wiki/Tunisia","مطارات تونس" 221 | 302783,"TO","Tonga","OC","http://en.wikipedia.org/wiki/Tonga", 222 | 302667,"TR","Turkey","AS","http://en.wikipedia.org/wiki/Turkey","Türkiye havaalanları" 223 | 302754,"TT","Trinidad and Tobago","NA","http://en.wikipedia.org/wiki/Trinidad_and_Tobago", 224 | 302784,"TV","Tuvalu","OC","http://en.wikipedia.org/wiki/Tuvalu", 225 | 302668,"TW","Taiwan","AS","http://en.wikipedia.org/wiki/Taiwan", 226 | 302607,"TZ","Tanzania","AF","http://en.wikipedia.org/wiki/Tanzania", 227 | 302719,"UA","Ukraine","EU","http://en.wikipedia.org/wiki/Ukraine","Аеропорти України" 228 | 302608,"UG","Uganda","AF","http://en.wikipedia.org/wiki/Uganda", 229 | 302785,"UM","United States Minor Outlying Islands","OC","http://en.wikipedia.org/wiki/United_States_Minor_Outlying_Islands", 230 | 302755,"US","United States","NA","http://en.wikipedia.org/wiki/United_States","America" 231 | 302801,"UY","Uruguay","SA","http://en.wikipedia.org/wiki/Uruguay","Aeropuertos de Uruguay" 232 | 302669,"UZ","Uzbekistan","AS","http://en.wikipedia.org/wiki/Uzbekistan","Uzbek" 233 | 302721,"VA","Vatican City","EU","http://en.wikipedia.org/wiki/Vatican_City","The Holy See" 234 | 302756,"VC","Saint Vincent and the Grenadines","NA","http://en.wikipedia.org/wiki/Saint_Vincent_and_the_Grenadines", 235 | 302802,"VE","Venezuela","SA","http://en.wikipedia.org/wiki/Venezuela","Aeropuertos de Venezuela" 236 | 302757,"VG","British Virgin Islands","NA","http://en.wikipedia.org/wiki/British_Virgin_Islands", 237 | 302758,"VI","U.S. 
Virgin Islands","NA","http://en.wikipedia.org/wiki/U.S._Virgin_Islands", 238 | 302670,"VN","Vietnam","AS","http://en.wikipedia.org/wiki/Vietnam","Các sân bay của Việt Nam" 239 | 302786,"VU","Vanuatu","OC","http://en.wikipedia.org/wiki/Vanuatu", 240 | 302787,"WF","Wallis and Futuna","OC","http://en.wikipedia.org/wiki/Wallis_and_Futuna", 241 | 302788,"WS","Samoa","OC","http://en.wikipedia.org/wiki/Samoa", 242 | 302720,"XK","Kosovo","EU","http://en.wikipedia.org/wiki/Kosovo","Kosova" 243 | 302671,"YE","Yemen","AS","http://en.wikipedia.org/wiki/Yemen","مطارات اليمن" 244 | 302609,"YT","Mayotte","AF","http://en.wikipedia.org/wiki/Mayotte", 245 | 302610,"ZA","South Africa","AF","http://en.wikipedia.org/wiki/South_Africa", 246 | 302611,"ZM","Zambia","AF","http://en.wikipedia.org/wiki/Zambia", 247 | 302612,"ZW","Zimbabwe","AF","http://en.wikipedia.org/wiki/Zimbabwe", 248 | 302613,"ZZ","Unknown or unassigned country","AF","http://en.wikipedia.org/wiki/Unknown_or_unassigned_country", 249 | -------------------------------------------------------------------------------- /2018/sqlpandas/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Data from http://ourairports.com/data/ 4 | 5 | wget -r -A "*.csv" -I "data" -nH -e robots=off http://ourairports.com/ 6 | -------------------------------------------------------------------------------- /2018/sqlpandas/images/by_country.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/by_country.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/by_country_top10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/by_country_top10.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/having1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/having1.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/having2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/having2.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/notebook.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/runways.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways.png -------------------------------------------------------------------------------- /2018/sqlpandas/images/runways_agg1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways_agg1.png
--------------------------------------------------------------------------------
/2018/sqlpandas/images/runways_agg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2018/sqlpandas/images/runways_agg2.png
--------------------------------------------------------------------------------
/2018/windows/README.md:
--------------------------------------------------------------------------------
1 | # Window functions
2 | 
3 | Sample "social shares" data to use with window functions.
4 | 
5 | The IPython notebook has examples of Pandas and Spark SQL
6 | queries (PySpark). A minimal pandas sketch follows the data files below.
7 | 
--------------------------------------------------------------------------------
/2018/windows/social_deltas.csv:
--------------------------------------------------------------------------------
1 | url,ts,service,delta
2 | url1,2018-08-15 00:00:00,tw,1
3 | url1,2018-08-15 00:05:00,tw,3
4 | url1,2018-08-15 00:11:00,tw,1
5 | url1,2018-08-15 00:18:00,tw,3
6 | url1,2018-08-15 00:21:00,tw,4
7 | url1,2018-08-15 00:30:00,tw,13
8 | url1,2018-08-15 00:35:00,tw,16
9 | url1,2018-08-15 00:38:00,tw,4
10 | url1,2018-08-15 00:41:00,tw,14
11 | url1,2018-08-15 00:00:00,fb,5
12 | url1,2018-08-15 00:05:00,fb,15
13 | url1,2018-08-15 00:11:00,fb,11
14 | url1,2018-08-15 00:18:00,fb,14
15 | url1,2018-08-15 00:21:00,fb,14
16 | url1,2018-08-15 00:30:00,fb,8
17 | url1,2018-08-15 00:35:00,fb,43
18 | url1,2018-08-15 00:38:00,fb,120
19 | url1,2018-08-15 00:41:00,fb,130
20 | url2,2018-08-15 00:00:00,tw,1
21 | url2,2018-08-15 00:05:00,tw,6
22 | url2,2018-08-15 00:07:00,tw,13
23 | url2,2018-08-15 00:15:00,tw,80
24 | url2,2018-08-15 00:19:00,tw,455
25 | url2,2018-08-15 00:26:00,tw,645
26 | url2,2018-08-15 00:00:00,fb,1
27 | url2,2018-08-15 00:05:00,fb,2
28 | url2,2018-08-15 00:07:00,fb,7
29 | url2,2018-08-15 00:15:00,fb,6
30 | url2,2018-08-15 00:19:00,fb,9
31 | url2,2018-08-15 00:26:00,fb,13
32 | 
--------------------------------------------------------------------------------
/2018/windows/social_totals.csv:
--------------------------------------------------------------------------------
1 | url,ts,service,total
2 | url1,2018-08-15 00:00:00,tw,1
3 | url1,2018-08-15 00:05:00,tw,4
4 | url1,2018-08-15 00:11:00,tw,5
5 | url1,2018-08-15 00:18:00,tw,8
6 | url1,2018-08-15 00:21:00,tw,12
7 | url1,2018-08-15 00:30:00,tw,25
8 | url1,2018-08-15 00:35:00,tw,41
9 | url1,2018-08-15 00:38:00,tw,45
10 | url1,2018-08-15 00:41:00,tw,59
11 | url1,2018-08-15 00:00:00,fb,5
12 | url1,2018-08-15 00:05:00,fb,20
13 | url1,2018-08-15 00:11:00,fb,31
14 | url1,2018-08-15 00:18:00,fb,45
15 | url1,2018-08-15 00:21:00,fb,59
16 | url1,2018-08-15 00:30:00,fb,67
17 | url1,2018-08-15 00:35:00,fb,110
18 | url1,2018-08-15 00:38:00,fb,230
19 | url1,2018-08-15 00:41:00,fb,360
20 | url2,2018-08-15 00:00:00,tw,1
21 | url2,2018-08-15 00:05:00,tw,7
22 | url2,2018-08-15 00:07:00,tw,20
23 | url2,2018-08-15 00:15:00,tw,100
24 | url2,2018-08-15 00:19:00,tw,555
25 | url2,2018-08-15 00:26:00,tw,1200
26 | url2,2018-08-15 00:00:00,fb,1
27 | url2,2018-08-15 00:05:00,fb,3
28 | url2,2018-08-15 00:07:00,fb,10
29 | url2,2018-08-15 00:15:00,fb,16
30 | url2,2018-08-15 00:19:00,fb,25
31 | url2,2018-08-15 00:26:00,fb,38
32 | 
--------------------------------------------------------------------------------
/2018/windows/social_totals_agg.csv:
--------------------------------------------------------------------------------
1 | url,service,total
2 | url1,tw,5
3 | url2,tw,8
4 | url3,tw,12
5 | url4,tw,100
6 | url5,tw,175
7 | url6,tw,25
8 | url7,tw,80
9 | url8,tw,35
10 | url9,tw,150
11 | url10,tw,260
12 | url1,fb,210
13 | url2,fb,370
14 | url3,fb,500
15 | url4,fb,20
16 | url5,fb,300
17 | url6,fb,95
18 | url7,fb,150
19 | url8,fb,47
20 | url9,fb,28
21 | url10,fb,5
22 | 
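The first two CSVs are related: social_deltas.csv holds the per-interval changes of the running totals in social_totals.csv, within each (url, service) series. A minimal pandas sketch of that window-style calculation (illustrative, not taken from the notebook; it assumes social_totals.csv is in the working directory):

```python
# Sketch: derive the deltas from the running totals with a partitioned
# lag -- the pandas analogue of SQL's
#   total - LAG(total) OVER (PARTITION BY url, service ORDER BY ts)
import pandas as pd

totals = pd.read_csv("social_totals.csv", parse_dates=["ts"])
totals = totals.sort_values(["url", "service", "ts"])

# diff() runs within each (url, service) group; the first row of a group
# has no previous total, so its delta is the total itself.
totals["delta"] = (
    totals.groupby(["url", "service"])["total"]
    .diff()
    .fillna(totals["total"])
)
print(totals)
```

Run against the file above, the computed delta column matches social_deltas.csv value for value.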
--------------------------------------------------------------------------------
/2019/pandasdb/read_csv_file.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | df = pd.read_csv("sample_file.csv")
5 | 
--------------------------------------------------------------------------------
/2019/sparkstart/context.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext, SparkConf
2 | from pyspark.sql import SQLContext
3 | 
4 | 
5 | def get_spark_context():
6 |     conf = SparkConf()
7 |     extra_settings = {
8 |         "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
9 |         "spark.executor.extraJavaOptions": "-XX:+UseG1GC",
10 |         "spark.default.parallelism": 200,
11 |     }
12 |     conf.setAll(extra_settings.items())
13 |     environment = {"PYTHON_EGG_CACHE": "/tmp/python-eggs"}
14 |     sc = SparkContext(conf=conf, environment=environment)
15 |     return sc
16 | 
--------------------------------------------------------------------------------
/2019/sparkstart/driver.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext, SparkConf
2 | from pyspark.sql import SQLContext
3 | 
4 | 
5 | def get_spark_context():
6 |     pass  # build and return a SparkContext here; see context.py for an example
7 | 
8 | 
9 | def read_input_data(sc, sqlContext, **kwargs):
10 |     pass
11 | 
12 | 
13 | def transform_or_aggregate(sc, sqlContext, df, **kwargs):
14 |     pass
15 | 
16 | 
17 | def save_output_data(df_out, **kwargs):
18 |     pass
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     with get_spark_context() as sc:
23 |         sqlContext = SQLContext(sc)
24 |         df_in = read_input_data(sc, sqlContext)
25 |         df_out = transform_or_aggregate(sc, sqlContext, df_in)
26 |         save_output_data(df_out)
27 | 
--------------------------------------------------------------------------------
/2019/sparkstart/runner.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | spark-submit \
4 |     --master yarn \
5 |     --deploy-mode client \
6 |     --driver-memory 8g \
7 |     --executor-memory 3g \
8 |     --num-executors 4 \
9 |     --executor-cores 4 \
10 |     --conf "spark.yarn.executor.memoryOverhead=2g" \
11 |     --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:///home/hadoop/log4j.properties" \
12 |     --py-files my_custom_package.egg \
13 |     --jars my_extra_java_lib.jar \
14 |     driver.py
15 | 
--------------------------------------------------------------------------------
/2021/covid-travel/.gitignore:
--------------------------------------------------------------------------------
1 | COVID-19_Case_Surveillance_Public_Use_Data.csv
2 | 
3 | 
--------------------------------------------------------------------------------
/2021/covid-travel/README.md:
--------------------------------------------------------------------------------
1 | https://www.washingtonpost.com/opinions/2021/03/15/flying-safer-than-driving-pandemic/
2 | 
3 | https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf
4 | 
5 | 
https://www.medrxiv.org/content/10.1101/2020.07.02.20143826v4.full-text 6 | 7 | -------------------------------------------------------------------------------- /2021/covid-travel/all_by_age_race.csv: -------------------------------------------------------------------------------- 1 | ,age_group,race_ethnicity,cases,deaths,prob_death,prob_death_full,prob_death_middle,odds_full,odds_middle 2 | 0,0 - 9 Years,"American Indian/Alaska Native, Non-Hispanic",8926,2,4.870968056191488e-05,1.2400631507613324e-08,7.583633903457413e-09,"1 in 80,641,054","1 in 131,862,905" 3 | 1,0 - 9 Years,"Black, Non-Hispanic",64642,21,7.062308392847295e-05,1.7979400185452858e-08,1.099534235224373e-08,"1 in 55,619,208","1 in 90,947,600" 4 | 2,0 - 9 Years,Hispanic/Latino,136842,29,4.607026955238126e-05,1.1728683694595735e-08,7.172702717168578e-09,"1 in 85,261,060","1 in 139,417,461" 5 | 3,0 - 9 Years,"Multiple/Other, Non-Hispanic",46751,6,2.7899891469422186e-05,7.102823693843331e-09,4.343747698803931e-09,"1 in 140,789,078","1 in 230,215,949" 6 | 4,0 - 9 Years,"White, Non-Hispanic",221859,44,4.31139480088901e-05,1.0976056010407707e-08,6.71243157541384e-09,"1 in 91,107,407","1 in 148,977,310" 7 | 5,0 - 9 Years,,397171,23,1.25890359568045e-05,3.2049480541756573e-09,1.9599931428931117e-09,"1 in 312,017,538","1 in 510,205,867" 8 | 6,10 - 19 Years,"American Indian/Alaska Native, Non-Hispanic",18740,4,4.6401559092385505e-05,1.1813024208853417e-08,7.224281347092942e-09,"1 in 84,652,328","1 in 138,422,073" 9 | 7,10 - 19 Years,"Asian, Non-Hispanic",35733,9,5.4753917642807345e-05,1.3939388401933908e-08,8.524664120005986e-09,"1 in 71,739,159","1 in 117,306,675" 10 | 8,10 - 19 Years,"Black, Non-Hispanic",129927,48,8.031265717437987e-05,2.0446195818320723e-08,1.2503916732736544e-08,"1 in 48,908,854","1 in 79,974,941" 11 | 9,10 - 19 Years,Hispanic/Latino,282124,56,4.315093024158973e-05,1.0985471039099057e-08,6.718189357244702e-09,"1 in 91,029,324","1 in 148,849,630" 12 | 10,10 - 19 Years,"Multiple/Other, Non-Hispanic",94194,16,3.6926565063222896e-05,9.40085668615466e-09,5.749114909421905e-09,"1 in 106,373,284","1 in 173,939,818" 13 | 11,10 - 19 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",4484,2,9.696311523096615e-05,2.4685110802175854e-08,1.5096234661521158e-08,"1 in 40,510,250","1 in 66,241,684" 14 | 12,10 - 19 Years,"White, Non-Hispanic",649931,67,2.241040570661247e-05,5.705296768484722e-09,3.4890869853041925e-09,"1 in 175,275,720","1 in 286,607,930" 15 | 13,10 - 19 Years,,957080,59,1.3401269440926296e-05,3.411728472740228e-09,2.0864501698465076e-09,"1 in 293,106,561","1 in 479,282,954" 16 | 14,20 - 29 Years,"American Indian/Alaska Native, Non-Hispanic",26042,41,0.00034225648868216227,8.713250730195871e-08,5.328607950834605e-08,"1 in 11,476,773","1 in 18,766,627" 17 | 15,20 - 29 Years,"Asian, Non-Hispanic",83841,36,9.334438945768465e-05,2.3763846603275454e-08,1.4532833482433362e-08,"1 in 42,080,729","1 in 68,809,706" 18 | 16,20 - 29 Years,"Black, Non-Hispanic",253353,332,0.00028487490988256806,7.252416239371464e-08,4.43523135423512e-08,"1 in 13,788,508","1 in 22,546,738" 19 | 17,20 - 29 Years,Hispanic/Latino,463305,315,0.0001478038459968384,3.7628270365786365e-08,2.3011652809717327e-08,"1 in 26,575,763","1 in 43,456,244" 20 | 18,20 - 29 Years,"Multiple/Other, Non-Hispanic",159161,63,8.604904577071671e-05,2.1906579880524313e-08,1.3397017868706637e-08,"1 in 45,648,385","1 in 74,643,477" 21 | 19,20 - 29 Years,"Native Hawaiian/Other Pacific Islander, 
Non-Hispanic",8069,7,0.0001885907956915086,4.801191336340908e-08,2.9361792883617926e-08,"1 in 20,828,164","1 in 34,057,866" 22 | 20,20 - 29 Years,"White, Non-Hispanic",1095381,352,6.985855983482896e-05,1.778476574206084e-08,1.0876313223543143e-08,"1 in 56,227,898","1 in 91,942,920" 23 | 21,20 - 29 Years,,1745942,451,5.615506028314204e-05,1.4296094776764462e-08,8.742808700472196e-09,"1 in 69,949,173","1 in 114,379,719" 24 | 22,30 - 39 Years,"American Indian/Alaska Native, Non-Hispanic",25721,110,0.000929708933488623,2.366876103585634e-07,1.4474683691241974e-07,"1 in 4,224,978","1 in 6,908,614" 25 | 23,30 - 39 Years,"Asian, Non-Hispanic",80384,139,0.00037591300886181115,9.570086783649083e-08,5.852607953631482e-08,"1 in 10,449,226","1 in 17,086,400" 26 | 24,30 - 39 Years,"Black, Non-Hispanic",248562,778,0.0006804356047288351,1.732269869472248e-07,1.059373508841234e-07,"1 in 5,772,773","1 in 9,439,541" 27 | 25,30 - 39 Years,Hispanic/Latino,427640,910,0.00046259958599878813,1.177697520363278e-07,7.202235497411069e-08,"1 in 8,491,145","1 in 13,884,578" 28 | 26,30 - 39 Years,"Multiple/Other, Non-Hispanic",139344,154,0.00024025620672268072,6.11650220780633e-08,3.740560590419456e-08,"1 in 16,349,213","1 in 26,733,961" 29 | 27,30 - 39 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",7662,23,0.0006525711302531977,1.6613317979966068e-07,1.015991172743409e-07,"1 in 6,019,267","1 in 9,842,605" 30 | 28,30 - 39 Years,"White, Non-Hispanic",912079,831,0.0001980663669627779,5.042422784183765e-08,3.083704919239385e-08,"1 in 19,831,737","1 in 32,428,524" 31 | 29,30 - 39 Years,,1497278,1260,0.0001829406719916147,4.6573490832886485e-08,2.848212237141289e-08,"1 in 21,471,442","1 in 35,109,743" 32 | 30,40 - 49 Years,"American Indian/Alaska Native, Non-Hispanic",21048,237,0.002447821129344108,6.231723852707787e-07,3.811024644782361e-07,"1 in 1,604,692","1 in 2,623,966" 33 | 31,40 - 49 Years,"Asian, Non-Hispanic",66604,336,0.00109668305598567,2.7919629734863815e-07,1.70743119412345e-07,"1 in 3,581,709","1 in 5,856,751" 34 | 32,40 - 49 Years,"Black, Non-Hispanic",215371,1773,0.001789631763834015,4.5560890117964793e-07,2.7862864141892015e-07,"1 in 2,194,865","1 in 3,589,006" 35 | 33,40 - 49 Years,Hispanic/Latino,392976,2231,0.0012341720613981517,3.141985899689167e-07,1.9214884966494648e-07,"1 in 3,182,700","1 in 5,204,299" 36 | 34,40 - 49 Years,"Multiple/Other, Non-Hispanic",121013,350,0.0006287502708117238,1.600688062147653e-07,9.789043605972624e-08,"1 in 6,247,313","1 in 10,215,503" 37 | 35,40 - 49 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",6215,42,0.001469096505649026,3.74006238708944e-07,2.287243508716779e-07,"1 in 2,673,752","1 in 4,372,075" 38 | 36,40 - 49 Years,"White, Non-Hispanic",874428,2208,0.0005489302721321824,1.3974803262018143e-07,8.546322156812911e-08,"1 in 7,155,736","1 in 11,700,940" 39 | 37,40 - 49 Years,,1353455,3216,0.0005165524046108728,1.3150519465650954e-07,8.042229559563893e-08,"1 in 7,604,262","1 in 12,434,363" 40 | 38,50 - 59 Years,"American Indian/Alaska Native, Non-Hispanic",18692,432,0.005024237292866514,1.2790828138659645e-06,7.822259524934918e-07,"1 in 781,810","1 in 1,278,403" 41 | 39,50 - 59 Years,"Asian, Non-Hispanic",61310,851,0.003017452291632686,7.681905019429945e-07,4.6978861772259003e-07,"1 in 1,301,760","1 in 2,128,617" 42 | 40,50 - 59 Years,"Black, Non-Hispanic",213598,4624,0.004706117994102697,1.1980952123476975e-06,7.326978038458559e-07,"1 in 834,658","1 in 1,364,819" 43 | 41,50 - 59 
Years,Hispanic/Latino,291746,4805,0.0035803925928420766,9.115052425767211e-07,5.574330675449937e-07,"1 in 1,097,086","1 in 1,793,937" 44 | 42,50 - 59 Years,"Multiple/Other, Non-Hispanic",114298,993,0.0018886556651681685,4.8081865202846e-07,2.940457208730907e-07,"1 in 2,079,786","1 in 3,400,832" 45 | 43,50 - 59 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",4955,94,0.004124073180362392,1.049916797444391e-06,6.420789631576713e-07,"1 in 952,456","1 in 1,557,441" 46 | 44,50 - 59 Years,"White, Non-Hispanic",980669,7753,0.0017186581635686413,4.3754026567421016e-07,2.6757872700737385e-07,"1 in 2,285,504","1 in 3,737,218" 47 | 45,50 - 59 Years,,1288655,7977,0.0013456902233589352,3.4258916073285796e-07,2.0951116664467018e-07,"1 in 2,918,948","1 in 4,773,015" 48 | 46,60 - 69 Years,"American Indian/Alaska Native, Non-Hispanic",13145,709,0.011725403939338814,2.9850824692811784e-06,1.825533853236324e-06,"1 in 334,999","1 in 547,785" 49 | 47,60 - 69 Years,"Asian, Non-Hispanic",44214,2108,0.01036461006842216,2.638648184424666e-06,1.6136711923432159e-06,"1 in 378,982","1 in 619,705" 50 | 48,60 - 69 Years,"Black, Non-Hispanic",158846,9960,0.013630921718547196,3.4701939201997124e-06,2.1222048448614325e-06,"1 in 288,168","1 in 471,208" 51 | 49,60 - 69 Years,Hispanic/Latino,155256,7261,0.01016693886786704,2.5883245590287696e-06,1.5828956668013995e-06,"1 in 386,350","1 in 631,754" 52 | 50,60 - 69 Years,"Multiple/Other, Non-Hispanic",79797,2287,0.006230483765598685,1.5861720380848877e-06,9.700270536505796e-07,"1 in 630,449","1 in 1,030,899" 53 | 51,60 - 69 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",3140,197,0.013638881196344505,3.4722202638344646e-06,2.1234440598384382e-06,"1 in 288,000","1 in 470,933" 54 | 52,60 - 69 Years,"White, Non-Hispanic",809271,23005,0.00617974319668163,1.5732543779736237e-06,9.621272297494358e-07,"1 in 635,625","1 in 1,039,364" 55 | 53,60 - 69 Years,,849835,15493,0.003963173413969618,1.0089545351244462e-06,6.170284001197218e-07,"1 in 991,125","1 in 1,620,671" 56 | 54,70 - 79 Years,"American Indian/Alaska Native, Non-Hispanic",6257,716,0.024876486161585984,6.333117658243637e-06,3.873032253087645e-06,"1 in 157,900","1 in 258,196" 57 | 55,70 - 79 Years,"Asian, Non-Hispanic",21528,3025,0.030546669251773226,7.776646958189184e-06,4.755825821542622e-06,"1 in 128,590","1 in 210,268" 58 | 56,70 - 79 Years,"Black, Non-Hispanic",81625,11500,0.030627871362940276,7.797319593415007e-06,4.768468217800918e-06,"1 in 128,249","1 in 209,711" 59 | 57,70 - 79 Years,Hispanic/Latino,70704,8297,0.02551051782323367,6.494531014060225e-06,3.971744951460303e-06,"1 in 153,976","1 in 251,779" 60 | 58,70 - 79 Years,"Multiple/Other, Non-Hispanic",42299,3384,0.01739171549949274,4.427626145491156e-06,2.7077246612937747e-06,"1 in 225,855","1 in 369,314" 61 | 59,70 - 79 Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",1471,163,0.024088907279874682,6.132613869620637e-06,3.750413713198002e-06,"1 in 163,063","1 in 266,637" 62 | 60,70 - 79 Years,"White, Non-Hispanic",526749,46483,0.019183709888390865,4.883836529630099e-06,2.98672114096026e-06,"1 in 204,757","1 in 334,815" 63 | 61,70 - 79 Years,,421857,20629,0.01063053408475219,2.706347781250016e-06,1.6550730320333899e-06,"1 in 369,502","1 in 604,203" 64 | 62,80+ Years,"American Indian/Alaska Native, Non-Hispanic",2964,605,0.04437305638678637,1.1296602949790604e-05,6.908462772346125e-06,"1 in 88,522","1 in 144,750" 65 | 63,80+ Years,"Asian, Non-Hispanic",15805,5680,0.07812607457739021,1.9889530187722652e-05,1.2163486622665034e-05,"1 
in 50,278","1 in 82,213" 66 | 64,80+ Years,"Black, Non-Hispanic",49054,13183,0.05842274972922476,1.487340879053286e-05,9.09586637540328e-06,"1 in 67,234","1 in 109,940" 67 | 65,80+ Years,Hispanic/Latino,38416,9838,0.055672002607609156,1.4173116753461818e-05,8.667601215569066e-06,"1 in 70,556","1 in 115,372" 68 | 66,80+ Years,"Multiple/Other, Non-Hispanic",30084,6323,0.04569090604279034,1.163210438971008e-05,7.113639427492279e-06,"1 in 85,969","1 in 140,575" 69 | 67,80+ Years,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",593,136,0.04985702764132268,1.2692725977930933e-05,7.762264929365495e-06,"1 in 78,785","1 in 128,828" 70 | 68,80+ Years,"White, Non-Hispanic",413173,100420,0.052836063301834095,1.3451136278468311e-05,8.22607244306796e-06,"1 in 74,343","1 in 121,565" 71 | 69,80+ Years,,298756,37030,0.026945065538432705,6.859741735852168e-06,4.195090384310809e-06,"1 in 145,778","1 in 238,374" 72 | 70,,"Black, Non-Hispanic",14580,3,4.473072105922347e-05,1.1387658110797995e-08,6.964147759491858e-09,"1 in 87,814,368","1 in 143,592,588" 73 | 71,,Hispanic/Latino,1916,2,0.00022692202959063269,5.777037413202324e-08,3.5329601368612143e-08,"1 in 17,309,910","1 in 28,304,876" 74 | 72,,"Native Hawaiian/Other Pacific Islander, Non-Hispanic",66,1,0.003293807641633729,8.38545733613307e-07,5.128145168353096e-07,"1 in 1,192,541","1 in 1,950,023" 75 | 73,,"White, Non-Hispanic",43405,11,5.50928314209443e-05,1.402566991367955e-08,8.577429771280608e-09,"1 in 71,297,842","1 in 116,585,041" 76 | 74,,,96345,19,4.2871293607438855e-05,1.0914280449956792e-08,6.674652593403688e-09,"1 in 91,623,081","1 in 149,820,532" 77 | -------------------------------------------------------------------------------- /2021/covid-travel/covid_and_air_travel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2021/covid-travel/covid_and_air_travel.png -------------------------------------------------------------------------------- /2021/covid-travel/flight_infection_risk.csv: -------------------------------------------------------------------------------- 1 | seat,full_flight,middle_empty 2 | "Window (A/F)",0.0017699115044247787,0.0010869565217391304 3 | "Middle (B/E)",0.002061855670103093,N/A 4 | "Aisle (C/F)",0.0022172949002217295,0.0013774104683195593 5 | "Any seat",0.0002545824847250509,0.00015569048731122528 6 | -------------------------------------------------------------------------------- /2021/covid-travel/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2021/covid-travel/requirements.txt -------------------------------------------------------------------------------- /2021/covid-travel/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | pd.set_option("display.max_rows", None) 6 | sns.set_theme(style="ticks") 7 | 8 | df = pd.read_csv("all_by_age_race.csv") 9 | print(df.head()) 10 | print(df.dtypes) 11 | 12 | dff = ( 13 | df[["age_group", "race_ethnicity", "prob_death_full", "odds_full"]] 14 | .copy() 15 | .dropna() 16 | ) 17 | dff.columns = ["age_group", "race_ethnicity", "prob_death", "odds"] 18 | dff["full"] = True 19 | 20 | dfm = ( 21 | df[["age_group", "race_ethnicity", "prob_death_middle", "odds_middle"]] 22 | .copy() 23 | 
.dropna()
24 | )
25 | dfm.columns = ["age_group", "race_ethnicity", "prob_death", "odds"]
26 | dfm["full"] = False
27 | 
28 | dfg = pd.concat([dff, dfm], ignore_index=True)
29 | print(dfg)
30 | 
31 | 
32 | g = sns.catplot(
33 |     x="age_group",
34 |     y="prob_death",
35 |     col="race_ethnicity",
36 |     col_wrap=3,
37 |     hue="full",
38 |     marker="o",
39 |     palette="husl",
40 |     kind="swarm",
41 |     data=dfg,
42 | )
43 | g.set_xticklabels(rotation=30)
44 | g.set_xlabels("Age group")
45 | g.set_ylabels("Probability of dying")
46 | 
47 | g.savefig("covid_and_air_travel.png")
48 | 
49 | plt.show()
50 | 
--------------------------------------------------------------------------------
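A note on how the derived columns in all_by_age_race.csv fit together (consistent with the numbers in the two data files above, though the generating script is not shown here): prob_death_full multiplies each group's probability of dying given a case by the "Any seat" full-flight infection risk from flight_infection_risk.csv, prob_death_middle uses the middle-seats-empty risk, and the odds columns are the reciprocals. An illustrative check for the first row, not part of the repo's code:

```python
# Illustrative check (not part of test.py): recompute the derived columns
# of all_by_age_race.csv for its first row from the two inputs above.
prob_death = 4.870968056191488e-05    # 0 - 9 Years, AI/AN: P(death | case)
risk_full = 0.0002545824847250509     # flight_infection_risk.csv: Any seat, full flight
risk_middle = 0.00015569048731122528  # flight_infection_risk.csv: Any seat, middle empty

prob_death_full = prob_death * risk_full      # ~1.2401e-08, as in the table
prob_death_middle = prob_death * risk_middle  # ~7.5836e-09, as in the table

print(f"odds_full: 1 in {1 / prob_death_full:,.0f}")      # 1 in 80,641,054
print(f"odds_middle: 1 in {1 / prob_death_middle:,.0f}")  # 1 in 131,862,905
```

--------------------------------------------------------------------------------
/2022/uk-covid-deaths/asmr/agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/asmr/agestandardisedmortalityratecalculationtemplateusingthe2013esp_tcm77-359944.xls
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/asmr/espmortalityratesreport_tcm77-364912.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/asmr/espmortalityratesreport_tcm77-364912.pdf
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output1.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output2.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output3.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output4.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/output5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/j-bennet/talks/49d8d12290f1199dfbd123dcf5218f4c51a5c51f/2022/uk-covid-deaths/output5.png
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Contents-Table 1.tsv:
--------------------------------------------------------------------------------
1 | Contents
2 | This worksheet contains 1 table.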
3 | Worksheet Name Worksheet Title 4 | Definitions "Definitions used in 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 March 2022: 6 July 2022'" 5 | Notes "Notes for 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 6 | Table 1 "Monthly age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 7 | Table 2 "Monthly age-standardised mortality rates by vaccination status by age group for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 8 | Table 3 "Whole period age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 9 | Table 4 "Monthly age-standardised mortality rates by vaccination status by sex for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 10 | Table 5 "Monthly age-standardised mortality rates by vaccination status by age group by sex for all cause deaths and deaths involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 11 | Table 6 "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022" 12 | Table 7 "Whole period counts of all registered deaths by vaccination status by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" 13 | Table 8 "Monthly counts of all registered deaths for 'unvaccinated' and 'ever vaccinated' by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" 14 | Table 9 "Whole period counts of all registered deaths grouped by how many weeks after vaccination the deaths occurred; for deaths involving COVID-19 and deaths not involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England" -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Cover-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "All data relating to 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 2 | "Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022" 3 | Publication date: 6 July 2022 4 | Contact e-mail: Health.Data@ons.gov.uk 5 | Office for National Statistics 6 | Crown Copyright 2020 7 | -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Definitions-Table 1.tsv: -------------------------------------------------------------------------------- 1 | Definitions of COVID-19 
deaths used in this dataset and accompanying publication 2 | We use the term “involving COVID-19” when referring only to deaths where COVID-19 is mentioned anywhere on the death certificate. Information on cause of death coding is available in the User Guide to Mortality Statistics. 3 | "For this analysis we define a death as involving COVID-19 if either of the ICD10 codes U07.1 (COVID-19, virus identified) or U07.2 (COVID-19, virus not identified) is mentioned on the death certificate. In contrast to the definition used in the weekly deaths released, deaths where the ICD10 code U09.9 (Post-COVID condition, where the acute COVID had ended before the condition immediately causing death occurred) is mentioned on the death certificate and neither of the other two COVID-19 codes are mentioned are not included, as they are likely to be the result of an infection caught a long time previously, and therefore not linked to the vaccination status of the person at date of death. Deaths involving U10.9 (Multisystem inflammatory syndrome associated with COVID-19) where neither U07.1 nor U07.2 are mentioned are also excluded." 4 | ICD-10 code Description 5 | U07.1  "COVID-19, virus identified" 6 | U07.2 "COVID-19, virus not identified" 7 | Vaccination status and age 8 | Vaccination status is defined on each day for each person and is one of: Category Name 9 | unvaccinated (Unvaccinated) 10 | " vaccinated with first dose only, less than 21 days after first vaccination" "(First dose, less than 21 days ago)" 11 | " vaccinated with first dose only, at least 21 days after first vaccination" "(First dose, at least 21 days ago)" 12 | " vaccinated with first and second doses, less than 21 days after second vaccination" "(Second dose, less than 21 days ago)" 13 | " vaccinated with first and second doses, at least 21 days but less than 6 months after second vaccination" "(Second dose, between 21 days and 6 months ago)" 14 | " vaccinated with first and second doses, at least 6 months after second vaccination" "(Second dose, at least 6 months ago)" 15 | " vaccinated with at least first, second and third dose and/or booster, less than 21 days after third or booster vaccination" "(Third dose or booster, less than 21 days ago)" 16 | " vaccinated with at least first, second and third dose and/or booster, at least 21 days after third or booster vaccination" "(Third dose or booster, at least 21 days ago)" 17 | "For the age-breakdowns, due to low numbers, the “vaccinated with first and second doses, at least 21 days but less than 6 months after second vaccination” and “vaccinated with first and second doses, at least 6 months after second vaccination” are combined into:" 18 | " vaccinated with first and second doses, at least 21 days after second vaccination" "(Second dose, at least 21 days ago)" 19 | "We also include an ""Ever vaccinated"" category, which includes anyone who has had at least 1 dose of the vaccine, regardless of further doses." 20 | Age in years is defined on the first day of each month. Children aged <10 each week are not included when calculating the ASMR as they would not be included in our dataset due to the linkage to the 2011 census. 21 | "The Joint Committee on Vaccination and Immunisation (JCVI) advised in February 2022 a spring booster for the most vulnerable. This spring booster may be present in the NIMS dataset if it is the person’s third dose or booster, but it is not being differentiated from a normal third dose or booster in our analysis. 
Further developments to the handling of spring boosters will be available in future publications." -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Notes-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Notes for 'Age-standardised mortality rates for deaths by vaccination status, England: deaths occurring between 1 January 2021 and 31 May 2022: 6 July 2022'" 2 | This worksheet contains 1 table. 3 | Note Number Note Text Applies to Tables 4 | Note 1 "Age-standardised mortality rates per 100,000 person-years, standardised to the 2013 European Standard Population using five-year age groups from those aged 10 years and over. 'Person-years' take into account both the number of people and the amount of time spent in each vaccination status. For more information, see our methodology article." "1,2,3,4,5" 5 | Note 2 "Office for National Statistics (ONS) figures based on deaths that occurred between 1 January 2021 and 31 May 2022 and were registered by 8 June 2022. These figures represent death occurrences, there can be a delay between the date a death occurred and the date a death was registered. More information can be found in our Impact of registration delays release." "1,2,3,4,5,6,7,8,9" 6 | Note 3 "ASMRs are calculated using the Public Health Data Asset, a linked dataset of people resident in England, who could be linked to the 2011 Census and GP Patient Register. This dataset covers approximately 79% of the population in England aged 10+." "1,2,3,4,5,6" 7 | Note 4 "Deaths were defined using the International Classification of Diseases, tenth revision (ICD-10). Deaths involving the coronavirus (COVID-19) are defined as those with an underlying cause, or any mention of, ICD-10 codes U07.1 (COVID-19 virus identified) or U07.2 (COVID-19, virus not identified). Please note, this differs from the definition used in the majority of mortality outputs." "1,2,3,4,5,6,7,8,9" 8 | Note 5 "95% confidence intervals are indicated by the shaded regions. Where the total number of deaths is less than 100, Dobson’s method is used, otherwise the normal approximation is used. Non-overlapping confidence intervals denote a statistically significant difference in ASMR." "1,2,3,4,5" 9 | Note 6 "Rates marked with u in 'Noted as Unreliable' column are unreliable due to small numbers of deaths. Otherwise, column left blank." "1,2,4,5" 10 | Note 7 x denotes data are not available; age-standardised rates are not provided for categories with fewer than 10 deaths. "1,2,3,4,5" 11 | Note 8 Age is defined on the first day of the month. "1,2,3,4,5,6" 12 | Note 9 "Caution must be taken when comparing mortality rates and counts as the characteristics of people in the different vaccination status groups, such as health, may differ, particularly due to the prioritisation of the vaccine to more clinically vulnerable people. While differences in the ages of people in the vaccination status groups are accounted for, other differences, such as ethnicity or level of deprivation, may remain, which can affect the mortality rates and counts." "1,2,3,4,5,6" 13 | Note 10 Totals of person-years may not exactly equal the sum of totals from breakdowns due to rounding. "1,2,3,4,5,6" 14 | Note 11 "Third dose and booster vaccinations are defined as a third or booster dose received after 16 September 2021, the date from which booster doses were first administered. 
Due to our definition of a third dose or booster only including data from 16th September, there are 0 Person-years and consequently 0 counts of death before September for 'Within 21 days of third dose or booster' and '21 days or more after third dose or booster', and within September for '21 days or more after third dose or booster'. Spring boosters have not yet been distinguished in this analysis." "1,2,3,4,5,6,7,8" 15 | Note 12 '21 days or more after a second dose' is separated into '21 days or more but less than 6 months after a second dose' and '6 months or more after a second dose'. "1,3,4,7" 16 | Note 13 '21 days or more but less than 6 months after a second dose' and '6 months or more after a second dose' are combined into '21 days or more after a second dose' due to low counts. "2,5" 17 | Note 14 "These counts are for all deaths that have been registered, not solely those in the Public Health Data Asset which are used to calculate the mortality rates." "7,8,9" 18 | Note 15 Age is defined on the date of death. "7,8,9" 19 | Note 16 "When the category of 'Ever Vaccinated' is included, the total counts and person-years of those in all the vaccination categories will add up to the 'Ever Vaccinated' totals (differences in person-years may occur due to rounding)." "1,3,4,8" 20 | Note 17 "There were some people who were vaccinated but not included in the NIMS data as they died soon after vaccination. Of these, 1,436 linked to our Public health Data Asset dataset. We included the latest vaccination records for these people in our dataset. This data is provisional and extends up to the 25 May 2022. This will be updated in future releases." "1,2,3,4,5,6,7,8,9" 21 | Note 18 Primary and secondary suppression are applied to counts less than 3. "2,5,8,9" -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Table 3-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Whole period age-standardised mortality rates by vaccination status for all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, per 100,000 person-years, England, deaths occurring between 1 January 2021 and 31 May 2022" 2 | This worksheet contains 1 table. 3 | "Source: Source: Office for National Statistics, National Immunisation Management Service." 
4 | Cause of Death Vaccination status Count of deaths Person-years "Age-standardised mortality rate / 100,000 person-years" Lower confidence limit Upper confidence limit 5 | All causes Unvaccinated 109891 16375484 2337.5 2322.6 2352.4 6 | All causes "First dose, less than 21 days ago" 17699 1925587 826.8 814.6 839.1 7 | All causes "First dose, at least 21 days ago" 77200 5536696 1289.2 1280.1 1298.3 8 | All causes "Second dose, less than 21 days ago" 11986 1878686 512.6 503.3 521.8 9 | All causes "Second dose, between 21 days and 6 months ago" 156537 13454401 868.4 864.1 872.7 10 | All causes "Second dose, at least 6 months ago" 71790 2664983 2106.5 2086.6 2126.4 11 | All causes "Third dose or booster, less than 21 days ago" 12868 1529103 569.4 550.1 588.7 12 | All causes "Third dose or booster, at least 21 days ago" 183038 11871491 883.2 869.5 897 13 | All causes Ever vaccinated 531118 38860947 957.4 954.8 960 14 | Deaths involving COVID-19 Unvaccinated 38285 16375484 863.2 854.1 872.4 15 | Deaths involving COVID-19 "First dose, less than 21 days ago" 4037 1925587 190.1 184.2 195.9 16 | Deaths involving COVID-19 "First dose, at least 21 days ago" 7270 5536696 122 119.2 124.8 17 | Deaths involving COVID-19 "Second dose, less than 21 days ago" 200 1878686 8.4 7.2 9.5 18 | Deaths involving COVID-19 "Second dose, between 21 days and 6 months ago" 5462 13454401 30.4 29.6 31.2 19 | Deaths involving COVID-19 "Second dose, at least 6 months ago" 6664 2664983 197.5 192.5 202.4 20 | Deaths involving COVID-19 "Third dose or booster, less than 21 days ago" 494 1529103 21.6 19.6 23.5 21 | Deaths involving COVID-19 "Third dose or booster, at least 21 days ago" 12048 11871491 58.5 53.4 63.5 22 | Deaths involving COVID-19 Ever vaccinated 36175 38860947 64.5 63.8 65.1 23 | Non-COVID-19 deaths Unvaccinated 71606 16375484 1474.3 1462.5 1486 24 | Non-COVID-19 deaths "First dose, less than 21 days ago" 13662 1925587 636.8 626.1 647.5 25 | Non-COVID-19 deaths "First dose, at least 21 days ago" 69930 5536696 1167.2 1158.5 1175.9 26 | Non-COVID-19 deaths "Second dose, less than 21 days ago" 11786 1878686 504.2 495.1 513.3 27 | Non-COVID-19 deaths "Second dose, between 21 days and 6 months ago" 151075 13454401 838 833.7 842.2 28 | Non-COVID-19 deaths "Second dose, at least 6 months ago" 65126 2664983 1909 1889.8 1928.2 29 | Non-COVID-19 deaths "Third dose or booster, less than 21 days ago" 12374 1529103 547.9 528.6 567.1 30 | Non-COVID-19 deaths "Third dose or booster, at least 21 days ago" 170990 11871491 824.8 812 837.6 31 | Non-COVID-19 deaths Ever vaccinated 494943 38860947 892.9 890.4 895.4 -------------------------------------------------------------------------------- /2022/uk-covid-deaths/referencetable06072022accessible/Table 6-Table 1.tsv: -------------------------------------------------------------------------------- 1 | "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022" 2 | This worksheet contains 1 table. 3 | "Source: Source: Office for National Statistics, National Immunisation Management Service." 
4 | Age group Vaccination status Person-years Count of deaths involving COVID-19 Count of deaths non-COVID-19 deaths Count of all cause deaths
5 | 10-14 Unvaccinated 2881265 9 175 184
6 | 10-14 "First dose, less than 21 days ago" 61754 2 2 4
7 | 10-14 "First dose, at least 21 days ago" 280645 0 14 14
8 | 10-14 "Second dose, less than 21 days ago" 36646 0 0 0
9 | 10-14 "Second dose, between 21 days and 6 months ago" 135989 0 13 13
10 | 10-14 "Second dose, at least 6 months ago" 1028 0 1 1
11 | 10-14 "Third dose or booster, less than 21 days ago" 723 0 1 1
12 | 10-14 "Third dose or booster, at least 21 days ago" 2422 1 6 7
13 | 15-19 Unvaccinated 1991761 24 265 289
14 | 15-19 "First dose, less than 21 days ago" 115758 0 13 13
15 | 15-19 "First dose, at least 21 days ago" 465610 2 79 81
16 | 15-19 "Second dose, less than 21 days ago" 97554 1 3 4
17 | 15-19 "Second dose, between 21 days and 6 months ago" 520292 2 74 76
18 | 15-19 "Second dose, at least 6 months ago" 63581 0 23 23
19 | 15-19 "Third dose or booster, less than 21 days ago" 35398 1 2 3
20 | 15-19 "Third dose or booster, at least 21 days ago" 160272 1 31 32
21 | 20-24 Unvaccinated 1531301 43 335 378
22 | 20-24 "First dose, less than 21 days ago" 116923 1 21 22
23 | 20-24 "First dose, at least 21 days ago" 342619 5 104 109
24 | 20-24 "Second dose, less than 21 days ago" 110074 0 20 20
25 | 20-24 "Second dose, between 21 days and 6 months ago" 710759 4 151 155
26 | 20-24 "Second dose, at least 6 months ago" 209425 2 41 43
27 | 20-24 "Third dose or booster, less than 21 days ago" 67390 0 12 12
28 | 20-24 "Third dose or booster, at least 21 days ago" 414003 4 48 52
29 | 25-29 Unvaccinated 1567892 68 525 593
30 | 25-29 "First dose, less than 21 days ago" 117976 2 30 32
31 | 25-29 "First dose, at least 21 days ago" 339758 5 145 150
32 | 25-29 "Second dose, less than 21 days ago" 112913 0 19 19
33 | 25-29 "Second dose, between 21 days and 6 months ago" 748987 11 189 200
34 | 25-29 "Second dose, at least 6 months ago" 228001 3 92 95
35 | 25-29 "Third dose or booster, less than 21 days ago" 73969 0 7 7
36 | 25-29 "Third dose or booster, at least 21 days ago" 475006 5 95 100
37 | 30-34 Unvaccinated 1432230 129 649 778
38 | 30-34 "First dose, less than 21 days ago" 116485 3 46 49
39 | 30-34 "First dose, at least 21 days ago" 330087 9 222 231
40 | 30-34 "Second dose, less than 21 days ago" 112049 0 22 22
41 | 30-34 "Second dose, between 21 days and 6 months ago" 757164 10 297 307
42 | 30-34 "Second dose, at least 6 months ago" 219824 13 116 129
43 | 30-34 "Third dose or booster, less than 21 days ago" 77326 1 15 16
44 | 30-34 "Third dose or booster, at least 21 days ago" 510682 4 168 172
45 | 35-39 Unvaccinated 1351742 238 903 1141
46 | 35-39 "First dose, less than 21 days ago" 126946 9 62 71
47 | 35-39 "First dose, at least 21 days ago" 341664 15 364 379
48 | 35-39 "Second dose, less than 21 days ago" 123317 1 45 46
49 | 35-39 "Second dose, between 21 days and 6 months ago" 853238 24 463 487
50 | 35-39 "Second dose, at least 6 months ago" 225736 11 183 194
51 | 35-39 "Third dose or booster, less than 21 days ago" 91720 1 28 29
52 | 35-39 "Third dose or booster, at least 21 days ago" 620364 10 265 275
53 | 40-44 Unvaccinated 1158559 299 1225 1524
54 | 40-44 "First dose, less than 21 days ago" 133705 3 79 82
55 | 40-44 "First dose, at least 21 days ago" 354401 13 557 570
56 | 40-44 "Second dose, less than 21 days ago" 131852 0 75 75
57 | 40-44 "Second dose, between 21 days and 6 months ago" 951898 40 865 905
58 | 40-44 "Second dose, at least 6 months ago" 226923 39 344 383
59 | 40-44 "Third dose or booster, less than 21 days ago" 105778 1 44 45
60 | 40-44 "Third dose or booster, at least 21 days ago" 745195 25 516 541
61 | 45-49 Unvaccinated 1015441 597 1965 2562
62 | 45-49 "First dose, less than 21 days ago" 147654 13 166 179
63 | 45-49 "First dose, at least 21 days ago" 390029 39 926 965
64 | 45-49 "Second dose, less than 21 days ago" 144704 1 145 146
65 | 45-49 "Second dose, between 21 days and 6 months ago" 1056962 65 1556 1621
66 | 45-49 "Second dose, at least 6 months ago" 222412 60 563 623
67 | 45-49 "Third dose or booster, less than 21 days ago" 119884 6 85 91
68 | 45-49 "Third dose or booster, at least 21 days ago" 854328 39 975 1014
69 | 50-54 Unvaccinated 892001 1069 3014 4083
70 | 50-54 "First dose, less than 21 days ago" 170638 21 250 271
71 | 50-54 "First dose, at least 21 days ago" 461101 66 1640 1706
72 | 50-54 "Second dose, less than 21 days ago" 169954 0 225 225
73 | 50-54 "Second dose, between 21 days and 6 months ago" 1281727 120 2926 3046
74 | 50-54 "Second dose, at least 6 months ago" 239007 110 1161 1271
75 | 50-54 "Third dose or booster, less than 21 days ago" 149814 5 160 165
76 | 50-54 "Third dose or booster, at least 21 days ago" 1134549 85 2109 2194
77 | 55-59 Unvaccinated 811218 1626 4084 5710
78 | 55-59 "First dose, less than 21 days ago" 173094 44 391 435
79 | 55-59 "First dose, at least 21 days ago" 469286 113 2494 2607
80 | 55-59 "Second dose, less than 21 days ago" 173062 2 341 343
81 | 55-59 "Second dose, between 21 days and 6 months ago" 1318673 239 4690 4929
82 | 55-59 "Second dose, at least 6 months ago" 220215 181 1965 2146
83 | 55-59 "Third dose or booster, less than 21 days ago" 159483 7 314 321
84 | 55-59 "Third dose or booster, at least 21 days ago" 1244053 168 3708 3876
85 | 60-64 Unvaccinated 619004 2425 5135 7560
86 | 60-64 "First dose, less than 21 days ago" 153232 67 544 611
87 | 60-64 "First dose, at least 21 days ago" 417699 176 3463 3639
88 | 60-64 "Second dose, less than 21 days ago" 153930 2 514 516
89 | 60-64 "Second dose, between 21 days and 6 months ago" 1178767 345 6761 7106
90 | 60-64 "Second dose, at least 6 months ago" 175218 290 2774 3064
91 | 60-64 "Third dose or booster, less than 21 days ago" 146505 17 475 492
92 | 60-64 "Third dose or booster, at least 21 days ago" 1180409 265 5866 6131
93 | 65-69 Unvaccinated 429644 3051 6303 9354
94 | 65-69 "First dose, less than 21 days ago" 134086 122 796 918
95 | 65-69 "First dose, at least 21 days ago" 361884 242 4686 4928
96 | 65-69 "Second dose, less than 21 days ago" 134189 9 681 690
97 | 65-69 "Second dose, between 21 days and 6 months ago" 1026322 445 9597 10042
98 | 65-69 "Second dose, at least 6 months ago" 142579 436 3778 4214
99 | 65-69 "Third dose or booster, less than 21 days ago" 129293 32 698 730
100 | 65-69 "Third dose or booster, at least 21 days ago" 1089793 490 9186 9676
101 | 70-74 Unvaccinated 322630 4194 8090 12284
102 | 70-74 "First dose, less than 21 days ago" 137755 235 1355 1590
103 | 70-74 "First dose, at least 21 days ago" 372505 413 7651 8064
104 | 70-74 "Second dose, less than 21 days ago" 137947 12 1193 1205
105 | 70-74 "Second dose, between 21 days and 6 months ago" 1049233 708 16101 16809
106 | 70-74 "Second dose, at least 6 months ago" 136845 673 6201 6874
107 | 70-74 "Third dose or booster, less than 21 days ago" 131781 44 1196 1240
108 | 70-74 "Third dose or booster, at least 21 days ago" 1149162 946 15934 16880
109 | 75-79 Unvaccinated 181758 5044 8515 13559
110 | 75-79 "First dose, less than 21 days ago" 99957 467 1738 2205
111 | 75-79 "First dose, at least 21 days ago" 272819 810 9579 10389
112 | 75-79 "Second dose, less than 21 days ago" 101660 16 1556 1572
113 | 75-79 "Second dose, between 21 days and 6 months ago" 793830 825 20720 21545
114 | 75-79 "Second dose, at least 6 months ago" 109217 928 8302 9230
115 | 75-79 "Third dose or booster, less than 21 days ago" 103909 74 1599 1673
116 | 75-79 "Third dose or booster, at least 21 days ago" 986552 1592 23794 25386
117 | 80-84 Unvaccinated 89993 5841 8887 14728
118 | 80-84 "First dose, less than 21 days ago" 60797 803 2153 2956
119 | 80-84 "First dose, at least 21 days ago" 169997 1414 11035 12449
120 | 80-84 "Second dose, less than 21 days ago" 71225 44 1937 1981
121 | 80-84 "Second dose, between 21 days and 6 months ago" 545087 852 25161 26013
122 | 80-84 "Second dose, at least 6 months ago" 114217 1228 11737 12965
123 | 80-84 "Third dose or booster, less than 21 days ago" 68864 81 2041 2122
124 | 80-84 "Third dose or booster, at least 21 days ago" 662376 2198 30106 32304
125 | 85-89 Unvaccinated 58281 6437 9707 16144
126 | 85-89 "First dose, less than 21 days ago" 37103 961 2595 3556
127 | 85-89 "First dose, at least 21 days ago" 104886 1797 12234 14031
128 | 85-89 "Second dose, less than 21 days ago" 43168 62 2156 2218
129 | 85-89 "Second dose, between 21 days and 6 months ago" 334788 835 27721 28556
130 | 85-89 "Second dose, at least 6 months ago" 81233 1326 12935 14261
131 | 85-89 "Third dose or booster, less than 21 days ago" 42875 106 2414 2520
132 | 85-89 "Third dose or booster, at least 21 days ago" 413276 2783 34908 37691
133 | 90+ Unvaccinated 40762 7191 11829 19020
134 | 90+ "First dose, less than 21 days ago" 21725 1284 3421 4705
135 | 90+ "First dose, at least 21 days ago" 61706 2151 14737 16888
136 | 90+ "Second dose, less than 21 days ago" 24443 50 2854 2904
137 | 90+ "Second dose, between 21 days and 6 months ago" 190688 937 33790 34727
138 | 90+ "Second dose, at least 6 months ago" 49520 1364 14910 16274
139 | 90+ "Third dose or booster, less than 21 days ago" 24392 118 3283 3401
140 | 90+ "Third dose or booster, at least 21 days ago" 229050 3432 43275 46707
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Table 7-Table 1.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all registered deaths by vaccination status by age group; for all deaths and deaths involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Cause of Death Age group Vaccination status Count of Deaths
5 | All causes 10-39 Unvaccinated 5678
6 | All causes 10-39 "First dose, less than 21 days ago" 243
7 | All causes 10-39 "First dose, at least 21 days ago" 1316
8 | All causes 10-39 "Second dose, less than 21 days ago" 159
9 | All causes 10-39 "Second dose, between 21 days and 6 months ago" 1607
10 | All causes 10-39 "Second dose, at least 6 months ago" 630
11 | All causes 10-39 "Third dose or booster, less than 21 days ago" 78
12 | All causes 10-39 "Third dose or booster, at least 21 days ago" 746
13 | All causes 40-49 Unvaccinated 6908
14 | All causes 40-49 "First dose, less than 21 days ago" 360
15 | All causes 40-49 "First dose, at least 21 days ago" 2225
16 | All causes 40-49 "Second dose, less than 21 days ago" 298
17 | All causes 40-49 "Second dose, between 21 days and 6 months ago" 3281
18 | All causes 40-49 "Second dose, at least 6 months ago" 1332
19 | All causes 40-49 "Third dose or booster, less than 21 days ago" 161
20 | All causes 40-49 "Third dose or booster, at least 21 days ago" 1900
21 | All causes 50-59 Unvaccinated 14466
22 | All causes 50-59 "First dose, less than 21 days ago" 882
23 | All causes 50-59 "First dose, at least 21 days ago" 5699
24 | All causes 50-59 "Second dose, less than 21 days ago" 709
25 | All causes 50-59 "Second dose, between 21 days and 6 months ago" 9923
26 | All causes 50-59 "Second dose, at least 6 months ago" 4381
27 | All causes 50-59 "Third dose or booster, less than 21 days ago" 600
28 | All causes 50-59 "Third dose or booster, at least 21 days ago" 7333
29 | All causes 60-69 Unvaccinated 22133
30 | All causes 60-69 "First dose, less than 21 days ago" 1794
31 | All causes 60-69 "First dose, at least 21 days ago" 10333
32 | All causes 60-69 "Second dose, less than 21 days ago" 1436
33 | All causes 60-69 "Second dose, between 21 days and 6 months ago" 20247
34 | All causes 60-69 "Second dose, at least 6 months ago" 8795
35 | All causes 60-69 "Third dose or booster, less than 21 days ago" 1410
36 | All causes 60-69 "Third dose or booster, at least 21 days ago" 18413
37 | All causes 70-79 Unvaccinated 31333
38 | All causes 70-79 "First dose, less than 21 days ago" 4312
39 | All causes 70-79 "First dose, at least 21 days ago" 21214
40 | All causes 70-79 "Second dose, less than 21 days ago" 3128
41 | All causes 70-79 "Second dose, between 21 days and 6 months ago" 43738
42 | All causes 70-79 "Second dose, at least 6 months ago" 18453
43 | All causes 70-79 "Third dose or booster, less than 21 days ago" 3279
44 | All causes 70-79 "Third dose or booster, at least 21 days ago" 47630
45 | All causes 80-89 Unvaccinated 36259
46 | All causes 80-89 "First dose, less than 21 days ago" 7362
47 | All causes 80-89 "First dose, at least 21 days ago" 29945
48 | All causes 80-89 "Second dose, less than 21 days ago" 4730
49 | All causes 80-89 "Second dose, between 21 days and 6 months ago" 61294
50 | All causes 80-89 "Second dose, at least 6 months ago" 30668
51 | All causes 80-89 "Third dose or booster, less than 21 days ago" 5187
52 | All causes 80-89 "Third dose or booster, at least 21 days ago" 78216
53 | All causes 90+ Unvaccinated 21927
54 | All causes 90+ "First dose, less than 21 days ago" 5286
55 | All causes 90+ "First dose, at least 21 days ago" 19056
56 | All causes 90+ "Second dose, less than 21 days ago" 3242
57 | All causes 90+ "Second dose, between 21 days and 6 months ago" 39061
58 | All causes 90+ "Second dose, at least 6 months ago" 18277
59 | All causes 90+ "Third dose or booster, less than 21 days ago" 3814
60 | All causes 90+ "Third dose or booster, at least 21 days ago" 52353
61 | Deaths involving COVID-19 10-39 Unvaccinated 795
62 | Deaths involving COVID-19 10-39 "First dose, less than 21 days ago" 14
63 | Deaths involving COVID-19 10-39 "First dose, at least 21 days ago" 47
64 | Deaths involving COVID-19 10-39 "Second dose, less than 21 days ago" 3
65 | Deaths involving COVID-19 10-39 "Second dose, between 21 days and 6 months ago" 62
66 | Deaths involving COVID-19 10-39 "Second dose, at least 6 months ago" 39
67 | Deaths involving COVID-19 10-39 "Third dose or booster, less than 21 days ago" 4
68 | Deaths involving COVID-19 10-39 "Third dose or booster, at least 21 days ago" 33
69 | Deaths involving COVID-19 40-49 Unvaccinated 1441
70 | Deaths involving COVID-19 40-49 "First dose, less than 21 days ago" 19
71 | Deaths involving COVID-19 40-49 "First dose, at least 21 days ago" 86
72 | Deaths involving COVID-19 40-49 "Second dose, less than 21 days ago" 4
73 | Deaths involving COVID-19 40-49 "Second dose, between 21 days and 6 months ago" 134
74 | Deaths involving COVID-19 40-49 "Second dose, at least 6 months ago" 126
75 | Deaths involving COVID-19 40-49 "Third dose or booster, less than 21 days ago" 10
76 | Deaths involving COVID-19 40-49 "Third dose or booster, at least 21 days ago" 76
77 | Deaths involving COVID-19 50-59 Unvaccinated 3743
78 | Deaths involving COVID-19 50-59 "First dose, less than 21 days ago" 79
79 | Deaths involving COVID-19 50-59 "First dose, at least 21 days ago" 239
80 | Deaths involving COVID-19 50-59 "Second dose, less than 21 days ago" 3
81 | Deaths involving COVID-19 50-59 "Second dose, between 21 days and 6 months ago" 418
82 | Deaths involving COVID-19 50-59 "Second dose, at least 6 months ago" 350
83 | Deaths involving COVID-19 50-59 "Third dose or booster, less than 21 days ago" 17
84 | Deaths involving COVID-19 50-59 "Third dose or booster, at least 21 days ago" 297
85 | Deaths involving COVID-19 60-69 Unvaccinated 6937
86 | Deaths involving COVID-19 60-69 "First dose, less than 21 days ago" 218
87 | Deaths involving COVID-19 60-69 "First dose, at least 21 days ago" 506
88 | Deaths involving COVID-19 60-69 "Second dose, less than 21 days ago" 14
89 | Deaths involving COVID-19 60-69 "Second dose, between 21 days and 6 months ago" 933
90 | Deaths involving COVID-19 60-69 "Second dose, at least 6 months ago" 865
91 | Deaths involving COVID-19 60-69 "Third dose or booster, less than 21 days ago" 59
92 | Deaths involving COVID-19 60-69 "Third dose or booster, at least 21 days ago" 862
93 | Deaths involving COVID-19 70-79 Unvaccinated 11011
94 | Deaths involving COVID-19 70-79 "First dose, less than 21 days ago" 787
95 | Deaths involving COVID-19 70-79 "First dose, at least 21 days ago" 1410
96 | Deaths involving COVID-19 70-79 "Second dose, less than 21 days ago" 31
97 | Deaths involving COVID-19 70-79 "Second dose, between 21 days and 6 months ago" 1744
98 | Deaths involving COVID-19 70-79 "Second dose, at least 6 months ago" 1804
99 | Deaths involving COVID-19 70-79 "Third dose or booster, less than 21 days ago" 141
100 | Deaths involving COVID-19 70-79 "Third dose or booster, at least 21 days ago" 2830
101 | Deaths involving COVID-19 80-89 Unvaccinated 14248
102 | Deaths involving COVID-19 80-89 "First dose, less than 21 days ago" 2007
103 | Deaths involving COVID-19 80-89 "First dose, at least 21 days ago" 3637
104 | Deaths involving COVID-19 80-89 "Second dose, less than 21 days ago" 121
105 | Deaths involving COVID-19 80-89 "Second dose, between 21 days and 6 months ago" 1923
106 | Deaths involving COVID-19 80-89 "Second dose, at least 6 months ago" 2878
107 | Deaths involving COVID-19 80-89 "Third dose or booster, less than 21 days ago" 203
108 | Deaths involving COVID-19 80-89 "Third dose or booster, at least 21 days ago" 5566
109 | Deaths involving COVID-19 90+ Unvaccinated 8187
110 | Deaths involving COVID-19 90+ "First dose, less than 21 days ago" 1459
111 | Deaths involving COVID-19 90+ "First dose, at least 21 days ago" 2447
112 | Deaths involving COVID-19 90+ "Second dose, less than 21 days ago" 58
113 | Deaths involving COVID-19 90+ "Second dose, between 21 days and 6 months ago" 1058
114 | Deaths involving COVID-19 90+ "Second dose, at least 6 months ago" 1534
115 | Deaths involving COVID-19 90+ "Third dose or booster, less than 21 days ago" 131
116 | Deaths involving COVID-19 90+ "Third dose or booster, at least 21 days ago" 3830
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/referencetable06072022accessible/Table 9-Table 1.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all registered deaths grouped by how many weeks after vaccination the deaths occurred; for deaths involving COVID-19 and deaths not involving COVID-19, deaths occurring between 1 January 2021 and 31 May 2022, England"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Week after vaccination Age group Count of Deaths involving COVID-19 Count of Non-COVID-19 Deaths
5 | 1 10-39 <3 115
6 | 2 10-39 11 168
7 | 3 10-39 8 176
8 | 4 10-39 7 210
9 | 5 10-39 6 175
10 | 6 10-39 6 190
11 | 7 10-39 4 207
12 | 8 10-39 6 185
13 | 9 10-39 7 210
14 | 10 10-39 3 182
15 | 11 10-39 8 182
16 | 12+ 10-39 134 2577
17 | 1 40-49 5 175
18 | 2 40-49 14 275
19 | 3 40-49 14 336
20 | 4 40-49 10 330
21 | 5 40-49 14 353
22 | 6 40-49 10 349
23 | 7 40-49 12 386
24 | 8 40-49 10 373
25 | 9 40-49 12 374
26 | 10 40-49 13 358
27 | 11 40-49 13 353
28 | 12+ 40-49 328 5440
29 | 1 50-59 8 431
30 | 2 50-59 34 809
31 | 3 50-59 57 852
32 | 4 50-59 53 915
33 | 5 50-59 46 1034
34 | 6 50-59 26 1058
35 | 7 50-59 30 1063
36 | 8 50-59 27 1054
37 | 9 50-59 31 1091
38 | 10 50-59 26 1181
39 | 11 50-59 34 1038
40 | 12+ 50-59 1031 17598
41 | 1 60-69 35 950
42 | 2 60-69 120 1557
43 | 3 60-69 136 1842
44 | 4 60-69 120 2009
45 | 5 60-69 123 2073
46 | 6 60-69 95 2130
47 | 7 60-69 78 2227
48 | 8 60-69 67 2249
49 | 9 60-69 54 2255
50 | 10 60-69 58 2388
51 | 11 60-69 73 2278
52 | 12+ 60-69 2498 37013
53 | 1 70-79 104 2118
54 | 2 70-79 347 3600
55 | 3 70-79 508 4042
56 | 4 70-79 449 4333
57 | 5 70-79 345 4696
58 | 6 70-79 257 4929
59 | 7 70-79 198 5032
60 | 8 70-79 155 5120
61 | 9 70-79 151 5030
62 | 10 70-79 156 5250
63 | 11 70-79 170 5098
64 | 12+ 70-79 5907 83759
65 | 1 80-89 225 3395
66 | 2 80-89 843 5415
67 | 3 80-89 1263 6138
68 | 4 80-89 1148 6622
69 | 5 80-89 807 6949
70 | 6 80-89 635 7110
71 | 7 80-89 475 7270
72 | 8 80-89 380 7372
73 | 9 80-89 329 7400
74 | 10 80-89 266 7364
75 | 11 80-89 294 7225
76 | 12+ 80-89 9670 128807
77 | 1 90+ 184 2403
78 | 2 90+ 549 3865
79 | 3 90+ 915 4426
80 | 4 90+ 795 4670
81 | 5 90+ 585 4798
82 | 6 90+ 402 4853
83 | 7 90+ 319 4863
84 | 8 90+ 233 4951
85 | 9 90+ 206 4970
86 | 10 90+ 179 4750
87 | 11 90+ 167 4640
88 | 12+ 90+ 5983 81383
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | seaborn
4 | dask
5 |
--------------------------------------------------------------------------------
/2022/uk-covid-deaths/table6.tsv:
--------------------------------------------------------------------------------
1 | "Whole period counts of all cause deaths, deaths involving COVID-19 and deaths not involving COVID-19, and person-years by vaccination status and five-year age group, England, deaths occurring between 1 January 2021 and 31 May 2022"
2 | This worksheet contains 1 table.
3 | "Source: Office for National Statistics, National Immunisation Management Service."
4 | Age group Vaccination status Person-years Count of deaths involving COVID-19 Count of deaths non-COVID-19 deaths Count of all cause deaths
5 | 10-14 Unvaccinated 2881265 9 175 184
6 | 10-14 "First dose, less than 21 days ago" 61754 2 2 4
7 | 10-14 "First dose, at least 21 days ago" 280645 0 14 14
8 | 10-14 "Second dose, less than 21 days ago" 36646 0 0 0
9 | 10-14 "Second dose, between 21 days and 6 months ago" 135989 0 13 13
10 | 10-14 "Second dose, at least 6 months ago" 1028 0 1 1
11 | 10-14 "Third dose or booster, less than 21 days ago" 723 0 1 1
12 | 10-14 "Third dose or booster, at least 21 days ago" 2422 1 6 7
13 | 15-19 Unvaccinated 1991761 24 265 289
14 | 15-19 "First dose, less than 21 days ago" 115758 0 13 13
15 | 15-19 "First dose, at least 21 days ago" 465610 2 79 81
16 | 15-19 "Second dose, less than 21 days ago" 97554 1 3 4
17 | 15-19 "Second dose, between 21 days and 6 months ago" 520292 2 74 76
18 | 15-19 "Second dose, at least 6 months ago" 63581 0 23 23
19 | 15-19 "Third dose or booster, less than 21 days ago" 35398 1 2 3
20 | 15-19 "Third dose or booster, at least 21 days ago" 160272 1 31 32
21 | 20-24 Unvaccinated 1531301 43 335 378
22 | 20-24 "First dose, less than 21 days ago" 116923 1 21 22
23 | 20-24 "First dose, at least 21 days ago" 342619 5 104 109
24 | 20-24 "Second dose, less than 21 days ago" 110074 0 20 20
25 | 20-24 "Second dose, between 21 days and 6 months ago" 710759 4 151 155
26 | 20-24 "Second dose, at least 6 months ago" 209425 2 41 43
27 | 20-24 "Third dose or booster, less than 21 days ago" 67390 0 12 12
28 | 20-24 "Third dose or booster, at least 21 days ago" 414003 4 48 52
29 | 25-29 Unvaccinated 1567892 68 525 593
30 | 25-29 "First dose, less than 21 days ago" 117976 2 30 32
31 | 25-29 "First dose, at least 21 days ago" 339758 5 145 150
32 | 25-29 "Second dose, less than 21 days ago" 112913 0 19 19
33 | 25-29 "Second dose, between 21 days and 6 months ago" 748987 11 189 200
34 | 25-29 "Second dose, at least 6 months ago" 228001 3 92 95
35 | 25-29 "Third dose or booster, less than 21 days ago" 73969 0 7 7
36 | 25-29 "Third dose or booster, at least 21 days ago" 475006 5 95 100
37 | 30-34 Unvaccinated 1432230 129 649 778
38 | 30-34 "First dose, less than 21 days ago" 116485 3 46 49
39 | 30-34 "First dose, at least 21 days ago" 330087 9 222 231
40 | 30-34 "Second dose, less than 21 days ago" 112049 0 22 22
41 | 30-34 "Second dose, between 21 days and 6 months ago" 757164 10 297 307
42 | 30-34 "Second dose, at least 6 months ago" 219824 13 116 129
43 | 30-34 "Third dose or booster, less than 21 days ago" 77326 1 15 16
44 | 30-34 "Third dose or booster, at least 21 days ago" 510682 4 168 172
45 | 35-39 Unvaccinated 1351742 238 903 1141
46 | 35-39 "First dose, less than 21 days ago" 126946 9 62 71
47 | 35-39 "First dose, at least 21 days ago" 341664 15 364 379
48 | 35-39 "Second dose, less than 21 days ago" 123317 1 45 46
49 | 35-39 "Second dose, between 21 days and 6 months ago" 853238 24 463 487
50 | 35-39 "Second dose, at least 6 months ago" 225736 11 183 194
51 | 35-39 "Third dose or booster, less than 21 days ago" 91720 1 28 29
52 | 35-39 "Third dose or booster, at least 21 days ago" 620364 10 265 275
53 | 40-44 Unvaccinated 1158559 299 1225 1524
54 | 40-44 "First dose, less than 21 days ago" 133705 3 79 82
55 | 40-44 "First dose, at least 21 days ago" 354401 13 557 570
56 | 40-44 "Second dose, less than 21 days ago" 131852 0 75 75
57 | 40-44 "Second dose, between 21 days and 6 months ago" 951898 40 865 905
58 | 40-44 "Second dose, at least 6 months ago" 226923 39 344 383
59 | 40-44 "Third dose or booster, less than 21 days ago" 105778 1 44 45
60 | 40-44 "Third dose or booster, at least 21 days ago" 745195 25 516 541
61 | 45-49 Unvaccinated 1015441 597 1965 2562
62 | 45-49 "First dose, less than 21 days ago" 147654 13 166 179
63 | 45-49 "First dose, at least 21 days ago" 390029 39 926 965
64 | 45-49 "Second dose, less than 21 days ago" 144704 1 145 146
65 | 45-49 "Second dose, between 21 days and 6 months ago" 1056962 65 1556 1621
66 | 45-49 "Second dose, at least 6 months ago" 222412 60 563 623
67 | 45-49 "Third dose or booster, less than 21 days ago" 119884 6 85 91
68 | 45-49 "Third dose or booster, at least 21 days ago" 854328 39 975 1014
69 | 50-54 Unvaccinated 892001 1069 3014 4083
70 | 50-54 "First dose, less than 21 days ago" 170638 21 250 271
71 | 50-54 "First dose, at least 21 days ago" 461101 66 1640 1706
72 | 50-54 "Second dose, less than 21 days ago" 169954 0 225 225
73 | 50-54 "Second dose, between 21 days and 6 months ago" 1281727 120 2926 3046
74 | 50-54 "Second dose, at least 6 months ago" 239007 110 1161 1271
75 | 50-54 "Third dose or booster, less than 21 days ago" 149814 5 160 165
76 | 50-54 "Third dose or booster, at least 21 days ago" 1134549 85 2109 2194
77 | 55-59 Unvaccinated 811218 1626 4084 5710
78 | 55-59 "First dose, less than 21 days ago" 173094 44 391 435
79 | 55-59 "First dose, at least 21 days ago" 469286 113 2494 2607
80 | 55-59 "Second dose, less than 21 days ago" 173062 2 341 343
81 | 55-59 "Second dose, between 21 days and 6 months ago" 1318673 239 4690 4929
82 | 55-59 "Second dose, at least 6 months ago" 220215 181 1965 2146
83 | 55-59 "Third dose or booster, less than 21 days ago" 159483 7 314 321
84 | 55-59 "Third dose or booster, at least 21 days ago" 1244053 168 3708 3876
85 | 60-64 Unvaccinated 619004 2425 5135 7560
86 | 60-64 "First dose, less than 21 days ago" 153232 67 544 611
87 | 60-64 "First dose, at least 21 days ago" 417699 176 3463 3639
88 | 60-64 "Second dose, less than 21 days ago" 153930 2 514 516
89 | 60-64 "Second dose, between 21 days and 6 months ago" 1178767 345 6761 7106
90 | 60-64 "Second dose, at least 6 months ago" 175218 290 2774 3064
91 | 60-64 "Third dose or booster, less than 21 days ago" 146505 17 475 492
92 | 60-64 "Third dose or booster, at least 21 days ago" 1180409 265 5866 6131
93 | 65-69 Unvaccinated 429644 3051 6303 9354
94 | 65-69 "First dose, less than 21 days ago" 134086 122 796 918
95 | 65-69 "First dose, at least 21 days ago" 361884 242 4686 4928
96 | 65-69 "Second dose, less than 21 days ago" 134189 9 681 690
97 | 65-69 "Second dose, between 21 days and 6 months ago" 1026322 445 9597 10042
98 | 65-69 "Second dose, at least 6 months ago" 142579 436 3778 4214
99 | 65-69 "Third dose or booster, less than 21 days ago" 129293 32 698 730
100 | 65-69 "Third dose or booster, at least 21 days ago" 1089793 490 9186 9676
101 | 70-74 Unvaccinated 322630 4194 8090 12284
102 | 70-74 "First dose, less than 21 days ago" 137755 235 1355 1590
103 | 70-74 "First dose, at least 21 days ago" 372505 413 7651 8064
104 | 70-74 "Second dose, less than 21 days ago" 137947 12 1193 1205
105 | 70-74 "Second dose, between 21 days and 6 months ago" 1049233 708 16101 16809
106 | 70-74 "Second dose, at least 6 months ago" 136845 673 6201 6874
107 | 70-74 "Third dose or booster, less than 21 days ago" 131781 44 1196 1240
108 | 70-74 "Third dose or booster, at least 21 days ago" 1149162 946 15934 16880
109 | 75-79 Unvaccinated 181758 5044 8515 13559
110 | 75-79 "First dose, less than 21 days ago" 99957 467 1738 2205
111 | 75-79 "First dose, at least 21 days ago" 272819 810 9579 10389
112 | 75-79 "Second dose, less than 21 days ago" 101660 16 1556 1572
113 | 75-79 "Second dose, between 21 days and 6 months ago" 793830 825 20720 21545
114 | 75-79 "Second dose, at least 6 months ago" 109217 928 8302 9230
115 | 75-79 "Third dose or booster, less than 21 days ago" 103909 74 1599 1673
116 | 75-79 "Third dose or booster, at least 21 days ago" 986552 1592 23794 25386
117 | 80-84 Unvaccinated 89993 5841 8887 14728
118 | 80-84 "First dose, less than 21 days ago" 60797 803 2153 2956
119 | 80-84 "First dose, at least 21 days ago" 169997 1414 11035 12449
120 | 80-84 "Second dose, less than 21 days ago" 71225 44 1937 1981
121 | 80-84 "Second dose, between 21 days and 6 months ago" 545087 852 25161 26013
122 | 80-84 "Second dose, at least 6 months ago" 114217 1228 11737 12965
123 | 80-84 "Third dose or booster, less than 21 days ago" 68864 81 2041 2122
124 | 80-84 "Third dose or booster, at least 21 days ago" 662376 2198 30106 32304
125 | 85-89 Unvaccinated 58281 6437 9707 16144
126 | 85-89 "First dose, less than 21 days ago" 37103 961 2595 3556
127 | 85-89 "First dose, at least 21 days ago" 104886 1797 12234 14031
128 | 85-89 "Second dose, less than 21 days ago" 43168 62 2156 2218
129 | 85-89 "Second dose, between 21 days and 6 months ago" 334788 835 27721 28556
130 | 85-89 "Second dose, at least 6 months ago" 81233 1326 12935 14261
131 | 85-89 "Third dose or booster, less than 21 days ago" 42875 106 2414 2520
132 | 85-89 "Third dose or booster, at least 21 days ago" 413276 2783 34908 37691
133 | 90+ Unvaccinated 40762 7191 11829 19020
134 | 90+ "First dose, less than 21 days ago" 21725 1284 3421 4705
135 | 90+ "First dose, at least 21 days ago" 61706 2151 14737 16888
136 | 90+ "Second dose, less than 21 days ago" 24443 50 2854 2904
137 | 90+ "Second dose, between 21 days and 6 months ago" 190688 937 33790 34727
138 | 90+ "Second dose, at least 6 months ago" 49520 1364 14910 16274
139 | 90+ "Third dose or booster, less than 21 days ago" 24392 118 3283 3401
140 | 90+ "Third dose or booster, at least 21 days ago" 229050 3432 43275 46707
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # talks
2 | Code snippets to use in talks
--------------------------------------------------------------------------------
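table6.tsv above is a root-level copy of Table 6, and the asmr folder in this project holds the ONS calculation template built on the 2013 European Standard Population. As a hedged sketch of an age-standardised comparison across vaccination statuses — the weights below are the published ESP 2013 values for the age bands present in the table (with 90+ combining the 90-94 and 95+ weights), and the code is illustrative rather than the notebook's actual method:

```python
import pandas as pd

# ESP 2013 weights for the 5-year bands in table6.tsv; 90+ = 800 + 200.
ESP_2013 = {
    "10-14": 5500, "15-19": 5500, "20-24": 6000, "25-29": 6000,
    "30-34": 6500, "35-39": 7000, "40-44": 7000, "45-49": 7000,
    "50-54": 7000, "55-59": 6500, "60-64": 6000, "65-69": 5500,
    "70-74": 5000, "75-79": 4000, "80-84": 2500, "85-89": 1500,
    "90+": 1000,
}

# Same loading assumptions as the earlier sketch: tab-separated, 3 metadata rows.
t6 = pd.read_csv("table6.tsv", sep="\t", skiprows=3)
t6["weight"] = t6["Age group"].map(ESP_2013)
t6["crude_rate"] = t6["Count of all cause deaths"] / t6["Person-years"] * 100_000

# ASMR = weighted average of the age-specific crude rates.
asmr = (
    t6.assign(weighted_rate=t6["crude_rate"] * t6["weight"])
      .groupby("Vaccination status")[["weighted_rate", "weight"]]
      .sum()
)
asmr["asmr_per_100k"] = asmr["weighted_rate"] / asmr["weight"]
print(asmr["asmr_per_100k"].sort_values())
```

Because the table starts at age 10, the weights are normalised over the bands actually present; the resulting rates are comparable to each other but not to ASMRs standardised over the full ESP age range.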