├── CH7.ipynb ├── DSP-CH1.ipynb ├── DSP-CH2.ipynb ├── DSP-Ch8.ipynb ├── DSP_CH3.ipynb ├── DSP_CH5.ipynb ├── README.md ├── Redis-py-NHL.ipynb ├── Stackdriver.ipynb ├── append.py ├── apply.py ├── book_sample.pdf ├── ch6_pyspark.html ├── ch6_pyspark.ipynb ├── dash_app.py ├── dataflow_read.py ├── echo.py ├── keras_games.py ├── logit.py ├── natality.py ├── predict.py └── stream.py /CH7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## append.py" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import apache_beam as beam\n", 19 | "import argparse\n", 20 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 21 | "from apache_beam.io import ReadFromText\n", 22 | "from apache_beam.io import WriteToText\n", 23 | "\n", 24 | "# define a DoFn for transforming the data \n", 25 | "class AppendDoFn(beam.DoFn):\n", 26 | " def process(self, element):\n", 27 | " return [element + \" - Hello World!\"]\n", 28 | " \n", 29 | "# set up pipeline parameters \n", 30 | "parser = argparse.ArgumentParser()\n", 31 | "parser.add_argument('--input', dest='input',\n", 32 | " default='gs://dataflow-samples/shakespeare/kinglear.txt')\n", 33 | "parser.add_argument('--output', dest='output',\n", 34 | " default='gs://dsp_model_store/shakespeare/kinglear.txt')\n", 35 | "known_args, pipeline_args = parser.parse_known_args(None)\n", 36 | "pipeline_options = PipelineOptions(pipeline_args)\n", 37 | "\n", 38 | "# define the pipeline steps \n", 39 | "p = beam.Pipeline(options=pipeline_options)\n", 40 | "lines = p | 'read' >> ReadFromText(known_args.input)\n", 41 | "appended = lines | 'append' >> beam.ParDo(AppendDoFn())\n", 42 | "appended | 'write' >> WriteToText(known_args.output)\n", 43 | "\n", 44 | "# run the pipeline \n", 45 | "result = p.run()\n", 46 | "result.wait_until_finish()\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## dataflow_read.py" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "ERROR:root:Exception at bundle , due to an exception.\n", 66 | " Traceback (most recent call last):\n", 67 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 68 | " finish_state)\n", 69 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 70 | " result = evaluator.finish_bundle()\n", 71 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 72 | " with self._source.reader() as reader:\n", 73 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 74 | " kms_key=self.kms_key)\n", 75 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 76 | " 'Missing executing project information. Please use the --project '\n", 77 | "RuntimeError: Missing executing project information. 
Please use the --project command line option to specify it.\n", 78 | "\n", 79 | "ERROR:root:Exception at bundle , due to an exception.\n", 80 | " Traceback (most recent call last):\n", 81 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 82 | " finish_state)\n", 83 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 84 | " result = evaluator.finish_bundle()\n", 85 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 86 | " with self._source.reader() as reader:\n", 87 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 88 | " kms_key=self.kms_key)\n", 89 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 90 | " 'Missing executing project information. Please use the --project '\n", 91 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 92 | "\n", 93 | "ERROR:root:Exception at bundle , due to an exception.\n", 94 | " Traceback (most recent call last):\n", 95 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 96 | " finish_state)\n", 97 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 98 | " result = evaluator.finish_bundle()\n", 99 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 100 | " with self._source.reader() as reader:\n", 101 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 102 | " kms_key=self.kms_key)\n", 103 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 104 | " 'Missing executing project information. Please use the --project '\n", 105 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 106 | "\n", 107 | "ERROR:root:Exception at bundle , due to an exception.\n", 108 | " Traceback (most recent call last):\n", 109 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 110 | " finish_state)\n", 111 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 112 | " result = evaluator.finish_bundle()\n", 113 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 114 | " with self._source.reader() as reader:\n", 115 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 116 | " kms_key=self.kms_key)\n", 117 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 118 | " 'Missing executing project information. Please use the --project '\n", 119 | "RuntimeError: Missing executing project information. 
Please use the --project command line option to specify it.\n", 120 | "\n", 121 | "ERROR:root:Giving up after 4 attempts.\n", 122 | "WARNING:root:A task failed with exception: Missing executing project information. Please use the --project command line option to specify it.\n" 123 | ] 124 | }, 125 | { 126 | "ename": "RuntimeError", 127 | "evalue": "Missing executing project information. Please use the --project command line option to specify it.", 128 | "output_type": "error", 129 | "traceback": [ 130 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 131 | "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", 132 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;31m# run the pipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_until_finish\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 133 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/direct_runner.py\u001b[0m in \u001b[0;36mwait_until_finish\u001b[0;34m(self, duration)\u001b[0m\n\u001b[1;32m 429\u001b[0m 'DirectRunner does not support duration argument.')\n\u001b[1;32m 430\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 431\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPipelineState\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDONE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint: disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 134 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mawait_completion\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 398\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 400\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 401\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mshutdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 135 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in 
\u001b[0;36mawait_completion\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 444\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 446\u001b[0;31m \u001b[0mraise_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 447\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecutor_service\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshutdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 136 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/future/utils/__init__.py\u001b[0m in \u001b[0;36mraise_\u001b[0;34m(tp, value, tb)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0mexc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 413\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 414\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 137 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, state_sampler)\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mstart_state\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0mprocess_state\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 343\u001b[0;31m finish_state)\n\u001b[0m\u001b[1;32m 344\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 138 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mattempt_call\u001b[0;34m(self, metrics_container, side_input_values, start_state, process_state, finish_state)\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mfinish_state\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 383\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mevaluator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfinish_bundle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 384\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogical_metric_updates\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmetrics_container\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_cumulative\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 139 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\u001b[0m in \u001b[0;36mfinish_bundle\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0mbundles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_values_to_bundles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_source\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0mbundles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_values_to_bundles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 140 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\u001b[0m in \u001b[0;36mreader\u001b[0;34m(self, test_bigquery_client)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[0muse_legacy_sql\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_legacy_sql\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[0mflatten_results\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten_results\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 495\u001b[0;31m kms_key=self.kms_key)\n\u001b[0m\u001b[1;32m 496\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 497\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 141 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, source, test_bigquery_client, use_legacy_sql, flatten_results, kms_key)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecuting_project\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtest_bigquery_client\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m raise RuntimeError(\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0;34m'Missing executing project information. 
Please use the --project '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m 'command line option to specify it.')\n\u001b[1;32m 866\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrow_as_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRowAsDictJsonCoder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 142 | "\u001b[0;31mRuntimeError\u001b[0m: Missing executing project information. Please use the --project command line option to specify it." 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "import apache_beam as beam\n", 148 | "import argparse\n", 149 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 150 | "\n", 151 | "parser = argparse.ArgumentParser()\n", 152 | "known_args, pipeline_args = parser.parse_known_args(None)\n", 153 | "pipeline_options = PipelineOptions(pipeline_args)\n", 154 | "\n", 155 | "class ApplyDoFn(beam.DoFn):\n", 156 | " def process(self, element):\n", 157 | " print(element)\n", 158 | "\n", 159 | "\n", 160 | "query = \"\"\"\n", 161 | "select *\n", 162 | "from `bigquery-public-data.samples.natality`\n", 163 | "order by rand()\n", 164 | "limit 100\n", 165 | "\"\"\"\n", 166 | "\n", 167 | "# define the pipeline steps\n", 168 | "p = beam.Pipeline(options=pipeline_options)\n", 169 | "data = p | 'Read from BigQuery' >> beam.io.Read(\n", 170 | " beam.io.BigQuerySource(query=query, use_standard_sql=True))\n", 171 | "scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())\n", 172 | "\n", 173 | "# run the pipeline\n", 174 | "result = p.run()\n", 175 | "result.wait_until_finish()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Query Natality" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 16, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/html": [ 193 | "
\n", 194 | "\n", 207 | "\n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
yearpluralityapgar_5minmother_agefather_agegestation_weeksever_bornmother_marriedweight
019700.00.0374638.0817.625790
119711.00.0434738.01217.438397
219721.00.0464841.01318.437091
319721.00.0383499.01017.374463
419731.00.0424999.01015.813590
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " year plurality apgar_5min mother_age father_age gestation_weeks \\\n", 289 | "0 1970 0.0 0.0 37 46 38.0 \n", 290 | "1 1971 1.0 0.0 43 47 38.0 \n", 291 | "2 1972 1.0 0.0 46 48 41.0 \n", 292 | "3 1972 1.0 0.0 38 34 99.0 \n", 293 | "4 1973 1.0 0.0 42 49 99.0 \n", 294 | "\n", 295 | " ever_born mother_married weight \n", 296 | "0 8 1 7.625790 \n", 297 | "1 12 1 7.438397 \n", 298 | "2 13 1 8.437091 \n", 299 | "3 10 1 7.374463 \n", 300 | "4 10 1 5.813590 " 301 | ] 302 | }, 303 | "execution_count": 16, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "from google.cloud import bigquery\n", 310 | "client = bigquery.Client()\n", 311 | "\n", 312 | "sql = \"\"\"\n", 313 | "SELECT year, plurality, apgar_5min, \n", 314 | " mother_age, father_age, \n", 315 | " gestation_weeks, ever_born\n", 316 | " ,case when mother_married = true \n", 317 | " then 1 else 0 end as mother_married\n", 318 | " ,weight_pounds as weight\n", 319 | " FROM `bigquery-public-data.samples.natality`\n", 320 | " limit 10000\n", 321 | "\"\"\"\n", 322 | "\n", 323 | "natalityDF = client.query(sql).to_dataframe().fillna(0)\n", 324 | "natalityDF.head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Train and Save Model" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 20, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from sklearn.linear_model import LinearRegression\n", 341 | "import pickle\n", 342 | "from google.cloud import storage\n", 343 | "\n", 344 | "# fit and pickle a model \n", 345 | "model = LinearRegression()\n", 346 | "model.fit(natalityDF.iloc[:,1:8], natalityDF['weight'])\n", 347 | "pickle.dump(model, open(\"natality.pkl\", 'wb'))\n", 348 | "\n", 349 | "# Save to GCS\n", 350 | "bucket = storage.Client().get_bucket('dsp_model_store')\n", 351 | "blob = bucket.blob('natality/sklearn-linear')\n", 352 | "blob.upload_from_filename('natality.pkl')\n" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "## Test Model Loading " 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 12, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" 371 | ] 372 | }, 373 | "execution_count": 12, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "\n", 380 | "from google.cloud import storage\n", 381 | "import pickle \n", 382 | "\n", 383 | "bucket = storage.Client().get_bucket('dsp_model_store')\n", 384 | "blob = bucket.get_blob('natality/sklearn-linear')\n", 385 | "model = pickle.loads(blob.download_as_string())\n", 386 | "model\n" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "## Prediction Pipeline " 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 26, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "name": "stderr", 403 | "output_type": "stream", 404 | "text": [ 405 | "/home/ec2-user/.local/lib/python3.7/site-packages/ipykernel_launcher.py:45: BeamDeprecationWarning: parse_table_schema_from_json is deprecated since 2.11.0. 
Use bigquery_tools.parse_table_schema_from_json instead.\n", 406 | "/home/ec2-user/.local/lib/python3.7/site-packages/ipykernel_launcher.py:73: BeamDeprecationWarning: BigQuerySink is deprecated since 2.11.0. Use WriteToBigQuery instead.\n", 407 | "ERROR:root:Exception at bundle , due to an exception.\n", 408 | " Traceback (most recent call last):\n", 409 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 410 | " finish_state)\n", 411 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 412 | " result = evaluator.finish_bundle()\n", 413 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 414 | " with self._source.reader() as reader:\n", 415 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 416 | " kms_key=self.kms_key)\n", 417 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 418 | " 'Missing executing project information. Please use the --project '\n", 419 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 420 | "\n", 421 | "ERROR:root:Exception at bundle , due to an exception.\n", 422 | " Traceback (most recent call last):\n", 423 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 424 | " finish_state)\n", 425 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 426 | " result = evaluator.finish_bundle()\n", 427 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 428 | " with self._source.reader() as reader:\n", 429 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 430 | " kms_key=self.kms_key)\n", 431 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 432 | " 'Missing executing project information. Please use the --project '\n", 433 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 434 | "\n", 435 | "ERROR:root:Exception at bundle , due to an exception.\n", 436 | " Traceback (most recent call last):\n", 437 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 438 | " finish_state)\n", 439 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 440 | " result = evaluator.finish_bundle()\n", 441 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 442 | " with self._source.reader() as reader:\n", 443 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 444 | " kms_key=self.kms_key)\n", 445 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 446 | " 'Missing executing project information. 
Please use the --project '\n", 447 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 448 | "\n", 449 | "ERROR:root:Exception at bundle , due to an exception.\n", 450 | " Traceback (most recent call last):\n", 451 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 343, in call\n", 452 | " finish_state)\n", 453 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\", line 383, in attempt_call\n", 454 | " result = evaluator.finish_bundle()\n", 455 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\", line 319, in finish_bundle\n", 456 | " with self._source.reader() as reader:\n", 457 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\", line 495, in reader\n", 458 | " kms_key=self.kms_key)\n", 459 | " File \"/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\", line 864, in __init__\n", 460 | " 'Missing executing project information. Please use the --project '\n", 461 | "RuntimeError: Missing executing project information. Please use the --project command line option to specify it.\n", 462 | "\n", 463 | "ERROR:root:Giving up after 4 attempts.\n", 464 | "WARNING:root:A task failed with exception: Missing executing project information. Please use the --project command line option to specify it.\n" 465 | ] 466 | }, 467 | { 468 | "ename": "RuntimeError", 469 | "evalue": "Missing executing project information. Please use the --project command line option to specify it.", 470 | "output_type": "error", 471 | "traceback": [ 472 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 473 | "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", 474 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;31m# run the pipeline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_until_finish\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 475 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/direct_runner.py\u001b[0m in \u001b[0;36mwait_until_finish\u001b[0;34m(self, duration)\u001b[0m\n\u001b[1;32m 429\u001b[0m 'DirectRunner does not support duration argument.')\n\u001b[1;32m 430\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 431\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPipelineState\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDONE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m 
\u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint: disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 476 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mawait_completion\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 398\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 400\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawait_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 401\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mshutdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 477 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mawait_completion\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 444\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 446\u001b[0;31m \u001b[0mraise_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 447\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecutor_service\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshutdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 478 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/future/utils/__init__.py\u001b[0m in \u001b[0;36mraise_\u001b[0;34m(tp, value, tb)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0mexc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 413\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 414\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 479 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in 
\u001b[0;36mcall\u001b[0;34m(self, state_sampler)\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mstart_state\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0mprocess_state\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 343\u001b[0;31m finish_state)\n\u001b[0m\u001b[1;32m 344\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 480 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py\u001b[0m in \u001b[0;36mattempt_call\u001b[0;34m(self, metrics_container, side_input_values, start_state, process_state, finish_state)\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mfinish_state\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 383\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mevaluator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfinish_bundle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 384\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogical_metric_updates\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmetrics_container\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_cumulative\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 481 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py\u001b[0m in \u001b[0;36mfinish_bundle\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0mbundles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_values_to_bundles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_source\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0mbundles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_values_to_bundles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 482 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py\u001b[0m in \u001b[0;36mreader\u001b[0;34m(self, test_bigquery_client)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[0muse_legacy_sql\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_legacy_sql\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 494\u001b[0m 
\u001b[0mflatten_results\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflatten_results\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 495\u001b[0;31m kms_key=self.kms_key)\n\u001b[0m\u001b[1;32m 496\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 497\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 483 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, source, test_bigquery_client, use_legacy_sql, flatten_results, kms_key)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecuting_project\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtest_bigquery_client\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m raise RuntimeError(\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0;34m'Missing executing project information. Please use the --project '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m 'command line option to specify it.')\n\u001b[1;32m 866\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrow_as_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRowAsDictJsonCoder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 484 | "\u001b[0;31mRuntimeError\u001b[0m: Missing executing project information. Please use the --project command line option to specify it." 
485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "import apache_beam as beam\n", 490 | "import argparse\n", 491 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 492 | "from apache_beam.options.pipeline_options import SetupOptions\n", 493 | "from apache_beam.io.gcp.bigquery import parse_table_schema_from_json\n", 494 | "import json\n", 495 | "\n", 496 | "query = \"\"\"\n", 497 | " SELECT year, plurality, apgar_5min, \n", 498 | " mother_age, father_age, \n", 499 | " gestation_weeks, ever_born\n", 500 | " ,case when mother_married = true \n", 501 | " then 1 else 0 end as mother_married\n", 502 | " ,weight_pounds as weight\n", 503 | " ,current_timestamp as time\n", 504 | " ,GENERATE_UUID() as guid\n", 505 | " FROM `bigquery-public-data.samples.natality` \n", 506 | " limit 100 \n", 507 | "\"\"\"\n", 508 | "\n", 509 | "class ApplyDoFn(beam.DoFn):\n", 510 | "\n", 511 | " def __init__(self):\n", 512 | " self._model = None\n", 513 | " from google.cloud import storage\n", 514 | " import pandas as pd\n", 515 | " import pickle as pkl\n", 516 | " self._storage = storage\n", 517 | " self._pkl = pkl\n", 518 | " self._pd = pd\n", 519 | " \n", 520 | " def process(self, element):\n", 521 | " if self._model is None:\n", 522 | " bucket = self._storage.Client().get_bucket('dsp_model_store')\n", 523 | " blob = bucket.get_blob('natality/sklearn-linear')\n", 524 | " self._model = self._pkl.loads(blob.download_as_string())\n", 525 | " \n", 526 | " new_x = self._pd.DataFrame.from_dict(element, orient = \"index\").transpose().fillna(0) \n", 527 | " weight = self._model.predict(new_x.iloc[:,1:8])[0]\n", 528 | " return [ { 'guid': element['guid'], 'weight': weight, 'time': str(element['time']) } ]\n", 529 | "\n", 530 | "schema = parse_table_schema_from_json(json.dumps({'fields':\n", 531 | " [ { 'name': 'guid', 'type': 'STRING'},\n", 532 | " { 'name': 'weight', 'type': 'FLOAT64'},\n", 533 | " { 'name': 'time', 'type': 'STRING'} ]}))\n", 534 | "\n", 535 | "class PublishDoFn(beam.DoFn):\n", 536 | " \n", 537 | " def __init__(self):\n", 538 | " from google.cloud import datastore \n", 539 | " self._ds = datastore\n", 540 | " \n", 541 | " def process(self, element):\n", 542 | " client = self._ds.Client()\n", 543 | " key = client.key('natality-guid', element['guid'])\n", 544 | " entity = self._ds.Entity(key)\n", 545 | " entity['weight'] = element['weight'] \n", 546 | " entity['time'] = element['time']\n", 547 | " client.put(entity)\n", 548 | "\n", 549 | "parser = argparse.ArgumentParser()\n", 550 | "known_args, pipeline_args = parser.parse_known_args(None)\n", 551 | "pipeline_options = PipelineOptions(pipeline_args)\n", 552 | "\n", 553 | "# define the pipeline steps\n", 554 | "p = beam.Pipeline(options=pipeline_options)\n", 555 | "data = p | 'Read from BigQuery' >> beam.io.Read(\n", 556 | " beam.io.BigQuerySource(query=query, use_standard_sql=True))\n", 557 | "scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())\n", 558 | "scored | 'Save to BigQuery' >> beam.io.Write(beam.io.BigQuerySink(\n", 559 | " 'weight_preds', 'dsp_demo', schema = schema,\n", 560 | " create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,\n", 561 | " write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))\n", 562 | "\n", 563 | "scored | 'Create entities' >> beam.ParDo(PublishDoFn())\n", 564 | "\n", 565 | "# run the pipeline\n", 566 | "result = p.run()\n", 567 | "result.wait_until_finish()\n" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "## Read from 
Datastore" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 25, 580 | "metadata": {}, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | "output_type": "stream", 585 | "text": [ 586 | "\n" 587 | ] 588 | } 589 | ], 590 | "source": [ 591 | "\n", 592 | "from google.cloud import datastore\n", 593 | "client = datastore.Client()\n", 594 | "query = client.query(kind='natality-guid')\n", 595 | "\n", 596 | "query_iter = query.fetch()\n", 597 | "for entity in query_iter:\n", 598 | " print(entity)\n", 599 | " break\n", 600 | "\n" 601 | ] 602 | } 603 | ], 604 | "metadata": { 605 | "kernelspec": { 606 | "display_name": "Python 3", 607 | "language": "python", 608 | "name": "python3" 609 | }, 610 | "language_info": { 611 | "codemirror_mode": { 612 | "name": "ipython", 613 | "version": 3 614 | }, 615 | "file_extension": ".py", 616 | "mimetype": "text/x-python", 617 | "name": "python", 618 | "nbconvert_exporter": "python", 619 | "pygments_lexer": "ipython3", 620 | "version": "3.7.4" 621 | } 622 | }, 623 | "nbformat": 4, 624 | "nbformat_minor": 2 625 | } 626 | -------------------------------------------------------------------------------- /DSP-Ch8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spark Streaming" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Kafka Producer" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from kafka import KafkaProducer\n", 24 | "from json import dumps\n", 25 | "import time\n", 26 | " \n", 27 | "producer = KafkaProducer(bootstrap_servers=['localhost:9092'],\n", 28 | " value_serializer=lambda x: dumps(x).encode('utf-8'))\n", 29 | " \n", 30 | "data = {'hello' : 'world', 'time': time.time()}\n", 31 | "producer.send('dsp', data)\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Kafka Consumer" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from kafka import KafkaConsumer\n", 48 | "from json import loads\n", 49 | " \n", 50 | "consumer = KafkaConsumer('dsp',\n", 51 | " bootstrap_servers=['localhost:9092'],\n", 52 | " value_deserializer=lambda x: loads(x.decode('utf-8')))\n", 53 | " \n", 54 | "for x in consumer:\n", 55 | " print(x.value)\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Model Producer" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 1, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "RecordMetadata(topic='dsp', partition=0, topic_partition=TopicPartition(topic='dsp', partition=0), offset=109, timestamp=1576709681368, checksum=None, serialized_key_size=-1, serialized_value_size=142, serialized_header_size=-1)" 74 | ] 75 | }, 76 | "execution_count": 1, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "\n", 83 | "from kafka import KafkaProducer\n", 84 | "from json import dumps\n", 85 | "import time\n", 86 | "import uuid\n", 87 | "\n", 88 | "producer = KafkaProducer(bootstrap_servers=['54.166.148.190:9092'],\n", 89 | " value_serializer=lambda x: dumps(x).encode('utf-8'))\n", 90 | "\n", 91 | "data = { 'G1': 1, 'G2': 0, 'G3': 0, 'G4': 0, 'G5': 0, \n", 92 | " 
'G6': 0, 'G7': 0, 'G8': 0, 'G9': 0, 'G10': 0, \n", 93 | " 'User_ID': str(uuid.uuid1())}\n", 94 | "result = producer.send('dsp', data)\n", 95 | "result.get()\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Streaming Pipeline " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from pyspark.sql.functions import udf\n", 112 | "from pyspark.sql.types import StringType\n", 113 | "import json \n", 114 | "import pandas as pd\n", 115 | "from sklearn.linear_model import LogisticRegression\n", 116 | "\n", 117 | "# build a logistic regression model \n", 118 | "gamesDF = pd.read_csv(\"https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv\")\n", 119 | "model = LogisticRegression() \n", 120 | "model.fit(gamesDF.iloc[:,0:10], gamesDF['label'])\n", 121 | "\n", 122 | "# read from Kafka \n", 123 | "df = spark.readStream.format(\"kafka\") \\\n", 124 | " .option(\"kafka.bootstrap.servers\", \"54.166.148.190:9092\") \\\n", 125 | " .option(\"subscribe\", \"dsp\").load()\n", 126 | "\n", 127 | "# define the UDF for scoring users \n", 128 | "def score(row):\n", 129 | " d = json.loads(row)\n", 130 | " p = pd.DataFrame.from_dict(d, orient = \"index\").transpose() \n", 131 | " pred = model.predict_proba(p.iloc[:,0:10])[0][0]\n", 132 | " result = {'User_ID': d['User_ID'], 'pred': pred }\n", 133 | " return str(json.dumps(result))\n", 134 | " \n", 135 | "# select the value field and apply the UDF \n", 136 | "df = df.selectExpr(\"CAST(value AS STRING)\")\n", 137 | "score_udf = udf(score, StringType()) \n", 138 | "df = df.select( score_udf(\"value\").alias(\"value\"))\n", 139 | "\n", 140 | "# Write results to Kafka \n", 141 | "query = df.writeStream.format(\"kafka\") \\\n", 142 | " .option(\"kafka.bootstrap.servers\", \"54.166.148.190:9092\") \\\n", 143 | " .option(\"topic\", \"preds\") \\\n", 144 | " .option(\"checkpointLocation\", \"/temp\").start()\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Model Consumer" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from kafka import KafkaConsumer\n", 161 | "from json import loads\n", 162 | "\n", 163 | "consumer = KafkaConsumer('preds',\n", 164 | " bootstrap_servers=['54.166.148.190:9092'],\n", 165 | " value_deserializer=lambda x: loads(x))\n", 166 | "\n", 167 | "for x in consumer:\n", 168 | " print(x.value)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "# Dataflow Streaming" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## PubSub Consumer" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import time\n", 192 | "from google.cloud import pubsub_v1\n", 193 | "\n", 194 | "subscriber = pubsub_v1.SubscriberClient()\n", 195 | "subscription_path = subscriber.subscription_path(\"gameanalytics-199018\", \"dsp\")\n", 196 | "\n", 197 | "def callback(message):\n", 198 | " print(message.data)\n", 199 | " message.ack()\n", 200 | "\n", 201 | "subscriber.subscribe(subscription_path, callback=callback)\n", 202 | "\n", 203 | "while True:\n", 204 | " time.sleep(10)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## PubSub 
Producer" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "from google.cloud import pubsub_v1\n", 221 | "\n", 222 | "publisher = pubsub_v1.PublisherClient()\n", 223 | "topic_path = publisher.topic_path(\"gameanalytics-199018\", \"natality\")\n", 224 | "\n", 225 | "data = \"Hello World!\".encode('utf-8')\n", 226 | "publisher.publish(topic_path, data=data)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Streaming Pipeline" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "import apache_beam as beam\n", 243 | "import argparse\n", 244 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 245 | "from apache_beam.io.gcp.bigquery import parse_table_schema_from_json\n", 246 | "import json\n", 247 | "\n", 248 | "class ApplyDoFn(beam.DoFn):\n", 249 | "\n", 250 | " def __init__(self):\n", 251 | " self._model = None\n", 252 | " from google.cloud import storage\n", 253 | " import pandas as pd\n", 254 | " import pickle as pkl\n", 255 | " import json as js\n", 256 | " self._storage = storage\n", 257 | " self._pkl = pkl\n", 258 | " self._pd = pd\n", 259 | " self._json = js\n", 260 | " \n", 261 | " def process(self, element):\n", 262 | " if self._model is None:\n", 263 | " bucket = self._storage.Client().get_bucket(\n", 264 | " 'dsp_model_store')\n", 265 | " blob = bucket.get_blob('natality/sklearn-linear')\n", 266 | " self._model =self._pkl.loads(blob.download_as_string())\n", 267 | " \n", 268 | " element = self._json.loads(element.decode('utf-8'))\n", 269 | " new_x = self._pd.DataFrame.from_dict(element, \n", 270 | " orient = \"index\").transpose().fillna(0) \n", 271 | " weight = self._model.predict(new_x.iloc[:,1:8])[0]\n", 272 | " return [ { 'guid': element['guid'], 'weight': weight, \n", 273 | " 'time': str(element['time']) } ]\n", 274 | " \n", 275 | "class PublishDoFn(beam.DoFn):\n", 276 | " \n", 277 | " def __init__(self):\n", 278 | " from google.cloud import datastore \n", 279 | " self._ds = datastore\n", 280 | " \n", 281 | " def process(self, element):\n", 282 | " client = self._ds.Client()\n", 283 | " key = client.key('natality-guid', element['guid'])\n", 284 | " entity = self._ds.Entity(key)\n", 285 | " entity['weight'] = element['weight'] \n", 286 | " entity['time'] = element['time']\n", 287 | " client.put(entity)\n", 288 | " \n", 289 | "# set up pipeline parameters \n", 290 | "parser = argparse.ArgumentParser()\n", 291 | "known_args, pipeline_args = parser.parse_known_args(None)\n", 292 | "pipeline_options = PipelineOptions(pipeline_args)\n", 293 | "\n", 294 | "# define the topics \n", 295 | "topic = \"projects/{project}/topics/{topic}\"\n", 296 | "topic = topic.format(project = \"gameanalytics-199018\", topic = \"natality\")\n", 297 | "\n", 298 | "schema = parse_table_schema_from_json(json.dumps({'fields':\n", 299 | " [ { 'name': 'guid', 'type': 'STRING'},\n", 300 | " { 'name': 'weight', 'type': 'FLOAT64'},\n", 301 | " { 'name': 'time', 'type': 'STRING'} ]}))\n", 302 | "\n", 303 | "# define the pipeline steps \n", 304 | "p = beam.Pipeline(options=pipeline_options)\n", 305 | "lines = p | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)\n", 306 | "scored = lines | 'apply' >> beam.ParDo(ApplyDoFn())\n", 307 | "scored | 'Create entities' >> beam.ParDo(PublishDoFn())\n", 308 | "\n", 309 | "# run the pipeline \n", 310 | 
"result = p.run()\n", 311 | "result.wait_until_finish()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## Streaming Producer" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 2, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "" 330 | ] 331 | }, 332 | "execution_count": 2, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "\n", 339 | "import json\n", 340 | "from google.cloud import pubsub_v1\n", 341 | "import time \n", 342 | "\n", 343 | "publisher = pubsub_v1.PublisherClient()\n", 344 | "topic_path = publisher.topic_path(\"gameanalytics-199018\", \"natality\")\n", 345 | "\n", 346 | "data = json.dumps({'year': 2001, 'plurality': 1, 'apgar_5min': 99, 'mother_age': 33, \n", 347 | " 'father_age': 40, 'gestation_weeks': 38, 'ever_born': 8, \n", 348 | " 'mother_married': 1, 'weight': 6.8122838958, \n", 349 | " 'time': str(time.time()), \n", 350 | " 'guid': 'b281c5e8-85b2-4cbd-a2d8-e501ca816363'}\n", 351 | ").encode('utf-8') \n", 352 | "\n", 353 | "publisher.publish(topic_path, data=data)" 354 | ] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 3", 360 | "language": "python", 361 | "name": "python3" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 3 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython3", 373 | "version": "3.7.4" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 2 378 | } 379 | -------------------------------------------------------------------------------- /DSP_CH3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named 'google.appengine'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m#import webapp2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappengine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapi\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mapp_identity\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 16 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'google.appengine'" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "\n", 22 | "\n", 23 | "# CLOUD STORAGE\n", 24 | "# https://cloud.google.com/appengine/docs/standard/python/googlecloudstorageclient/read-write-to-cloud-storage\n", 25 | "\n", 26 | "#import logging\n", 27 | "#import os\n", 28 | "#import cloudstorage as gcs\n", 29 | "#import webapp2\n", 30 | "\n", 31 | "#from google.appengine.api import app_identity\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 
| "execution_count": 1, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "{'response': 'Hello from Cloud Function', 'success': True}\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "\n", 56 | "import requests\n", 57 | "\n", 58 | "result = requests.post(\"https://us-central1-gameanalytics-199018.cloudfunctions.net/echo\", json = { 'msg': 'Hello from Cloud Function' })\n", 59 | "print(result.json())\n", 60 | "\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "{'response': 'Hello from Auth', 'success': True}\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "\n", 78 | "\n", 79 | "import requests\n", 80 | "\n", 81 | "result = requests.post(\"https://us-central1-gameanalytics-199018.cloudfunctions.net/auth\", json = { 'msg': 'Hello from Auth' })\n", 82 | "print(result.json())\n", 83 | "\n", 84 | "\n", 85 | "\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "ename": "JSONDecodeError", 95 | "evalue": "Expecting value: line 2 column 1 (char 1)", 96 | "output_type": "error", 97 | "traceback": [ 98 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 99 | "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", 100 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m result = requests.post(\"https://us-central1-gameanalytics-199018.cloudfunctions.net/predict\", \\\n\u001b[1;32m 4\u001b[0m json = { 'G1': '1', 'G2': '0', 'G3': '0', 'G4': '0', 'G5': '0', 'G6': '0', 'G7': '0', 'G8': '0', 'G9': '0', 'G10': '0' })\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 101 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 895\u001b[0m \u001b[0;31m# used.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 896\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 897\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcomplexjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 898\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 102 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/simplejson/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, use_decimal, **kw)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[0mparse_constant\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mobject_pairs_hook\u001b[0m 
\u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m and not use_decimal and not kw):\n\u001b[0;32m--> 518\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 519\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 103 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/simplejson/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w, _PY3)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_PY3\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 370\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 371\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 104 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/simplejson/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx, _w, _PY3)\u001b[0m\n\u001b[1;32m 398\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mord0\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0xef\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0midx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'\\xef\\xbb\\xbf'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 400\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 105 | "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 2 column 1 (char 1)" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "\n", 111 | "import requests\n", 112 | "\n", 113 | "result = requests.post(\"https://us-central1-gameanalytics-199018.cloudfunctions.net/predict\", \\\n", 114 | " json = { 'G1': '1', 'G2': '0', 'G3': '0', 'G4': '0', 'G5': '0', 'G6': '0', 'G7': '0', 'G8': '0', 'G9': '0', 'G10': '0' })\n", 115 | "print(result.json())\n", 116 | "\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Prediction 0.06745113592634559\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "\n", 142 | "import requests\n", 143 | "\n", 144 | "result = requests.post(\"https://3z5btf0ucb.execute-api.us-east-1.amazonaws.com/default/logit\", \\\n", 145 | " json = { 'G1': '1', 'G2': '0', 'G3': '0', 'G4': '0', 'G5': '0', 'G6': '0', 'G7': '0', 'G8': '0', 'G9': '0', 'G10': '0' })\n", 146 | "print(result.text)\n", 147 | "\n", 148 | "\n", 149 | "\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "ben-df-test\n", 169 | "dataproc-de8793b2-5dc6-4500-9aff-cfd3d4b60aa8-us\n", 170 | "dsp_model_store\n", 171 | "dsp_model_store_1\n", 172 | "dsp_pmodel_store\n", 173 | "gameanalytics-199018.appspot.com\n", 174 | "staging.gameanalytics-199018.appspot.com\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "from google.cloud import storage\n", 180 | "bucket_name = \"dsp_model_store_1\"\n", 181 | "\n", 182 | "storage_client = storage.Client()\n", 183 | "#storage_client.create_bucket(bucket_name)\n", 184 | "\n", 185 | "for bucket in storage_client.list_buckets():\n", 186 | " print(bucket.name)\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "\n", 196 | "# save to GCS\n", 197 | "bucket = storage_client.get_bucket(bucket_name)\n", 198 | "blob = bucket.blob(\"serverless/keras/v1\")\n", 199 | "blob.upload_from_filename(\"logit.pkl\")\n", 200 | "blob.upload_from_filename(\"games.h5\")\n", 201 | "\n" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 213 | " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", 214 | " multi_class='warn', n_jobs=None, penalty='l2',\n", 215 | " random_state=None, solver='warn', tol=0.0001, verbose=0,\n", 216 | " warm_start=False)" 217 | ] 218 | }, 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "\n", 226 | "# load from GCS\n", 227 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 228 | 
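"# note: the blob name is the object key within the bucket, not a local path;\n", "# a model can also be unpickled straight from memory, e.g.\n", "#   model = pk.loads(blob.download_as_string())\n",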
"blob.download_to_filename(\"/tmp/local_logit.pkl\")\n", 229 | "\n", 230 | "import pickle as pk\n", 231 | "model = pk.load(open(\"/tmp/local_logit.pkl\", 'rb'))\n", 232 | "model\n" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 3, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "from google.cloud import storage\n", 249 | "\n", 250 | "storage_client = storage.Client()\n", 251 | "bucket_name = \"dsp_model_store_1\"\n", 252 | "\n", 253 | "storage_client = storage.Client()\n", 254 | "bucket = storage_client.get_bucket(bucket_name)\n", 255 | "\n", 256 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 257 | "blob.download_to_filename(\"/tmp/local_logit.pkl\")\n", 258 | "\n", 259 | "model = pickle.load(open(\"/tmp/local_logit.pkl\", 'rb'))\n", 260 | "\n", 261 | "\n" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 28, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "b'\\x80\\x03csklearn.linear_model.logistic\\nLogisticRegression\\nq\\x00)\\x81q\\x01}q\\x02(X\\x07\\x00\\x00\\x00penaltyq\\x03X\\x02\\x00\\x00\\x00l2q\\x04X\\x04\\x00\\x00\\x00dualq\\x05\\x89X\\x03\\x00\\x00\\x00tolq\\x06G?\\x1a6\\xe2\\xeb\\x1cC-X\\x01\\x00\\x00\\x00Cq\\x07G?\\xf0\\x00\\x00\\x00\\x00\\x00\\x00X\\r\\x00\\x00\\x00fit_interceptq\\x08\\x88X\\x11\\x00\\x00\\x00intercept_scalingq\\tK\\x01X\\x0c\\x00\\x00\\x00class_weightq\\nNX\\x0c\\x00\\x00\\x00random_stateq\\x0bNX\\x06\\x00\\x00\\x00solverq\\x0cX\\x04\\x00\\x00\\x00warnq\\rX\\x08\\x00\\x00\\x00max_iterq\\x0eKdX\\x0b\\x00\\x00\\x00multi_classq\\x0fh\\rX\\x07\\x00\\x00\\x00verboseq\\x10K\\x00X\\n\\x00\\x00\\x00warm_startq\\x11\\x89X\\x06\\x00\\x00\\x00n_jobsq\\x12NX\\x08\\x00\\x00\\x00l1_ratioq\\x13NX\\x08\\x00\\x00\\x00classes_q\\x14cnumpy.core.multiarray\\n_reconstruct\\nq\\x15cnumpy\\nndarray\\nq\\x16K\\x00\\x85q\\x17C\\x01bq\\x18\\x87q\\x19Rq\\x1a(K\\x01K\\x02\\x85q\\x1bcnumpy\\ndtype\\nq\\x1cX\\x02\\x00\\x00\\x00i8q\\x1dK\\x00K\\x01\\x87q\\x1eRq\\x1f(K\\x03X\\x01\\x00\\x00\\x00b\\x89C\\x04\\x05\\x00\\x00\\x00q?tq@bX\\x10\\x00\\x00\\x00_sklearn_versionqAX\\x06\\x00\\x00\\x000.21.3qBub.'" 280 | ] 281 | }, 282 | "execution_count": 28, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "from google.cloud import storage\n", 289 | "\n", 290 | "storage_client = storage.Client()\n", 291 | "bucket_name = \"dsp_model_store_1\"\n", 292 | "\n", 293 | "storage_client = storage.Client()\n", 294 | "bucket = storage_client.get_bucket(bucket_name)\n", 295 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 296 | "\n", 297 | "contents = blob.download_as_string()\n", 298 | "contents" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 24, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "" 310 | ] 311 | }, 312 | "execution_count": 24, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "blob" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 
| "outputs": [], 333 | "source": [] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "from google.cloud import storage\n", 356 | "\n", 357 | "storage_client = storage.Client()\n", 358 | "\n", 359 | "def hello_gcs_generic(data, context):\n", 360 | " bucket = storage_client.get_bucket(data['bucket'])\n", 361 | " blob = bucket.blob(data['name'])\n", 362 | " contents = blob.download_as_string()\n", 363 | " # Process the file contents, etc..." 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 10, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "import cloudstorage as gcs" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 11, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "ename": "AttributeError", 389 | "evalue": "module 'cloudstorage' has no attribute 'open'", 390 | "output_type": "error", 391 | "traceback": [ 392 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 393 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 394 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mfilenamee\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"gs://dsp_model_store_1/serverless/logit/v1/local_logit.pkl\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mgcs_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mcontents\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcs_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mgcs_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 395 | "\u001b[0;31mAttributeError\u001b[0m: module 'cloudstorage' has no attribute 'open'" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "filenamee = \"gs://dsp_model_store_1/serverless/logit/v1/local_logit.pkl\"\n", 401 | "\n", 402 | "gcs_file = gcs.open(filename)\n", 403 | "contents = gcs_file.read()\n", 404 | "gcs_file.close()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 1, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "from google.cloud import storage\n", 421 | "bucket_name = \"dsp_model_store_1\"\n", 422 | "\n", 423 | "storage_client = storage.Client()\n", 424 | "bucket = storage_client.get_bucket(bucket_name)\n", 425 | "\n", 426 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 427 | 
"blob.download_to_filename(\"local_logit.pkl\")\n", 428 | "\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "import pickle \n", 438 | "\n", 439 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 440 | "#blob.download_to_filename(\"local_logit.pkl\")\n", 441 | "#model = pickle.load(open(\"local_logit.pkl\", 'rb'))\n", 442 | "\n", 443 | "sm = blob.download_as_string(\"local_logit.pkl\")\n", 444 | "#model = pickle.load(sm)\n", 445 | "\n", 446 | "\n", 447 | "#model\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 2, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "from google.resumable_media.requests import Download\n", 464 | "\n" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "\n", 474 | "download = Download(\n", 475 | " download_url, stream=file_obj, headers=headers, start=start, end=end\n", 476 | " )\n", 477 | " download.consume(transport)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 30, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "ename": "AttributeError", 501 | "evalue": "'str' object has no attribute '_http'", 502 | "output_type": "error", 503 | "traceback": [ 504 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 505 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 506 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m#model = pickle.load(open(\"local_logit.pkl\", 'rb'))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0msm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mblob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_as_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"local_logit.pkl\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;31m#model = pickle.load(sm)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 507 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/google/cloud/storage/blob.py\u001b[0m in \u001b[0;36mdownload_as_string\u001b[0;34m(self, client, start, end)\u001b[0m\n\u001b[1;32m 705\u001b[0m \"\"\"\n\u001b[1;32m 706\u001b[0m \u001b[0mstring_buffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBytesIO\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 707\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_to_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 708\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mstring_buffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetvalue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 508 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/google/cloud/storage/blob.py\u001b[0m in \u001b[0;36mdownload_to_file\u001b[0;34m(self, file_obj, client, start, end)\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"accept-encoding\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"gzip\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 643\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 644\u001b[0;31m \u001b[0mtransport\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_transport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 645\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_download\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_url\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 509 | "\u001b[0;32m~/.local/lib/python3.7/site-packages/google/cloud/storage/blob.py\u001b[0m in \u001b[0;36m_get_transport\u001b[0;34m(self, client)\u001b[0m\n\u001b[1;32m 525\u001b[0m \"\"\"\n\u001b[1;32m 526\u001b[0m \u001b[0mclient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_require_client\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 527\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_http\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_download_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 510 | "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute '_http'" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "import pickle \n", 516 | "\n", 517 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 518 | "#blob.download_to_filename(\"local_logit.pkl\")\n", 519 | "#model = pickle.load(open(\"local_logit.pkl\", 'rb'))\n", 520 | "\n", 521 | "sm = blob.download_as_string(\"local_logit.pkl\")\n", 522 | "#model = pickle.load(sm)\n", 523 | "\n", 524 | "\n", 525 | "#model\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 25, 538 
| "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "execution_count": 25, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "blob = bucket.blob(\"serverless/logit/v1\")\n", 553 | "blob" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "storage_client = storage.Client()\n", 584 | " bucket = storage_client.get_bucket(bucket_name)\n", 585 | " blob = bucket.blob(source_blob_name)\n", 586 | "\n", 587 | " blob.download_to_filename(destination_file_name)\n", 588 | "\n", 589 | " print('Blob {} downloaded to {}.'.format(\n", 590 | " source_blob_name,\n", 591 | " destination_file_name))" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [] 600 | } 601 | ], 602 | "metadata": { 603 | "kernelspec": { 604 | "display_name": "Python 3", 605 | "language": "python", 606 | "name": "python3" 607 | }, 608 | "language_info": { 609 | "codemirror_mode": { 610 | "name": "ipython", 611 | "version": 3 612 | }, 613 | "file_extension": ".py", 614 | "mimetype": "text/x-python", 615 | "name": "python", 616 | "nbconvert_exporter": "python", 617 | "pygments_lexer": "ipython3", 618 | "version": "3.7.3" 619 | } 620 | }, 621 | "nbformat": 4, 622 | "nbformat_minor": 2 623 | } 624 | -------------------------------------------------------------------------------- /DSP_CH5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/ec2-user/.local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", 13 | "  FutureWarning)\n", 14 | "1it [00:03, 3.35s/it]\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from google.oauth2 import service_account\n", 22 | "from sklearn.linear_model import LogisticRegression\n", 23 | "from datetime import datetime\n", 24 | "import pandas_gbq\n", 25 | "\n", 26 | "# fetch the data set and add IDs \n", 27 | "gamesDF = pd.read_csv(\"https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv\")\n", 28 | "gamesDF['User_ID'] = gamesDF.index \n", 29 | "gamesDF['New_User'] = np.floor(np.random.randint(0, 10, gamesDF.shape[0])/9)\n", 30 | "\n", 31 | "# train and test groups \n", 32 | "train = gamesDF[gamesDF['New_User'] == 0]\n", 33 | "x_train = train.iloc[:,0:10]\n", 34 | "y_train = train['label']\n", 35 | "test = gamesDF[gamesDF['New_User'] == 1]\n", 36 | "x_test = test.iloc[:,0:10]\n", 37 | "\n", 38 | "# build a model\n", 39 | "model = LogisticRegression()\n", 40 | "model.fit(x_train, y_train)\n", 41 | "y_pred = model.predict_proba(x_test)[:, 1]\n", 42 | "\n", 43 | "# build a predictions data frame\n", 44 | "resultDF = pd.DataFrame({'User_ID': test['User_ID'], 'Pred': y_pred})\n", 45 | "resultDF['time'] = str(datetime.now())\n", 46 | "\n", 47 | "# save predictions to BigQuery \n", 48 | "table_id = \"dsp_demo.user_scores\"\n", 49 | "project_id = \"gameanalytics-199018\"\n", 50 | "credentials = service_account.Credentials.from_service_account_file('dsdemo.json')\n", 51 | "pandas_gbq.to_gbq(resultDF, table_id, project_id=project_id, if_exists = 'replace', credentials=credentials)\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 10, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
\n", 63 | "\n", 76 | "\n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
User_IDPredtime
027350.1121542019-11-04 01:06:02.929789
165970.2096462019-11-04 01:06:02.929789
2131590.2096462019-11-04 01:06:02.929789
3181790.2096462019-11-04 01:06:02.929789
44660.2839512019-11-04 01:06:02.929789
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " User_ID Pred time\n", 122 | "0 2735 0.112154 2019-11-04 01:06:02.929789\n", 123 | "1 6597 0.209646 2019-11-04 01:06:02.929789\n", 124 | "2 13159 0.209646 2019-11-04 01:06:02.929789\n", 125 | "3 18179 0.209646 2019-11-04 01:06:02.929789\n", 126 | "4 466 0.283951 2019-11-04 01:06:02.929789" 127 | ] 128 | }, 129 | "execution_count": 10, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "from google.oauth2 import service_account\n", 136 | "import pandas_gbq\n", 137 | "\n", 138 | "credentials = service_account.Credentials.from_service_account_file(\n", 139 | " 'dsdemo.json',\n", 140 | ")\n", 141 | "\n", 142 | "project_id = \"gameanalytics-199018\"\n", 143 | "sql = \"SELECT * FROM dsp_demo.user_scores\"\n", 144 | "df = pandas_gbq.read_gbq(sql, project_id=project_id, credentials=credentials)\n", 145 | "df.head()\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.7.4" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DS_Production -------------------------------------------------------------------------------- /Redis-py-NHL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "FakeStrictRedis,db=0>>>\n", 13 | "None\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import fakeredis\n", 19 | "import json\n", 20 | "\n", 21 | "server = fakeredis.FakeServer()\n", 22 | "redis = fakeredis.FakeStrictRedis(server=server)\n", 23 | "print(redis)\n", 24 | "\n", 25 | "userID = 12345\n", 26 | "record = redis.get(userID)\n", 27 | "print(record)\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Sessions:0\n" 40 | ] 41 | }, 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "1" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "\n", 55 | "# Create \n", 56 | "record = redis.get(userID)\n", 57 | "if record is None:\n", 58 | " profile = {\"sessions\": 0 }\n", 59 | " redis.set(userID, json.dumps(profile))\n", 60 | "\n", 61 | "# Read\n", 62 | "record = redis.get(userID)\n", 63 | "profile = json.loads(record)\n", 64 | "print(\"Sessions:\" + str(profile['sessions']))\n", 65 | "\n", 66 | "# Update\n", 67 | "profile['sessions'] += 1\n", 68 | "redis.set(userID, json.dumps(profile))\n", 69 | "\n", 70 | "# Expire/Delete \n", 71 | "redis.expire(userID, 15)\n", 72 | "redis.delete(userID)\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | 
"execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "
\n", 84 | "\n", 97 | "\n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
game_idplayer_idteam_idtimeOnIceassistsgoalsshotshitspowerPlayGoalspowerPlayAssists...faceoffTakentakeawaysgiveawaysshortHandedGoalsshortHandedAssistsblockedplusMinusevenTimeOnIceshortHandedTimeOnIcepowerPlayTimeOnIce
0201103022184674121999001300...000000-18859816
36201103022284674121993001400...00000109195420
902011030223846741211091114300...10000011023680
\n", 199 | "

3 rows × 22 columns

\n", 200 | "
" 201 | ], 202 | "text/plain": [ 203 | " game_id player_id team_id timeOnIce assists goals shots hits \\\n", 204 | "0 2011030221 8467412 1 999 0 0 1 3 \n", 205 | "36 2011030222 8467412 1 993 0 0 1 4 \n", 206 | "90 2011030223 8467412 1 1091 1 1 4 3 \n", 207 | "\n", 208 | " powerPlayGoals powerPlayAssists ... faceoffTaken takeaways giveaways \\\n", 209 | "0 0 0 ... 0 0 0 \n", 210 | "36 0 0 ... 0 0 0 \n", 211 | "90 0 0 ... 1 0 0 \n", 212 | "\n", 213 | " shortHandedGoals shortHandedAssists blocked plusMinus evenTimeOnIce \\\n", 214 | "0 0 0 0 -1 885 \n", 215 | "36 0 0 1 0 919 \n", 216 | "90 0 0 0 1 1023 \n", 217 | "\n", 218 | " shortHandedTimeOnIce powerPlayTimeOnIce \n", 219 | "0 98 16 \n", 220 | "36 54 20 \n", 221 | "90 68 0 \n", 222 | "\n", 223 | "[3 rows x 22 columns]" 224 | ] 225 | }, 226 | "execution_count": 3, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "import pandas as pd\n", 233 | "\n", 234 | "df = pd.read_csv(\"game_skater_stats.csv\")\n", 235 | "df = df[df['player_id'] == 8467412]\n", 236 | "df.head(3)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 4, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "{'playerID': 8467412, 'Game_ID': 2011030221, 'goals': 0, 'assists': 0, 'shots': 1, 'hits': 3}\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "import requests\n", 254 | "\n", 255 | "for index, row in df.iterrows():\n", 256 | " event = { \"playerID\": int(row['player_id']), \"Game_ID\": int(row['game_id']),\n", 257 | " \"goals\": int(row['goals']), \"assists\": int(row['assists']), \n", 258 | " \"shots\": int(row['shots']), \"hits\": int(row['hits']) }\n", 259 | " print(event)\n", 260 | " \n", 261 | " #requests.post(\"http://localhost:5000/update\", json = event) \n", 262 | " #requests.get(\"http://localhost:5000/score?player=8467412\") \n", 263 | " break\n", 264 | " \n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | " * Serving Flask app \"__main__\" (lazy loading)\n", 277 | " * Environment: production\n", 278 | " WARNING: Do not use the development server in a production environment.\n", 279 | " Use a production WSGI server instead.\n", 280 | " * Debug mode: off\n" 281 | ] 282 | }, 283 | { 284 | "name": "stderr", 285 | "output_type": "stream", 286 | "text": [ 287 | " * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "import flask\n", 293 | "import fakeredis\n", 294 | "import json\n", 295 | "\n", 296 | "server = fakeredis.FakeServer()\n", 297 | "redis = fakeredis.FakeStrictRedis(server=server)\n", 298 | "app = flask.Flask(__name__)\n", 299 | "\n", 300 | "# endpoint for profile updates\n", 301 | "@app.route(\"/update\", methods=[\"GET\",\"POST\"])\n", 302 | "def update():\n", 303 | " \n", 304 | " # get the player ID to update\n", 305 | " event = flask.request.json\n", 306 | " playerID = event.get('playerID')\n", 307 | " \n", 308 | " # CREATE: heck if a record exists \n", 309 | " record = redis.get(playerID)\n", 310 | " if record is None:\n", 311 | " profile = {\"goals\": 0, \"shots\": 0, \"assists\": 0, \"hits\": 0 }\n", 312 | " redis.set(playerID, json.dumps(profile))\n", 313 | "\n", 314 | " # READ: get the user summary\n", 315 | " record = redis.get(playerID)\n", 316 | " profile = json.loads(record)\n", 
317 | "\n", 318 | " # UPDATE: add the new attributes\n", 319 | " profile['goals'] += event['goals']\n", 320 | " profile['shots'] += event['shots']\n", 321 | " profile['assists'] += event['assists']\n", 322 | " profile['hits'] += event['hits']\n", 323 | " redis.set(playerID, json.dumps(profile))\n", 324 | " \n", 325 | " # return the updated profile\n", 326 | " return flask.jsonify(profile)\n", 327 | "\n", 328 | "# endpoint for model serving\n", 329 | "@app.route(\"/score\", methods=[\"GET\"])\n", 330 | "def score():\n", 331 | " result = {}\n", 332 | "\n", 333 | " try:\n", 334 | " # get the user profile \n", 335 | " playerID = flask.request.args['playerID']\n", 336 | " record = redis.get(playerID)\n", 337 | " profile = json.loads(record)\n", 338 | " \n", 339 | " # calculate a regression value\n", 340 | " score = 1 + profile['goals'] * 10.0 \\\n", 341 | " + profile['shots'] * 1.0 \\\n", 342 | " + profile['assists'] * 2.0 \\\n", 343 | " + profile['hits'] * 0.5\n", 344 | " \n", 345 | " result['score'] = score\n", 346 | " except:\n", 347 | " None\n", 348 | " \n", 349 | " return flask.jsonify(result)\n", 350 | "\n", 351 | "# start the flask app, allow remote connections\n", 352 | "if __name__ == '__main__':\n", 353 | " app.run(host='0.0.0.0')" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 9, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "{'playerID': 8467412, 'Game_ID': 2011030221, 'goals': 0, 'assists': 0, 'shots': 1, 'hits': 3}\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "\n", 371 | "for index, row in df.iterrows():\n", 372 | " event = { \"playerID\": int(row['player_id']), \"Game_ID\": int(row['game_id']),\n", 373 | " \"goals\": int(row['goals']), \"assists\": int(row['assists']), \n", 374 | " \"shots\": int(row['shots']), \"hits\": int(row['hits']) }\n", 375 | " print(event)\n", 376 | " \n", 377 | " requests.post(\"http://localhost:5000/update\", json = event) \n", 378 | " prediction = requests.get(\"http://localhost:5000/score?playerID=8467412\") \n", 379 | " print(prediction.json())\n", 380 | " break" 381 | ] 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.7.3" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 2 405 | } 406 | -------------------------------------------------------------------------------- /Stackdriver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Initial Endpoint" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "\n", 17 | "import pandas as pd\n", 18 | "from sklearn.linear_model import LogisticRegression\n", 19 | "import flask\n", 20 | "\n", 21 | "df = pd.read_csv(\"https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv\")\n", 22 | "model = LogisticRegression()\n", 23 | "model.fit(df.drop(['label'], axis=1), df['label'])\n", 24 | "\n", 25 | "app = flask.Flask(__name__)\n", 26 | "\n", 27 | "@app.route(\"/\", 
methods=[\"GET\",\"POST\"])\n", 28 | "def predict():\n", 29 | " data = {\"success\": False}\n", 30 | " \n", 31 | " params = flask.request.json\n", 32 | " if params is None:\n", 33 | " params = flask.request.args\n", 34 | "\n", 35 | " if \"G1\" in params.keys(): \n", 36 | " new_row = { \"G1\": params.get(\"G1\"), \"G2\": params.get(\"G2\"), \n", 37 | " \"G3\": params.get(\"G3\"), \"G4\": params.get(\"G4\"), \n", 38 | " \"G5\": params.get(\"G5\"), \"G6\": params.get(\"G6\"), \n", 39 | " \"G7\": params.get(\"G7\"), \"G8\": params.get(\"G8\"), \n", 40 | " \"G9\": params.get(\"G9\"), \"G10\": params.get(\"G10\") }\n", 41 | "\n", 42 | " new_x = pd.DataFrame.from_dict(new_row, orient = \"index\").transpose() \n", 43 | " data[\"response\"] = str(model.predict_proba(new_x)[0][1])\n", 44 | " data[\"success\"] = True\n", 45 | "\n", 46 | " return flask.jsonify(data)\n", 47 | "\n", 48 | "if __name__ == '__main__':\n", 49 | " app.run(host='0.0.0.0')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Production Endpoint" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from google.cloud import monitoring_v3\n", 66 | "from google.oauth2 import service_account\n", 67 | "from google.cloud import logging\n", 68 | "import socket \n", 69 | "import random\n", 70 | "import time\n", 71 | "import pandas as pd\n", 72 | "from sklearn.linear_model import LogisticRegression\n", 73 | "import flask\n", 74 | "from multiprocessing import Value\n", 75 | "import threading\n", 76 | "\n", 77 | "# create a unique host name for the pod \n", 78 | "host = socket.gethostbyname(socket.gethostname()) + \" - \" + str(random.randint(0, 1000000))\n", 79 | "\n", 80 | "# load GCP credentials and set up the Stackdriver monitor connection\n", 81 | "credentials = service_account.Credentials.from_service_account_file('serving.json')\n", 82 | "client = monitoring_v3.MetricServiceClient(credentials = credentials)\n", 83 | "project_name = client.project_path('serving-268422')\n", 84 | "\n", 85 | "# set up the Stackdriver logging connection\n", 86 | "logging_client = logging.Client(project = 'serving-268422', credentials = credentials)\n", 87 | "logger = logging_client.logger('model_service')\n", 88 | "logger.log_text(\"(\" + host + \") Launching model service\")\n", 89 | "\n", 90 | "# train a scikit-learn model \n", 91 | "df = pd.read_csv(\"https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv\")\n", 92 | "model = LogisticRegression()\n", 93 | "model.fit(df.drop(['label'], axis=1), df['label'])\n", 94 | "\n", 95 | "# set up the app and a request tracker \n", 96 | "counter = Value('i', 0)\n", 97 | "app = flask.Flask(__name__)\n", 98 | "\n", 99 | "# define a function for writing metrics to Stackdriver \n", 100 | "def write_metric_value(value):\n", 101 | " series = monitoring_v3.types.TimeSeries()\n", 102 | " series.metric.type = 'custom.googleapis.com/serving/requests'\n", 103 | " series.metric.labels['ip'] = host\n", 104 | " point = series.points.add()\n", 105 | " point.value.double_value = value\n", 106 | " now = time.time()\n", 107 | " point.interval.end_time.seconds = int(now)\n", 108 | " client.create_time_series(project_name, [series])\n", 109 | "\n", 110 | "# set up a callback for recording requests per minute to Stackdriver \n", 111 | "def log_requests():\n", 112 | " threading.Timer(60.0, log_requests).start() \n", 113 | "\n", 114 | " requests = 0\n", 115 | " with 
counter.get_lock():\n", 116 | " requests = counter.value\n", 117 | " counter.value = 0 \n", 118 | " \n", 119 | " print(\"writing value: \" + str(requests))\n", 120 | " write_metric_value(requests)\n", 121 | "\n", 122 | "# initiate the request per minute tracking \n", 123 | "log_requests()\n", 124 | " \n", 125 | "# define the model endpoint \n", 126 | "@app.route(\"/\", methods=[\"GET\",\"POST\"])\n", 127 | "def predict():\n", 128 | " try :\n", 129 | " \n", 130 | " # update the number of requests \n", 131 | " with counter.get_lock():\n", 132 | " counter.value += 1 \n", 133 | " \n", 134 | " data = {\"success\": False}\n", 135 | "\n", 136 | " # check for passed in parameters \n", 137 | " params = flask.request.json\n", 138 | " if params is None:\n", 139 | " params = flask.request.args\n", 140 | " \n", 141 | " # get a model prediction \n", 142 | " if \"G1\" in params.keys(): \n", 143 | " new_row = { \"G1\": params.get(\"G1\"), \"G2\": params.get(\"G2\"), \n", 144 | " \"G3\": params.get(\"G3\"), \"G4\": params.get(\"G4\"), \n", 145 | " \"G5\": params.get(\"G5\"), \"G6\": params.get(\"G6\"), \n", 146 | " \"G7\": params.get(\"G7\"), \"G8\": params.get(\"G8\"), \n", 147 | " \"G9\": params.get(\"G9\"), \"G10\": params.get(\"G10\") }\n", 148 | "\n", 149 | " new_x = pd.DataFrame.from_dict(new_row, orient = \"index\").transpose() \n", 150 | " data[\"response\"] = str(model.predict_proba(new_x)[0][1])\n", 151 | " data[\"success\"] = True\n", 152 | " \n", 153 | " return flask.jsonify(data)\n", 154 | " except:\n", 155 | " \n", 156 | " # log any invalid requests \n", 157 | " logger.log_text(\"(\" + host + \") Error servicing request: \" + str(flask.request) + \" \" + str(params))\n", 158 | " flask.abort(400)\n", 159 | "\n", 160 | "# let gunicorn manage the ports to use \n", 161 | "if __name__ == '__main__':\n", 162 | " app.run(host='0.0.0.0')\n", 163 | "\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "# Dockerfile" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "FROM ubuntu:latest\n", 180 | "MAINTAINER Ben Weber \n", 181 | "\n", 182 | "RUN apt-get update \\ \n", 183 | " && apt-get install -y python3-pip python3-dev \\ \n", 184 | " && cd /usr/local/bin \\ \n", 185 | " && ln -s /usr/bin/python3 python \n", 186 | "\n", 187 | "RUN pip3 install flask \n", 188 | "RUN pip3 install pandas \n", 189 | "RUN pip3 install gunicorn \n", 190 | "RUN pip3 install scikit-learn \n", 191 | "RUN pip3 install google-cloud-logging \n", 192 | "RUN pip3 install google-cloud-monitoring \n", 193 | " \n", 194 | "COPY serving.json serving.json\n", 195 | "COPY app.py app.py\n", 196 | "\n", 197 | "ENTRYPOINT [\"gunicorn\", \"--bind\", \"0.0.0.0\", \"app:app\"]" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "# Stackdriver Monitoring" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "\n", 214 | "# connect to the monitoring service \n", 215 | "from google.cloud import monitoring_v3\n", 216 | "from google.oauth2 import service_account\n", 217 | "import time\n", 218 | "\n", 219 | "credentials = service_account.Credentials.from_service_account_file('serving.json')\n", 220 | "client = monitoring_v3.MetricServiceClient(credentials = credentials)\n", 221 | "project_name = client.project_path('serving-268422')\n" 222 | ] 223 | }, 224 | 
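{ "cell_type": "markdown", "metadata": {}, "source": [ "Writing a custom metric takes two steps: define and register a metric descriptor once, then append timestamped points against its type. The next cells sketch both steps, reusing the client and project_name created above." ] }, 224 | 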
{ 225 | "cell_type": "code", 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "\n", 231 | "# create a custom metric\n", 232 | "descriptor = monitoring_v3.types.MetricDescriptor()\n", 233 | "descriptor.type = 'custom.googleapis.com/serving/requests' \n", 234 | "descriptor.metric_kind = (monitoring_v3.enums.MetricDescriptor.MetricKind.GAUGE)\n", 235 | "descriptor.value_type = (monitoring_v3.enums.MetricDescriptor.ValueType.DOUBLE)\n", 236 | "descriptor.description = 'Model serving requests.'\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "\n", 246 | "# record a data point to the custom metric \n", 247 | "series = monitoring_v3.types.TimeSeries()\n", 248 | "series.metric.type = 'custom.googleapis.com/serving/requests'\n", 249 | "series.metric.labels['ip'] = \"1.2.3.4\"\n", 250 | "point = series.points.add()\n", 251 | "point.value.double_value = 50\n", 252 | "now = time.time()\n", 253 | "point.interval.end_time.seconds = int(now)\n", 254 | "client.create_time_series(project_name, [series])\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "# Stackdriver Logging" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "\n", 271 | "# connect to the monitoring service \n", 272 | "from google.cloud import logging\n", 273 | "\n", 274 | "logging_client = logging.Client(project = 'serving-268422', credentials = credentials)\n", 275 | "logger = logging_client.logger('model_service')\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "\n", 285 | "# log a message to stack driver \n", 286 | "logger.log_text('Hello World!')\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "# Endpoint Testing" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 11, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "\n", 306 | "{'response': '0.06730006696024816', 'success': True}\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "import requests\n", 312 | "\n", 313 | "result = requests.post(\"http://35.226.218.118/\", json = { 'G1':'1', 'G2':'0', 'G3':'0', 'G4':'0', 'G5':'0', \\\n", 314 | " 'G6':'0', 'G7':'0', 'G8':'0', 'G9':'0', 'G10':'0'}) \n", 315 | "print(result)\n", 316 | "print(result.json())\n" 317 | ] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.8.1" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 4 341 | } 342 | -------------------------------------------------------------------------------- /append.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import argparse 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | from apache_beam.io import ReadFromText 5 | from 
apache_beam.io import WriteToText 6 | 7 | # define a function for transforming the data 8 | class AppendDoFn(beam.DoFn): 9 | def process(self, element): 10 | return "Hello World! " + element 11 | 12 | # set up pipeline parameters 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--input', dest='input', 15 | default='gs://dataflow-samples/shakespeare/kinglear.txt') 16 | parser.add_argument('--output', dest='output', 17 | default='gs://dsp_model_store/shakespeare/kinglear3.txt') 18 | known_args, pipeline_args = parser.parse_known_args(None) 19 | pipeline_options = PipelineOptions(pipeline_args) 20 | 21 | # define the pipeline steps 22 | p = beam.Pipeline(options=pipeline_options) 23 | lines = p | 'read' >> ReadFromText(known_args.input) 24 | appended = lines | 'append' >> beam.ParDo(AppendDoFn()) 25 | appended | 'write' >> WriteToText(known_args.output) 26 | 27 | # run the pipeline 28 | result = p.run() 29 | result.wait_until_finish() 30 | 31 | -------------------------------------------------------------------------------- /apply.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import argparse 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | from apache_beam.options.pipeline_options import SetupOptions 5 | from apache_beam.io.gcp.bigquery import parse_table_schema_from_json 6 | import json 7 | 8 | query = """ 9 | SELECT year, plurality, apgar_5min, 10 | mother_age, father_age, 11 | gestation_weeks, ever_born 12 | ,case when mother_married = true 13 | then 1 else 0 end as mother_married 14 | ,weight_pounds as weight 15 | ,current_timestamp as time 16 | ,GENERATE_UUID() as guid 17 | FROM `bigquery-public-data.samples.natality` 18 | limit 100 19 | """ 20 | 21 | class ApplyDoFn(beam.DoFn): 22 | 23 | def __init__(self): 24 | self._model = None 25 | from google.cloud import storage 26 | import pandas as pd 27 | import pickle as pkl 28 | self._storage = storage 29 | self._pkl = pkl 30 | self._pd = pd 31 | 32 | def process(self, element): 33 | if self._model is None: 34 | bucket = self._storage.Client().get_bucket('dsp_model_store') 35 | blob = bucket.get_blob('natality/sklearn-linear') 36 | self._model = self._pkl.loads(blob.download_as_string()) 37 | 38 | new_x = self._pd.DataFrame.from_dict(element, orient = "index").transpose().fillna(0) 39 | weight = self._model.predict(new_x.iloc[:,1:8])[0] 40 | return [ { 'guid': element['guid'], 'weight': weight, 'time': str(element['time']) } ] 41 | 42 | schema = parse_table_schema_from_json(json.dumps({'fields': 43 | [ { 'name': 'guid', 'type': 'STRING'}, 44 | { 'name': 'weight', 'type': 'FLOAT64'}, 45 | { 'name': 'time', 'type': 'STRING'} ]})) 46 | 47 | class PublishDoFn(beam.DoFn): 48 | 49 | def __init__(self): 50 | from google.cloud import datastore 51 | self._ds = datastore 52 | 53 | def process(self, element): 54 | client = self._ds.Client() 55 | key = client.key('natality-guid', element['guid']) 56 | entity = self._ds.Entity(key) 57 | entity['weight'] = element['weight'] 58 | entity['time'] = element['time'] 59 | client.put(entity) 60 | 61 | parser = argparse.ArgumentParser() 62 | known_args, pipeline_args = parser.parse_known_args(None) 63 | pipeline_options = PipelineOptions(pipeline_args) 64 | 65 | # define the pipeline steps 66 | p = beam.Pipeline(options=pipeline_options) 67 | data = p | 'Read from BigQuery' >> beam.io.Read( 68 | beam.io.BigQuerySource(query=query, use_standard_sql=True)) 69 | scored = data | 'Apply Model' >> 
beam.ParDo(ApplyDoFn()) 70 | scored | 'Save to BigQuery' >> beam.io.Write(beam.io.BigQuerySink( 71 | 'weight_preds', 'dsp_demo', schema = schema, 72 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 73 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) 74 | 75 | scored | 'Create entities' >> beam.ParDo(PublishDoFn()) 76 | 77 | # run the pipeline 78 | result = p.run() 79 | result.wait_until_finish() 80 | 81 | -------------------------------------------------------------------------------- /book_sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bgweber/DS_Production/3b909e326f18caddd6562d6a9d54c64e52b1fa48/book_sample.pdf -------------------------------------------------------------------------------- /dash_app.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_html_components as html 3 | import dash_core_components as dcc 4 | from dash.dependencies import Input, Output 5 | import pandas as pd 6 | import mlflow.sklearn 7 | 8 | app = dash.Dash(__name__) 9 | 10 | app.layout = html.Div(children=[ 11 | html.H1(children='Model UI'), 12 | html.P([ 13 | html.Label('Game 1 '), 14 | dcc.Input(value='1', type='text', id='g1'), 15 | ]), 16 | html.Div([ 17 | html.Label('Game 2 '), 18 | dcc.Input(value='0', type='text', id='g2'), 19 | ]), 20 | html.P([ 21 | html.Label('Prediction '), 22 | dcc.Input(value='0', type='text', id='pred') 23 | ]), 24 | ]) 25 | 26 | model_path = "models/logit_games_v1" 27 | model = mlflow.sklearn.load_model(model_path) 28 | 29 | @app.callback( 30 | Output(component_id='pred', component_property='value'), 31 | [Input(component_id='g1', component_property='value'), 32 | Input(component_id='g2', component_property='value')] 33 | ) 34 | def update_prediction(game1, game2): 35 | 36 | new_row = { "G1": float(game1), "G2": float(game2), 37 | "G3": 0, "G4": 0, 38 | "G5": 0, "G6": 0, 39 | "G7": 0, "G8": 0, 40 | "G9": 0, "G10":0 } 41 | 42 | new_x = pd.DataFrame.from_dict(new_row, orient = "index").transpose() 43 | return str(model.predict_proba(new_x)[0][1]) 44 | 45 | if __name__ == '__main__': 46 | app.run_server(host='0.0.0.0') 47 | -------------------------------------------------------------------------------- /dataflow_read.py: -------------------------------------------------------------------------------- 1 | 2 | import apache_beam as beam 3 | import argparse 4 | from apache_beam.options.pipeline_options import PipelineOptions 5 | 6 | parser = argparse.ArgumentParser() 7 | known_args, pipeline_args = parser.parse_known_args(None) 8 | pipeline_options = PipelineOptions(pipeline_args) 9 | 10 | class ApplyDoFn(beam.DoFn): 11 | def process(self, element): 12 | print(element) 13 | 14 | 15 | query = """ 16 | select * 17 | from `bigquery-public-data.samples.natality` 18 | order by rand() 19 | limit 100 20 | """ 21 | 22 | # define the pipeline steps 23 | p = beam.Pipeline(options=pipeline_options) 24 | data = p | 'Read from BigQuery' >> beam.io.Read( 25 | beam.io.BigQuerySource(query=query, use_standard_sql=True)) 26 | scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn()) 27 | 28 | # run the pipeline 29 | result = p.run() 30 | result.wait_until_finish() 31 | 32 | -------------------------------------------------------------------------------- /echo.py: -------------------------------------------------------------------------------- 1 | # load Flask 2 | import flask 3 | app = flask.Flask(__name__) 4 | 5 | # define a 
--------------------------------------------------------------------------------
/book_sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bgweber/DS_Production/3b909e326f18caddd6562d6a9d54c64e52b1fa48/book_sample.pdf
--------------------------------------------------------------------------------
/dash_app.py:
--------------------------------------------------------------------------------
import dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Input, Output
import pandas as pd
import mlflow.sklearn

app = dash.Dash(__name__)

# simple UI: two input fields and a read-only prediction field
app.layout = html.Div(children=[
    html.H1(children='Model UI'),
    html.P([
        html.Label('Game 1 '),
        dcc.Input(value='1', type='text', id='g1'),
    ]),
    html.P([
        html.Label('Game 2 '),
        dcc.Input(value='0', type='text', id='g2'),
    ]),
    html.P([
        html.Label('Prediction '),
        dcc.Input(value='0', type='text', id='pred')
    ]),
])

model_path = "models/logit_games_v1"
model = mlflow.sklearn.load_model(model_path)

# update the prediction field whenever either input changes
@app.callback(
    Output(component_id='pred', component_property='value'),
    [Input(component_id='g1', component_property='value'),
     Input(component_id='g2', component_property='value')]
)
def update_prediction(game1, game2):
    new_row = {"G1": float(game1), "G2": float(game2),
               "G3": 0, "G4": 0, "G5": 0, "G6": 0,
               "G7": 0, "G8": 0, "G9": 0, "G10": 0}
    new_x = pd.DataFrame.from_dict(new_row, orient="index").transpose()
    return str(model.predict_proba(new_x)[0][1])

if __name__ == '__main__':
    app.run_server(host='0.0.0.0')
--------------------------------------------------------------------------------
/dataflow_read.py:
--------------------------------------------------------------------------------
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# print each record for debugging; no output PCollection is produced
class ApplyDoFn(beam.DoFn):
    def process(self, element):
        print(element)

# sample 100 random rows from the natality data set
query = """
    select *
    from `bigquery-public-data.samples.natality`
    order by rand()
    limit 100
"""

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from BigQuery' >> beam.io.Read(
    beam.io.BigQuerySource(query=query, use_standard_sql=True))
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())

# run the pipeline; reading from BigQuery requires the --project flag
result = p.run()
result.wait_until_finish()
--------------------------------------------------------------------------------
/echo.py:
--------------------------------------------------------------------------------
# load Flask
import flask
app = flask.Flask(__name__)

# define a predict function as an endpoint
@app.route("/", methods=["GET", "POST"])
def predict():
    data = {"success": False}

    # check for passed in parameters
    params = flask.request.json
    if params is None:
        params = flask.request.args

    # if parameters are found, echo the msg parameter
    if "msg" in params.keys():
        data["response"] = params.get("msg")
        data["success"] = True

    # return a response in json format
    return flask.jsonify(data)

# start the flask app, allow remote connections
if __name__ == '__main__':
    app.run(host='0.0.0.0')
--------------------------------------------------------------------------------
/keras_games.py:
--------------------------------------------------------------------------------
import pandas as pd
import mlflow
import mlflow.keras
import flask
import tensorflow as tf
import keras as k

# custom AUC metric needed to deserialize the saved model (TF 1.x style)
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    k.backend.get_session().run(tf.local_variables_initializer())
    return auc

# capture the default graph so the model can be used across Flask threads
global graph
graph = tf.get_default_graph()
model_path = "models/keras_games_v1"
model = mlflow.keras.load_model(model_path, custom_objects={'auc': auc})

app = flask.Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def predict():
    data = {"success": False}
    params = flask.request.args

    if "G1" in params.keys():
        new_row = {"G1": params.get("G1"), "G2": params.get("G2"),
                   "G3": params.get("G3"), "G4": params.get("G4"),
                   "G5": params.get("G5"), "G6": params.get("G6"),
                   "G7": params.get("G7"), "G8": params.get("G8"),
                   "G9": params.get("G9"), "G10": params.get("G10")}

        new_x = pd.DataFrame.from_dict(new_row, orient="index").transpose()

        with graph.as_default():
            data["response"] = str(model.predict(new_x)[0][0])
            data["success"] = True

    return flask.jsonify(data)

if __name__ == '__main__':
    app.run(host='0.0.0.0')
--------------------------------------------------------------------------------
/logit.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle

model = pickle.load(open("logit.pkl", 'rb'))

# load Flask
import flask
app = flask.Flask(__name__)

# define a predict function as an endpoint
@app.route("/", methods=["GET"])
def predict():
    data = {"success": False}

    # get the request parameters
    params = flask.request.args

    # if parameters are found, score the model
    if params is not None:
        new_row = {"G1": params.get("G1"), "G2": params.get("G2"),
                   "G3": params.get("G3"), "G4": params.get("G4"),
                   "G5": params.get("G5"), "G6": params.get("G6"),
                   "G7": params.get("G7"), "G8": params.get("G8"),
                   "G9": params.get("G9"), "G10": params.get("G10")}

        new_x = pd.DataFrame.from_dict(new_row, orient="index").transpose()
        data["response"] = str(model.predict_proba(new_x)[0][1])
        data["success"] = True

    # return a response in json format
    return flask.jsonify(data)

# start the flask app, allow remote connections
app.run(host='0.0.0.0')
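The Flask apps above (echo.py, keras_games.py, logit.py) all read query-string parameters. A hypothetical client-side check, not part of the repo, assuming one of the model servers is running locally on Flask's default port 5000:

import requests

# send ten game features as query parameters and print the model's score
params = {"G" + str(i): "1" for i in range(1, 11)}
result = requests.get("http://localhost:5000/", params=params)
print(result.json())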
--------------------------------------------------------------------------------
/natality.py:
--------------------------------------------------------------------------------
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
import json

class ApplyDoFn(beam.DoFn):

    def __init__(self):
        self._model = None
        from google.cloud import storage
        import pandas as pd
        import pickle as pkl
        import json as js
        self._storage = storage
        self._pkl = pkl
        self._pd = pd
        self._json = js

    def process(self, element):
        # lazily load the model from Cloud Storage on the first message
        if self._model is None:
            bucket = self._storage.Client().get_bucket('dsp_model_store')
            blob = bucket.get_blob('natality/sklearn-linear')
            self._model = self._pkl.loads(blob.download_as_string())

        # PubSub delivers bytes; decode and parse the JSON payload
        element = self._json.loads(element.decode('utf-8'))
        new_x = self._pd.DataFrame.from_dict(element,
                        orient="index").transpose().fillna(0)
        weight = self._model.predict(new_x.iloc[:, 1:8])[0]
        print(str(weight))
        return [{'guid': element['guid'], 'weight': weight,
                 'time': str(element['time'])}]

class PublishDoFn(beam.DoFn):

    def __init__(self):
        from google.cloud import datastore
        self._ds = datastore

    def process(self, element):
        client = self._ds.Client()
        key = client.key('natality-guid', element['guid'])
        entity = self._ds.Entity(key)
        entity['weight'] = element['weight']
        entity['time'] = element['time']
        print("publish")
        print(entity)
        client.put(entity)

# set up pipeline parameters
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# define the topics
topic = "projects/{project}/topics/{topic}"
topic = topic.format(project="gameanalytics-199018", topic="natality")

# output schema (retained from the batch version; unused in this script)
schema = parse_table_schema_from_json(json.dumps({'fields':
            [{'name': 'guid', 'type': 'STRING'},
             {'name': 'weight', 'type': 'FLOAT64'},
             {'name': 'time', 'type': 'STRING'}]}))

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
lines = p | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)
scored = lines | 'apply' >> beam.ParDo(ApplyDoFn())
scored | 'Create entities' >> beam.ParDo(PublishDoFn())

# run the pipeline
result = p.run()
result.wait_until_finish()
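To drive the streaming pipeline above, a message with the fields ApplyDoFn expects can be published to the natality topic. A minimal sketch using the google-cloud-pubsub client; the snippet is illustrative, not part of the repo, and the record values are made up:

import json
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path("gameanalytics-199018", "natality")

# fabricate a record with the fields ApplyDoFn expects
record = {"guid": "test-guid", "year": 2001, "plurality": 1,
          "apgar_5min": 9, "mother_age": 33, "father_age": 40,
          "gestation_weeks": 38, "ever_born": 1, "mother_married": 1,
          "weight": 7.5, "time": "2020-01-01 00:00:00"}

# publish the JSON payload as bytes and block until it is accepted
future = publisher.publish(topic_path, json.dumps(record).encode('utf-8'))
print(future.result())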
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.linear_model import LogisticRegression
import mlflow
import mlflow.sklearn

# load the model from the MLflow model store
model_path = "models/logit_games_v1"
model = mlflow.sklearn.load_model(model_path)

import flask
app = flask.Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def predict():
    data = {"success": False}
    params = flask.request.args

    if "G1" in params.keys():
        new_row = {"G1": params.get("G1"), "G2": params.get("G2"),
                   "G3": params.get("G3"), "G4": params.get("G4"),
                   "G5": params.get("G5"), "G6": params.get("G6"),
                   "G7": params.get("G7"), "G8": params.get("G8"),
                   "G9": params.get("G9"), "G10": params.get("G10")}

        new_x = pd.DataFrame.from_dict(new_row, orient="index").transpose()
        data["response"] = str(model.predict_proba(new_x)[0][1])
        data["success"] = True

    return flask.jsonify(data)

if __name__ == '__main__':
    app.run(host='0.0.0.0')
--------------------------------------------------------------------------------
/stream.py:
--------------------------------------------------------------------------------
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions

# define a DoFn for transforming the data
class AppendDoFn(beam.DoFn):
    def process(self, element):
        # PubSub delivers bytes; decode before concatenating
        print("Hello World! - " + element.decode('utf-8'))

# set up pipeline parameters
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# define the topics
topic = "projects/{project}/topics/{topic}"
topic = topic.format(project="gameanalytics-199018", topic="natality")

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
lines = p | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)
appended = lines | 'append' >> beam.ParDo(AppendDoFn())

# run the pipeline
result = p.run()
result.wait_until_finish()
--------------------------------------------------------------------------------
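Once the natality.py pipeline has scored a message, the prediction can be read back from Datastore. A hypothetical verification snippet, not part of the repo, assuming the made-up 'test-guid' key published above:

from google.cloud import datastore

# look up the entity written by PublishDoFn, keyed by guid
client = datastore.Client()
key = client.key('natality-guid', 'test-guid')
entity = client.get(key)
if entity is not None:
    print(entity['weight'], entity['time'])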