├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── dask-hdfs-parquet.ipynb ├── elasticsearch-geo-aggregations.ipynb ├── elasticsearch.ipynb ├── mongodb.ipynb ├── naturalearth.ipynb ├── parquet-example-iris.ipynb ├── parquet.ipynb ├── presto-geospatial.ipynb ├── presto-pyhive.ipynb ├── presto.ipynb ├── scripts ├── configuration │ ├── core-site.xml │ ├── hadoop-env.sh │ ├── hbase-env.sh │ ├── hbase-site.xml │ ├── hdfs-site.xml │ ├── hive-site.xml │ ├── mapred-site.xml │ ├── metastore-site.xml │ ├── presto │ │ ├── catalog │ │ │ └── hive.properties │ │ ├── config.properties │ │ ├── jvm.config │ │ └── node.properties │ ├── spark-defaults.conf │ ├── yarn-site.xml │ └── zoo.cfg ├── download_gdelt.sh ├── elasticsearch │ ├── logstash_ingest_gdelt.sh │ ├── logstash_ingest_gdelt_2019.conf │ ├── logstash_ingest_gdelt_2020.conf │ ├── logstash_ingest_ne_countries.conf │ └── logstash_ingest_ne_countries.sh ├── geomesa │ ├── geomesa_fs_ingest.sh │ ├── geomesa_fs_ingest_gdelt_custom.sh │ ├── geomesa_fs_ingest_ne_countries.sh │ ├── geomesa_gdelt_custom.conf │ ├── geomesa_hbase_ingest.sh │ ├── geomesa_hbase_ingest_gdelt_custom.sh │ ├── geomesa_hbase_ingest_ne_countries.sh │ ├── geomesa_ne_countries.conf │ ├── geomesa_parquet_example.conf │ ├── geomesa_spark_shell.sh │ └── install_geomesa_pyspark.sh ├── hive │ ├── calculate_table_statistics.sh │ ├── create_table_gdelt_csv.hql │ ├── create_table_gdelt_parquet.hql │ └── create_table_ne_parquet.hql ├── install_hadoop.sh ├── install_hive.sh ├── install_hive_metastore.sh ├── install_trino.sh ├── load_gdelt_to_hdfs.sh ├── load_ne_to_hdfs.sh ├── load_ne_to_postgres.sh ├── mongodb │ ├── gdelt_2020_create_geometry_field.js │ ├── mongodb_gdelt_fields.txt │ └── mongodb_import_csv.sh ├── postgres │ ├── add_geometry.sql │ ├── copy_to_postgres.sql │ ├── create_cstore_test.sql │ ├── create_table.sql │ └── spatial_join.sql ├── presto │ ├── create_tables.sh │ ├── presto_query_stats.sql │ ├── query │ │ ├── query_01.sql │ │ ├── query_02.sql │ │ ├── query_03.sql │ │ ├── query_04.sql │ │ ├── query_05.sql │ │ ├── query_06.sql │ │ ├── query_07.sql │ │ ├── query_08.sql │ │ ├── query_09.sql │ │ └── query_10.sql │ └── run_query.sh ├── python │ ├── gdelt_dask_filter_merge.py │ ├── gdelt_download.py │ ├── gdelt_filter_csv.py │ ├── gdelt_load_to_postgres.py │ ├── gdelt_merge_csv.py │ └── gdelt_merge_parquet.py └── spark │ ├── gdelt_event_count.scala │ └── gdelt_event_count_hive.scala ├── spark-geomesa-fs.ipynb ├── spark-geomesa-hbase.ipynb └── spark-parquet.ipynb /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: ["https://www.paypal.me/njanakiev"] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | log/ 2 | 
data/ 3 | dask-worker-space/ 4 | spark-warehouse/ 5 | processed_data/ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # pytype static type analyzer 141 | .pytype/ 142 | 143 | # Cython debug symbols 144 | cython_debug/ 145 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nikolai Janakiev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scalable-geospatial-data-science 2 | The big data landscape is vast and growing continuously, and it is increasingly confronted with the challenges of handling geospatial data. This project reviews and compares recent open-source developments in this space that enable working with geospatial data at scale. 3 | 4 | Scripts and notebooks for the presentation "Scalable Geospatial Data Science". Slides can be found here: https://njanakiev.github.io/slides/scalable-geospatial-data-science/ 5 | 6 | # License 7 | This project is licensed under the MIT license. See the [LICENSE](LICENSE) for details.
8 | -------------------------------------------------------------------------------- /parquet-example-iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Create Parquet with Iris Data Set" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 8, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import json\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "import dask\n", 30 | "import dask.dataframe as dd\n", 31 | "import pyarrow as pa\n", 32 | "import pyarrow.parquet as pq\n", 33 | "\n", 34 | "from IPython.display import JSON" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 9, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "HDFS_HOME = \"hdfs://node-master:54310/user/hadoop\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "pandas 1.0.5\n", 56 | "dask 2.30.0\n", 57 | "pyarrow 3.0.0\n", 58 | "numpy 1.18.5\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for module in [pd, dask, pa, np]:\n", 64 | " print(module.__name__, module.__version__)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 37, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Deleted iris_parquet/iris.parq\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "!hdfs dfs -rm -r iris_parquet/iris.parq" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 38, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "/home/hadoop/anaconda3/envs/sgds/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: pyarrow.hdfs.connect is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n", 94 | " \"\"\"Entry point for launching an IPython kernel.\n" 95 | ] 96 | }, 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "\n", 102 | "RangeIndex: 150 entries, 0 to 149\n", 103 | "Data columns (total 6 columns):\n", 104 | " # Column Non-Null Count Dtype \n", 105 | "--- ------ -------------- ----- \n", 106 | " 0 sepal_length 150 non-null float64\n", 107 | " 1 sepal_width 150 non-null float64\n", 108 | " 2 petal_length 150 non-null float64\n", 109 | " 3 petal_width 150 non-null float64\n", 110 | " 4 class 150 non-null object \n", 111 | " 5 date_test 150 non-null object \n", 112 | "dtypes: float64(4), object(2)\n", 113 | "memory usage: 21.0 KB\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "hdfs = pa.hdfs.connect('node-master', port=54310)\n", 119 | "\n", 120 | "schema = pa.schema([\n", 121 | " pa.field('sepal_length', pa.float64()),\n", 122 | " pa.field('sepal_width', pa.float64()),\n", 123 | " pa.field('petal_length', pa.float64()),\n", 124 | " pa.field('petal_width', pa.float64()),\n", 125 | " pa.field('class', pa.string()),\n", 126 | " pa.field('date_test', pa.date32()),\n", 127 | "])\n", 128 | "\n", 129 | "columns = [\n", 130 | " 'sepal_length',\n", 131 | " 'sepal_width',\n", 132 | " 'petal_length',\n", 133 | " 'petal_width',\n", 134 | " 'class'\n", 135 | "]\n", 
136 | "\n", 137 | "df = pd.read_csv(\"data/iris.data\", names=columns)\n", 138 | "df['date_test'] = pd.date_range(\"2020-01-01\", periods=len(df))\n", 139 | "df['date_test'] = df['date_test'].dt.date\n", 140 | "\n", 141 | "df.info(memory_usage='deep')\n", 142 | "\n", 143 | "with hdfs.open(\"iris_parquet/iris.parq\", \"wb\") as f:\n", 144 | " df.to_parquet(f,\n", 145 | " engine='pyarrow',\n", 146 | " compression='snappy',\n", 147 | " #schema=schema,\n", 148 | " index=False)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "# Load to Hive" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 39, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "gdelt_parquet\n", 168 | "gdelt_parquet_2020\n", 169 | "iris_csv\n", 170 | "iris_parquet\n", 171 | "ne_10_states_provinces_parquet\n", 172 | "ne_110_countries_parquet\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "%%bash\n", 178 | "hive -e \"SHOW TABLES;\" 2> /dev/null" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 40, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%%bash\n", 188 | "hive -e 'set parquet.compression=SNAPPY;\n", 189 | "DROP TABLE IF EXISTS iris_parquet;\n", 190 | "CREATE EXTERNAL TABLE iris_parquet (\n", 191 | " sepal_length DOUBLE,\n", 192 | " sepal_width DOUBLE,\n", 193 | " petal_length DOUBLE,\n", 194 | " petal_width DOUBLE,\n", 195 | " class STRING,\n", 196 | " date_test DATE\n", 197 | ") \n", 198 | "STORED AS PARQUET\n", 199 | "LOCATION \"hdfs://node-master:54310/user/hadoop/iris_parquet\";' 2> /dev/null" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 41, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "sepal_length \tdouble \t \n", 212 | "sepal_width \tdouble \t \n", 213 | "petal_length \tdouble \t \n", 214 | "petal_width \tdouble \t \n", 215 | "class \tstring \t \n", 216 | "date_test \tdate \t \n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%%bash\n", 222 | "hive -e 'DESCRIBE iris_parquet;' 2> /dev/null" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 42, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "5.1\t3.5\t1.4\t0.2\tIris-setosa\t2020-01-01\n", 235 | "4.9\t3.0\t1.4\t0.2\tIris-setosa\t2020-01-02\n", 236 | "4.7\t3.2\t1.3\t0.2\tIris-setosa\t2020-01-03\n", 237 | "4.6\t3.1\t1.5\t0.2\tIris-setosa\t2020-01-04\n", 238 | "5.0\t3.6\t1.4\t0.2\tIris-setosa\t2020-01-05\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "%%bash\n", 244 | "hive -e 'SELECT * FROM iris_parquet LIMIT 5;' 2> /dev/null" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "# Dask Parquet\n", 252 | "\n", 253 | "- [Best Practices](https://docs.dask.org/en/latest/dataframe-best-practices.html)\n", 254 | "- [Remote Data](https://docs.dask.org/en/latest/remote-data-services.html)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 20, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "!hdfs dfs -rm -r iris_parquet/iris.parq" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 22, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "dst_filepath = HDFS_HOME + \"iris_parquet\"\n", 273 | "\n", 274 | "schema = 
pa.schema([\n", 275 | " pa.field('sepal_length', pa.float64()),\n", 276 | " pa.field('sepal_width', pa.float64()),\n", 277 | " pa.field('petal_length', pa.float64()),\n", 278 | " pa.field('petal_width', pa.float64()),\n", 279 | " pa.field('class', pa.string())\n", 280 | "])\n", 281 | "\n", 282 | "columns = [\n", 283 | " 'sepal_length',\n", 284 | " 'sepal_width',\n", 285 | " 'petal_length',\n", 286 | " 'petal_width',\n", 287 | " 'class' \n", 288 | "]\n", 289 | "\n", 290 | "ddf = dd.read_csv(\"data/iris.data\", names=columns)\n", 291 | "ddf.to_parquet(dst_filepath,\n", 292 | " engine='pyarrow',\n", 293 | " schema=schema,\n", 294 | " compression='snappy')" 295 | ] 296 | } 297 | ], 298 | "metadata": { 299 | "kernelspec": { 300 | "display_name": "sgds", 301 | "language": "python", 302 | "name": "sgds" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | "nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.7.7" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 4 319 | } 320 | -------------------------------------------------------------------------------- /parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Explore Parquet Files\n", 8 | "\n", 9 | "- 2017 - [Development update: High speed Apache Parquet in Python with Apache Arrow ](https://wesmckinney.com/blog/python-parquet-update/)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import os\n", 28 | "import json\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import fastparquet\n", 32 | "import pyarrow as pa\n", 33 | "import pyarrow.parquet as pq\n", 34 | "\n", 35 | "from IPython.display import JSON" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "pandas 1.0.5\n", 48 | "numpy 1.18.5\n", 49 | "pyarrow 3.0.0\n", 50 | "fastparquet 0.4.0\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "for module in [pd, np, pa, fastparquet]:\n", 56 | " print(module.__name__, module.__version__)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# PyArrow Parquet\n", 64 | "\n", 65 | "- [Reading and Writing the Apache Parquet Format](https://arrow.apache.org/docs/python/parquet.html)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "\n", 77 | " created_by: parquet-cpp version 1.5.1-SNAPSHOT\n", 78 | " num_columns: 13\n", 79 | " num_rows: 937936\n", 80 | " num_row_groups: 1\n", 81 | " format_version: 1.0\n", 82 | " serialized_size: 7166" 83 | ] 84 | }, 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "parquet_file = pq.ParquetFile(\n", 92 | " 'processed_data/gdelt_2020_500MB.snappy.parq/part.0.parquet')\n", 93 | "\n", 94 | "parquet_file.metadata" 95 | ] 96 | }, 97 | { 98 | "cell_type": 
"code", 99 | "execution_count": 8, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "\n", 106 | " created_by: parquet-cpp version 1.5.1-SNAPSHOT\n", 107 | " num_columns: 13\n", 108 | " num_rows: 43452639\n", 109 | " num_row_groups: 46\n", 110 | " format_version: 1.0\n", 111 | " serialized_size: 84707" 112 | ] 113 | }, 114 | "execution_count": 8, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "filepath = 'processed_data/gdelt_2020_500MB.snappy.parq'\n", 121 | "\n", 122 | "dataset = pq.ParquetDataset(filepath)\n", 123 | "dataset.metadata" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "\n", 135 | "required group field_id=0 schema {\n", 136 | " optional binary field_id=1 event_id (String);\n", 137 | " optional int32 field_id=2 date (Date);\n", 138 | " optional int32 field_id=3 event_date (Date);\n", 139 | " optional int64 field_id=4 event_code;\n", 140 | " optional int64 field_id=5 event_base_code;\n", 141 | " optional int64 field_id=6 event_root_code;\n", 142 | " optional double field_id=7 lat;\n", 143 | " optional double field_id=8 lon;\n", 144 | " optional int64 field_id=9 geo_type;\n", 145 | " optional binary field_id=10 country_code (String);\n", 146 | " optional binary field_id=11 adm1_code (String);\n", 147 | " optional binary field_id=12 source_url (String);\n", 148 | " optional binary field_id=13 netloc (String);\n", 149 | "}" 150 | ] 151 | }, 152 | "execution_count": 9, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "dataset.schema" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Fastparquet\n", 166 | "\n", 167 | "- https://fastparquet.readthedocs.io/en/latest/" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "filepath = 'processed_data/gdelt_500MB.snappy.parq/'\n", 177 | "\n", 178 | "pf = fastparquet.ParquetFile(filepath)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 11, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "{'name': 'processed_data/gdelt_500MB.snappy.parq/_metadata',\n", 190 | " 'columns': ['event_id',\n", 191 | " 'date',\n", 192 | " 'event_date',\n", 193 | " 'event_code',\n", 194 | " 'event_base_code',\n", 195 | " 'event_root_code',\n", 196 | " 'lat',\n", 197 | " 'lon',\n", 198 | " 'geo_type',\n", 199 | " 'country_code',\n", 200 | " 'adm1_code',\n", 201 | " 'source_url',\n", 202 | " 'netloc'],\n", 203 | " 'partitions': [],\n", 204 | " 'rows': 640389681}" 205 | ] 206 | }, 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "pf.info" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 12, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "OrderedDict([('event_id', dtype('O')),\n", 225 | " ('date', dtype('" 379 | ] 380 | }, 381 | "execution_count": 14, 382 | "metadata": { 383 | "application/json": { 384 | "expanded": false, 385 | "root": "root" 386 | } 387 | }, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "JSON(json.loads(pf.key_value_metadata['pandas']), expanded=False)" 393 | ] 394 | } 395 | 
], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "sgds", 399 | "language": "python", 400 | "name": "sgds" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.7.7" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 4 417 | } 418 | -------------------------------------------------------------------------------- /presto-pyhive.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Connecting to Presto and Hive with PyHive\n", 8 | "\n", 9 | "- Github - [dropbox/PyHive](https://github.com/dropbox/PyHive)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "\n", 29 | "from pyhive import hive, presto\n", 30 | "from sqlalchemy.engine import create_engine" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "with hive.Connection(host='localhost', port=10000, database='default') as connection:\n", 40 | " cursor = connection.cursor()\n", 41 | " cursor.execute(\"SHOW TABLES\")\n", 42 | " \n", 43 | " items = cursor.fetchall()\n", 44 | " columns = [v[0] for v in cursor.description]\n", 45 | " df = pd.DataFrame(items, columns=columns)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
tab_name
0gdelt_csv_2019
1gdelt_csv_2020
2gdelt_parquet_2020
3gdelt_parquet_inserted_2019
4gdelt_parquet_inserted_2020
5ne_10_states_provinces_parquet
6ne_110_countries_parquet
\n", 108 | "
" 109 | ], 110 | "text/plain": [ 111 | " tab_name\n", 112 | "0 gdelt_csv_2019\n", 113 | "1 gdelt_csv_2020\n", 114 | "2 gdelt_parquet_2020\n", 115 | "3 gdelt_parquet_inserted_2019\n", 116 | "4 gdelt_parquet_inserted_2020\n", 117 | "5 ne_10_states_provinces_parquet\n", 118 | "6 ne_110_countries_parquet" 119 | ] 120 | }, 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "df" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "\n", 140 | "RangeIndex: 10 entries, 0 to 9\n", 141 | "Data columns (total 9 columns):\n", 142 | " # Column Non-Null Count Dtype \n", 143 | "--- ------ -------------- ----- \n", 144 | " 0 gdelt_csv_2020.event_id 10 non-null int64 \n", 145 | " 1 gdelt_csv_2020.date 10 non-null object\n", 146 | " 2 gdelt_csv_2020.event_date 10 non-null object\n", 147 | " 3 gdelt_csv_2020.event_code 10 non-null int64 \n", 148 | " 4 gdelt_csv_2020.event_base_code 10 non-null int64 \n", 149 | " 5 gdelt_csv_2020.event_root_code 10 non-null int64 \n", 150 | " 6 gdelt_csv_2020.lat 10 non-null object\n", 151 | " 7 gdelt_csv_2020.lon 10 non-null object\n", 152 | " 8 gdelt_csv_2020.source_url 10 non-null object\n", 153 | "dtypes: int64(4), object(5)\n", 154 | "memory usage: 5.3 KB\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "with hive.connect(host='localhost', port=10000, database='default') as connection:\n", 160 | " cursor = connection.cursor()\n", 161 | " cursor.execute(\"SELECT * FROM gdelt_csv_2020 LIMIT 10\")\n", 162 | " \n", 163 | " items = cursor.fetchall()\n", 164 | " columns = [v[0] for v in cursor.description]\n", 165 | " df = pd.DataFrame(items, columns=columns)\n", 166 | "\n", 167 | "df.info(memory_usage='deep')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Using SQLAlchemy" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "\n", 187 | "RangeIndex: 10 entries, 0 to 9\n", 188 | "Data columns (total 9 columns):\n", 189 | " # Column Non-Null Count Dtype \n", 190 | "--- ------ -------------- ----- \n", 191 | " 0 event_id 10 non-null int64 \n", 192 | " 1 date 10 non-null object\n", 193 | " 2 event_date 10 non-null object\n", 194 | " 3 event_code 10 non-null int64 \n", 195 | " 4 event_base_code 10 non-null int64 \n", 196 | " 5 event_root_code 10 non-null int64 \n", 197 | " 6 lat 10 non-null object\n", 198 | " 7 lon 10 non-null object\n", 199 | " 8 source_url 10 non-null object\n", 200 | "dtypes: int64(4), object(5)\n", 201 | "memory usage: 4.7 KB\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "engine = create_engine('presto://localhost:8080/hive/default')\n", 207 | "\n", 208 | "df = pd.read_sql(\"SELECT * FROM gdelt_csv_2020 LIMIT 10\", engine)\n", 209 | "df.info(memory_usage='deep')" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 10, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "\n", 222 | "RangeIndex: 10 entries, 0 to 9\n", 223 | "Data columns (total 9 columns):\n", 224 | " # Column Non-Null Count Dtype \n", 225 | "--- ------ -------------- ----- \n", 226 | " 0 event_id 10 non-null int64 \n", 227 | " 1 date 10 non-null object 
\n", 228 | " 2 event_date 10 non-null object \n", 229 | " 3 event_code 10 non-null int64 \n", 230 | " 4 event_base_code 10 non-null int64 \n", 231 | " 5 event_root_code 10 non-null int64 \n", 232 | " 6 lat 10 non-null float64\n", 233 | " 7 lon 10 non-null float64\n", 234 | " 8 source_url 10 non-null object \n", 235 | "dtypes: float64(2), int64(4), object(3)\n", 236 | "memory usage: 3.3 KB\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "engine = create_engine('hive://localhost:10000/default')\n", 242 | "\n", 243 | "df = pd.read_sql(\"SELECT * FROM gdelt_csv_2020 LIMIT 10\", engine)\n", 244 | "df.info(memory_usage='deep')" 245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "sgds", 251 | "language": "python", 252 | "name": "sgds" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.7.7" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 4 269 | } 270 | -------------------------------------------------------------------------------- /scripts/configuration/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | hadoop.tmp.dir 22 | /tmp/hadoop 23 | 24 | 25 | fs.default.name 26 | hdfs://node-master:54310 27 | 28 | 29 | -------------------------------------------------------------------------------- /scripts/configuration/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # Set Hadoop-specific environment variables here. 19 | 20 | ## 21 | ## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS. 22 | ## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS. THEREFORE, 23 | ## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE 24 | ## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh. 25 | ## 26 | ## Precedence rules: 27 | ## 28 | ## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults 29 | ## 30 | ## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults 31 | ## 32 | 33 | # Many of the options here are built from the perspective that users 34 | # may want to provide OVERWRITING values on the command line. 35 | # For example: 36 | # 37 | # JAVA_HOME=/usr/java/testing hdfs dfs -ls 38 | # 39 | # Therefore, the vast majority (BUT NOT ALL!) of these defaults 40 | # are configured for substitution and not append. If append 41 | # is preferable, modify this file accordingly. 
42 | 43 | ### 44 | # Generic settings for HADOOP 45 | ### 46 | 47 | # Technically, the only required environment variable is JAVA_HOME. 48 | # All others are optional. However, the defaults are probably not 49 | # preferred. Many sites configure these options outside of Hadoop, 50 | # such as in /etc/profile.d 51 | 52 | # The java implementation to use. By default, this environment 53 | # variable is REQUIRED on ALL platforms except OS X! 54 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre 55 | 56 | # Location of Hadoop. By default, Hadoop will attempt to determine 57 | # this location based upon its execution path. 58 | # export HADOOP_HOME= 59 | 60 | # Location of Hadoop's configuration information. i.e., where this 61 | # file is living. If this is not defined, Hadoop will attempt to 62 | # locate it based upon its execution path. 63 | # 64 | # NOTE: It is recommend that this variable not be set here but in 65 | # /etc/profile.d or equivalent. Some options (such as 66 | # --config) may react strangely otherwise. 67 | # 68 | # export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 69 | 70 | # The maximum amount of heap to use (Java -Xmx). If no unit 71 | # is provided, it will be converted to MB. Daemons will 72 | # prefer any Xmx setting in their respective _OPT variable. 73 | # There is no default; the JVM will autoscale based upon machine 74 | # memory size. 75 | # export HADOOP_HEAPSIZE_MAX= 76 | 77 | # The minimum amount of heap to use (Java -Xms). If no unit 78 | # is provided, it will be converted to MB. Daemons will 79 | # prefer any Xms setting in their respective _OPT variable. 80 | # There is no default; the JVM will autoscale based upon machine 81 | # memory size. 82 | # export HADOOP_HEAPSIZE_MIN= 83 | 84 | # Enable extra debugging of Hadoop's JAAS binding, used to set up 85 | # Kerberos security. 86 | # export HADOOP_JAAS_DEBUG=true 87 | 88 | # Extra Java runtime options for all Hadoop commands. We don't support 89 | # IPv6 yet/still, so by default the preference is set to IPv4. 90 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" 91 | # For Kerberos debugging, an extended option set logs more information 92 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug" 93 | 94 | # Some parts of the shell code may do special things dependent upon 95 | # the operating system. We have to set this here. See the next 96 | # section as to why.... 97 | export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)} 98 | 99 | # Extra Java runtime options for some Hadoop commands 100 | # and clients (i.e., hdfs dfs -blah). These get appended to HADOOP_OPTS for 101 | # such commands. In most cases, # this should be left empty and 102 | # let users supply it on the command line. 103 | # export HADOOP_CLIENT_OPTS="" 104 | 105 | # 106 | # A note about classpaths. 107 | # 108 | # By default, Apache Hadoop overrides Java's CLASSPATH 109 | # environment variable. It is configured such 110 | # that it starts out blank with new entries added after passing 111 | # a series of checks (file/dir exists, not already listed aka 112 | # de-deduplication). During de-deduplication, wildcards and/or 113 | # directories are *NOT* expanded to keep it simple. Therefore, 114 | # if the computed classpath has two specific mentions of 115 | # awesome-methods-1.0.jar, only the first one added will be seen. 116 | # If two directories are in the classpath that both contain 117 | # awesome-methods-1.0.jar, then Java will pick up both versions. 
118 | 119 | # An additional, custom CLASSPATH. Site-wide configs should be 120 | # handled via the shellprofile functionality, utilizing the 121 | # hadoop_add_classpath function for greater control and much 122 | # harder for apps/end-users to accidentally override. 123 | # Similarly, end users should utilize ${HOME}/.hadooprc . 124 | # This variable should ideally only be used as a short-cut, 125 | # interactive way for temporary additions on the command line. 126 | # export HADOOP_CLASSPATH="/some/cool/path/on/your/machine" 127 | 128 | # Should HADOOP_CLASSPATH be first in the official CLASSPATH? 129 | # export HADOOP_USER_CLASSPATH_FIRST="yes" 130 | 131 | # If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along 132 | # with the main jar are handled by a separate isolated 133 | # client classloader when 'hadoop jar', 'yarn jar', or 'mapred job' 134 | # is utilized. If it is set, HADOOP_CLASSPATH and 135 | # HADOOP_USER_CLASSPATH_FIRST are ignored. 136 | # export HADOOP_USE_CLIENT_CLASSLOADER=true 137 | 138 | # HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of 139 | # system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER 140 | # is enabled. Names ending in '.' (period) are treated as package names, and 141 | # names starting with a '-' are treated as negative matches. For example, 142 | # export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop." 143 | 144 | # Enable optional, bundled Hadoop features 145 | # This is a comma delimited list. It may NOT be overridden via .hadooprc 146 | # Entries may be added/removed as needed. 147 | # export HADOOP_OPTIONAL_TOOLS="hadoop-aliyun,hadoop-openstack,hadoop-azure,hadoop-azure-datalake,hadoop-aws,hadoop-kafka" 148 | 149 | ### 150 | # Options for remote shell connectivity 151 | ### 152 | 153 | # There are some optional components of hadoop that allow for 154 | # command and control of remote hosts. For example, 155 | # start-dfs.sh will attempt to bring up all NNs, DNS, etc. 156 | 157 | # Options to pass to SSH when one of the "log into a host and 158 | # start/stop daemons" scripts is executed 159 | # export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s" 160 | 161 | # The built-in ssh handler will limit itself to 10 simultaneous connections. 162 | # For pdsh users, this sets the fanout size ( -f ) 163 | # Change this to increase/decrease as necessary. 164 | # export HADOOP_SSH_PARALLEL=10 165 | 166 | # Filename which contains all of the hosts for any remote execution 167 | # helper scripts # such as workers.sh, start-dfs.sh, etc. 168 | # export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers" 169 | 170 | ### 171 | # Options for all daemons 172 | ### 173 | # 174 | 175 | # 176 | # Many options may also be specified as Java properties. It is 177 | # very common, and in many cases, desirable, to hard-set these 178 | # in daemon _OPTS variables. Where applicable, the appropriate 179 | # Java property is also identified. Note that many are re-used 180 | # or set differently in certain contexts (e.g., secure vs 181 | # non-secure) 182 | # 183 | 184 | # Where (primarily) daemon log files are stored. 185 | # ${HADOOP_HOME}/logs by default. 186 | # Java property: hadoop.log.dir 187 | # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 188 | 189 | # A string representing this instance of hadoop. $USER by default. 190 | # This is used in writing log and pid files, so keep that in mind! 
191 | # Java property: hadoop.id.str 192 | # export HADOOP_IDENT_STRING=$USER 193 | 194 | # How many seconds to pause after stopping a daemon 195 | # export HADOOP_STOP_TIMEOUT=5 196 | 197 | # Where pid files are stored. /tmp by default. 198 | # export HADOOP_PID_DIR=/tmp 199 | 200 | # Default log4j setting for interactive commands 201 | # Java property: hadoop.root.logger 202 | # export HADOOP_ROOT_LOGGER=INFO,console 203 | 204 | # Default log4j setting for daemons spawned explicitly by 205 | # --daemon option of hadoop, hdfs, mapred and yarn command. 206 | # Java property: hadoop.root.logger 207 | # export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA 208 | 209 | # Default log level and output location for security-related messages. 210 | # You will almost certainly want to change this on a per-daemon basis via 211 | # the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the 212 | # defaults for the NN and 2NN override this by default.) 213 | # Java property: hadoop.security.logger 214 | # export HADOOP_SECURITY_LOGGER=INFO,NullAppender 215 | 216 | # Default process priority level 217 | # Note that sub-processes will also run at this level! 218 | # export HADOOP_NICENESS=0 219 | 220 | # Default name for the service level authorization file 221 | # Java property: hadoop.policy.file 222 | # export HADOOP_POLICYFILE="hadoop-policy.xml" 223 | 224 | # 225 | # NOTE: this is not used by default! <----- 226 | # You can define variables right here and then re-use them later on. 227 | # For example, it is common to use the same garbage collection settings 228 | # for all the daemons. So one could define: 229 | # 230 | # export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 231 | # 232 | # .. and then use it as per the b option under the namenode. 233 | 234 | ### 235 | # Secure/privileged execution 236 | ### 237 | 238 | # 239 | # Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons 240 | # on privileged ports. This functionality can be replaced by providing 241 | # custom functions. See hadoop-functions.sh for more information. 242 | # 243 | 244 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 245 | # that bind to privileged ports to provide authentication of data transfer 246 | # protocol. Jsvc is not required if SASL is configured for authentication of 247 | # data transfer protocol using non-privileged ports. 248 | # export JSVC_HOME=/usr/bin 249 | 250 | # 251 | # This directory contains pids for secure and privileged processes. 252 | #export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR} 253 | 254 | # 255 | # This directory contains the logs for secure and privileged processes. 256 | # Java property: hadoop.log.dir 257 | # export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR} 258 | 259 | # 260 | # When running a secure daemon, the default value of HADOOP_IDENT_STRING 261 | # ends up being a bit bogus. Therefore, by default, the code will 262 | # replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER. If one wants 263 | # to keep HADOOP_IDENT_STRING untouched, then uncomment this line. 264 | # export HADOOP_SECURE_IDENT_PRESERVE="true" 265 | 266 | ### 267 | # NameNode specific parameters 268 | ### 269 | 270 | # Default log level and output location for file system related change 271 | # messages. 
For non-namenode daemons, the Java property must be set in 272 | # the appropriate _OPTS if one wants something other than INFO,NullAppender 273 | # Java property: hdfs.audit.logger 274 | # export HDFS_AUDIT_LOGGER=INFO,NullAppender 275 | 276 | # Specify the JVM options to be used when starting the NameNode. 277 | # These options will be appended to the options specified as HADOOP_OPTS 278 | # and therefore may override any similar flags set in HADOOP_OPTS 279 | # 280 | # a) Set JMX options 281 | # export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026" 282 | # 283 | # b) Set garbage collection logs 284 | # export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 285 | # 286 | # c) ... or set them directly 287 | # export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 288 | 289 | # this is the default: 290 | # export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 291 | 292 | ### 293 | # SecondaryNameNode specific parameters 294 | ### 295 | # Specify the JVM options to be used when starting the SecondaryNameNode. 296 | # These options will be appended to the options specified as HADOOP_OPTS 297 | # and therefore may override any similar flags set in HADOOP_OPTS 298 | # 299 | # This is the default: 300 | # export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 301 | 302 | ### 303 | # DataNode specific parameters 304 | ### 305 | # Specify the JVM options to be used when starting the DataNode. 306 | # These options will be appended to the options specified as HADOOP_OPTS 307 | # and therefore may override any similar flags set in HADOOP_OPTS 308 | # 309 | # This is the default: 310 | # export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS" 311 | 312 | # On secure datanodes, user to run the datanode as after dropping privileges. 313 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 314 | # to provide authentication of data transfer protocol. This **MUST NOT** be 315 | # defined if SASL is configured for authentication of data transfer protocol 316 | # using non-privileged ports. 317 | # This will replace the hadoop.id.str Java property in secure mode. 318 | # export HDFS_DATANODE_SECURE_USER=hdfs 319 | 320 | # Supplemental options for secure datanodes 321 | # By default, Hadoop uses jsvc which needs to know to launch a 322 | # server jvm. 323 | # export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server" 324 | 325 | ### 326 | # NFS3 Gateway specific parameters 327 | ### 328 | # Specify the JVM options to be used when starting the NFS3 Gateway. 329 | # These options will be appended to the options specified as HADOOP_OPTS 330 | # and therefore may override any similar flags set in HADOOP_OPTS 331 | # 332 | # export HDFS_NFS3_OPTS="" 333 | 334 | # Specify the JVM options to be used when starting the Hadoop portmapper. 335 | # These options will be appended to the options specified as HADOOP_OPTS 336 | # and therefore may override any similar flags set in HADOOP_OPTS 337 | # 338 | # export HDFS_PORTMAP_OPTS="-Xmx512m" 339 | 340 | # Supplemental options for priviliged gateways 341 | # By default, Hadoop uses jsvc which needs to know to launch a 342 | # server jvm. 
343 | # export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server" 344 | 345 | # On privileged gateways, user to run the gateway as after dropping privileges 346 | # This will replace the hadoop.id.str Java property in secure mode. 347 | # export HDFS_NFS3_SECURE_USER=nfsserver 348 | 349 | ### 350 | # ZKFailoverController specific parameters 351 | ### 352 | # Specify the JVM options to be used when starting the ZKFailoverController. 353 | # These options will be appended to the options specified as HADOOP_OPTS 354 | # and therefore may override any similar flags set in HADOOP_OPTS 355 | # 356 | # export HDFS_ZKFC_OPTS="" 357 | 358 | ### 359 | # QuorumJournalNode specific parameters 360 | ### 361 | # Specify the JVM options to be used when starting the QuorumJournalNode. 362 | # These options will be appended to the options specified as HADOOP_OPTS 363 | # and therefore may override any similar flags set in HADOOP_OPTS 364 | # 365 | # export HDFS_JOURNALNODE_OPTS="" 366 | 367 | ### 368 | # HDFS Balancer specific parameters 369 | ### 370 | # Specify the JVM options to be used when starting the HDFS Balancer. 371 | # These options will be appended to the options specified as HADOOP_OPTS 372 | # and therefore may override any similar flags set in HADOOP_OPTS 373 | # 374 | # export HDFS_BALANCER_OPTS="" 375 | 376 | ### 377 | # HDFS Mover specific parameters 378 | ### 379 | # Specify the JVM options to be used when starting the HDFS Mover. 380 | # These options will be appended to the options specified as HADOOP_OPTS 381 | # and therefore may override any similar flags set in HADOOP_OPTS 382 | # 383 | # export HDFS_MOVER_OPTS="" 384 | 385 | ### 386 | # Router-based HDFS Federation specific parameters 387 | # Specify the JVM options to be used when starting the RBF Routers. 388 | # These options will be appended to the options specified as HADOOP_OPTS 389 | # and therefore may override any similar flags set in HADOOP_OPTS 390 | # 391 | # export HDFS_DFSROUTER_OPTS="" 392 | 393 | ### 394 | # HDFS StorageContainerManager specific parameters 395 | ### 396 | # Specify the JVM options to be used when starting the HDFS Storage Container Manager. 397 | # These options will be appended to the options specified as HADOOP_OPTS 398 | # and therefore may override any similar flags set in HADOOP_OPTS 399 | # 400 | # export HDFS_STORAGECONTAINERMANAGER_OPTS="" 401 | 402 | ### 403 | # Advanced Users Only! 404 | ### 405 | 406 | # 407 | # When building Hadoop, one can add the class paths to the commands 408 | # via this special env var: 409 | # export HADOOP_ENABLE_BUILD_PATHS="true" 410 | 411 | # 412 | # To prevent accidents, shell commands be (superficially) locked 413 | # to only allow certain users to execute certain subcommands. 414 | # It uses the format of (command)_(subcommand)_USER. 415 | # 416 | # For example, to limit who can execute the namenode command, 417 | # export HDFS_NAMENODE_USER=hdfs 418 | -------------------------------------------------------------------------------- /scripts/configuration/hbase-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | #/** 4 | # * Licensed to the Apache Software Foundation (ASF) under one 5 | # * or more contributor license agreements. See the NOTICE file 6 | # * distributed with this work for additional information 7 | # * regarding copyright ownership. 
The ASF licenses this file 8 | # * to you under the Apache License, Version 2.0 (the 9 | # * "License"); you may not use this file except in compliance 10 | # * with the License. You may obtain a copy of the License at 11 | # * 12 | # * http://www.apache.org/licenses/LICENSE-2.0 13 | # * 14 | # * Unless required by applicable law or agreed to in writing, software 15 | # * distributed under the License is distributed on an "AS IS" BASIS, 16 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # * See the License for the specific language governing permissions and 18 | # * limitations under the License. 19 | # */ 20 | 21 | # Set environment variables here. 22 | 23 | # This script sets variables multiple times over the course of starting an hbase process, 24 | # so try to keep things idempotent unless you want to take an even deeper look 25 | # into the startup scripts (bin/hbase, etc.) 26 | 27 | # The java implementation to use. Java 1.8+ required. 28 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 29 | 30 | # Extra Java CLASSPATH elements. Optional. 31 | # export HBASE_CLASSPATH= 32 | 33 | # The maximum amount of heap to use. Default is left to JVM default. 34 | # export HBASE_HEAPSIZE=1G 35 | 36 | # Uncomment below if you intend to use off heap cache. For example, to allocate 8G of 37 | # offheap, set the value to "8G". 38 | # export HBASE_OFFHEAPSIZE=1G 39 | 40 | # Extra Java runtime options. 41 | # Below are what we set by default. May only work with SUN JVM. 42 | # For more on why as well as other possible settings, 43 | # see http://hbase.apache.org/book.html#performance 44 | export HBASE_OPTS="$HBASE_OPTS -XX:+UseConcMarkSweepGC" 45 | 46 | # Uncomment one of the below three options to enable java garbage collection logging for the server-side processes. 47 | 48 | # This enables basic gc logging to the .out file. 49 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps" 50 | 51 | # This enables basic gc logging to its own file. 52 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 53 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:" 54 | 55 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+. 56 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 57 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M" 58 | 59 | # Uncomment one of the below three options to enable java garbage collection logging for the client processes. 60 | 61 | # This enables basic gc logging to the .out file. 62 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps" 63 | 64 | # This enables basic gc logging to its own file. 65 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 66 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:" 67 | 68 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+. 69 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 
70 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M" 71 | 72 | # See the package documentation for org.apache.hadoop.hbase.io.hfile for other configurations 73 | # needed setting up off-heap block caching. 74 | 75 | # Uncomment and adjust to enable JMX exporting 76 | # See jmxremote.password and jmxremote.access in $JRE_HOME/lib/management to configure remote password access. 77 | # More details at: http://java.sun.com/javase/6/docs/technotes/guides/management/agent.html 78 | # NOTE: HBase provides an alternative JMX implementation to fix the random ports issue, please see JMX 79 | # section in HBase Reference Guide for instructions. 80 | 81 | # export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 82 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10101" 83 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10102" 84 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10103" 85 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10104" 86 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10105" 87 | 88 | # File naming hosts on which HRegionServers will run. $HBASE_HOME/conf/regionservers by default. 89 | # export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers 90 | 91 | # Uncomment and adjust to keep all the Region Server pages mapped to be memory resident 92 | #HBASE_REGIONSERVER_MLOCK=true 93 | #HBASE_REGIONSERVER_UID="hbase" 94 | 95 | # File naming hosts on which backup HMaster will run. $HBASE_HOME/conf/backup-masters by default. 96 | # export HBASE_BACKUP_MASTERS=${HBASE_HOME}/conf/backup-masters 97 | 98 | # Extra ssh options. Empty by default. 99 | # export HBASE_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HBASE_CONF_DIR" 100 | 101 | # Where log files are stored. $HBASE_HOME/logs by default. 102 | # export HBASE_LOG_DIR=${HBASE_HOME}/logs 103 | 104 | # Enable remote JDWP debugging of major HBase processes. Meant for Core Developers 105 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8070" 106 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8071" 107 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8072" 108 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8073" 109 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8074" 110 | 111 | # A string representing this instance of hbase. $USER by default. 112 | # export HBASE_IDENT_STRING=$USER 113 | 114 | # The scheduling priority for daemon processes. See 'man nice'. 115 | # export HBASE_NICENESS=10 116 | 117 | # The directory where pid files are stored. /tmp by default. 118 | # export HBASE_PID_DIR=/var/hadoop/pids 119 | 120 | # Seconds to sleep between slave commands. Unset by default. This 121 | # can be useful in large clusters, where, e.g., slave rsyncs can 122 | # otherwise arrive faster than the master can service them. 
123 | # export HBASE_SLAVE_SLEEP=0.1 124 | 125 | # Tell HBase whether it should manage it's own instance of ZooKeeper or not. 126 | # export HBASE_MANAGES_ZK=true 127 | 128 | # The default log rolling policy is RFA, where the log file is rolled as per the size defined for the 129 | # RFA appender. Please refer to the log4j.properties file to see more details on this appender. 130 | # In case one needs to do log rolling on a date change, one should set the environment property 131 | # HBASE_ROOT_LOGGER to ",DRFA". 132 | # For example: 133 | # HBASE_ROOT_LOGGER=INFO,DRFA 134 | # The reason for changing default to RFA is to avoid the boundary case of filling out disk space as 135 | # DRFA doesn't put any cap on the log size. Please refer to HBase-5655 for more context. 136 | 137 | # Tell HBase whether it should include Hadoop's lib when start up, 138 | # the default value is false,means that includes Hadoop's lib. 139 | # export HBASE_DISABLE_HADOOP_CLASSPATH_LOOKUP="true" 140 | -------------------------------------------------------------------------------- /scripts/configuration/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 42 | 43 | hbase.cluster.distributed 44 | true 45 | 46 | 47 | hbase.tmp.dir 48 | ./tmp 49 | 50 | 51 | hbase.unsafe.stream.capability.enforce 52 | false 53 | 54 | 55 | hbase.rootdir 56 | 57 | hdfs://node-master:54310/hbase 58 | 59 | 60 | hbase.zookeeper.property.dataDir 61 | /usr/local/hbase-2.2.5/data/zookeeper 62 | 63 | 64 | hbase.zookeeper.quorum 65 | node-master:2181 66 | 67 | 68 | zookeeper.znode.parent 69 | /hbase-unsecure 70 | 71 | 75 | 76 | -------------------------------------------------------------------------------- /scripts/configuration/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.namenode.name.dir 22 | /usr/local/hdfs/namenode 23 | 24 | 25 | dfs.datanode.data.dir 26 | /usr/local/hdfs/datanode 27 | 28 | 29 | dfs.datanode.use.datanode.hostname 30 | true 31 | 32 | 33 | dfs.client.use.datanode.hostname 34 | true 35 | 36 | 37 | dfs.replication 38 | 1 39 | 40 | 41 | -------------------------------------------------------------------------------- /scripts/configuration/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | javax.jdo.option.ConnectionURL 6 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 7 | 8 | 9 | javax.jdo.option.ConnectionDriverName 10 | com.mysql.jdbc.Driver 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | hive 15 | 16 | 17 | javax.jdo.option.ConnectionPassword 18 | hive 19 | 20 | 21 | hive.metastore.schema.verification 22 | false 23 | 24 | 25 | -------------------------------------------------------------------------------- /scripts/configuration/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | yarn.app.mapreduce.am.env 26 | HADOOP_MAPRED_HOME=$HADOOP_HOME 27 | 28 | 29 | mapreduce.map.env 30 | HADOOP_MAPRED_HOME=$HADOOP_HOME 31 | 32 | 33 | mapreduce.reduce.env 34 | HADOOP_MAPRED_HOME=$HADOOP_HOME 35 | 36 | 37 | yarn.app.mapreduce.am.resource.mb 38 | 512 39 | 40 | 41 | mapreduce.map.memory.mb 42 | 256 43 | 44 | 45 | mapreduce.reduce.memory.mb 46 | 256 47 | 48 | 49 | -------------------------------------------------------------------------------- 
/scripts/configuration/metastore-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | javax.jdo.option.ConnectionURL 6 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 7 | 8 | 9 | javax.jdo.option.ConnectionDriverName 10 | com.mysql.jdbc.Driver 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | hive 15 | 16 | 17 | javax.jdo.option.ConnectionPassword 18 | hive 19 | 20 | 21 | hive.metastore.event.db.notification.api.auth 22 | false 23 | 24 | 25 | metastore.thrift.uris 26 | thrift://localhost:9083 27 | 28 | 29 | metastore.task.threads.always 30 | org.apache.hadoop.hive.metastore.events.EventCleanerTask 31 | 32 | 33 | metastore.expression.proxy 34 | org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy 35 | 36 | 37 | metastore.warehouse.dir 38 | /user/hive/warehouse 39 | 40 | 41 | 42 | fs.s3a.access.key 43 | ${S3_ACCESS_KEY} 44 | 45 | 46 | fs.s3a.secret.key 47 | ${S3_SECRET_KEY} 48 | 49 | 50 | fs.s3a.connection.ssl.enabled 51 | false 52 | 53 | 54 | fs.s3a.path.style.access 55 | true 56 | 57 | 58 | fs.s3a.endpoint 59 | ${S3_ENDPOINT} 60 | 61 | 62 | -------------------------------------------------------------------------------- /scripts/configuration/presto/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://node-master:9083 3 | -------------------------------------------------------------------------------- /scripts/configuration/presto/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=50GB 5 | query.max-memory-per-node=1GB 6 | query.max-total-memory-per-node=2GB 7 | discovery-server.enabled=true 8 | discovery.uri=http://node-master:8080 9 | -------------------------------------------------------------------------------- /scripts/configuration/presto/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx16G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:+UseGCOverheadLimit 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:+ExitOnOutOfMemoryError 9 | -Djdk.attach.allowAttachSelf=true 10 | -------------------------------------------------------------------------------- /scripts/configuration/presto/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=production 2 | node.id=68a12c8a-0b33-11eb-8a07-2151a366a228 3 | node.data-dir=/usr/local/presto/data 4 | -------------------------------------------------------------------------------- /scripts/configuration/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 5g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | spark.master yarn 30 | spark.driver.memory 512m 31 | spark.yarn.am.memory 512m 32 | spark.executor.memory 512m 33 | spark.eventLog.enabled true 34 | spark.eventLog.dir hdfs://node-master:54310/spark-logs 35 | 36 | spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider 37 | spark.history.fs.logDirectory hdfs://node-master:54310/spark-logs 38 | spark.history.fs.update.interval 10s 39 | spark.history.ui.port 18080 40 | 41 | spark.yarn.preserve.staging.files true 42 | spark.yarn.archive hdfs:///spark-libs/spark-libs.jar 43 | -------------------------------------------------------------------------------- /scripts/configuration/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | yarn.acl.enable 18 | 0 19 | 20 | 21 | yarn.resourcemanager.hostname 22 | node-master 23 | 24 | 25 | 26 | yarn.nodemanager.aux-services 27 | mapreduce_shuffle 28 | 29 | 30 | yarn.nodemanager.resource.memory-mb 31 | 4096 32 | 33 | 34 | yarn.scheduler.maximum-allocation-mb 35 | 4096 36 | 37 | 38 | yarn.scheduler.minimum-allocation-mb 39 | 128 40 | 41 | 42 | yarn.nodemanager.vmem-check-enabled 43 | false 44 | 45 | 46 | -------------------------------------------------------------------------------- /scripts/configuration/zoo.cfg: -------------------------------------------------------------------------------- 1 | tickTime=2000 2 | dataDir=/var/zookeeper 3 | clientPort=2181 4 | admin.serverPort=8081 5 | initLimit=5 6 | syncLimit=2 7 | server.1=node-master:2888:3888 8 | -------------------------------------------------------------------------------- /scripts/download_gdelt.sh: -------------------------------------------------------------------------------- 1 | base_url="http://data.gdeltproject.org/events" 2 | dst_folderpath="../data/raw" 3 | year="2020" 4 | 5 | 6 | if [[ "$1" = "download" ]]; then 7 | for month in {01..12}; do 8 | for day in {01..31}; do 9 | url="${base_url}/2020${month}${day}.export.CSV.zip" 10 | zip_filepath="${dst_folderpath}/${year}${month}${day}.export.CSV.zip" 11 | 12 | echo "Downloading ${url} ..." 13 | wget $url -P $dst_folderpath 14 | 15 | if [[ -f $zip_filepath ]]; then 16 | echo $zip_filepath 17 | unzip $zip_filepath -d $dst_folderpath 18 | fi 19 | done 20 | done 21 | fi 22 | 23 | if [[ "$1" = "lowercase" ]]; then 24 | for filepath in $dst_folderpath/*; do 25 | dst_filepath=$(echo $filepath | tr "[:upper:]" "[:lower:]") 26 | if [[ ! 
-f $dst_filepath ]]; then 27 | echo $filepath $dst_filepath 28 | mv $filepath $dst_filepath 29 | fi 30 | done 31 | fi 32 | -------------------------------------------------------------------------------- /scripts/elasticsearch/logstash_ingest_gdelt.sh: -------------------------------------------------------------------------------- 1 | INDEX=gdelt_custom_2020 2 | 3 | # Delete index 4 | curl \ 5 | -u $ES_USERNAME:$ES_PASSWORD \ 6 | -X DELETE "${ES_HOST}:${ES_PORT}/${INDEX}?pretty" 7 | 8 | # Create mapping for the data 9 | curl \ 10 | -u $ES_USERNAME:$ES_PASSWORD \ 11 | -X PUT "${ES_HOST}:${ES_PORT}/_template/${INDEX}?pretty" \ 12 | -H 'Content-Type: application/json' \ 13 | -d' 14 | { 15 | "order": 10, 16 | "index_patterns": [ 17 | "'"${INDEX}"'*" 18 | ], 19 | "settings": { 20 | "index": { 21 | "number_of_shards": "5", 22 | "number_of_replicas": "1" 23 | } 24 | }, 25 | "mappings": { 26 | "properties": { 27 | "location": { 28 | "type": "geo_point" 29 | } 30 | } 31 | }, 32 | "aliases": {} 33 | }' 34 | 35 | # Load data 36 | #cat /home/hadoop/sgds/processed_data/2020_test_noheader_100k.csv | \ 37 | # /usr/share/logstash/bin/logstash -f \ 38 | # /home/hadoop/sgds/scripts/logstash_ingest_gdelt.conf 39 | 40 | #cat /home/hadoop/sgds/processed_data/20200101_filtered_noheader.csv | \ 41 | # /usr/share/logstash/bin/logstash -f \ 42 | # /home/hadoop/sgds/scripts/elasticsearch/logstash_ingest_gdelt.conf 43 | 44 | # real 95m0.294s 45 | # user 77m19.002s 46 | # sys 2m8.739s 47 | cat /home/hadoop/sgds/processed_data/2020_filtered_noheader.csv | \ 48 | /usr/share/logstash/bin/logstash -f \ 49 | /home/hadoop/sgds/scripts/elasticsearch/logstash_ingest_gdelt_2020.conf 50 | -------------------------------------------------------------------------------- /scripts/elasticsearch/logstash_ingest_gdelt_2019.conf: -------------------------------------------------------------------------------- 1 | input { 2 | stdin{ } 3 | } 4 | 5 | # Does not stop running after finish 6 | #input { 7 | # file { 8 | # path => "/home/hadoop/sgds/processed_data/2019_filtered_noheader.csv" 9 | # start_position => "beginning" 10 | # type => "data" 11 | # } 12 | #} 13 | 14 | filter { 15 | csv { 16 | separator => "," 17 | columns => ["event_id", 18 | "date", 19 | "event_date", 20 | "event_code", 21 | "event_base_code", 22 | "event_root_code", 23 | "lat", 24 | "lon", 25 | "source_url"] 26 | } 27 | date { 28 | match => ["date", "YYYY-MM-dd"] 29 | target => "date" 30 | } 31 | date { 32 | match => ["event_date", "YYYY-MM-dd"] 33 | target => "event_date" 34 | } 35 | mutate { 36 | convert => {"lat" => "float"} 37 | convert => {"lon" => "float"} 38 | convert => {"event_code" => "integer"} 39 | convert => {"event_base_code" => "integer"} 40 | convert => {"event_root_code" => "integer"} 41 | add_field => ["location", "%{lat},%{lon}"] 42 | } 43 | } 44 | 45 | output { 46 | elasticsearch { 47 | index => "gdelt_custom_2019" 48 | hosts => ["{ES_HOST}:${ES_PORT}"] 49 | user => "${ES_USERNAME}" 50 | password => "${ES_PASSWORD}" 51 | } 52 | # stdout { codec => rubydebug } 53 | } 54 | -------------------------------------------------------------------------------- /scripts/elasticsearch/logstash_ingest_gdelt_2020.conf: -------------------------------------------------------------------------------- 1 | input { 2 | stdin{ } 3 | } 4 | 5 | # Does not stop running after finish 6 | #input { 7 | # file { 8 | # path => "/home/hadoop/sgds/processed_data/20200101_filtered_noheader.csv" 9 | # start_position => "beginning" 10 | # type => "data" 11 | # } 12 | #} 13 | 14 
| filter { 15 | csv { 16 | separator => "," 17 | columns => ["event_id", 18 | "date", 19 | "event_date", 20 | "event_code", 21 | "event_base_code", 22 | "event_root_code", 23 | "lat", 24 | "lon", 25 | "source_url"] 26 | } 27 | date { 28 | match => ["date", "YYYY-MM-dd"] 29 | target => "date" 30 | } 31 | date { 32 | match => ["event_date", "YYYY-MM-dd"] 33 | target => "event_date" 34 | } 35 | mutate { 36 | convert => {"lat" => "float"} 37 | convert => {"lon" => "float"} 38 | convert => {"event_code" => "integer"} 39 | convert => {"event_base_code" => "integer"} 40 | convert => {"event_root_code" => "integer"} 41 | add_field => ["location", "%{lat},%{lon}"] 42 | } 43 | } 44 | 45 | output { 46 | elasticsearch { 47 | index => "gdelt_custom_2020" 48 | hosts => ["{ES_HOST}:${ES_PORT}"] 49 | user => "${ES_USERNAME}" 50 | password => "${ES_PASSWORD}" 51 | } 52 | # stdout { codec => rubydebug } 53 | } 54 | -------------------------------------------------------------------------------- /scripts/elasticsearch/logstash_ingest_ne_countries.conf: -------------------------------------------------------------------------------- 1 | input { 2 | stdin{ } 3 | } 4 | 5 | filter { 6 | csv { 7 | separator => "," 8 | columns => ["ne_id", "name", "iso_a2", "geometry"] 9 | } 10 | } 11 | 12 | output { 13 | elasticsearch { 14 | index => "ne_countries" 15 | hosts => ["{ES_HOST}:${ES_PORT}"] 16 | user => "${ES_USERNAME}" 17 | password => "${ES_PASSWORD}" 18 | } 19 | # stdout { codec => rubydebug } 20 | } 21 | -------------------------------------------------------------------------------- /scripts/elasticsearch/logstash_ingest_ne_countries.sh: -------------------------------------------------------------------------------- 1 | INDEX=ne_country 2 | 3 | # Delete index 4 | curl \ 5 | -u $ES_USERNAME:$ES_PASSWORD \ 6 | -X DELETE "${ES_HOST}:${ES_PORT}/${INDEX}?pretty" 7 | 8 | # Create mapping for the data 9 | curl \ 10 | -u $ES_USERNAME:$ES_PASSWORD \ 11 | -X PUT "${ES_HOST}:${ES_PORT}/_template/${INDEX}?pretty" \ 12 | -H 'Content-Type: application/json' \ 13 | -d' 14 | { 15 | "order": 10, 16 | "index_patterns": [ 17 | "'"${INDEX}"'*" 18 | ], 19 | "settings": { 20 | "index": { 21 | "number_of_shards": "1", 22 | "number_of_replicas": "0" 23 | } 24 | }, 25 | "mappings": { 26 | "properties": { 27 | "geometry": { 28 | "type": "geo_shape" 29 | } 30 | } 31 | }, 32 | "aliases": {} 33 | }' 34 | 35 | # Load data 36 | cat /home/hadoop/sgds/processed_data/ne_110m_admin_0_countries.csv | \ 37 | /usr/share/logstash/bin/logstash -f \ 38 | /home/hadoop/sgds/scripts/logstash_ingest_ne_countries.conf 39 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_fs_ingest.sh: -------------------------------------------------------------------------------- 1 | /usr/local/geomesa-fs/bin/geomesa-fs ingest \ 2 | --encoding parquet \ 3 | --partition-scheme daily,z2-2bit \ 4 | --path hdfs://node-master:54310/tmp/geomesa/1 \ 5 | --converter gdelt \ 6 | --spec gdelt \ 7 | --num-reducers 10 \ 8 | /home/hadoop/sgds/data/raw/20200101.export.csv 9 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_fs_ingest_gdelt_custom.sh: -------------------------------------------------------------------------------- 1 | # real 33m52.894s 2 | # user 26m2.721s 3 | # sys 4m0.637s 4 | # 16083528 ingested 8926313 failed in 00:33:01 5 | 6 | HDFS_HOST="hdfs://node-master:54310" 7 | #HDFS_FILEPATH="/tmp/geomesa/gdelt_custom" 8 | 
HDFS_FILEPATH="/tmp/geomesa/gdelt_custom_2020" 9 | CONF_FILEPATH="geomesa_gdelt_custom.conf" 10 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test.csv" 11 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test_noheader_100k.csv" 12 | DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_filtered_noheader.csv" 13 | 14 | hdfs dfs -rm -r $HDFS_FILEPATH 15 | 16 | time /usr/local/geomesa-fs/bin/geomesa-fs ingest \ 17 | --encoding parquet \ 18 | --partition-scheme z2-8bit \ 19 | --path "${HDFS_HOST}${HDFS_FILEPATH}" \ 20 | --converter $CONF_FILEPATH \ 21 | --spec $CONF_FILEPATH \ 22 | --num-reducers 2 \ 23 | $DATA_FILEPATH 24 | 25 | # 2020 26 | # 100% complete 24330238 ingested 0 failed in 00:20:15 27 | # INFO Local ingestion complete in 00:20:38 28 | # INFO Ingested 24330238 features with no failures for file: /home/hadoop/sgds/processed_data/2020_filtered_noheader.csv 29 | # real 20m43.756s 30 | # user 21m59.054s 31 | # sys 1m23.437s 32 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_fs_ingest_ne_countries.sh: -------------------------------------------------------------------------------- 1 | HDFS_HOST="hdfs://node-master:54310" 2 | HDFS_FILEPATH="/tmp/geomesa/ne/countries" 3 | CONF_FILEPATH="geomesa_ne_countries.conf" 4 | DATA_FILEPATH="/home/hadoop/sgds/processed_data/ne_110m_admin_0_countries.csv" 5 | 6 | hdfs dfs -rm -r $HDFS_FILEPATH 7 | 8 | /usr/local/geomesa-fs/bin/geomesa-fs ingest \ 9 | --encoding parquet \ 10 | --partition-scheme xz2-10bit \ 11 | --path "${HDFS_HOST}${HDFS_FILEPATH}" \ 12 | --converter $CONF_FILEPATH \ 13 | --spec $CONF_FILEPATH \ 14 | --num-reducers 2 \ 15 | $DATA_FILEPATH 16 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_gdelt_custom.conf: -------------------------------------------------------------------------------- 1 | geomesa = { 2 | sfts = { 3 | gdelt_custom_2020 = { 4 | type-name = "gdelt_custom" 5 | attributes = [ 6 | { name = "event_id", type = "Integer", index = true } 7 | { name = "date", type = "Date", default = true } 8 | { name = "event_code", type = "Integer" } 9 | { name = "event_base_code", type = "Integer" } 10 | { name = "event_root_code", type = "Integer" } 11 | { name = "geom", type = "Point", index = true, default = true, srid = 4326 } 12 | ] 13 | user-data = { 14 | option.one = "value" 15 | } 16 | } 17 | } 18 | converters { 19 | "gdelt_custom_2020" = { 20 | type = "delimited-text" 21 | format = "CSV" 22 | options { 23 | # skip-lines = 1 24 | error-mode = "raise-errors" 25 | # error-mode = "skip-bad-records" 26 | } 27 | id-field = "toString($event_id)", 28 | fields = [ 29 | { name = "event_id", transform = "$1::int" } 30 | { name = "date", transform = "date('yyyy-MM-dd', $3)" } 31 | { name = "event_code", transform = "$4::int" } 32 | { name = "event_base_code", transform = "$5::int" } 33 | { name = "event_root_code", transform = "$6::int" } 34 | { name = "lat", transform = "$7::double" } 35 | { name = "lon", transform = "$8::double" } 36 | { name = "source_url", transform = "$9::string" } 37 | { name = "geom", transform = "point($lon, $lat)" } 38 | ] 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_hbase_ingest.sh: -------------------------------------------------------------------------------- 1 | # 100% complete 1695 ingested 87520 failed in 00:00:12 2 | 3 | /usr/local/geomesa-hbase_2.11-3.0.0/bin/geomesa-hbase ingest \ 4 | 
--catalog gdelt \ 5 | --converter gdelt \ 6 | --spec gdelt \ 7 | /home/hadoop/sgds/data/raw/20200101.export.csv 8 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_hbase_ingest_gdelt_custom.sh: -------------------------------------------------------------------------------- 1 | # 2020 2 | # 100% complete 24330238 ingested 0 failed in 00:53:23 3 | # INFO Local ingestion complete in 00:53:25 4 | # INFO Ingested 24330238 features with no failures for file: /home/hadoop/sgds/processed_data/2020_filtered_noheader.csv 5 | # real 40m8.945s 6 | # user 24m53.436s 7 | # sys 1m6.064s 8 | 9 | 10 | CONF_FILEPATH="geomesa_gdelt_custom.conf" 11 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test.csv" 12 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test_noheader_100k.csv" 13 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_filtered_noheader.csv" 14 | DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_filtered_noheader.csv" 15 | CATALOG_NAME=gdelt_custom_2020 16 | 17 | time /usr/local/geomesa-hbase_2.11-3.0.0/bin/geomesa-hbase ingest \ 18 | --catalog $CATALOG_NAME \ 19 | --converter $CONF_FILEPATH \ 20 | --spec $CONF_FILEPATH \ 21 | $DATA_FILEPATH 22 | -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_hbase_ingest_ne_countries.sh: -------------------------------------------------------------------------------- 1 | CONF_FILEPATH="geomesa_ne_countries.conf" 2 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test.csv" 3 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_test_noheader_100k.csv" 4 | #DATA_FILEPATH="/home/hadoop/sgds/processed_data/2020_filtered_noheader.csv" 5 | DATA_FILEPATH="/home/hadoop/sgds/processed_data/ne_110m_admin_0_countries.csv" 6 | CATALOG_NAME=ne_countries 7 | 8 | time /usr/local/geomesa-hbase_2.11-3.0.0/bin/geomesa-hbase ingest \ 9 | --catalog $CATALOG_NAME \ 10 | --converter $CONF_FILEPATH \ 11 | --spec $CONF_FILEPATH \ 12 | $DATA_FILEPATH -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_ne_countries.conf: -------------------------------------------------------------------------------- 1 | geomesa { 2 | sfts { 3 | "ne_countries" = { 4 | attributes = [ 5 | { name = "ne_id", type = "Integer", index = true } 6 | { name = "name", type = "String" } 7 | { name = "iso_a2", type = "String" } 8 | { name = "polygons", type = "MultiPolygon", index = true, srid = 4326, default = true } 9 | ] 10 | } 11 | } 12 | converters { 13 | "ne_countries" { 14 | type = "delimited-text", 15 | format = "CSV", 16 | options { 17 | skip-lines = 1 18 | # error-mode = "raise-errors" 19 | error-mode = "skip-bad-records" 20 | }, 21 | id-field = "$1", 22 | fields = [ 23 | { name = "ne_id", transform = "$1::int" } 24 | { name = "name", transform = "$2::string" } 25 | { name = "iso_a2", transform = "$3::string" } 26 | { name = "polygons", transform = "multipolygon($4)" } 27 | ] 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_parquet_example.conf: -------------------------------------------------------------------------------- 1 | { 2 | "geomesa" : { 3 | "sfts" : { 4 | "example" : { 5 | "fields" : [ 6 | { "name" : "color", "type" : "String" } 7 | { "name" : "number", "type" : "Long" } 8 | { "name" : "height", "type" : "String" } 9 | { "name" : "weight", "type" : "Double" } 10 | { "name" : "geom", "type" : "Point", "srid" : 4326 } 
11 | ] 12 | } 13 | }, 14 | "converters" : { 15 | "example" : { 16 | "type" : "parquet", 17 | "id-field" : "avroPath($0, '/id')", 18 | "fields" : [ 19 | { "name" : "color", "transform" : "avroPath($0,'/color')" }, 20 | { "name" : "number", "transform" : "avroPath($0,'/number')" }, 21 | { "name" : "height", "transform" : "avroPath($0,'/physical/height')" }, 22 | { "name" : "weight", "transform" : "avroPath($0,'/physical/weight')" }, 23 | { "name" : "geom", "transform" : "point(avroPath($0,'/lon'),avroPath($0,'/lat'))" } 24 | ], 25 | "options" : { 26 | "encoding" : "UTF-8", 27 | "error-mode" : "skip-bad-records", 28 | "parse-mode" : "incremental", 29 | "validators" : [ "index" ] 30 | } 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /scripts/geomesa/geomesa_spark_shell.sh: -------------------------------------------------------------------------------- 1 | #VERSION=2.4.1 2 | VERSION=3.0.0 3 | 4 | GEOMESA_FS_SPARK_RUNTIME=$GEOMESA_FS_HOME/dist/spark/geomesa-fs-spark-runtime_2.11-$VERSION.jar 5 | GEOMESA_HBASE_SPARK_RUNTIME=$GEOMESA_HBASE_HOME/dist/spark/geomesa-hbase-spark-runtime-hbase2_2.11-$VERSION.jar 6 | 7 | spark-shell \ 8 | --driver-java-options "-Dhive.metastore.uris=thrift://node-master:9083" \ 9 | --jars $GEOMESA_HBASE_SPARK_RUNTIME,$GEOMESA_FS_SPARK_RUNTIME \ 10 | --master yarn \ 11 | --driver-memory 8g 12 | -------------------------------------------------------------------------------- /scripts/geomesa/install_geomesa_pyspark.sh: -------------------------------------------------------------------------------- 1 | VERSION=3.0.0 2 | 3 | mvn clean install -Ppython 4 | pip install geomesa-spark/geomesa_pyspark/target/geomesa_pyspark-$VERSION.tar.gz 5 | -------------------------------------------------------------------------------- /scripts/hive/calculate_table_statistics.sh: -------------------------------------------------------------------------------- 1 | hive -v -e " 2 | -- Enable snappy compression for parquet files 3 | SET parquet.compression=SNAPPY; 4 | 5 | -- SET mapreduce.map.memory.mb=4096; 6 | SET mapreduce.map.java.opts=-Xmx2048m; 7 | -- SET mapreduce.reduce.memory.mb=4096; 8 | -- SET mapreduce.reduce.java.opts=-Xmx3686m; 9 | 10 | -- ANALYZE TABLE gdelt_parquet_2020 COMPUTE STATISTICS FOR COLUMNS; 11 | -- ANALYZE TABLE gdelt_parquet COMPUTE STATISTICS FOR COLUMNS; 12 | ANALYZE TABLE ne_110_countries_parquet COMPUTE STATISTICS FOR COLUMNS; 13 | ANALYZE TABLE ne_10_states_provinces_parquet COMPUTE STATISTICS FOR COLUMNS; 14 | -- ANALYZE TABLE iris_parquet COMPUTE STATISTICS FOR COLUMNS;" 15 | -------------------------------------------------------------------------------- /scripts/hive/create_table_gdelt_csv.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS gdelt_csv_2020; 2 | 3 | CREATE EXTERNAL TABLE gdelt_csv_2020 ( 4 | `event_id` BIGINT, 5 | `date` DATE, 6 | `event_date` DATE, 7 | `event_code` INT, 8 | `event_base_code` INT, 9 | `event_root_code` INT, 10 | `lat` DECIMAL(18,14), 11 | `lon` DECIMAL(18,14), 12 | `source_url` VARCHAR(100) 13 | ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 14 | LINES TERMINATED BY '\n' 15 | STORED AS TEXTFILE 16 | LOCATION 'hdfs://node-master:54310/user/hadoop/gdelt_csv_2020/'; 17 | 18 | 19 | 20 | DROP TABLE IF EXISTS gdelt_csv_2019; 21 | 22 | CREATE EXTERNAL TABLE gdelt_csv_2019 ( 23 | `event_id` BIGINT, 24 | `date` DATE, 25 | `event_date` DATE, 26 | `event_code` INT, 27 | `event_base_code` INT, 28 | `event_root_code` 
INT, 29 | `lat` DECIMAL(18,14), 30 | `lon` DECIMAL(18,14), 31 | `source_url` VARCHAR(100) 32 | ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 33 | LINES TERMINATED BY '\n' 34 | STORED AS TEXTFILE 35 | LOCATION 'hdfs://node-master:54310/user/hadoop/gdelt_csv_2019/'; 36 | -------------------------------------------------------------------------------- /scripts/hive/create_table_gdelt_parquet.hql: -------------------------------------------------------------------------------- 1 | -- Enable snappy compression for parquet files 2 | SET parquet.compression=SNAPPY; 3 | 4 | DROP TABLE IF EXISTS gdelt_parquet; 5 | CREATE EXTERNAL TABLE gdelt_parquet ( 6 | `event_id` STRING, 7 | `date` DATE, 8 | `event_date` DATE, 9 | `event_code` BIGINT, 10 | `event_base_code` BIGINT, 11 | `event_root_code` BIGINT, 12 | `lat` DOUBLE, 13 | `lon` DOUBLE, 14 | `geo_type` BIGINT, 15 | `country_code` STRING, 16 | `adm1_code` STRING, 17 | `source_url` STRING, 18 | `netloc` STRING 19 | ) 20 | STORED AS PARQUET 21 | LOCATION 'hdfs://node-master:54310/user/hadoop/gdelt_500MB.snappy.parq'; 22 | 23 | DROP TABLE IF EXISTS gdelt_parquet_2020; 24 | CREATE EXTERNAL TABLE gdelt_parquet_2020 ( 25 | `event_id` STRING, 26 | `date` DATE, 27 | `event_date` DATE, 28 | `event_code` BIGINT, 29 | `event_base_code` BIGINT, 30 | `event_root_code` BIGINT, 31 | `lat` DOUBLE, 32 | `lon` DOUBLE, 33 | `geo_type` BIGINT, 34 | `country_code` STRING, 35 | `adm1_code` STRING, 36 | `source_url` STRING, 37 | `netloc` STRING 38 | ) 39 | STORED AS PARQUET 40 | LOCATION 'hdfs://node-master:54310/user/hadoop/gdelt_2020_500MB.snappy.parq'; 41 | 42 | -- Calculate table statistics 43 | --ANALYZE TABLE gdelt_parquet 44 | --COMPUTE STATISTICS FOR COLUMNS; 45 | 46 | --ANALYZE TABLE gdelt_parquet_2020 47 | --COMPUTE STATISTICS FOR COLUMNS; 48 | -------------------------------------------------------------------------------- /scripts/hive/create_table_ne_parquet.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS ne_10_states_provinces_parquet; 2 | CREATE EXTERNAL TABLE ne_10_states_provinces_parquet ( 3 | `ne_id` BIGINT, 4 | `name` STRING, 5 | `iso_a2` STRING, 6 | `geometry` BINARY 7 | ) 8 | STORED AS PARQUET 9 | LOCATION 'hdfs://node-master:54310/user/hadoop/ne_states_provinces_parquet'; 10 | 11 | 12 | DROP TABLE IF EXISTS ne_110_countries_parquet; 13 | CREATE EXTERNAL TABLE ne_110_countries_parquet ( 14 | `ne_id` BIGINT, 15 | `name` STRING, 16 | `iso_a2` STRING, 17 | `geometry` BINARY 18 | ) 19 | STORED AS PARQUET 20 | LOCATION 'hdfs://node-master:54310/user/hadoop/ne_countries_parquet'; 21 | -------------------------------------------------------------------------------- /scripts/install_hadoop.sh: -------------------------------------------------------------------------------- 1 | #VERSION="2.10.0" 2 | #VERSION="3.1.3" 3 | VERSION="3.2.1" 4 | FILEPATH="/home/hadoop/hadoop-${VERSION}.tar.gz" 5 | 6 | wget "https://downloads.apache.org/hadoop/common/hadoop-${VERSION}/hadoop-${VERSION}.tar.gz" \ 7 | -O $FILEPATH 8 | 9 | sudo tar xvf $FILEPATH \ 10 | --directory="/usr/local/hadoop" \ 11 | --strip 1 12 | -------------------------------------------------------------------------------- /scripts/install_hive.sh: -------------------------------------------------------------------------------- 1 | #VERSION="1.2.2" 2 | #VERSION="2.3.7" 3 | VERSION="3.1.2" 4 | FILEPATH="/home/hadoop/apache-hive-${VERSION}-bin.tar.gz" 5 | 6 | #wget 
"https://downloads.apache.org/hive/hive-${VERSION}/apache-hive-${VERSION}-bin.tar.gz" \ 7 | # -O $FILEPATH 8 | 9 | #sudo tar xvf $FILEPATH \ 10 | # --directory="/usr/local/hive" \ 11 | # --strip 1 12 | 13 | # Change ownership to hadoop user 14 | sudo chown -R hadoop:hadoop "/usr/local/hive" 15 | 16 | # Add configuration 17 | cp "configuration/hive-site.xml" \ 18 | "/usr/local/hive/conf/" 19 | 20 | # Download MySQL Connector 21 | wget "https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.20/mysql-connector-java-8.0.20.jar" \ 22 | --directory-prefix="/usr/local/hive/lib/" 23 | 24 | rm "/usr/local/hive/lib/guava-"* 25 | cp "/usr/local/hadoop/share/hadoop/hdfs/lib/guava-"* \ 26 | "/usr/local/hive/lib" 27 | 28 | # Drop metastore database 29 | #mysql -u hive -D metastore -e "DROP DATABASE metastore;" -p 30 | -------------------------------------------------------------------------------- /scripts/install_hive_metastore.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | source .env 4 | 5 | METASTORE_HOME=/usr/local/metastore 6 | 7 | if [ -z "$HADOOP_HOME" ]; then 8 | echo "HADOOP_HOME evnironment variable not set" 9 | echo 'export HADOOP_HOME=/usr/local/hadoop' >> ~/.bashrc 10 | export HADOOP_HOME=/usr/local/hadoop 11 | fi 12 | 13 | if [ "$1" = "mariadb" ]; then 14 | # Install MariaDB 15 | sudo apt update 16 | sudo apt install -y mariadb-server 17 | sudo systemctl enable mariadb.service 18 | sudo systemctl start mariadb.service 19 | 20 | # Prepare user and database 21 | sudo mysql -u root -e " 22 | DROP DATABASE IF EXISTS metastore; 23 | CREATE DATABASE metastore; 24 | 25 | CREATE USER 'hive'@localhost IDENTIFIED BY 'hive'; 26 | GRANT ALL PRIVILEGES ON *.* TO 'hive'@'localhost'; 27 | FLUSH PRIVILEGES;" 28 | fi 29 | 30 | # Download Hive Standalone Metastore 31 | wget "https://repo1.maven.org/maven2/org/apache/hive/hive-standalone-metastore/3.1.2/hive-standalone-metastore-3.1.2-bin.tar.gz" 32 | tar -zxvf hive-standalone-metastore-3.1.2-bin.tar.gz 33 | sudo mv apache-hive-metastore-3.1.2-bin $METASTORE_HOME 34 | sudo chown $USER:$USER $METASTORE_HOME 35 | 36 | # Download Hadoop 37 | wget "https://downloads.apache.org/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz" 38 | tar xvf hadoop-3.2.1.tar.gz 39 | sudo mv hadoop-3.2.1 $HADOOP_HOME 40 | sudo chown $USER:$USER $HADOOP_HOME 41 | 42 | # Replace Guava library and add missing libraries 43 | rm $METASTORE_HOME/lib/guava-19.0.jar 44 | cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $METASTORE_HOME/lib/ 45 | cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-aws-3.2.1.jar $METASTORE_HOME/lib/ 46 | cp $HADOOP_HOME/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar $METASTORE_HOME/lib/ 47 | 48 | # Download MySQL Connector 49 | wget "https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.20/mysql-connector-java-8.0.20.jar" \ 50 | --directory-prefix=$METASTORE_HOME/lib/ 51 | 52 | envsubst < scripts/configuration/metastore-site.xml > \ 53 | $METASTORE_HOME/conf/metastore-site.xml 54 | 55 | $METASTORE_HOME/bin/schematool -initSchema -dbType mysql 56 | 57 | # Start metastore with: $METASTORE_HOME/bin/start-metastore & 58 | -------------------------------------------------------------------------------- /scripts/install_trino.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | source .env 4 | 5 | TRINO_HOME=/usr/local/trino 6 | CORDINATOR=true 7 | 8 | sudo apt-get update 9 | sudo apt-get install -y \ 10 | openjdk-11-jdk-headless \ 
11 | openjdk-11-jre-headless \ 12 | openjdk-11-jre \ 13 | python-is-python3 \ 14 | uuid 15 | 16 | if [ -z "$JAVA_HOME" ]; then 17 | echo "JAVA_HOME evnironment variable not set" 18 | echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64' >> ~/.bashrc 19 | export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 20 | fi 21 | 22 | wget "https://repo1.maven.org/maven2/io/trino/trino-server/352/trino-server-352.tar.gz" 23 | tar -xzvf trino-server-352.tar.gz 24 | sudo mv trino-server-352 $TRINO_HOME 25 | sudo chown $USER:$USER $TRINO_HOME 26 | 27 | if [ "$1" = "cli" ]; then 28 | wget "https://repo1.maven.org/maven2/io/trino/trino-cli/352/trino-cli-352-executable.jar" 29 | mv trino-cli-352-executable.jar $TRINO_HOME/bin/trino 30 | sudo chmod +x $TRINO_HOME/bin/trino 31 | fi 32 | 33 | # Trino Configuration 34 | mkdir -p $TRINO_HOME/etc/catalog 35 | 36 | cat > $TRINO_HOME/etc/jvm.config << EOF 37 | -server 38 | -Xmx6G 39 | -XX:+UseG1GC 40 | -XX:G1HeapRegionSize=32M 41 | -XX:+UseGCOverheadLimit 42 | -XX:+ExplicitGCInvokesConcurrent 43 | -XX:+HeapDumpOnOutOfMemoryError 44 | -XX:+ExitOnOutOfMemoryError 45 | -Djdk.attach.allowAttachSelf=true 46 | EOF 47 | 48 | NODE_UUID=$(uuid -v 1) 49 | cat > $TRINO_HOME/etc/node.properties << EOF 50 | node.environment=production 51 | node.id=$NODE_UUID 52 | node.data-dir=$TRINO_HOME/data 53 | EOF 54 | 55 | cat > $TRINO_HOME/etc/config.properties << EOF 56 | coordinator=$CORDINATOR 57 | node-scheduler.include-coordinator=$CORDINATOR 58 | http-server.http.port=8080 59 | query.max-memory=50GB 60 | query.max-memory-per-node=1GB 61 | query.max-total-memory-per-node=2GB 62 | discovery-server.enabled=true 63 | discovery.uri=http://localhost:8080 64 | EOF 65 | 66 | cat > $TRINO_HOME/etc/catalog/hive.properties << EOF 67 | connector.name=hive-hadoop2 68 | hive.metastore.uri=thrift://localhost:9083 69 | hive.s3.path-style-access=true 70 | hive.s3.endpoint=$S3_ENDPOINT 71 | hive.s3.aws-access-key=$S3_ACCESS_KEY 72 | hive.s3.aws-secret-key=$S3_SECRET_KEY 73 | hive.s3.ssl.enabled=false 74 | EOF 75 | -------------------------------------------------------------------------------- /scripts/load_gdelt_to_hdfs.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -rm -r gdelt_csv_2020 2 | hdfs dfs -rm -r gdelt_parquet_2020 3 | hdfs dfs -rm -r gdelt_parquet 4 | hdfs dfs -rm -r /user/hive/warehouse 5 | 6 | hdfs dfs -mkdir gdelt_csv_2020 7 | hdfs dfs -mkdir -p /user/hive/warehouse 8 | 9 | time hdfs dfs -put ../processed_data/2020_filtered_noheader.csv \ 10 | gdelt_csv_2020/2020.csv 11 | 12 | time hdfs dfs -put ../processed_data/gdelt_2020_500MB.snappy.parq \ 13 | gdelt_2020_500MB.snappy.parq 14 | 15 | time hdfs dfs -put ../processed_data/gdelt_500MB.snappy.parq \ 16 | gdelt_500MB.snappy.parq 17 | 18 | hive -f hive/create_table_gdelt_csv.hql 19 | hive -f hive/create_table_gdelt_parquet.hql 20 | -------------------------------------------------------------------------------- /scripts/load_ne_to_hdfs.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -rm -r ne_countries_parquet 2 | hdfs dfs -rm -r ne_states_provinces_parquet 3 | hdfs dfs -mkdir ne_countries_parquet 4 | hdfs dfs -mkdir ne_states_provinces_parquet 5 | 6 | time hdfs dfs -put \ 7 | ../processed_data/ne_110_countries.snappy.parq \ 8 | ne_countries_parquet/ne_110_countries.snappy.parq 9 | 10 | time hdfs dfs -put \ 11 | ../processed_data/ne_10_states_provinces.snappy.parq \ 12 | ne_states_provinces_parquet/ne_10_states_provinces.snappy.parq 
13 | 14 | hive -f hive/create_table_ne_parquet.hql 15 | -------------------------------------------------------------------------------- /scripts/load_ne_to_postgres.sh: -------------------------------------------------------------------------------- 1 | #FILEPATH="naturalearth/ne_10m_admin_1_states_provinces.shp" 2 | FILEPATH="naturalearth/ne_10m_admin_0_countries.shp" 3 | #TABLE_NAME=states_provinces 4 | TABLE_NAME=countries 5 | 6 | ogr2ogr -f "PostgreSQL" \ 7 | PG:"dbname='sgds' \ 8 | host='${POSTGRES_HOST}' \ 9 | port='${POSTGRES_PORT}' \ 10 | user='${POSTGRES_USERNAME}' \ 11 | password='${POSTGRES_PASSWORD}'" \ 12 | $FILEPATH \ 13 | -nln $TABLE_NAME \ 14 | -nlt MULTIPOLYGON 15 | -------------------------------------------------------------------------------- /scripts/mongodb/gdelt_2020_create_geometry_field.js: -------------------------------------------------------------------------------- 1 | // https://docs.mongodb.com/manual/tutorial/write-scripts-for-the-mongo-shell/ 2 | db = db.getSiblingDB('sgds'); 3 | //collection = db.gdelt_20200101; 4 | collection = db.gdelt_2020; 5 | 6 | //print("Delete all documents with no lat or lon") 7 | //collection.remove({ 8 | // "$or": [ 9 | // { "lat": { "$exists": false } }, 10 | // { "lon": { "$exists": false } } 11 | // ] 12 | //}); 13 | 14 | print("Iterate over all entries and add geometry"); 15 | collection.updateMany( {}, [ 16 | { "$set": { 17 | "geometry": { 18 | type: "Point", 19 | coordinates: ["$lon", "$lat"] 20 | } 21 | } 22 | } 23 | ]) 24 | 25 | print("Add 2dsphere index"); 26 | printjson( collection.createIndex( { "geometry": "2dsphere" } ) ); 27 | printjson( collection.ensureIndex( { "geometry": "2dsphere" } ) ); 28 | 29 | cursor = collection.find().limit(2); 30 | while ( cursor.hasNext() ) { 31 | printjson( cursor.next() ); 32 | } 33 | -------------------------------------------------------------------------------- /scripts/mongodb/mongodb_gdelt_fields.txt: -------------------------------------------------------------------------------- 1 | event_id.int64() 2 | event_date.date(2006-01-02) 3 | date.date(2006-01-02) 4 | event_code.int64() 5 | event_base_code.int64() 6 | event_root_code.int64() 7 | lat.double() 8 | lon.double() 9 | source_url.string() 10 | -------------------------------------------------------------------------------- /scripts/mongodb/mongodb_import_csv.sh: -------------------------------------------------------------------------------- 1 | #CSV_FILEPATH="/home/hadoop/sgds/processed_data/2020_test_noheader_100k.csv" 2 | #CSV_FILEPATH="/home/hadoop/sgds/processed_data/2019_filtered_noheader.csv" 3 | CSV_FILEPATH="/home/hadoop/sgds/processed_data/20200101_filtered_noheader.csv" 4 | FIELDS_FILEPATH="/home/hadoop/sgds/scripts/mongodb/mongodb_gdelt_fields.txt" 5 | COLLECTION=gdelt_20200101 6 | 7 | # --ignoreBlanks Ignores empty fields in csv and tsv exports 8 | # --drop drops the collection before importing the data 9 | 10 | # Reference date to parse dates: 11 | # https://docs.mongodb.com/database-tools/mongoimport/#cmdoption-mongoimport-columnshavetypes 12 | # column.date(2006-01-02T15:04:05.000Z) 13 | # column.date(2006-01-02T15:04:05Z) 14 | # column.date(2006-01-02) 15 | 16 | mongoimport \ 17 | --type csv \ 18 | --ignoreBlanks \ 19 | --username $MONGODB_USERNAME \ 20 | --password $MONGODB_PASSWORD \ 21 | --authenticationDatabase admin \ 22 | --db sgds \ 23 | --collection $COLLECTION \ 24 | --columnsHaveTypes \ 25 | --fieldFile $FIELDS_FILEPATH \ 26 | --drop \ 27 | --file=$CSV_FILEPATH 28 | 29 | # 2020 30 | # 24330238 
document(s) imported successfully. 0 document(s) failed to import. 3.67GB 31 | # real 45m28.693s 32 | # user 17m2.235s 33 | # sys 1m22.913s 34 | 35 | # 2019 36 | # 54815269 document(s) imported successfully. 0 document(s) failed to import. 8.14GB 37 | # real 91m43.745s 38 | # user 37m13.230s 39 | # sys 3m6.665s 40 | 41 | 42 | # real 172m7.550s 43 | # user 0m0.431s 44 | # sys 0m0.186s 45 | time mongo \ 46 | --username $MONGODB_USERNAME \ 47 | --password $MONGODB_PASSWORD \ 48 | --authenticationDatabase admin \ 49 | mongodb/gdelt_2020_create_geometry_field.js 50 | -------------------------------------------------------------------------------- /scripts/postgres/add_geometry.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE gdelt_2020_test 2 | ADD COLUMN geom geometry(Point,4326); 3 | 4 | UPDATE gdelt_2020_test 5 | SET geom = ST_SetSRID(ST_MakePoint( 6 | CAST(ActionGeo_Long AS FLOAT), 7 | CAST(ActionGeo_Lat AS FLOAT)), 4326); 8 | -------------------------------------------------------------------------------- /scripts/postgres/copy_to_postgres.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS gdelt_2020_test; 2 | CREATE TABLE gdelt_2020_test ( 3 | GLOBALEVENTID TEXT, 4 | SQLDATE TEXT, 5 | MonthYear TEXT, 6 | Year TEXT, 7 | FractionDate TEXT, 8 | Actor1Code TEXT, 9 | Actor1Name TEXT, 10 | Actor1CountryCode TEXT, 11 | Actor1KnownGroupCode TEXT, 12 | Actor1EthnicCode TEXT, 13 | Actor1Religion1Code TEXT, 14 | Actor1Religion2Code TEXT, 15 | Actor1Type1Code TEXT, 16 | Actor1Type2Code TEXT, 17 | Actor1Type3Code TEXT, 18 | Actor2Code TEXT, 19 | Actor2Name TEXT, 20 | Actor2CountryCode TEXT, 21 | Actor2KnownGroupCode TEXT, 22 | Actor2EthnicCode TEXT, 23 | Actor2Religion1Code TEXT, 24 | Actor2Religion2Code TEXT, 25 | Actor2Type1Code TEXT, 26 | Actor2Type2Code TEXT, 27 | Actor2Type3Code TEXT, 28 | IsRootEvent TEXT, 29 | EventCode TEXT, 30 | EventBaseCode TEXT, 31 | EventRootCode TEXT, 32 | QuadClass TEXT, 33 | GoldsteinScale TEXT, 34 | NumMentions TEXT, 35 | NumSources TEXT, 36 | NumArticles TEXT, 37 | AvgTone TEXT, 38 | Actor1Geo_Type TEXT, 39 | Actor1Geo_FullName TEXT, 40 | Actor1Geo_CountryCode TEXT, 41 | Actor1Geo_ADM1Code TEXT, 42 | Actor1Geo_Lat TEXT, 43 | Actor1Geo_Long TEXT, 44 | Actor1Geo_FeatureID TEXT, 45 | Actor2Geo_Type TEXT, 46 | Actor2Geo_FullName TEXT, 47 | Actor2Geo_CountryCode TEXT, 48 | Actor2Geo_ADM1Code TEXT, 49 | Actor2Geo_Lat TEXT, 50 | Actor2Geo_Long TEXT, 51 | Actor2Geo_FeatureID TEXT, 52 | ActionGeo_Type TEXT, 53 | ActionGeo_FullName TEXT, 54 | ActionGeo_CountryCode TEXT, 55 | ActionGeo_ADM1Code TEXT, 56 | ActionGeo_Lat TEXT, 57 | ActionGeo_Long TEXT, 58 | ActionGeo_FeatureID TEXT, 59 | DATEADDED TEXT, 60 | SOURCEURL TEXT 61 | ); 62 | 63 | \COPY gdelt_2020_test FROM 'processed_data/2020.csv' DELIMITER ',' CSV HEADER; 64 | -------------------------------------------------------------------------------- /scripts/postgres/create_cstore_test.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS cstore_points_test; 2 | 3 | CREATE FOREIGN TABLE cstore_points_test ( 4 | SQLDATE TEXT, 5 | ActionGeo_Lat FLOAT, 6 | ActionGeo_Long FLOAT, 7 | SOURCEURL TEXT 8 | SERVER cstore_server OPTIONS(compression 'pglz'); 9 | 10 | COPY cstore_points_test 11 | FROM '/home/hadoop/sgds/processed_data/2020_test.csv' 12 | WITH CSV; 13 | -------------------------------------------------------------------------------- 
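The Postgres scripts above are plain psql scripts; a minimal sketch of running them follows, reusing the connection variables from load_ne_to_postgres.sh (the password can be supplied via PGPASSWORD) and assuming the working directory is the repository root, since copy_to_postgres.sql references the relative path processed_data/2020.csv.

export PGPASSWORD=$POSTGRES_PASSWORD
psql "host=${POSTGRES_HOST} port=${POSTGRES_PORT} user=${POSTGRES_USERNAME} dbname=sgds" -f scripts/postgres/copy_to_postgres.sql
psql "host=${POSTGRES_HOST} port=${POSTGRES_PORT} user=${POSTGRES_USERNAME} dbname=sgds" -f scripts/postgres/add_geometry.sql
# add_geometry.sql assumes the PostGIS extension is installed in the sgds database;
# create_cstore_test.sql assumes the cstore_fdw extension and a foreign server named cstore_server already exist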
/scripts/postgres/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE gdelt_2020_test ( 2 | GLOBALEVENTID TEXT, 3 | SQLDATE TEXT, 4 | MonthYear TEXT, 5 | Year TEXT, 6 | FractionDate TEXT, 7 | Actor1Code TEXT, 8 | Actor1Name TEXT, 9 | Actor1CountryCode TEXT, 10 | Actor1KnownGroupCode TEXT, 11 | Actor1EthnicCode TEXT, 12 | Actor1Religion1Code TEXT, 13 | Actor1Religion2Code TEXT, 14 | Actor1Type1Code TEXT, 15 | Actor1Type2Code TEXT, 16 | Actor1Type3Code TEXT, 17 | Actor2Code TEXT, 18 | Actor2Name TEXT, 19 | Actor2CountryCode TEXT, 20 | Actor2KnownGroupCode TEXT, 21 | Actor2EthnicCode TEXT, 22 | Actor2Religion1Code TEXT, 23 | Actor2Religion2Code TEXT, 24 | Actor2Type1Code TEXT, 25 | Actor2Type2Code TEXT, 26 | Actor2Type3Code TEXT, 27 | IsRootEvent TEXT, 28 | EventCode TEXT, 29 | EventBaseCode TEXT, 30 | EventRootCode TEXT, 31 | QuadClass TEXT, 32 | GoldsteinScale TEXT, 33 | NumMentions TEXT, 34 | NumSources TEXT, 35 | NumArticles TEXT, 36 | AvgTone TEXT, 37 | Actor1Geo_Type TEXT, 38 | Actor1Geo_FullName TEXT, 39 | Actor1Geo_CountryCode TEXT, 40 | Actor1Geo_ADM1Code TEXT, 41 | Actor1Geo_Lat TEXT, 42 | Actor1Geo_Long TEXT, 43 | Actor1Geo_FeatureID TEXT, 44 | Actor2Geo_Type TEXT, 45 | Actor2Geo_FullName TEXT, 46 | Actor2Geo_CountryCode TEXT, 47 | Actor2Geo_ADM1Code TEXT, 48 | Actor2Geo_Lat TEXT, 49 | Actor2Geo_Long TEXT, 50 | Actor2Geo_FeatureID TEXT, 51 | ActionGeo_Type TEXT, 52 | ActionGeo_FullName TEXT, 53 | ActionGeo_CountryCode TEXT, 54 | ActionGeo_ADM1Code TEXT, 55 | ActionGeo_Lat TEXT, 56 | ActionGeo_Long TEXT, 57 | ActionGeo_FeatureID TEXT, 58 | DATEADDED TEXT, 59 | SOURCEURL TEXT 60 | ); -------------------------------------------------------------------------------- /scripts/postgres/spatial_join.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gdelt."EventRootCode" AS eventcode, 3 | COUNT(gdelt."SQLDATE") AS counts, 4 | region.wkb_geometry AS geom 5 | FROM gdelt_2020 as gdelt 6 | JOIN states_provinces as region 7 | ON ST_Contains(region.wkb_geometry, gdelt.geometry) 8 | GROUP BY gdelt."EventRootCode", region.wkb_geometry 9 | LIMIT 10; 10 | -------------------------------------------------------------------------------- /scripts/presto/create_tables.sh: -------------------------------------------------------------------------------- 1 | RUNTIME=/usr/local/trino/bin/trino 2 | 3 | BUCKET="s3a://gdelt" 4 | #BUCKET="s3a://gdelt-446756" 5 | CATALOG=hive 6 | SCHEMA=gdelt 7 | 8 | command=$(cat << EOF 9 | CREATE SCHEMA IF NOT EXISTS $CATALOG.$SCHEMA 10 | WITH (location = '${BUCKET}/'); 11 | 12 | CREATE TABLE IF NOT EXISTS $CATALOG.$SCHEMA.gdelt_parquet_2020 ( 13 | event_id VARCHAR, 14 | date DATE, 15 | event_date DATE, 16 | event_code BIGINT, 17 | event_base_code BIGINT, 18 | event_root_code BIGINT, 19 | lat DOUBLE, 20 | lon DOUBLE, 21 | geo_type BIGINT, 22 | country_code VARCHAR, 23 | adm1_code VARCHAR, 24 | source_url VARCHAR, 25 | netloc VARCHAR 26 | ) 27 | WITH ( 28 | external_location = '${BUCKET}/gdelt_parquet_2020', 29 | format = 'PARQUET' 30 | ); 31 | 32 | CREATE TABLE IF NOT EXISTS $CATALOG.$SCHEMA.gdelt_parquet ( 33 | event_id VARCHAR, 34 | date DATE, 35 | event_date DATE, 36 | event_code BIGINT, 37 | event_base_code BIGINT, 38 | event_root_code BIGINT, 39 | lat DOUBLE, 40 | lon DOUBLE, 41 | geo_type BIGINT, 42 | country_code VARCHAR, 43 | adm1_code VARCHAR, 44 | source_url VARCHAR, 45 | netloc VARCHAR 46 | ) 47 | WITH ( 48 | external_location = 
'${BUCKET}/gdelt_parquet', 49 | format = 'PARQUET' 50 | ); 51 | 52 | CREATE TABLE IF NOT EXISTS $CATALOG.$SCHEMA.ne_110_countries_parquet ( 53 | ne_id BIGINT, 54 | name VARCHAR, 55 | iso_a2 VARCHAR, 56 | geometry VARBINARY 57 | ) 58 | WITH ( 59 | external_location = '${BUCKET}/ne_110_countries_parquet', 60 | format = 'PARQUET' 61 | ); 62 | 63 | SHOW TABLES IN $CATALOG.$SCHEMA; 64 | EOF 65 | ) 66 | 67 | $RUNTIME --execute "$command" 68 | -------------------------------------------------------------------------------- /scripts/presto/presto_query_stats.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | substring(regexp_split(query, '\n')[1], 1, 20) AS q, 3 | date_diff('millisecond', created, "end")/1000.0 AS total 4 | FROM system.runtime.queries 5 | WHERE state = 'FINISHED' 6 | AND source = 'presto-cli' 7 | ORDER BY "end"; 8 | -------------------------------------------------------------------------------- /scripts/presto/query/query_01.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | COUNT(event_id) AS cnt 3 | FROM 4 | ${GDELT_TABLE_NAME}; 5 | -------------------------------------------------------------------------------- /scripts/presto/query/query_02.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | event_root_code, 3 | COUNT(event_id) AS cnt 4 | FROM 5 | ${GDELT_TABLE_NAME} 6 | GROUP BY 7 | event_root_code 8 | ORDER BY 2 DESC; 9 | -------------------------------------------------------------------------------- /scripts/presto/query/query_03.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | event_root_code, 3 | date_trunc('month', event_date) AS event_date, 4 | COUNT(event_id) AS cnt 5 | FROM 6 | ${GDELT_TABLE_NAME} 7 | GROUP BY ( 8 | event_root_code, 9 | date_trunc('month', event_date)) 10 | ORDER BY 3 DESC; 11 | -------------------------------------------------------------------------------- /scripts/presto/query/query_04.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | event_date, 3 | event_root_code, 4 | lat, 5 | lon, 6 | ST_Distance(to_spherical_geography(ST_Point(2.349014, 48.864716)), 7 | to_spherical_geography(ST_Point(lon, lat))) AS distance 8 | FROM 9 | ${GDELT_TABLE_NAME} 10 | WHERE (-85 < lat) AND (lat < 85) 11 | ORDER BY 5 ASC 12 | LIMIT 100; 13 | -------------------------------------------------------------------------------- /scripts/presto/query/query_05.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | event_root_code, 3 | COUNT(event_id) AS cnt 4 | FROM 5 | ${GDELT_TABLE_NAME} 6 | WHERE (-85 < lat) AND (lat < 85) 7 | AND ST_Distance(to_spherical_geography(ST_Point(2.349014, 48.864716)), 8 | to_spherical_geography(ST_Point(lon, lat))) < 10000 9 | GROUP BY 10 | event_root_code 11 | ORDER BY 2 DESC 12 | LIMIT 1000; 13 | -------------------------------------------------------------------------------- /scripts/presto/query/query_06.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | event_date, 3 | event_root_code, 4 | lat, 5 | lon 6 | FROM 7 | ${GDELT_TABLE_NAME} 8 | WHERE (event_date BETWEEN DATE '2010-01-01' AND DATE '2021-01-01') 9 | AND (-85 < lat) AND (lat < 85) 10 | AND ST_Intersects( 11 | ST_Point(lon, lat), 12 | 
ST_GeometryFromText( 13 | 'POLYGON ((17.1608 46.3723, 14 | 17.1608 49.0205, 15 | 9.5307 49.0205, 16 | 9.5307 46.3723, 17 | 17.1608 46.3723))')); 18 | -------------------------------------------------------------------------------- /scripts/presto/query/query_07.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | points.event_date, 3 | points.event_root_code, 4 | points.lat, 5 | points.lon 6 | FROM 7 | ${GDELT_TABLE_NAME} AS points, 8 | ${NE_TABLE_NAME} AS countries 9 | WHERE 10 | countries.iso_a2 = 'AT' 11 | AND (-85 < points.lat) AND (points.lat < 85) 12 | AND ST_Contains(ST_GeomFromBinary(countries.geometry), 13 | ST_Point(points.lon, points.lat)) 14 | LIMIT 1000; 15 | -------------------------------------------------------------------------------- /scripts/presto/query/query_08.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE SELECT 2 | points.event_root_code, 3 | COUNT(points.event_id) AS cnt 4 | FROM 5 | ${GDELT_TABLE_NAME} AS points, 6 | ${NE_TABLE_NAME} AS countries 7 | WHERE 8 | countries.iso_a2 = 'AT' 9 | AND (-85 < points.lat) AND (points.lat < 85) 10 | AND ST_Contains(ST_GeomFromBinary(countries.geometry), 11 | ST_Point(points.lon, points.lat)) 12 | GROUP BY 13 | points.event_root_code; 14 | -------------------------------------------------------------------------------- /scripts/presto/query/query_09.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE WITH countries AS ( 2 | SELECT 3 | ne_id, 4 | ST_GeomFromBinary(geometry) AS geometry 5 | FROM 6 | ${NE_TABLE_NAME} 7 | ), points AS ( 8 | SELECT 9 | ST_Point(lon, lat) AS point 10 | FROM 11 | ${GDELT_TABLE_NAME} 12 | WHERE (-85 < lat) AND (lat < 85) 13 | ) 14 | 15 | SELECT 16 | countries.ne_id, 17 | COUNT(*) AS cnt 18 | FROM 19 | points, 20 | countries 21 | WHERE 22 | ST_Contains(countries.geometry, points.point) 23 | GROUP BY 24 | countries.ne_id; 25 | -------------------------------------------------------------------------------- /scripts/presto/query/query_10.sql: -------------------------------------------------------------------------------- 1 | EXPLAIN ANALYZE WITH countries AS ( 2 | SELECT 3 | ne_id, 4 | iso_a2, 5 | ST_GeomFromBinary(geometry) AS geometry 6 | FROM 7 | ${NE_TABLE_NAME} 8 | ), points AS ( 9 | SELECT 10 | event_root_code, 11 | ST_Point(lon, lat) AS point 12 | FROM 13 | ${GDELT_TABLE_NAME} 14 | WHERE (-85 < lat) AND (lat < 85) 15 | ) 16 | 17 | SELECT 18 | countries.ne_id, 19 | countries.iso_a2, 20 | points.event_root_code, 21 | COUNT(*) AS cnt 22 | FROM 23 | points, 24 | countries 25 | WHERE 26 | ST_Contains(countries.geometry, points.point) 27 | GROUP BY ( 28 | countries.ne_id, 29 | countries.iso_a2, 30 | points.event_root_code 31 | ); 32 | -------------------------------------------------------------------------------- /scripts/presto/run_query.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | LOG_FOLDERPATH=log 4 | 5 | mkdir -p $LOG_FOLDERPATH 6 | 7 | #LOG_FILEPATH=$LOG_FOLDERPATH/presto_queries_344.log 8 | #STATS_FILEPATH=$LOG_FOLDERPATH/presto_stats_344.csv 9 | #LOG_FILEPATH=$LOG_FOLDERPATH/presto_queries_0.242_s.log 10 | #STATS_FILEPATH=$LOG_FOLDERPATH/presto_stats_0.242_s.csv 11 | #LOG_FILEPATH=$LOG_FOLDERPATH/presto_queries_0.247_s.log 12 | #STATS_FILEPATH=$LOG_FOLDERPATH/presto_stats_0.247_s.csv 13 | LOG_FILEPATH=$LOG_FOLDERPATH/presto_queries_minio2_352_cpx51.log 14 | 
STATS_FILEPATH=$LOG_FOLDERPATH/presto_stats_minio2_352_cpx51.csv 15 | 16 | RUNTIME=/usr/local/trino/bin/trino 17 | CATALOG=hive 18 | SCHEMA=gdelt 19 | 20 | export GDELT_TABLE_NAME=$CATALOG.$SCHEMA.gdelt_parquet 21 | export NE_TABLE_NAME=$CATALOG.$SCHEMA.ne_110_countries_parquet 22 | 23 | # Run warmup queries 24 | time $RUNTIME \ 25 | --catalog $CATALOG \ 26 | --schema $SCHEMA \ 27 | --execute " 28 | SELECT * FROM gdelt_parquet LIMIT 100; 29 | SELECT * FROM gdelt_parquet_2020 LIMIT 100; 30 | SELECT * FROM ne_110_countries_parquet LIMIT 100;" \ 31 | > /dev/null 32 | 33 | rm -f $LOG_FILEPATH 34 | echo "filepath,duration" > $STATS_FILEPATH 35 | 36 | for filepath in query/*.sql; do 37 | echo "$filepath" | \ 38 | tee --append $LOG_FILEPATH 39 | 40 | query=$(envsubst < $filepath) 41 | 42 | start=$(date +%s.%N) 43 | $RUNTIME \ 44 | --execute "${query}" \ 45 | --client-tags $filepath | \ 46 | tee --append $LOG_FILEPATH 47 | end=$(date +%s.%N) 48 | 49 | duration=$(echo "$end $start" | awk '{print $1-$2}') 50 | echo "Duration $duration" | \ 51 | tee --append $LOG_FILEPATH 52 | 53 | echo "$filepath,$duration" >> $STATS_FILEPATH 54 | done 55 | -------------------------------------------------------------------------------- /scripts/python/gdelt_dask_filter_merge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pyarrow as pa 4 | import dask.dataframe as dd 5 | from dask.distributed import Client 6 | from urllib.parse import urlparse 7 | 8 | HDFS_HOME = "hdfs://node-master:54310/user/hadoop" 9 | 10 | 11 | if __name__ == '__main__': 12 | numeric_columns = { 13 | "event_code": np.int64, 14 | "event_base_code": np.int64, 15 | "event_root_code": np.int64, 16 | "lat": np.float64, 17 | "lon": np.float64, 18 | "geo_type": np.int64 19 | } 20 | 21 | columns_name_mapping = { 22 | 'GLOBALEVENTID': 'event_id', 23 | 'DATEADDED': 'date', 24 | 'SQLDATE': 'event_date', 25 | 'EventCode': 'event_code', 26 | 'EventBaseCode': 'event_base_code' , 27 | 'EventRootCode': 'event_root_code', 28 | 'ActionGeo_Lat': 'lat', 29 | 'ActionGeo_Long': 'lon', 30 | 'ActionGeo_Type': 'geo_type', 31 | 'ActionGeo_CountryCode': 'country_code', 32 | 'ActionGeo_ADM1Code': 'adm1_code', 33 | 'SOURCEURL': 'source_url' 34 | } 35 | 36 | schema = pa.schema([ 37 | pa.field('event_id', pa.string()), 38 | pa.field('date', pa.date32()), 39 | pa.field('event_date', pa.date32()), 40 | pa.field('event_code', pa.int64()), 41 | pa.field('event_base_code', pa.int64()), 42 | pa.field('event_root_code', pa.int64()), 43 | pa.field('lat', pa.float64()), 44 | pa.field('lon', pa.float64()), 45 | pa.field('geo_type', pa.int64()), 46 | pa.field('country_code', pa.string()), 47 | pa.field('adm1_code', pa.string()), 48 | pa.field('source_url', pa.string()), 49 | pa.field('netloc', pa.string()) 50 | ]) 51 | 52 | client = Client(memory_limit='6GB', processes=True) 53 | print("Dashboard Link", client.dashboard_link) 54 | 55 | #src_filepath = "data/raw/*.csv" 56 | src_filepath = "data/raw/2019*.csv" 57 | #dst_filepath = "processed_data/gdelt_500MB.snappy.parq" 58 | dst_filepath = "processed_data/gdelt_2019_500MB.snappy.parq" 59 | #dst_filepath = HDFS_HOME + "/gdelt_2019_500MB.snappy.parq" 60 | 61 | df_headers = pd.read_excel('data/CSV.header.fieldids.xlsx') 62 | columns = df_headers.columns.values 63 | 64 | df = dd.read_csv(src_filepath, names=columns, dtype='str', delimiter='\t') 65 | columns_subset = list(columns_name_mapping.keys()) 66 | df = df[columns_subset] 67 | df = 
df.rename(columns=columns_name_mapping) 68 | 69 | for col, dtype in numeric_columns.items(): 70 | df[col] = dd.to_numeric(df[col], errors='coerce') 71 | 72 | df = df.dropna(subset=[ 73 | 'event_code', 74 | 'event_base_code', 75 | 'event_root_code', 76 | 'lat', 77 | 'lon', 78 | 'geo_type' 79 | ]) 80 | 81 | for col, dtype in numeric_columns.items(): 82 | if not df[col].dtype == dtype: 83 | df[col] = df[col].astype(dtype) 84 | 85 | df["date"] = dd.to_datetime( 86 | df["date"], errors='coerce', format="%Y%m%d") 87 | df["event_date"] = dd.to_datetime( 88 | df["event_date"], errors='coerce', format="%Y%m%d") 89 | 90 | df['netloc'] = df['source_url'].apply( 91 | lambda url: urlparse(url).netloc if not pd.isna(url) else None, 92 | meta=('source_url', 'str')) 93 | 94 | # Filter wrong dates after 2013 95 | mask = ((df["date"].dt.year - df["event_date"].dt.year).abs() < 5) | \ 96 | (df["date"].dt.year < 2014) 97 | df = df[mask] 98 | 99 | #df = df.set_index('event_id', sorted=True) 100 | 101 | # Repartition dataset 102 | print(f"Number of partitions: {df.npartitions}") 103 | #df = df.repartition(npartitions=1) 104 | df = df.repartition(partition_size="500MB") 105 | 106 | print(f"Save to {dst_filepath}") 107 | df.to_parquet(dst_filepath, 108 | engine='pyarrow', 109 | schema=schema, 110 | compression='snappy') 111 | 112 | 113 | 114 | # Full 500MB 115 | # real 218m44.256s 116 | # user 651m22.014s 117 | # sys 47m7.481s 118 | 119 | # HDFS 2019 500MB (568 partitions) 120 | # real 20m21.166s 121 | # user 60m12.809s 122 | # sys 5m9.280s 123 | 124 | # 2019 500MB (568 partitions) 125 | # real 19m52.539s 126 | # user 60m17.722s 127 | # sys 4m53.468s 128 | 129 | # OLD 130 | 131 | # unsorted filtered dates 132 | # 4925 partitions 133 | # 134 | # real 142m24.507s 135 | # user 345m47.006s 136 | # sys 23m33.074s 137 | 138 | # unsorted filtered dates int columns 139 | # real 130m3.256s 140 | # user 353m3.205s 141 | # sys 25m35.665s 142 | 143 | # unsorted filtered dates 100MB partitions 144 | # 3744 partitions 145 | # 146 | # real 246m34.456s 147 | # user 695m41.250s 148 | # sys 44m38.978s 149 | 150 | # unsorted filtered dates 500MB partitions 151 | # 569 partitions 152 | # 153 | # real 302m38.417s 154 | # user 692m3.138s 155 | # sys 46m23.634s 156 | -------------------------------------------------------------------------------- /scripts/python/gdelt_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from io import BytesIO 6 | 7 | DST_FOLDERPATH = "data/raw" 8 | GDELT_URL = "http://data.gdeltproject.org/events/" 9 | 10 | 11 | def get_urls(url): 12 | r = requests.get(url) 13 | items = [] 14 | for line in r.text.split('\n'): 15 | if line.startswith('
  • '): 16 | a = BeautifulSoup(line, 'html.parser').find('a') 17 | items.append({ 18 | 'name': a.text, 19 | 'url': url + a['href'] 20 | }) 21 | return items 22 | 23 | 24 | def download_and_extract(url, dst_folderpath): 25 | r = requests.get(url) 26 | with zipfile.ZipFile(BytesIO(r.content)) as zfile: 27 | zfile.extractall(dst_folderpath) 28 | 29 | 30 | if __name__ == '__main__': 31 | items = get_urls(GDELT_URL) 32 | 33 | # Download and unzip all files 34 | for item in items: 35 | if item['name'].endswith('zip') and not item['name'].startswith('GDELT'): 36 | print(item['name'], item['url']) 37 | download_and_extract(item['url'], DST_FOLDERPATH) 38 | 39 | # Rename all files to lower case 40 | for filename in os.listdir(DST_FOLDERPATH): 41 | filepath = os.path.join(DST_FOLDERPATH, filename) 42 | print("Rename", filepath) 43 | os.rename(filepath, filepath.lower()) 44 | -------------------------------------------------------------------------------- /scripts/python/gdelt_filter_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | from urllib.parse import urlparse 5 | 6 | # 2020 7 | # real 20m17.494s 8 | # user 11m56.839s 9 | # sys 0m39.823s 10 | 11 | # 2019 12 | # real 55m9.319s 13 | # user 26m4.964s 14 | # sys 1m21.729s 15 | 16 | # 20XX 17 | # real 451m42.280s 18 | # user 220m21.252s 19 | # sys 14m57.931s 20 | 21 | # Full 22 | # real 653m22.214s 23 | # user 368m54.816s 24 | # sys 13m25.819s 25 | 26 | # time hdfs dfs -put /home/hadoop/sgds/data/raw/*.csv /user/hadoop/gdelt_raw 27 | # real 222m33.449s 28 | # user 8m55.517s 29 | # sys 10m25.576s 30 | 31 | def filter_csv(src_filepath, columns): 32 | columns_name_mapping = { 33 | 'GLOBALEVENTID': 'event_id', 34 | 'DATEADDED': 'date', 35 | 'SQLDATE': 'event_date', 36 | 'EventCode': 'event_code', 37 | 'EventBaseCode': 'event_base_code' , 38 | 'EventRootCode': 'event_root_code', 39 | 'ActionGeo_Lat': 'lat', 40 | 'ActionGeo_Long': 'lon', 41 | 'ActionGeo_Type': 'geo_type', 42 | 'ActionGeo_CountryCode': 'country_code', 43 | 'ActionGeo_ADM1Code': 'adm1_code', 44 | 'SOURCEURL': 'source_url' 45 | } 46 | 47 | df = pd.read_csv(src_filepath, names=columns, delimiter='\t') 48 | df = df[columns_name_mapping.keys()] 49 | df = df.rename(columns=columns_name_mapping) 50 | 51 | numeric_columns = { 52 | 'event_id': 'int64', 53 | 'event_code': 'int64', 54 | 'event_base_code': 'int64', 55 | 'event_root_code': 'int64', 56 | 'lat': 'float64', 57 | 'lon': 'float64' 58 | } 59 | 60 | for col, dtype in numeric_columns.items(): 61 | df[col] = pd.to_numeric(df[col], errors='coerce') 62 | 63 | df = df.dropna(subset=[ 64 | 'event_code', 65 | 'event_base_code', 66 | 'event_root_code', 67 | 'geo_type' 68 | ]) 69 | 70 | for col, dtype in numeric_columns.items(): 71 | if not df[col].dtype == dtype: 72 | df[col] = df[col].astype(dtype) 73 | 74 | df["date"] = pd.to_datetime( 75 | df["date"], errors='coerce', format="%Y%m%d") 76 | df["event_date"] = pd.to_datetime( 77 | df["event_date"], errors='coerce', format="%Y%m%d") 78 | 79 | df['netloc'] = df['source_url'].apply( 80 | lambda url: urlparse(url).netloc if not pd.isna(url) else None) 81 | 82 | return df 83 | 84 | 85 | if __name__ == '__main__': 86 | df_headers = pd.read_excel('data/CSV.header.fieldids.xlsx') 87 | columns = df_headers.columns.values 88 | 89 | src_folderpath = 'data/raw' 90 | dst_folderpath = 'data/raw_filtered' 91 | 92 | log_items = [] 93 | filenames = [f for f in os.listdir(src_folderpath) if f.endswith('csv')] 94 | 
sum_duration = 0 95 | num_files = len(filenames) 96 | 97 | for i, filename in enumerate(sorted(filenames)): 98 | src_filepath = os.path.join(src_folderpath, filename) 99 | dst_filepath = os.path.join(dst_folderpath, filename) 100 | year = int(filename[:4]) 101 | 102 | print(f"{src_filepath}") 103 | num_rows = None 104 | duration = None 105 | num_wrong_dates = None 106 | try: 107 | s = time.time() 108 | df = filter_csv(src_filepath, columns) 109 | num_wrong_dates = (df['event_date'].dt.year != year).sum() 110 | 111 | # filter wrong dates 112 | mask = (df['event_date'].dt.year - year).abs() < 5 113 | df = df[mask] 114 | 115 | df.to_csv(dst_filepath, index=False) 116 | num_rows = df.shape[0] 117 | 118 | duration = time.time() - s 119 | sum_duration += duration 120 | avg_duration = sum_duration / (i + 1) 121 | est_duration = avg_duration * num_files 122 | 123 | print(f"{i} / {num_files}, " \ 124 | f"avg. duration: {round(avg_duration, 2)}, " \ 125 | f"est. duration: {round(est_duration, 2)}") 126 | print(f"Duration {round(duration, 2)} s for {num_rows} rows, " \ 127 | f"num_wrong_dates: {num_wrong_dates}") 128 | except Exception as e: 129 | print("EXCEPTION", e) 130 | 131 | log_items.append({ 132 | 'src': src_filepath, 133 | 'dst': dst_filepath, 134 | 'duration': duration, 135 | 'num_rows': num_rows, 136 | 'num_wrong_dates': num_wrong_dates 137 | }) 138 | 139 | df = pd.DataFrame(log_items) 140 | df.to_csv("data/filtered_stats_full.csv", index=False) 141 | -------------------------------------------------------------------------------- /scripts/python/gdelt_load_to_postgres.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | import sqlalchemy 5 | from geoalchemy2 import Geometry, WKTElement 6 | from shapely.geometry import Point 7 | from sqlalchemy.engine.url import URL 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | 13 | def convert_geometry(lon, lat): 14 | if pd.isna(lon) or pd.isna(lat): 15 | return None 16 | else: 17 | return WKTElement(Point(lon, lat).wkt, srid=4326) 18 | 19 | 20 | def load_to_postgis(filepath, connection_uri, table_name, chunksize=10**5): 21 | engine = sqlalchemy.create_engine(connection_uri) 22 | 23 | with engine.connect() as connection: 24 | connection.execute(f'DROP TABLE IF EXISTS {table_name}') 25 | 26 | for i, df_chunk in enumerate(pd.read_csv(filepath, chunksize=chunksize)): 27 | start_time = time.time() 28 | df_chunk['geometry'] = df_chunk.apply( 29 | lambda row: convert_geometry(row['ActionGeo_Long'], 30 | row['ActionGeo_Lat']), axis=1) 31 | df_chunk.to_sql(table_name, 32 | engine, 33 | if_exists='append', 34 | index=False, 35 | dtype={'geometry': Geometry('POINT', srid=4326)}) 36 | 37 | print('Chunk', i, 'duration: {:.4f}'.format(time.time() - start_time)) 38 | 39 | 40 | if __name__ == '__main__': 41 | filepath = "processed_data/2020.csv" 42 | table_name = "gdelt_2020" 43 | credentials = { 44 | 'drivername': 'postgres', 45 | 'host': os.environ['POSTGRES_HOST'], 46 | 'port': os.environ['POSTGRES_PORT'], 47 | 'username': os.environ['POSTGRES_USER'], 48 | 'password': os.environ['POSTGRES_PASS'], 49 | 'database': os.environ['POSTGRES_DB'] 50 | } 51 | 52 | connection_uri = str(URL(**credentials)) 53 | load_to_postgis(filepath, connection_uri, table_name, chunksize=10**5) 54 | -------------------------------------------------------------------------------- /scripts/python/gdelt_merge_csv.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import dask.dataframe as dd 3 | from dask.distributed import Client 4 | 5 | 6 | if __name__ == '__main__': 7 | client = Client(memory_limit='14GB', processes=True) 8 | print("Dashboard Link", client.dashboard_link) 9 | 10 | #src_filepath = "data/raw_filtered/2019*.export.csv" 11 | src_filepath = "data/raw_filtered/2020*.export.csv" 12 | #src_filepath = "data/raw_filtered/*.export.csv" 13 | #dst_filepath = "processed_data/2019_filtered.csv" 14 | dst_filepath = "processed_data/2020_filtered.csv" 15 | 16 | df = dd.read_csv(src_filepath) 17 | 18 | # https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_csv 19 | df.to_csv(dst_filepath, 20 | index=False, 21 | single_file=True) 22 | 23 | # 2020 csv 24 | # real 6m4.852s 25 | # user 5m28.882s 26 | # sys 0m39.676s 27 | 28 | # 2019 csv 29 | #real 15m54.717s 30 | #user 13m17.211s 31 | #sys 1m22.761s 32 | -------------------------------------------------------------------------------- /scripts/python/gdelt_merge_parquet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import dask.dataframe as dd 4 | from dask.distributed import Client 5 | 6 | 7 | DTYPES = { 8 | "event_id": str, 9 | "date": str, 10 | "event_date": str, 11 | "event_code": np.int64, 12 | "event_base_code": np.int64, 13 | "event_root_code": np.int64, 14 | "lat": np.float64, 15 | "lon": np.float64, 16 | "geo_type": np.int64, 17 | "country_code": str, 18 | "adm1_code": str, 19 | "source_url": str, 20 | "netloc": str 21 | } 22 | 23 | 24 | if __name__ == '__main__': 25 | client = Client(memory_limit='14GB', processes=True) 26 | print("Dashboard Link", client.dashboard_link) 27 | 28 | #src_filepath = "data/raw_filtered/2019*.export.csv" 29 | #src_filepath = "data/raw_filtered/2020*.export.csv" 30 | src_filepath = "data/raw_filtered/*.export.csv" 31 | #dst_filepath = "processed_data/2019_filtered.snappy.parq" 32 | #dst_filepath = "processed_data/2020_filtered.snappy.parq" 33 | dst_filepath = "processed_data/gdelt_filtered.snappy.parq" 34 | 35 | df = dd.read_csv(src_filepath, 36 | dtype=DTYPES, 37 | parse_dates=['date', 'event_date']) 38 | 39 | df = df.set_index('date', sorted=True) 40 | df.to_parquet(dst_filepath, 41 | engine='pyarrow', 42 | compression='snappy') 43 | 44 | # 2020 parquet 45 | # real 1m49.834s 46 | # user 1m47.804s 47 | # sys 0m23.030s 48 | 49 | # 2019 parquet 50 | # real 4m47.002s 51 | # user 3m59.711s 52 | # sys 0m47.536s 53 | 54 | # Full parquet 55 | # real 150m2.354s 56 | # user 119m37.066s 57 | # sys 13m59.735s 58 | 59 | # time hdfs dfs -put gdelt.snappy.parq/ gdelt_parquet 60 | # real 137m3.985s 61 | # user 2m41.709s 62 | # sys 3m25.887s 63 | 64 | # time hdfs dfs -put processed_data/gdelt_filtered.snappy.parq/ gdelt_parquet 65 | # real 58m15.061s 66 | # user 1m23.266s 67 | # sys 1m54.480s 68 | -------------------------------------------------------------------------------- /scripts/spark/gdelt_event_count.scala: -------------------------------------------------------------------------------- 1 | // spark-shell --master yarn -i gdelt_event_count.scala 2 | // real 1m11.478s 3 | // user 1m16.376s 4 | // sys 0m3.083s 5 | 6 | println("GDELT Event Code Count") 7 | 8 | val parquetDF = spark.read.parquet("hdfs:///user/hadoop/gdelt_parquet/2020.snappy.parq") 9 | 10 | parquetDF.createOrReplaceTempView("parquetFile") 11 | val countsDF = spark.sql(""" 12 | SELECT COUNT(*) AS 
event_count, event_root_code 13 | FROM parquetFile 14 | GROUP BY event_root_code 15 | ORDER BY event_count DESC 16 | """) 17 | 18 | countsDF.show(5) 19 | 20 | System.exit(0) 21 | -------------------------------------------------------------------------------- /scripts/spark/gdelt_event_count_hive.scala: -------------------------------------------------------------------------------- 1 | // spark-shell --master yarn -i gdelt_event_count_hive.scala 2 | // real 3m11.222s 3 | // user 1m19.898s 4 | // sys 0m4.304s 5 | 6 | // System.setProperty("hive.metastore.uris", "thrift://node-master:9083"); 7 | 8 | println("GDELT Event Code Count") 9 | 10 | import org.apache.spark.sql.hive.HiveContext 11 | 12 | val hiveContext = new HiveContext(sc) 13 | hiveContext.setConf("hive.metastore.uris", "thrift://node-master:9083") 14 | 15 | val countsDF = hiveContext.sql(""" 16 | SELECT COUNT(*) AS event_count, event_root_code 17 | FROM gdelt_csv 18 | GROUP BY event_root_code 19 | ORDER BY event_count DESC 20 | """) 21 | 22 | countsDF.show(5) 23 | 24 | System.exit(0) 25 | -------------------------------------------------------------------------------- /spark-geomesa-fs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Spark with GeoMesa" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import pyspark\n", 27 | "from pyspark import SparkConf\n", 28 | "from pyspark import SparkContext\n", 29 | "from pyspark.sql import SparkSession\n", 30 | "\n", 31 | "import geomesa_pyspark" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "geomesa_fs_jar_filepath = os.path.join(\n", 41 | " os.environ['GEOMESA_FS_HOME'], \n", 42 | " \"dist/spark/geomesa-fs-spark-runtime_2.11-3.0.0.jar\")\n", 43 | "geomesa_hbase_jar_filepath = os.path.join(\n", 44 | " os.environ['GEOMESA_HBASE_HOME'],\n", 45 | " \"dist/spark/geomesa-hbase-spark-runtime-hbase2_2.11-3.0.0.jar\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "conf = geomesa_pyspark.configure(\n", 55 | " jars=[geomesa_fs_jar_filepath],\n", 56 | " packages=['geomesa_pyspark','pytz'],\n", 57 | " spark_home=os.environ['SPARK_HOME']).\\\n", 58 | " setAppName('MyTestApp')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "CPU times: user 316 ms, sys: 133 ms, total: 449 ms\n", 71 | "Wall time: 6min 22s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "%%time\n", 77 | "spark = SparkSession \\\n", 78 | " .builder \\\n", 79 | " .config(conf=conf) \\\n", 80 | " .enableHiveSupport() \\\n", 81 | " .getOrCreate()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Prepare DataFrames" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "CPU times: user 51 ms, sys: 21.2 ms, total: 72.1 ms\n", 
101 | "Wall time: 4min 57s\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "86770" 108 | ] 109 | }, 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "%%time\n", 117 | "hdfs_filepath = \"hdfs://node-master:54310/tmp/geomesa/gdelt_custom_20200101\"\n", 118 | "\n", 119 | "df_fs_sample = spark.read.format(\"geomesa\") \\\n", 120 | " .option(\"fs.path\", hdfs_filepath) \\\n", 121 | " .option(\"geomesa.feature\", \"gdelt_custom\") \\\n", 122 | " .load()\n", 123 | "df_fs_sample.count()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "CPU times: user 3.31 ms, sys: 885 µs, total: 4.2 ms\n", 136 | "Wall time: 3.44 s\n" 137 | ] 138 | }, 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "176" 143 | ] 144 | }, 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "%%time\n", 152 | "hdfs_filepath = \"hdfs://node-master:54310/tmp/geomesa/ne/countries\"\n", 153 | "\n", 154 | "df_ne_fs = spark.read.format(\"geomesa\") \\\n", 155 | " .option(\"fs.path\", hdfs_filepath) \\\n", 156 | " .option(\"geomesa.feature\", \"ne_countries\") \\\n", 157 | " .load()\n", 158 | "df_ne_fs.count()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "+--------+--------------------+-----------+\n", 171 | "|database| tableName|isTemporary|\n", 172 | "+--------+--------------------+-----------+\n", 173 | "| default| gdelt_csv_2019| false|\n", 174 | "| default| gdelt_csv_2020| false|\n", 175 | "| default| gdelt_parquet_2020| false|\n", 176 | "| default|gdelt_parquet_ins...| false|\n", 177 | "| default|gdelt_parquet_ins...| false|\n", 178 | "| default|ne_10_states_prov...| false|\n", 179 | "| default|ne_110_countries_...| false|\n", 180 | "| |gdelt_custom_2020...| true|\n", 181 | "| | ne_countries| true|\n", 182 | "+--------+--------------------+-----------+\n", 183 | "\n", 184 | "CPU times: user 6.77 ms, sys: 1.2 ms, total: 7.98 ms\n", 185 | "Wall time: 10.5 s\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "%%time\n", 191 | "df_fs_sample.createOrReplaceTempView(\"gdelt_custom_20200101\")\n", 192 | "df_ne_fs.createOrReplaceTempView(\"ne_countries\")\n", 193 | "spark.sql(\"SHOW TABLES\").show()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "# Geospatial Functions" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "EXTENT = [9.5307, 46.3723, 17.1608, 49.0205]" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "CPU times: user 27.1 ms, sys: 12.9 ms, total: 40 ms\n", 222 | "Wall time: 8.33 s\n" 223 | ] 224 | }, 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
    \n", 229 | "\n", 242 | "\n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | "
    event_root_codecnt
    0123
    1139
    2131
    3161
    465
    \n", 278 | "
    " 279 | ], 280 | "text/plain": [ 281 | " event_root_code cnt\n", 282 | "0 12 3\n", 283 | "1 1 39\n", 284 | "2 13 1\n", 285 | "3 16 1\n", 286 | "4 6 5" 287 | ] 288 | }, 289 | "execution_count": 11, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "%%time\n", 296 | "df = spark.sql(\"\"\"\n", 297 | " SELECT\n", 298 | " event_root_code,\n", 299 | " COUNT(event_id) AS cnt\n", 300 | " FROM\n", 301 | " gdelt_custom_20200101\n", 302 | " WHERE \n", 303 | " ST_Within(geom, st_makeBBOX({}, {}, {}, {}))\n", 304 | " GROUP BY event_root_code\n", 305 | "\"\"\".format(*EXTENT)).toPandas()\n", 306 | "\n", 307 | "df.head()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "# Spatial Join" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 17, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "CPU times: user 117 ms, sys: 15.7 ms, total: 133 ms\n", 327 | "Wall time: 7min 28s\n" 328 | ] 329 | }, 330 | { 331 | "data": { 332 | "text/html": [ 333 | "
    \n", 334 | "\n", 347 | "\n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | "
    iso_a2cnt
    0DZ56
    1MM65
    2LT30
    3CI47
    4AZ48
    \n", 383 | "
    " 384 | ], 385 | "text/plain": [ 386 | " iso_a2 cnt\n", 387 | "0 DZ 56\n", 388 | "1 MM 65\n", 389 | "2 LT 30\n", 390 | "3 CI 47\n", 391 | "4 AZ 48" 392 | ] 393 | }, 394 | "execution_count": 17, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "%%time\n", 401 | "df = spark.sql(\"\"\"\n", 402 | " SELECT\n", 403 | " c.iso_a2,\n", 404 | " COUNT(g.event_id) AS cnt\n", 405 | " FROM\n", 406 | " gdelt_custom_20200101 AS g,\n", 407 | " ne_countries AS c\n", 408 | " WHERE ST_Within(g.geom, c.polygons)\n", 409 | " GROUP BY c.iso_a2\n", 410 | "\"\"\").toPandas()\n", 411 | "\n", 412 | "df.head()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 31, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtIAAAI/CAYAAABasD72AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzdfZRdV3nn+e/jomwXuYkAl+wQgVzIIhCQjIIU3g33xoHhxU3jNHHbiOBMgIIJGRjA6SjAKE66A0wGYcnGQJfSxECI6W4SYtpKaALObcwkE2IFgWxAgHDx3jHGgeHiCsjFM3/UVVIUVVLdXVXn3CO+n7VqVd2zz8uj9fiPn/fa9+zITCRJkiQN5rS6C5AkSZKayCAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFbhX3QWUuM997pObN2+uuwytwHe+8x1+7Md+rO4yVMj+NZv9azb713z2sFkOHjx4Z2auX2yskUH6nHPO4ZZbbqm7DK1At9ul3W7XXYYK2b9ms3/NZv+azx42S0R8Yakxl3ZIkiRJBQzSkiRJUgGDtCRJklQgMrOaB0VMADdm5pZ5x64EesBHgH3AGf2f/5yZVy51r42bNudpl+xbw2q11l659R72HG7kEn1h/5rO/jWb/Ws+eziY6dc/o9bnR8TBzNyx2NiwdPHtwCWZ+fGIGAEeUndBkiRJ0okMS5A+G/gaQGbOAp+stxxJkiTpxIZljfRVwJGIeG9EvCgizqy7IEmSJOlEqlwjfS5wYJE10t/OzD0RcR7wFOBSIDOzveD6SWASYHx8/fbde/dXUrfWxjlj8A8zdVehUvav2exfs9m/5rOHg9m6YV2tz+90OkOxRvobwH0XHLsfcDtAZh4F3hIR+4GvR8RZmfmN4ydm5hQwBXNfNnSRfrP5RYtms3/NZv+azf41nz0czPTOdt0lLKmypR2Z2QO+FhEXAkTE/YCnAh+JiGdERPRPfTAwC3yzqtokSZKkQVX9v0PPA66NiD39z7+TmUcj4veAqyLibuAeYGf/S4eSJEnSUKo0SGfmJ4HOIscvrbIOSZIkaaUauUBnbHSEIzW/nFsr0+12h3rNk07M/jWb/Ws2+9d89vDUMSyvv5MkSZIaxSAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQUM0pIkSVKBWjZkiYheZrYiYgK4HXhpZl7TH3sTcEtmXrfU9TPHZpnYdaCKUlVg2s1yJEnSj4BhmJG+A3hZRJxedyGSJEnScg1DkP468CHg8roLkSRJkpZrGII0wOuBV0bESN2FSJIkSctRyxrphTLz9oj4KPCcpc6JiElgEmB8fD27t95TVXkaULfbPek5vV5vWedpONm/ZrN/zWb/ms8enjqGIkj3vRZ4D/DhxQYzcwqYAti4aXPuOTxMpWu+6Z3tk57T7XZpt09+noaT/Ws2+9ds9q/57OGpY1iWdpCZnwY+CVxUdy2SJEnSyQxNkO77PeABdRchSZIknUwt6yMys9X/PQ1smXf84wxfuJckSZJ+SCMXGo+NjnDETT8kSZJUI2d/JUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSCjRyQ5aZY7NM7DpQdxmNNu2GNpIkSStSy4x0RPTm/f3wiLgpIj4TEUcj4nciwplySZIkDbVaA2tEjAHvA16fmT8NbAUeBbyszrokSZKkk6l75vc5wP+TmR8AyMy7gV8HfqPWqiRJkqSTqDtIPxw4OP9AZh4FxiLiPvWUJEmSJJ1cZGb1D43oZWYrIq4Cbs/MqxeMfxOYyMxvzjs2CUwCjI+v37577/5Kaz7VbN2wrtbn93o9Wq1WrTWonP1rNvvXbPav+exhs3Q6nYOZuWOxsbrf2nEb8MT5ByJiE3Dn/BANkJlTwBTAxk2bc8/huktvtumd7Vqf3+12abfrrUHl7F+z2b9ms3/NZw9PHXUv7XgX8ISI+AX45y8fXg38dq1VSZIkSSdRa5DOzBngmcCrI+IzwJ3MffnwXXXWJUmSJJ1MLesjMrM17+9bgQ5ARDwLeGNE/HFmfmGp68dGRzjihiKSJEmqUd1LO35AZv5ZZm46UYiWJEmShsFQBWlJkiSpKQzSkiRJUgGDtCRJklTAIC1JkiQVMEhLkiRJBQzSkiRJUgGDtCRJklSglg1ZVmrm2CwTuw7UXcbQmXaTGkmSpMpUNiMdET8ZEe+OiKMR8cmI+POI+OmIyIj43+ed96aI+JWq6pIkSZJKVBKkIyKA9wLdzDwvMx8GvAo4B7gDeFlEnF5FLZIkSdJqqGpGugMcy8y3Hj+QmYeALwFfBz4EXF5RLZIkSdKKVRWktwAHTzD+euCVETFSUT2SJEnSigzFlw0z8/aI+CjwnKXOiYhJYBJgfHw9u7feU1V5jdHtdusuYdl6vV6j6tUPsn/NZv+azf41nz08dVQVpG8Dnn2Sc14LvAf48GKDmTkFTAFs3LQ59xweiv8HGCrTO9t1l7Bs3W6XdrtddxkqZP+azf41m/1rPnt46qhqacdNwBkR8cLjByLi54Bzj3/OzE8DnwQuqqgmSZIkqVglQTozE7gYeHL/9Xe3AVcCX11w6u8BD6iiJkmSJGklKlsfkZlfBS5ZZGjLvHM+zjLC/djoCEfcfESSJEk1cotwSZIkqYBBWpIkSSpgkJYkSZ
IKGKQlSZKkAgZpSZIkqYBBWpIkSSpgkJYkSZIKGKQlSZKkApVtyLKaZo7NMrHrQN1lDIVpN6aRJEmqRW0z0hHR6/+eiIiZiDgUEZ+MiHdExGhddUmSJEnLMSxLO45m5jZgK/AAFt9KXJIkSRoawxKkAcjMWeCjwIa6a5EkSZJOZKiCdEScCTwaeH/dtUiSJEknEplZz4MjepnZiogJ4FPAEeDBwHsy8/JFzp8EJgHGx9dv3713f4XVDq+tG9bVXUKRXq9Hq9WquwwVsn/NZv+azf41nz1slk6nczAzdyw2Nixv7Tiamdsi4v5ANyKemZnvm39CZk4BUwAbN23OPYeHpfR6Te9s111CkW63S7vdrrsMFbJ/zWb/ms3+NZ89PHUM1dKOzPwasAv4rbprkSRJkk5kqIJ0358B946IC+ouRJIkSVpKbesjMrPV/z0NbJl3PIFHnOjasdERjrgRiSRJkmo0jDPSkiRJ0tAzSEuSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVKC2DVlWYubYLBO7DtRdRiWm3XhGkiRpKFUSpCPiHOAq4DHAPwLfA36///cNwO3MzY7fATwnM++ooi5JkiSp1Jov7YiIAP4M+HBmbsrM7cClwAP6p9ycmdsy83zg74CXrHVNkiRJ0kpVsUb654HvZeZbjx/IzC9k5jXzT+oH7h9nbpZakiRJGmpVLO14OPD3Jxi/ICIOAWcB3wFeVUFNkiRJ0opEZq7tAyJeCjwoM1/e/3wt8ATm1kn/BnBFZl7UH/vN/rkvXuQ+k8AkwPj4+u279+5f07qHxdYN6+ouYU30ej1arVbdZaiQ/Ws2+9ds9q/57GGzdDqdg5m5Y7GxKmakbwP+zfEPmfmSiBgHblnk3PcBf7LYTTJzCpgC2Lhpc+453MgXjgxseme77hLWRLfbpd1u112GCtm/ZrN/zWb/ms8enjqqWCN9E3BmRPxv847de4lznwAcXfuSJEmSpJVZ82ndzMyIeBZwVUT8O+DrzK2F/s3+KcfXSAfwLeAFa12TJEmStFKVrI/IzK8x98q7xQy8CHhsdIQjblQiSZKkGrlFuCRJklTAIC1JkiQVMEhLkiRJBQzSkiRJUgGDtCRJklTAIC1JkiQVMEhLkiRJBQzSkiRJUoFKNmRZbTPHZpnYdaDuMiox7cYzkiRJQ6myGemIuDgiMiIe2v/cjogbF5xzXUQ8u6qaJEmSpFJVLu24DPgIS28VLkmSJDVGJUE6IlrA44HnY5CWJEnSKaCqGelnAe/PzM8Ad0XEIyt6riRJkrQmqvqy4WXA3v7f7+5/vnGJc3OxgxExCUwCjI+vZ/fWe1a7xqHU7XbrLmFN9Hq9U/bf9qPA/jWb/Ws2+9d89vDUseZBOiLOAn4e2BIRCYwwF5bfAdx3wen3A+5c7D6ZOQVMAWzctDn3HG7kC0cGNr2zXXcJa6Lb7dJut+suQ4XsX7PZv2azf81nD08dVSzteDbwjsw8NzMnMvOBwO3MheafioifAYiIc4FHAIcqqEmSJElakSqmdS8DXr/g2J8w96XD5wJ/GBFnAseAF2TmtyqoSZIkSVqRNQ/Smdle5NjV8z4+Zq1rkCRJklZbIxcaj42OcMQd/yRJklSjKjdkkSRJkk4ZBmlJkiSpgEFakiRJKmCQliRJkgoYpCVJkqQCBmlJkiSpgEFakiRJKmCQliRJkgqs+YYsEZHAGzPzlf3PVwCtzLyy//m5wL8DRoB7gL8DrsjMby51z5ljs0zsOrDWpddq2g1nJEmShloVM9LfBX4xIsYXDkTEU4GXA0/LzIcDjwT+GjingrokSZKkYlUE6XuAKeYC80KvZm72+SsAmTmbmW/LzCMV1CVJkiQVq2qN9LXAzohYt+D4w4G/r6gGSZIkadVEZq7tAyJ6mdmKiN8FjgEz9NdIR8RdwIMy81sRsRV4J/DjwKsy8z8vuM8kMAkwPr5+++69+9e07rpt3bDw/zlOLb1ej1arVXcZKmT/ms3+NZv9az572CydTudgZu5YbGzNv2w4z17mZp//cN6x25hbF/1XmXkY2BYRbwLGFl6cmVPMLRFh46bNuedwlaVXb3pnu+4S1lS326XdbtddhgrZv2azf81m/5rPHp46Knv9XWbeBfwX4PnzDr8OeENEPGDesR8K0ZIkSdKwqXpadw/w68c/ZOafR8R64C8iYgT4JnAr8N8rrkuSJEkayJoH6cxszfv7H4B7Lxh/O/D2ta5DkiRJWk2NXGg8NjrCETcskSRJUo3cIlySJEkqYJCWJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSChikJUmSpAIGaUmSJKlApRuyRMQscLj/3E8Bl2fm3RHRO74DYkQ8HdgHXJiZX1zsPjPHZpnYdaCqsgc27WYxkiRJp7yqZ6RnMnNbZm4Bvge8eP5gRFwIXAM8dakQLUmSJA2DOpd23AxsPv4hIi4A9gPPyMyjtVUlSZIkLUMtQToi7gU8jbllHgBnADcAz8rMT9dRkyRJkjSIyMzqHvYva6Rhbkb6lZn5vYi4G7gJOJqZL1vi2klgEmB8fP323Xv3V1Fyka0b1tVdwtDr9Xq0Wq26y1Ah+9ds9q/Z7F/z2cNm6XQ6BzNzx2JjVQfpf/5S4cLjwNnAB4EbM/O1J7rPxk2b87RL9q1RlSvnlw1Prtvt0m636y5Dhexfs9m/ZrN/zWcPmyUilgzSlb6140T6b++4CLg5Iv4hM/9T3TVJkiRJSxmaIA2QmXdFxFOBD0fEnZl5Q901SZIkSYupNEgvtqxj4fHM/BLwoMqKkiRJkgoM1Yz0co2NjnDEdciSJEmqkVuES5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFahsQ5aI+ElgL/BzwHeBaeD/yMzPRMTLgdcB52Tmt052r5ljs0zsOrCW5Q5s2g1iJEmSfqRUMiMdEQG8F+hm5nmZ+TDgVcA5/VMuA/4OuLiKeiRJkqSVqmppRwc4lplvPX4gMw9l5s0RcR7QAl7DXKCWJEmShl5VQXoLcHCJscuA64GbgYdExNkV1SRJkiQVi8xc+4dEvBR4UGa+fJGxW4GLM/OzEfFG4GhmXrvIeZPAJMD4+Prtu/fuX+uyB7J1w7q6S2iUXq9Hq9WquwwVsn/NZv+azf41nz1slk6nczAzdyw2VtWXDW8Dnr3wYEScDzwY+Mu5ZdScDnwe+KEgnZlTwBTAxk2bc8/hyr4nuSzTO9t1l9Ao3W6XdrtddxkqZP+azf41m/1rPnt46qhqacdNwBkR8cLjByLi54B9wJWZOdH/+SlgQ0ScW1FdkiRJUpFKgnTOrR+5GHhyRByNiNuAK4E2c2/zmO+9wKVV1CVJkiSVqmx9RGZ+FbhkGee9ooJyJEmSpBUZroXGyzQ2OsIRN0CRJElSjdwiXJIkSSpgkJYkSZIKGKQlSZKkAgZpSZIkqYBBWpIkSSpgkJYkSZIKGKQlSZKkAgZpSZIkqUBlG7JEx
H2BtwHnAf8E/Gpm3tofexnwQiCA/Zm590T3mjk2y8SuA2tc8Q+adgMYSZIkzVPljPSrgEOZeT7wPGAfQERsYS5EPwp4BHBRRDy4wrokSZKkgVUZpB8GfAggMz8NTETEOcDPAP9vZt6dmfcA/wO4uMK6JEmSpIFVGaQ/DvwiQEQ8CjgXeABwK/DEiDgrIu4NPB14YIV1SZIkSQOLzKzmQRE/wdxyjp8FDgMPBV6QmR+PiOcDLwF6wCeBmcx8+YLrJ4FJgPHx9dt3791fSd3Hbd2wrtLnnep6vR6tVqvuMlTI/jWb/Ws2+9d89rBZOp3OwczcsdjYmgbpiHgJc+ufAZ6emV/tHw/gduD8zPz/FlzzWuDLmfnmpe67cdPmPO2SfWtU9eL8suHq6na7tNvtustQIfvXbPav2exf89nDZomIJYP0mi7tyMxrM3NbZm4D7o6I0/tDLwA+fDxER8TZ/d8bmVv+cf1a1iVJkiStVGWvv2PuS4XviIhZ5pZvPH/e2J9ExFnAMeAlmfmPFdYlSZIkDayyIJ2ZfwMs+lq7zLygqjokSZKk1VDljPSqGRsd4YhrliVJklQjtwiXJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSChikJUmSpAIGaUmSJKmAQVqSJEkq0MgNWWaOzTKx60Blz5t28xdJkiQtUOuMdET0ImIiImYi4lBEfDIi3hoRzpRLkiRpqA1LYD2amduA84GHAc+quR5JkiTphIYlSAOQmfcAfw1srrsWSZIk6USGKkhHxL2BC4HDddciSZIknUhkZn0Pj+gBW4BPAUeABG7IzCsXOXcSmAQYH1+/fffe/ZXVuXXDusqe9aOi1+vRarXqLkOF7F+z2b9ms3/NZw+bpdPpHMzMHYuNDctbO46vkV5SZk4BUwAbN23OPYerK316Z7uyZ/2o6Ha7tNvtustQIfvXbPav2exf89nDU8dQLe2QJEmSmqK2IB0R9wK+W9fzJUmSpJWoc2nHw5lb0jHN3DrpZRsbHeGIm6RIkiSpRrXMSEfEi4HrgdfU8XxJkiRppWqZkc7MtwJvrePZkiRJ0mrwy4aSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVKDODVmKzRybZWLXgUqeNe3GL5IkSVpEJTPSETEREbcuOHZlRFzR//teEXFnRLyuinokSZKklRqWpR1PAY4Al0RE1F2MJEmSdDLDEqQvA/YBXwQeU3MtkiRJ0knVHqQjYgy4ELgRuJ65UC1JkiQNtcjMtX9IxLnAgczcMu/YlcC3mZuFflZm7oyIs4BDwERmzi64xyQwCTA+vn777r3717xugK0b1lXynB81vV6PVqtVdxkqZP+azf41m/1rPnvYLJ1O52Bm7lhsrKq3dnwDuO+CY/cDbmduBvrxETHdP34W0AE+OP/kzJwCpgA2btqcew5XU/r0znYlz/lR0+12abfbdZehQvav2exfs9m/5rOHp45KlnZkZg/4WkRcCBAR9wOeytzs8xOAjZk5kZkTwEtweYckSZKGXJVrpJ8HvCYiDgE3Ab8D/CxwU2Z+d955NwDPjIgzKqxNkiRJGkhlG7Jk5ieZW7Kx0HULzrsLWH+ie42NjnDEjVIkSZJUo9rf2iFJkiQ1kUFakiRJKmCQliRJkgoYpCVJkqQCBmlJkiSpgEFakiRJKmCQliRJkgoYpCVJkqQClW3Isppmjs0ysevAmt1/2s1eJEmSdBK1zkhHRK//eyIibq2zFkmSJGkQLu2QJEmSChikJUmSpAIGaUmSJKlAZGZ9D4/oZWYrIiaAGzNzywnOnQQmAcbH12/fvXf/mtW1dcO6Nbu35vR6PVqtVt1lqJD9azb712z2r/nsYbN0Op2DmbljsbHGvLUjM6eAKYCNmzbnnsNrV/r0zvaa3Vtzut0u7Xa77jJUyP41m/1rNvvXfPbw1OHSDkmSJKnAMAXph0TEl+f9/FLdBUmSJElLqXVpR2a2+r+ngdHlXjc2OsIRN02RJElSjYZpRlqSJElqDIO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFat2QpdTMsVkmdh1Y8X2m3dRFkiRJhU46Ix0RIxHxooj49xHx+AVjr1nOQyIiI+Kd8z7fKyK+HhE3RsTDI+IzETE2b/xARFw6yD9EkiRJqtJylnb8R+BJwDeAqyPijfPGfnGZz/kOsGVeWH4y8BWAzLwN+FPg1QAR8SxgNDPfvcx7S5IkSZVbTpB+VGY+JzP3Ao8GWhHxpxFxBhADPOsvgONrKS4Drp839rvAL0XENuD1wEsGuK8kSZJUueUE6dOP/5GZ92TmJHAIuAloDfCsdwOXRsSZwPnA3867793AFcCHgXdn5mcHuK8kSZJUucjME58Q8UfAH2Xm+xccfwHwlswcPelDInqZ2YqIW4BrgQcDHwCuyMyL5p33RWBHZt6xyD0mgUmA8fH123fv3X/Sf9zJbN2wbsX3UJler0erNcj/h2mY2L9ms3/NZv+azx42S6fTOZiZOxYbO+lbOzLzuUsc/wPgDwas5X3AG4A2cNYi49/v/yz2vClgCmDjps255/DKXzgyvbO94nuoTLfbpd1u112GCtm/ZrN/zWb/ms8enjoGSqMRsQV4GHDm8WOZ+Y4BbvE24FuZeTgi2oM8W5IkSRomyw7SEfHbzM0kPwz4c+BpwEeAZQfpzPwysG+wEiVJkqThM8iM9LOBRwAfy8z/NSLOYZlLOzLzhxYCZWYX6C44NrGc+42NjnDEzVQkSZJUo0G2CJ/JzO8D90TETwB3AJvWpixJkiRpuA0yI31LRNwH2A8cBHrAR9ekKkmSJGnILTtIZ+av9f98a0S8H/iJzPzE8fGIeHh/l0JJkiTplDfI0o5/lpnT80N03ztXoR5JkiSpEYqC9BIG2S5ckiRJarTVDNIn3iJRkiRJOoWsZpCWJEmSfmSsZpD+3ireS5IkSRpqg24R/kzgif2P/yMz/9vxscx8zGoWJkmSJA2zQbYIfx3wKOBd/UMvjYjHZeZvDXCP3vFdDiPi6cxtF34h8KvAC4Gv92t6VWa+b6n7zBybZWLXgeU+dknT7o4oSZKkQoPMSD8D2Nbf3ZCIeDvwMWDZQfq4iLgQuAZ4SmZ+MSIArsrMN0TEzwA3R8TZx58lSZIkDZtB10jfZ97f60oeGBEXMLc74jMy8+jC8cz8FHAPMF5yf0mSJKkKg8xIvw74WET8FXPvjH4ig89GnwHcALQz89OLnRARjwa+z9wyD0mSJGkoRebyX/8cEfcHfo65IP23mfk/B3pYxN3ATcDRzHzZvONX8i9rpL/N3BrpmxdcOwlMAoyPr9++e+/+QR69qK0biibVtQp6vR6tVqvuMlTI/jWb/Ws2+9d89rBZOp3OwczcsdjYsoN0RDweOJSZ34mI5wKPBPZl5heWW0hE9ICzgQ8CN2bma/vHrwR6mfmG5dxn46bNedol+5b72CX5ZcP6dLtd2u123WWokP1rNvvXbPav+exhs0TEkkF6
kDXSbwHujohHAL8BfAF4x6DFZObdwEXAzoh4/qDXS5IkScNgkDXS92RmRsS/Bq7OzP8UEZeXPDQz74qIpwIfjog7S+4hSZIk1WmQIP3tiPgt4LnAEyNiBBgd5GHH3yHd//tLwIP6H28Y5D6SJElS3QYJ0v8WeA7w/Mz8nxGxEfi/16asExsbHeGI65slSZJUo2UH6f4bOt447/MXKVgjLUmSJJ0KThqkI+IjmfmEiPg2MP8VHwFkZv7EmlUnSZIkDamTBunMfEL/94+vfTmSJElSMwy6RbgkSZIkDNKSJElSEYO0JEmSVMAgLUmSJBUwSEuSJEkFBtmQZVVExKuZ29hlFvg+8DXgE5n5m/3xc4G/Ah6Zmd9c7B4zx2aZ2HVgxbVMu6mLJEmSClUapCPiscBFzIXk70bEOHAG8KGIuC4zPwXsA/7PpUK0JEmSNAyqXtpxf+DOzPwuQGbemZlfAV4BvDkingb8eGa+q+K6JEmSpIFUHaQ/ADwwIj4TEW+OiCcBZOafA3cxt+X4r1VckyRJkjSwyMyTn7WaD4wYAS4AOsCLgF2ZeV1E/DxwRWY+fYnrJoFJgPHx9dt3792/4lq2bli34nuoTK/Xo9Vq1V2GCtm/ZrN/zWb/ms8eNkun0zmYmTsWG6v8y4aZOQt0gW5EHAYuB65j7ouH3z/BdVPAFMDGTZtzz+GVlz69s73ie6hMt9ul3W7XXYYK2b9ms3/NZv+azx6eOipd2hERD4mIB887tA34QpU1SJIkSauh6hnpFnBNRNwHuAf4HP3lGpIkSVKTVBqkM/Mg8LglxrrMLfmQJEmShl7la6RXw9joCEfcTEWSJEk1cotwSZIkqYBBWpIkSSpgkJYkSZIKGKQlSZKkAgZpSZIkqYBBWpIkSSpgkJYkSZIKGKQlSZKkApVsyBIRFwO/veDw+cAzMvMvIuLlwOuAczLzWye738yxWSZ2HVhRTdNu6CJJkqQVqGRGOjPfm5nbjv8AbwZuBv57/5TLgL8DLq6iHkmSJGmlKl/aERE/DewGfjkzvx8R5wEt4DXMBWpJkiRp6FUapCNiFPhj4IrM/GL/8GXA9czNUD8kIs6usiZJkiSpRGRmdQ+LeD1w/8y8fN6xW4GLM/OzEfFG4GhmXrvItZPAJMD4+Prtu/fuX1EtWzesW9H1Wpler0er1aq7DBWyf81m/5rN/jWfPWyWTqdzMDN3LDZWWZCOiDawH3hkZn67f+x85tZGf61/2unA5zPzCSe618ZNm/O0S9fQj9YAABiLSURBVPatqB6/bFivbrdLu92uuwwVsn/NZv+azf41nz1slohYMkhXsrQjIu4L/CHwvOMhuu8y4MrMnOj//BSwISLOraIuSZIkqVQlr78DXgycDbwlIuYfXwc8bcG57wUuBf6vakqTJEmSBldJkM7M1zH3nujlnPuKNS5HkiRJWrGqZqRX1djoCEdc4yxJkqQauUW4JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSgco2ZImIs4AP9T/+JDALfL3/+RHAx/v13A78cmZ+c6l7zRybZWLXgRXVM+2GLpIkSVqBymakM/MbmbktM7cBbwWumvf5O/2/twB3AS+pqi5JkiSpxDAu7fgbYEPdRUiSJEknMlRBOiJGgAuB99VdiyRJknQikZnVPzTiSqCXmW/of54FDgMTwEHgKZk5u+CaSWASYHx8/fbde/evqIatG9at6HqtTK/Xo9Vq1V2GCtm/ZrN/zWb/ms8eNkun0zmYmTsWG6vsy4YnMZOZ2yJiHXAjc2ukr55/QmZOAVMAGzdtzj2HV1b69M72iq7XynS7Xdrtdt1lqJD9azb712z2r/ns4aljqJZ2ZOa3gJcCV0TEaN31SJIkSUsZqiANkJkfY+5VeJfWXYskSZK0lFqWdmTmlQs+txZ8/leVFiRJkiQNaFjWSA9kbHSEI26oIkmSpBoN3dIOSZIkqQkM0pIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFWjkhiwzx2aZ2HWg+PppN3ORJEnSCtU2Ix0Rvf7viYiYiYhD836eV1ddkiRJ0nIMy4z00czcVncRkiRJ0nK5RlqSJEkqMCxB+rwFSzsuqLsgSZIk6UQiM+t5cEQvM1sRMQHcmJlbTnL+JDAJMD6+fvvuvfuLn711w7ria7U6er0erVar7jJUyP41m/1rNvvXfPawWTqdzsHM3LHY2LCskT6pzJwCpgA2btqcew6Xlz69s71KValUt9ul3W7XXYYK2b9ms3/NZv+azx6eOoZlaYckSZLUKMMyI31eRBya9/ltmXl1bdVIkiRJJ1FbkM7MVv/3NDA2yLVjoyMccVMVSZIk1cilHZIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQWGZWfDgcwcm2Vi14Hi66fdzEWSJEkrVOmMdETMRsShiLg1Iv5rRNy7f/zVEXFbRHyiP/7oKuuSJEmSBlX1jPRMZm4DiIh3AS+OiL8BLgIemZnfjYhx4PSK65IkSZIGUufSjpuB84Fp4M7M/C5AZt5ZY02SJEnSstTyZcOIuBfwNOAw8AHggRHxmYh4c0Q8qY6aJEmSpEFEZlb3sIhZ5sIzzM1IvzIzvxcRI8AFQAd4EbArM69bcO0kMAkwPr5+++69+4vr2LphXfG1Wh29Xo9Wq1V3GSpk/5rN/jWb/Ws+e9gsnU7nYGbuWGys6iDdy8wT/pcTEc8GLs/Mf7XUORs3bc7TLtlXXIdv7ahft9ul3W7XXYYK2b9ms3/NZv+azx42S0QsGaRrf490RDwkIh4879A24At11SNJkiQtxzC8R7oFXBMR9wHuAT5HfwmHJEmSNKwqDdKLLevIzIPA4wa5z9joCEdcniFJkqQa1b60Q5IkSWoig7QkSZJUwCAtSZIkFTBIS5IkSQUM0pIkSVIBg7QkSZJUwCAtSZIkFTBIS5IkSQWGYWfDgc0cm2Vi14Hi66fdzEWSJEkrVPmMdERcHBEZEQ/tf56IiJmIODTv5/Sq65IkSZIGUcfSjsuAjwCXzjt2NDO3zfv5Xg11SZIkSctWaZCOiBbweOD5/GCQliRJkhql6hnpZwHvz8zPAHdFxCP7x8+bt6zj2oprkiRJkgYWmVndwyIOAHsz8y8j4qXAA4FrgRszc8tJrp0EJgHGx9dv3713f3EdWzesK75Wq6PX69FqteouQ4XsX7PZv2azf81nD5ul0+kczMwdi41VFqQj4izgy8AdQAIj/d9PAv7byYL0fBs3bc7TLtlXXItv7ahft9ul3W7XXYYK2b9ms3/NZv+azx42S0QsGaSrXNrxbOAdmXluZk5k5gOB24EHVFiDJEmStCqqDNKXAe9dcOxPgFdVWIMkSZK0KirbkCUz24scuxq4etB7jY2OcMTlGZIkSaqRW4RLkiRJBQzSkiRJUgGDtCRJklTAIC1JkiQVMEhLkiRJBQzSkiRJUgGDtCR
JklTAIC1JkiQVqGxDltU0c2yWiV0Hiq+fdjMXSZIkrVClM9IR0Vvk2JUR8ZWIOBQRn46It0SEM+WSJEkaasMSWK/KzG3Aw4CtwJNqrkeSJEk6oWEJ0sedDpwJ/GPdhUiSJEknMixB+uURcQj4GvCZzDxUd0GSJEnSiURmVvewiF5mthYcuxLoZeYbImIUeA9wfWa+e8F5k8AkwPj4+u279+4vrmPrhnXF12p19Ho9Wq3WyU/UULJ/zWb/ms3+NZ89bJZOp3MwM3csNjZUb+3IzGMR8X7gicC7F4xNAVMAGzdtzj2Hy0uf3tleQZVaDd1ul3a7XXcZKmT/ms3+NZv9az57eOoYlqUdAEREAI8DjtZdiyRJknQiVc9I3zsivjzv8xv7v18eEc8FRoFPAG+uuC5JkiRpIJUG6cxcagb8ykHuMzY6whE3VZEkSVKNhmpphyRJktQUBmlJkiSpgEFakiRJKmCQliRJkgoYpCVJkqQCBmlJkiSpgEFakiRJKmCQliRJkgpUvbPhqpg5NsvErgMDXzftJi6SJElaJZXNSEfEbEQciohbI+K/RsS9+8d7C877lYh4U1V1SZIkSSWqXNoxk5nbMnML8D3gxRU+W5IkSVpVda2RvhnYXNOzJUmSpBWrfI10RNwLeBrw/v6hsYg4NO+U+wHvq7ouSZIkaRCRmdU8KGIWONz/eDPwysz8XkT0MrM177xfAXZk5q8vuH4SmAQYH1+/fffe/QPXsHXDusLqtdp6vR6tVuvkJ2oo2b9ms3/NZv+azx42S6fTOZiZOxYbq3JGeiYzt5VenJlTwBTAxk2bc8/hwUuf3tkufbxWWbfbpd1u112GCtm/ZrN/zWb/ms8enjp8j7QkSZJUwCAtSZIkFahsacf8ddAnOp6Z1wHXVVCSJEmSVKyROxuOjY5wxF0KJUmSVCOXdkiSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBWoZEOWiJgFDgOjwD3A24G9mfn9iGgDNwC3z7vkisz84FL3mzk2y8SuAwPXMe0mLpIkSVolVe1sOJOZ2wAi4mzgj4F1wG/3x2/OzIsqqkWSJElascqXdmTmHcAk8OsREVU/X5IkSVoNVc1I/4DM/HxEnAac3T90QUQcmnfKv8nMozWUJkmSJC1LZObaPySil5mtBce+CTwE+Bnm1kSfcGlHREwyN5PN+Pj67bv37h+4jq0b1g18jdZGr9ej1Wqd/EQNJfvXbPav2exf89nDZul0Ogczc8diY7XMSEfEJmAWuIO5IH1SmTkFTAFs3LQ59xwevPTpne2Br9Ha6Ha7tNvtustQIfvXbPav2exf89nDU0fla6QjYj3wVuBNWcV0uCRJkrQGqpqRHuuvgT7++rt3Am+cN75wjfR/yMz3VFSbJEmSNLBKgnRmjpxgrMvcq/AkSZKkxqhljfRKjY2OcMTNVSRJklQjtwiXJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqUNmGLBHRy8xW/++nA/uAC4FfBV4IfB04Hfj3mXn9ie41c2yWiV0HBnr+tBu4SJIkaRVVPiMdERcC1wBPzcwv9g9flZnbgH8N/MeIGK26LkmSJGkQlQbpiLgA2A88IzOPLhzPzM8CdwP3rbIuSZIkaVCVLe0AzgBuANqZ+enFToiIRwKfzcw7KqxLkiRJGlhkZjUPirgbuAk4mpkvm3f8SubWSPeATcwt+fjQItdPApMA4+Prt+/eu3+g52/dsK64dq2+Xq9Hq9WquwwVsn/NZv+azf41nz1slk6nczAzdyw2VmWQ7gFnAx8EbszM1/aPXwn0MvMNEfGLzK2fPi8z/2mpe23ctDlPu2TfQM/3y4bDpdvt0m636y5Dhexfs9m/ZrN/zWcPmyUilgzSla6Rzsy7gYuAnRHx/EXG/xS4Bbi8yrokSZKkQVW5RhqAzLwrIp4KfDgi7lzklN8F/jgi9mfm9ysuT5IkSVqWyoL08XdI9//+EvCg/scbFpx3EHhIVXVJkiRJJSqfkV4NY6MjHHHNsyRJkmrkFuGSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkFDNKSJElSAYO0JEmSVMAgLUmSJBUwSEuSJEkF1nxDlohI4I8y85f7n+8FfA3428y8KCJeATw8M5/fH98JPCczl9xxZebYLBO7DgxUx7QbuEiSJGkVVbGz4XeALRExlpkzwJOBr8wbvxq4JSIeD9wG/AfgwgrqkiRJkopVtbTjL4DjU8KXAdcfH8jMe4BfA64Ffh94W2Z+vqK6JEmSpCJVBel3A5dGxJnA+cDfzh/MzL8GPgX8AnNhWpIkSRpqkZlr+4CIXma2IuIW5madHwx8ALgiMy/qn9MCPgacCTw2M7+8yH0mgUmA8fH123fv3T9QHVs3rFvRv0Orq9fr0Wq16i5Dhexfs9m/ZrN/zWcPm6XT6RzMzB2LjVWxRvq49wFvANrAWQvGfgf4I+AfgKuAX1p4cWZOAVMAGzdtzj2HByt9emd70Hq1hrrdLu12u+4yVMj+NZv9azb713z28NRRZZB+G/CtzDwcEe3jByNiK3Prp7cB3wN+NSKenJl/WWFtkiRJ0kAqe490Zn45M/fNPxYRAbwFeHlm/lNmfp+5Lx7ui4jTq6pNkiRJGtSaz0hn5g8tAsrMLtDtf3zCgrFbgIetdV2SJEnSSlS5tGPVjI2OcMQNViRJklQjtwiXJEmSChikJUmSpAIGaUmSJKmAQVqSJEkqYJCWJEmSChikJUmSpAIGaUmSJKmAQVqSJEkq0MgNWWaOzTKx68Cyzp124xZJkiStgUpnpCPi1RFxW0R8IiIORcSj+8fvFRF3RsTrqqxHkiRJKlVZkI6IxwIXAY/MzPOBXwC+1B9+CnAEuCQioqqaJEmSpFJVzkjfH7gzM78LkJl3ZuZX+2OXAfuALwKPqbAmSZIkqUiVQfoDwAMj4jMR8eaIeBJARIwBFwI3AtczF6olSZKkoRaZWd3DIkaAC4AO8CJgF/Ad4FmZuTMizgIOAROZObvg2klgEmB8fP323Xv3L+uZWzesW71/gFZNr9ej1WrVXYYK2b9ms3/NZv+azx42S6fTOZiZOxYbqzRI/8CDI54NXA4cAx4PzPSHzgaemZkfXOrajZs252mX7FvWc3xrx3Dqdru02+26y1Ah+9ds9q/Z7F/z2cNmiYglg3SVXzZ8SEQ8eN6hbcDXgScAGzNzIjMngJfg8g5JkiQNuSrfI90CromI+wD3AJ8D/hq49/EvIPbdAPx+RJyx4LgkSZI0NCoL0pl5EHjcMs67C1h/onPGRkc44pINSZIk1cgtwiVJkqQCBmlJkiSpgEFakiRJKmCQliRJkgoYpCVJkqQCBmlJkiSpgEFakiRJKmCQliRJkgpUubPhqpk5NsvErgPLOnfajVskSZK0BmoN0hHRAx4LvLN/aCPwrf7PnZn5C3XVJkmSJJ1I7TPSmXkY2AYQEdcBN2bme2otSpIkSToJ10
hLkiRJBQzSkiRJUoHIzPoeHtHLzNa8z9exxNKOiJgEJgHGx9dv3713/7KesXXDutUpVquq1+vRarVOfqKGkv1rNvvXbPav+exhs3Q6nYOZuWOxsdrXSC9XZk4BUwAbN23OPYeXV/r0zvYaVqVS3W6XdrtddxkqZP+azf41m/1rPnt46nBphyRJklTAIC1JkiQVqHVpx/z10f3Pv7Kc68ZGRzjiRiuSJEmqkTPSkiRJUgGDtCRJklTAIC1JkiQVMEhLkiRJBQzSkiRJUgGDtCRJklTAIC1JkiQVMEhLkiRJBWrdkKXUzLFZJnYdWNa5027cIkmSpDVQyYx0RMxGxKGI+HhE/H1EPK5/fCIiZiLiYxHxqYj4aERcXkVNkiRJ0kpUNSM9k5nbACLifwFeBzypP3Y0M3+2P7YJ+NOIOC0z/7Ci2iRJkqSB1bFG+ieAf1xsIDM/D7wCeGmlFUmSJEkDqmpGeiwiDgFnAvcHfv4E5/498NBKqpIkSZIKRWau/UMiepnZ6v/9WOAPgC3AucCNmbll3rn3Bb6amWML7jEJTAKMj6/fvnvv/mU9e+uGdavyb9Dq6vV6tFqtustQIfvXbPav2exf89nDZul0Ogczc8diY5W/tSMz/yYixoH1S5zys8CnFrluCpgC2Lhpc+45vLzSp3e2ywrVmup2u7Tb7brLUCH712z2r9nsX/PZw1NH5WukI+KhwAjwjUXGJoA3ANdUW5UkSZI0mKrXSAMEcHlmzkYEwHkR8THm1k9/G7jGN3ZIkiRp2FUSpDNzZInj08DYYmMnMjY6whE3WpEkSVKN3CJckiRJKmCQliRJkgoYpCVJkqQCBmlJkiSpQCUbsqy2iPg2cKTuOrQi48CddRehYvav2exfs9m/5rOHzXJuZi66/0nlG7KskiNL7TCjZoiIW+xhc9m/ZrN/zWb/ms8enjpc2iFJkiQVMEhLkiRJBZoapKfqLkArZg+bzf41m/1rNvvXfPbwFNHILxtKkiRJdWvqjLQkSZJUq8YF6Yh4akQciYjPRcSuuuvRv4iI6Yg4HBGHIuKW/rH7RcRfRsRn+7/v2z8eEXF1v4+fiIhHzrvP5f3zPxsRl9f17znVRcTbIuKOiLh13rFV61dEbO//9/C5/rVR7b/w1Pf/t3cHIVKWcRzHv3+s3YMZrYqybEJrePFUi8hG4qHDqnvZgg57MioIoiAPHRQvXgv0ZBSEgoZkaUVepCSELrVGoaYs2mpBq4silnoq03+H5z/yMsz7wo6z88478/vAwzzzzDvD+/Kb/zsP877vTE6Gu8zsStThaTMbzzy2I/K4YGabMuMN96tmNmxmU5HtZ2bW176t625mtsrMTprZtJmdN7N3Ylw1WBEFGaoGe4m7V6YBi4BLwGqgDzgDrC17vdQe5PMHsLxu7H1ge/S3A+9Ffxw4DhgwCkzF+FLgctwORH+g7G3rxgZsBEaAcwuRF3AKeC6ecxzYUvY2d1vLyXAX8G6DZdfGPrMfGI596aKi/SrwOTAZ/Y+AN8ve5m5pwCAwEv0lwMXISDVYkVaQoWqwh1rVvpFeD8y4+2V3/xc4DEyUvE5SbAI4EP0DwIuZ8YOe/Ag8YWaDwCbghLvfdPe/gBPA5navdC9w9++Bm3XDLckrHnvc3X/w9AlwMPNa0iI5GeaZAA67+z/u/jswQ9qnNtyvxreXLwBH4/nZ94M8JHefc/dfon8HmAaGUA1WRkGGeVSDXahqE+kh4M/M/VmK37TSXg58a2Y/m9kbMbbS3ecg7XSAFTGel6UyLler8hqKfv24tMfbcfh/f+3UAOaf4TLgb3f/r25cWszMngKeBaZQDVZSXYagGuwZVZtINzq/Sz870jmed/cRYAvwlpltLFg2L0tl3Jnmm5dyLM+HwNPAM8AcsDvGlWEHMrPHgC+Abe5+u2jRBmPKrwM0yFA12EOqNpGeBVZl7j8JXC1pXaSOu1+N2+vAV6TDVdfiECNxez0Wz8tSGZerVXnNRr9+XBaYu19z93vufh/4mFSHMP8Mb5BOH3ikblxaxMweJU3ADrn7lzGsGqyQRhmqBntL1SbSPwFr4irWPmASOFbyOglgZovNbEmtD4wB50j51K4ifwX4OvrHgK1xJfoocCsOY34DjJnZQBwOG4sxaY+W5BWP3TGz0TjPb2vmtWQB1SZh4SVSHULKcNLM+s1sGFhDuhit4X41zqs9Cbwcz8++H+QhRV3sA6bdfU/mIdVgReRlqBrsMWVf7TjfRrpy+SLpCtedZa+P2oNcVpOuND4DnK9lQzrH6zvgt7hdGuMGfBA5/gqsy7zWa6SLMGaAV8vetm5twKekw453Sd+IvN7KvIB1pA+QS8Be4g+g1BY8w08io7OkD+7BzPI7I48LZH7BIW+/GnV9KrI9AvSXvc3d0oANpMP0Z4HT0cZVg9VpBRmqBnuo6Z8NRURERESaULVTO0REREREOoIm0iIiIiIiTdBEWkRERESkCZpIi4iIiIg0QRNpEREREZEmaCItIiIiItIETaRFRERERJqgibSIiIiISBP+B6M0iO9S5HyLAAAAAElFTkSuQmCC\n", 423 | "text/plain": [ 424 | "
    " 425 | ] 426 | }, 427 | "metadata": { 428 | "needs_background": "light" 429 | }, 430 | "output_type": "display_data" 431 | } 432 | ], 433 | "source": [ 434 | "df_plot = df.set_index('iso_a2')['cnt'].sort_values(ascending=False)[:30][::-1]\n", 435 | "df_plot.plot(kind='barh', figsize=(12, 10), grid=True);" 436 | ] 437 | } 438 | ], 439 | "metadata": { 440 | "kernelspec": { 441 | "display_name": "spark", 442 | "language": "python", 443 | "name": "spark" 444 | }, 445 | "language_info": { 446 | "codemirror_mode": { 447 | "name": "ipython", 448 | "version": 3 449 | }, 450 | "file_extension": ".py", 451 | "mimetype": "text/x-python", 452 | "name": "python", 453 | "nbconvert_exporter": "python", 454 | "pygments_lexer": "ipython3", 455 | "version": "3.7.7" 456 | } 457 | }, 458 | "nbformat": 4, 459 | "nbformat_minor": 4 460 | } 461 | -------------------------------------------------------------------------------- /spark-geomesa-hbase.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Spark with GeoMesa and Apache HBase" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import pyspark\n", 27 | "from pyspark import SparkConf\n", 28 | "from pyspark import SparkContext\n", 29 | "from pyspark.sql import SparkSession\n", 30 | "\n", 31 | "import geomesa_pyspark" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "geomesa_fs_jar_filepath = os.path.join(\n", 41 | " os.environ['GEOMESA_FS_HOME'], \n", 42 | " \"dist/spark/geomesa-fs-spark-runtime_2.11-3.0.0.jar\")\n", 43 | "geomesa_hbase_jar_filepath = os.path.join(\n", 44 | " os.environ['GEOMESA_HBASE_HOME'],\n", 45 | " \"dist/spark/geomesa-hbase-spark-runtime-hbase2_2.11-3.0.0.jar\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "conf = geomesa_pyspark.configure(\n", 55 | " jars=[geomesa_hbase_jar_filepath],\n", 56 | " packages=['geomesa_pyspark','pytz'],\n", 57 | " spark_home=os.environ['SPARK_HOME']).\\\n", 58 | " setAppName('geomesa_hbase')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "CPU times: user 48.9 ms, sys: 21.8 ms, total: 70.7 ms\n", 71 | "Wall time: 30.9 s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "%%time\n", 77 | "spark = SparkSession \\\n", 78 | " .builder \\\n", 79 | " .config(conf=conf) \\\n", 80 | " .enableHiveSupport() \\\n", 81 | " .getOrCreate()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Prepare DataFrames" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "CPU times: user 9.78 ms, sys: 4.66 ms, total: 14.4 ms\n", 101 | "Wall time: 50 s\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "86770" 108 | ] 109 | }, 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "output_type": 
"execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "%%time\n", 117 | "params = {\"hbase.catalog\": \"gdelt_custom_20200101\"}\n", 118 | "\n", 119 | "df_hbase = spark.read.format(\"geomesa\") \\\n", 120 | " .options(**params) \\\n", 121 | " .option(\"geomesa.feature\", \"gdelt_custom\") \\\n", 122 | " .load()\n", 123 | "\n", 124 | "df_hbase.count()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "CPU times: user 3.99 ms, sys: 0 ns, total: 3.99 ms\n", 137 | "Wall time: 2.42 s\n" 138 | ] 139 | }, 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "176" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "%%time\n", 153 | "params = {\"hbase.catalog\": \"ne_countries\"}\n", 154 | "\n", 155 | "df_hbase_ne = spark.read.format(\"geomesa\") \\\n", 156 | " .options(**params) \\\n", 157 | " .option(\"geomesa.feature\", \"ne_countries\") \\\n", 158 | " .load()\n", 159 | "\n", 160 | "df_hbase_ne.count()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "+--------+--------------------+-----------+\n", 173 | "|database| tableName|isTemporary|\n", 174 | "+--------+--------------------+-----------+\n", 175 | "| default| gdelt_csv_2019| false|\n", 176 | "| default| gdelt_csv_2020| false|\n", 177 | "| default| gdelt_parquet_2020| false|\n", 178 | "| default|gdelt_parquet_ins...| false|\n", 179 | "| default|gdelt_parquet_ins...| false|\n", 180 | "| default|ne_10_states_prov...| false|\n", 181 | "| default|ne_110_countries_...| false|\n", 182 | "| |gdelt_custom_2020...| true|\n", 183 | "| | ne_countries| true|\n", 184 | "+--------+--------------------+-----------+\n", 185 | "\n", 186 | "CPU times: user 29.9 ms, sys: 14.9 ms, total: 44.7 ms\n", 187 | "Wall time: 3min 1s\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%time\n", 193 | "df_hbase.createOrReplaceTempView(\"gdelt_custom_20200101\")\n", 194 | "df_hbase_ne.createOrReplaceTempView(\"ne_countries\")\n", 195 | "spark.sql(\"SHOW TABLES\").show()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# Geospatial Functions" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 9, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "EXTENT = [9.5307, 46.3723, 17.1608, 49.0205]" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 10, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "CPU times: user 479 ms, sys: 241 ms, total: 720 ms\n", 224 | "Wall time: 2min 8s\n" 225 | ] 226 | }, 227 | { 228 | "data": { 229 | "text/html": [ 230 | "
    \n", 231 | "\n", 244 | "\n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
    event_root_codecnt
    0472
    1139
    2531
    3318
    4213
    \n", 280 | "
    " 281 | ], 282 | "text/plain": [ 283 | " event_root_code cnt\n", 284 | "0 4 72\n", 285 | "1 1 39\n", 286 | "2 5 31\n", 287 | "3 3 18\n", 288 | "4 2 13" 289 | ] 290 | }, 291 | "execution_count": 10, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "%%time\n", 298 | "df = spark.sql(\"\"\"\n", 299 | " SELECT\n", 300 | " event_root_code,\n", 301 | " COUNT(event_id) AS cnt\n", 302 | " FROM\n", 303 | " gdelt_custom_20200101\n", 304 | " WHERE \n", 305 | " ST_Within(geom, st_makeBBOX({}, {}, {}, {}))\n", 306 | " GROUP BY event_root_code\n", 307 | " ORDER BY cnt DESC\n", 308 | "\"\"\".format(*EXTENT)).toPandas()\n", 309 | "\n", 310 | "df.head()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "# Spatial Join" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 11, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "CPU times: user 30.9 ms, sys: 10.6 ms, total: 41.5 ms\n", 330 | "Wall time: 59.2 s\n" 331 | ] 332 | }, 333 | { 334 | "data": { 335 | "text/html": [ 336 | "
    \n", 337 | "\n", 350 | "\n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
    iso_a2cnt
    0MM65
    1DZ56
    2LT30
    3CI47
    4AZ48
    \n", 386 | "
    " 387 | ], 388 | "text/plain": [ 389 | " iso_a2 cnt\n", 390 | "0 MM 65\n", 391 | "1 DZ 56\n", 392 | "2 LT 30\n", 393 | "3 CI 47\n", 394 | "4 AZ 48" 395 | ] 396 | }, 397 | "execution_count": 11, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "%%time\n", 404 | "df = spark.sql(\"\"\"\n", 405 | " SELECT\n", 406 | " c.iso_a2,\n", 407 | " COUNT(g.event_id) AS cnt\n", 408 | " FROM\n", 409 | " gdelt_custom_20200101 AS g,\n", 410 | " ne_countries AS c\n", 411 | " WHERE ST_Within(g.geom, c.polygons)\n", 412 | " GROUP BY c.iso_a2\n", 413 | "\"\"\").toPandas()\n", 414 | "\n", 415 | "df.head()" 416 | ] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "spark", 422 | "language": "python", 423 | "name": "spark" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.7.7" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 4 440 | } 441 | -------------------------------------------------------------------------------- /spark-parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Spark Parquet" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from pyspark import SparkConf\n", 29 | "from pyspark import SparkContext\n", 30 | "from pyspark.sql import SparkSession\n", 31 | "\n", 32 | "conf = SparkConf()\n", 33 | "conf.setMaster('yarn')\n", 34 | "conf.setAppName('PySparkGeoMesaGDELT')\n", 35 | "\n", 36 | "sc = SparkContext(conf=conf)\n", 37 | "spark = SparkSession(sc)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Load CAMEO Event Codes" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 12, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
    \n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | "
    event_description
    event_code
    1Make Public Statement
    10Make Statement, Not Specified Below
    11Decline Comment
    12Make Pessimistic Comment
    13Make Optimistic Comment
    \n", 103 | "
    " 104 | ], 105 | "text/plain": [ 106 | " event_description\n", 107 | "event_code \n", 108 | "1 Make Public Statement\n", 109 | "10 Make Statement, Not Specified Below\n", 110 | "11 Decline Comment\n", 111 | "12 Make Pessimistic Comment\n", 112 | "13 Make Optimistic Comment" 113 | ] 114 | }, 115 | "execution_count": 12, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "df_codes = pd.read_csv('data/CAMEO.eventcodes.txt', delimiter='\\t')\n", 122 | "df_codes = df_codes.rename(columns={\n", 123 | " 'CAMEOEVENTCODE': 'event_code', \n", 124 | " 'EVENTDESCRIPTION': 'event_description'\n", 125 | "})\n", 126 | "df_codes['event_code'] = df_codes['event_code'].astype(int)\n", 127 | "df_codes['event_description'] = df_codes['event_description'].str.title()\n", 128 | "df_codes = df_codes.set_index('event_code')\n", 129 | "df_codes.head()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "CPU times: user 5.58 ms, sys: 0 ns, total: 5.58 ms\n", 142 | "Wall time: 6.14 s\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "%%time\n", 148 | "filepath = \"hdfs://node-master:54310/user/hadoop/gdelt_parquet_2020/2020.snappy.parq\"\n", 149 | "parquetDF = spark.read.parquet(filepath)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "CPU times: user 5.78 ms, sys: 3.69 ms, total: 9.47 ms\n", 162 | "Wall time: 35 s\n" 163 | ] 164 | }, 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "24330238" 169 | ] 170 | }, 171 | "execution_count": 6, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "%%time\n", 178 | "parquetDF.count()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 9, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "CPU times: user 10.3 ms, sys: 2.34 ms, total: 12.6 ms\n", 191 | "Wall time: 1.97 s\n" 192 | ] 193 | }, 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "Row(event_date=datetime.datetime(2010, 3, 7, 0, 0))" 198 | ] 199 | }, 200 | "execution_count": 9, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "%%time\n", 207 | "parquetDF.select('event_date').first()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 7, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "['event_id', 'event_date', 'event_code', 'event_base_code', 'event_root_code', 'lat', 'lon', 'source_url', 'date']\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "print(parquetDF.columns)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 10, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "CPU times: user 24.8 ms, sys: 1.66 ms, total: 26.4 ms\n", 237 | "Wall time: 40 s\n" 238 | ] 239 | }, 240 | { 241 | "data": { 242 | "text/html": [ 243 | "
    \n", 244 | "\n", 257 | "\n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | "
    event_root_codecount
    045919732
    113617853
    221979791
    351879386
    431640332
    \n", 293 | "
    " 294 | ], 295 | "text/plain": [ 296 | " event_root_code count\n", 297 | "0 4 5919732\n", 298 | "1 1 3617853\n", 299 | "2 2 1979791\n", 300 | "3 5 1879386\n", 301 | "4 3 1640332" 302 | ] 303 | }, 304 | "execution_count": 10, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "%%time\n", 311 | "df = parquetDF.groupby('event_root_code') \\\n", 312 | " .count() \\\n", 313 | " .orderBy('count', ascending=False) \\\n", 314 | " .toPandas()\n", 315 | "df.head()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
    \n", 327 | "\n", 340 | "\n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | "
    event_root_codecountevent_description
    045919732Consult
    113617853Make Public Statement
    221979791Appeal
    351879386Engage In Diplomatic Cooperation
    431640332Express Intent To Cooperate
    \n", 382 | "
    " 383 | ], 384 | "text/plain": [ 385 | " event_root_code count event_description\n", 386 | "0 4 5919732 Consult\n", 387 | "1 1 3617853 Make Public Statement\n", 388 | "2 2 1979791 Appeal\n", 389 | "3 5 1879386 Engage In Diplomatic Cooperation\n", 390 | "4 3 1640332 Express Intent To Cooperate" 391 | ] 392 | }, 393 | "execution_count": 13, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "df_plot = df.join(df_codes, on='event_root_code', how='left')\n", 400 | "df_plot.head()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 14, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA30AAAFyCAYAAABBQ7NFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzdeZRdVZ3+//dDmE0ICkEBxaIRiEAgQhFlEIId6a+iMoiGiP0TUdJ0i6CIdL42KGirsbEVFAWDzSAiIEOUQRlUwgyhEpIUsz8g2EYcUAwJQ4DwfP84u+RyvTUldasqN89rrbvq3L332edzThVr5cMejmwTERERERERrWmNoQ4gIiIiIiIimidJX0RERERERAtL0hcREREREdHCkvRFRERERES0sCR9ERERERERLSxJX0RERERERAtbc6gDiGh1G2+8sdva2oY6jIiIiIhoYXPmzHnC9phGdUn6Ipqsra2Njo6OoQ4jIiIiIlqYpMe6q8v0zoiIiIiIiBaWpC8iIiIiIqKFZXpnrHIkvQ44FdgVWAYsBD5l+6EmXrMNuMr2DpLGA5vZ/llfzu1ctJi2aVc3K7SIiIiIGAYWTt9vqEPoVkb6YpUiScBMYJbtrWxvB3wOeO0ghjEeePcgXi8iIiIiYoUl6YtVzT7AC7bP7CqwPQ+4RdIpku6R1ClpMoCkiZJmSbpU0gOSLiiJI5KmS7pP0gJJXy9l50o6uKtvSUtrLy5pbeCLwGRJ87quExERERExXGV6Z6xqdgDmNCg/iGoEbidgY+AuSTeVurcA2wO/A24F9pB0H3AgMNa2JW3Yl4vbfl7S54F220et3K1ERERERDRfRvqiVewJXGh7ue0/ADdSrfkDmG37t7ZfAuYBbcBTwHPA9yUdBDwzkMFImiqpQ1LH8mcWD2TXERERERH9kqQvVjX3Ars0KFcP5yyrOV4OrGn7RWACcBlwAHBNqX+R8t9FmQa69ooEaXuG7Xbb7SPWH70iXUREREREDIgkfbGq+RWwjqQjugok7Qo8SbXOboSkMcBewOzuOpE0EhhdduD8FNXUUKh2Au1KKvcH1mpw+hJg1EreR0RERETEoEjSF6sU26Zai/dOSQ9Luhc4CfgRsACYT5UYHm/79z10NQq4StICqqmgny7lZwF7S5oNvBV4usG5NwDbZSOXiIiIiFgVqPo3dEQ0S3t7uzs6OoY6jIiIiIhoYZLm2G5vVJeRvoiIiIiIiBaWpC8iIiIiIqKFJemLiIiIiIhoYUn6IiIiIiIiWliSvoiIiIiIiBaWpC8iIiIiIqKFJemLiIiIiIhoYWsOdQARra5z0WLapl091GFERMQwsHD6fkMdQkSshjLSN8QkWdL5Nd/XlPQnSVf1ct5hkk5fwWueK+lRSfMkzZW0Wy/tT5J0XIPyNkn3lON2Sd/qRwzvkXS3pPmS7pP0L6X8AEnb9eH8PrVrFkkTJe0+VNePiIiIiOirJH1D72lgB0nrle/vBBYNwnU/a3s8MA343sp2ZrvD9tF9aStpLWAG8F7bOwFvAWaV6gOAviRzfW3XLBOBJH0RERERMewl6Rsefg50zfeYAlzYVSFpgqTbyqjYbZK2rT9Z0n6Sbpe0saR9y/FcSZdIGtnLtW8C3lT6WShp43LcLmlWTbudJP1K0q8lHdEgholdo5OSRko6R1KnpAWS3l/XfBTV1OI/A9heZvvBMnL2PuCUMgq5laQjJN1VRgQvk7R+N+22knSNpDmSbpY0tsRyrqQzJN0g6RFJe0s6W9L9ks6tib/hcyvP5ORS3ilprKQ24Ejg0+X6b+/lGUdEREREDJkkfcPDRcAhktYFdgTurKl7ANjL9luAzwNfqT1R0oFUo3XvLkUnAJNs7wx0AMf2cu33Ap19iHFHqsR0N+Dzkjbroe2JwGLb42zvCPyqttL2X4ArgMckXSjpUElr2L6tlH/W9njbDwOX2961jAjeD3ysm3YzgE/a3gU4DvhuzSVfDbwD+DRwJfBNYHtgnKTxJdHt6bk9UcrPAI6zvRA4E/hmuf7N9Q9A0lRJHZI6lj+zuOcnGxERERHRRNnIZRiwvaCMHk0BflZXPRo4T9LWgIG1aur2AdqBfW0/Jek9VFMeb5UEsDZwezeXPUXSCcCfgI/1Icyf2n4WeFbSDcAEYF43bScBh9Tc35P1DWx/XNK40vY4qmmthzXoawdJ/wlsCIwErq1vUEbldgcuKfcNsE5NkyttW1In8AfbneW8e4E24PX0/NwuLz/nAAd1c8/19zeDKhFlnU23dl/OiYiIiIhohiR9w8cVwNep1optVFP+JeAG2weWxHBWTd0jwD8A21CNTgm43vaUPlzvs7YvrSt7kZdHf9etq6tPXHpKZNRLfdVBlXx1lo1sHqVx0ncucIDt+ZIOo3o+9dYA/lrWKDayrPx8qea46/uawHJ6fm5d5ywn/81ERERExCom0zuHj7OBL3aNQtUYzcsbuxxWV/cY1cjTDyRtD9wB7CGpa43e+pK26UcMC4FdynH9Orz9Ja0raSOqxOuuHvq5Djiq64ukV9dWljV/E2uKxpd7AVhCteavyyjg8bL5y6E15X9rZ/sp4FFJHyj9S9JOPcRXb0WeW32cERERERHDUkYthgnbvwVOa1D1X1TTO4+lbm1cOe9BSYcCl1CtzzsMuFBS1/TGE4CH+hjGycD/SPocr1xXCDAbuBrYAviS7d+VkcdG/hP4jqrXOSwv/V5eUy/geEnfA56l2sH0sFJ3EXCWpKOBg6nWB95JlRR28nKiVd/uUOCMMmV1rVI/vy83bftPZRSxP8/tSuBSSftTrSX8u3V9XcZtPpqOvJcpI
[… remainder of base64-encoded PNG output omitted: matplotlib horizontal bar chart of event counts per event_description …]\n", 411 | "text/plain": [ 412 | "
    " 413 | ] 414 | }, 415 | "metadata": { 416 | "needs_background": "light" 417 | }, 418 | "output_type": "display_data" 419 | } 420 | ], 421 | "source": [ 422 | "df_plot.set_index('event_description')['count'][:20][::-1].plot(\n", 423 | " kind='barh', figsize=(12, 6));" 424 | ] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "spark", 430 | "language": "python", 431 | "name": "spark" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.7.7" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 4 448 | } 449 | --------------------------------------------------------------------------------