├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── Week1 ├── ClickHouse │ └── docker-compose.yml ├── Postgres │ ├── container-data │ │ └── postgres │ │ │ └── init.sql │ └── docker-compose.yaml ├── employee.png ├── help.txt └── practice.sql ├── Week2 ├── docker-hadoop │ ├── docker-compose.yml │ ├── hadoop.env │ ├── init4win7.bat │ └── tmp │ │ ├── data │ │ ├── checkdata.txt │ │ ├── checkdata2.txt │ │ ├── checkdata3.txt │ │ ├── checkdata4.txt │ │ └── checkdata5.txt │ │ ├── init │ │ └── init.sh │ │ ├── input │ │ ├── input1.txt │ │ └── input2.txt │ │ └── job │ │ └── mr-wordcount.jar └── help.txt ├── Week3 ├── Jupyter │ ├── docker-compose.yaml │ └── notebooks │ │ ├── .ipynb_checkpoints │ │ └── PySparkTasksTemplate-checkpoint.ipynb │ │ ├── PySparkTasksTemplate.ipynb │ │ ├── taxi_cars_data.parquet │ │ └── taxi_data.parquet ├── Project │ ├── PySparkJob.py │ └── clickstream.parquet ├── help.txt ├── spark-practice │ ├── PySpark.ipynb │ ├── PySparkTitanikJob.ipynb │ └── train.csv └── spark-tasks │ ├── PySparkTasksTemplate.ipynb │ ├── taxi_cars_data.parquet │ └── taxi_data.parquet ├── Week4 ├── Airflow │ ├── container-data │ │ └── airflow │ │ │ └── dags │ │ │ ├── calculate_dag.py │ │ │ ├── calculate_parallel_dag.py │ │ │ ├── dummy_dag.py │ │ │ └── hello_dag.py │ └── docker-compose.yaml └── help.txt ├── Week5 ├── SparkML │ ├── Project │ │ ├── PySparkMLFit.py │ │ ├── PySparkMLPredict.py │ │ ├── test.parquet │ │ └── train.parquet │ ├── spark-practice │ │ ├── TitanikSparkML.ipynb │ │ ├── cat_dog │ │ │ ├── PySparkMLDL.ipynb │ │ │ ├── cat_dog.zip │ │ │ ├── scala-logging_2.11-3.9.2.jar │ │ │ └── tensorframes-0.6.0-s_2.11.jar │ │ └── train.csv │ └── spark-tasks │ │ ├── SparkMLTemplate.ipynb │ │ ├── iris.parquet │ │ ├── linear_regression.parquet │ │ └── wine.parquet └── help.txt ├── Week6 ├── Superset │ ├── .env │ ├── config │ │ └── superset_config.py │ └── docker-compose.yaml └── help.txt └── workshops ├── 1. DB_Hadoop ├── CAP-Theorem.png ├── docker-hadoop │ ├── docker-compose.yml │ ├── hadoop.env │ ├── init4win.bat │ └── tmp │ │ ├── data │ │ ├── checkdata.txt │ │ ├── checkdata2.txt │ │ ├── checkdata3.txt │ │ ├── checkdata4.txt │ │ └── checkdata5.txt │ │ ├── init │ │ └── init.sh │ │ └── input │ │ └── input.txt └── hadoop-with-hive │ ├── README.md │ ├── data │ ├── create.sql │ ├── init.sh │ ├── users_20210501.csv │ └── users_20210502.csv │ ├── docker-compose.yml │ └── hadoop.env ├── 2. Spark ├── PySparkShow.ipynb ├── Spark │ ├── PySparkJob.py │ ├── docker-compose.yml │ └── help.txt ├── Streaming │ ├── WordStream │ │ ├── WordStream.py │ │ └── commands.txt │ └── docker-compose.yml └── taxi_data.parquet ├── 3. Airflow ├── container-data │ └── airflow │ │ └── dags │ │ ├── calculate_dag.py │ │ ├── calculate_parallel_dag.py │ │ ├── dummy_dag.py │ │ ├── hello_dag.py │ │ └── test_dag.py └── docker-compose.yaml └── 4. 
Spark ML ├── TitanikSparkML.ipynb ├── test.csv └── train.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | *.sql linguist-detectable=false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | .idea 25 | !/Week2/docker-hadoop/tmp/job/mr-wordcount.jar 26 | !/Week5/SparkML/spark-practice/cat_dog/scala-logging_2.11-3.9.2.jar 27 | !/Week5/SparkML/spark-practice/cat_dog/tensorframes-0.6.0-s_2.11.jar 28 | !/Week5/SparkML/spark-practice/cat_dog/cat_dog.zip 29 | /Week1/Postgres/container-data/postgres/data/ 30 | /venv/ 31 | .cache/ 32 | .ipynb_checkpoints/ 33 | .ipython/ 34 | .jupyter/ 35 | .local/ 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data for Data Science 2 | ## [Stepic Academy Course [RU]](https://academy.stepik.org/big-data) 3 | The repository contains the Docker configurations, templates, and data needed to complete the course's practical assignments.
4 | * Week1 - SQL 5 | * Week2 - Hadoop 6 | * Week3 - Spark 7 | * Week4 - Workflow/Airflow 8 | * Week5 - SparkML 9 | * Week6 - BI/Superset 10 | * Workshops -------------------------------------------------------------------------------- /Week1/ClickHouse/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | server: 4 | image: yandex/clickhouse-server:19.13 5 | restart: always 6 | ports: 7 | - "8123:8123" 8 | - "9000:9000" 9 | - "9009:9009" 10 | volumes: 11 | - ./container-data/clickhouse/data:/var/lib/clickhouse 12 | ulimits: 13 | nproc: 65535 14 | nofile: 15 | soft: 262144 16 | hard: 262144 17 | client: 18 | image: yandex/clickhouse-client:19.13 19 | command: ['--host', 'server'] 20 | -------------------------------------------------------------------------------- /Week1/Postgres/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:10 5 | restart: always 6 | ports: 7 | - 5555:5432 8 | environment: 9 | POSTGRES_PASSWORD: admin 10 | POSTGRES_DB: course 11 | POSTGRES_USER: admin 12 | PGDATA: /tmp 13 | volumes: 14 | - ./container-data/postgres/data:/var/lib/postgresql/data 15 | - ./container-data/postgres/init.sql:/docker-entrypoint-initdb.d/init.sql 16 | -------------------------------------------------------------------------------- /Week1/employee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week1/employee.png -------------------------------------------------------------------------------- /Week1/help.txt: -------------------------------------------------------------------------------- 1 | Running PostgreSQL (run from the Week1/Postgres folder) 2 | 1) Start the containers 3 | docker-compose up -d 4 | 2) Check the running containers 5 | docker ps 6 | 3) Connect to the database 7 | host: localhost 8 | port: 5555 9 | database: course 10 | login: admin 11 | password: admin 12 | 4) Run SQL 13 | ... 14 | 5) Stop the containers 15 | docker-compose down 16 | 17 | Running ClickHouse (run from the Week1/ClickHouse folder) 18 | 1) Start the containers 19 | docker-compose up -d 20 | 2) Check the running containers 21 | docker ps 22 | 3) Connect to the database 23 | host: localhost 24 | port: 8123 25 | database: 26 | login: 27 | password: 28 | 4) Run SQL -------------------------------------------------------------------------------- /Week1/practice.sql: -------------------------------------------------------------------------------- 1 | -- 1. Simple queries 2 | select * from employee; 3 | select id, name, surname from employee; 4 | select id, name, surname from employee limit 10; 5 | select id, concat(name, ' ', surname) as fio from employee; 6 | 7 | -- 2. Aggregate functions 8 | select count(*) from employee; 9 | select count(*), max(salary), min(salary), avg(salary) from employee; 10 | 11 | -- 3.
Applying filter conditions 12 | select count(*) from employee 13 | where salary > 5000; 14 | 15 | select count(*) from employee 16 | where salary > 5000 and salary < 8000; 17 | 18 | select count(*) from employee 19 | where salary >= 8000 or salary <= 5000; 20 | 21 | select count(*) from employee 22 | where name = 'Tom'; 23 | 24 | select count(*) from employee 25 | where name in ('Tom', 'Mark', 'Kate'); 26 | 27 | select count(*) from employee 28 | where name like 'A%'; -- % matches any number of characters 29 | 30 | -- Number of employees whose name starts with A and has 4 letters 31 | select count(*) from employee 32 | where name like 'A___'; -- _ matches exactly one character 33 | 34 | -- Number of employees with a 4-letter name 35 | select count(*) from employee 36 | where length(name) = 4; 37 | 38 | -- 4. Grouping and ordering data 39 | select name, count(name) from employee 40 | where length(name) > 3 41 | group by name; 42 | 43 | select name, count(name) as count from employee 44 | where length(name) > 3 45 | group by name 46 | order by count desc; 47 | 48 | select name, count(name) as count from employee 49 | where length(name) > 3 50 | group by name 51 | having count(name) >= 150 52 | order by count desc; 53 | 54 | -- grouping and ordering work well for finding the most popular values 55 | select name, count(name) as count from employee 56 | group by name 57 | order by count desc 58 | limit 3; 59 | 60 | 61 | -- 5. Join 62 | select c.name, o.country from office o 63 | join company c on o.company_id = c.id; 64 | 65 | select distinct c.name, o.country from office o 66 | join company c on o.company_id = c.id 67 | order by name; 68 | 69 | -- 6. Subqueries 70 | -- Employees with a salary above the average 71 | select name, surname from employee 72 | where salary > (select avg(salary) from employee); 73 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | namenode: 5 | image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 6 | container_name: namenode 7 | restart: always 8 | ports: 9 | - 9870:9870 10 | - 9000:9000 11 | volumes: 12 | - hadoop_namenode:/hadoop/dfs/name 13 | - ./tmp/data:/course/data 14 | - ./tmp/input:/course/input 15 | - ./tmp/job:/job 16 | - ./tmp/init:/init 17 | #command: sh /init/init.sh 18 | environment: 19 | - CLUSTER_NAME=test 20 | env_file: 21 | - ./hadoop.env 22 | 23 | datanode: 24 | image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 25 | container_name: datanode 26 | restart: always 27 | volumes: 28 | - hadoop_datanode:/hadoop/dfs/data 29 | environment: 30 | SERVICE_PRECONDITION: "namenode:9870" 31 | env_file: 32 | - ./hadoop.env 33 | 34 | resourcemanager: 35 | image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 36 | container_name: resourcemanager 37 | restart: always 38 | ports: 39 | - 8088:8088 40 | environment: 41 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864" 42 | env_file: 43 | - ./hadoop.env 44 | 45 | nodemanager1: 46 | image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 47 | container_name: nodemanager 48 | restart: always 49 | environment: 50 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088" 51 | env_file: 52 | - ./hadoop.env 53 | 54 | historyserver: 55 | image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 56 | container_name: historyserver 57 | restart:
always 58 | environment: 59 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088" 60 | volumes: 61 | - hadoop_historyserver:/hadoop/yarn/timeline 62 | env_file: 63 | - ./hadoop.env 64 | 65 | volumes: 66 | hadoop_namenode: 67 | hadoop_datanode: 68 | hadoop_historyserver: 69 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/hadoop.env: -------------------------------------------------------------------------------- 1 | CORE_CONF_fs_defaultFS=hdfs://namenode:9000 2 | CORE_CONF_hadoop_http_staticuser_user=root 3 | CORE_CONF_hadoop_proxyuser_hue_hosts=* 4 | CORE_CONF_hadoop_proxyuser_hue_groups=* 5 | CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec 6 | 7 | HDFS_CONF_dfs_webhdfs_enabled=true 8 | HDFS_CONF_dfs_permissions_enabled=false 9 | HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false 10 | 11 | YARN_CONF_yarn_log___aggregation___enable=true 12 | YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ 13 | YARN_CONF_yarn_resourcemanager_recovery_enabled=true 14 | YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore 15 | YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 16 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 17 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 18 | YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate 19 | YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true 20 | YARN_CONF_yarn_resourcemanager_hostname=resourcemanager 21 | YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 22 | YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 23 | YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 24 | YARN_CONF_yarn_timeline___service_enabled=true 25 | YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true 26 | YARN_CONF_yarn_timeline___service_hostname=historyserver 27 | YARN_CONF_mapreduce_map_output_compress=true 28 | YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec 29 | YARN_CONF_yarn_nodemanager_resource_memory___mb=16384 30 | YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8 31 | YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 32 | YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs 33 | YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle 34 | 35 | MAPRED_CONF_mapreduce_framework_name=yarn 36 | MAPRED_CONF_mapred_child_java_opts=-Xmx4096m 37 | MAPRED_CONF_mapreduce_map_memory_mb=4096 38 | MAPRED_CONF_mapreduce_reduce_memory_mb=8192 39 | MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m 40 | MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m 41 | MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ 42 | MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ 43 | MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ -------------------------------------------------------------------------------- /Week2/docker-hadoop/init4win7.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | docker cp ./tmp/data namenode:/course/data 3 | docker cp ./tmp/input namenode:/course/input 4 | docker cp ./tmp/job 
namenode:/job 5 | docker cp ./tmp/init namenode:/init 6 | ECHO All data has been copied successfully. -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/data/checkdata.txt: -------------------------------------------------------------------------------- 1 | Big Data = Big Issues 2 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/data/checkdata2.txt: -------------------------------------------------------------------------------- 1 | 0101020201020304038974589381219038190288237520834203482039529874359273648723658743753094589023842093742983759843658734643808203482935839485739464389572037409238420938023975394857394653487230423094820934820385739486538746583457289470239482093842093850394739846794836598347528947092384209384209375938457394865834752837402398402938409235830945739845769384653984752093480298405938450934759836798347534759304597 2 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/data/checkdata3.txt: -------------------------------------------------------------------------------- 1 | 0101020201020304038974589381219038190288237520834203482039529874359273648723658743753094589023842093742983759843658734643808203482935839485739464389572037409238420938023975394857394653487230423094820934820385739486538746583457289470239482093842093850394739846794836598347528947092384209384209375938457394865834752837402398402938409235830945739845769384653984752093480298405938450934759836798347534759304597 2 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/data/checkdata4.txt: -------------------------------------------------------------------------------- 1 | Data4 2 | 3 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/data/checkdata5.txt: -------------------------------------------------------------------------------- 1 | Data5 2 | 3 | -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/init/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | hdfs dfs -mkdir /data 3 | hdfs dfs -mkdir /input 4 | hdfs dfs -put /course/data/* /data 5 | hdfs dfs -put /course/input/* /input 6 | echo "Initialization complete" -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/input/input2.txt: -------------------------------------------------------------------------------- 1 | Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big 
Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big 
Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big 
Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big 
Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data 
Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data 
Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big 
Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big 
Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Big Data Big Data Big Data Big Data Big Data Big Data Big Data Stepik Data Science Data Science Data Science Data Science Data Science Data Science Data Science Data Science Stepik Data Science Data Science Data Science Data Science Data Science Data Science -------------------------------------------------------------------------------- /Week2/docker-hadoop/tmp/job/mr-wordcount.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week2/docker-hadoop/tmp/job/mr-wordcount.jar -------------------------------------------------------------------------------- /Week2/help.txt: -------------------------------------------------------------------------------- 1 | Выполнять из папки Week2/docker-hadoop 2 | 1) Запуск контейнеров 3 | docker-compose up -d 4 | 2) Проверка запущенных контейнеров 5 | docker ps 6 | 3) Hadoop UI 7 | http://localhost:9870 8 | 4) Подключение к NameNode 9 | docker exec -it namenode bash 10 | 5) Выполнение команд в HDFS 11 | ... 12 | 6) Запуск MapReduce (Java) 13 | hadoop jar /job/mr-wordcount.jar org.apache.hadoop.examples.WordCount /input /output 14 | 7) Выход из контейнера 15 | exit 16 | 8) Остановка контейнеров 17 | docker-compose down 18 | ВАЖНО! 19 | 9) После остановки контейнеров выполните команду: 20 | docker system prune —-volumes 21 | И выбирете Yes(Y) для удаления метаинформации кластера. 
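For step 5 of the notes above, a minimal illustrative sketch of typical HDFS commands (assuming the /input and /output paths used by the wordcount job in step 6; part-r-00000 is the conventional MapReduce output file name and may differ in your run):
hdfs dfs -ls /                        # list the top-level HDFS directories (/data and /input are created by init.sh)
hdfs dfs -ls /input                   # input files for the wordcount job
hdfs dfs -cat /output/part-r-00000    # inspect the word counts after the job finishes
hdfs dfs -rm -r /output               # clean up before re-running the job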
-------------------------------------------------------------------------------- /Week3/Jupyter/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | jupyter-pyspark: 4 | image: jupyter/pyspark-notebook:4d9c9bd9ced0 5 | command: start.sh jupyter notebook --NotebookApp.token='' 6 | restart: always 7 | ports: 8 | - 8888:8888 9 | - 4040:4040 10 | volumes: 11 | - ./notebooks:/home/jovyan -------------------------------------------------------------------------------- /Week3/Jupyter/notebooks/.ipynb_checkpoints/PySparkTasksTemplate-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# @@@@@@@ @ @ @ @ #\n", 10 | "# @ @ @ @ @ @ @ #\n", 11 | "# @ @ @ @ @@ @ #\n", 12 | "# @ @@@@@@@ @ @ @ #\n", 13 | "# @ @ @ @ @ @ #" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Hi! In this practice session we will apply our PySpark knowledge and try to learn something new along the way.\n",
This lesson uses a dataset built from the Chicago Taxi Rides 2016 data\n", 22 | "
Full PySpark documentation.\n", 23 | "
Data schema:\n", 24 | "
|-- taxi_id = taxi driver identifier\n", 25 | "
|-- trip_start_timestamp = trip start time\n", 26 | "
|-- trip_end_timestamp = trip end time\n", 27 | "
|-- trip_seconds = trip duration in seconds\n", 28 | "
|-- trip_miles = miles traveled during the trip\n", 29 | "
|-- fare = transport costs\n", 30 | "
|-- tips = tips given\n", 31 | "
|-- trip_total = total trip cost (final amount including tips and costs)\n", 32 | "
|-- payment_type = тип оплаты" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from pyspark.sql import SparkSession\n", 42 | "from pyspark.sql.functions import col" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "spark = SparkSession.builder.appName('PySparkTasks').getOrCreate()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "spark.conf.set(\"spark.sql.session.timeZone\", \"GMT+3\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "spark" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Скачайте taxi_data.parquet и загрузите используя SparkAPI" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df = #Ваш код загрузки" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "№1 Посчитайте количество загруженных строк." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Число строк" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "df.show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Посмотрим схему данных:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "df.printSchema()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "№2 Чему равна корреляция и ковариация между длиной маршрута и ценой за поездку? Ответ округлите до 5 знаков после запятой.\n", 134 | "
Подробнее corr & cov" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Ваш код" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "№3 Найдите количество, среднее, cреднеквадратическое отклонение, минимум и максимум для длины маршрута и цены за поездку? Ответ округлите до 1 знака после запятой. Подробнее describe" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Ваш код" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "№4 Найдите самый НЕпопулярный вид оплаты.\n", 167 | "
Подробнее groupBy orderBy" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Ваш код" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "№5 Найдите идентификатор таксиста выполнившего наибольшее число заказов." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# Ваш код" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "№6 Чему равна средняя цена среди поездок, оплаченных наличными? Ответ округлите до 5 знака.\n", 200 | "
Подробней where" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# Ваш код" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "№7 Сколько таксистов проехало больше 1000 миль за все время выполнения заказов?" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Ваш код" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "№8 Сколько миль проехал пассажир в самой долгой поездке? (Ответ округлите до целого)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# Ваш код" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "№9 Каков средний заработок всех таксистов? Ответ округлите до 5-ого знака.\n", 249 | "
Отсеките неизвестные машины (не определенный taxi_id)." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Ваш код" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "№10 Сколько поездок начиналось в самый загруженный час?\n", 266 | "
Используйте функцию hour" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from pyspark.sql.functions import hour" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# Ваш код" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "№11 Сколько поездок началось во второй четверти суток?" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# Ваш код" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "№12 Найдите топ три даты, в которые было суммарно больше всего чаевых? (Чаевые выдаются после совершения поездки)\n", 308 | "
Expected date format: YYYY-MM-DD\n", 309 | "
Вам может понадобится конвертация типов cast" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from pyspark.sql.types import DateType" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Ваш код" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "№13 Сколько было заказов в дату с наибольшим спросом?" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Ваш код" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "Подгрузите данные о марках машин из датасета taxi_cars_data.parquet" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "df_car = # Ваш код загрузки" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df_car.show()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "№14 Какая марка машины самая распрастранненая среди таксистов?\n", 376 | "
Подробнее split" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from pyspark.sql.functions import split" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# Ваш код" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "№15 Сколько раз и какая модель машин чаще всего встречается в поездках?\n", 402 | "
Подробнее join" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Ваш код" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Почувствуй силу сжатия! сохрани DataFrame в csv и сравни размеры файлов." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Ваш код с coalesce(1)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "Теперь загрузите данные из csv и проверьте типы методом printSchema()." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "# Ваш код с printSchema() для DataFrame из csv" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Не забудьте посетить SparkUI и изучить историю ваших задач." 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "spark" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [] 468 | } 469 | ], 470 | "metadata": { 471 | "kernelspec": { 472 | "display_name": "Python 3", 473 | "language": "python", 474 | "name": "python3" 475 | }, 476 | "language_info": { 477 | "codemirror_mode": { 478 | "name": "ipython", 479 | "version": 3 480 | }, 481 | "file_extension": ".py", 482 | "mimetype": "text/x-python", 483 | "name": "python", 484 | "nbconvert_exporter": "python", 485 | "pygments_lexer": "ipython3", 486 | "version": "3.7.4" 487 | } 488 | }, 489 | "nbformat": 4, 490 | "nbformat_minor": 2 491 | } 492 | -------------------------------------------------------------------------------- /Week3/Jupyter/notebooks/PySparkTasksTemplate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# @@@@@@@ @ @ @ @ #\n", 10 | "# @ @ @ @ @ @ @ #\n", 11 | "# @ @ @ @ @@ @ #\n", 12 | "# @ @@@@@@@ @ @ @ #\n", 13 | "# @ @ @ @ @ @ #" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Привет, в этой практике мы с вами применим наши знания по PySpark и постараемся изучить что-то новое в процессе выполнения.\n", 21 | "
This lesson uses a dataset built from the Chicago Taxi Rides 2016 data\n", 22 | "
Full PySpark documentation.\n", 23 | "
Data schema:\n", 24 | "
|-- taxi_id = taxi driver identifier\n", 25 | "
|-- trip_start_timestamp = trip start time\n", 26 | "
|-- trip_end_timestamp = trip end time\n", 27 | "
|-- trip_seconds = trip duration in seconds\n", 28 | "
|-- trip_miles = miles traveled during the trip\n", 29 | "
|-- fare = transport costs\n", 30 | "
|-- tips = tips given\n", 31 | "
|-- trip_total = total trip cost (final amount including tips and costs)\n", 32 | "
|-- payment_type = тип оплаты" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from pyspark.sql import SparkSession\n", 42 | "from pyspark.sql.functions import col" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "spark = SparkSession.builder.appName('PySparkTasks').getOrCreate()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "spark.conf.set(\"spark.sql.session.timeZone\", \"GMT+3\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "spark" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Скачайте taxi_data.parquet и загрузите используя SparkAPI" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df = #Ваш код загрузки" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "№1 Посчитайте количество загруженных строк." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Число строк" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "df.show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Посмотрим схему данных:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "df.printSchema()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "№2 Чему равна корреляция и ковариация между длиной маршрута и ценой за поездку? Ответ округлите до 5 знаков после запятой.\n", 134 | "
Подробнее corr & cov" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Ваш код" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "№3 Найдите количество, среднее, cреднеквадратическое отклонение, минимум и максимум для длины маршрута и цены за поездку? Ответ округлите до 1 знака после запятой. Подробнее describe" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Ваш код" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "№4 Найдите самый НЕпопулярный вид оплаты.\n", 167 | "
Подробнее groupBy orderBy" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Ваш код" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "№5 Найдите идентификатор таксиста выполнившего наибольшее число заказов." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# Ваш код" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "№6 Чему равна средняя цена среди поездок, оплаченных наличными? Ответ округлите до 5 знака.\n", 200 | "
Подробней where" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# Ваш код" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "№7 Сколько таксистов проехало больше 1000 миль за все время выполнения заказов?" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Ваш код" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "№8 Сколько миль проехал пассажир в самой долгой поездке? (Ответ округлите до целого)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# Ваш код" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "№9 Каков средний заработок всех таксистов? Ответ округлите до 5-ого знака.\n", 249 | "
Отсеките неизвестные машины (не определенный taxi_id)." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Ваш код" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "№10 Сколько поездок начиналось в самый загруженный час?\n", 266 | "
Используйте функцию hour" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from pyspark.sql.functions import hour" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# Ваш код" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "№11 Сколько поездок началось во второй четверти суток?" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# Ваш код" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "№12 Найдите топ три даты, в которые было суммарно больше всего чаевых? (Чаевые выдаются после совершения поездки)\n", 308 | "
Expected date format: YYYY-MM-DD\n", 309 | "
Вам может понадобится конвертация типов cast" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from pyspark.sql.types import DateType" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Ваш код" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "№13 Сколько было заказов в дату с наибольшим спросом?" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Ваш код" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "Подгрузите данные о марках машин из датасета taxi_cars_data.parquet" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "df_car = # Ваш код загрузки" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df_car.show()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "№14 Какая марка машины самая распрастранненая среди таксистов?\n", 376 | "
Подробнее split" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from pyspark.sql.functions import split" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# Ваш код" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "№15 Сколько раз и какая модель машин чаще всего встречается в поездках?\n", 402 | "
Подробнее join" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Ваш код" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Почувствуй силу сжатия! сохрани DataFrame в csv и сравни размеры файлов." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Ваш код с coalesce(1)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "Теперь загрузите данные из csv и проверьте типы методом printSchema()." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "# Ваш код с printSchema() для DataFrame из csv" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Не забудьте посетить SparkUI и изучить историю ваших задач." 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "spark" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [] 468 | } 469 | ], 470 | "metadata": { 471 | "kernelspec": { 472 | "display_name": "Python 3", 473 | "language": "python", 474 | "name": "python3" 475 | }, 476 | "language_info": { 477 | "codemirror_mode": { 478 | "name": "ipython", 479 | "version": 3 480 | }, 481 | "file_extension": ".py", 482 | "mimetype": "text/x-python", 483 | "name": "python", 484 | "nbconvert_exporter": "python", 485 | "pygments_lexer": "ipython3", 486 | "version": "3.7.4" 487 | } 488 | }, 489 | "nbformat": 4, 490 | "nbformat_minor": 2 491 | } 492 | -------------------------------------------------------------------------------- /Week3/Jupyter/notebooks/taxi_cars_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week3/Jupyter/notebooks/taxi_cars_data.parquet -------------------------------------------------------------------------------- /Week3/Jupyter/notebooks/taxi_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week3/Jupyter/notebooks/taxi_data.parquet -------------------------------------------------------------------------------- /Week3/Project/PySparkJob.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.functions import col, datediff 5 | from pyspark.sql import functions as F 6 | 7 | 8 | def process(spark, input_file, target_path): 9 | # TODO Ваш код 10 | 11 | 12 | def main(argv): 13 | input_path = argv[0] 14 | print("Input path to file: " + input_path) 15 | target_path = argv[1] 16 | print("Target path: " + target_path) 17 | spark = _spark_session() 18 | process(spark, input_path, target_path) 19 | 20 | 21 | def _spark_session(): 22 | return SparkSession.builder.appName('PySparkJob').getOrCreate() 23 | 24 | 25 | if __name__ == "__main__": 26 | arg = sys.argv[1:] 27 | if len(arg) != 2: 28 | sys.exit("Input and Target path are require.") 29 | else: 30 | 
main(arg) 31 | -------------------------------------------------------------------------------- /Week3/Project/clickstream.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week3/Project/clickstream.parquet -------------------------------------------------------------------------------- /Week3/help.txt: -------------------------------------------------------------------------------- 1 | Для работы необходимы: 2 | 1) Python 3.X+ 3 | 2) Jupyter Notebook 4 | 3) PySpark 5 | pip install pyspark 6 | 7 | Или используйте docker конфигурацию в папке Jupyter 8 | 1) cd Jupyter 9 | 2) docker-compose up 10 | -------------------------------------------------------------------------------- /Week3/spark-practice/PySparkTitanikJob.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "from pyspark.sql.functions import split,col,avg" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "spark = SparkSession.builder.appName('PySparkTitanikJob').getOrCreate()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "\n", 31 | "
\n", 32 | "

SparkSession - in-memory

\n", 33 | " \n", 34 | "
\n", 35 | "

SparkContext

\n", 36 | "\n", 37 | "

Spark UI

\n", 38 | "\n", 39 | "
\n", 40 | "
Version
\n", 41 | "
v2.4.3
\n", 42 | "
Master
\n", 43 | "
local[*]
\n", 44 | "
AppName
\n", 45 | "
PySparkTitanikJob
\n", 46 | "
\n", 47 | "
\n", 48 | " \n", 49 | "
\n", 50 | " " 51 | ], 52 | "text/plain": [ 53 | "" 54 | ] 55 | }, 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "spark" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df = spark.read.option('header','true').csv('train.csv')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 10, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+\n", 84 | "|PassengerId|Survived|Pclass| Name| Sex|Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|\n", 85 | "+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+\n", 86 | "| 1| 0| 3|Braund, Mr. Owen ...| male| 22| 1| 0|A/5 21171| 7.25| null| S|\n", 87 | "| 2| 1| 1|Cumings, Mrs. Joh...|female| 38| 1| 0| PC 17599|71.2833| C85| C|\n", 88 | "+-----------+--------+------+--------------------+------+---+-----+-----+---------+-------+-----+--------+\n", 89 | "only showing top 2 rows\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "df.show(2)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 15, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "[('PassengerId', 'string'),\n", 107 | " ('Survived', 'string'),\n", 108 | " ('Pclass', 'string'),\n", 109 | " ('Name', 'string'),\n", 110 | " ('Sex', 'string'),\n", 111 | " ('Age', 'string'),\n", 112 | " ('SibSp', 'string'),\n", 113 | " ('Parch', 'string'),\n", 114 | " ('Ticket', 'string'),\n", 115 | " ('Fare', 'string'),\n", 116 | " ('Cabin', 'string'),\n", 117 | " ('Embarked', 'string')]" 118 | ] 119 | }, 120 | "execution_count": 15, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "df.dtypes" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 16, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "+-------+------+------------------+\n", 139 | "|summary| Sex| Age|\n", 140 | "+-------+------+------------------+\n", 141 | "| count| 891| 714|\n", 142 | "| mean| null| 29.69911764705882|\n", 143 | "| stddev| null|14.526497332334035|\n", 144 | "| min|female| 0.42|\n", 145 | "| max| male| 9|\n", 146 | "+-------+------+------------------+\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "df.describe(['Sex', 'Age']).show()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 17, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "891" 164 | ] 165 | }, 166 | "execution_count": 17, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "df.count()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 18, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "df = df.drop('Ticket', 'Name', 'Fare','Cabin')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 19, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "+-----------+--------+------+------+----+-----+-----+--------+\n", 194 | "|PassengerId|Survived|Pclass| Sex| Age|SibSp|Parch|Embarked|\n", 
195 | "+-----------+--------+------+------+----+-----+-----+--------+\n", 196 | "| 1| 0| 3| male| 22| 1| 0| S|\n", 197 | "| 2| 1| 1|female| 38| 1| 0| C|\n", 198 | "| 3| 1| 3|female| 26| 0| 0| S|\n", 199 | "| 4| 1| 1|female| 35| 1| 0| S|\n", 200 | "| 5| 0| 3| male| 35| 0| 0| S|\n", 201 | "| 6| 0| 3| male|null| 0| 0| Q|\n", 202 | "| 7| 0| 1| male| 54| 0| 0| S|\n", 203 | "| 8| 0| 3| male| 2| 3| 1| S|\n", 204 | "| 9| 1| 3|female| 27| 0| 2| S|\n", 205 | "| 10| 1| 2|female| 14| 1| 0| C|\n", 206 | "| 11| 1| 3|female| 4| 1| 1| S|\n", 207 | "| 12| 1| 1|female| 58| 0| 0| S|\n", 208 | "| 13| 0| 3| male| 20| 0| 0| S|\n", 209 | "| 14| 0| 3| male| 39| 1| 5| S|\n", 210 | "| 15| 0| 3|female| 14| 0| 0| S|\n", 211 | "| 16| 1| 2|female| 55| 0| 0| S|\n", 212 | "| 17| 0| 3| male| 2| 4| 1| Q|\n", 213 | "| 18| 1| 2| male|null| 0| 0| S|\n", 214 | "| 19| 0| 3|female| 31| 1| 0| S|\n", 215 | "| 20| 1| 3|female|null| 0| 0| C|\n", 216 | "+-----------+--------+------+------+----+-----+-----+--------+\n", 217 | "only showing top 20 rows\n", 218 | "\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "df.show()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 28, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "df = df.withColumn('FamilySize', col('SibSp') + col('Parch') + 1)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 29, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 245 | "|PassengerId|Survived|Pclass| Sex|Age|SibSp|Parch|Embarked|FamilySize|\n", 246 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 247 | "| 1| 0| 3| male| 22| 1| 0| S| 2.0|\n", 248 | "| 2| 1| 1|female| 38| 1| 0| C| 2.0|\n", 249 | "| 3| 1| 3|female| 26| 0| 0| S| 1.0|\n", 250 | "| 4| 1| 1|female| 35| 1| 0| S| 2.0|\n", 251 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 252 | "only showing top 4 rows\n", 253 | "\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "df.show(4)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 30, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "177" 270 | ] 271 | }, 272 | "execution_count": 30, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "df.where(col('Age').isNull()).count()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 33, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "avg_age = df.select(avg(col('Age'))).collect()[0][0]" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 36, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "ndf = df.fillna({'Age': avg_age})" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 37, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 309 | "|PassengerId|Survived|Pclass| Sex|Age|SibSp|Parch|Embarked|FamilySize|\n", 310 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 311 | "| 1| 0| 3| male| 22| 1| 0| S| 2.0|\n", 312 | "| 2| 1| 1|female| 38| 1| 0| C| 2.0|\n", 313 | "| 3| 1| 3|female| 26| 0| 0| S| 1.0|\n", 314 | "| 4| 1| 1|female| 35| 1| 0| S| 2.0|\n", 315 | "| 5| 0| 3| 
male| 35| 0| 0| S| 1.0|\n", 316 | "+-----------+--------+------+------+---+-----+-----+--------+----------+\n", 317 | "only showing top 5 rows\n", 318 | "\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "ndf.show(5)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 43, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "+------+\n", 336 | "| Sex|\n", 337 | "+------+\n", 338 | "|female|\n", 339 | "| male|\n", 340 | "+------+\n", 341 | "\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "ndf[['Sex']].distinct().show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 44, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "ndf = ndf.withColumn('M', col('Sex') == 'male')" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 45, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "ndf = ndf.withColumn('W', col('Sex') == 'female')" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 46, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+\n", 377 | "|PassengerId|Survived|Pclass| Sex|Age|SibSp|Parch|Embarked|FamilySize| M| W|\n", 378 | "+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+\n", 379 | "| 1| 0| 3| male| 22| 1| 0| S| 2.0| true|false|\n", 380 | "| 2| 1| 1|female| 38| 1| 0| C| 2.0|false| true|\n", 381 | "+-----------+--------+------+------+---+-----+-----+--------+----------+-----+-----+\n", 382 | "only showing top 2 rows\n", 383 | "\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "ndf.show(2)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 47, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "ndf = ndf.drop('Sex')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 48, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+\n", 410 | "|PassengerId|Survived|Pclass|Age|SibSp|Parch|Embarked|FamilySize| M| W|\n", 411 | "+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+\n", 412 | "| 1| 0| 3| 22| 1| 0| S| 2.0| true|false|\n", 413 | "| 2| 1| 1| 38| 1| 0| C| 2.0|false| true|\n", 414 | "| 3| 1| 3| 26| 0| 0| S| 1.0|false| true|\n", 415 | "| 4| 1| 1| 35| 1| 0| S| 2.0|false| true|\n", 416 | "| 5| 0| 3| 35| 0| 0| S| 1.0| true|false|\n", 417 | "+-----------+--------+------+---+-----+-----+--------+----------+-----+-----+\n", 418 | "only showing top 5 rows\n", 419 | "\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "ndf.show(5)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 51, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "ndf.coalesce(1).write.option('header','true').csv('clear_data')" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python 3", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | 
"file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.6.5" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 2 465 | } 466 | -------------------------------------------------------------------------------- /Week3/spark-tasks/PySparkTasksTemplate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# @@@@@@@ @ @ @ @ #\n", 10 | "# @ @ @ @ @ @ @ #\n", 11 | "# @ @ @ @ @@ @ #\n", 12 | "# @ @@@@@@@ @ @ @ #\n", 13 | "# @ @ @ @ @ @ #" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Привет, в этой практике мы с вами применим наши знания по PySpark и постараемся изучить что-то новое в процессе выполнения.\n", 21 | "
This lesson uses a dataset built from the Chicago Taxi Rides 2016 data\n", 22 | "
Full PySpark documentation.\n", 23 | "
Data schema:\n", 24 | "
|-- taxi_id = taxi driver identifier\n", 25 | "
|-- trip_start_timestamp = trip start time\n", 26 | "
|-- trip_end_timestamp = trip end time\n", 27 | "
|-- trip_seconds = trip duration in seconds\n", 28 | "
|-- trip_miles = miles traveled during the trip\n", 29 | "
|-- fare = transport costs\n", 30 | "
|-- tips = tips given\n", 31 | "
|-- trip_total = total trip cost (final amount including tips and costs)\n", 32 | "
|-- payment_type = тип оплаты" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from pyspark.sql import SparkSession\n", 42 | "from pyspark.sql.functions import col" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "spark = SparkSession.builder.appName('PySparkTasks').getOrCreate()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "spark.conf.set(\"spark.sql.session.timeZone\", \"GMT+3\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "spark" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Скачайте taxi_data.parquet и загрузите используя SparkAPI" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df = #Ваш код загрузки" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "№1 Посчитайте количество загруженных строк." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Число строк" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "df.show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Посмотрим схему данных:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "df.printSchema()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "№2 Чему равна корреляция и ковариация между длиной маршрута и ценой за поездку? Ответ округлите до 5 знаков после запятой.\n", 134 | "
Подробнее corr & cov" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Ваш код" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "№3 Найдите количество, среднее, cреднеквадратическое отклонение, минимум и максимум для длины маршрута и цены за поездку? Ответ округлите до 1 знака после запятой. Подробнее describe" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Ваш код" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "№4 Найдите самый НЕпопулярный вид оплаты.\n", 167 | "
Подробнее groupBy orderBy" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Ваш код" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "№5 Найдите идентификатор таксиста выполнившего наибольшее число заказов." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# Ваш код" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "№6 Чему равна средняя цена среди поездок, оплаченных наличными? Ответ округлите до 5 знака.\n", 200 | "
Подробней where" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# Ваш код" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "№7 Сколько таксистов проехало больше 1000 миль за все время выполнения заказов?" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Ваш код" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "№8 Сколько миль проехал пассажир в самой долгой поездке? (Ответ округлите до целого)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# Ваш код" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "№9 Каков средний заработок всех таксистов? Ответ округлите до 5-ого знака.\n", 249 | "
Отсеките неизвестные машины (не определенный taxi_id)." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Ваш код" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "№10 Сколько поездок начиналось в самый загруженный час?\n", 266 | "
Используйте функцию hour" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from pyspark.sql.functions import hour" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# Ваш код" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "№11 Сколько поездок началось во второй четверти суток?" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# Ваш код" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "№12 Найдите топ три даты, в которые было суммарно больше всего чаевых? (Чаевые выдаются после совершения поездки)\n", 308 | "
Expected date format: YYYY-MM-DD\n", 309 | "
Вам может понадобится конвертация типов cast" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from pyspark.sql.types import DateType" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Ваш код" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "№13 Сколько было заказов в дату с наибольшим спросом?" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Ваш код" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "Подгрузите данные о марках машин из датасета taxi_cars_data.parquet" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "df_car = # Ваш код загрузки" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df_car.show()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "№14 Какая марка машины самая распрастранненая среди таксистов?\n", 376 | "
Подробнее split" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from pyspark.sql.functions import split" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# Ваш код" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "№15 Сколько раз и какая модель машин чаще всего встречается в поездках?\n", 402 | "
Подробнее join" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Ваш код" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Почувствуй силу сжатия! сохрани DataFrame в csv и сравни размеры файлов." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Ваш код с coalesce(1)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "Теперь загрузите данные из csv и проверьте типы методом printSchema()." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "# Ваш код с printSchema() для DataFrame из csv" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Не забудьте посетить SparkUI и изучить историю ваших задач." 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "spark" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [] 468 | } 469 | ], 470 | "metadata": { 471 | "kernelspec": { 472 | "display_name": "Python 3", 473 | "language": "python", 474 | "name": "python3" 475 | }, 476 | "language_info": { 477 | "codemirror_mode": { 478 | "name": "ipython", 479 | "version": 3 480 | }, 481 | "file_extension": ".py", 482 | "mimetype": "text/x-python", 483 | "name": "python", 484 | "nbconvert_exporter": "python", 485 | "pygments_lexer": "ipython3", 486 | "version": "3.7.4" 487 | } 488 | }, 489 | "nbformat": 4, 490 | "nbformat_minor": 2 491 | } 492 | -------------------------------------------------------------------------------- /Week3/spark-tasks/taxi_cars_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week3/spark-tasks/taxi_cars_data.parquet -------------------------------------------------------------------------------- /Week3/spark-tasks/taxi_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week3/spark-tasks/taxi_data.parquet -------------------------------------------------------------------------------- /Week4/Airflow/container-data/airflow/dags/calculate_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python_operator import PythonOperator 5 | 6 | 7 | def load(): 8 | return int(Variable.get('value')) 9 | 10 | 11 | def multiply(**ctx): 12 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='load_task') 13 | x = x * 25 14 | return x 15 | 16 | 17 | def plus(**ctx): 18 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 19 | x = x + 5 20 | return x 21 | 22 | 23 | def upload(**ctx): 24 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus_task') 25 | Variable.set('result', x) 26 | 27 | 28 | dag = DAG(dag_id='calculation-dag', 29 | start_date=datetime(2020, 3, 30), 30 | 
schedule_interval='@once') 31 | 32 | load_task = PythonOperator(task_id='load_task', 33 | python_callable=load, 34 | dag=dag) 35 | multiply_task = PythonOperator(task_id='multiply_task', 36 | python_callable=multiply, 37 | provide_context=True, 38 | dag=dag) 39 | plus_task = PythonOperator(task_id='plus_task', 40 | python_callable=plus, 41 | provide_context=True, 42 | dag=dag) 43 | upload_task = PythonOperator(task_id='upload_task', 44 | python_callable=upload, 45 | provide_context=True, 46 | dag=dag) 47 | 48 | 49 | load_task >> multiply_task >> plus_task >> upload_task 50 | -------------------------------------------------------------------------------- /Week4/Airflow/container-data/airflow/dags/calculate_parallel_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python_operator import PythonOperator 5 | 6 | 7 | def load(): 8 | return int(Variable.get('value')) 9 | 10 | 11 | def multiply(**ctx): 12 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='load_task') 13 | x = x * 25 14 | return x 15 | 16 | 17 | def plus5(**ctx): 18 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 19 | x = x + 5 20 | return x 21 | 22 | 23 | def plus10(**ctx): 24 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 25 | x = x + 10 26 | return x 27 | 28 | 29 | def upload1(**ctx): 30 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus5_task') 31 | Variable.set('result1', x) 32 | 33 | 34 | def upload2(**ctx): 35 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus10_task') 36 | Variable.set('result2', x) 37 | 38 | 39 | dag = DAG(dag_id='calculation-parallel-dag', 40 | start_date=datetime(2020, 3, 30), 41 | schedule_interval='@once') 42 | 43 | load_task = PythonOperator(task_id='load_task', 44 | python_callable=load, 45 | dag=dag) 46 | multiply_task = PythonOperator(task_id='multiply_task', 47 | python_callable=multiply, 48 | provide_context=True, 49 | dag=dag) 50 | plus5_task = PythonOperator(task_id='plus5_task', 51 | python_callable=plus5, 52 | provide_context=True, 53 | dag=dag) 54 | plus10_task = PythonOperator(task_id='plus10_task', 55 | python_callable=plus10, 56 | provide_context=True, 57 | dag=dag) 58 | upload1_task = PythonOperator(task_id='upload1_task', 59 | python_callable=upload1, 60 | provide_context=True, 61 | dag=dag) 62 | upload2_task = PythonOperator(task_id='upload2_task', 63 | python_callable=upload2, 64 | provide_context=True, 65 | dag=dag) 66 | 67 | 68 | load_task >> multiply_task 69 | multiply_task >> plus5_task >> upload1_task 70 | multiply_task >> plus10_task >> upload2_task 71 | -------------------------------------------------------------------------------- /Week4/Airflow/container-data/airflow/dags/dummy_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.operators.dummy_operator import DummyOperator 4 | 5 | 6 | dag = DAG(dag_id='dummy-dag', 7 | start_date=datetime(2020, 3, 30), 8 | schedule_interval='@once') 9 | 10 | t1 = DummyOperator(task_id='task1', dag=dag) 11 | t2 = DummyOperator(task_id='task2', dag=dag) 12 | t3 = DummyOperator(task_id='task3', dag=dag) 13 | t4 = DummyOperator(task_id='task4', dag=dag) 14 | t5 = DummyOperator(task_id='task5', dag=dag) 15 | t6 = DummyOperator(task_id='task6', dag=dag) 16 | t7 = DummyOperator(task_id='task7', 
dag=dag) 17 | t8 = DummyOperator(task_id='task8', dag=dag) 18 | t9 = DummyOperator(task_id='task9', dag=dag) 19 | t10 = DummyOperator(task_id='task10', dag=dag) 20 | t11 = DummyOperator(task_id='task11', dag=dag) 21 | t12 = DummyOperator(task_id='task12', dag=dag) 22 | t13 = DummyOperator(task_id='task13', dag=dag) 23 | t14 = DummyOperator(task_id='task14', dag=dag) 24 | t15 = DummyOperator(task_id='task15', dag=dag) 25 | t16 = DummyOperator(task_id='task16', dag=dag) 26 | t17 = DummyOperator(task_id='task17', dag=dag) 27 | t18 = DummyOperator(task_id='task18', dag=dag) 28 | t19 = DummyOperator(task_id='task19', dag=dag) 29 | t20 = DummyOperator(task_id='task20', dag=dag) 30 | 31 | t1 >> t2 32 | t2 >> t3 >> t5 >> t7 33 | t2 >> t4 >> t6 >> t7 34 | t2 >> t8 >> t7 35 | t2 >> t9 >> t7 36 | t7 >> t10 37 | t10 >> t11 >> t15 38 | t10 >> t12 >> t15 39 | t10 >> t13 >> t15 40 | t10 >> t14 >> t15 41 | t15 >> t16 42 | t16 >> t17 43 | t16 >> t18 44 | t16 >> t19 45 | t16 >> t20 46 | -------------------------------------------------------------------------------- /Week4/Airflow/container-data/airflow/dags/hello_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG, AirflowException 3 | from datetime import datetime 4 | from airflow.operators.python_operator import PythonOperator 5 | from airflow.utils.trigger_rule import TriggerRule 6 | 7 | 8 | def print_1(): 9 | return 'Step 1' 10 | 11 | 12 | def print_2(): 13 | #raise AirflowException('Oops!') #Uncomment to make error 14 | return 'Step 2' 15 | 16 | 17 | def print_3(): 18 | return 'Step 3' 19 | 20 | 21 | def print_hello(): 22 | return 'Hello Wolrd' 23 | 24 | 25 | dag = DAG('hello-world-dag', 26 | start_date=datetime(2020, 3, 30), 27 | description='Hello world example', 28 | schedule_interval='@once') 29 | 30 | step1 = PythonOperator(task_id='step1', python_callable=print_1, dag=dag) 31 | step2 = PythonOperator(task_id='step2', python_callable=print_2, dag=dag) 32 | step3 = PythonOperator(task_id='step3', python_callable=print_3, dag=dag) 33 | hello_operator = PythonOperator(task_id='hello_task', 34 | python_callable=print_hello, 35 | #trigger_rule=TriggerRule.ONE_SUCCESS, #Uncomment to skip error 36 | dag=dag) 37 | 38 | step1 >> [step2, step3] 39 | step2 >> hello_operator 40 | step3 >> hello_operator 41 | -------------------------------------------------------------------------------- /Week4/Airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | services: 3 | redis: 4 | image: 'redis:5.0.5' 5 | # command: redis-server --requirepass redispass 6 | 7 | postgres: 8 | image: postgres:9.6 9 | environment: 10 | - POSTGRES_USER=airflow 11 | - POSTGRES_PASSWORD=airflow 12 | - POSTGRES_DB=airflow 13 | # Uncomment these lines to persist data on the local filesystem. 
14 | # - PGDATA=/var/lib/postgresql/data/pgdata 15 | # volumes: 16 | # - ./pgdata:/var/lib/postgresql/data/pgdata 17 | 18 | webserver: 19 | image: puckel/docker-airflow:1.10.4 20 | restart: always 21 | depends_on: 22 | - postgres 23 | - redis 24 | environment: 25 | - LOAD_EX=n 26 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 27 | - EXECUTOR=Celery 28 | # - POSTGRES_USER=airflow 29 | # - POSTGRES_PASSWORD=airflow 30 | # - POSTGRES_DB=airflow 31 | # - REDIS_PASSWORD=redispass 32 | volumes: 33 | - ./container-data/airflow/dags:/usr/local/airflow/dags 34 | # Uncomment to include custom plugins 35 | # - ./plugins:/usr/local/airflow/plugins 36 | ports: 37 | - "8080:8080" 38 | command: webserver 39 | healthcheck: 40 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 41 | interval: 30s 42 | timeout: 30s 43 | retries: 3 44 | 45 | flower: 46 | image: puckel/docker-airflow:1.10.4 47 | restart: always 48 | depends_on: 49 | - redis 50 | environment: 51 | - EXECUTOR=Celery 52 | # - REDIS_PASSWORD=redispass 53 | ports: 54 | - "5555:5555" 55 | command: flower 56 | 57 | scheduler: 58 | image: puckel/docker-airflow:1.10.4 59 | restart: always 60 | depends_on: 61 | - webserver 62 | volumes: 63 | - ./container-data/airflow/dags:/usr/local/airflow/dags 64 | # Uncomment to include custom plugins 65 | # - ./plugins:/usr/local/airflow/plugins 66 | environment: 67 | - LOAD_EX=n 68 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 69 | - EXECUTOR=Celery 70 | # - POSTGRES_USER=airflow 71 | # - POSTGRES_PASSWORD=airflow 72 | # - POSTGRES_DB=airflow 73 | # - REDIS_PASSWORD=redispass 74 | command: scheduler 75 | 76 | worker: 77 | image: puckel/docker-airflow:1.10.4 78 | restart: always 79 | depends_on: 80 | - scheduler 81 | volumes: 82 | - ./container-data/airflow/dags:/usr/local/airflow/dags 83 | # Uncomment to include custom plugins 84 | # - ./plugins:/usr/local/airflow/plugins 85 | environment: 86 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 87 | - EXECUTOR=Celery 88 | # - POSTGRES_USER=airflow 89 | # - POSTGRES_PASSWORD=airflow 90 | # - POSTGRES_DB=airflow 91 | # - REDIS_PASSWORD=redispass 92 | command: worker 93 | -------------------------------------------------------------------------------- /Week4/help.txt: -------------------------------------------------------------------------------- 1 | Выполнять из папки Week4/Airflow 2 | 1) Запуск контейнеров 3 | docker-compose up -d 4 | 2) Проверка запущенных контейнеров 5 | docker ps 6 | 3) UI Airflow 7 | http://localhost:8080 8 | 4) DAGs которые подгружаются в Airflow Week4/Airflow/container-data/airflow/dags 9 | Вы можете менять их или добавлять свои не останавливая контейнер 10 | 5) Остановка контейнеров 11 | docker-compose down -------------------------------------------------------------------------------- /Week5/SparkML/Project/PySparkMLFit.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | 4 | from pyspark.ml import Pipeline 5 | from pyspark.ml.evaluation import RegressionEvaluator 6 | from pyspark.ml.feature import VectorAssembler 7 | from pyspark.ml.regression import LinearRegression 8 | from pyspark.sql import SparkSession 9 | 10 | # Используйте как путь куда сохранить модель 11 | MODEL_PATH = 'spark_ml_model' 12 | 13 | 14 | def process(spark, train_data, test_data): 15 | #train_data - путь к файлу с данными для обучения модели 16 | #test_data - путь к файлу с данными для оценки качества модели 17 | #Ваш код 18 | 19 | 20 | def 
main(argv): 21 | train_data = argv[0] 22 | print("Input path to train data: " + train_data) 23 | test_data = argv[1] 24 | print("Input path to test data: " + test_data) 25 | spark = _spark_session() 26 | process(spark, train_data, test_data) 27 | 28 | 29 | def _spark_session(): 30 | return SparkSession.builder.appName('PySparkMLFitJob').getOrCreate() 31 | 32 | 33 | if __name__ == "__main__": 34 | arg = sys.argv[1:] 35 | if len(arg) != 2: 36 | sys.exit("Train and test data are require.") 37 | else: 38 | main(arg) 39 | -------------------------------------------------------------------------------- /Week5/SparkML/Project/PySparkMLPredict.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | 4 | from pyspark.ml import PipelineModel 5 | from pyspark.sql import SparkSession 6 | 7 | # Используйте как путь откуда загрузить модель 8 | MODEL_PATH = 'spark_ml_model' 9 | 10 | 11 | def process(spark, input_file, output_file): 12 | #input_file - путь к файлу с данными для которых нужно предсказать ctr 13 | #output_file - путь по которому нужно сохранить файл с результатами [ads_id, prediction] 14 | #Ваш код 15 | 16 | 17 | def main(argv): 18 | input_path = argv[0] 19 | print("Input path to file: " + input_path) 20 | output_file = argv[1] 21 | print("Output path to file: " + output_file) 22 | spark = _spark_session() 23 | process(spark, input_path, output_file) 24 | 25 | 26 | def _spark_session(): 27 | return SparkSession.builder.appName('PySparkMLPredict').getOrCreate() 28 | 29 | 30 | if __name__ == "__main__": 31 | arg = sys.argv[1:] 32 | if len(arg) != 2: 33 | sys.exit("Input and Target path are require.") 34 | else: 35 | main(arg) 36 | -------------------------------------------------------------------------------- /Week5/SparkML/Project/test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/Project/test.parquet -------------------------------------------------------------------------------- /Week5/SparkML/Project/train.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/Project/train.parquet -------------------------------------------------------------------------------- /Week5/SparkML/spark-practice/cat_dog/PySparkMLDL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install sparkdl keras tensorflow==1.13.1 tensorframes==0.2.7 tensorflowonspark==1.3.0 jieba" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"-1\" " 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n", 32 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as 
(type, (1,)) / '(1,)type'.\n", 33 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 34 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 35 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 36 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 37 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 38 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 39 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 40 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 41 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 42 | "/Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 43 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "from pyspark.sql import SparkSession\n", 49 | "from pyspark.ml import Pipeline\n", 50 | "from pyspark.ml.feature import VectorAssembler\n", 51 | "from sparkdl import readImages\n", 52 | "import tensorflow as tf\n", 53 | "import tensorframes as tfs" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "spark = SparkSession.builder \\\n", 63 | " .appName(\"PySparkCatDogJob\") \\\n", 64 | " .config(\"spark.jars\", \"*.jar\") \\\n", 65 | " .config(\"packages\",\"databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11,databricks:tensorframes:0.6.0-s_2.11\") \\\n", 66 | " .getOrCreate()\n", 67 | "\n", 68 | "sc = spark.sparkContext\n", 69 | "sc.PACKAGE_EXTENSIONS=(\"databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11,databricks:tensorframes:0.6.0-s_2.11\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/html": [ 80 | "\n", 81 | "
[SparkSession HTML summary: SparkSession - in-memory | SparkContext | Spark UI | Version: v2.4.4 | Master: local[*] | AppName: PySparkCatDogJob]
\n", 100 | " " 101 | ], 102 | "text/plain": [ 103 | "" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "spark" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from pyspark.sql.functions import lit" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "cat_df = readImages(\"cat_dog/cat\").withColumn(\"label\", lit(0))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 8, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "dog_df = readImages(\"cat_dog/dog\").withColumn(\"label\", lit(1))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "root\n", 152 | " |-- filePath: string (nullable = false)\n", 153 | " |-- image: struct (nullable = true)\n", 154 | " | |-- mode: string (nullable = false)\n", 155 | " | |-- height: integer (nullable = false)\n", 156 | " | |-- width: integer (nullable = false)\n", 157 | " | |-- nChannels: integer (nullable = false)\n", 158 | " | |-- data: binary (nullable = false)\n", 159 | " |-- label: integer (nullable = false)\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "cat_df.printSchema()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "(cat_df_train, cat_df_test) = cat_df.randomSplit([0.7, 0.3])\n", 175 | "(dog_df_train, dog_df_test) = dog_df.randomSplit([0.7, 0.3])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "df_train = cat_df_train.union(dog_df_train)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 12, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "df_test = cat_df_test.union(dog_df_test)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 16, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "216" 205 | ] 206 | }, 207 | "execution_count": 16, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "df_train.count()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "84" 225 | ] 226 | }, 227 | "execution_count": 17, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "df_test.count()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 15, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "WARNING:tensorflow:From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 246 | "Instructions for updating:\n", 247 | "Colocations handled automatically by placer.\n" 248 | ] 249 | }, 250 | { 251 | "name": "stderr", 252 | "output_type": "stream", 
253 | "text": [ 254 | "2019-12-09 22:53:22,898 WARNING (MainThread-99959) From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 255 | "Instructions for updating:\n", 256 | "Colocations handled automatically by placer.\n" 257 | ] 258 | }, 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "WARNING:tensorflow:From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/sparkdl/graph/utils.py:189: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", 264 | "Instructions for updating:\n", 265 | "Use tf.compat.v1.graph_util.convert_variables_to_constants\n" 266 | ] 267 | }, 268 | { 269 | "name": "stderr", 270 | "output_type": "stream", 271 | "text": [ 272 | "2019-12-09 22:53:31,085 WARNING (MainThread-99959) From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/sparkdl/graph/utils.py:189: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", 273 | "Instructions for updating:\n", 274 | "Use tf.compat.v1.graph_util.convert_variables_to_constants\n" 275 | ] 276 | }, 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "WARNING:tensorflow:From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/graph_util_impl.py:245: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", 282 | "Instructions for updating:\n", 283 | "Use tf.compat.v1.graph_util.extract_sub_graph\n" 284 | ] 285 | }, 286 | { 287 | "name": "stderr", 288 | "output_type": "stream", 289 | "text": [ 290 | "2019-12-09 22:53:31,086 WARNING (MainThread-99959) From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/tensorflow/python/framework/graph_util_impl.py:245: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", 291 | "Instructions for updating:\n", 292 | "Use tf.compat.v1.graph_util.extract_sub_graph\n" 293 | ] 294 | }, 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "INFO:tensorflow:Froze 376 variables.\n" 300 | ] 301 | }, 302 | { 303 | "name": "stderr", 304 | "output_type": "stream", 305 | "text": [ 306 | "2019-12-09 22:53:31,582 INFO (MainThread-99959) Froze 376 variables.\n" 307 | ] 308 | }, 309 | { 310 | "name": "stdout", 311 | "output_type": "stream", 312 | "text": [ 313 | "INFO:tensorflow:Converted 376 variables to const ops.\n" 314 | ] 315 | }, 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "2019-12-09 22:53:31,780 INFO (MainThread-99959) Converted 376 variables to const ops.\n" 321 | ] 322 | }, 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "WARNING:tensorflow:From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/sparkdl/transformers/tf_image.py:178: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 328 | "Instructions for updating:\n", 329 | "Use tf.cast 
instead.\n" 330 | ] 331 | }, 332 | { 333 | "name": "stderr", 334 | "output_type": "stream", 335 | "text": [ 336 | "2019-12-09 22:53:49,286 WARNING (MainThread-99959) From /Users/aleksandrsavchenko/opt/anaconda3/envs/stepik-ds-course/lib/python3.7/site-packages/sparkdl/transformers/tf_image.py:178: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 337 | "Instructions for updating:\n", 338 | "Use tf.cast instead.\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "INFO:tensorflow:Froze 0 variables.\n" 346 | ] 347 | }, 348 | { 349 | "name": "stderr", 350 | "output_type": "stream", 351 | "text": [ 352 | "2019-12-09 22:53:50,284 INFO (MainThread-99959) Froze 0 variables.\n" 353 | ] 354 | }, 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "INFO:tensorflow:Converted 0 variables to const ops.\n" 360 | ] 361 | }, 362 | { 363 | "name": "stderr", 364 | "output_type": "stream", 365 | "text": [ 366 | "2019-12-09 22:53:50,401 INFO (MainThread-99959) Converted 0 variables to const ops.\n", 367 | "2019-12-09 22:53:50,983 INFO (MainThread-99959) Fetch names: ['sdl_flattened_mixed10/concat:0']\n", 368 | "2019-12-09 22:53:50,984 INFO (MainThread-99959) Spark context = \n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "from pyspark.ml.classification import LogisticRegression\n", 374 | "from pyspark.ml import Pipeline\n", 375 | "from sparkdl import DeepImageFeaturizer\n", 376 | "featurizer = DeepImageFeaturizer(inputCol=\"image\", outputCol=\"features\", modelName=\"InceptionV3\")\n", 377 | "lr = LogisticRegression(maxIter=1, regParam=0.05, elasticNetParam=0.3, labelCol=\"label\")\n", 378 | "p = Pipeline(stages=[featurizer, lr]) \n", 379 | "p_model = p.fit(df_train)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 18, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "Test set accuracy = 0.78756454354\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator \n", 397 | "tested_df = p_model.transform(df_test)\n", 398 | "evaluator = MulticlassClassificationEvaluator(metricName=\"accuracy\") \n", 399 | "print(\"Test set accuracy = \" + str(evaluator.evaluate(tested_df.select(\"prediction\", \"label\"))))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 19, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "spark.stop()" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.7.5" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 2 440 | } 441 | -------------------------------------------------------------------------------- /Week5/SparkML/spark-practice/cat_dog/cat_dog.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-practice/cat_dog/cat_dog.zip -------------------------------------------------------------------------------- /Week5/SparkML/spark-practice/cat_dog/scala-logging_2.11-3.9.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-practice/cat_dog/scala-logging_2.11-3.9.2.jar -------------------------------------------------------------------------------- /Week5/SparkML/spark-practice/cat_dog/tensorframes-0.6.0-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-practice/cat_dog/tensorframes-0.6.0-s_2.11.jar -------------------------------------------------------------------------------- /Week5/SparkML/spark-tasks/SparkMLTemplate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "from pyspark.ml import Pipeline\n", 11 | "from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit\n", 12 | "from pyspark.ml.feature import StringIndexer\n", 13 | "from pyspark.ml.feature import VectorAssembler\n", 14 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 15 | "from pyspark.ml.feature import QuantileDiscretizer\n", 16 | "from pyspark.ml.classification import LogisticRegression\n", 17 | "from pyspark.ml.classification import DecisionTreeClassifier\n", 18 | "from pyspark.ml.classification import RandomForestClassifier\n", 19 | "from pyspark.ml.classification import GBTClassifier\n", 20 | "from pyspark.ml.clustering import KMeans\n", 21 | "from pyspark.ml.evaluation import ClusteringEvaluator\n", 22 | "from pyspark.ml.regression import LinearRegression" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import random\n", 32 | "import numpy as np\n", 33 | "#set seed\n", 34 | "random.seed(1234)\n", 35 | "np.random.seed(1234)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "spark = SparkSession.builder.appName(\"PySparkML\").getOrCreate()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "spark" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## №1 Линейная регрессия" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Загрузите данные для применения линейной регрессии linear_regression.parquet" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "lr_df = #Ваш код" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "lr_df.show(5)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Создайте учителя линейной регресии со следующими 
параметрами:\n", 93 | "maxIter=20\n", 94 | "regParam=0.5\n", 95 | "elasticNetParam=0.75
\n", 96 | "LinearRegression" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "lr = #Ваш код" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Выполните обучения на загруженных данных и сохраните результат в переменную." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "lrModel = #Ваш код" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Найдите следующие параметры полученной модели rootMeanSquaredError (RMSE), r2 и округлити их до 3его знака." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "#Ваш код" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## №2 Кластеризация (K-Means)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Загрузите данные для применения из wine.parquet" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "wine_df = #Ваш код" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "wine_df.show(5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Примените VectorAssembler для создания вектора фич (задействуйте все свойства, кроме Customer_Segment)." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "feature = #Ваш код" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Cоздайте учителя KMeans со следующими параметрами K=3 Seed=1
\n", 193 | "Обучите модель и примените ее к тому же вектору.\n", 194 | "Документация по KMeans" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "#Ваш код" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "Найдите силуэт с евклидовым расстоянием в квадрате для данных по вину(округлите до четвертого знака).\n", 211 | "
ClusteringEvaluator" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "#Ваш код" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## №3 DecisionTreeClassifier" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Загрузити датасет из файла iris.parquet" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "iris_df = #Ваш код" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "iris_df.show(5)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Составьте из наших признаков вектор применив VectorAssembler как колонку features.\n", 260 | "
Задействуйте все признаки, кроме species, так как он является целевым." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "#Ваш код" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Используйте StringIndexer и сделайте новый признак с именем type из целевого признака species который является категориальным." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "#Ваш код" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "iris_df.show(5)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "Сформируем выборки на обучение и тест." 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "(training_data, test_data) = iris_df.randomSplit([0.8, 0.2],seed = 42)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "Создайте и обучите DecisionTreeClassifier на датасете для обучения.\n", 318 | "Полученную модель используйте над тестовым датасетом." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "from pyspark.ml.classification import DecisionTreeClassifier\n", 328 | "#Ваш код" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "Используйте MulticlassClassificationEvaluator (помните что целевая фича это - type) для оценки качества модели по метрике accuracy.
Какая точность полученной модели?" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "#Ваш код" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## №4 Random forest" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "Создайте и обучите RandomForestClassifier из 10 деревьев на датасете для обучения.\n", 359 | "Полученную модель примените к тестовому датасету.\n" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "from pyspark.ml.classification import RandomForestClassifier\n", 369 | "#Ваш код" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "Используйте MulticlassClassificationEvaluator (помните что целевая фича это - type) для оценки качества модели по метрике accuracy.
Какая точность полученной модели?" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "#Ваш код" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## №5 Hyperparameter tuning" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "Займемся оптимизацией гиперпараметров для модели.\n", 400 | "Примените TrainValidationSplit для оптимизации гиперпараметров используя подготовленный вами выше датасет iris.parquet на модели RandomForestClassifier совместно с MulticlassClassificationEvaluator.\n", 401 | "
Ваша цель определить оптимальные значения параметров из следующих диапазонов:\n", 402 | "
impurity = [\"entropy\", \"gini\"]\n", 403 | "
maxDepth = [2, 3, 4, 5]\n", 404 | "
numTrees = [3, 6, 9, 12, 15, 18, 21]" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "model = #Ваш код" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "print('Num Trees: {}'.format(model.bestModel._java_obj.getNumTrees()))\n", 432 | "print('Max Depth: {}'.format(model.bestModel._java_obj.getMaxDepth()))\n", 433 | "print('Impurity: {}'.format(model.bestModel._java_obj.getImpurity()))" 434 | ] 435 | } 436 | ], 437 | "metadata": { 438 | "kernelspec": { 439 | "display_name": "Python 3", 440 | "language": "python", 441 | "name": "python3" 442 | }, 443 | "language_info": { 444 | "codemirror_mode": { 445 | "name": "ipython", 446 | "version": 3 447 | }, 448 | "file_extension": ".py", 449 | "mimetype": "text/x-python", 450 | "name": "python", 451 | "nbconvert_exporter": "python", 452 | "pygments_lexer": "ipython3", 453 | "version": "3.7.4" 454 | } 455 | }, 456 | "nbformat": 4, 457 | "nbformat_minor": 2 458 | } 459 | -------------------------------------------------------------------------------- /Week5/SparkML/spark-tasks/iris.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-tasks/iris.parquet -------------------------------------------------------------------------------- /Week5/SparkML/spark-tasks/linear_regression.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-tasks/linear_regression.parquet -------------------------------------------------------------------------------- /Week5/SparkML/spark-tasks/wine.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/Week5/SparkML/spark-tasks/wine.parquet -------------------------------------------------------------------------------- /Week5/help.txt: -------------------------------------------------------------------------------- 1 | Для работы необходимы: 2 | 1) Python 3.X+ 3 | 2) Jupyter Notebook 4 | 3) PySpark 5 | pip install pyspark -------------------------------------------------------------------------------- /Week6/Superset/.env: -------------------------------------------------------------------------------- 1 | # metadata database environment variables. 2 | MYSQL_USER=superset 3 | MYSQL_PASSWORD=superset 4 | MYSQL_DATABASE=superset 5 | MYSQL_ROOT_PASSWORD=root 6 | 7 | # redis environment variables. 8 | REDIS_HOST=redis 9 | REDIS_PORT=6379 10 | 11 | # superset environment variables. 
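# These values are interpolated into docker-compose.yaml and passed to the superset
# container; config/superset_config.py reads MYSQL_HOST/MYSQL_PORT (together with the
# MYSQL_* credentials above) to build its SQLAlchemy URI when INVOCATION_TYPE=COMPOSE.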
12 | MYSQL_HOST=mysql 13 | MYSQL_PORT=3306 14 | SUPERSET_ENV=local 15 | SUPERSET_VERSION=0.29.0rc5 16 | -------------------------------------------------------------------------------- /Week6/Superset/config/superset_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from superset.config import * 3 | from werkzeug.contrib.cache import RedisCache 4 | 5 | 6 | def get_env_variable(var_name, default=None): 7 | """Get the environment variable or raise exception.""" 8 | try: 9 | return os.environ[var_name] 10 | except KeyError: 11 | if default is not None: 12 | return default 13 | else: 14 | error_msg = 'The environment variable {} was missing, abort...' \ 15 | .format(var_name) 16 | raise EnvironmentError(error_msg) 17 | 18 | invocation_type = get_env_variable('INVOCATION_TYPE') 19 | if invocation_type == 'COMPOSE': 20 | MYSQL_USER = get_env_variable('MYSQL_USER') 21 | MYSQL_PASS = get_env_variable('MYSQL_PASS') 22 | MYSQL_HOST = get_env_variable('MYSQL_HOST') 23 | MYSQL_PORT = get_env_variable('MYSQL_PORT') 24 | MYSQL_DATABASE = get_env_variable('MYSQL_DATABASE') 25 | 26 | # The SQLAlchemy connection string. 27 | SQLALCHEMY_DATABASE_URI = 'mysql://%s:%s@%s:%s/%s' % (MYSQL_USER, 28 | MYSQL_PASS, 29 | MYSQL_HOST, 30 | MYSQL_PORT, 31 | MYSQL_DATABASE) 32 | elif invocation_type == 'RUN': 33 | SQLALCHEMY_DATABASE_URI = get_env_variable('DB_URL') 34 | else: 35 | SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(DATA_DIR, 'superset.db') 36 | 37 | REDIS_HOST='' 38 | REDIS_PORT='' 39 | if invocation_type == 'COMPOSE': 40 | REDIS_HOST = get_env_variable('REDIS_HOST') 41 | REDIS_PORT = get_env_variable('REDIS_PORT') 42 | RESULTS_BACKEND = RedisCache(host=REDIS_HOST, port=REDIS_PORT, key_prefix='superset_results') 43 | elif invocation_type == 'RUN': 44 | REDIS_HOST = get_env_variable('REDIS_URL').split(":")[1].replace("/","") 45 | REDIS_PORT = get_env_variable('REDIS_URL').split(":")[2].replace("/0","") 46 | RESULTS_BACKEND = RedisCache(host=REDIS_HOST, port=REDIS_PORT, key_prefix='superset_results') 47 | else: 48 | RESULTS_BACKEND = None 49 | 50 | class CeleryConfig(object): 51 | BROKER_URL = ('redis://%s:%s/0' % (REDIS_HOST, REDIS_PORT), 'sqla+sqlite:///'+ os.path.join(DATA_DIR, 'celeryDB.db'))[bool(not REDIS_HOST)] 52 | CELERY_RESULT_BACKEND = ('redis://%s:%s/0' % (REDIS_HOST, REDIS_PORT), 'db+sqlite:///'+ os.path.join(DATA_DIR, 'celeryResultDB.db'))[bool(not REDIS_HOST)] 53 | CELERY_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}} 54 | CELERY_IMPORTS = ('superset.sql_lab', ) 55 | CELERY_TASK_PROTOCOL = 1 56 | 57 | 58 | CELERY_CONFIG = CeleryConfig 59 | -------------------------------------------------------------------------------- /Week6/Superset/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | redis: 4 | image: redis:3.2 5 | restart: always 6 | ports: 7 | - 6379:6379 8 | volumes: 9 | - redis:/data 10 | mysql: 11 | image: mysql:5.7 12 | restart: always 13 | environment: 14 | MYSQL_USER: ${MYSQL_USER} 15 | MYSQL_PASSWORD: ${MYSQL_PASSWORD} 16 | MYSQL_DATABASE: ${MYSQL_DATABASE} 17 | MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD} 18 | ports: 19 | - 3306:3306 20 | volumes: 21 | - mysql:/var/lib/mysql 22 | superset: 23 | image: abhioncbr/docker-superset:${SUPERSET_VERSION} 24 | restart: always 25 | environment: 26 | ADMIN_USERNAME: admin 27 | ADMIN_PWD: superset 28 | MYSQL_USER: ${MYSQL_USER} 29 | MYSQL_PASS: ${MYSQL_PASSWORD} 30 | MYSQL_DATABASE: 
${MYSQL_DATABASE} 31 | MYSQL_HOST: ${MYSQL_HOST} 32 | MYSQL_PORT: ${MYSQL_PORT} 33 | REDIS_HOST: ${REDIS_HOST} 34 | REDIS_PORT: ${REDIS_PORT} 35 | SUPERSET_ENV: ${SUPERSET_ENV} 36 | user: root:root 37 | ports: 38 | - 8088:8088 39 | - 5555:5555 40 | depends_on: 41 | - mysql 42 | - redis 43 | volumes: 44 | - ../config/:/home/superset/config/ 45 | volumes: 46 | mysql: 47 | external: false 48 | redis: 49 | external: false 50 | -------------------------------------------------------------------------------- /Week6/help.txt: -------------------------------------------------------------------------------- 1 | Выполнять из папки Week6/Superset 2 | 1) Запуск контейнеров 3 | docker-compose up -d 4 | 2) Проверка запущенных контейнеров 5 | docker ps 6 | 3) Web UI: http://localhost:8088 7 | login: admin 8 | password: admin 9 | 4) Работа с Superset 10 | ... 11 | 5) Остановка контейнеров 12 | docker-compose down -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/CAP-Theorem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/workshops/1. DB_Hadoop/CAP-Theorem.png -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | namenode: 5 | image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 6 | container_name: namenode 7 | restart: always 8 | ports: 9 | - 9870:9870 10 | - 9000:9000 11 | volumes: 12 | - hadoop_namenode:/hadoop/dfs/name 13 | - ./tmp/data:/course/data 14 | - ./tmp/input:/course/input 15 | - ./tmp/job:/job 16 | - ./tmp/init:/init 17 | #command: sh /init/init.sh 18 | environment: 19 | - CLUSTER_NAME=test 20 | env_file: 21 | - ./hadoop.env 22 | 23 | datanode: 24 | image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 25 | container_name: datanode 26 | restart: always 27 | volumes: 28 | - hadoop_datanode:/hadoop/dfs/data 29 | environment: 30 | SERVICE_PRECONDITION: "namenode:9870" 31 | env_file: 32 | - ./hadoop.env 33 | 34 | datanode2: 35 | image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 36 | container_name: datanode2 37 | restart: always 38 | volumes: 39 | - hadoop_datanode2:/hadoop/dfs/data 40 | environment: 41 | SERVICE_PRECONDITION: "namenode:9870" 42 | env_file: 43 | - ./hadoop.env 44 | 45 | datanode3: 46 | image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 47 | container_name: datanode3 48 | restart: always 49 | volumes: 50 | - hadoop_datanode3:/hadoop/dfs/data 51 | environment: 52 | SERVICE_PRECONDITION: "namenode:9870" 53 | env_file: 54 | - ./hadoop.env 55 | 56 | resourcemanager: 57 | image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 58 | container_name: resourcemanager 59 | restart: always 60 | ports: 61 | - 8088:8088 62 | environment: 63 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864" 64 | env_file: 65 | - ./hadoop.env 66 | 67 | nodemanager1: 68 | image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 69 | container_name: nodemanager 70 | restart: always 71 | environment: 72 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088" 73 | env_file: 74 | - ./hadoop.env 75 | 76 | historyserver: 77 | image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 78 | container_name: historyserver 79 | restart: always 80 | environment: 
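      # SERVICE_PRECONDITION is handled by the bde2020 image entrypoints (assumption
      # based on their documented behaviour): the container waits for each listed
      # host:port to become reachable before starting its own daemon.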
81 | SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088" 82 | volumes: 83 | - hadoop_historyserver:/hadoop/yarn/timeline 84 | env_file: 85 | - ./hadoop.env 86 | 87 | volumes: 88 | hadoop_namenode: 89 | hadoop_datanode: 90 | hadoop_datanode2: 91 | hadoop_datanode3: 92 | hadoop_historyserver: 93 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/hadoop.env: -------------------------------------------------------------------------------- 1 | CORE_CONF_fs_defaultFS=hdfs://namenode:9000 2 | CORE_CONF_hadoop_http_staticuser_user=root 3 | CORE_CONF_hadoop_proxyuser_hue_hosts=* 4 | CORE_CONF_hadoop_proxyuser_hue_groups=* 5 | CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec 6 | 7 | HDFS_CONF_dfs_webhdfs_enabled=true 8 | HDFS_CONF_dfs_permissions_enabled=false 9 | HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false 10 | 11 | YARN_CONF_yarn_log___aggregation___enable=true 12 | YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ 13 | YARN_CONF_yarn_resourcemanager_recovery_enabled=true 14 | YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore 15 | YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 16 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 17 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 18 | YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate 19 | YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true 20 | YARN_CONF_yarn_resourcemanager_hostname=resourcemanager 21 | YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 22 | YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 23 | YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 24 | YARN_CONF_yarn_timeline___service_enabled=true 25 | YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true 26 | YARN_CONF_yarn_timeline___service_hostname=historyserver 27 | YARN_CONF_mapreduce_map_output_compress=true 28 | YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec 29 | YARN_CONF_yarn_nodemanager_resource_memory___mb=16384 30 | YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8 31 | YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 32 | YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs 33 | YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle 34 | 35 | MAPRED_CONF_mapreduce_framework_name=yarn 36 | MAPRED_CONF_mapred_child_java_opts=-Xmx4096m 37 | MAPRED_CONF_mapreduce_map_memory_mb=4096 38 | MAPRED_CONF_mapreduce_reduce_memory_mb=8192 39 | MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m 40 | MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m 41 | MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ 42 | MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ 43 | MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ -------------------------------------------------------------------------------- /workshops/1. 
DB_Hadoop/docker-hadoop/init4win.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | docker cp ./tmp/data namenode:/course/data 3 | docker cp ./tmp/input namenode:/course/input 4 | docker cp ./tmp/job namenode:/job 5 | docker cp ./tmp/init namenode:/init 6 | ECHO All data has been copied successfully. -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/data/checkdata.txt: -------------------------------------------------------------------------------- 1 | Some data 2 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/data/checkdata2.txt: -------------------------------------------------------------------------------- 1 | 0101020201020304038974589381219038190288237520834203482039529874359273648723658743753094589023842093742983759843658734643808203482935839485739464389572037409238420938023975394857394653487230423094820934820385739486538746583457289470239482093842093850394739846794836598347528947092384209384209375938457394865834752837402398402938409235830945739845769384653984752093480298405938450934759836798347534759304597 2 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/data/checkdata3.txt: -------------------------------------------------------------------------------- 1 | 0101020201020304038974589381219038190288237520834203482039529874359273648723658743753094589023842093742983759843658734643808203482935839485739464389572037409238420938023975394857394653487230423094820934820385739486538746583457289470239482093842093850394739846794836598347528947092384209384209375938457394865834752837402398402938409235830945739845769384653984752093480298405938450934759836798347534759304597 2 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/data/checkdata4.txt: -------------------------------------------------------------------------------- 1 | Data4 2 | 3 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/data/checkdata5.txt: -------------------------------------------------------------------------------- 1 | Data5 2 | 3 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/init/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | hdfs dfs -mkdir /data 3 | hdfs dfs -mkdir /input 4 | hdfs dfs -put /course/data/* /data/ 5 | hdfs dfs -put /course/input/* /input/ 6 | echo "Initialization complete" -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/docker-hadoop/tmp/input/input.txt: -------------------------------------------------------------------------------- 1 | my data so big my data so big my data so big my data so big 2 | -------------------------------------------------------------------------------- /workshops/1. 
DB_Hadoop/hadoop-with-hive/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Testing 3 | Load data into Hive: 4 | ``` 5 | docker-compose exec hive-server bash 6 | 7 | hdfs dfs -mkdir /data 8 | 9 | hdfs dfs -put /data/users_20210501.csv /data 10 | hdfs dfs -put /data/users_20210502.csv /data 11 | 12 | hive 13 | 14 | CREATE EXTERNAL TABLE IF NOT EXISTS users( 15 | id INT, 16 | login STRING, 17 | email STRING, 18 | active BOOLEAN, 19 | organization_id INT) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY ',' 22 | STORED AS TEXTFILE 23 | location 'hdfs:///data'; 24 | 25 | ``` 26 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/hadoop-with-hive/data/create.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS users( 2 | id INT, 3 | login STRING, 4 | email STRING, 5 | active BOOLEAN, 6 | organization_id INT) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY ',' 9 | STORED AS TEXTFILE 10 | location 'hdfs:///data'; -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/hadoop-with-hive/data/init.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -mkdir /data 2 | hdfs dfs -put /data/users_20210501.csv /data 3 | hdfs dfs -put /data/users_20210502.csv /data 4 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/hadoop-with-hive/data/users_20210501.csv: -------------------------------------------------------------------------------- 1 | 1,gregor,gregor@fake.com,true,1001 2 | 2,jess,jess@fake.com,true,1001 3 | 3,pikachu,pikachu@fake.com,true,1001 4 | 4,jerry,jerry@fake.com,false,1001 5 | 5,lolly,lolly@fake.com,false,1001 6 | -------------------------------------------------------------------------------- /workshops/1. DB_Hadoop/hadoop-with-hive/data/users_20210502.csv: -------------------------------------------------------------------------------- 1 | 6,stella,stella@fake.com,true,1001 2 | 7,bella,bella@fake.com,true,1001 3 | 8,ingrid,ingrid@fake.com,true,1001 4 | 9,freya,freya@fake.com,true,1001 5 | 10,zik,zik@fake.com,true,1001 6 | -------------------------------------------------------------------------------- /workshops/1. 
DB_Hadoop/hadoop-with-hive/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | namenode: 5 | image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8 6 | container_name: namenode 7 | ports: 8 | - 9870:9870 9 | - 9000:9000 10 | - 50070:50070 11 | volumes: 12 | - hadoop_namenode:/hadoop/dfs/name 13 | environment: 14 | - CLUSTER_NAME=test 15 | env_file: 16 | - ./hadoop.env 17 | 18 | datanode: 19 | image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8 20 | container_name: datanode 21 | ports: 22 | - 50075:50075 23 | volumes: 24 | - hadoop_datanode:/hadoop/dfs/data 25 | environment: 26 | SERVICE_PRECONDITION: "namenode:50070" 27 | env_file: 28 | - ./hadoop.env 29 | 30 | resourcemanager: 31 | image: bde2020/hadoop-resourcemanager:2.0.0-hadoop2.7.4-java8 32 | container_name: resourcemanager 33 | ports: 34 | - 8088:8088 35 | environment: 36 | SERVICE_PRECONDITION: "namenode:50070 datanode:50075" 37 | env_file: 38 | - ./hadoop.env 39 | 40 | nodemanager: 41 | image: bde2020/hadoop-nodemanager:2.0.0-hadoop2.7.4-java8 42 | container_name: nodemanager 43 | environment: 44 | SERVICE_PRECONDITION: "namenode:50070 datanode:50075 resourcemanager:8088" 45 | env_file: 46 | - ./hadoop.env 47 | 48 | historyserver: 49 | image: bde2020/hadoop-historyserver:2.0.0-hadoop2.7.4-java8 50 | container_name: historyserver 51 | environment: 52 | SERVICE_PRECONDITION: "namenode:50070 datanode:50075 resourcemanager:8088" 53 | volumes: 54 | - hadoop_historyserver:/hadoop/yarn/timeline 55 | env_file: 56 | - ./hadoop.env 57 | 58 | hive-metastore-postgresql: 59 | image: bde2020/hive-metastore-postgresql:2.3.0 60 | container_name: hive-metastore-postgresql 61 | 62 | hive-server: 63 | image: bde2020/hive:2.3.2-postgresql-metastore 64 | container_name: hive-server 65 | ports: 66 | - 10000:10000 67 | volumes: 68 | - ./data:/data 69 | environment: 70 | HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore" 71 | SERVICE_PRECONDITION: "hive-metastore:9083" 72 | env_file: 73 | - ./hadoop.env 74 | 75 | hive-metastore: 76 | image: bde2020/hive:2.3.2-postgresql-metastore 77 | container_name: hive-metastore 78 | ports: 79 | - 9083:9083 80 | environment: 81 | SERVICE_PRECONDITION: "namenode:50070 datanode:50075 hive-metastore-postgresql:5432" 82 | env_file: 83 | - ./hadoop.env 84 | command: /opt/hive/bin/hive --service metastore 85 | 86 | volumes: 87 | hadoop_namenode: 88 | hadoop_datanode: 89 | hadoop_historyserver: 90 | -------------------------------------------------------------------------------- /workshops/1. 
DB_Hadoop/hadoop-with-hive/hadoop.env: -------------------------------------------------------------------------------- 1 | CORE_CONF_fs_defaultFS=hdfs://namenode:8020 2 | CORE_CONF_hadoop_http_staticuser_user=root 3 | CORE_CONF_hadoop_proxyuser_hue_hosts=* 4 | CORE_CONF_hadoop_proxyuser_hue_groups=* 5 | CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec 6 | 7 | HDFS_CONF_dfs_webhdfs_enabled=true 8 | HDFS_CONF_dfs_permissions_enabled=false 9 | HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false 10 | 11 | YARN_CONF_yarn_log___aggregation___enable=true 12 | YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ 13 | YARN_CONF_yarn_resourcemanager_recovery_enabled=true 14 | YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore 15 | YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 16 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 17 | YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 18 | YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate 19 | YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true 20 | YARN_CONF_yarn_resourcemanager_hostname=resourcemanager 21 | YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 22 | YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 23 | YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 24 | YARN_CONF_yarn_timeline___service_enabled=true 25 | YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true 26 | YARN_CONF_yarn_timeline___service_hostname=historyserver 27 | YARN_CONF_mapreduce_map_output_compress=true 28 | YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec 29 | YARN_CONF_yarn_nodemanager_resource_memory___mb=8192 30 | YARN_CONF_yarn_nodemanager_resource_cpu___vcores=4 31 | YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 32 | YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs 33 | YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle 34 | 35 | MAPRED_CONF_mapreduce_framework_name=yarn 36 | MAPRED_CONF_mapred_child_java_opts=-Xmx1024m 37 | MAPRED_CONF_mapreduce_map_memory_mb=512 38 | MAPRED_CONF_mapreduce_reduce_memory_mb=1024 39 | MAPRED_CONF_mapreduce_map_java_opts=-Xmx512m 40 | MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx1024m 41 | 42 | HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore 43 | HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver 44 | HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive 45 | HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive 46 | HIVE_SITE_CONF_datanucleus_autoCreateSchema=false 47 | HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083 48 | -------------------------------------------------------------------------------- /workshops/2. 
Spark/PySparkShow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#spark = SparkSession.builder.appName(\"TestApp\").master('spark://localhost:7077').getOrCreate()\n", 19 | "spark = SparkSession.builder.appName(\"TestApp\").getOrCreate()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "spark" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "df = spark.read.parquet('taxi_data.parquet')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "type(df)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "type(df.rdd)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df.rdd.getNumPartitions()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "df.repartition(16).write.parquet(\"tmp\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "ndf = spark.read.parquet('tmp')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "ndf.count()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "df.count()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "df.groupBy(\"taxi_id\").count().explain()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "df.groupBy(\"taxi_id\").count().show()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "l = list(df.collect())" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "type(l)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "l[0]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "df.toPandas().head()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "dfp = df.groupBy('taxi_id').count().orderBy('count', ascending=False).limit(1)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "type(dfp)" 173 | ] 174 | }, 175 | { 
176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "dfp.show()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "spark.read.parquet('taxi_data.parquet')\\\n", 191 | " .groupBy('taxi_id').count()\\\n", 192 | " .orderBy('count',ascending=False)\\\n", 193 | " .limit(1)\\\n", 194 | " .show()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "import pyspark.sql.functions as F\n", 204 | "df = spark.read.parquet('taxi_data.parquet')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "df = df.filter(F.col('tips') > 0)\n", 214 | "df.cache()\n", 215 | "print('Count of events with tips: {}'.format(df.count()))\n", 216 | "df.select('taxi_id', 'tips').groupBy(['taxi_id', 'tips']).sum('tips').show(5)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from pyspark import StorageLevel\n", 226 | "df = spark.read.parquet('taxi_data.parquet')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "df.persist(StorageLevel.MEMORY_AND_DISK)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "df.persist(StorageLevel.DISK_ONLY)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "dch = df.cache() # == StorageLevel.MEMORY_AND_DISK (the default for DataFrame.cache() in Spark 2.x)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "dch.show()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "df.groupBy('taxi_id') \\\n", 272 | ".agg({'trip_start_timestamp': 'max', 'trip_start_timestamp': 'min'}) \\\n", 273 | ".show()  # NB: duplicate dict keys collapse, so only the 'min' aggregation is actually applied" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from pyspark.sql.functions import min, max, col\n", 283 | "df.groupBy('taxi_id') \\\n", 284 | ".agg(min(col('trip_start_timestamp')).alias('min_time'), max(col('trip_start_timestamp')).alias('max_time')) \\\n", 285 | ".show()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "spark.stop()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | } 304 | ], 305 | "metadata": { 306 | "kernelspec": { 307 | "display_name": "Python 3", 308 | "language": "python", 309 | "name": "python3" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3", 321 | "version": "3.7.4" 322 | } 323 | }, 324 | "nbformat": 4, 325 | "nbformat_minor": 2 326 | } 327 | 
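A standalone recap of the notebook's final aggregation, written as a small job in the style of the spark-submit examples in the Spark workshop's help.txt further below. This is a minimal sketch, not a file from the repository: the script name taxi_minmax.py and the output path taxi_agg.parquet are illustrative, and it assumes taxi_data.parquet is available in the working directory.

# taxi_minmax.py - minimal sketch recapping the notebook's min/max aggregation.
# Assumes taxi_data.parquet is in the working directory; the output path is illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min as min_, max as max_


def main():
    spark = SparkSession.builder.appName('TaxiMinMax').getOrCreate()
    df = spark.read.parquet('taxi_data.parquet')

    # Explicit column functions are used instead of a dict such as
    # {'trip_start_timestamp': 'max', 'trip_start_timestamp': 'min'},
    # where the duplicate key keeps only the last aggregation.
    result = (df.groupBy('taxi_id')
                .agg(min_(col('trip_start_timestamp')).alias('min_time'),
                     max_(col('trip_start_timestamp')).alias('max_time')))

    result.show(5)
    result.write.mode('overwrite').parquet('taxi_agg.parquet')
    spark.stop()


if __name__ == '__main__':
    main()

Such a script could be submitted the same way as PySparkJob.py in the help.txt below, for example spark-submit --master local --deploy-mode client taxi_minmax.py.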
-------------------------------------------------------------------------------- /workshops/2. Spark/Spark/PySparkJob.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | 6 | def main(): 7 | spark = SparkSession.builder.appName('PySparkJob').getOrCreate() 8 | 9 | if __name__ == "__main__": 10 | main() 11 | -------------------------------------------------------------------------------- /workshops/2. Spark/Spark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | networks: 4 | default: 5 | driver: bridge 6 | 7 | services: 8 | # =============== Spark services =============== 9 | spark-master: 10 | image: bde2020/spark-master:2.4.1-hadoop2.7 11 | hostname: spark-master 12 | networks: 13 | - default 14 | environment: 15 | SPARK_PUBLIC_DNS: 0.0.0.0 16 | INIT_DAEMON_STEP: setup_spark 17 | #SPARK_DEPLOY_DEFAULTCORES: 2 18 | expose: 19 | - 4040 20 | - 7001 21 | - 7002 22 | - 7003 23 | - 7004 24 | - 7005 25 | - 7006 26 | - 7077 27 | - 6066 28 | ports: 29 | - 4040:4040 30 | - 6066:6066 31 | - 7077:7077 32 | - 8080:8080 33 | - 20021:20021 34 | volumes: 35 | - ./container-data/spark-master/conf:/conf 36 | - ./container-data/spark-master/data:/tmp/data 37 | - ./container-data/spark-master/work:/spark/work 38 | - ./container-data/spark-allnodes:/jar 39 | - ./container-data/spark-allnodes:/etc/spark/apps 40 | 41 | spark-worker-1: 42 | image: bde2020/spark-worker:2.4.1-hadoop2.7 43 | networks: 44 | - default 45 | depends_on: 46 | - "spark-master" 47 | hostname: spark-worker-1 48 | environment: 49 | SPARK_MASTER: spark://spark-master:7077 50 | SPARK_PUBLIC_DNS: 0.0.0.0 51 | SPARK_WORKER_CORES: 2 52 | SPARK_WORKER_MEMORY: 3g 53 | SPARK_WORKER_PORT: 8881 54 | SPARK_WORKER_WEBUI_PORT: 8081 55 | expose: 56 | - 7012 57 | - 7013 58 | - 7014 59 | - 7015 60 | - 7016 61 | - 8881 62 | ports: 63 | - 8081:8081 64 | volumes: 65 | - ./container-data/spark-worker-1/config:/conf 66 | - ./container-data/spark-worker-1/data:/tmp/data 67 | - ./container-data/spark-worker-1/work:/spark/work 68 | - ./container-data/spark-allnodes:/jar 69 | - ./container-data/spark-allnodes:/etc/spark/apps 70 | 71 | spark-worker-2: 72 | image: bde2020/spark-worker:2.4.1-hadoop2.7 73 | networks: 74 | - default 75 | depends_on: 76 | - "spark-master" 77 | hostname: spark-worker-2 78 | environment: 79 | SPARK_MASTER: spark://spark-master:7077 80 | SPARK_PUBLIC_DNS: 0.0.0.0 81 | SPARK_WORKER_CORES: 2 82 | SPARK_WORKER_MEMORY: 3g 83 | SPARK_WORKER_PORT: 8882 84 | SPARK_WORKER_WEBUI_PORT: 8082 85 | expose: 86 | - 7012 87 | - 7013 88 | - 7014 89 | - 7015 90 | - 7016 91 | - 8882 92 | ports: 93 | - 8082:8082 94 | volumes: 95 | - ./container-data/spark-worker-2/config:/conf 96 | - ./container-data/spark-worker-2/data:/tmp/data 97 | - ./container-data/spark-worker-2/work:/spark/work 98 | - ./container-data/spark-allnodes:/jar 99 | - ./container-data/spark-allnodes:/etc/spark/apps 100 | -------------------------------------------------------------------------------- /workshops/2. Spark/Spark/help.txt: -------------------------------------------------------------------------------- 1 | Master UI: http://localhost:8080/ 2 | spark-submit --master local --deploy-mode client PySparkJob.py 3 | spark-submit --master spark://localhost:7077 --deploy-mode client PySparkJob.py 4 | -------------------------------------------------------------------------------- /workshops/2. 
Spark/Streaming/WordStream/WordStream.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | 4 | def main(): 5 | spark = SparkSession \ 6 | .builder \ 7 | .appName("WordCountStream") \ 8 | .getOrCreate() 9 | 10 | lines = spark \ 11 | .readStream \ 12 | .format("kafka") \ 13 | .option("kafka.bootstrap.servers", "localhost:9092,localhost:9093,localhost:9094") \ 14 | .option("subscribe", "words") \ 15 | .load() 16 | 17 | df = lines.selectExpr("CAST(value AS STRING)").alias("value") 18 | 19 | query = df \ 20 | .writeStream \ 21 | .outputMode("append") \ 22 | .format("console") \ 23 | .start() 24 | 25 | query.awaitTermination() 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /workshops/2. Spark/Streaming/WordStream/commands.txt: -------------------------------------------------------------------------------- 1 | spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 --master local[*] --deploy-mode client WordStream.py > log.txt 2 | 3 | 4 | # Download Kafka here 5 | # https://kafka.apache.org/downloads (version 2.8) 6 | 7 | #Kafka 8 | kafka-topics --create --bootstrap-server localhost:9092,localhost:9093,localhost:9094 --replication-factor 1 --partitions 1 --topic words 9 | 10 | kafka-console-consumer --bootstrap-server localhost:9092,localhost:9093,localhost:9094 --topic words 11 | 12 | kafka-console-producer --broker-list localhost:9092,localhost:9093,localhost:9094 --topic words 13 | 14 | 15 | #Kafka sh 16 | ./kafka-topics.sh --create --bootstrap-server localhost:9092,localhost:9093,localhost:9094 --replication-factor 1 --partitions 1 --topic words 17 | 18 | ./kafka-console-consumer.sh --bootstrap-server localhost:9092,localhost:9093,localhost:9094 --topic words 19 | 20 | ./kafka-console-producer.sh --broker-list localhost:9092,localhost:9093,localhost:9094 --topic words 21 | -------------------------------------------------------------------------------- /workshops/2. 
Spark/Streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | networks: 4 | default: 5 | driver: bridge 6 | 7 | services: 8 | # =============== Zookeeper services =============== 9 | zoo1: 10 | image: zookeeper:3.4.9 11 | hostname: zoo1 12 | networks: 13 | - default 14 | ports: 15 | - "2181:2181" 16 | environment: 17 | ZOO_MY_ID: 1 18 | ZOO_PORT: 2181 19 | ZOO_SERVERS: server.1=zoo1:2888:3888 20 | volumes: 21 | - ./container-data/zoo1/data:/data 22 | - ./container-data/zoo1/datalog:/datalog 23 | # =============== Kafka services =============== 24 | kafka1: 25 | image: confluentinc/cp-kafka:5.2.2 26 | hostname: kafka1 27 | networks: 28 | - default 29 | ports: 30 | - "9092:9092" 31 | environment: 32 | KAFKA_ADVERTISED_LISTENERS: LISTENER_DOCKER_INTERNAL://kafka1:19092,LISTENER_DOCKER_EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9092 33 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: LISTENER_DOCKER_INTERNAL:PLAINTEXT,LISTENER_DOCKER_EXTERNAL:PLAINTEXT 34 | KAFKA_INTER_BROKER_LISTENER_NAME: LISTENER_DOCKER_INTERNAL 35 | KAFKA_ZOOKEEPER_CONNECT: "zoo1:2181" 36 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 37 | KAFKA_BROKER_ID: 1 38 | KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" 39 | volumes: 40 | - ./container-data/kafka1/data:/var/lib/kafka/data 41 | - ./container-data/kafka1/logs:/logs 42 | depends_on: 43 | - zoo1 44 | 45 | kafka2: 46 | image: confluentinc/cp-kafka:5.2.2 47 | hostname: kafka2 48 | networks: 49 | - default 50 | ports: 51 | - "9093:9093" 52 | environment: 53 | KAFKA_ADVERTISED_LISTENERS: LISTENER_DOCKER_INTERNAL://kafka2:19093,LISTENER_DOCKER_EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9093 54 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: LISTENER_DOCKER_INTERNAL:PLAINTEXT,LISTENER_DOCKER_EXTERNAL:PLAINTEXT 55 | KAFKA_INTER_BROKER_LISTENER_NAME: LISTENER_DOCKER_INTERNAL 56 | KAFKA_ZOOKEEPER_CONNECT: "zoo1:2181" 57 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 58 | KAFKA_BROKER_ID: 2 59 | KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" 60 | volumes: 61 | - ./container-data/kafka2/data:/var/lib/kafka/data 62 | - ./container-data/kafka2/logs:/logs 63 | depends_on: 64 | - zoo1 65 | 66 | kafka3: 67 | image: confluentinc/cp-kafka:5.2.2 68 | hostname: kafka3 69 | networks: 70 | - default 71 | ports: 72 | - "9094:9094" 73 | environment: 74 | KAFKA_ADVERTISED_LISTENERS: LISTENER_DOCKER_INTERNAL://kafka3:19094,LISTENER_DOCKER_EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9094 75 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: LISTENER_DOCKER_INTERNAL:PLAINTEXT,LISTENER_DOCKER_EXTERNAL:PLAINTEXT 76 | KAFKA_INTER_BROKER_LISTENER_NAME: LISTENER_DOCKER_INTERNAL 77 | KAFKA_ZOOKEEPER_CONNECT: "zoo1:2181" 78 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 79 | KAFKA_BROKER_ID: 3 80 | KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO" 81 | volumes: 82 | - ./container-data/kafka3/data:/var/lib/kafka/data 83 | - ./container-data/kafka3/logs:/logs 84 | depends_on: 85 | - zoo1 86 | -------------------------------------------------------------------------------- /workshops/2. Spark/taxi_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexKbit/stepik-ds-course/db4e2f3f61ad74291d1fcacabd83e43dd78392a3/workshops/2. 
Spark/taxi_data.parquet -------------------------------------------------------------------------------- /workshops/3. Airflow/container-data/airflow/dags/calculate_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python_operator import PythonOperator 5 | 6 | 7 | def load(): 8 | return int(Variable.get('value')) 9 | 10 | 11 | def multiply(**ctx): 12 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='load_task') 13 | x = x * 25 14 | return x 15 | 16 | 17 | def plus(**ctx): 18 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 19 | x = x + 5 20 | return x 21 | 22 | 23 | def upload(**ctx): 24 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus_task') 25 | Variable.set('result', x) 26 | 27 | 28 | dag = DAG(dag_id='calculation-dag', 29 | start_date=datetime(2020, 3, 30), 30 | schedule_interval='@once') 31 | 32 | load_task = PythonOperator(task_id='load_task', 33 | python_callable=load, 34 | dag=dag) 35 | multiply_task = PythonOperator(task_id='multiply_task', 36 | python_callable=multiply, 37 | provide_context=True, 38 | dag=dag) 39 | plus_task = PythonOperator(task_id='plus_task', 40 | python_callable=plus, 41 | provide_context=True, 42 | dag=dag) 43 | upload_task = PythonOperator(task_id='upload_task', 44 | python_callable=upload, 45 | provide_context=True, 46 | dag=dag) 47 | 48 | 49 | load_task >> multiply_task >> plus_task >> upload_task 50 | -------------------------------------------------------------------------------- /workshops/3. Airflow/container-data/airflow/dags/calculate_parallel_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.models import Variable 4 | from airflow.operators.python_operator import PythonOperator 5 | 6 | 7 | def load(): 8 | return int(Variable.get('value')) 9 | 10 | 11 | def multiply(**ctx): 12 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='load_task') 13 | x = x * 25 14 | return x 15 | 16 | 17 | def plus5(**ctx): 18 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 19 | x = x + 5 20 | return x 21 | 22 | 23 | def plus10(**ctx): 24 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='multiply_task') 25 | x = x + 10 26 | return x 27 | 28 | 29 | def upload1(**ctx): 30 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus5_task') 31 | Variable.set('result1', x) 32 | 33 | 34 | def upload2(**ctx): 35 | x = ctx['ti'].xcom_pull(key='return_value', task_ids='plus10_task') 36 | Variable.set('result2', x) 37 | 38 | 39 | dag = DAG(dag_id='calculation-parallel-dag', 40 | start_date=datetime(2020, 3, 30), 41 | schedule_interval='@once') 42 | 43 | load_task = PythonOperator(task_id='load_task', 44 | python_callable=load, 45 | dag=dag) 46 | multiply_task = PythonOperator(task_id='multiply_task', 47 | python_callable=multiply, 48 | provide_context=True, 49 | dag=dag) 50 | plus5_task = PythonOperator(task_id='plus5_task', 51 | python_callable=plus5, 52 | provide_context=True, 53 | dag=dag) 54 | plus10_task = PythonOperator(task_id='plus10_task', 55 | python_callable=plus10, 56 | provide_context=True, 57 | dag=dag) 58 | upload1_task = PythonOperator(task_id='upload1_task', 59 | python_callable=upload1, 60 | provide_context=True, 61 | dag=dag) 62 | upload2_task = PythonOperator(task_id='upload2_task', 63 | 
python_callable=upload2, 64 | provide_context=True, 65 | dag=dag) 66 | 67 | 68 | load_task >> multiply_task 69 | multiply_task >> plus5_task >> upload1_task 70 | multiply_task >> plus10_task >> upload2_task 71 | -------------------------------------------------------------------------------- /workshops/3. Airflow/container-data/airflow/dags/dummy_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.operators.dummy_operator import DummyOperator 4 | 5 | 6 | dag = DAG(dag_id='dummy-dag', 7 | start_date=datetime(2020, 3, 30), 8 | schedule_interval='@once') 9 | 10 | t1 = DummyOperator(task_id='task1', dag=dag) 11 | t2 = DummyOperator(task_id='task2', dag=dag) 12 | t3 = DummyOperator(task_id='task3', dag=dag) 13 | t4 = DummyOperator(task_id='task4', dag=dag) 14 | t5 = DummyOperator(task_id='task5', dag=dag) 15 | t6 = DummyOperator(task_id='task6', dag=dag) 16 | t7 = DummyOperator(task_id='task7', dag=dag) 17 | t8 = DummyOperator(task_id='task8', dag=dag) 18 | t9 = DummyOperator(task_id='task9', dag=dag) 19 | t10 = DummyOperator(task_id='task10', dag=dag) 20 | t11 = DummyOperator(task_id='task11', dag=dag) 21 | t12 = DummyOperator(task_id='task12', dag=dag) 22 | t13 = DummyOperator(task_id='task13', dag=dag) 23 | t14 = DummyOperator(task_id='task14', dag=dag) 24 | t15 = DummyOperator(task_id='task15', dag=dag) 25 | 26 | t1 >> t2 27 | t2 >> t3 >> t5 >> t7 28 | t2 >> t4 >> t6 >> t7 29 | t2 >> t8 >> t7 30 | t2 >> t9 >> t7 31 | t7 >> t10 32 | t10 >> t11 >> t15 33 | t10 >> t12 >> t15 34 | t10 >> t13 >> t15 35 | t10 >> t14 >> t15 36 | -------------------------------------------------------------------------------- /workshops/3. Airflow/container-data/airflow/dags/hello_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG, AirflowException 3 | from datetime import datetime 4 | from airflow.operators.python_operator import PythonOperator 5 | from airflow.utils.trigger_rule import TriggerRule 6 | 7 | 8 | def print_1(): 9 | return 'Step 1' 10 | 11 | 12 | def print_2(): 13 | raise AirflowException('Oops!')  # Comment this line out to remove the deliberate error 14 | return 'Step 2' 15 | 16 | 17 | def print_3(): 18 | return 'Step 3' 19 | 20 | 21 | def print_hello(): 22 | return 'Hello World' 23 | 24 | 25 | dag = DAG('hello-world-dag', 26 | start_date=datetime(2020, 3, 30), 27 | description='Hello world example', 28 | schedule_interval='@once') 29 | 30 | step1 = PythonOperator(task_id='step1', python_callable=print_1, dag=dag) 31 | step2 = PythonOperator(task_id='step2', python_callable=print_2, dag=dag) 32 | step3 = PythonOperator(task_id='step3', python_callable=print_3, dag=dag) 33 | hello_operator = PythonOperator(task_id='hello_task', 34 | python_callable=print_hello, 35 | trigger_rule=TriggerRule.ONE_SUCCESS,  # Comment this line out and hello_task will not run when step2 fails 36 | dag=dag) 37 | 38 | step1 >> [step2, step3] 39 | step2 >> hello_operator 40 | step3 >> hello_operator 41 | -------------------------------------------------------------------------------- /workshops/3. 
Airflow/container-data/airflow/dags/test_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from airflow import DAG 3 | from airflow.operators.dummy_operator import DummyOperator 4 | 5 | 6 | dag = DAG(dag_id='test-dag-2', 7 | start_date=datetime(2022, 3, 30), 8 | max_active_runs = 1, 9 | schedule_interval='@daily') 10 | 11 | t1 = DummyOperator(task_id='task1', dag=dag) 12 | t2 = DummyOperator(task_id='task2', dag=dag) 13 | t3 = DummyOperator(task_id='task3', dag=dag) 14 | 15 | t1 >> t2 >> t3 16 | -------------------------------------------------------------------------------- /workshops/3. Airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | services: 3 | redis: 4 | image: 'redis:5.0.5' 5 | # command: redis-server --requirepass redispass 6 | 7 | postgres: 8 | image: postgres:9.6 9 | environment: 10 | - POSTGRES_USER=airflow 11 | - POSTGRES_PASSWORD=airflow 12 | - POSTGRES_DB=airflow 13 | # Uncomment these lines to persist data on the local filesystem. 14 | # - PGDATA=/var/lib/postgresql/data/pgdata 15 | # volumes: 16 | # - ./pgdata:/var/lib/postgresql/data/pgdata 17 | 18 | webserver: 19 | image: puckel/docker-airflow:1.10.4 20 | restart: always 21 | depends_on: 22 | - postgres 23 | - redis 24 | environment: 25 | - LOAD_EX=n 26 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 27 | - EXECUTOR=Celery 28 | # - POSTGRES_USER=airflow 29 | # - POSTGRES_PASSWORD=airflow 30 | # - POSTGRES_DB=airflow 31 | # - REDIS_PASSWORD=redispass 32 | volumes: 33 | - ./container-data/airflow/dags:/usr/local/airflow/dags 34 | # Uncomment to include custom plugins 35 | # - ./plugins:/usr/local/airflow/plugins 36 | ports: 37 | - "8080:8080" 38 | command: webserver 39 | healthcheck: 40 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 41 | interval: 30s 42 | timeout: 30s 43 | retries: 3 44 | 45 | flower: 46 | image: puckel/docker-airflow:1.10.4 47 | restart: always 48 | depends_on: 49 | - redis 50 | environment: 51 | - EXECUTOR=Celery 52 | # - REDIS_PASSWORD=redispass 53 | ports: 54 | - "5555:5555" 55 | command: flower 56 | 57 | scheduler: 58 | image: puckel/docker-airflow:1.10.4 59 | restart: always 60 | depends_on: 61 | - webserver 62 | volumes: 63 | - ./container-data/airflow/dags:/usr/local/airflow/dags 64 | # Uncomment to include custom plugins 65 | # - ./plugins:/usr/local/airflow/plugins 66 | environment: 67 | - LOAD_EX=n 68 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 69 | - EXECUTOR=Celery 70 | # - POSTGRES_USER=airflow 71 | # - POSTGRES_PASSWORD=airflow 72 | # - POSTGRES_DB=airflow 73 | # - REDIS_PASSWORD=redispass 74 | command: scheduler 75 | 76 | worker: 77 | image: puckel/docker-airflow:1.10.4 78 | restart: always 79 | depends_on: 80 | - scheduler 81 | volumes: 82 | - ./container-data/airflow/dags:/usr/local/airflow/dags 83 | # Uncomment to include custom plugins 84 | # - ./plugins:/usr/local/airflow/plugins 85 | environment: 86 | - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= 87 | - EXECUTOR=Celery 88 | # - POSTGRES_USER=airflow 89 | # - POSTGRES_PASSWORD=airflow 90 | # - POSTGRES_DB=airflow 91 | # - REDIS_PASSWORD=redispass 92 | command: worker 93 | --------------------------------------------------------------------------------
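One practical note on the Airflow workshop above: calculate_dag.py and calculate_parallel_dag.py read an Airflow Variable named 'value' and write their results back to Variables, so that Variable has to exist before the DAGs are triggered. The sketch below is one hedged way to seed it; the script name, the value 10, and running it with docker-compose exec webserver are assumptions for illustration, not part of the repository (the Variable could just as well be created in the Airflow UI under Admin -> Variables).

# seed_variable.py - minimal sketch for seeding the 'value' Variable used by the calculate DAGs.
# Assumed usage (path and value are illustrative):
#   docker-compose exec webserver python /usr/local/airflow/dags/seed_variable.py
from airflow.models import Variable


def main():
    # The key must match the name read by Variable.get('value') in the DAGs.
    Variable.set('value', 10)
    print('value =', Variable.get('value'))


if __name__ == '__main__':
    main()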