├── code ├── kettle.zip ├── etl_demo.zip └── pg.sql └── README.md /code/kettle.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Arkronus/cooking-data/HEAD/code/kettle.zip -------------------------------------------------------------------------------- /code/etl_demo.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Arkronus/cooking-data/HEAD/code/etl_demo.zip -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Марафон Готовим данные 2 | 3 | В этом репозитории представлены материалы марафона и дополнительные ссылки. 4 | 5 | [Ссылка на Playlist в Youtube](https://youtube.com/playlist?list=PLiEhcUyRBhuFo62pab41J6uTbVqSRGC4P) 6 | 7 | ## 1.Анализ и моделирование данных 8 | 9 | * [Презентация](https://docs.google.com/presentation/d/e/2PACX-1vS58Lp6ZYbD6dASPGse2i_b5ZjNyzRWnoHUicSWUW3hJNIv3lhtvEpFjnS9aMR53M4ZbCsZ4t3CI0S4/pub?start=false&loop=false&delayms=3000) 10 | * [Скрипт для установки Northwind DB](code/northwind.sql) 11 | * [Elephant SQL](https://www.elephantsql.com/) 12 | * [DBeaver](https://dbeaver.io/) 13 | 14 | ### Дополнительные ссылки 15 | 16 | * [PlantUML](https://plantuml.com/ru/) 17 | * [PlantText](https://www.planttext.com/) 18 | * [Примеры схем PlantUML](https://gist.github.com/Arkronus/53731f0e58f46ffaa38dccb288037490) 19 | * [DrawIO Desktop](https://github.com/jgraph/drawio-desktop/releases) 20 | * [DBDocs](https://dbdocs.io/) 21 | 22 | ## 2.SQL для аналитиков 23 | 24 | * [Примеры запросов](code/pg.sql) 25 | 26 | ## 3.Создание хранилища данных и ETL потоков 27 | 28 | * [Презентация](https://docs.google.com/presentation/d/e/2PACX-1vS1fAS22y0T62KP9K_axLf5sMF1kHEcTQYHswI1LfQwRTNDrvUk2ZJCA7UG0gj7IhcWw2qWZWjX5dM-/pub?start=false&loop=false&delayms=3000) 29 | * [Полезные книги про хранилища данных](https://drive.google.com/drive/folders/0B3kUSm9XPlJCcTFVOThYdXN0MGc?resourcekey=0-YBHv2yozQsTwB5Eprp8Ydg) 30 | * [Трансформации в Pentaho Kettle](code/kettle.zip) 31 | 32 | ### Установка и настройка Pentaho Kettle 33 | 34 | Для запуска Pentaho Kettle необходимо 35 | 36 | 1. Скачать и установить [Java 8](https://www.java.com/ru/download/help/windows_manual_download.html) 37 | 1. Скачать и распаковать дистрибутив [Pentaho Kettle CE](https://sourceforge.net/projects/pentaho/files/) 38 | 1. Запустить файл Spoon.bat 39 | 40 | ## 4. Инструменты разработки 41 | 42 | * [Презентация](https://docs.google.com/presentation/d/e/2PACX-1vRllvRI4xDg2_aG9ECmZAkPnjqBg8dBFqLOXu5SmtOnbkG9Gh3z3eKCXH983yAf8Bba3ocbGdRp6_ko/pub?start=false&loop=false&delayms=3000) 43 | 44 | Бесплатные курсы про основы работы с Ubuntu 45 | 46 | * [Основы Ubuntu Linux (Youtube)](https://www.youtube.com/watch?v=tQLpAefAKuA) 47 | * [Введение в Linux (Stepik)](https://stepik.org/course/73/promo) 48 | 49 | Ссылки на программы 50 | 51 | * [VS Code](https://code.visualstudio.com/) 52 | * [Git](https://git-scm.com/) 53 | * [Docker Desktop](https://www.docker.com/products/docker-desktop) 54 | 55 | ## 5.Оркестрация задач. Работа с Airflow 56 | 57 | * [Презентация](https://docs.google.com/presentation/d/e/2PACX-1vSyjtl8jddefsXvW5klVG6P2nOP30SANyiFnvqbiY5_GAwg0OZ6IQdYHUZ4lFBf1HG-53_MHXOUR2Kr/pub?start=false&loop=false&delayms=3000) 58 | 59 | ## 6.ETL c Python 60 | 61 | * [Презентация](https://docs.google.com/presentation/d/e/2PACX-1vQloZNAJxo3S_7BqqtLyM2ntT5EyfbgSKPtXoPCgITg5C-UA1YD1Hb5PMlfpJnpUyfkF_-evsi9LXYx/pub?start=false&loop=false&delayms=3000) 62 | * [Примеры кода](code/etl_demo.zip) 63 | 64 | ## Об авторе 65 | 66 | Меня зовут Артём Прытков, я занимаюсь BI консалтингом. Если у вас есть какие-то вопросы или идеи насчет марафона, то можете написать на почту aprytkov@gmail.com 67 | -------------------------------------------------------------------------------- /code/pg.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE films ( 2 | code char(5), 3 | title varchar(40), 4 | did integer, 5 | date_prod date, 6 | kind varchar(10), 7 | len interval hour to minute, 8 | CONSTRAINT code_title PRIMARY KEY(code,title) 9 | ); 10 | 11 | 12 | 13 | CREATE TABLE total_sales_tbl AS 14 | SELECT c.country, 15 | c.city, 16 | sum(od.unit_price * od.quantity::double precision) AS sum 17 | FROM orders o 18 | JOIN order_details od ON o.order_id = od.order_id 19 | JOIN customers c ON o.customer_id = c.customer_id 20 | GROUP BY c.country, c.city; 21 | 22 | 23 | CREATE TABLE total_sales_tbl AS 24 | SELECT c.country, 25 | c.city, 26 | sum(od.unit_price * od.quantity::double precision) AS sum 27 | FROM orders o 28 | JOIN order_details od ON o.order_id = od.order_id 29 | JOIN customers c ON o.customer_id = c.customer_id 30 | GROUP BY c.country, c.city 31 | WITH NO DATA; 32 | 33 | 34 | CREATE TEMP TABLE total_sales_tbl AS 35 | SELECT c.country, 36 | c.city, 37 | sum(od.unit_price * od.quantity::double precision) AS sum 38 | FROM orders o 39 | JOIN order_details od ON o.order_id = od.order_id 40 | JOIN customers c ON o.customer_id = c.customer_id 41 | GROUP BY c.country, c.city; 42 | 43 | 44 | CREATE OR REPLACE VIEW public.v_total_sales 45 | AS SELECT c.country, 46 | c.city, 47 | sum(od.unit_price * od.quantity) AS sales 48 | FROM orders o 49 | JOIN order_details od ON o.order_id = od.order_id 50 | JOIN customers c ON o.customer_id = c.customer_id 51 | GROUP BY c.country, c.city; 52 | 53 | SELECT country, city, sales FROM v_total_sales vts 54 | 55 | 56 | CREATE materialized VIEW public.vm_total_sales 57 | AS SELECT c.country, 58 | c.city, 59 | sum(od.unit_price * od.quantity) AS sales 60 | FROM orders o 61 | JOIN order_details od ON o.order_id = od.order_id 62 | JOIN customers c ON o.customer_id = c.customer_id 63 | GROUP BY c.country, c.city; 64 | 65 | 66 | REFRESH MATERIALIZED VIEW vm_total_sales; 67 | 68 | 69 | TRUNCATE TABLE total_sales_tbl; 70 | 71 | DROP TABLE total_sales_tbl; 72 | 73 | 74 | SELECT 75 | c.company_name, 76 | c.country 77 | FROM 78 | customers c 79 | JOIN orders o ON c.customer_id = o.customer_id 80 | JOIN order_details od ON o.order_id = od.order_id 81 | WHERE EXTRACT(YEAR FROM o.order_date) = 1996 82 | GROUP BY 1,2 83 | 84 | EXCEPT 85 | 86 | SELECT 87 | c.company_name, 88 | c.country 89 | FROM 90 | customers c 91 | JOIN orders o ON c.customer_id = o.customer_id 92 | JOIN order_details od ON o.order_id = od.order_id 93 | WHERE EXTRACT(YEAR FROM o.order_date) = 1997 94 | GROUP BY 1,2 95 | 96 | SELECT 97 | employee_id, 98 | last_name, 99 | first_name, 100 | hire_date 101 | FROM 102 | employees e 103 | 104 | SELECT max(hire_date) FROM employees; 105 | 106 | -- 1994-11-15 107 | 108 | SELECT 109 | employee_id, 110 | last_name, 111 | first_name, 112 | hire_date 113 | FROM 114 | employees e 115 | WHERE hire_date = '1994-11-15' 116 | 117 | SELECT 118 | employee_id, 119 | last_name, 120 | first_name, 121 | hire_date 122 | FROM 123 | employees e 124 | WHERE hire_date = (SELECT max(hire_date) FROM employees) 125 | 126 | 127 | SELECT 128 | customer_id, 129 | company_name, 130 | country, 131 | address 132 | FROM 133 | customers c 134 | WHERE 135 | country IN ( 136 | 'USA', 'UK' 137 | ) 138 | 139 | 140 | -- 141 | SELECT ship_country, ship_city, count(DISTINCT order_id) 142 | FROM orders o 143 | WHERE customer_id IN ( 144 | SELECT customer_id 145 | FROM customers c 146 | WHERE country IN ('USA', 'UK') 147 | ) 148 | GROUP BY ship_country, ship_city 149 | 150 | 151 | SELECT * FROM orders o 152 | WHERE (ship_country, ship_city) IN ( 153 | SELECT country, city -- можно не указывать DISTINCT. Дубли игнорируются 154 | FROM customers c 155 | WHERE country IN ('USA', 'UK') 156 | ) 157 | 158 | 159 | SELECT customer_id, company_name 160 | FROM customers c 161 | WHERE EXISTS( 162 | SELECT 1 FROM orders o 163 | WHERE o.customer_id = c.customer_id 164 | AND o.order_date >='1998-04-01') 165 | 166 | 167 | SELECT 168 | YEAR, 169 | MONTH, 170 | sales 171 | FROM ( 172 | SELECT 173 | EXTRACT(YEAR FROM o.order_date) AS year, 174 | EXTRACT(MONTH FROM o.order_date) AS month, 175 | SUM(od.unit_price * od.quantity) AS sales 176 | FROM order_details od 177 | JOIN orders o ON o.order_id = od.order_id 178 | GROUP BY 1, 2 179 | ) as sales_data 180 | WHERE YEAR = 1998 181 | 182 | 183 | 184 | WITH sales_data AS ( 185 | SELECT 186 | EXTRACT(YEAR FROM o.order_date) AS year, 187 | EXTRACT(MONTH FROM o.order_date) AS month, 188 | SUM(od.unit_price * od.quantity) AS sales 189 | FROM order_details od 190 | JOIN orders o ON o.order_id = od.order_id 191 | GROUP BY 1, 2) 192 | SELECT 193 | YEAR, 194 | MONTH, 195 | sales 196 | FROM sales_data 197 | WHERE YEAR = 1998 198 | 199 | 200 | SELECT 201 | EXTRACT(YEAR FROM o.order_date) AS year, 202 | EXTRACT(MONTH FROM o.order_date) AS month, 203 | EXTRACT(QUARTER FROM o.order_date) AS quarter, 204 | SUM(od.unit_price * od.quantity) AS sales, 205 | max(SUM(od.unit_price * od.quantity)) OVER () AS max_sales, 206 | max(SUM(od.unit_price * od.quantity)) OVER ( PARTITION BY EXTRACT(QUARTER FROM o.order_date) ) AS max_sales_quarter 207 | FROM order_details od 208 | JOIN orders o ON o.order_id = od.order_id 209 | WHERE EXTRACT(YEAR FROM o.order_date) = 1997 210 | GROUP BY 1, 2, 3 211 | 212 | WITH grouped AS ( 213 | SELECT 214 | EXTRACT(YEAR FROM o.order_date) AS year, 215 | EXTRACT(MONTH FROM o.order_date) AS month, 216 | EXTRACT(QUARTER FROM o.order_date) AS quarter, 217 | SUM(od.unit_price * od.quantity) AS sales 218 | FROM order_details od 219 | JOIN orders o ON o.order_id = od.order_id 220 | WHERE EXTRACT(YEAR FROM o.order_date) = 1997 221 | GROUP BY 1, 2, 3) 222 | SELECT 223 | YEAR, 224 | MONTH, 225 | quarter, 226 | sales, 227 | max(sales) OVER () AS max_sales, 228 | max(sales) OVER ( PARTITION BY quarter) AS max_sales_quarter 229 | FROM 230 | grouped 231 | 232 | 233 | /* Рассчитаем ранги продаж. Для этого нам нужно будет указать порядок */ 234 | SELECT 235 | EXTRACT( QUARTER FROM o.order_date), 236 | EXTRACT( MONTH FROM o.order_date), 237 | SUM(od.unit_price*od.quantity) AS sales, 238 | max(SUM(od.unit_price*od.quantity)) OVER () AS max_sales, 239 | max(SUM(od.unit_price*od.quantity)) OVER (PARTITION BY EXTRACT( quarter FROM o.order_date) ) AS max_qtr_sales, 240 | rank() OVER (ORDER BY SUM(od.unit_price*od.quantity) DESC) AS sales_rank, 241 | rank() OVER (PARTITION BY EXTRACT( quarter FROM o.order_date) ORDER BY SUM(od.unit_price*od.quantity) DESC) AS sales_rank_qtr 242 | FROM 243 | orders o 244 | JOIN order_details od ON 245 | o.order_id = od.order_id 246 | WHERE 247 | EXTRACT(YEAR FROM o.order_date)= 1996 248 | GROUP BY 1,2 249 | ORDER BY EXTRACT( MONTH FROM o.order_date) 250 | 251 | 252 | SELECT 253 | EXTRACT(MONTH FROM o.order_date) AS month, 254 | SUM(od.unit_price * od.quantity), 255 | SUM(CASE WHEN extract(YEAR FROM o.order_date) = 1997 then od.unit_price * od.quantity END) AS sales97, 256 | SUM(od.unit_price * od.quantity) FILTER(WHERE extract(YEAR FROM o.order_date) = 1997) AS sales_97 257 | FROM orders o 258 | JOIN order_details od ON o.order_id = od.order_id 259 | GROUP BY 1 260 | ORDER BY 1 261 | 262 | 263 | WITH sales_data AS ( 264 | SELECT UNNEST AS sales FROM unnest(ARRAY[5, 10,20,30,25,10]) 265 | ) 266 | SELECT 267 | sales, 268 | ROW_NUMBER() OVER(ORDER BY sales desc) AS rn_sales_desc, 269 | RANK() OVER(ORDER BY sales desc) AS rnk_sales_desc, 270 | DENSE_RANK() OVER(ORDER BY sales desc) AS dense_rnk_sales_desc 271 | FROM sales_data 272 | 273 | WITH sales_data AS ( 274 | SELECT UNNEST AS sales FROM unnest(ARRAY[5, 10,20,30,25,10]) 275 | ) 276 | SELECT 277 | sales, 278 | ROW_NUMBER() OVER(ORDER BY sales ) AS rn_sales_desc, 279 | RANK() OVER(ORDER BY sales ) AS rnk_sales_desc, 280 | DENSE_RANK() OVER(ORDER BY sales ) AS dense_rnk_sales_desc 281 | FROM sales_data 282 | 283 | 284 | 285 | 286 | WITH sales_data AS ( 287 | SELECT 288 | ship_country, 289 | ship_city, 290 | sum(od.quantity * od.unit_price) AS sales 291 | FROM orders o 292 | JOIN order_details od ON o.order_id = od.order_id 293 | GROUP BY 1,2), 294 | sales_ranks AS ( 295 | SELECT 296 | ship_country, 297 | ship_city, 298 | sales, 299 | RANK() OVER(ORDER BY sales desc) AS rnk_sales_total, 300 | RANK() OVER(PARTITION BY ship_country ORDER BY sales DESC) AS rnk_sales_by_country 301 | FROM sales_data) 302 | SELECT * 303 | FROM sales_ranks 304 | WHERE rnk_sales_by_country = 1 305 | 306 | CREATE TABLE total_sales_tbl AS 307 | SELECT c.country, 308 | c.city, 309 | sum(od.unit_price * od.quantity::double precision) AS sales 310 | FROM orders o 311 | JOIN order_details od ON o.order_id = od.order_id 312 | JOIN customers c ON o.customer_id = c.customer_id 313 | GROUP BY c.country, c.city; 314 | 315 | 316 | SELECT 317 | country, 318 | city, 319 | sales, 320 | sum(sales) over() AS grand_total, 321 | sum(sales) over(PARTITION BY country) AS country_total, 322 | round((sales / sum(sales) over()*100.0)::numeric, 2) AS pct_of_total 323 | FROM total_sales_tbl 324 | ORDER BY sales DESC; 325 | 326 | SELECT * 327 | FROM 328 | unnest(ARRAY[5,10,20,30,25,10]) t1, 329 | unnest(ARRAY[5,10,20,30,25,10]) t2 330 | 331 | 332 | 333 | SELECT 334 | country, 335 | city, 336 | sales, 337 | CASE sales 338 | WHEN max(sales) OVER () THEN 'Лучший результат' 339 | WHEN min(sales) OVER () THEN 'Худший результат' 340 | ELSE 'Средний результат' 341 | END AS result 342 | FROM total_sales_tbl 343 | ORDER BY sales DESC; 344 | 345 | 346 | WITH sales_data AS ( 347 | SELECT 348 | EXTRACT(YEAR FROM o.order_date) AS year, 349 | EXTRACT(MONTH FROM o.order_date) AS month, 350 | SUM(od.unit_price * od.quantity) AS sales 351 | FROM order_details od 352 | JOIN orders o ON o.order_id = od.order_id 353 | GROUP BY 1, 2) 354 | SELECT 355 | YEAR, 356 | MONTH, 357 | sales, 358 | sum(sales) over(PARTITION BY year ORDER BY month) AS rolling_sum, 359 | avg(sales) over(ORDER BY YEAR, MONTH ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS rolling_3mon_avg_sales 360 | FROM sales_data 361 | 362 | 363 | WITH sales_data AS ( 364 | SELECT 365 | EXTRACT(YEAR FROM o.order_date) AS year, 366 | EXTRACT(MONTH FROM o.order_date) AS month, 367 | SUM(od.unit_price * od.quantity) AS sales 368 | FROM order_details od 369 | JOIN orders o ON o.order_id = od.order_id 370 | GROUP BY 1, 2), 371 | fill_data AS ( 372 | SELECT 373 | t1 AS mon, 374 | t2 AS yr, 375 | 0 AS sales 376 | FROM 377 | unnest(ARRAY[1,2,3,4,5,6]) t1, 378 | unnest(ARRAY[1995, 1996,1997]) t2 379 | ), 380 | unioned AS ( 381 | SELECT * FROM sales_data 382 | UNION ALL 383 | SELECT * FROM fill_data 384 | ) 385 | SELECT 386 | YEAR, 387 | MONTH, 388 | sales, 389 | lag(sales, 1) over(ORDER BY YEAR, month) AS prev_month_sales, 390 | lag(sales, 12) over(ORDER BY YEAR, month) AS prev_year_month_sales, 391 | sales - lag(sales, 12) over(ORDER BY YEAR, month) AS sales_diff, 392 | lead(sales, 1) over(ORDER BY YEAR, month) AS next_month_sales 393 | FROM unioned 394 | 395 | SELECT 396 | t1 AS mon, 397 | t2 AS yr, 398 | 0 AS sales 399 | FROM 400 | unnest(ARRAY[1,2,3,4,5,6]) t1, 401 | unnest(ARRAY[1996,1997]) t2 402 | 403 | 404 | SELECT country, city, SUM(sales) 405 | FROM v_total_sales vts 406 | GROUP BY ROLLUP(country, city) 407 | 408 | 409 | SELECT country, city, SUM(sales) 410 | FROM v_total_sales vts 411 | GROUP BY CUBE(country, city) 412 | 413 | 414 | SELECT 415 | country, 416 | city, 417 | SUM(sales) 418 | FROM v_total_sales 419 | GROUP BY GROUPING SETS((country),(city), ()) 420 | --------------------------------------------------------------------------------