├── luigi.cfg ├── .gitignore ├── README.md ├── requirements.txt ├── utils.py └── notebook.ipynb /luigi.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | no_configure_logging=false 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.ipynb_checkpoints 2 | /.vscode 3 | /__pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Resources 2 | 3 | - https://stackoverflow.com/questions/40407936/mysql-targets-in-luigi-workflow/40423427#40423427 4 | - https://stackoverflow.com/questions/40707004/using-luigi-to-update-postgres-table 5 | - https://stackoverflow.com/questions/28793832/can-luigi-rerun-tasks-when-the-task-dependencies-become-out-of-date 6 | - https://luigi.readthedocs.io/en/stable/_modules/luigi/contrib/sqla.html 7 | - https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python 8 | - https://stackoverflow.com/questions/11349333/how-to-ignore-the-first-line-of-data-when-processing-csv-data 9 | - https://stackoverflow.com/questions/35918605/how-to-delete-a-table-in-sqlalchemy 10 | - https://stackoverflow.com/questions/11900553/sqlalchemy-table-already-exists 11 | - https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python 12 | - https://stackoverflow.com/questions/48509083/how-to-make-a-parameter-available-to-all-luigi-tasks 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | astroid=2.4.2=py37_0 5 | async_generator=1.10=py_0 6 | 
attrs=20.2.0=py_0 7 | backcall=0.2.0=py_0 8 | blas=1.0=mkl 9 | bleach=3.2.1=py_0 10 | brotlipy=0.7.0=py37he774522_1000 11 | ca-certificates=2020.10.14=0 12 | certifi=2020.6.20=py37_0 13 | cffi=1.14.3=py37h7a1dbc1_0 14 | chardet=3.0.4=py37_1003 15 | colorama=0.4.4=py_0 16 | cryptography=3.1.1=py37h7a1dbc1_0 17 | decorator=4.4.2=py_0 18 | defusedxml=0.6.0=py_0 19 | docutils=0.16=pypi_0 20 | entrypoints=0.3=py37_0 21 | et_xmlfile=1.0.1=py_1001 22 | idna=2.10=py_0 23 | importlib-metadata=2.0.0=py_1 24 | importlib_metadata=2.0.0=1 25 | intel-openmp=2020.2=254 26 | ipykernel=5.3.4=py37h5ca1d4c_0 27 | ipython=7.18.1=py37h5ca1d4c_0 28 | ipython_genutils=0.2.0=py37_0 29 | isort=5.6.4=py_0 30 | jdcal=1.4.1=py_0 31 | jedi=0.17.2=py37_0 32 | jinja2=2.11.2=py_0 33 | jsonschema=3.2.0=py_2 34 | jupyter_client=6.1.7=py_0 35 | jupyter_core=4.6.3=py37_0 36 | jupyterlab_pygments=0.1.2=py_0 37 | lazy-object-proxy=1.4.3=py37he774522_0 38 | libsodium=1.0.18=h62dcd97_0 39 | lockfile=0.12.2=pypi_0 40 | luigi=3.0.2=pypi_0 41 | m2w64-gcc-libgfortran=5.3.0=6 42 | m2w64-gcc-libs=5.3.0=7 43 | m2w64-gcc-libs-core=5.3.0=7 44 | m2w64-gmp=6.1.0=2 45 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2 46 | markupsafe=1.1.1=py37hfa6e2cd_1 47 | mccabe=0.6.1=py37_1 48 | mistune=0.8.4=py37hfa6e2cd_1001 49 | mkl=2020.2=256 50 | mkl-service=2.3.0=py37hb782905_0 51 | mkl_fft=1.2.0=py37h45dec08_0 52 | mkl_random=1.1.1=py37h47e9c7a_0 53 | msys2-conda-epoch=20160418=1 54 | nbclient=0.5.1=py_0 55 | nbconvert=6.0.7=py37_0 56 | nbformat=5.0.8=py_0 57 | nest-asyncio=1.4.1=py_0 58 | notebook=6.0.3=py37_0 59 | numpy=1.19.1=py37h5510c5b_0 60 | numpy-base=1.19.1=py37ha3acd2a_0 61 | openpyxl=3.0.5=py_0 62 | openssl=1.1.1h=he774522_0 63 | packaging=20.4=py_0 64 | pandas=1.1.3=py37ha925a31_0 65 | pandoc=2.11=h9490d1a_0 66 | pandocfilters=1.4.2=py37_1 67 | parso=0.7.0=py_0 68 | pickleshare=0.7.5=py37_1001 69 | pip=20.2.4=py37_0 70 | prometheus_client=0.8.0=py_0 71 | prompt-toolkit=3.0.8=py_0 72 | pycparser=2.20=py_2 73 | 
# --- Non-code text sharing this physical line of the dump, kept verbatim (requirements.txt tail, utils.py header) ---
# pygments=2.7.1=py_0 74 | pylint=2.6.0=py37_0 75 | pyopenssl=19.1.0=py_1 76 | pyparsing=2.4.7=py_0 77 | pyrsistent=0.17.3=py37he774522_0 78 | pysocks=1.7.1=py37_1 79 | python=3.7.9=h60c2a47_0 80 | python-daemon=2.2.4=pypi_0 81 | python-dateutil=2.8.1=py_0 82 | pytz=2020.1=py_0 83 | pywin32=227=py37he774522_1 84 | pywinpty=0.5.7=py37_0 85 | pyzmq=19.0.2=py37ha925a31_1 86 | requests=2.24.0=py_0 87 | send2trash=1.5.0=py37_0 88 | setuptools=50.3.0=py37h9490d1a_1 89 | six=1.15.0=py_0 90 | sqlalchemy=1.3.19=py37he774522_0 91 | sqlite=3.33.0=h2a8f88b_0 92 | terminado=0.9.1=py37_0 93 | testpath=0.4.4=py_0 94 | toml=0.10.1=py_0 95 | tornado=6.0.4=py37he774522_1 96 | traitlets=5.0.5=py_0 97 | typed-ast=1.4.1=py37he774522_0 98 | urllib3=1.25.10=py_0 99 | vc=14.1=h0510ff6_4 100 | vs2015_runtime=14.16.27012=hf0eaf9b_3 101 | wcwidth=0.2.5=py_0 102 | webencodings=0.5.1=py37_1 103 | wheel=0.35.1=py_0 104 | win_inet_pton=1.1.0=py37_0 105 | wincertstore=0.2=py37_0 106 | winpty=0.4.3=4 107 | wrapt=1.11.2=py37he774522_0 108 | zeromq=4.3.2=ha925a31_3 109 | zipp=3.3.0=py_0 110 | zlib=1.2.11=h62dcd97_4 111 | -------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------

import os
import time
from pathlib import Path

try:
    # Nothing in this module references these names. They are kept for any
    # caller that imports them from here, but a missing SQLAlchemy install
    # must not prevent the helpers below from loading.
    from sqlalchemy import MetaData, Table, Column
    from sqlalchemy.ext.declarative import declarative_base
except ImportError:
    pass


def _as_list(obj):
    """Return *obj* as a flat list: dict values, list/tuple items, or [obj]."""
    if isinstance(obj, dict):
        return list(obj.values())
    if isinstance(obj, (list, tuple)):
        return list(obj)
    return [obj]


def _remove_existing(targets):
    """Delete the file behind every target in *targets* that still exists."""
    for target in targets:
        try:
            os.remove(target.path)
        except FileNotFoundError:
            pass


class MTimeMixin:
    """
    Mixin that flags a task as incomplete if any requirement
    is incomplete or has been updated more recently than this task.

    This is based on http://stackoverflow.com/a/29304506, but extends
    it to support multiple input / output dependencies.
    """

    def complete(self):
        """Return True only if every output exists and is up to date.

        A task is considered stale when one of its requirements reports
        ``complete() == False`` or owns an output whose modification time
        is later than this task's oldest output.  When the task is stale,
        all of its existing output files are deleted before returning
        ``False``, so that the next ``run()`` does not hit a
        ``FileExistsError`` on Windows.

        Targets are expected to expose a filesystem ``path`` attribute.
        """
        outputs = _as_list(self.output())
        if not all(os.path.exists(out.path) for out in outputs):
            return False

        # Compare raw float timestamps.  (Formatting them with time.ctime()
        # would order them alphabetically by weekday name, not by time.)
        oldest_own = min(os.path.getmtime(out.path) for out in outputs)

        # Assumes a flat collection of requirements, each with a flat
        # collection of outputs.
        for requirement in _as_list(self.requires()):
            stale = not requirement.complete() or any(
                os.path.getmtime(upstream.path) > oldest_own
                for upstream in _as_list(requirement.output())
            )
            if stale:
                _remove_existing(outputs)
                return False

        return True


class DisplayablePath:
    """Node of a directory tree that can render itself as one ``tree``-style line.

    Instances are normally produced by :meth:`make_tree`, which walks a
    directory depth-first and yields one node per entry.
    """

    # Prefix placed directly in front of an entry name.
    display_filename_prefix_middle = '├──'
    display_filename_prefix_last = '└──'
    # Per-ancestor indentation.  Both are four characters wide so they line
    # up with "<filename prefix> " (three box characters plus a space).
    # "middle" is used under an ancestor that was the last child (nothing
    # left to connect), "last" under one that still has siblings below it.
    display_parent_prefix_middle = '    '
    display_parent_prefix_last = '│   '

    def __init__(self, path, parent_path, is_last):
        """Create a node.

        :param path: filesystem path of this entry (anything ``str()``-able).
        :param parent_path: the parent ``DisplayablePath`` or ``None`` for the root.
        :param is_last: whether this entry is the last child of its parent.
        """
        self.path = Path(str(path))
        self.parent = parent_path
        self.is_last = is_last
        self.depth = self.parent.depth + 1 if self.parent else 0

    @property
    def displayname(self):
        """Entry name, with a trailing ``/`` for directories."""
        if self.path.is_dir():
            return self.path.name + '/'
        return self.path.name

    @classmethod
    def make_tree(cls, root, parent=None, is_last=False, criteria=None):
        """Yield a node for *root* and, depth-first, for everything below it.

        :param root: directory to walk.
        :param parent: node of the enclosing directory (``None`` at the top).
        :param is_last: whether *root* is the last child of *parent*.
        :param criteria: optional ``callable(path) -> bool``; entries for
            which it returns a falsy value are skipped.  Defaults to
            accepting everything.
        """
        root = Path(str(root))
        criteria = criteria or cls._default_criteria

        displayable_root = cls(root, parent, is_last)
        yield displayable_root

        # Case-insensitive ordering by full path string.
        children = sorted(
            (path for path in root.iterdir() if criteria(path)),
            key=lambda entry: str(entry).lower(),
        )
        last_index = len(children) - 1
        for index, path in enumerate(children):
            child_is_last = index == last_index
            if path.is_dir():
                yield from cls.make_tree(path,
                                         parent=displayable_root,
                                         is_last=child_is_last,
                                         criteria=criteria)
            else:
                yield cls(path, displayable_root, child_is_last)

    @classmethod
    def _default_criteria(cls, path):
        """Default filter: include every entry."""
        return True

    def displayable(self):
        """Return this node's line of the rendered tree."""
        if self.parent is None:
            return self.displayname

        filename_prefix = (self.display_filename_prefix_last
                           if self.is_last
                           else self.display_filename_prefix_middle)
        parts = [f'{filename_prefix} {self.displayname}']

        # Walk up to (but not including) the root, collecting one
        # indentation segment per ancestor; reversed below so the outermost
        # ancestor comes first.
        ancestor = self.parent
        while ancestor and ancestor.parent is not None:
            parts.append(self.display_parent_prefix_middle
                         if ancestor.is_last
                         else self.display_parent_prefix_last)
            ancestor = ancestor.parent

        return ''.join(reversed(parts))

# --- Non-code text sharing this physical line of the dump, kept verbatim (start of notebook.ipynb) ---
# -------------------------------------------------------------------------------- /notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Luigi ETL\n", 8 | "> \"The getting started docs are a head scratcher. Let's explore the package together.\"" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## About\n", 16 | "\n", 17 | "Luigi is a python ETL framework built by Spotify. I use pandas in my day-to-day job and have created numerous pipeline tasks to move, transform, and analyze data across my organization. I thought Luigi would be a great addition to help manage these pipelines, but after reading their getting started documentation, it left me scratching my head.\n", 18 | "\n", 19 | "If you are reading this, then I assume the docs have you confused as well, and hopefully, my post below can provide you with a bit more clarity. The post assumes you have already read the docs. If you haven't, please read that first before continuing."
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": { 26 | "ExecuteTime": { 27 | "end_time": "2020-10-23T20:59:53.111298Z", 28 | "start_time": "2020-10-23T20:59:40.529601Z" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "#hide\n", 34 | "from utils import DisplayablePath" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Task Execution\n", 42 | "\n", 43 | "Typical ETL Execution:\n", 44 | "> Task A $\\longrightarrow$ Task B $\\longrightarrow$ Task C\n", 45 | "\n", 46 | "Luigi ETL Execution:\n", 47 | "> Task A $\\longleftarrow$ Task B $\\longleftarrow$ Task C\n", 48 | "\n", 49 | "The most important thing to understand about Luigi is that it executes the ETL backward (recursively). It checks first to see if the current task (Task C) is completed. If not, it will then move backward to check if the previous task is completed (Task B). Once it finds the first completed task, it will then begin to execute the Tasks moving forward again. \n", 50 | "\n", 51 | "This approach can save you a lot of time in your ETL. The reason is that you won't re-run completed tasks. However, this makes it a bit trickier to implement because you can find yourself in a situation where Task B or Task C will always return that they are complete, and Task A will never run again." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## How are Tasks linked?\n", 59 | "\n", 60 | "> Task A $\\longleftarrow$ Task B $\\longleftarrow$ Task C\n", 61 | "\n", 62 | "Except for External Tasks, most other tasks are dependent on another Luigi Task. The way you define this dependency is by defining a `requires()` method in the Task Class and returning the dependent task(s) from it. If the task is complete, it won't bother to check the dependent task(s). 
" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## What defines a completed Task?\n", 70 | "\n", 71 | "Luigi considers a Task completed when the Task output exists. So if Task A outputs `task_a.csv` and it exists, then Task A will be considered complete. What the getting started docs fail to mention is that in reality, Task A is complete when Task A's method `complete()` returns `True`. The `complete()` method's default behavior is to check if the output exists. We can override this behavior, and I would bet most people who have deployed Luigi into production do." 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Coding Demonstration of Task Execution\n", 79 | "\n", 80 | "> Task A $\\longleftarrow$ Task B\n", 81 | "\n", 82 | "The below code is an example of how to set up a Luigi Task. We have two classes, `Task_A` and `Task_B`, where `Task_B` is dependent on `Task_A`. I've provided comments in the output to help visualize the order of events that take place when the Tasks run.\n", 83 | "\n", 84 | "There are a few things to note about the code:\n", 85 | "* `GlobalParams` is a helper class to provide a global variable so I can count the execution events i.e. **1:** complete() ...\n", 86 | "* I replaced the Luigi `complete()` method with a similar method that checks if the output file exists so we could see the method executed in the print statements.\n", 87 | "* `MockTarget` creates an in-memory file object that we can write to and check if it exists." 
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 2, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "DEBUG: Checking if Task_B() is complete\n", 100 | "DEBUG: Checking if Task_A() is complete\n", 101 | "INFO: Informed scheduler that task Task_B__99914b932b has status PENDING\n", 102 | "INFO: Informed scheduler that task Task_A__99914b932b has status PENDING\n", 103 | "INFO: Done scheduling tasks\n", 104 | "INFO: Running Worker with 1 processes\n", 105 | "DEBUG: Asking scheduler for work...\n", 106 | "DEBUG: Pending tasks: 2\n", 107 | "INFO: [pid 2456] Worker Worker(salt=998663148, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Task_A()\n", 108 | "INFO: [pid 2456] Worker Worker(salt=998663148, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Task_A()\n", 109 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 110 | "INFO: Informed scheduler that task Task_A__99914b932b has status DONE\n", 111 | "DEBUG: Asking scheduler for work...\n", 112 | "DEBUG: Pending tasks: 1\n", 113 | "INFO: [pid 2456] Worker Worker(salt=998663148, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Task_B()\n", 114 | "INFO: [pid 2456] Worker Worker(salt=998663148, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Task_B()\n", 115 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 116 | "INFO: Informed scheduler that task Task_B__99914b932b has status DONE\n", 117 | "DEBUG: Asking scheduler for work...\n", 118 | "DEBUG: Done\n", 119 | "DEBUG: There are no more tasks to run at this time\n", 120 | "INFO: Worker Worker(salt=998663148, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 121 | "INFO: \n", 122 | "===== Luigi Execution Summary =====\n", 123 | "\n", 124 | "Scheduled 2 tasks of which:\n", 125 | "* 2 ran successfully:\n", 126 | " - 1 Task_A()\n", 127 | " - 1 Task_B()\n", 128 | "\n", 129 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 130 | "\n", 131 | "===== Luigi Execution Summary =====\n", 132 | "\n" 133 | ] 134 | }, 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "1: complete() Checking to see if Task_B has been completed\n", 140 | "2: requires() Task_B is not completed, checking to see if previous tasks are required and completed\n", 141 | "3: complete() Checking to see if Task_A has been completed\n", 142 | "4: run() Task_A has no prior Task dependency. It is now running to complete the task\n", 143 | "5: requires() Task_B is not completed, checking to see if previous tasks are required and completed\n", 144 | "6: complete() Checking to see if Task_A has been completed\n", 145 | "7: run() All previous tasks are completed and Task_B is running to complete the task\n", 146 | "8: All Tasks are completed\n" 147 | ] 148 | }, 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "True" 153 | ] 154 | }, 155 | "execution_count": 2, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "import luigi\n", 162 | "import pandas as pd\n", 163 | "from luigi.mock import MockTarget\n", 164 | "\n", 165 | "class GlobalParams(luigi.Config):\n", 166 | " count = luigi.IntParameter(default=1)\n", 167 | "\n", 168 | "class Task_A(luigi.Task):\n", 169 | "\n", 170 | " def output(self):\n", 171 | " return MockTarget(\"Task_A\")\n", 172 | "\n", 173 | " def run(self):\n", 174 | " print(f\"{g.count}: run() {self.__class__.__name__} has no prior Task dependency. 
It is now running to complete the task\")\n", 175 | " g.count += 1 \n", 176 | " out = self.output().open(\"w\")\n", 177 | " out.write('complete')\n", 178 | " out.close()\n", 179 | " \n", 180 | " def complete(self):\n", 181 | " print(f'{g.count}: complete() Checking to see if {self.__class__.__name__} has been completed')\n", 182 | " g.count += 1 \n", 183 | " return self.output().exists() \n", 184 | "\n", 185 | " \n", 186 | "class Task_B(luigi.Task):\n", 187 | " \n", 188 | " def requires(self):\n", 189 | " print(f'{g.count}: requires() {self.__class__.__name__} is not completed, checking to see if previous tasks are required and completed')\n", 190 | " g.count += 1 \n", 191 | " return Task_A()\n", 192 | " \n", 193 | " def output(self):\n", 194 | " return MockTarget(\"Task_B\")\n", 195 | "\n", 196 | " def run(self):\n", 197 | " print(f'{g.count}: run() All previous tasks are completed and {self.__class__.__name__} is running to complete the task')\n", 198 | " g.count += 1 \n", 199 | " out = self.output().open(\"w\")\n", 200 | " out.write('complete')\n", 201 | " out.close()\n", 202 | " print(f'{g.count}: All Tasks are completed')\n", 203 | " \n", 204 | " def complete(self):\n", 205 | " print(f'{g.count}: complete() Checking to see if {self.__class__.__name__} has been completed')\n", 206 | " g.count += 1 \n", 207 | " \n", 208 | " if self.output().exists():\n", 209 | " print(f'{g.count}: All Tasks are completed')\n", 210 | " return self.output().exists()\n", 211 | "\n", 212 | "g = GlobalParams()\n", 213 | "luigi.build([Task_B()], local_scheduler=True)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "When we run the Task a second-time, note how `Task_A` is not referenced. Luigi checked to see if `Task_B` was complete and stopped the execution since it returned `True`. 
That means that if some upstream file were updated and needed to be transformed by `Task_A`, it would not happen, since Luigi would always stop at `Task_B`. " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 3, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | "DEBUG: Checking if Task_B() is complete\n", 233 | "INFO: Informed scheduler that task Task_B__99914b932b has status DONE\n", 234 | "INFO: Done scheduling tasks\n", 235 | "INFO: Running Worker with 1 processes\n", 236 | "DEBUG: Asking scheduler for work...\n", 237 | "DEBUG: Done\n", 238 | "DEBUG: There are no more tasks to run at this time\n", 239 | "INFO: Worker Worker(salt=177125647, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 240 | "INFO: \n", 241 | "===== Luigi Execution Summary =====\n", 242 | "\n", 243 | "Scheduled 1 tasks of which:\n", 244 | "* 1 complete ones were encountered:\n", 245 | " - 1 Task_B()\n", 246 | "\n", 247 | "Did not run any tasks\n", 248 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 249 | "\n", 250 | "===== Luigi Execution Summary =====\n", 251 | "\n" 252 | ] 253 | }, 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "1: complete() Checking to see if Task_B has been completed\n", 259 | "2: All Tasks are completed\n" 260 | ] 261 | }, 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "True" 266 | ] 267 | }, 268 | "execution_count": 3, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "g.count=1\n", 275 | "luigi.build([Task_B()], local_scheduler=True)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 4, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "#hide\n", 285 | "\n", 286 | "# Clean Up...Remove the Memory Files so be able to run again with same 
output\n", 287 | "Task_B().output().remove()\n", 288 | "Task_A().output().remove()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "## Luigi Parameters\n", 296 | "\n", 297 | "> Words $\\longleftarrow$ Count\n", 298 | "\n", 299 | "Parameters are Luigi's intended way to make sure tasks are re-run on some schedule, so they don't get stuck in a \"complete\" status. Luigi offers its own `Parameter` object that is mostly intended to act as a constructor when executing tasks from the command line.\n", 300 | "\n", 301 | "Below we have created two new Tasks, `Words` and `Count`. Each task takes a date as a parameter and appends the date to the file name output. You'll also notice I removed the `complete()` method. This means it will default to the original method, which also checks whether the output target exists, only more robustly. " 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 5, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | "DEBUG: Checking if Count(date=2020-10-26) is complete\n", 314 | "DEBUG: Checking if Words(date=2020-10-26) is complete\n", 315 | "INFO: Informed scheduler that task Count_2020_10_26_424115e443 has status PENDING\n", 316 | "INFO: Informed scheduler that task Words_2020_10_26_424115e443 has status PENDING\n", 317 | "INFO: Done scheduling tasks\n", 318 | "INFO: Running Worker with 1 processes\n", 319 | "DEBUG: Asking scheduler for work...\n", 320 | "DEBUG: Pending tasks: 2\n", 321 | "INFO: [pid 2456] Worker Worker(salt=660151019, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Words(date=2020-10-26)\n", 322 | "INFO: [pid 2456] Worker Worker(salt=660151019, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Words(date=2020-10-26)\n", 323 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 324 | "INFO: Informed scheduler that task 
Words_2020_10_26_424115e443 has status DONE\n", 325 | "DEBUG: Asking scheduler for work...\n", 326 | "DEBUG: Pending tasks: 1\n", 327 | "INFO: [pid 2456] Worker Worker(salt=660151019, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Count(date=2020-10-26)\n", 328 | "INFO: [pid 2456] Worker Worker(salt=660151019, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Count(date=2020-10-26)\n", 329 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 330 | "INFO: Informed scheduler that task Count_2020_10_26_424115e443 has status DONE\n", 331 | "DEBUG: Asking scheduler for work...\n", 332 | "DEBUG: Done\n", 333 | "DEBUG: There are no more tasks to run at this time\n", 334 | "INFO: Worker Worker(salt=660151019, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 335 | "INFO: \n", 336 | "===== Luigi Execution Summary =====\n", 337 | "\n", 338 | "Scheduled 2 tasks of which:\n", 339 | "* 2 ran successfully:\n", 340 | " - 1 Count(date=2020-10-26)\n", 341 | " - 1 Words(date=2020-10-26)\n", 342 | "\n", 343 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 344 | "\n", 345 | "===== Luigi Execution Summary =====\n", 346 | "\n" 347 | ] 348 | }, 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "True" 353 | ] 354 | }, 355 | "execution_count": 5, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "import datetime\n", 362 | "from pathlib import Path\n", 363 | "OUTPUT_PATH = Path('output')\n", 364 | "\n", 365 | "class Words(luigi.Task):\n", 366 | " date = luigi.DateParameter(default=datetime.date.today())\n", 367 | " \n", 368 | " def output(self):\n", 369 | " return luigi.LocalTarget(OUTPUT_PATH/f'words_{self.date}.csv')\n", 370 | "\n", 371 | " def run(self):\n", 372 | " words = ['apple','banana','grapefruit']\n", 373 | "\n", 374 | " df = pd.DataFrame(dict(words=words))\n", 375 | " 
df.to_csv(self.output().path, index=False)\n", 376 | " \n", 377 | "class Count(luigi.Task):\n", 378 | " date = luigi.DateParameter(default=datetime.date.today())\n", 379 | " \n", 380 | " def requires(self):\n", 381 | " # Passing the luigi paramater back to upstream task\n", 382 | " return Words(self.date) \n", 383 | " \n", 384 | " def output(self):\n", 385 | " return luigi.LocalTarget(OUTPUT_PATH/f'count_{self.date}.csv')\n", 386 | "\n", 387 | " def run(self):\n", 388 | " df = pd.read_csv(self.input().path)\n", 389 | " df['letter_count'] = df.words.map(len)\n", 390 | " df.to_csv(self.output().path, index=False)\n", 391 | " \n", 392 | "luigi.build([Count()], local_scheduler=True)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 6, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "//Directory Tree:\n", 405 | "output/\n", 406 | "├── count_2020-10-26.csv\n", 407 | "└── words_2020-10-26.csv\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "#hide_input\n", 413 | "\n", 414 | "# Used to display the files for demonstration purposes\n", 415 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 416 | "print('//Directory Tree:')\n", 417 | "for path in paths:\n", 418 | " print(path.displayable())" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "Above, our Tasks ran successfully and saved the outputs in our output directory. So what happens if we were to run it a second time?" 
426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 7, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "name": "stderr", 435 | "output_type": "stream", 436 | "text": [ 437 | "DEBUG: Checking if Count(date=2020-10-26) is complete\n", 438 | "INFO: Informed scheduler that task Count_2020_10_26_424115e443 has status DONE\n", 439 | "INFO: Done scheduling tasks\n", 440 | "INFO: Running Worker with 1 processes\n", 441 | "DEBUG: Asking scheduler for work...\n", 442 | "DEBUG: Done\n", 443 | "DEBUG: There are no more tasks to run at this time\n", 444 | "INFO: Worker Worker(salt=832748578, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 445 | "INFO: \n", 446 | "===== Luigi Execution Summary =====\n", 447 | "\n", 448 | "Scheduled 1 tasks of which:\n", 449 | "* 1 complete ones were encountered:\n", 450 | " - 1 Count(date=2020-10-26)\n", 451 | "\n", 452 | "Did not run any tasks\n", 453 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 454 | "\n", 455 | "===== Luigi Execution Summary =====\n", 456 | "\n" 457 | ] 458 | }, 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "True" 463 | ] 464 | }, 465 | "execution_count": 7, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "luigi.build([Count()], local_scheduler=True)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 8, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "//Directory Tree:\n", 484 | "output/\n", 485 | "├── count_2020-10-26.csv\n", 486 | "└── words_2020-10-26.csv\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "#hide_input\n", 492 | "\n", 493 | "# Used to display the files for demonstration purposes\n", 494 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 495 | "print('//Directory Tree:')\n", 496 | "for path in paths:\n", 497 | 
" print(path.displayable())" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "As you can see, nothing happened since the `Count` task encountered an output that already existed with the same name. Below we'll provide a different date to the `Count` task." 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 9, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "name": "stderr", 514 | "output_type": "stream", 515 | "text": [ 516 | "DEBUG: Checking if Count(date=2021-10-25) is complete\n", 517 | "DEBUG: Checking if Words(date=2021-10-25) is complete\n", 518 | "INFO: Informed scheduler that task Count_2021_10_25_8a7563aba6 has status PENDING\n", 519 | "INFO: Informed scheduler that task Words_2021_10_25_8a7563aba6 has status PENDING\n", 520 | "INFO: Done scheduling tasks\n", 521 | "INFO: Running Worker with 1 processes\n", 522 | "DEBUG: Asking scheduler for work...\n", 523 | "DEBUG: Pending tasks: 2\n", 524 | "INFO: [pid 2456] Worker Worker(salt=766530749, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Words(date=2021-10-25)\n", 525 | "INFO: [pid 2456] Worker Worker(salt=766530749, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Words(date=2021-10-25)\n", 526 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 527 | "INFO: Informed scheduler that task Words_2021_10_25_8a7563aba6 has status DONE\n", 528 | "DEBUG: Asking scheduler for work...\n", 529 | "DEBUG: Pending tasks: 1\n", 530 | "INFO: [pid 2456] Worker Worker(salt=766530749, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running Count(date=2021-10-25)\n", 531 | "INFO: [pid 2456] Worker Worker(salt=766530749, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done Count(date=2021-10-25)\n", 532 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 533 | "INFO: Informed scheduler that task Count_2021_10_25_8a7563aba6 has status DONE\n", 534 | "DEBUG: Asking 
scheduler for work...\n", 535 | "DEBUG: Done\n", 536 | "DEBUG: There are no more tasks to run at this time\n", 537 | "INFO: Worker Worker(salt=766530749, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 538 | "INFO: \n", 539 | "===== Luigi Execution Summary =====\n", 540 | "\n", 541 | "Scheduled 2 tasks of which:\n", 542 | "* 2 ran successfully:\n", 543 | " - 1 Count(date=2021-10-25)\n", 544 | " - 1 Words(date=2021-10-25)\n", 545 | "\n", 546 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 547 | "\n", 548 | "===== Luigi Execution Summary =====\n", 549 | "\n" 550 | ] 551 | }, 552 | { 553 | "data": { 554 | "text/plain": [ 555 | "True" 556 | ] 557 | }, 558 | "execution_count": 9, 559 | "metadata": {}, 560 | "output_type": "execute_result" 561 | } 562 | ], 563 | "source": [ 564 | "luigi.build([Count(date=pd.to_datetime('10/25/2021'))], local_scheduler=True)" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 10, 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | "//Directory Tree:\n", 577 | "output/\n", 578 | "├── count_2020-10-26.csv\n", 579 | "├── count_2021-10-25.csv\n", 580 | "├── words_2020-10-26.csv\n", 581 | "└── words_2021-10-25.csv\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "#hide_input\n", 587 | "\n", 588 | "# Used to display the files for demonstration purposes\n", 589 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 590 | "print('//Directory Tree:')\n", 591 | "for path in paths:\n", 592 | " print(path.displayable())" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 11, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "#hide\n", 602 | "\n", 603 | "# Remove Files\n", 604 | "for file in OUTPUT_PATH.glob('*'):\n", 605 | " file.unlink()" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | 
"metadata": {}, 611 | "source": [ 612 | "## External Tasks" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "If your pipeline starts with some dependency from an External Task, you can utilize the `ExternalTask` object. The `External Task` is the same as the `Task` object except it doesn't have a `run()` method.\n", 620 | "\n", 621 | "`External Task` is useful because it allows for your task to gracefully end a job if the external source criteria are not met." 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 12, 627 | "metadata": {}, 628 | "outputs": [ 629 | { 630 | "name": "stderr", 631 | "output_type": "stream", 632 | "text": [ 633 | "DEBUG: Checking if Words() is complete\n", 634 | "WARNING: Data for Words() does not exist (yet?). The task is an external data dependency, so it cannot be run from this luigi process.\n", 635 | "INFO: Informed scheduler that task Words__99914b932b has status PENDING\n", 636 | "INFO: Done scheduling tasks\n", 637 | "INFO: Running Worker with 1 processes\n", 638 | "DEBUG: Asking scheduler for work...\n", 639 | "DEBUG: Done\n", 640 | "DEBUG: There are no more tasks to run at this time\n", 641 | "INFO: Worker Worker(salt=806349904, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 642 | "INFO: \n", 643 | "===== Luigi Execution Summary =====\n", 644 | "\n", 645 | "Scheduled 1 tasks of which:\n", 646 | "* 1 were left pending, among these:\n", 647 | " * 1 were missing external dependencies:\n", 648 | " - 1 Words()\n", 649 | "\n", 650 | "Did not run any tasks\n", 651 | "This progress looks :| because there were missing external dependencies\n", 652 | "\n", 653 | "===== Luigi Execution Summary =====\n", 654 | "\n" 655 | ] 656 | }, 657 | { 658 | "data": { 659 | "text/plain": [ 660 | "True" 661 | ] 662 | }, 663 | "execution_count": 12, 664 | "metadata": {}, 665 | "output_type": "execute_result" 666 | } 667 | ], 668 | "source": [ 669 | "class Words(luigi.ExternalTask):\n", 670 | " def output(self):\n", 671 | " return luigi.LocalTarget(OUTPUT_PATH/f'words.csv')\n", 672 | "\n", 673 | "luigi.build([Words()], local_scheduler=True)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "Above, the `Words` external task did not run because `words.csv`, the external dependency, was missing." 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 13, 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [ 689 | "OUTPUT_PATH = Path('output')\n", 690 | "\n", 691 | "words = ['apple','banana','grapefruit']\n", 692 | "\n", 693 | "df = pd.DataFrame(dict(words=words))\n", 694 | "df.to_csv(OUTPUT_PATH/'words.csv', index=False)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Now that we created `words.csv` our external task will return as completed and pass its output to the next Task if it exists." 
702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 14, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "name": "stderr", 711 | "output_type": "stream", 712 | "text": [ 713 | "DEBUG: Checking if Words() is complete\n", 714 | "INFO: Informed scheduler that task Words__99914b932b has status DONE\n", 715 | "INFO: Done scheduling tasks\n", 716 | "INFO: Running Worker with 1 processes\n", 717 | "DEBUG: Asking scheduler for work...\n", 718 | "DEBUG: Done\n", 719 | "DEBUG: There are no more tasks to run at this time\n", 720 | "INFO: Worker Worker(salt=781673351, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 721 | "INFO: \n", 722 | "===== Luigi Execution Summary =====\n", 723 | "\n", 724 | "Scheduled 1 tasks of which:\n", 725 | "* 1 complete ones were encountered:\n", 726 | " - 1 Words()\n", 727 | "\n", 728 | "Did not run any tasks\n", 729 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 730 | "\n", 731 | "===== Luigi Execution Summary =====\n", 732 | "\n" 733 | ] 734 | }, 735 | { 736 | "data": { 737 | "text/plain": [ 738 | "True" 739 | ] 740 | }, 741 | "execution_count": 14, 742 | "metadata": {}, 743 | "output_type": "execute_result" 744 | } 745 | ], 746 | "source": [ 747 | "luigi.build([Words()], local_scheduler=True)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "## Alternate Complete Method" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": {}, 760 | "source": [ 761 | "As I mentioned earlier, by default Luigi determines if a Task is complete by checking if the output exists. However, there is a common use case in pipeline workflows where Tasks should be run when a file is updated. 
Since Luigi only checks the output name it will determine that a Task is completed no matter how many times a file gets updated.\n", 762 | "\n", 763 | "However, we can override this method by overriding the `complete()` method in the `Task` object by defining your criteria. It needs to return `False` if the task is not complete and `True` if the task is complete.\n", 764 | "\n", 765 | "Below we will create our own `complete()` function that will update all tasks where their dependent task output files have been updated. It will first check to see if the dependency tasks have been completed, then check to see if the modified time of the output of the current task is greater than the modified time of the prior task's output." 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 20, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [ 774 | "import os\n", 775 | "import time\n", 776 | "\n", 777 | "class Words(luigi.ExternalTask):\n", 778 | " def output(self):\n", 779 | " return luigi.LocalTarget(OUTPUT_PATH/'words.csv')\n", 780 | " \n", 781 | "class CountLetters(luigi.Task):\n", 782 | "\n", 783 | " def requires(self):\n", 784 | " return Words()\n", 785 | "\n", 786 | " # Custom Complete Method\n", 787 | " def complete(self): \n", 788 | " if not self.output().exists():\n", 789 | " print('//Count Letters: No Output File')\n", 790 | " return False\n", 791 | "\n", 792 | " input_mtime = time.ctime(os.path.getmtime(self.input().path))\n", 793 | " output_mtime = time.ctime(os.path.getmtime(self.output().path)) \n", 794 | " \n", 795 | " if output_mtime < input_mtime:\n", 796 | " print('//Count Letters: File Out of Date')\n", 797 | " return False\n", 798 | " \n", 799 | " print('//Count Letters: Task is Complete')\n", 800 | " return True\n", 801 | "\n", 802 | " \n", 803 | " def output(self):\n", 804 | " return luigi.LocalTarget(OUTPUT_PATH/'count_letters.csv')\n", 805 | "\n", 806 | " def run(self):\n", 807 | " df = 
pd.read_csv(self.input().path)\n", 808 | " df['letter_count'] = df.words.map(len)\n", 809 | " df.to_csv(self.output().path, index=False)" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 16, 815 | "metadata": {}, 816 | "outputs": [ 817 | { 818 | "name": "stderr", 819 | "output_type": "stream", 820 | "text": [ 821 | "DEBUG: Checking if CountLetters() is complete\n", 822 | "DEBUG: Checking if Words() is complete\n", 823 | "INFO: Informed scheduler that task CountLetters__99914b932b has status PENDING\n", 824 | "INFO: Informed scheduler that task Words__99914b932b has status DONE\n", 825 | "INFO: Done scheduling tasks\n", 826 | "INFO: Running Worker with 1 processes\n", 827 | "DEBUG: Asking scheduler for work...\n", 828 | "DEBUG: Pending tasks: 1\n", 829 | "INFO: [pid 2456] Worker Worker(salt=437114638, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running CountLetters()\n", 830 | "INFO: [pid 2456] Worker Worker(salt=437114638, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done CountLetters()\n", 831 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 832 | "INFO: Informed scheduler that task CountLetters__99914b932b has status DONE\n", 833 | "DEBUG: Asking scheduler for work...\n", 834 | "DEBUG: Done\n", 835 | "DEBUG: There are no more tasks to run at this time\n", 836 | "INFO: Worker Worker(salt=437114638, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 837 | "INFO: \n", 838 | "===== Luigi Execution Summary =====\n", 839 | "\n", 840 | "Scheduled 2 tasks of which:\n", 841 | "* 1 complete ones were encountered:\n", 842 | " - 1 Words()\n", 843 | "* 1 ran successfully:\n", 844 | " - 1 CountLetters()\n", 845 | "\n", 846 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 847 | "\n", 848 | "===== Luigi Execution Summary =====\n", 849 | "\n" 850 | ] 851 | }, 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | "//Count Letters: No Output File\n" 857 | ] 858 | }, 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "True" 863 | ] 864 | }, 865 | "execution_count": 16, 866 | "metadata": {}, 867 | "output_type": "execute_result" 868 | } 869 | ], 870 | "source": [ 871 | "luigi.build([CountLetters()], local_scheduler=True)" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 17, 877 | "metadata": {}, 878 | "outputs": [ 879 | { 880 | "name": "stdout", 881 | "output_type": "stream", 882 | "text": [ 883 | "//Directory Tree:\n", 884 | "output/\n", 885 | "├── count_letters.csv\n", 886 | "└── words.csv\n" 887 | ] 888 | } 889 | ], 890 | "source": [ 891 | "#hide_input\n", 892 | "\n", 893 | "# Used to display the files for demonstration purposes\n", 894 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 895 | "print('//Directory Tree:')\n", 896 | "for path in paths:\n", 897 | " print(path.displayable())" 898 | ] 899 | }, 900 | { 901 | "cell_type": "markdown", 902 | "metadata": {}, 903 | "source": [ 904 | "When we run the task again it doesn't run any tasks because `words.csv` exists and its modified time (mtime) is less than `count_letters.csv` modified time." 
905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 18, 910 | "metadata": {}, 911 | "outputs": [ 912 | { 913 | "name": "stderr", 914 | "output_type": "stream", 915 | "text": [ 916 | "DEBUG: Checking if CountLetters() is complete\n", 917 | "INFO: Informed scheduler that task CountLetters__99914b932b has status DONE\n", 918 | "INFO: Done scheduling tasks\n", 919 | "INFO: Running Worker with 1 processes\n", 920 | "DEBUG: Asking scheduler for work...\n", 921 | "DEBUG: Done\n", 922 | "DEBUG: There are no more tasks to run at this time\n", 923 | "INFO: Worker Worker(salt=438850535, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. Shutting down Keep-Alive thread\n", 924 | "INFO: \n", 925 | "===== Luigi Execution Summary =====\n", 926 | "\n", 927 | "Scheduled 1 tasks of which:\n", 928 | "* 1 complete ones were encountered:\n", 929 | " - 1 CountLetters()\n", 930 | "\n", 931 | "Did not run any tasks\n", 932 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 933 | "\n", 934 | "===== Luigi Execution Summary =====\n", 935 | "\n" 936 | ] 937 | }, 938 | { 939 | "name": "stdout", 940 | "output_type": "stream", 941 | "text": [ 942 | "//Count Letters: Task is Complete\n" 943 | ] 944 | }, 945 | { 946 | "data": { 947 | "text/plain": [ 948 | "True" 949 | ] 950 | }, 951 | "execution_count": 18, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "luigi.build([CountLetters()], local_scheduler=True)" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "We will now update `words.csv` to include a few more words and watch Luigi re-run the Task because of the updated modification times on the files."
965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": 19, 970 | "metadata": {}, 971 | "outputs": [ 972 | { 973 | "name": "stderr", 974 | "output_type": "stream", 975 | "text": [ 976 | "DEBUG: Checking if CountLetters() is complete\n", 977 | "DEBUG: Checking if Words() is complete\n", 978 | "INFO: Informed scheduler that task CountLetters__99914b932b has status PENDING\n", 979 | "INFO: Informed scheduler that task Words__99914b932b has status DONE\n", 980 | "INFO: Done scheduling tasks\n", 981 | "INFO: Running Worker with 1 processes\n", 982 | "DEBUG: Asking scheduler for work...\n", 983 | "DEBUG: Pending tasks: 1\n", 984 | "INFO: [pid 2456] Worker Worker(salt=695141065, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running CountLetters()\n", 985 | "INFO: [pid 2456] Worker Worker(salt=695141065, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done CountLetters()\n", 986 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 987 | "INFO: Informed scheduler that task CountLetters__99914b932b has status DONE\n", 988 | "DEBUG: Asking scheduler for work...\n", 989 | "DEBUG: Done\n", 990 | "DEBUG: There are no more tasks to run at this time\n", 991 | "INFO: Worker Worker(salt=695141065, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 992 | "INFO: \n", 993 | "===== Luigi Execution Summary =====\n", 994 | "\n", 995 | "Scheduled 2 tasks of which:\n", 996 | "* 1 complete ones were encountered:\n", 997 | " - 1 Words()\n", 998 | "* 1 ran successfully:\n", 999 | " - 1 CountLetters()\n", 1000 | "\n", 1001 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 1002 | "\n", 1003 | "===== Luigi Execution Summary =====\n", 1004 | "\n" 1005 | ] 1006 | }, 1007 | { 1008 | "name": "stdout", 1009 | "output_type": "stream", 1010 | "text": [ 1011 | "//Count Letters: File Out of Date\n" 1012 | ] 1013 | }, 1014 | { 1015 | "data": { 1016 | "text/plain": [ 1017 | "True" 1018 | ] 1019 | }, 1020 | "execution_count": 19, 1021 | "metadata": {}, 1022 | "output_type": "execute_result" 1023 | } 1024 | ], 1025 | "source": [ 1026 | "words = ['apple','banana','grapefruit', 'cherry', 'orange']\n", 1027 | "\n", 1028 | "df = pd.DataFrame(dict(words=words))\n", 1029 | "df.to_csv(OUTPUT_PATH/'words.csv', index=False)\n", 1030 | "\n", 1031 | "luigi.build([CountLetters()], local_scheduler=True)" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "markdown", 1036 | "metadata": {}, 1037 | "source": [ 1038 | "## SQL Tasks" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "markdown", 1043 | "metadata": {}, 1044 | "source": [ 1045 | "> Words $\\longleftarrow$ CountLetters $\\longleftarrow$ StoreSQL $\\longleftarrow$ PrintSQL\n", 1046 | "\n", 1047 | "SQL is a common step in many pipelines but the Luigi getting started docs barely cover the topic. In this section we will create two new tasks. The first task, `StoreSQL`, will take the output from `CountLetters` and store it in a `SQLite` database. The second task, `PrintSQL`, will then read from our database and print both tables that Luigi created."
1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "markdown", 1052 | "metadata": {}, 1053 | "source": [ 1054 | "The big difference from `LocalTarget` is that `SQLAlchemyTarget` creates and updates a \"Marker Table\" to keep track of whether a task is complete or not. You provide the Marker Table with an `update_id` and Luigi will check if it exists before running the Task.\n", 1055 | "\n", 1056 | "Below I've provided the code for our `StoreSQL` and `PrintSQL` tasks. There are a couple of things worth noting.\n", 1057 | "\n", 1058 | "* We have overridden the `complete()` method to check if the prior task `CountLetters` has been completed and if the `StoreSQL` task output exists. If either returns `False` the Task will run.\n", 1059 | "* We are creating an SQLite database called `my.db`.\n", 1060 | "* `self.output().touch()` is what marks the Task as complete and creates/updates the Marker Table.\n", 1061 | "* `PrintSQL`'s `complete()` method always returns `False` so that it always runs for demonstration purposes."
1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": 21, 1067 | "metadata": {}, 1068 | "outputs": [ 1069 | { 1070 | "name": "stdout", 1071 | "output_type": "stream", 1072 | "text": [ 1073 | "//Directory Tree:\n", 1074 | "output/\n", 1075 | "├── count_letters.csv\n", 1076 | "└── words.csv\n" 1077 | ] 1078 | } 1079 | ], 1080 | "source": [ 1081 | "#hide_input\n", 1082 | "\n", 1083 | "# Used to display the files for demonstration purposes\n", 1084 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 1085 | "print('//Directory Tree:')\n", 1086 | "for path in paths:\n", 1087 | " print(path.displayable())" 1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "code", 1092 | "execution_count": 22, 1093 | "metadata": {}, 1094 | "outputs": [], 1095 | "source": [ 1096 | "from luigi.contrib import sqla\n", 1097 | "from sqlalchemy import create_engine\n", 1098 | "\n", 1099 | "OUTPUT_PATH = Path('output')\n", 1100 | "connection_string = f\"sqlite:///{OUTPUT_PATH}/my.db\"\n", 1101 | "\n", 1102 | "outputs = []\n", 1103 | "\n", 1104 | "class StoreSQL(luigi.Task):\n", 1105 | " connection_string = luigi.Parameter()\n", 1106 | " target_table = luigi.Parameter()\n", 1107 | " \n", 1108 | " @property\n", 1109 | " def update_id(self):\n", 1110 | " mtime = os.path.getmtime(self.input().path)\n", 1111 | " mtime = datetime.datetime.fromtimestamp(mtime).strftime(\"%Y-%m-%d %H:%M:%S\")\n", 1112 | " return mtime + '_' + self.target_table\n", 1113 | "\n", 1114 | " def complete(self):\n", 1115 | " \n", 1116 | " if not self.output().exists():\n", 1117 | " return False\n", 1118 | " \n", 1119 | " if not self.requires().complete():\n", 1120 | " return False\n", 1121 | " \n", 1122 | " return True\n", 1123 | " \n", 1124 | " def requires(self):\n", 1125 | " return CountLetters()\n", 1126 | "\n", 1127 | " def output(self):\n", 1128 | " return sqla.SQLAlchemyTarget(\n", 1129 | " connection_string=self.connection_string,\n", 1130 | " target_table=self.target_table,\n", 1131 | " 
update_id=self.update_id\n", 1132 | " )\n", 1133 | "\n", 1134 | " def run(self):\n", 1135 | " self.requires().complete()\n", 1136 | " con = self.output().engine\n", 1137 | " df = pd.read_csv(self.input().path)\n", 1138 | " df.to_sql(name=self.target_table, con=con, if_exists='replace')\n", 1139 | "\n", 1140 | " # Update Marker Table\n", 1141 | " self.output().touch()\n", 1142 | "\n", 1143 | "\n", 1144 | "class PrintSQL(luigi.Task):\n", 1145 | " connection_string = luigi.Parameter()\n", 1146 | " target_table = luigi.Parameter()\n", 1147 | " \n", 1148 | " def requires(self):\n", 1149 | " return StoreSQL(self.connection_string, self.target_table)\n", 1150 | "\n", 1151 | " def complete(self):\n", 1152 | " return False\n", 1153 | "\n", 1154 | " def output(self):\n", 1155 | " pass\n", 1156 | "\n", 1157 | " def run(self):\n", 1158 | " input = self.input()\n", 1159 | " con = input.engine\n", 1160 | " table = input.target_table\n", 1161 | " \n", 1162 | " print('// Letter Count Table')\n", 1163 | " print(pd.read_sql(sql=table, con=con), end='\\n\\n')\n", 1164 | " print('// Marker Table')\n", 1165 | " print(pd.read_sql(sql='table_updates', con=con))" 1166 | ] 1167 | }, 1168 | { 1169 | "cell_type": "code", 1170 | "execution_count": 23, 1171 | "metadata": {}, 1172 | "outputs": [ 1173 | { 1174 | "name": "stderr", 1175 | "output_type": "stream", 1176 | "text": [ 1177 | "DEBUG: Checking if PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1178 | "DEBUG: Checking if StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1179 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status PENDING\n", 1180 | "DEBUG: Checking if CountLetters() is complete\n", 1181 | "INFO: Informed scheduler that task StoreSQL_sqlite____output_letter_count_4c5210e673 has status PENDING\n", 1182 | "INFO: Informed scheduler that task CountLetters__99914b932b has status DONE\n", 1183 | 
"INFO: Done scheduling tasks\n", 1184 | "INFO: Running Worker with 1 processes\n", 1185 | "DEBUG: Asking scheduler for work...\n", 1186 | "DEBUG: Pending tasks: 2\n", 1187 | "INFO: [pid 2456] Worker Worker(salt=382729348, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1188 | "INFO: [pid 2456] Worker Worker(salt=382729348, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1189 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 1190 | "INFO: Informed scheduler that task StoreSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1191 | "DEBUG: Asking scheduler for work...\n", 1192 | "DEBUG: Pending tasks: 1\n", 1193 | "INFO: [pid 2456] Worker Worker(salt=382729348, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1194 | "INFO: [pid 2456] Worker Worker(salt=382729348, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1195 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 1196 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1197 | "DEBUG: Asking scheduler for work...\n", 1198 | "DEBUG: Done\n", 1199 | "DEBUG: There are no more tasks to run at this time\n", 1200 | "INFO: Worker Worker(salt=382729348, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 1201 | "INFO: \n", 1202 | "===== Luigi Execution Summary =====\n", 1203 | "\n", 1204 | "Scheduled 3 tasks of which:\n", 1205 | "* 1 complete ones were encountered:\n", 1206 | " - 1 CountLetters()\n", 1207 | "* 2 ran successfully:\n", 1208 | " - 1 PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1209 | " - 1 StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1210 | "\n", 1211 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 1212 | "\n", 1213 | "===== Luigi Execution Summary =====\n", 1214 | "\n" 1215 | ] 1216 | }, 1217 | { 1218 | "name": "stdout", 1219 | "output_type": "stream", 1220 | "text": [ 1221 | "// Letter Count Table\n", 1222 | " index words letter_count\n", 1223 | "0 0 apple 5\n", 1224 | "1 1 banana 6\n", 1225 | "2 2 grapefruit 10\n", 1226 | "3 3 cherry 6\n", 1227 | "4 4 orange 6\n", 1228 | "\n", 1229 | "// Marker Table\n", 1230 | " update_id target_table inserted\n", 1231 | "0 2020-10-26 17:09:56_letter_count letter_count 2020-10-26 17:10:30.763616\n" 1232 | ] 1233 | }, 1234 | { 1235 | "data": { 1236 | "text/plain": [ 1237 | "True" 1238 | ] 1239 | }, 1240 | "execution_count": 23, 1241 | "metadata": {}, 1242 | "output_type": "execute_result" 1243 | } 1244 | ], 1245 | "source": [ 1246 | "luigi.build([PrintSQL(connection_string, target_table='letter_count')], local_scheduler=True)" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": 24, 1252 | "metadata": {}, 1253 | "outputs": [ 1254 | { 1255 | "name": "stdout", 1256 | "output_type": "stream", 1257 | "text": [ 1258 | "Output Directory Tree:\n", 1259 | "output/\n", 1260 | "├── count_letters.csv\n", 1261 | "├── my.db\n", 1262 | "└── words.csv\n" 1263 | ] 1264 | } 1265 | ], 1266 | "source": [ 1267 | "# Used to display the files for demonstration purposes\n", 1268 | "paths = DisplayablePath.make_tree(OUTPUT_PATH)\n", 1269 | "print('Output 
Directory Tree:')\n", 1270 | "for path in paths:\n", 1271 | " print(path.displayable())" 1272 | ] 1273 | }, 1274 | { 1275 | "cell_type": "markdown", 1276 | "metadata": {}, 1277 | "source": [ 1278 | "As you can see from above, our SQL Task has updated two tables and printed out the table results. If you look at the Marker table `update_id` column you'll notice it is the concatenation of our `count_letters.csv` mtime and target table name.\n", 1279 | "\n", 1280 | "Now let's update our `words.csv` and see what happens when we run the task again." 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": 25, 1286 | "metadata": {}, 1287 | "outputs": [ 1288 | { 1289 | "name": "stderr", 1290 | "output_type": "stream", 1291 | "text": [ 1292 | "DEBUG: Checking if PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1293 | "DEBUG: Checking if StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1294 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status PENDING\n", 1295 | "DEBUG: Checking if CountLetters() is complete\n", 1296 | "INFO: Informed scheduler that task StoreSQL_sqlite____output_letter_count_4c5210e673 has status PENDING\n", 1297 | "DEBUG: Checking if Words() is complete\n", 1298 | "INFO: Informed scheduler that task CountLetters__99914b932b has status PENDING\n", 1299 | "INFO: Informed scheduler that task Words__99914b932b has status DONE\n", 1300 | "INFO: Done scheduling tasks\n", 1301 | "INFO: Running Worker with 1 processes\n", 1302 | "DEBUG: Asking scheduler for work...\n", 1303 | "DEBUG: Pending tasks: 3\n", 1304 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running CountLetters()\n", 1305 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done CountLetters()\n", 1306 | "DEBUG: 1 running tasks, 
waiting for next task to finish\n", 1307 | "INFO: Informed scheduler that task CountLetters__99914b932b has status DONE\n", 1308 | "DEBUG: Asking scheduler for work...\n", 1309 | "DEBUG: Pending tasks: 2\n", 1310 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1311 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1312 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 1313 | "INFO: Informed scheduler that task StoreSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1314 | "DEBUG: Asking scheduler for work...\n", 1315 | "DEBUG: Pending tasks: 1\n", 1316 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1317 | "INFO: [pid 2456] Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1318 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 1319 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1320 | "DEBUG: Asking scheduler for work...\n", 1321 | "DEBUG: Done\n", 1322 | "DEBUG: There are no more tasks to run at this time\n", 1323 | "INFO: Worker Worker(salt=051857807, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 1324 | "INFO: \n", 1325 | "===== Luigi Execution Summary =====\n", 1326 | "\n", 1327 | "Scheduled 4 tasks of which:\n", 1328 | "* 1 complete ones were encountered:\n", 1329 | " - 1 Words()\n", 1330 | "* 3 ran successfully:\n", 1331 | " - 1 CountLetters()\n", 1332 | " - 1 PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1333 | " - 1 StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1334 | "\n", 1335 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 1336 | "\n", 1337 | "===== Luigi Execution Summary =====\n", 1338 | "\n" 1339 | ] 1340 | }, 1341 | { 1342 | "name": "stdout", 1343 | "output_type": "stream", 1344 | "text": [ 1345 | "// Letter Count Table\n", 1346 | " index words letter_count\n", 1347 | "0 0 apple 5\n", 1348 | "1 1 banana 6\n", 1349 | "2 2 grapefruit 10\n", 1350 | "3 3 cherry 6\n", 1351 | "4 4 orange 6\n", 1352 | "5 5 peach 5\n", 1353 | "6 6 strawberry 10\n", 1354 | "\n", 1355 | "// Marker Table\n", 1356 | " update_id target_table inserted\n", 1357 | "0 2020-10-26 17:09:56_letter_count letter_count 2020-10-26 17:10:30.763616\n", 1358 | "1 2020-10-26 17:10:34_letter_count letter_count 2020-10-26 17:10:34.734229\n" 1359 | ] 1360 | }, 1361 | { 1362 | "data": { 1363 | "text/plain": [ 1364 | "True" 1365 | ] 1366 | }, 1367 | "execution_count": 25, 1368 | "metadata": {}, 1369 | "output_type": "execute_result" 1370 | } 1371 | ], 1372 | "source": [ 1373 | "words = ['apple','banana','grapefruit', 'cherry', 'orange', 'peach', 'strawberry']\n", 1374 | "\n", 1375 | "df = pd.DataFrame(dict(words=words))\n", 1376 | "df.to_csv(OUTPUT_PATH/'words.csv', index=False)\n", 1377 | "luigi.build([PrintSQL(connection_string, target_table='letter_count')], local_scheduler=True)" 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "markdown", 1382 | "metadata": {}, 1383 | "source": [ 1384 | "As expected, our Letter Count table updated and the 
Marker Table contains a new row to represent the task completing.\n", 1385 | "\n", 1386 | "Let's run the task one more time without updating `words.csv`." 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": 26, 1392 | "metadata": {}, 1393 | "outputs": [ 1394 | { 1395 | "name": "stderr", 1396 | "output_type": "stream", 1397 | "text": [ 1398 | "DEBUG: Checking if PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1399 | "DEBUG: Checking if StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count) is complete\n", 1400 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status PENDING\n", 1401 | "INFO: Informed scheduler that task StoreSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1402 | "INFO: Done scheduling tasks\n", 1403 | "INFO: Running Worker with 1 processes\n", 1404 | "DEBUG: Asking scheduler for work...\n", 1405 | "DEBUG: Pending tasks: 1\n", 1406 | "INFO: [pid 2456] Worker Worker(salt=023539738, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) running PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1407 | "INFO: [pid 2456] Worker Worker(salt=023539738, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) done PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1408 | "DEBUG: 1 running tasks, waiting for next task to finish\n", 1409 | "INFO: Informed scheduler that task PrintSQL_sqlite____output_letter_count_4c5210e673 has status DONE\n", 1410 | "DEBUG: Asking scheduler for work...\n", 1411 | "DEBUG: Done\n", 1412 | "DEBUG: There are no more tasks to run at this time\n", 1413 | "INFO: Worker Worker(salt=023539738, workers=1, host=DESKTOP-BCU4BGH, username=Mike, pid=2456) was stopped. 
Shutting down Keep-Alive thread\n", 1414 | "INFO: \n", 1415 | "===== Luigi Execution Summary =====\n", 1416 | "\n", 1417 | "Scheduled 2 tasks of which:\n", 1418 | "* 1 complete ones were encountered:\n", 1419 | " - 1 StoreSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1420 | "* 1 ran successfully:\n", 1421 | " - 1 PrintSQL(connection_string=sqlite:///output/my.db, target_table=letter_count)\n", 1422 | "\n", 1423 | "This progress looks :) because there were no failed tasks or missing dependencies\n", 1424 | "\n", 1425 | "===== Luigi Execution Summary =====\n", 1426 | "\n" 1427 | ] 1428 | }, 1429 | { 1430 | "name": "stdout", 1431 | "output_type": "stream", 1432 | "text": [ 1433 | "// Letter Count Table\n", 1434 | " index words letter_count\n", 1435 | "0 0 apple 5\n", 1436 | "1 1 banana 6\n", 1437 | "2 2 grapefruit 10\n", 1438 | "3 3 cherry 6\n", 1439 | "4 4 orange 6\n", 1440 | "5 5 peach 5\n", 1441 | "6 6 strawberry 10\n", 1442 | "\n", 1443 | "// Marker Table\n", 1444 | " update_id target_table inserted\n", 1445 | "0 2020-10-26 17:09:56_letter_count letter_count 2020-10-26 17:10:30.763616\n", 1446 | "1 2020-10-26 17:10:34_letter_count letter_count 2020-10-26 17:10:34.734229\n" 1447 | ] 1448 | }, 1449 | { 1450 | "data": { 1451 | "text/plain": [ 1452 | "True" 1453 | ] 1454 | }, 1455 | "execution_count": 26, 1456 | "metadata": {}, 1457 | "output_type": "execute_result" 1458 | } 1459 | ], 1460 | "source": [ 1461 | "luigi.build([PrintSQL(connection_string, target_table='letter_count')], local_scheduler=True)" 1462 | ] 1463 | }, 1464 | { 1465 | "cell_type": "markdown", 1466 | "metadata": {}, 1467 | "source": [ 1468 | "The only task that ran was the `PrintSQL` task, and our other task(s) didn't run. The Marker Table was also not updated." 
1469 | ] 1470 | }, 1471 | { 1472 | "cell_type": "code", 1473 | "execution_count": 27, 1474 | "metadata": {}, 1475 | "outputs": [], 1476 | "source": [ 1477 | "#hide\n", 1478 | "\n", 1479 | "# Remove Files\n", 1480 | "for file in OUTPUT_PATH.glob('*'):\n", 1481 | " file.unlink()" 1482 | ] 1483 | }, 1484 | { 1485 | "cell_type": "markdown", 1486 | "metadata": {}, 1487 | "source": [ 1488 | "## Resources\n", 1489 | "- https://stackoverflow.com/questions/40407936/mysql-targets-in-luigi-workflow/40423427#40423427\n", 1490 | "- https://stackoverflow.com/questions/40707004/using-luigi-to-update-postgres-table\n", 1491 | "- https://stackoverflow.com/questions/28793832/can-luigi-rerun-tasks-when-the-task-dependencies-become-out-of-date\n", 1492 | "- https://luigi.readthedocs.io/en/stable/_modules/luigi/contrib/sqla.html\n", 1493 | "- https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python\n", 1494 | "- https://stackoverflow.com/questions/11349333/how-to-ignore-the-first-line-of-data-when-processing-csv-data\n", 1495 | "- https://stackoverflow.com/questions/35918605/how-to-delete-a-table-in-sqlalchemy\n", 1496 | "- https://stackoverflow.com/questions/11900553/sqlalchemy-table-already-exists\n", 1497 | "- https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python\n", 1498 | "- https://stackoverflow.com/questions/48509083/how-to-make-a-parameter-available-to-all-luigi-tasks" 1499 | ] 1500 | } 1501 | ], 1502 | "metadata": { 1503 | "kernelspec": { 1504 | "display_name": "Python 3", 1505 | "language": "python", 1506 | "name": "python3" 1507 | }, 1508 | "language_info": { 1509 | "codemirror_mode": { 1510 | "name": "ipython", 1511 | "version": 3 1512 | }, 1513 | "file_extension": ".py", 1514 | "mimetype": "text/x-python", 1515 | "name": "python", 1516 | "nbconvert_exporter": "python", 1517 | "pygments_lexer": "ipython3", 1518 | "version": "3.7.9" 1519 | }, 1520 | "varInspector": { 1521 | "cols": { 1522 | "lenName": 
16, 1523 | "lenType": 16, 1524 | "lenVar": 40 1525 | }, 1526 | "kernels_config": { 1527 | "python": { 1528 | "delete_cmd_postfix": "", 1529 | "delete_cmd_prefix": "del ", 1530 | "library": "var_list.py", 1531 | "varRefreshCmd": "print(var_dic_list())" 1532 | }, 1533 | "r": { 1534 | "delete_cmd_postfix": ") ", 1535 | "delete_cmd_prefix": "rm(", 1536 | "library": "var_list.r", 1537 | "varRefreshCmd": "cat(var_dic_list()) " 1538 | } 1539 | }, 1540 | "types_to_exclude": [ 1541 | "module", 1542 | "function", 1543 | "builtin_function_or_method", 1544 | "instance", 1545 | "_Feature" 1546 | ], 1547 | "window_display": false 1548 | } 1549 | }, 1550 | "nbformat": 4, 1551 | "nbformat_minor": 2 1552 | } --------------------------------------------------------------------------------