├── .coveragerc ├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── README.markdown ├── README.zh_CN.markdown ├── TODO.markdown ├── bin └── luiti ├── changelog.markdown ├── example_webui_run.py ├── install-dependencies.sh ├── luiti ├── __init__.py ├── daemon │ ├── __init__.py │ ├── graph.py │ ├── ptm.py │ ├── query_engine │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── create_task.py │ │ └── params.py │ ├── utils │ │ ├── __init__.py │ │ ├── cache.py │ │ ├── string.py │ │ ├── task_storage.py │ │ └── template.py │ └── web │ │ ├── __init__.py │ │ ├── assets.py │ │ ├── code_render.py │ │ ├── handlers.py │ │ └── server.py ├── java │ └── MultipleTextFiles.java ├── luigi_decorators │ ├── __init__.py │ ├── as_a_luiti_task.py │ ├── check_date_range.py │ ├── check_runtime_range.py │ ├── mr_local.py │ ├── multiple_text_files.py │ ├── persist_files.py │ ├── plug_packages.py │ └── ref_tasks.py ├── luigi_extensions │ ├── __init__.py │ ├── create_python_package.py │ ├── hadoop_ext.py │ ├── luigi_root_context.py │ ├── manage_decorators.py │ ├── parameter.py │ ├── root_task.py │ ├── task_base.py │ └── task_init.py ├── manager │ ├── __init__.py │ ├── active_packages.py │ ├── cli.py │ ├── config.py │ ├── dep.py │ ├── files.py │ ├── generate_from_templates.py │ ├── lazy_data.py │ ├── loader.py │ ├── package_map.py │ ├── sys_argv.py │ └── table.py ├── schedule │ ├── __init__.py │ └── sensor_schedule.py ├── task_templates │ ├── __init__.py │ ├── other │ │ ├── __init__.py │ │ ├── hive_task.py │ │ ├── mongo_import_task.py │ │ └── static_file.py │ └── time │ │ ├── __init__.py │ │ ├── task_biweekly.py │ │ ├── task_biweekly_hadoop.py │ │ ├── task_day.py │ │ ├── task_day_hadoop.py │ │ ├── task_hour.py │ │ ├── task_hour_hadoop.py │ │ ├── task_month.py │ │ ├── task_month_hadoop.py │ │ ├── task_quarter.py │ │ ├── task_quarter_hadoop.py │ │ ├── task_range.py │ │ ├── task_range_hadoop.py │ │ ├── task_week.py │ │ ├── task_week_hadoop.py │ │ ├── task_year.py │ │ └── task_year_hadoop.py ├── tests │ ├── 
__init__.py │ ├── mr_test_case.py │ └── setup_luiti_packages.py ├── utils │ ├── __init__.py │ ├── command_utils.py │ ├── compress_utils.py │ ├── date_utils.py │ ├── ext_utils.py │ ├── hdfs_utils.py │ ├── io_utils.py │ ├── math_utils.py │ ├── mr_utils.py │ ├── target_utils.py │ └── visualiser_env_template.py └── webui │ ├── INSTALL.markdown │ ├── assets │ ├── javascripts │ │ └── luiti.js │ ├── jsx │ │ └── luiti.jsx │ └── stylesheets │ │ └── luiti.css │ ├── bower.json │ └── index.html ├── requirements.txt ├── screenshots ├── README.markdown ├── luiti_code_show.png ├── luiti_webui_list.png └── luiti_webui_show.png ├── setup.py ├── tests ├── client.cfg ├── jsons_data │ └── mr_local.json ├── project_A │ ├── __init__.py │ └── luiti_tasks │ │ ├── __init__.py │ │ ├── __init_luiti.py │ │ ├── a_day.py │ │ ├── b_day.py │ │ ├── c_day.py │ │ ├── d_day.py │ │ ├── foobar_day.py │ │ ├── import_packages_day.py │ │ └── multiple_dependent_day.py ├── project_B │ ├── __init__.py │ └── luiti_tasks │ │ ├── __init__.py │ │ ├── __init_luiti.py │ │ └── h_day.py ├── test_daemon.py ├── test_luigi_decorators.py ├── test_main.py ├── test_manager.py ├── test_mr_test_case.py ├── test_schedule.py ├── test_task.py ├── test_task_templates.py ├── test_utils.py ├── webui_packages │ ├── README.markdown │ ├── luiti_clean │ │ ├── README.markdown │ │ ├── luiti_clean │ │ │ ├── __init__.py │ │ │ └── luiti_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── __init_luiti.py │ │ │ │ └── clean_web_log_day.py │ │ ├── setup.py │ │ └── tests │ │ │ └── test_main.py │ ├── luiti_dump │ │ ├── README.markdown │ │ ├── luiti_dump │ │ │ ├── __init__.py │ │ │ └── luiti_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── __init_luiti.py │ │ │ │ ├── dump_browser_map_day.py │ │ │ │ └── dump_web_log_day.py │ │ ├── setup.py │ │ └── tests │ │ │ └── test_main.py │ ├── luiti_middle │ │ ├── README.markdown │ │ ├── luiti_middle │ │ │ ├── __init__.py │ │ │ └── luiti_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── __init_luiti.py │ │ │ │ ├── 
counter_visitor_by_browser_day.py │ │ │ │ ├── counter_visitor_by_region_day.py │ │ │ │ └── counter_visitor_day.py │ │ ├── setup.py │ │ └── tests │ │ │ └── test_main.py │ ├── luiti_summary │ │ ├── README.markdown │ │ ├── luiti_summary │ │ │ ├── __init__.py │ │ │ └── luiti_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── __init_luiti.py │ │ │ │ └── beta_report_day.py │ │ ├── setup.py │ │ └── tests │ │ │ └── test_main.py │ └── luiti_webui_tests │ │ └── luiti_webui_tests │ │ └── __init__.py └── zip_package_by_luiti │ ├── setup.py │ └── zip_package_by_luiti │ ├── __init__.py │ └── subfold │ └── __init__.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/nose/* 5 | *__init__* 6 | */__init__.py 7 | */*/__init__.py 8 | luiti/utils/__init__.py 9 | tests/* 10 | */setup.py 11 | 12 | [run] 13 | parallel = True 14 | source = luiti 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | *.yml 56 | *.jar 57 | *.coverage.* 58 | 59 | bower_components 60 | 
node_modules 61 | .idea/ 62 | .DS_Store 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | env: 4 | global: 5 | - PIP_DOWNLOAD_CACHE=$HOME/.pip-cache 6 | matrix: 7 | - TOXENV=pep8 8 | - TOXENV=docs 9 | - TOXENV=py27-nonhdfs 10 | - TOXENV=py33-nonhdfs 11 | - TOXENV=py34-nonhdfs 12 | - TOXENV=py27-cdh 13 | - TOXENV=py33-cdh 14 | - TOXENV=py34-cdh 15 | 16 | sudo: false 17 | 18 | cache: 19 | - $HOME/.pip-cache 20 | 21 | install: 22 | - pip install coveralls 23 | - pip install tox 24 | 25 | before_script: 26 | # allow ssh loopback 27 | - ssh-keygen -t rsa -N '' -C '' -f ~/.ssh/id_rsa 28 | - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys 29 | - ssh -o StrictHostKeyChecking=no localhost true 30 | 31 | - ./install-dependencies.sh 32 | 33 | script: 34 | - nosetests 35 | - coverage run --source=luiti setup.py test 36 | 37 | after_failure: 38 | - cat /home/travis/build/luiti/luiti/.tox/cdh/log/cdh-1.log 39 | 40 | after_success: 41 | - coveralls 42 | 43 | branches: 44 | only: 45 | - master 46 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include changelog.markdown 2 | include README.markdown 3 | 4 | include luiti/java/*.java 5 | 6 | include luiti/webui/assets/*/** 7 | 8 | include luiti/webui/*.html 9 | recursive-include luiti/webui/bower_components * 10 | -------------------------------------------------------------------------------- /TODO.markdown: -------------------------------------------------------------------------------- 1 | 1. Separate MapReduce's requires, one is used to input, another is used 2 | to dict. 3 | 2. Clean /tmp/sjfljslfjs after package task related files into a tar. 4 | 3. Support without current package. 5 | 6 | 7 | ## WebUI 8 | 1. OPTIMIZE task dep infos. 
9 | 2. Add daemon tests. 10 | 3. Add webui tests. 11 | -------------------------------------------------------------------------------- /bin/luiti: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from luiti.manager import Cli 5 | 6 | cli = Cli(sys.argv) 7 | cli.run() 8 | -------------------------------------------------------------------------------- /changelog.markdown: -------------------------------------------------------------------------------- 1 | ### 0.2.2 - Nov 10, 2015 2 | * Add lots of test cases 3 | * Document wording 4 | 5 | ### 0.2.1 - July 15, 2015 6 | * Add SensorSchedule to wait for external tasks to finish lazily. 7 | * Add WebUI screenshots. 8 | * Lots of bug fixes. 9 | 10 | ### 0.2.0 - July 7, 2015 11 | * Add WebUI and daemon. 12 | * Lots of bug fixes and refactoring. 13 | 14 | ### 0.1.4 - May 10, 2015 15 | * Add English README 16 | 17 | ### 0.1.3 - April 20, 2015 18 | * All code conforms to PEP8 style. 19 | * Add @luigi.multiple_text_files decorator 20 | 21 | ### 0.1.2 - April 20, 2015 22 | * Project is more solid, add services such as travis, etc. 23 | 24 | ### 0.1.0 - March 24, 2015 25 | * Stable version, compatible with luigi==1.0.19 and snakebite==1.3.8, 26 | is already validated in a production environment. 27 | -------------------------------------------------------------------------------- /example_webui_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*-coding:utf-8-*- 3 | 4 | import os 5 | import sys 6 | 7 | try: 8 | # Try load installed version first. 
9 | import luiti 10 | luiti 11 | except: 12 | root_dir = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.insert(0, root_dir) 14 | 15 | import logging 16 | logger = logging.getLogger("luiti.server") 17 | 18 | # link webui_packages path 19 | from luiti.tests import SetupLuitiPackages 20 | config = SetupLuitiPackages.config 21 | from luiti.daemon import Server 22 | 23 | 24 | task_list_url = "http://localhost:8082/luiti/dag_visualiser?date_value=2015-07-09T00%3A00%3A00%2B08%3A00&language=English&luiti_package=luiti_summary&luiti_package=luiti_clean&luiti_package=luiti_dump&luiti_package=luiti_middle&luiti_package=project_A&luiti_package=project_B" 25 | task_show_url = "http://localhost:8082/luiti/dag_visualiser?date_value=2015-07-09T00%3A00%3A00%2B08%3A00&language=English&luiti_package=luiti_summary&luiti_package=luiti_clean&luiti_package=luiti_dump&luiti_package=luiti_middle&luiti_package=project_A&luiti_package=project_B&task_cls=BetaReportDay" 26 | 27 | # generated from http://www.network-science.de/ascii/ 28 | print "Welcome to luiti's test webui example!" 29 | print 30 | print " Open below two urls in your favourite browser." 31 | print 32 | print " task_list_url: ", task_list_url 33 | print " task_show_url: ", task_show_url 34 | print 35 | 36 | Server("localhost", 8082).run() 37 | -------------------------------------------------------------------------------- /install-dependencies.sh: -------------------------------------------------------------------------------- 1 | # Travis had already installed Node.js with npm. 2 | npm install bower -g 3 | cd luiti/webui; bower install; cd -; 4 | 5 | # Install eggs dependencies. 
6 | 7 | # Fix => Reading http://pyparsing.wikispaces.com/ error: timed out 8 | pip install pyparsing --retries 10 --timeout 60 9 | python setup.py install 10 | -------------------------------------------------------------------------------- /luiti/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ['luigi', 'config', "VisualiserEnvTemplate", 4 | 5 | 'TaskBase', 6 | "TaskHour", 7 | "TaskHourHadoop", 8 | "TaskDay", 9 | "TaskDayHadoop", 10 | "TaskWeek", 11 | "TaskWeekHadoop", 12 | "TaskBiweekly", 13 | "TaskBiweeklyHadoop", 14 | "TaskMonth", 15 | "TaskMonthHadoop", 16 | "TaskQuarter", 17 | "TaskQuarterHadoop", 18 | "TaskYear", 19 | "TaskYearHadoop", 20 | "TaskRange", 21 | "TaskRangeHadoop", 22 | 23 | 'RootTask', 24 | 25 | 'StaticFile', 26 | 'MongoImportTask', 27 | 'HiveTask', 28 | 29 | 'HadoopExt', 30 | 31 | 'manager', 32 | 33 | 'IOUtils', 'DateUtils', 'TargetUtils', 'HDFSUtils', 34 | 'MRUtils', 'MathUtils', 'CommandUtils', 35 | 'CompressUtils', 36 | 37 | 'ArrowParameter', 38 | 39 | 'os', 're', 'sys', 'defaultdict', 'json', 'cached_property', 40 | 'arrow', 41 | 42 | 'MrTestCase', ] 43 | 44 | import os 45 | import sys 46 | import re 47 | from collections import defaultdict 48 | import json 49 | from etl_utils import cached_property 50 | 51 | from .luigi_extensions import luigi 52 | 53 | from .task_templates import TaskHour, TaskDay, TaskWeek, TaskBiweekly, TaskMonth, TaskQuarter, TaskYear, TaskRange 54 | from .task_templates import TaskHourHadoop, TaskDayHadoop, TaskWeekHadoop, TaskBiweeklyHadoop, TaskMonthHadoop, TaskQuarterHadoop, TaskYearHadoop, TaskRangeHadoop 55 | from .task_templates import StaticFile, MongoImportTask, HiveTask 56 | 57 | 58 | from . 
import manager 59 | from .utils import IOUtils, DateUtils, TargetUtils, HDFSUtils 60 | from .utils import MRUtils, MathUtils, CommandUtils, CompressUtils 61 | 62 | import arrow 63 | from .luigi_extensions import RootTask, TaskBase, ArrowParameter, HadoopExt 64 | 65 | from .utils.visualiser_env_template import VisualiserEnvTemplate 66 | 67 | from .tests import MrTestCase 68 | 69 | 70 | config = manager.luiti_config 71 | -------------------------------------------------------------------------------- /luiti/daemon/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["Server", ] 4 | 5 | 6 | from .web import Server 7 | -------------------------------------------------------------------------------- /luiti/daemon/graph.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["Graph"] 4 | 5 | from copy import deepcopy 6 | 7 | from .utils import Template, stringify, TaskStorageSet, TaskStorageDict 8 | 9 | 10 | class Graph(object): 11 | """ 12 | Analysis graph relation between nodes. 13 | """ 14 | 15 | @staticmethod 16 | def analysis_dependencies_between_nodes(task_instances, selected_packages): 17 | """ 18 | Based on Data: 19 | 1. Task_instances 20 | 2. Their `requires` informations. 21 | 22 | Related function is luiti.manager.dep.Dep.find_dep_on_tasks 23 | """ 24 | uniq_set = TaskStorageSet(task_instances) 25 | 26 | # 1. raw `requires` and `invert` informations. 27 | # TODO TaskStorageDict are already String, modify back to real Python objects. 
28 | task_instances_to_their_direct_requires = TaskStorageDict() 29 | task_instances_to_their_direct_upons = TaskStorageDict() 30 | 31 | for task_instance in task_instances: 32 | deps = Utils.read_requires_from_task(task_instance, selected_packages) 33 | selected_deps = [d1 for d1 in deps if d1 in uniq_set] 34 | task_instances_to_their_direct_requires[task_instance] = TaskStorageSet(selected_deps) 35 | for dep1 in selected_deps: 36 | task_instances_to_their_direct_upons[dep1].add(task_instance) 37 | 38 | # 2. unfold `requires` and `invert` informations. 39 | task_instances_to_their_total_requires = TaskStorageDict() 40 | task_instances_to_their_total_upons = TaskStorageDict() 41 | 42 | for task_instance in task_instances: 43 | Utils.add_total_deps(task_instances_to_their_total_requires, task_instances_to_their_direct_requires, task_instance) 44 | Utils.add_total_deps(task_instances_to_their_total_upons, task_instances_to_their_direct_upons, task_instance) 45 | 46 | def generate_result(_type="python"): 47 | """ 48 | provide two versions of graph infos. 49 | 50 | 1. one for front-end javascript. 51 | 2. another for API python. 52 | """ 53 | def wrap(obj): 54 | if _type == "python": 55 | return obj 56 | if _type == "json": 57 | return stringify(obj) 58 | 59 | return { 60 | "requires": { 61 | "direct": wrap(task_instances_to_their_direct_requires), 62 | "total": wrap(task_instances_to_their_total_requires), 63 | }, 64 | "upons": { 65 | "direct": wrap(task_instances_to_their_direct_upons), 66 | "total": wrap(task_instances_to_their_total_upons), 67 | }, 68 | } 69 | 70 | return { 71 | "python": generate_result("python"), 72 | "json": generate_result("json"), 73 | } 74 | 75 | @staticmethod 76 | def split_edges_into_groups(edges, nodes, task_instances): 77 | """ 78 | Put linked task instances into a group. 79 | """ 80 | edges = deepcopy(edges) 81 | groups = list() # element is set 82 | 83 | # make sure every node appear, even has not link to other tasks. 
class Utils(object):
    """ Dependency helpers, only for this file. """

    @staticmethod
    def read_requires_from_task(task_instance, selected_packages):
        """
        Return the direct dependencies of `task_instance` as a list.

        Only objects that carry a `package_name` attribute, and whose
        `package_name` is in `selected_packages`, are kept.

        NOTE: when `requires()` returns anything but a list (a single task,
        a tuple, a dict, ...) the value is wrapped as ONE element, so a
        non-list container of tasks is dropped by the `package_name` check.
        """
        deps = task_instance.requires()
        if not isinstance(deps, list):
            deps = [deps]

        # 1. `package_name` marks a valid luiti task.
        # 2. The package filter is very important, or dict data can't be
        #    found later.
        return [
            dep for dep in deps
            if hasattr(dep, "package_name") and
            dep.package_name in selected_packages
        ]

    @staticmethod
    def add_total_deps(store, tree, store_node, fetch_node=None):
        """
        Add all recursive dependencies of a node into `store[store_node]`.

        1. `store`      result mapping, node -> set-like of every reachable
                        dependency. Must create the set on first access
                        (e.g. a defaultdict(set)-like object).
        2. `tree`       mapping, node -> its direct dependencies.
        3. `store_node` key under which the result is stored. It is never
                        added to its own result, which also stops cycles
                        that lead back to it.
        4. `fetch_node` node whose dependencies are walked, defaults to
                        `store_node`.

        The previous implementation recursed into a grandchild without ever
        storing the grandchild itself, so every second level of a chain was
        missing (A -> B -> C -> D gave {B, D}). Now every node reached is
        stored before its own dependencies are walked.
        """
        if fetch_node is None:
            fetch_node = store_node

        # Explicit stack instead of recursion: no recursion-depth limit on
        # long dependency chains.
        pending = [fetch_node]
        while pending:
            current = pending.pop()
            for dep in tree[current]:
                if dep == store_node:
                    continue
                # Membership is checked through the store's own container,
                # which is also what marks a node as already visited.
                if dep in store[store_node]:
                    continue
                store[store_node].add(dep)
                pending.append(dep)
class Query(object):
    """
    Use params to query some data from luiti.

    Results are memoized in `cache`, keyed by the text of the sorted params.
    """

    # Class attribute: shared by every Query instance, entries are never
    # evicted.
    cache = dict()

    def __init__(self, ptm):
        self.ptm = ptm  # global task and package data.

    def get_env(self, raw_params=None):
        """
        Generate all data needed.

        `raw_params` is the query dict; when it has no "date_value" key,
        yesterday is written INTO the given dict (callers may rely on seeing
        it afterwards). Omitting the argument starts from a fresh empty dict
        on every call.

        Returns `QueryBuilder(self.ptm, raw_params).result`, built at most
        once per distinct set of params.
        """
        # A `dict()` default would be created once and shared by all calls:
        # the first call's "yesterday" would stay in it forever, so a
        # long-running daemon would keep serving a stale default date.
        if raw_params is None:
            raw_params = dict()

        # Fill the date explicitly, so a request without a date and a request
        # for yesterday share the same cache key.
        if "date_value" not in raw_params:
            raw_params["date_value"] = unicode(QueryBuilder.yesterday())

        # TODO cache maybe replaced by a decorator, such as @functools.lru_cache
        cache_key = unicode(sorted(raw_params.items()))  # A simple cache

        result = self.cache.get(cache_key, None)
        if result is None:
            # Build once and store that same object (it used to be built
            # twice on every cache miss).
            result = QueryBuilder(self.ptm, raw_params).result
            self.cache[cache_key] = result

        return result
21 | """ 22 | 23 | def __init__(self, ptm, raw_params): 24 | assert isinstance(raw_params, dict), raw_params 25 | 26 | self.raw_params = raw_params 27 | self.ptm = ptm 28 | 29 | @cached_property 30 | def date_begin(self): 31 | return self.ptm.current_luiti_visualiser_env["date_begin"] 32 | 33 | @cached_property 34 | def date_end(self): 35 | date_end = self.ptm.current_luiti_visualiser_env.get("date_end", self.yesterday_str) 36 | self.ptm.current_luiti_visualiser_env["date_end"] = date_end 37 | return date_end 38 | 39 | @staticmethod 40 | def yesterday(): 41 | return ArrowParameter.now().replace(days=-1).floor("day") 42 | 43 | @cached_property 44 | def yesterday_str(self): 45 | return QueryBuilder.yesterday().format("YYYY-MM-DD") 46 | 47 | @cached_property 48 | def accepted_params(self): 49 | """ 50 | Comes from current luiti that selected. 51 | """ 52 | return self.ptm.current_luiti_visualiser_env["additional_task_parameters"] 53 | 54 | @cached_property 55 | def accepted_query_params(self): 56 | """ 57 | provide to visualSearch.js, used for autocomplete. 58 | 59 | user query via URL search. 60 | 61 | autocomplete params key/value. 62 | """ 63 | # date range related. 64 | days_range = arrow.Arrow.range("day", 65 | ArrowParameter.get(self.date_begin), 66 | ArrowParameter.get(self.date_end)) 67 | accepted_date_values = sorted(map(str, days_range)) 68 | 69 | # result 70 | return { 71 | "date_value": accepted_date_values, 72 | "task_cls": self.ptm.task_class_names, 73 | "luiti_package": self.ptm.task_package_names, 74 | } 75 | 76 | @cached_property 77 | def default_query(self): 78 | """ Query provide by user config. 
""" 79 | # assign default params 80 | default_query = { 81 | "date_value": str(QueryBuilder.yesterday()), 82 | # to insert more key-value 83 | } 84 | 85 | # get config from current package's luiti_visualiser_env 86 | for task_param, task_param_opt in self.accepted_params.iteritems(): 87 | self.accepted_query_params[task_param] = task_param_opt["values"] 88 | default_query[task_param] = task_param_opt["default"] 89 | 90 | return default_query 91 | 92 | @cached_property 93 | def selected_query(self): 94 | selected_query = {k1: v1 for k1, v1 in self.raw_params.iteritems() if k1 in self.accepted_params or k1 == "date_value"} 95 | selected_query["luiti_package"] = self.selected_packages 96 | selected_query = dict(self.default_query.items() + selected_query.items()) 97 | 98 | return selected_query 99 | 100 | @cached_property 101 | def default_packages(self): 102 | """ user provided. """ 103 | return self.ptm.current_luiti_visualiser_env["package_config"].get("defaults", []) 104 | 105 | @cached_property 106 | def selected_packages(self): 107 | result = self.raw_params.get("luiti_package", self.default_packages) 108 | result = result or self.ptm.task_package_names 109 | return result 110 | 111 | @cached_property 112 | def selected_task_cls_names(self): 113 | """ 114 | current selected. 115 | """ 116 | result = set(self.raw_params.get("task_cls", [])) 117 | 118 | # modify other cached_property 119 | self.selected_query["task_cls"] = list(result) 120 | 121 | return result 122 | 123 | @cached_property 124 | def total_task_instances(self): 125 | """ 126 | Total task instances. 127 | """ 128 | # 1. build possible params. 129 | # **remove** luiti_package and task_cls query str 130 | params_array = Params.build_params_array(self.default_query, self.selected_query) 131 | 132 | # 2. and generate task instances. 133 | total_task_instances = list() 134 | for ti in self.ptm.task_classes: 135 | # TODO why below two lines exist before. 
136 | # if ti.__name__ not in self.selected_task_cls_names: 137 | # continue 138 | 139 | for _params in params_array: 140 | task_instance = CreateTask.new(ti, _params) 141 | total_task_instances.append(task_instance) 142 | 143 | result = sorted(list(set(total_task_instances))) 144 | return result 145 | 146 | @cached_property 147 | def selected_task_instances(self): 148 | """ nodes that drawed in vis.js """ 149 | # filter by package 150 | result = sorted(list(set(self.total_task_instances))) 151 | result = filter(lambda ti: ti.package_name in self.selected_packages, 152 | result) 153 | 154 | # To avoid only self is in the graph. 155 | # If select task class, then to find linked task instances. 156 | if not self.selected_task_cls_names: 157 | return result 158 | 159 | pure_selected_task_instances = [ti for ti in result if ti.task_clsname in self.selected_task_cls_names] 160 | pure_linked = TaskStorageSet() 161 | for ti in pure_selected_task_instances: 162 | for t2 in self.graph_infos_python["requires"]["direct"][ti]: 163 | pure_linked.add(t2) 164 | for t2 in self.graph_infos_python["upons"]["direct"][ti]: 165 | pure_linked.add(t2) 166 | 167 | # filter that tasks are linked, in current task_classes. 
168 | result = [ti for ti in result if ti in pure_linked] 169 | result.extend(pure_selected_task_instances) 170 | result = list(set(result)) 171 | return result 172 | 173 | @cached_property 174 | def graph_infos_data(self): 175 | return Graph.analysis_dependencies_between_nodes(self.total_task_instances, 176 | self.selected_packages) 177 | 178 | @cached_property 179 | def graph_infos_python(self): 180 | return self.graph_infos_data["python"] 181 | 182 | @cached_property 183 | def nodes(self): 184 | return [Template.a_node(ti) for ti in self.selected_task_instances] 185 | 186 | @cached_property 187 | def edges(self): 188 | return Template.edges_from_nodes(self.selected_task_instances) 189 | 190 | @cached_property 191 | def nodes_groups(self): 192 | return Graph.split_edges_into_groups(self.edges, 193 | self.nodes, 194 | self.selected_task_instances) 195 | 196 | @cached_property 197 | def nodes_groups_in_view(self): 198 | return [sorted(list(nodes_set)) for nodes_set in self.nodes_groups] 199 | 200 | @cached_property 201 | def task_instance_repr_to_info(self): 202 | result = dict() 203 | for ti in self.total_task_instances: 204 | param_kwargs = deepcopy(ti.param_kwargs) 205 | if "pool" in param_kwargs: 206 | del param_kwargs["pool"] 207 | result[str(ti)] = {"task_cls": ti.task_clsname, "param_kwargs": stringify(param_kwargs)} 208 | return result 209 | 210 | @cached_property 211 | def result(self): 212 | return { 213 | "title": "Luiti WebUI, a DAG timely visualiser.", 214 | 215 | "queryparams": { 216 | "accepted": self.accepted_query_params, 217 | "selected_query": self.selected_query, 218 | "default_query": self.default_query, 219 | "luiti_visualiser_env": self.ptm.current_luiti_visualiser_env, 220 | }, 221 | 222 | "ptm": { 223 | "task_class_names": self.ptm.task_class_names, 224 | "task_package_names": self.ptm.task_package_names, 225 | "task_clsname_to_package_name": self.ptm.task_clsname_to_package_name, 226 | "package_to_task_clsnames": 
class CreateTask(object):
    """
    Hand out task instances through one TaskInstanceCache per task class.
    """

    # task class -> its TaskInstanceCache, filled on first use.
    task_clsname_cache = dict()

    @staticmethod
    def new(task_cls, _params):
        """ Return the `task_cls` instance for `_params`; params the class doesn't declare are ignored. """
        registry = CreateTask.task_clsname_cache
        if registry.get(task_cls, None) is None:
            registry[task_cls] = TaskInstanceCache(task_cls)

        return registry[task_cls][_params]


class TaskInstanceCache(object):
    """
    Keep the task instances of one task class, to avoid creating duplicates.
    """

    def __init__(self, task_cls):
        self.task_cls = task_cls
        self.cache = CacheByDictKey(self.process)

    def __getitem__(self, _params):
        return self.cache[_params]

    def process(self, _params):
        """ Create a task from the entries of `_params` that are luigi parameters of the class. """
        declared = dict()
        for name, value in _params.iteritems():
            # The None default also covers names the class doesn't have.
            if isinstance(getattr(self.task_cls, name, None), luigi.Parameter):
                declared[name] = value

        return self.task_cls(**declared)
class CacheByDictKey(object):
    """
    Support cache by a dict.

    Only support dict[] operation: `cache[query]` returns `func(query)`,
    calling `func` at most once per distinct query content.
    """

    def __init__(self, func):
        # cache key (text) -> computed result
        self.store = dict()

        assert callable(func)
        self.func = func

    def __getitem__(self, query):
        cache_key = self.generate_cache_key(query)

        # Test membership instead of comparing the stored value with None,
        # otherwise a legitimate None result would be recomputed every time.
        if cache_key not in self.store:
            self.store[cache_key] = self.func(query)
        return self.store[cache_key]

    def generate_cache_key(self, query):
        """ Return a text key that is equal for dicts with equal items. """
        assert isinstance(query, dict)
        # Sorting makes the key independent of dict ordering. `repr` renders
        # the same text as the previous `unicode(...)` call did on Python 2
        # and also exists on Python 3.
        return repr(sorted(query.items()))
class Template(object):
    """
    Generate some output from entities.

    Every method is static and returns plain dicts, lists or text built from
    task instances.
    """

    @staticmethod
    def task_doc(ti):
        """ Get task doc from class, stripped, or "" when there is none. """
        doc = (ti.task_class.__doc__ or "").strip()
        # Only byte strings need decoding. On Python 2 `bytes` is `str`, so the
        # behaviour there is unchanged; on Python 3 a text docstring has no
        # `.decode` and is returned as is.
        if isinstance(doc, bytes):
            doc = doc.decode("UTF-8")
        return doc

    @staticmethod
    def a_node(ti):
        """ Describe one task instance as a node dict. """
        return {
            "id": ti.task_id,
            "label": ti.task_class.__name__,
            "group": ti.package_name,

            "detail": str(ti),
            "data_file": ti.data_file,
            "task_doc": Template.task_doc(ti),
            # dotted module path -> relative source file path
            "task_file": ti.task_class.__module__.replace(".", "/") + ".py",
            "package_name": ti.package_name,
            "size": 20,
        }

    @staticmethod
    def edges_from_nodes(nodes):
        """
        Generate relations between current task instances, but just only these task instances.

        A required task that is not itself in `nodes` produces no edge.
        """
        # 1. check input is valid
        assert isinstance(nodes, list)
        if len(nodes):
            assert isinstance(nodes[0], luigi.Task)

        edges = list()
        for ti in nodes:
            required = ti.requires()
            if not isinstance(required, list):
                required = [required]
            for t2 in required:
                if t2 is None:  # dep on none tasks
                    continue
                if t2 not in nodes:
                    continue
                edges.append(Template.an_edge(t2, ti))

        return edges

    @staticmethod
    def an_edge(from_task, to_task):
        """ Describe the dependency `from_task` -> `to_task` as an edge dict. """
        # A task compared equal to itself is drawn as a self loop.
        arrows = "self_to_self" if from_task == to_task else "to"

        return {
            "id": from_task.task_id + " " + to_task.task_id,  # id is uniq.
            "from": from_task.task_id,
            "source_name": from_task.task_class.__name__,
            "to": to_task.task_id,
            "target_name": to_task.task_class.__name__,
            "strength": 1.0,
            "arrows": arrows,
        }
""" 14 | import pygments 15 | from pygments.lexers import PythonLexer 16 | lexer = PythonLexer() 17 | 18 | return lambda source_code: pygments.highlight(source_code, lexer, self.formatter) 19 | 20 | @cached_property 21 | def formatter(self): 22 | from pygments.formatters import HtmlFormatter 23 | return HtmlFormatter(linenos=True) 24 | 25 | @cached_property 26 | def css_html(self): 27 | return u"""""" % self.formatter.get_style_defs('.highlight') 28 | 29 | def __missing__(self, source_file): 30 | source_code = file(source_file).read() 31 | 32 | path_html = u"""
source_file: %s
class CodeShowHandler(tornado.web.RequestHandler):
    """ Respond with the rendered source code of one task class. """

    @cached_property
    def code_render(self):
        return CodeRender()

    def get(self, package_name, task_cls_name):
        # Both values are taken from the request URL, so unknown names are an
        # expected client error. Answer 404 explicitly instead of using
        # `assert`, which is stripped under `python -O` and would otherwise
        # surface as an internal server error.
        if package_name not in PTM.task_package_names:
            raise tornado.web.HTTPError(404, "unknown package %s", package_name)
        if task_cls_name not in PTM.task_clsname_to_package:
            raise tornado.web.HTTPError(404, "unknown task %s", task_cls_name)

        source_file = PTM.task_clsname_to_source_file[task_cls_name]
        source_code = self.code_render[source_file]
        self.write(source_code)
""" 35 | 36 | welcome_doc = u""" 37 | ( \ |\ /|\__ __/\__ __/\__ __/ 38 | | ( | ) ( | ) ( ) ( ) ( 39 | | | | | | | | | | | | | 40 | | | | | | | | | | | | | 41 | | | | | | | | | | | | | 42 | | (____/\| (___) |___) (___ | | ___) (___ 43 | (_______/(_______)\_______/ )_( \_______/ 44 | """ 45 | 46 | def __init__(self, host, port): 47 | self.host = host 48 | self.port = port 49 | 50 | # Fix cant open http://0.0.0.0 on browser. 51 | self.url = "http://%s:%s" % (self.host.replace("0.0.0.0", "localhost"), self.port) 52 | 53 | print self.welcome_doc 54 | print "Luiti WebUI is mounted on %s" % self.url 55 | 56 | def run(self): 57 | """ 58 | Runs one instance of the API server. 59 | """ 60 | api_sockets = tornado.netutil.bind_sockets(self.port, address=self.host) 61 | server = tornado.httpserver.HTTPServer(self.app) 62 | server.add_sockets(api_sockets) 63 | 64 | logger.info("Scheduler starting up") 65 | tornado.ioloop.IOLoop.instance().start() 66 | 67 | @cached_property 68 | def app(self): 69 | """ return a API app instance. """ 70 | settings = { 71 | "unescape": tornado.escape.xhtml_unescape, 72 | # "autoreload": True 73 | } 74 | 75 | return tornado.web.Application(web_handlers, **settings) 76 | -------------------------------------------------------------------------------- /luiti/java/MultipleTextFiles.java: -------------------------------------------------------------------------------- 1 | package com.voxlearning.bigdata.MrOutput; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; 6 | 7 | public class MultipleTextFiles extends MultipleTextOutputFormat { 8 | /** 9 | * Currently, the `reducer` function in luiti use below data format. 10 | * yield "", "{"json key": "json value"}" 11 | * If need multiple file output, then we use the unused yield key. 
12 | * 13 | * Ref code: http://blog.csdn.net/lmc_wy/article/details/7532213 14 | */ 15 | 16 | protected String generateFileNameForKeyValue(Text key, Text value, String name) 17 | { 18 | String outputName = key.toString(); // Get the current filename 19 | key.set(""); // We just need the value, so remove the unneeded key. 20 | return new Path(outputName, name).toString(); // 参考 https://github.com/klbostee/feathers 21 | } 22 | 23 | } 24 | 25 | 26 | /* 27 | * deploy ref: https://github.com/klbostee/feathers/blob/master/build.sh 28 | */ 29 | -------------------------------------------------------------------------------- /luiti/luigi_decorators/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | """ 4 | This folder contains functions only. Please make sure dont make any complex `import` statements. 5 | 6 | See import logic at luiti/luigi_extensions/manage_decorators.py 7 | """ 8 | -------------------------------------------------------------------------------- /luiti/luigi_decorators/as_a_luiti_task.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["as_a_luiti_task"] 4 | 5 | import luigi 6 | from ..luigi_extensions import TaskBase, TaskInit 7 | from ..utils import ExtUtils 8 | 9 | # Extensions to luigi.Task 10 | task_base_members = [k1 for k1 in sorted(TaskBase.__dict__.keys()) if not k1.startswith("__")] 11 | task_base_members = [k1 for k1 in task_base_members if not k1.startswith("_abc")] 12 | """ member list, see details at TaskBase 13 | >>> ['_persist_files', '_ref_tasks', 'data_dir', 'data_file', 'data_name', 'date_str', 'date_type', 'date_value', 'date_value_by_type_in_begin', 'date_value_by_type_in_end', 'date_value_by_type_in_last', 'errput', 'instances_by_date_range', 'is_external', 'is_reach_the_edge', 'output', 'package_name', 'pre_task_by_self', 'requires', 'reset_date', 'root_dir', 'run', 'run_mode', 
def as_a_luiti_task(**opts):  # Decorator
    """
    Reuse a luigi contrib task class as a luiti task through a decorator.

    Usage:

        @luigi.as_a_luiti_task()
        class AnotherHiveDay(HiveQueryTask):
            pass

    The decorated class is replaced by a subclass that also inherits from
    TaskBase and runs `TaskInit.setup` at the end of `__init__`.

    https://github.com/spotify/luigi/tree/master/luigi/contrib
    """

    def func(task_cls):
        """ Main reason is to fix not overwrite `__init__` function. """
        # Make sure it's a luigi.contrib
        assert issubclass(task_cls, luigi.Task), task_cls

        # Values that mark a member as "not provided" by the target class.
        unset_markers = [NotImplementedError, NotImplemented]

        # Copy each TaskBase member the target class lacks or left unset.
        for member in task_base_members:
            base_val = getattr(TaskBase, member)
            if getattr(task_cls, member, NotImplementedError) in unset_markers:
                setattr(task_cls, member, base_val)

        # let `isinstance` works for this wrap task class
        class wrap_cls(task_cls, TaskBase, ExtUtils.ExtendClass):
            def __init__(self, *args, **kwargs):
                super(wrap_cls, self).__init__(*args, **kwargs)
                TaskInit.setup(self)

        # Keep the identity of the decorated class on the wrapper.
        for attr in ("__doc__", "__module__", "__name__"):
            setattr(wrap_cls, attr, getattr(task_cls, attr))

        return wrap_cls
    return func
def check_runtime_range(**opts_1):  # decorator
    """
    Support hour/weekday indexed range.

    The decorated class's `run` only executes when the current time falls in
    the allowed hours and weekdays; otherwise it prints a notice and returns
    False without running.

    Optional params:
    1. hour_num: allowed hours, compared with int(now.format("H")).
       Defaults to every hour.
    2. weekday_num: allowed weekdays, compared with int(now.format("d")).
       Defaults to 1..7.
    3. now: object used as the current time instead of ArrowParameter.now().
       It must provide the same `format(token)` method.
    """
    def decorator(orig_run):
        def new_run(self):
            opts = {
                # "H" yields 0..23, so hour 0 must be part of the default;
                # 24 is kept for callers that relied on the old upper bound.
                "hour_num": range(0, 25),
                "weekday_num": range(1, 8),
            }
            opts.update(opts_1)

            now = opts.get("now")
            if now is None:
                now = ArrowParameter.now()  # get current time
            hour_24 = int(now.format("H"))
            day_of_week_7 = int(now.format("d"))

            hour_ok = hour_24 in opts["hour_num"]
            weekday_ok = day_of_week_7 in opts["weekday_num"]
            if not (hour_ok and weekday_ok):
                # Single-argument print works as a statement on Python 2 and
                # as a function on Python 3, with the same output.
                parts = ["[info]", now, " is not in ", opts, ", so the task exited."]
                print(" ".join(str(part) for part in parts))
                return False

            return orig_run(self)
        return new_run

    def func(cls):
        cls.run = decorator(cls.run)
        return cls
    return func
def multiple_text_files(opts=None):
    """
    Let current task class's result can support outputing into multiple files.

    Usage:

    ```python
    @luigi.multiple_text_files
    class ManAndWomanDay(TaskDayHadoop):
        def mapper(self, line1):
            item1 = MRUtils.json_parse(line1)
            yield item1['uid'], item1

        def reducer(self, uid1, vals_1):
            for item1 in vals_1:
                yield item1["gender"], MRUtils.str_dump(item1)
    ```

    So above code separate man and woman into two files. File name such as
    1. man_and_woman_day.json/man
    2. man_and_woman_day.json/woman

    But not the default one
    1. man_and_woman_day.json/part-00000

    WARN:
        when use `@luigi.multiple_text_files`, consider to wrap subfolders with
        StaticFile task class.

    `opts` is either omitted / a dict (the function then returns the class
    decorator) or the task class itself (bare decorator usage). Anything else
    raises ValueError.
    """
    def func(task_cls):
        cjc = CompileJavaCode()

        def compile_java_code(self):
            """ compile java code dynamically. """
            if not os.path.exists(cjc.target_jar):
                CommandUtils.execute(cjc.compile_cmd)

        task_cls.output_format = cjc.output_format
        task_cls.libjars = [cjc.target_jar, ]
        task_cls.compile_java_code = compile_java_code
        return task_cls

    # Compatible with old API.
    if opts is None or isinstance(opts, dict):
        return func
    # `issubclass` raises TypeError for non-class arguments, so make sure we
    # hold a class first and let every other value reach the ValueError below.
    if isinstance(opts, type) and issubclass(opts, luigi.Task):
        return func(opts)
    raise ValueError(opts)
# NOTE deprecated
def persist_files(*files):  # decorator
    """
    Describe several data files through a small DSL and bind their cleanup to
    event_handler(Event.FAILURE).

    Each name in `files` becomes a read-only property on the decorated class
    that returns `<data_dir>/<name>.json`.
    """
    def func(cls):
        # 1. Set the persisted-file properties.
        def make_getter(name):
            # A factory is needed so every getter keeps its own `name`
            # instead of all of them seeing the last loop value.
            def _file(self):
                return os.path.join(self.data_dir, name + ".json")
            return _file

        setattr(cls, "__persist_files", files)
        for name in getattr(cls, "__persist_files"):
            setattr(cls, name, property(make_getter(name)))

        # 2. Remove these files when the task fails.
        def clean_tmp(task, exception):
            for name in files:
                IOUtils.remove_files(getattr(task, name))
            # IOUtils.remove_files(task.data_file)
            # NOTE Hadoop seems to clean up the output of failed tasks by
            # itself; otherwise the task would stay in running state across
            # its N retries.
        cls.event_handler(Event.FAILURE)(clean_tmp)

        return cls

    return func
22 | luiti_config.attached_package_names.add(p1) 23 | # TODO why should do `luigi.hadoop.attach` in `active_packages` 24 | -------------------------------------------------------------------------------- /luiti/luigi_decorators/ref_tasks.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["ref_tasks"] 4 | 5 | from ..manager import load_a_task_by_name, luiti_config 6 | 7 | 8 | def ref_tasks(*tasks): # 装饰器 9 | """ 10 | 自动把依赖 Task 链接起来,通过属性访问。 11 | 12 | Example: 13 | 14 | ```python 15 | @ref_tasks("TaskB", "TaskC") 16 | class TaskA(TaskWeekBase): 17 | pass 18 | 19 | TaskA().TaskB == TaskB 20 | TaskA().TaskC == TaskC 21 | ``` 22 | """ 23 | def wrap_cls(ref_task_name): 24 | def _func(self): 25 | v1 = self.__dict__.get(ref_task_name, None) 26 | if v1 is None: 27 | v1 = load_a_task_by_name(ref_task_name) 28 | self.__dict__[ref_task_name] = v1 29 | return v1 30 | return _func 31 | 32 | def wrap_instance(ref_task_name, task_name): 33 | def _func(self): 34 | v1 = self.__dict__.get(task_name, None) 35 | if v1 is None: 36 | v1 = getattr(self, ref_task_name)(self.date_value) 37 | self.__dict__[task_name] = v1 38 | return v1 39 | return _func 40 | 41 | # Fix pickle dump, but it maybe unneeded. 42 | def __getstate__(self): 43 | """ Fix luiti_tasks module namespace conflicts. """ 44 | for ref_task1 in self._ref_tasks: 45 | cname = ref_task1 # class name 46 | iname = ref_task1 + "_task" # instance name 47 | 48 | if cname in self.__dict__: 49 | del self.__dict__[cname] 50 | if iname in self.__dict__: 51 | del self.__dict__[iname] 52 | return self.__dict__ 53 | 54 | def __setstate__(self, d1): 55 | # 1. default 56 | self.__dict__.update(d1) 57 | # 2. 
def create_packages_archive_with_support_egg(packages, filename):
    """
    Build the packages tar file and additionally include egg packages.

    Luigi's original `create_packages_archive` copies files by absolute path,
    so packages living inside an egg (zip file) are not attached. This wrapper
    runs the original function first, then appends the content of every such
    egg, and finally `client.cfg` from the current working directory when it
    exists.
    """
    import shutil
    import tarfile
    import tempfile
    import zipfile

    # 1. original create tar file
    orig_create_packages_archive(packages, filename)

    logger = luigi.hadoop.logger
    fake_exists_path = "/"  # root always exists

    def get_parent_zip_file_within_absolute_path(path1):
        """ Walk up from `path1` until a regular file is found. """
        path2 = path1
        while path2 != fake_exists_path:
            path2 = os.path.dirname(path2)
            if os.path.isfile(path2):
                return True, path2
        return False, path2

    # 2. append python egg packages that step 1 did not cover
    tar = tarfile.open(filename, "a")  # Force append
    try:
        def add(src, dst):
            logger.debug('adding to tar: %s -> %s', src, dst)
            tar.add(src, dst)

        for package1 in packages:
            path2 = (list(getattr(package1, "__path__", [])) + [fake_exists_path])[0]
            if os.path.exists(path2):
                continue  # so luigi can import it.
            if not path2.startswith("/"):
                continue  # we only care about libraries.

            is_success, zipfilename3 = \
                get_parent_zip_file_within_absolute_path(path2)
            # Skip a parent file that is not a zip archive instead of crashing.
            if not (is_success and zipfile.is_zipfile(zipfilename3)):
                continue

            tmp_dir3 = tempfile.mkdtemp()
            try:
                egg = zipfile.ZipFile(zipfilename3)
                try:
                    egg.extractall(tmp_dir3)
                finally:
                    egg.close()

                for root4, dirs4, files4 in os.walk(tmp_dir3):
                    for file5 in files4:
                        if file5.endswith(".pyc"):
                            continue
                        src = os.path.join(root4, file5)
                        add(src, os.path.relpath(src, tmp_dir3))
            finally:
                # The extracted copy is only needed while adding to the tar.
                shutil.rmtree(tmp_dir3, ignore_errors=True)

        client_cfg = os.path.join(os.getcwd(), "client.cfg")
        if os.path.exists(client_cfg):
            tar.add(client_cfg, "client.cfg")
    finally:
        tar.close()
import ujson as json
import jsonpickle


class LuitiHadoopJobRunner(luigi.hadoop.HadoopJobRunner):
    """
    Job runner that builds the HadoopJobRunner options itself.

    The streaming jar path is read from the `streaming-jar` option of the
    `hadoop` section in luigi's configuration; `output_format` and `libjars`
    are passed through unchanged.
    """

    # params are copied from HadoopJobRunner
    def __init__(self, libjars=None, output_format=None):
        config = luigi.hadoop.configuration.get_config()
        opts = {
            "streaming_jar": config.get('hadoop', 'streaming-jar'),
            "output_format": output_format,
            "libjars": libjars,
        }
        super(LuitiHadoopJobRunner, self).__init__(**opts)


# Maps a `data_interchange_format` name to the three callables used by
# HadoopExt: `serialize` (final output), `internal_serialize` (records passed
# between map and reduce) and `deserialize` (reads them back).
# NOTE(review): the "python" format deserializes with `eval`; that is only safe
# while the stream is produced by this job itself -- confirm no external data
# is ever fed through it.
DataInterchange = {
    "python": {"serialize": str,
               "internal_serialize": repr,
               "deserialize": eval},
    "json": {"serialize": json.dumps,
             "internal_serialize": json.dumps,
             "deserialize": json.loads},
    "jsonpickle": {"serialize": jsonpickle.dumps,
                   "internal_serialize": jsonpickle.dumps,
                   "deserialize": jsonpickle.loads}
}


class HadoopExt(luigi.hadoop.JobTask, ExtUtils.ExtendClass):
    """
    luigi Hadoop JobTask with a pluggable data interchange format.

    The (de)serializers are looked up in `DataInterchange` by
    `data_interchange_format`; `TaskInit.setup` is run on every new instance.
    """

    # available formats are the keys of `DataInterchange`:
    # "python", "json" and "jsonpickle".
    data_interchange_format = "python"

    @cached_property
    def serialize(self):
        """ Serializer applied to final output items (see `writer`). """
        return DataInterchange[self.data_interchange_format]['serialize']

    @cached_property
    def internal_serialize(self):
        """ Serializer applied to intermediate records (see `internal_writer`). """
        return DataInterchange[self.data_interchange_format]['internal_serialize']

    @cached_property
    def deserialize(self):
        """ Inverse of `internal_serialize` (see `internal_reader`). """
        return DataInterchange[self.data_interchange_format]['deserialize']

    def writer(self, outputs, stdout, stderr=sys.stderr):
        """
        Writer format is a method which iterates over the output records
        from the reducer and formats them for output.

        Each record is flattened and printed to `stdout` as one tab separated
        line. If formatting a record fails, the record is printed to `stderr`
        and the exception is re-raised.
        """
        for output in outputs:
            try:
                output = flatten(output)
                if self.data_interchange_format == "json":
                    # Only dump one json string, and skip another one, maybe key or value.
                    output = filter(lambda x: x not in ["", None], output)
                else:
                    # JSON is already serialized, so we put `self.serialize` in a else statement.
                    output = map(self.serialize, output)
                print("\t".join(map(str, output)), file=stdout)
            except:
                print(output, file=stderr)
                raise

    def _reduce_input(self, inputs, reducer, final=NotImplemented):
        """
        Iterate over input, collect values with the same key, and call the reducer for each unique key.

        Consecutive inputs are grouped by the serialized form of their first
        element; the reducer receives the deserialized key and an iterator over
        the second elements. `final`, when given, is called once afterwards and
        its outputs are yielded too.
        """
        for key, values in groupby(inputs, key=lambda x: self.internal_serialize(x[0])):
            for output in reducer(self.deserialize(key), (v[1] for v in values)):
                yield output
        if final != NotImplemented:
            for output in final():
                yield output
        self._flush_batch_incr_counter()

    def internal_reader(self, input_stream):
        """
        Reader which applies `self.deserialize` to each part of a tab
        separated line. Yields one list of deserialized objects per line.
        """
        for input_line in input_stream:
            yield list(map(self.deserialize, input_line.split("\t")))

    def internal_writer(self, outputs, stdout):
        """
        Writer which outputs the `self.internal_serialize` form of each item,
        tab separated, one record per line.
        """
        for output in outputs:
            print("\t".join(map(self.internal_serialize, output)), file=stdout)

    run_mode = "mr_distribute"
    n_reduce_tasks = 1  # the number of part-0000N output files equals the number of reducers

    output_format = [
        # Single output. This version has problems.
        # "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat",
        "org.apache.hadoop.mapred.TextOutputFormat",  # single output
        "org.apache.hadoop.mapred.lib.MultipleTextOutputFormat",  # multiple outputs
    ][0]  # single output is the default
    output_format_default = output_format[:]
    libjars = []

    def __init__(self, *args, **kwargs):
        """ Mirrors TaskBase, so subclasses still get TaskBase's date overriding behaviour. """
        super(HadoopExt, self).__init__(*args, **kwargs)
        TaskInit.setup(self)

    # overwrite
    def job_runner(self):
        """ will be wraped in `run` function. """
        # Auto compile java code
        # (only when a non-default output format was selected;
        # `compile_java_code` is not defined in this class -- presumably
        # provided by a decorator or mixin, TODO confirm)
        if self.output_format != self.output_format_default:
            self.compile_java_code()

        return LuitiHadoopJobRunner(
            output_format=self.output_format, libjars=self.libjars)

    def output(self):
        """ Return the HDFS target built from `self.data_file`. """
        return TargetUtils.hdfs(self.data_file)

    def jobconfs_opts(self):
        """ Extra `key=value` job configuration entries appended by `jobconfs`. """
        return [
            "mapreduce.framework.name=yarn",
            'mapred.reduce.tasks=%s' % self.n_reduce_tasks,
        ]

    def jobconfs(self):
        """ Return the parent job configuration plus `jobconfs_opts()`. """
        # NOTE(review): super() is anchored at luigi.hadoop.JobTask, so
        # JobTask's own jobconfs() is skipped and the lookup starts at the next
        # class in the MRO -- presumably intentional, confirm.
        jcs = super(luigi.hadoop.JobTask, self).jobconfs()
        for conf_opt_1 in self.jobconfs_opts():
            jcs.append(conf_opt_1)
        return jcs

    # TestCase related attrs
    def mrtest_input(self):
        """ Test input fixture; must be provided by the subclass. """
        raise NotImplementedError

    def mrtest_output(self):
        """ Expected test output fixture; must be provided by the subclass. """
        raise NotImplementedError

    def mrtest_attrs(self):
        """ Extra attributes for tests; empty by default. """
        return dict()

    def reader(self, input_stream):
        """
        Overwrite luigi, skip blank line

        Yields each non-blank, stripped line wrapped in a 1-tuple.
        """
        for line in input_stream:
            line = line.strip()
            if line:
                yield line,
class ManageDecorators(object):
    """ Discover luiti's decorators and bind them onto the `luigi` namespace. """

    @staticmethod
    def bind_to(luigi):
        """
        Attach every decorator found in `luiti/luigi_decorators/*.py` to `luigi`.

        Each decorator module must expose a callable named exactly like its
        file (`ref_tasks.py` -> `ref_tasks`). Files whose name starts with
        "__" (e.g. `__init__.py`) are ignored.

        :param luigi: the object (normally the `luigi` module) to extend.
        :return: the same `luigi` object, with one attribute per decorator.
        :raises AssertionError: when no decorator file is found.
        :raises SystemExit: with a non-zero status when a decorator module
            cannot be imported.
        """
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        decorator_dir = os.path.join(root_dir, "luigi_decorators")
        files = glob.glob(os.path.join(decorator_dir, "*.py"))

        # The decorator name must be the same as the filename. Use
        # os.path.basename so this is independent of the path separator, and
        # build real lists so `len` works on Python 3 as well.
        decorator_names = [os.path.basename(f1).split(".")[0] for f1 in files]
        decorator_names = [n1 for n1 in decorator_names
                           if not n1.startswith("__")]
        assert len(decorator_names) > 0, decorator_names

        for name in decorator_names:
            try:
                mod = __import__("luiti.luigi_decorators." + name,
                                 fromlist=[name])
            except ImportError as err:
                # A broken decorator is fatal. SystemExit with a message prints
                # it to stderr and exits with status 1; the previous bare
                # `exit()` exited with status 0 and dropped the import error.
                raise SystemExit(
                    "[Import error decorator name] %s (%s)" % (name, err))
            func = getattr(mod, name)
            setattr(luigi, name, func)

        return luigi
from inflector import Inflector 8 | from etl_utils import cached_property 9 | 10 | from .luigi_root_context import luigi 11 | from .root_task import RootTask 12 | from ..utils import DateUtils, ExtUtils, IOUtils 13 | from ..manager import luiti_config 14 | 15 | from .parameter import ArrowParameter 16 | from .task_init import TaskInit 17 | 18 | 19 | class TaskBase(luigi.Task, ExtUtils.ExtendClass): 20 | """ 继承的子类在类名后 必须加 **时间类型**, 如 Day, Week, ... """ 21 | 22 | run_mode = ["local", "mr_distribute", "mr_local"][0] 23 | 24 | date_value = ArrowParameter() # **统一** 时间类型, 防止同时跑多个任务 25 | 26 | # will overwritten by @decorator 27 | # 不能以 **两个 __ 开头**, 否则会被 Python 当作隐私变量而无法继承。TODO 隐私变量 可能是错的。 28 | _persist_files = [] 29 | _ref_tasks = [] 30 | 31 | is_external = False # mark current task as a External Task, same to luigi.ExternalTask 32 | 33 | root_dir = NotImplementedError 34 | 35 | # Default one, always return True 36 | def requires(self): 37 | return RootTask() 38 | 39 | run = NotImplementedError 40 | 41 | def __init__(self, *args, **kwargs): 42 | # Fix date_value type 43 | if "date_value" in kwargs: 44 | kwargs["date_value"] = ArrowParameter.get(kwargs["date_value"]) 45 | if len(args) == 1: # just the luiti's date_value parameter 46 | args = (ArrowParameter.get(args[0]), ) 47 | 48 | super(TaskBase, self).__init__(*args, **kwargs) 49 | TaskInit.setup(self) 50 | 51 | @cached_property 52 | def data_dir(self): 53 | assert self.root_dir, "self.root_dir should not be None!" 
54 | return os.path.join(self.root_dir, self.date_str) 55 | 56 | @cached_property 57 | def data_file(self): 58 | return os.path.join(self.data_dir, self.data_name + ".json") 59 | 60 | @cached_property 61 | def data_name(self): 62 | return Inflector().underscore(self.__class__.__name__) 63 | 64 | def output(self): 65 | return IOUtils.local_target(self.data_file) 66 | 67 | def errput(self): 68 | return IOUtils.local_target(self.data_file + ".err") 69 | 70 | @cached_property 71 | def date_str(self): 72 | return self.date_value.strftime("%Y-%m-%d") 73 | 74 | @cached_property 75 | def date_type(self): 76 | return luiti_config.get_date_type(self.__class__.__name__) 77 | 78 | @cached_property 79 | def date_value_by_type_in_last(self): 80 | return DateUtils.date_value_by_type_in_last( 81 | self.date_value, self.date_type) 82 | 83 | @cached_property 84 | def date_value_by_type_in_begin(self): 85 | return ArrowParameter.get(self.date_value).floor(self.date_type) 86 | 87 | @cached_property 88 | def date_value_by_type_in_end(self): 89 | return ArrowParameter.get(self.date_value).ceil(self.date_type) 90 | 91 | @cached_property 92 | def pre_task_by_self(self): 93 | """ 如果跨了两个周期就没有上次数据文件了 """ 94 | return RootTask() if self.is_reach_the_edge else \ 95 | self.__class__(self.date_value_by_type_in_last) 96 | 97 | @cached_property 98 | def is_reach_the_edge(self): 99 | return False # default. e.g. 
class TaskInit(object):
    """ Shared post-`__init__` setup for luiti tasks. """

    @staticmethod
    def setup(task_instance):
        """
        Let luigi'Task supports luiti's operations.

        You need to call this function, if you want to extend luigi.

        Steps, in order:
        1. remember the original date as `orig_date_value` (local timezone);
        2. call `reset_date()` so `date_value` snaps to the start of the period;
        3. force-load the cached `data_file` and `package_name`;
        4. rebuild `param_kwargs["date_value"]`, `param_args`, `task_id` and
           the cached hash, so tasks that become equal after the date reset
           also compare and hash equal.

        :param task_instance: a freshly constructed task; it must provide
            `date_value`, `reset_date()`, `root_dir`, `data_file`,
            `package_name`, `param_kwargs` and `task_family`.
        :raises AssertionError: when `root_dir` is still `NotImplementedError`.
        """
        self = task_instance

        # When a period is cut by a boundary, the original date tells which
        # days of the week really belong to it. E.g. if the semester starts on
        # Tuesday 2015-02-17, this week's data only covers 02-17..02-22, while
        # a task run for 2015-02-16 (still winter vacation) covers that single
        # day only.
        d1 = ArrowParameter.get(self.date_value).replace(tzinfo=tz.tzlocal())
        self.orig_date_value = d1  # exists only if this `setup` executed.

        # reset date to at the beginning of current date type here
        self.reset_date()

        assert task_instance.root_dir is not NotImplementedError, \
            [task_instance, task_instance.root_dir]
        self.data_file  # force load it now, or `output` still load it.
        self.package_name  # force load it now, use to serialize

        # Fix luigi.Task#__eq__, which (in the luigi version this was written
        # against) is:
        #
        #     def __eq__(self, other):
        #         return self.__class__ == other.__class__ and \
        #             self.param_args == other.param_args
        #
        # so `param_args` is rebuilt from the normalized parameters, as
        # strings, e.g. ('2015-06-23T00:00:00+08:00',).
        self.param_kwargs["date_value"] = ArrowParameter.get(
            self.param_kwargs["date_value"])
        # `.values()` / `.items()` work on both Python 2 and 3.
        self.param_args = tuple(
            sorted(str(value) for value in self.param_kwargs.values()))

        # NOTE below codes are copied from luigi's Task
        # Build up task id
        task_id_parts = ["%s=%s" % (k1, v1)
                         for k1, v1 in self.param_kwargs.items()
                         if k1 != "pool"]
        self.task_id = '%s(%s)' % (self.task_family, ', '.join(task_id_parts))

        new_hash = hash(self.task_id)
        # Writing `self.__hash` here is name-mangled by Python to
        # `_TaskInit__hash` because this code lives in class TaskInit. The code
        # copied from luigi's Task stores the value under `_Task__hash`, so
        # that name has to be set explicitly for `hash(task)` to follow the
        # new task_id.
        # NOTE(review): assumes luigi.Task.__hash__ returns `self.__hash`;
        # confirm against the installed luigi version.
        setattr(self, "_Task__hash", new_hash)
        # Keep the attribute earlier versions created, for compatibility.
        setattr(self, "_TaskInit__hash", new_hash)
# Names of the packages `active_packages` has already loaded. Both the bare
# package name ("package_b") and the original spec ("package_b==4.2") are kept.
processed_package_names = set([])


def active_packages(orig_func):
    """
    Decorator that links and loads all attached packages before `orig_func`.

    called by `PackageMap.task_clsname_to_package`

    On every call the wrapper:
    1. runs `lc.link_packages()`;
    2. for each not yet processed entry of `lc.attached_package_names`
       ("name" or "name<version spec>"), requires it through pkg_resources,
       imports it, attaches it to luigi.hadoop and, when the package contains
       a `luiti_tasks` directory, registers it in `lc.luiti_tasks_packages`;
    3. calls `orig_func` with the original arguments and returns its result.
    """
    import functools
    import re

    # Compiled once instead of once per package per call.
    package_spec_regexp = re.compile(r"(^[a-z0-9_]+)(.*)", re.IGNORECASE)

    # functools.wraps keeps the wrapped function's name on Python 2 and 3;
    # `func_name` only exists on Python 2.
    @functools.wraps(orig_func)
    def new_func(*args, **kwargs):
        # 1. Setup env
        lc.link_packages()

        # 2. Load related packages.
        import pkg_resources
        import luigi.hadoop

        # Iterate over a copy: importing a package may attach more packages
        # ("Set changed size during iteration").
        for p1 in list(lc.attached_package_names):
            package2, version2 = package_spec_regexp.match(p1).groups()
            if package2 in processed_package_names:
                continue

            # Pip cant manage versions packages, only exist one version at
            # one time. So fall back to the unversioned requirement.
            try:
                if version2:
                    pkg_resources.require(p1)
            except Exception:
                pkg_resources.require(package2)

            # TODO luiti: a version is needed before copying, not after; in
            # distributed mode checking the packages directory is enough.

            # Let luigi know it.
            package2_lib = lc.import2(package2)
            luigi.hadoop.attach(package2_lib)

            # Add valid package which has .luiti_tasks
            # compact with package with a plain python file.
            try:
                path = (package2_lib.__path__ + [""])[0]
            except Exception:
                print("[package2_lib load error] %s" % (package2_lib, ))
                path = "/package/load/error"
            # TODO support the zipped egg format: look for luiti_tasks inside
            # it and then suggest adding zip_safe=False.
            if os.path.exists(path + "/luiti_tasks"):
                # .__init_luiti Maybe not exists, so execute this first
                lc.luiti_tasks_packages.add(package2_lib)

            # Record the bare name, because that is what the check above
            # tests. Recording only `p1` meant "name==1.0" specs were never
            # seen as processed and were re-attached on every call.
            processed_package_names.add(package2)
            processed_package_names.add(p1)
        return orig_func(*args, **kwargs)  # call it at last.
    return new_func
""" 14 | # arrow.Arrow._ATTRS = ['year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond'] 15 | DateTypes = ["range", "week", "biweekly", "quarter"] + arrow.Arrow._ATTRS 16 | 17 | curr_project_name = None 18 | curr_project_dir = None 19 | 20 | linked_luigi = None 21 | 22 | @cached_property 23 | def attached_package_names(self): 24 | return set(['luiti']) 25 | 26 | @cached_property 27 | def luiti_tasks_packages(self): 28 | return set([]) 29 | 30 | @staticmethod 31 | def import2(a_package): 32 | return __import__(a_package, None, None, 'non_empty') 33 | 34 | @staticmethod 35 | def get_date_type(name1): 36 | """ Inherit class must be in TaskBase{Day,Week,Month,Range} style. """ 37 | assert isinstance(name1, (str, unicode)) 38 | str1 = Inflector().underscore(name1).split("_")[-1].lower() 39 | assert str1 in luiti_config.DateTypes, [str1, luiti_config.DateTypes] 40 | return str1 41 | 42 | @staticmethod 43 | def get_time_task(name1): 44 | """ return e.g. TaskDay """ 45 | type2 = luiti_config.get_date_type(name1) 46 | return "Task" + Inflector().camelize(type2) 47 | 48 | @staticmethod 49 | def link_packages(): 50 | """ 51 | called by `active_packages` 52 | """ 53 | is_in_luigi_distributed = False 54 | 55 | # 1. unmornal task class 56 | if luiti_config.curr_project_name == "__main__": 57 | return False 58 | 59 | # 2. setup current project as root 60 | if luiti_config.curr_project_dir is None: 61 | luiti_config.curr_project_dir = os.getcwd() # auto from current class 62 | luiti_config.fix_project_dir() 63 | 64 | def exists(filename1): 65 | return os.path.exists(os.path.join(luiti_config.curr_project_dir, filename1)) 66 | 67 | # These files are created by luigi. 
68 | if exists("job-instance.pickle") and exists("job.jar") and \ 69 | exists("packages.tar") and exists("luigi"): 70 | is_in_luigi_distributed = True 71 | 72 | # compact with no-luiti project 73 | is_a_luiti_project = exists("luiti_tasks") 74 | 75 | if luiti_config.curr_project_name is None: 76 | if is_in_luigi_distributed: 77 | for item1 in os.listdir(luiti_config.curr_project_dir): 78 | # is a valid python package 79 | if exists(item1 + "/__init__.py") and \ 80 | exists(item1 + "/luiti_tasks"): 81 | luiti_config.luiti_tasks_packages.add(luiti_config.import2(item1)) 82 | else: 83 | # "project_A" 84 | curr_project_name = luiti_config.get_curr_project_name() 85 | luiti_config.curr_project_name = curr_project_name 86 | 87 | # project_A/ 88 | curr_project_syspath = os.path.dirname(luiti_config.curr_project_dir) 89 | if curr_project_syspath not in sys.path: 90 | sys.path.insert(0, curr_project_syspath) 91 | 92 | luiti_config.luiti_tasks_packages.add(luiti_config.import2(luiti_config.curr_project_name)) 93 | 94 | # 3. ensure other luiti tasks packages can be loaded. 95 | if is_a_luiti_project: 96 | luiti_config.import2( 97 | luiti_config.curr_project_name + ".luiti_tasks.__init_luiti") 98 | 99 | def get_curr_project_path(self): 100 | curr_package_name = self.get_curr_project_name() 101 | curr_path = luiti_config.curr_project_dir 102 | dir1 = curr_path.rstrip("/") 103 | if dir1.split("/").count(curr_package_name) == 2: 104 | dir1 = os.path.dirname(dir1) 105 | return dir1 106 | 107 | def get_curr_project_name(self): 108 | """ a valid Python package path. 
class Dep(object):
    """ Dependency lookups between luiti task classes. """

    @staticmethod
    def find_dep_on_tasks(curr_task_1, task_classes_1):
        """
        Return all task classes that depend on `curr_task_1`, at any depth.

        A task class depends on another when the other's class name is listed
        in its `_ref_tasks`.

        :param curr_task_1: the task class whose dependents are wanted.
        :param task_classes_1: iterable of all known task classes; each must
            have `__name__` and a `_ref_tasks` list of class names.
        :return: list of task classes (never contains `curr_task_1`), in
            discovery order.
        """
        # No usable DAG library was found for this (e.g. dagger), hence the
        # small hand written traversal below.
        task_classes = list(task_classes_1)
        task_name_to_class = {task_cls.__name__: task_cls
                              for task_cls in task_classes + [curr_task_1]}

        linked_dict = defaultdict(list)  # dep_task => next_task
        for task_2 in task_classes:
            for ref_task_name_3 in task_2._ref_tasks:
                linked_dict[ref_task_name_3].append(task_2.__name__)

        # Walk the "is referenced by" edges until nothing new shows up. The
        # previous loop always stopped after a single pass, so dependents three
        # or more levels away were never returned.
        curr_name = curr_task_1.__name__
        seen = set([curr_name])  # also guards against cycles and self refs
        pending = [curr_name]
        found_names = []
        while pending:
            name_1 = pending.pop()
            for next_name_2 in linked_dict.get(name_1, ()):
                if next_name_2 in seen:
                    continue
                seen.add(next_name_2)
                found_names.append(next_name_2)
                pending.append(next_name_2)

        return [task_name_to_class[name_1] for name_1 in found_names]
class GenerateFromTemplates(object):
    """ Create project and task skeleton files from the module's templates. """

    @staticmethod
    def new_a_project(project_name):
        """
        Write a new project skeleton below `<project_name>/`.

        The name is underscored first. Returns the paths of the important
        generated files: README, setup.py, `__init_luiti.py` and the test file.
        """
        name = Inflector().underscore(project_name)
        package = join(name, name)

        readme = join(name, "README.markdown")
        setup = join(name, "setup.py")
        init_package = join(package, "__init__.py")
        init_tasks = join(package, "luiti_tasks/__init__.py")
        init_luiti = join(package, "luiti_tasks/__init_luiti.py")
        test_main = join(join(name, "tests"), "test_main.py")

        # Files are written in this fixed order, one after another.
        write_content_to_file(a_project_readme(name), readme)
        write_content_to_file(a_project_setup(name), setup)
        write_content_to_file(u"", init_package)
        write_content_to_file(u"", init_tasks)
        write_content_to_file(a_project_init_luiti(), init_luiti)
        write_content_to_file(a_project_test_main(name), test_main)

        # important files
        important_files = [readme, setup]
        important_files.append(init_luiti)
        important_files.append(test_main)
        return important_files

    @staticmethod
    def generate_a_task(task_name, project_dir=None):
        """
        Write `luiti_tasks/<underscored task name>.py` from the task template.

        When `project_dir` is given the file is created below it. Returns the
        generated file content.
        """
        path_parts = ['luiti_tasks', Inflector().underscore(task_name) + ".py"]
        if project_dir:
            path_parts.insert(0, project_dir)
        target_path = join(*path_parts)

        task_source = a_task_template(Inflector().classify(task_name))
        return write_content_to_file(task_source, target_path)
def write_content_to_file(content, path):
    """
    Write `content` to a new file at `path`, UTF-8 encoded.

    Missing parent directories are created, at any depth. A bare filename
    (no directory part) is written to the current directory.

    :param content: text to write; byte strings are written unchanged.
    :param path: destination file path, which must not exist yet.
    :return: `content`, unchanged.
    :raises ValueError: when `path` already exists.
    """
    if os.path.exists(path):
        raise ValueError("path [%s] is already exists!" % path)

    dir1 = os.path.dirname(path)
    # `dirname` is "" for a bare filename; os.mkdir("") would raise. makedirs
    # also covers parents that are more than one level deep.
    if dir1 and not os.path.exists(dir1):
        os.makedirs(dir1)

    data = content if isinstance(content, bytes) else content.encode("UTF-8")
    # Binary mode: the data is already encoded (a text-mode file rejects bytes
    # on Python 3). `with` closes the file even when the write fails.
    with open(path, 'wb') as f1:
        f1.write(data)

    print("[info] generate %s file." % path)

    return content
str(err[1]) + "\n" + err[2] 32 | 33 | if is_success: 34 | result['success'].append({"task_cls": task_cls}) 35 | else: 36 | result['failure'].append( 37 | {"err": err, "task_clsname": task_clsname_1}) 38 | 39 | return result 40 | 41 | @staticmethod 42 | @active_packages 43 | def load_a_task_by_name(s1): 44 | task_clsname_1 = Inflector().classify(s1) # force convert 45 | task_filename_1 = Inflector().underscore(s1) # force convert 46 | 47 | assert task_clsname_1 in PackageMap.task_clsname_to_package, u""" 48 | "%s" cannt be found. Auto converted class name is "%s", file name 49 | is "luiti_tasks/%s.py", please check it carefully. 50 | 51 | Already loaded PackageMap.task_clsname_to_package is %s. 52 | """ % (s1, task_clsname_1, task_filename_1, PackageMap.task_clsname_to_package) 53 | 54 | package_path = PackageMap.task_clsname_to_package[task_clsname_1].__name__ + \ 55 | ".luiti_tasks." + task_filename_1 56 | task_lib = lc.import2(package_path) 57 | return getattr(task_lib, task_clsname_1) 58 | -------------------------------------------------------------------------------- /luiti/manager/package_map.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | import os 4 | import glob 5 | from inflector import Inflector 6 | from etl_utils import singleton, cached_property 7 | from collections import defaultdict 8 | 9 | from .config import luiti_config as lc 10 | from .active_packages import active_packages 11 | 12 | 13 | @singleton() 14 | class PackageMapClass(object): 15 | 16 | @cached_property 17 | @active_packages 18 | def task_clsname_to_package(self): 19 | 20 | assert lc.luiti_tasks_packages, "At least have one project!" 
class SysArgv(object):
    """
    Modify sys.argv to fix luigi's command interface.
    """

    @staticmethod
    def convert_to_luigi_accepted_argv(subparsers, argv):
        """
        Return a copy of `argv` without the tokens that only luiti understands.

        `subparsers` is the object returned by
        `ArgumentParser.add_subparsers()`. Every sub-command name and every
        option string registered on any sub-parser is treated as luiti-only,
        except the options listed in `luigi_keep_opts`.

        Removal rules, applied per token:
        * a token not starting with "--" that is luiti-only is dropped alone
          (sub-command names, short options such as "-h");
        * "name=value" is dropped when "name" is luiti-only;
        * "--name" that is luiti-only is dropped together with the NEXT token,
          which is taken to be its value.

        `argv` itself is not modified.
        """
        luigi_keep_opts = frozenset(["--date-value"])

        # Sub-command names plus every option string of every sub-parser.
        # A set gives O(1) membership tests and is built without relying on
        # Python 2's list-returning dict.keys().
        luiti_only_opts = set(subparsers.choices)
        for sub_parser in subparsers.choices.values():
            luiti_only_opts.update(sub_parser._option_string_actions)
        luiti_only_opts -= luigi_keep_opts

        delete_argv_idxes = set()
        for idx, arg in enumerate(argv):
            if idx in delete_argv_idxes:
                continue  # already consumed as the value of a previous option

            has_inline_value = "=" in arg
            opt_name = arg.split("=", 1)[0]

            if arg.startswith("--"):
                # 2. process --task-name and more params
                if opt_name in luiti_only_opts:
                    delete_argv_idxes.add(idx)
                    if not has_inline_value:
                        # value is passed as the following token
                        delete_argv_idxes.add(idx + 1)
            elif (arg in luiti_only_opts) or \
                    (has_inline_value and opt_name in luiti_only_opts):
                # 1. remove tasks, files, run, etc.
                delete_argv_idxes.add(idx)

        return [arg for idx, arg in enumerate(argv)
                if idx not in delete_argv_idxes]
@staticmethod 51 | def print_files_by_task_cls_and_date_range(curr_task, args, opts=None): 52 | opts = opts or dict() 53 | # 打印 依赖类 和 执行配置 信息 54 | task_headers = ["Current Env Key", "Current Env Value"] 55 | task_body = [ 56 | ["task name", args.task_name], 57 | ["task date range", args.date_range], 58 | ["task execute mode", "DRY=" + str(args.dry)], 59 | ["task dep mode", "DEP=" + str(args.dep)], 60 | ["related task classes total count", opts['task_classes_count']], 61 | ] 62 | print 63 | print "Tasks related infos" 64 | Table.puts(task_body, task_headers, tablefmt="grid") 65 | 66 | # 打印 要删除的文件列表 67 | file_headers = ["Generated from task", "Storage", 68 | "Date value", "Filename"] 69 | 70 | dep_file_to_task_instances = opts['dep_file_to_task_instances'] 71 | file_table = [ 72 | [dep_file_to_task_instances[f1].__class__.__name__, 73 | 'HDFS', dep_file_to_task_instances[f1].date_str, 74 | os.path.basename(f1), ] 75 | for f1 in sorted(dep_file_to_task_instances.keys())] 76 | file_table.append( 77 | ['', '', '', "Total count %s" % len(dep_file_to_task_instances)]) 78 | file_table.append(['', '', '', '']) 79 | file_uniq_root_dir = set( 80 | [t1.root_dir for t1 in opts['dep_tasks_on_curr_task']]) 81 | file_table.append( 82 | ['All root dirs', '', '', 83 | 'Total count %s' % len(file_uniq_root_dir)]) 84 | for dir1 in file_uniq_root_dir: 85 | file_table.append(['', '', '', dir1]) 86 | 87 | print 88 | print "Files related infos" 89 | Table.puts(file_table, file_headers, tablefmt="grid") 90 | print "\n" * 3 91 | return (file_table, file_headers) 92 | 93 | @staticmethod 94 | def print_task_info(curr_task): 95 | assert issubclass(curr_task, luigi.Task) 96 | 97 | dep_tasks_on_curr_task = Dep.find_dep_on_tasks( 98 | curr_task, Table.ld.all_task_classes) 99 | 100 | task_headers = ["Task name", curr_task.__name__] 101 | task_content = [ 102 | ["Tasks self dep on", str(list(curr_task._ref_tasks))], 103 | ["Tasks dep on self", 104 | str(sorted([t2.__name__ for t2 in 
dep_tasks_on_curr_task]))], 105 | ] 106 | Table.puts(task_content, task_headers, tablefmt="grid") 107 | return (task_content, task_headers) 108 | -------------------------------------------------------------------------------- /luiti/schedule/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["SensorSchedule"] 4 | 5 | from .sensor_schedule import SensorSchedule 6 | -------------------------------------------------------------------------------- /luiti/task_templates/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["TaskHour", 4 | "TaskHourHadoop", 5 | "TaskDay", 6 | "TaskDayHadoop", 7 | "TaskWeek", 8 | "TaskWeekHadoop", 9 | "TaskBiweekly", 10 | "TaskBiweeklyHadoop", 11 | "TaskMonth", 12 | "TaskMonthHadoop", 13 | "TaskQuarter", 14 | "TaskQuarterHadoop", 15 | "TaskYear", 16 | "TaskYearHadoop", 17 | "TaskRange", 18 | "TaskRangeHadoop", 19 | 20 | "StaticFile", 21 | "HiveTask", 22 | "MongoImportTask", ] 23 | 24 | 25 | from .time.task_hour import TaskHour 26 | from .time.task_day import TaskDay 27 | from .time.task_week import TaskWeek 28 | from .time.task_biweekly import TaskBiweekly 29 | from .time.task_month import TaskMonth 30 | from .time.task_quarter import TaskQuarter 31 | from .time.task_year import TaskYear 32 | from .time.task_range import TaskRange 33 | 34 | from .time.task_hour_hadoop import TaskHourHadoop 35 | from .time.task_day_hadoop import TaskDayHadoop 36 | from .time.task_week_hadoop import TaskWeekHadoop 37 | from .time.task_biweekly_hadoop import TaskBiweeklyHadoop 38 | from .time.task_month_hadoop import TaskMonthHadoop 39 | from .time.task_quarter_hadoop import TaskQuarterHadoop 40 | from .time.task_year_hadoop import TaskYearHadoop 41 | from .time.task_range_hadoop import TaskRangeHadoop 42 | 43 | from .other.static_file import StaticFile 44 | from .other.mongo_import_task 
import MongoImportTask 45 | from .other.hive_task import HiveTask 46 | -------------------------------------------------------------------------------- /luiti/task_templates/other/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/luiti/task_templates/other/__init__.py -------------------------------------------------------------------------------- /luiti/task_templates/other/hive_task.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["HiveTask"] 4 | 5 | 6 | from etl_utils import cached_property 7 | from luigi.contrib.hive import HiveQueryTask 8 | 9 | from ...utils import TargetUtils 10 | from ...luigi_extensions import luigi, TaskBase 11 | 12 | 13 | @luigi.as_a_luiti_task() 14 | class HiveTask(HiveQueryTask, TaskBase): 15 | """ 16 | Hive SQL Template, follows luiti `date_value` date mode。 17 | 18 | 19 | Implement: 20 | 1. hive_db 21 | 2. sql_main 22 | 23 | Example: 24 | from luiti.task_templates import HiveTask 25 | 26 | class AnotherHiveDay(HiveTask): 27 | root_dir = "/another/hive/result/" 28 | use_hive_db = "main_hive_database" 29 | 30 | @cached_property 31 | def sql_main(self): 32 | return "select * from example_table;" 33 | 34 | """ 35 | 36 | run_mode = "mr_distribute" 37 | 38 | def output(self): 39 | """ Hive query default output directory has no _SUCCESS, not chunk filename is not MR style, see more details at `TargetUtils.hdfs_dir` . 
class MongoImportTask(TaskBase):
    """
    Copy stat files to MongoDB.

    Steps:
    1. download file from HDFS.
    2. Make some indexes on MongoDB if needed.
    3. Run `mongoimport` to import data.
    4. Update report_status collection in MongoDB.

    Required:
    1. Must be JSON file

    Subclasses are expected to override `source_task`, `mongodb_connection`,
    `database_name` and `index_schema` (they raise NotImplementedError here)
    and to set `report_name`.
    """

    report_status_collection_name = "report_status"
    report_status_namespace = "latestCollection"
    report_name = NotImplementedError  # placeholder, set it in the subclass

    system_tmp = "/tmp"  # default

    @cached_property
    def mongodb_db(self):
        return self.mongodb_connection[self.database_name]

    @cached_property
    def mongodb_connection_address(self):
        """ e.g. ('192.168.20.111', 37001) """
        methods = dir(self.mongodb_connection)
        result = None

        # Compact with new pymongo API
        # The checks are not exclusive: when several match, the last one wins.
        if "address" in methods:
            result = getattr(self.mongodb_connection, "address")
        if "connection" in methods:
            result = getattr(self.mongodb_connection, "connection").address
        if ("port" in methods) and ("host" in methods):
            result = (self.mongodb_connection.host, self.mongodb_connection.port)
        if result:
            assert len(result) == 2, result
            return result
        else:
            raise ValueError(self.mongodb_connection)

    @cached_property
    def mongodb_connection_host(self):
        return self.mongodb_connection_address[0]

    @cached_property
    def mongodb_connection_port(self):
        return self.mongodb_connection_address[1]

    @cached_property
    def report_status_collection_model(self):
        return self.mongodb_db[self.report_status_collection_name]

    @cached_property
    def data_file_collection_model(self):
        return self.mongodb_db[self.collection_name]

    # 1. config
    @cached_property
    def source_task(self):
        raise NotImplementedError

    @cached_property
    def mongodb_connection(self):
        raise NotImplementedError

    @cached_property
    def database_name(self):
        raise NotImplementedError

    @cached_property
    def index_schema(self):
        raise NotImplementedError

    def run_before_hook(self):
        pass

    def run_after_hook(self):
        pass

    # 2. common
    def requires(self):
        return [getattr(self, _ref_task_1)(self.date_value)
                for _ref_task_1 in self._ref_tasks]

    def run(self):
        """
        Import the source task's data file into MongoDB.

        Returns False (without calling `run_after_hook`) when the target
        collection already contains documents, True after a full import.
        """
        self.run_before_hook()

        # 1. check is already done.
        if self.is_collection_exists():
            print("[info] %s already exists!" % (self.data_file_collection_model, ))
            return False

        # 2. check report status collection is valid
        if self.report_status_collection_model.count() == 0:
            self.report_status_collection_model.insert(
                {self.report_status_namespace: {}})
        assert self.report_status_collection_model.count() == 1, "更新纪录 只能有一条!"

        # 3. output json with err
        data_file1 = self.source_task_instance.data_file
        source1 = luigi.HDFS(data_file1)
        # `with` guarantees the temp file is flushed and closed even when
        # reading from HDFS raises half way through the copy.
        with open(self.tmp_filepath, 'w') as tmp_file1:
            for line1 in process_notifier(
                    TargetUtils.line_read(source1), u"[read lines] %s" % source1):
                tmp_file1.write(line1 + "\n")

        # 4. upload to mongodb
        CommandUtils.execute(self.mongo_ensure_index)
        CommandUtils.execute(self.mongoimport_command)

        # 5. clean tmp
        CommandUtils.execute("rm -f %s" % self.tmp_filepath)

        # 6. update report status
        item1 = self.report_status_collection_model.find()[0]
        del item1['_id']
        item1[self.report_status_namespace][self.report_name] = {
            'collection_name': self.collection_name,
            'updated_at': arrow.now().datetime,
        }
        self.report_status_collection_model.find_and_modify(
            query={},
            update={"$set": item1},
            full_response=True
        )

        self.run_after_hook()

        return True

    def is_collection_exists(self):
        return self.data_file_collection_model.count() > 0

    @cached_property
    def source_task_instance(self):
        return self.source_task(self.date_value)

    @cached_property
    def mongoimport_command(self):
        """ Shell command line that imports `tmp_filepath` with mongoimport. """
        return "/usr/bin/mongoimport " + \
            ("--host %s " % self.mongodb_connection_host) + \
            ("--port %s " % self.mongodb_connection_port) + \
            ("--db %s " % self.database_name) + \
            ("--collection %s " % self.collection_name) + \
            ("--file %s" % self.tmp_filepath)

    @cached_property
    def mongo_ensure_index(self):
        """ Shell command line that creates the index on the collection. """
        # Work on a local copy: writing the JSON text back to
        # `self.index_schema` would silently change another attribute and
        # fails when a subclass defines `index_schema` as a read-only property.
        index_schema = self.index_schema
        if not isinstance(index_schema, basestring):
            index_schema = json.dumps(index_schema)
        js_str = "db.%s.ensureIndex(%s)" % \
            (self.collection_name, index_schema)
        return self.mongo_eval(js_str)

    def mongo_eval(self, js_str):
        """ Shell command line that evaluates `js_str` with the mongo shell. """
        return "/usr/bin/mongo " + \
            ("%s:%s/%s " % (self.mongodb_connection_host, self.mongodb_connection_port, self.database_name)) + \
            ("--eval \"%s\" " % js_str)

    @cached_property
    def collection_name(self):
        """ e.g. redmine5954_parent_report_week_20140901 """
        return self.data_name + "_" + self.date_value.strftime("%Y%m%d")

    @cached_property
    def tmp_filepath(self):
        return self.tmp_dir + "/" + self.date_value.strftime("%Y%m%d")

    @cached_property
    def tmp_dir(self):
        """ Per-task-class temp directory under `system_tmp`, created on access. """
        dir1 = os.path.join(self.system_tmp, self.task_class.__name__)
        os.system("mkdir -p %s" % dir1)
        return dir1
class TaskDay(TaskBase):
    """ Day-granularity task with helpers for trailing day windows. """

    def _latest_days(self, day_count):
        """
        Return `arrow.Arrow.range('day', start, self.date_value)` where
        `start` is `self.date_value.replace(days=-(day_count - 1))`.
        """
        # NOTE(review): presumably relies on arrow treating the plural
        # `days=` keyword of `replace` as a relative shift; confirm against
        # the arrow version this project pins.
        start = self.date_value.replace(days=-(day_count - 1))
        return arrow.Arrow.range('day', start, self.date_value)

    @cached_property
    def latest_7_days(self):
        """ Day range from `replace(days=-6)` to `self.date_value`. """
        return self._latest_days(7)

    @cached_property
    def latest_30_days(self):
        """ Day range from `replace(days=-29)` to `self.date_value`. """
        return self._latest_days(30)
class TaskRange(TaskBase):
    """
    Task whose dependencies are described by a date range string such as
    "2014-10-01-2014-10-07".
    """

    # NOTE both `date_value` and `date_range` are required.
    #   1. date_value decides which date directory the output is written to
    #   2. date_range specifies the date range this task depends on

    def date_range(self):
        """
        Placeholder. Subclasses are expected to replace it with a parameter,
        e.g. `luigi.Parameter()` (a plain str for the time being) or
        `luigi.DateIntervalParameter()`.
        """
        raise ValueError("Overwrite Me!")

    @property
    def dates_in_range(self):
        """
        Return `list(DateUtils.weeks_in_range(first, last))` for the two
        dates encoded in `self.date_range`.

        Raises ValueError("Overwrite Me!") when `date_range` was not
        overridden by the subclass.
        """
        date_range = self.date_range
        if callable(date_range):
            # Still the placeholder method above: call it so the caller gets
            # the intended ValueError instead of an unrelated TypeError from
            # slicing a bound method.
            date_range = date_range()

        # NOTE the unit is fixed to week for now, because this is a range.
        method_1 = 'week' + "s_in_range"

        # s1 = "2014-10-01-2014-10-07"
        # s1[0:10]   =>   '2014-10-01'
        # s1[11:21]  =>   '2014-10-07'
        date_1, date_2 = date_range[0:10], date_range[11:21]

        return list(getattr(DateUtils, method_1)(date_1, date_2))
def MrTestCase(cls, verbose=False, date_value="2014-09-01"):
    """
    Class decorator: attach one generated `test_<task name>` method to `cls`
    for every name in `cls.mr_task_names`, so the test data declared on the
    task classes is easy to reference from the test case.

    Each generated test instantiates the task with `date_value`, feeds the
    non-empty stripped lines of `mrtest_input()` through `run_map_reduce`,
    and compares the result with the sorted JSON lines of `mrtest_output()`.
    """

    assert "mr_task_names" in dir(cls), "%s must assgin some task names!" % cls

    cls.maxDiff = None  # compact large json diff

    def log(message=""):
        # print only in verbose mode; an empty message prints a blank line
        if verbose:
            print(message)

    def map_lines(text):
        assert isinstance(text, unicode)
        stripped_lines = (raw.strip() for raw in text.split("\n"))
        return [line for line in stripped_lines if line]

    def generate_closure_function(mr_task_name1):
        # Resolved once per task name, outside of the test method. keep it!
        task_cls = Loader.load_a_task_by_name(mr_task_name1)
        log("[task_cls] %s" % (task_cls,))

        def test_mr(self):
            task_instance_1 = task_cls(date_value=date_value)
            log("[task_instance] %s" % (task_instance_1,))

            task_instance_1.lines = map_lines(task_instance_1.mrtest_input())
            expected_lines = map_lines(task_instance_1.mrtest_output())
            result_expect = sorted(
                read_json_from_mrtest_output(json_line, line_no)
                for line_no, json_line in enumerate(expected_lines, 1))

            self.assertEqual(result_expect, run_map_reduce(task_instance_1))
        return test_mr

    for mr_task_name1 in cls.mr_task_names:
        test_method_name = "test_" + mr_task_name1
        log()
        log("[test_method_name] %s" % (test_method_name,))

        setattr(
            cls, test_method_name, generate_closure_function(mr_task_name1))

        log()
        log()

    return cls
def read_json_from_mrtest_output(line, num):
    """
    Parse one JSON line of a task's `mrtest_output()`.

    `num` is the 1-based line number; it is only used to report which line
    is broken. On a parse error the offending line is printed and the
    original exception is re-raised.
    """
    try:
        return json.loads(line)
    except Exception:
        print(u"[line#%s] %s" % (num, line))
        # Bare `raise` keeps the original traceback (``raise e`` restarts it
        # at this line on Python 2).
        raise
class CommandUtils:
    """ Helper to run shell commands. """

    @staticmethod
    def execute(command_str, dry=False, verbose=True):
        """
        Run `command_str` through `os.system`.

        verbose: print the command line before running it.
        dry:     do not run anything, return False.

        Returns False in dry mode, otherwise the value returned by
        `os.system`.
        """
        if verbose:
            # Single-argument form prints the same text on Python 2 and 3.
            print("[command] %s" % (command_str,))
        if dry:
            return False

        # os.system (instead of capturing the output) prints logs in realtime.
        return os.system(command_str)
class DateUtils:
    """Date helpers built on the `arrow` library."""

    # Expose the arrow module itself as DateUtils.arrow.
    arrow = arrow

    @staticmethod
    def arrow_str(arrow1):
        """Format whatever `arrow.get` accepts as a "%Y-%m-%d" string."""
        moment = arrow.get(arrow1)
        return moment.datetime.strftime("%Y-%m-%d")

    @staticmethod
    def days_in_week(arrow1):
        """
        Return the 'day' range (as built by `arrow.Arrow.range`) from the
        floor to the ceil of the week containing `arrow1`.
        """
        moment = arrow.get(arrow1)
        week_start = moment.floor('week')
        week_end = moment.ceil('week')
        return arrow.Arrow.range('day', week_start, week_end)

    @staticmethod
    def weeks_in_range(arrow1, arrow2):
        """
        Return the 'week' range (as built by `arrow.Arrow.range`) from the
        week floor of `arrow1` to the week ceil of `arrow2`.
        """
        range_start = arrow.get(arrow1).floor('week')
        range_end = arrow.get(arrow2).ceil('week')
        return arrow.Arrow.range('week', range_start, range_end)

    @staticmethod
    def fixed_weeks_in_range(date_range_str):
        """
        Return only the weeks that the range covers completely; in the
        worst case both the first and the last week are trimmed.

        `date_range_str` must be exactly 21 characters long,
        e.g. "2014-09-01-2014-11-19".
        """
        assert len(date_range_str) == 21  # e.g. "2014-09-01-2014-11-19"
        first_date = arrow.get(date_range_str[0:10])
        last_date = arrow.get(date_range_str[11:21])
        weeks = DateUtils.weeks_in_range(first_date, last_date)
        if len(weeks) == 0:
            return weeks

        # If the range does not end on a Sunday, the logs of its last week
        # are incomplete, so that week is dropped.
        if last_date.weekday() != 6:  # 6 index is Sunday
            weeks = weeks[:-1]
        # Likewise, a range not starting on a Monday has a partial first week.
        if first_date.weekday() != 0:  # 0 index is Monday
            weeks = weeks[1:]
        return weeks

    @staticmethod
    def date_value_by_type_in_last(date_value_1, date_type_1):
        """
        Shift `date_value_1` back by one `date_type_1` unit (e.g. 'week')
        and floor the result to that unit.
        """
        moment = arrow.get(date_value_1)
        shift = {(date_type_1 + 's'): -1}
        return moment.replace(**shift).floor(date_type_1)
class HDFSUtils:
    """
    Run HDFS shell commands through a configurable command-line client.

    Every command method returns whatever `CommandUtils.execute` returns,
    so callers can inspect the exit status if they want to.

    NOTE(review): paths are interpolated into the shell command unquoted;
    callers must not pass untrusted input.
    """

    # Command prefix of the HDFS CLI. It has to be assigned by the host
    # project before any command method is called.
    hdfs_cli = NotImplemented

    @staticmethod
    def _run(arguments):
        """Prefix `arguments` with the configured CLI and execute it."""
        cli = HDFSUtils.hdfs_cli
        if cli is NotImplemented:
            # Same exception type as the bare `NotImplemented + str` used to
            # raise, but with an actionable message.
            raise TypeError(
                "HDFSUtils.hdfs_cli is not configured; assign the HDFS "
                "command prefix before running HDFS commands.")
        # CommandUtils.execute already echoes the command, so it is not
        # printed here as well (that used to log every command twice).
        return CommandUtils.execute(cli + arguments)

    @staticmethod
    def exists(path1):
        """Return whether `path1` exists, as reported by TargetUtils."""
        return TargetUtils.exists(path1)

    @staticmethod
    def copy(path1, path2):
        """Run `-cp path1 path2`."""
        return HDFSUtils._run(" -cp %s %s" % (path1, path2))

    @staticmethod
    def copyFromLocal(path1, path2):
        """Run `-copyFromLocal path1 path2`."""
        return HDFSUtils._run(" -copyFromLocal %s %s" % (path1, path2))

    @staticmethod
    def copyToLocal(path1, path2):
        """Run `-copyToLocal path1 path2`."""
        return HDFSUtils._run(" -copyToLocal %s %s" % (path1, path2))

    @staticmethod
    def chown(path1, user="primary_user"):
        """Run `-chown -R user path1`; `user` defaults to "primary_user"."""
        return HDFSUtils._run((" -chown -R %s " % user) + path1)

    @staticmethod
    def mkdir_p(dir1):
        """Run `-mkdir -p dir1`."""
        return HDFSUtils._run(" -mkdir -p " + dir1)

    @staticmethod
    def mkdir(dir1):
        """Run `-mkdir dir1`."""
        return HDFSUtils._run(" -mkdir " + dir1)

    @staticmethod
    def mv(src, dst):
        """Run `-mv src dst`."""
        return HDFSUtils._run(" -mv %s %s " % (src, dst))
class MathUtils:
    """Small numeric helpers."""

    @staticmethod
    def percent(a, b):
        """
        Return the ratio a / b as a float.

        Any falsy operand (e.g. None) is treated as 0, and a zero
        denominator yields 0.0 instead of raising.
        """
        numerator = a or 0
        denominator = b or 0
        if denominator == 0:
            return 0.0

        # Deliberately not rounded: the stored value keeps full precision.
        return numerator / float(denominator)
    @staticmethod
    def select_prefix_keys(line_part_a, idxes=None):
        """
        Build a new map key from the parts of `line_part_a` selected by the
        index list `idxes`.

        e.g. select_prefix_keys("232@@8923802@@afenti", [0,1])
             # => "232@@8923802"

        When `idxes` is None the normalized key is returned unchanged.
        """
        # Byte strings are decoded so the rest of the function works on
        # unicode text.
        if isinstance(line_part_a, str):
            line_part_a = line_part_a.decode("UTF-8")
        # Tolerate a malformed JSON key: an opening quote without a closing
        # one is simply stripped.
        if line_part_a.startswith(MRUtils.map_key_escape) and \
                (not line_part_a.endswith(MRUtils.map_key_escape)):
            line_part_a = line_part_a[1:]
        if line_part_a.startswith(MRUtils.map_key_escape):  # is a json
            line_part_a = json.loads(line_part_a)

        if idxes is None:
            return line_part_a
        else:
            # Split on the key separator, pick the requested parts in the
            # given order, and join them back with the same separator.
            parts = line_part_a.split(MRUtils.map_key_split)
            new_parts = []
            for idx_1 in idxes:
                new_parts.append(parts[idx_1])
            return MRUtils.map_key_split.join(new_parts)
class VisualiserEnvTemplate(object):
    """
    Setup luiti webui.

    Overwrite the attributes listed under "API list" below, either by
    subclassing or by passing a dict of overrides to the constructor.
    See the keys and their examples in `data`.
    """
    def __init__(self, kwargs=None):
        """
        Apply attribute overrides.

        `kwargs` is an optional dict mapping attribute names to new values.
        Raises ValueError when a key is not an existing attribute.
        """
        # Avoid a shared mutable default argument.
        if kwargs is None:
            kwargs = dict()
        assert isinstance(kwargs, dict), kwargs

        for k1, v1 in kwargs.items():
            if not hasattr(self, k1):
                # Both values must be passed as one tuple; formatting with
                # `% self, k1` raised TypeError instead of this ValueError.
                raise ValueError(
                    "%s dont has attribute \"%s\"" % (self, k1))
            setattr(self, k1, v1)

    @cached_property
    def data(self):
        """
        Return the resolved configuration dict.

        Each API attribute may be a plain value or a callable; callables are
        invoked to obtain the value.
        """
        def maybe_call(o1):
            if callable(o1):
                o1 = o1()
            return o1

        result = {
            "file_web_url_prefix": maybe_call(self.file_web_url_prefix),
            "date_begin": maybe_call(self.date_begin),
            "additional_task_parameters": maybe_call(self.additional_task_parameters),
            "package_config": maybe_call(self.package_config),
        }

        # check data valid
        extra_params = result["additional_task_parameters"]
        assert isinstance(extra_params, dict)
        if len(extra_params) > 0:
            # Spot-check one entry; next(iter(...)) works whether values()
            # returns a list or a view.
            val = next(iter(extra_params.values()))
            assert "values" in val
            assert "default" in val

        return result

    def __getitem__(self, k1):
        """Allow `env[key]` as a shortcut for `env.data[key]`."""
        return self.data[k1]

    # API list
    file_web_url_prefix = ""
    # NOTE: evaluated once, when the class body is executed.
    date_begin = ArrowParameter.now().replace(weeks=-1).format("YYYY-MM-DD")

    def additional_task_parameters(self):
        """
        Example is

        {
            "subject": {
                "values": ["english", "math"],
                "default": "english",
            }
        }
        """
        return dict()

    def package_config(self):
        """Return the package configuration; nothing is selected by default."""
        return {
            "default_selected": []
        }
select a task, separate in and out. 5 | var colors = { 6 | "requires": "lime", 7 | "self": "#7BE141", 8 | "upons": "green", 9 | }; 10 | 11 | var render_network = function(nodes, edges, container_id, click_event) { 12 | nodes = _.map(nodes, function(node) { 13 | if (_.contains(queryparams.selected_query.task_cls, node.label)) { 14 | node.color = colors.self; 15 | } else { 16 | node.color = colors.requires; 17 | }; 18 | return node; 19 | }); 20 | 21 | // NOTE: original code is http://visjs.org/examples/network/nodeStyles/customGroups.html 22 | var container = $(container_id)[0]; // create a network 23 | var data = { 24 | nodes: nodes, 25 | edges: edges 26 | }; 27 | var options = { 28 | nodes: { 29 | shape: 'dot', 30 | size: 20, 31 | font: { 32 | size: 15, 33 | color: '#000000' 34 | }, 35 | borderWidth: 2 36 | }, 37 | edges: { 38 | width: 2 39 | } 40 | }; 41 | 42 | var network = new vis.Network(container, data, options); 43 | network.on("click", click_event); 44 | }; 45 | 46 | 47 | var render_visualSearch = function(container_id, default_query, selected_query, vs_accepted_params) { 48 | var env_config_visualSearch = { 49 | "facet_values": (function() { 50 | var task_namespaces = _.map(["task_cls", "luiti_package"], function(param) { 51 | return {"label": param, "category": "Namespaces"}; 52 | }); 53 | var task_params= _.map(_.keys(default_query), function(param) { 54 | return {"label": param, "category": "Params"}; 55 | }); 56 | return task_params.concat(task_namespaces); 57 | })(), 58 | }; 59 | 60 | var get_current_query = function(visualSearch) { 61 | var result = {}; 62 | 63 | _.map(visualSearch.searchQuery.facets(), function(facet) { 64 | var kv = _.pairs(facet)[0]; 65 | if (_.has(result, kv[0])) { 66 | result[kv[0]].push(kv[1]); 67 | } else { 68 | result[kv[0]] = [kv[1]]; 69 | }; 70 | }); 71 | 72 | return result; 73 | } 74 | 75 | var vs_config = { 76 | container: $(container_id), 77 | query: '', 78 | autosearch: true, 79 | callbacks: { 80 | search: 
function(query, searchCollection) { 81 | return false; 82 | }, 83 | facetMatches: function(callback) { 84 | callback(env_config_visualSearch["facet_values"]); 85 | }, 86 | valueMatches: function(facet, searchTerm, callback) { 87 | // support smart match, from any position of strs. 88 | var orig_array = vs_accepted_params[facet]; 89 | searchTerm = searchTerm.toLowerCase(); 90 | var result = _.filter(orig_array , function(str) { 91 | return s.contains(str.toLowerCase(), searchTerm); 92 | }); 93 | // dont work, see more details at search_fact.js#autocompleteValues 94 | return callback(result); 95 | }, 96 | blur: function() { 97 | var result = get_current_query(visualSearch); 98 | 99 | // Update a React view. 100 | group_summary.setState({"selected_luiti_packages": result["luiti_package"]}) 101 | }, 102 | } 103 | }; 104 | 105 | // Example format is: visualSearch.searchBox.value("Country: US State: \"New York\" Key: Value") 106 | var load_params = function(query_opts) { 107 | // support same key with multiple values. 108 | var vs_values = []; 109 | _.each(query_opts, function(opt_values, opt_key) { 110 | _.each(opt_values, function(opt_value) { 111 | vs_values = vs_values.concat(JSON.stringify(opt_key) + ": " + JSON.stringify(opt_value)); 112 | }); 113 | }); 114 | return vs_values.join(" "); 115 | }; 116 | 117 | // Run it! 118 | var visualSearch = VS.init(vs_config); 119 | 120 | visualSearch.current_query = (function() { 121 | var result = _.extend({}, selected_query, URI.parseQuery(URI(window.location)._parts.query)); 122 | // wrap value in a Array. 
123 | _.each(_.keys(result), function(key) { 124 | if (!_.isArray(result[key])) { 125 | result[key] = [result[key]]; 126 | }; 127 | }); 128 | return result; 129 | })(); 130 | 131 | visualSearch.setValue = function(opts) { 132 | return visualSearch.searchBox.value(load_params(opts)); 133 | }; 134 | visualSearch.setValue(visualSearch.current_query); 135 | 136 | // support click query 137 | var searchBox = visualSearch.options.container.find(".VS-icon-search"); 138 | searchBox.click(function(event) { 139 | var result = get_current_query(visualSearch); 140 | 141 | // build a url query 142 | var url = URI(window.location); 143 | url._parts.query = ""; 144 | url.setQuery(result); 145 | window.location = url.build(); 146 | 147 | return false; 148 | }); 149 | searchBox.css("cursor", "pointer"); 150 | 151 | return visualSearch; 152 | }; 153 | 154 | 155 | var render_header_title = function(title) { 156 | $("head title").html(title); 157 | $("body #header .title").html(title); 158 | }; 159 | 160 | var render_all = function(env) { 161 | // 1. render network 162 | render_network(nodeedge.nodes, 163 | nodeedge.edges, 164 | "#network", 165 | function (params) { 166 | console.log("[click a node on #network]", params); 167 | var task_id = params["nodes"][0]; // only one task can be clicked. 168 | // Delegate to show TaskDetailView 169 | $("#nodes_groups").find('.nodes_group li[data-task-id="' + task_id + '"]').click(); 170 | }); 171 | 172 | // 2. render visualSearch 173 | env.visualSearch = render_visualSearch(".visual_search", queryparams.default_query, queryparams.selected_query, queryparams.accepted); 174 | 175 | // Other views. 176 | render_header_title(title); 177 | }; 178 | 179 | var init_data_url = "init_data.json" + location.search; 180 | 181 | $.getJSON(init_data_url, function(data) { 182 | // bind env's first level key to global `window` object. 
183 | _.each(data, function(value, key) { 184 | window[key] = value; 185 | }); 186 | window.env = data; 187 | console.log("load data", env); 188 | 189 | // transform data 190 | nodeedge.nodeid_to_node_dict = _.reduce(nodeedge.nodes, function(dict, node) { 191 | dict[node.id] = node; 192 | return dict; 193 | }, {}); 194 | 195 | render_all(env); 196 | 197 | // orig is 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 | 34 | 39 | 40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
# -*-coding:utf-8-*-


import os
from setuptools import setup


# setuptools resolves package_data patterns relative to the package
# directory, not relative to the directory holding setup.py.
PACKAGE_DIR = "luiti"


def get_static_files(root):
    """Return every file below `root` as a path relative to PACKAGE_DIR."""
    return [os.path.relpath(os.path.join(path, name), PACKAGE_DIR)
            for path, subdirs, files in os.walk(root)
            for name in files]


package_data = []
for static_root in ["luiti/java/",
                    "luiti/webui/assets/",
                    "luiti/webui/bower_components/",
                    ]:
    package_data.extend(get_static_files(static_root))
package_data.append("webui/index.html")

# Read the long description with a context manager so the file handle is
# closed deterministically.
with open("README.markdown") as readme_file:
    long_description = readme_file.read()


setup(
    name='luiti',
    version='0.2.2',
    url='http://github.com/luiti/luiti/',
    license='MIT',
    author='David Chen',
    author_email=''.join(reversed("moc.liamg@emojvm")),
    description='Luiti = Luigi + time',
    long_description=long_description,
    # Package names are dotted module paths, not filesystem paths.
    packages=[
        'luiti',
        'luiti.daemon',
        'luiti.daemon.query_engine',
        'luiti.daemon.utils',
        'luiti.daemon.web',
        'luiti.luigi_decorators',
        'luiti.luigi_extensions',
        'luiti.manager',
        'luiti.schedule',
        'luiti.task_templates',
        'luiti.task_templates.time',
        'luiti.task_templates.other',
        'luiti.tests',
        'luiti.utils', ],
    scripts=[
        'bin/luiti',
    ],

    package_data={'luiti': package_data},
    include_package_data=True,

    zip_safe=False,
    platforms='any',
    install_requires=[
        # 1. luigi related
        "luigi >=2.0,<2.2",
        "snakebite>=2.5,<2.6",
        "protobuf>=2.6,<2.7",
        "tornado>=4.0,<4.1",
        "mechanize>=0.2,<0.3",
        "python-daemon>=1.6,<1.7",
        "MySQL-python>=1.2,<1.3",
        "pymongo>=3.0",

        # 2. luiti self
        "etl_utils>=0.1,<0.2",
        "arrow>=0.4,<0.5",
        "inflector>=2.0,<2.1",
        "pygments>=2.0,<2.1",
        "ujson",
        "jsonpickle",
        "six",
        "tabulate",
        "toposort>=1.0,<1.1",
    ],
    classifiers=[
        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)
@luigi.ref_tasks("FoobarDay")
class CDay(TaskDay):
    """Daily test-fixture task that depends on FoobarDay."""

    root_dir = "/foobar"

    def requires(self):
        """Return the referenced FoobarDay task as this task's dependency."""
        # The task must be returned: merely evaluating the attribute makes
        # requires() return None, so no dependency is declared at all.
        return self.FoobarDay_task

    @cached_property
    def count(self):
        """Fixed count contributed by this task."""
        return 3
9 | """ 10 | 11 | root_dir = "/foobar" 12 | 13 | def mapper(self, line1): 14 | d2 = MRUtils.json_parse(line1) 15 | yield d2['uid'], d2 16 | 17 | def reducer(self, uid1, d1): 18 | yield '', MRUtils.str_dump({ 19 | "uid": uid1, 20 | "total": sum([i2['count'] for i2 in d1]), 21 | "ref": self.ref, 22 | }) 23 | 24 | ref = NotImplementedError 25 | 26 | def mrtest_input(self): 27 | return u""" 28 | {"uid": 1, "count": 2} 29 | {"uid": 1, "count": 3} 30 | {"uid": 2, "count": 1} 31 | """ 32 | 33 | def mrtest_output(self): 34 | return u""" 35 | {"uid": 1, "total": 5, "ref": "foobar"} 36 | {"uid": 2, "total": 1, "ref": "foobar"} 37 | """ 38 | 39 | def mrtest_attrs(self): 40 | return { 41 | "ref": "foobar", 42 | } 43 | -------------------------------------------------------------------------------- /tests/project_A/luiti_tasks/import_packages_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import TaskDay, cached_property 4 | 5 | 6 | class ImportPackagesDay(TaskDay): 7 | 8 | root_dir = "/foobar" 9 | 10 | @cached_property 11 | def egg_library(self): 12 | import zip_package_by_luiti # test import library from zip file 13 | return zip_package_by_luiti 14 | -------------------------------------------------------------------------------- /tests/project_A/luiti_tasks/multiple_dependent_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import luigi, TaskDay 4 | 5 | 6 | @luigi.ref_tasks("FoobarDay") 7 | class MultipleDependentDay(TaskDay): 8 | 9 | root_dir = "/foobar" 10 | 11 | def requires(self): 12 | return self.FoobarDay_task 13 | -------------------------------------------------------------------------------- /tests/project_B/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/project_B/__init__.py -------------------------------------------------------------------------------- /tests/project_B/luiti_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/project_B/luiti_tasks/__init__.py -------------------------------------------------------------------------------- /tests/project_B/luiti_tasks/__init_luiti.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ["luigi", "TaskDay", "cached_property"] 4 | 5 | from luiti import luigi, TaskDay, cached_property 6 | -------------------------------------------------------------------------------- /tests/project_B/luiti_tasks/h_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from luiti import TaskDay, cached_property, luigi 4 | 5 | 6 | @luigi.ref_tasks("MultipleDependentDay") 7 | class HDay(TaskDay): 8 | 9 | root_dir = "/foobar" 10 | 11 | def requires(self): 12 | return self.MultipleDependentDay_task 13 | 14 | @cached_property 15 | def count(self): 16 | return 8 17 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | os.environ['LUIGI_CONFIG_PATH'] = root_dir + '/tests/client.cfg' 8 | 9 | import unittest 10 | 11 | 12 | class TestLuiti(unittest.TestCase): 13 | 14 | def test_check_date_range(self): 15 | from luiti import luigi, TaskHour, arrow 16 | 17 | @luigi.check_date_range() 18 | class 
CheckDateRangeExampleHour(TaskHour): 19 | root_dir = "/foobar" 20 | 21 | def run(self): 22 | return "data" 23 | 24 | prev_hour = arrow.now().replace(hours=-1) 25 | 26 | # This week's data has to be run next week, i.e. a period can only be processed once it is over. 27 | self.assertEqual(CheckDateRangeExampleHour(prev_hour).run(), 'data') 28 | self.assertEqual(CheckDateRangeExampleHour(arrow.now()).run(), False) 29 | 30 | def test_check_runtime_range(self): 31 | from luiti import luigi, TaskWeek, arrow 32 | 33 | @luigi.check_runtime_range(hour_num=[5, 6, 7, 8], weekday_num=[1], ) 34 | class CheckRuntimeRangeExampleWeek(TaskWeek): 35 | root_dir = "/foobar" 36 | 37 | def run(self): 38 | return "data" 39 | 40 | day_1 = arrow.get("2014-09-01 06:28") # valid 41 | self.assertTrue(day_1) 42 | 43 | def func(d1): 44 | # overwrite arrow's method directly. 45 | arrow.now = lambda: arrow.get(d1) 46 | return CheckRuntimeRangeExampleWeek(d1).run() 47 | 48 | self.assertEqual(func("2014-09-01 09:00"), False) 49 | self.assertEqual(func("2014-09-02 06:28"), False) 50 | self.assertEqual(func("2014-09-01 04:28"), False) 51 | self.assertEqual(func("2014-09-01 05:00"), "data") 52 | self.assertEqual(func("2014-09-01 06:28"), "data") 53 | self.assertEqual(func("2014-09-01 08:59"), "data") 54 | 55 | 56 | if __name__ == '__main__': 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/test_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | RootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, RootDir) 7 | os.environ['LUIGI_CONFIG_PATH'] = RootDir + '/tests/client.cfg' 8 | 9 | import unittest 10 | import mock 11 | 12 | from luiti import manager 13 | from luiti.tests import date_begin 14 | 15 | sys.path.insert(0, os.path.join( 16 | RootDir, "tests/zip_package_by_luiti")) 17 | 18 | 19 | class TestManager(unittest.TestCase): 20 | 21 | def setUp(self): 22 | # change work dir 23 
| os.chdir(os.path.join(RootDir, "tests/project_A")) 24 | 25 | def test_Loader(self): 26 | self.assertEqual( 27 | manager.load_a_task_by_name("ADay"), 28 | manager.load_a_task_by_name("a_day"), 29 | ) 30 | 31 | self.assertRaises( 32 | AssertionError, 33 | lambda: manager.load_a_task_by_name("not_exists_day"), 34 | ) 35 | os.chdir(RootDir) 36 | 37 | def test_get_all_date_file_to_task_instances(self): 38 | ADay = manager.load_a_task_by_name("ADay") 39 | BDay = manager.load_a_task_by_name("BDay") 40 | files = manager.get_all_date_file_to_task_instances("20140901-20140903", [ADay, BDay]) 41 | self.assertEqual(['/foobar/2014-09-01/a_day.json', 42 | '/foobar/2014-09-01/b_day.json', 43 | '/foobar/2014-09-02/a_day.json', 44 | '/foobar/2014-09-02/b_day.json', 45 | '/foobar/2014-09-03/a_day.json', 46 | '/foobar/2014-09-03/b_day.json'], 47 | sorted(files.keys())) 48 | 49 | def test_load_all_tasks(self): 50 | all_tasks = manager.load_all_tasks() 51 | self.assertEqual(manager.ld.result, all_tasks) # because they are linked. 52 | 53 | HDay = manager.load_a_task_by_name("HDay") 54 | self.assertTrue(HDay in manager.ld.all_task_classes, "project B is also loaded.") 55 | 56 | def test_find_dep_on_tasks(self): 57 | # simple case 58 | # ADay depends on BDay, i.e. BDay is an input of ADay. 59 | BDay = manager.load_a_task_by_name("BDay") 60 | dep_tasks_by_BDay = manager.find_dep_on_tasks(BDay, manager.ld.all_task_classes) 61 | self.assertEqual(len(dep_tasks_by_BDay), 1) 62 | self.assertEqual(dep_tasks_by_BDay[0].__name__, "ADay") 63 | 64 | # complex case 65 | # MultipleDependentDay => HDay => DDay 66 | # delete MultipleDependentDay, and delete HDay and DDay. 
67 | MultipleDependentDay = manager.load_a_task_by_name("MultipleDependentDay") 68 | dep_tasks_by_MultipleDependentDay = manager.find_dep_on_tasks(MultipleDependentDay, manager.ld.all_task_classes) 69 | self.assertEqual(len(dep_tasks_by_MultipleDependentDay), 2) 70 | self.assertEqual(sorted(map(lambda i1: i1.__name__, dep_tasks_by_MultipleDependentDay)), ["DDay", "HDay"]) 71 | 72 | def test_generate_a_task(self): 73 | dir1 = "/tmp/test_generate_a_task/" 74 | os.system("rm -rf %s" % dir1) # clean prev error 75 | os.system("mkdir -p %s/luiti_tasks" % dir1) 76 | os.chdir(dir1) 77 | 78 | content_a = manager.generate_a_task("ADay") 79 | self.assertTrue("ADay" in content_a) 80 | self.assertTrue("TaskDay" in content_a) 81 | 82 | content_b = manager.generate_a_task("b_week") 83 | self.assertTrue("BWeek" in content_b) 84 | self.assertTrue("TaskWeek" in content_b) 85 | 86 | os.system("rm -rf %s" % dir1) 87 | os.chdir(RootDir) 88 | 89 | def test_new_a_project(self): 90 | os.chdir("/") # fix chdir err 91 | dir1 = "/tmp/test_new_a_project/" 92 | os.system("rm -rf %s" % dir1) # clean prev error 93 | os.system("mkdir -p %s" % dir1) 94 | os.chdir(dir1) 95 | 96 | files = manager.new_a_project("project_c") 97 | 98 | self.assertTrue("Project C" in file(files[0]).read()) 99 | self.assertTrue("zip_safe" in file(files[1]).read()) 100 | self.assertTrue("luigi.plug_packages" in file(files[2]).read()) 101 | self.assertTrue("@MrTestCase" in file(files[3]).read()) 102 | 103 | os.chdir("project_c") 104 | os.system("python tests/test_main.py") 105 | os.chdir("..") 106 | 107 | os.system("rm -rf %s" % dir1) 108 | os.chdir(RootDir) 109 | 110 | def test_CLI(self): 111 | from luiti.manager.cli import Cli 112 | 113 | cli = Cli(["luiti", "ls"]) 114 | self.assertTrue("ArgumentParser" in repr(cli.parser)) 115 | self.assertTrue(callable(cli.load_a_task_by_name)) 116 | 117 | self.assertTrue(cli.executor) 118 | 119 | for subcommand in cli.subparsers.choices.keys(): 120 | # Dumb test, just test function 
exists. 121 | # TODO but dont works 122 | self.assertTrue(callable(getattr(cli.executor, subcommand))) 123 | 124 | from luiti.manager.cli import bool_type 125 | self.assertEqual(bool_type("False"), False) 126 | self.assertEqual(bool_type("false"), False) 127 | 128 | def test_SysArgv(self): 129 | from luiti.manager.sys_argv import SysArgv 130 | from luiti.manager.cli import Cli 131 | 132 | def func(argv_in, argv_ou): 133 | cli = Cli(argv_in) 134 | self.assertEqual(SysArgv.convert_to_luigi_accepted_argv(cli.subparsers, argv_in), argv_ou) 135 | 136 | func(["luiti", "info", "--task-name", "HelloDay", "--date-value", date_begin], ['luiti', '--date-value', date_begin]) 137 | func(["luiti", "info", "--task-name=HelloDay"], ['luiti']) 138 | 139 | def test_Table(self): 140 | # TODO add more tests 141 | from luiti.manager.table import Table 142 | ADay = manager.load_a_task_by_name("ADay") 143 | self.assertEqual(Table.print_task_info(ADay), ([['Tasks self dep on', "['BDay', 'CDay']"], ['Tasks dep on self', '[]']], ['Task name', 'ADay'])) 144 | 145 | from luiti.manager.lazy_data import ld 146 | self.assertTrue(len(Table.print_all_tasks(ld.result)[0]) > 6, """Example data is ([[1, 'ADay', 'project_A'], [2, 'BDay', 'project_A'], [3, 'CDay', 'project_A'], [4, 'DDay', 'project_A'], [5, 'FoobarDay', 'project_A'], [6, 'HDay', 'project_B'], [7, 'ImportPackagesDay', 'project_A'], [8, 'MultipleDependentDay', 'project_A'], ['total', 8, '']], ['', 'All Tasks', 'luiti_package'])""") 147 | 148 | @mock.patch("luigi.hdfs.clients.rename") 149 | @mock.patch("luigi.hdfs.clients.exists") 150 | def test_Files(self, exists, rename): 151 | from luiti.manager.files import Files 152 | 153 | exists.return_value = True 154 | rename.return_value = True 155 | self.assertEqual(Files.soft_delete_files("hello", "world"), 0) 156 | 157 | def test_ManageDecorators(self): 158 | from luiti.luigi_extensions.manage_decorators import ManageDecorators 159 | from luiti import luigi 160 | luigi = 
ManageDecorators.bind_to(luigi) # actually it has already been run by luiti.luigi_extensions.__init__ 161 | self.assertTrue("as_a_luiti_task" in dir(luigi)) 162 | 163 | if __name__ == '__main__': 164 | unittest.main() 165 | -------------------------------------------------------------------------------- /tests/test_mr_test_case.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | RootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, RootDir) 7 | os.environ['LUIGI_CONFIG_PATH'] = RootDir + '/tests/client.cfg' 8 | 9 | from luiti.tests import SetupLuitiPackages 10 | config = SetupLuitiPackages.config 11 | 12 | import unittest 13 | from luiti import MrTestCase 14 | 15 | 16 | @MrTestCase 17 | class TestMrTestCase(unittest.TestCase): 18 | 19 | mr_task_names = [ 20 | 'FoobarDay', 21 | ] 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /tests/test_schedule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | 8 | import unittest 9 | 10 | from luiti.tests import SetupLuitiPackages 11 | config = SetupLuitiPackages.config 12 | 13 | from luiti.schedule import SensorSchedule 14 | from luiti import luigi, TaskDay, manager 15 | 16 | 17 | class TestSensorSchedule(unittest.TestCase): 18 | 19 | def test_read_all_required_tasks(self): 20 | BetaReportDay = manager.load_a_task_by_name("BetaReportDay") 21 | ss = SensorSchedule(BetaReportDay, "2014-09-01", False) 22 | 23 | result = map(lambda i1: i1.task_clsname, ss.ordered_task_instances_list) 24 | self.assertEqual(result, ['DumpBrowserMapDay', 'DumpWebLogDay', 'CleanWebLogDay', 'CounterVisitorByBrowserDay', 
'CounterVisitorByRegionDay', 'CounterVisitorDay', 'BetaReportDay']) 25 | 26 | def test_is_external(self): 27 | class ExampleExternalTask(luigi.ExternalTask): 28 | pass 29 | self.assertTrue(SensorSchedule.is_external(ExampleExternalTask())) 30 | 31 | class LuitiTaskDay(TaskDay): 32 | is_external = True 33 | root_dir = "/foobar" 34 | self.assertTrue(SensorSchedule.is_external(LuitiTaskDay(date_value="2014-09-01"))) 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /tests/test_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | os.environ['LUIGI_CONFIG_PATH'] = root_dir + '/tests/client.cfg' 8 | 9 | import unittest 10 | 11 | 12 | class TestLuitiUtils(unittest.TestCase): 13 | 14 | def test_main(self): 15 | from luiti import TaskWeek, ArrowParameter 16 | 17 | class HelloWorldWeek(TaskWeek): 18 | root_dir = "/foobar" 19 | 20 | # Tuesday 21 | task1 = HelloWorldWeek("2014-09-02") 22 | # Monday 23 | self.assertEqual(task1.date_value, ArrowParameter.get("2014-09-01")) 24 | 25 | self.assertEqual(task1.data_dir, "/foobar/2014-09-01") 26 | self.assertEqual( 27 | task1.data_file, "/foobar/2014-09-01/hello_world_week.json") 28 | self.assertEqual(task1.date_str, "2014-09-01") 29 | self.assertEqual(task1.date_type, "week") 30 | self.assertEqual( 31 | task1.date_value_by_type_in_last, ArrowParameter.get("2014-08-25")) 32 | self.assertEqual(task1.task_class, HelloWorldWeek) 33 | 34 | def test_RootTask(self): 35 | from luiti import RootTask 36 | output_path = RootTask().output().path 37 | self.assertTrue("luiti/luigi_extensions/root_task.py" in output_path, output_path) 38 | 39 | 40 | if __name__ == '__main__': 41 | unittest.main() 42 | 
-------------------------------------------------------------------------------- /tests/test_task_templates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | os.environ['LUIGI_CONFIG_PATH'] = root_dir + '/tests/client.cfg' 8 | 9 | import mock 10 | import unittest 11 | 12 | from luiti.tests import date_begin 13 | from etl_utils import cached_property 14 | 15 | 16 | class TestLuitiUtils(unittest.TestCase): 17 | 18 | @mock.patch("os.system") 19 | def test_MongoImportTask(self, os_system, ): 20 | os_system.return_value = 0 21 | 22 | from luiti import MongoImportTask 23 | 24 | class AnotherMongoDay(MongoImportTask): 25 | root_dir = "/tmp" 26 | 27 | mongodb_connection_address = ('192.168.20.111', 37001) 28 | database_name = "17zuoye_crm" 29 | collection_name = "teacher_report" 30 | tmp_filepath = "/foobar.json" 31 | data_file_collection_model = "MongoCollection(foobar)" 32 | 33 | is_collection_exists = lambda self: True 34 | 35 | mongo_task = AnotherMongoDay(date_value=date_begin) 36 | 37 | self.assertEqual(mongo_task.mongodb_connection_host, "192.168.20.111") 38 | self.assertEqual(mongo_task.mongodb_connection_port, 37001) 39 | self.assertEqual(mongo_task.mongoimport_command, "/usr/bin/mongoimport --host 192.168.20.111 --port 37001 --db 17zuoye_crm --collection teacher_report --file /foobar.json") 40 | self.assertEqual(mongo_task.tmp_dir, "/tmp/AnotherMongoDay") 41 | 42 | self.assertFalse(mongo_task.run()) 43 | 44 | def test_StaticFile(self): 45 | from luiti import StaticFile, luigi 46 | 47 | class FoobarFileDay(StaticFile): 48 | data_file = "/foobar" 49 | IODevice = luigi.LocalTarget 50 | self.assertEqual(FoobarFileDay().output().path, "/foobar") 51 | 52 | class OldFoobarFileDay(StaticFile): 53 | filepath = "/foobar" 54 | IODevice = luigi.LocalTarget 55 | 
self.assertEqual(OldFoobarFileDay().output().path, "/foobar") 56 | self.assertTrue(OldFoobarFileDay().complete()) 57 | self.assertFalse(OldFoobarFileDay().run()) 58 | 59 | def test_TaskDate(self): 60 | from luiti.task_templates import TaskMonth, TaskDay 61 | 62 | class AnotherMonthDay(TaskMonth): 63 | root_dir = "/tmp" 64 | 65 | class AnotherDay(TaskDay): 66 | root_dir = "/tmp" 67 | 68 | m1 = AnotherMonthDay(date_value=date_begin) 69 | self.assertEqual(len(m1.days_in_month), 30) 70 | 71 | m2 = AnotherDay(date_value="2015-07-20") 72 | self.assertEqual(m2.latest_30_days[0].format('YYYY-MM-DD'), '2015-06-21') 73 | self.assertEqual(m2.latest_30_days[-1].format('YYYY-MM-DD'), '2015-07-20') 74 | self.assertEquals(len(m2.latest_30_days), 30) 75 | 76 | m3 = AnotherDay(date_value="2015-07-20") 77 | self.assertEquals(m3.latest_7_days[0].format('YYYY-MM-DD'), '2015-07-14') 78 | self.assertEqual(m3.latest_7_days[-1].format('YYYY-MM-DD'), '2015-07-20') 79 | self.assertEquals(len(m3.latest_7_days), 7) 80 | 81 | def test_HiveTask(self): 82 | from luiti.task_templates import HiveTask 83 | 84 | class AnotherHiveDay(HiveTask): 85 | run_mode = "local" # dont print when run unit test 86 | root_dir = "/another/hive/result/" 87 | use_hive_db = "main_hive_database" 88 | 89 | @cached_property 90 | def sql_main(self): 91 | return "select * from example_table where dt=%s;" % self.date_str 92 | 93 | h1 = AnotherHiveDay(date_value=date_begin) 94 | self.assertEqual(h1.sql_main, "select * from example_table where dt=2014-09-01;") 95 | self.assertEqual(h1.query(), "USE main_hive_database; INSERT OVERWRITE DIRECTORY \"/another/hive/result/2014-09-01/another_hive_day.json\" select * from example_table where dt=2014-09-01;") 96 | 97 | class CompatibilityHiveDay(HiveTask): 98 | """ test old API """ 99 | data_root = "/foobar" 100 | hive_db = "foobar" 101 | 102 | h2 = CompatibilityHiveDay(date_value=date_begin) 103 | self.assertEqual(h2.root_dir, "/foobar") 104 | self.assertEqual(h2.use_hive_db, 
"foobar") 105 | 106 | def test_requires_with_prev_week(self): 107 | from luiti.task_templates import TaskDay, TaskWeek 108 | 109 | class OneDay(TaskDay): 110 | root_dir = "/tmp" 111 | 112 | class AnotherWeek(TaskWeek): 113 | root_dir = "/tmp" 114 | 115 | w1 = AnotherWeek(date_value=date_begin) 116 | tasks = w1.requires_with_prev_week(OneDay) 117 | self.assertEqual(len(tasks), 8) 118 | 119 | 120 | if __name__ == '__main__': 121 | unittest.main() 122 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | os.environ['LUIGI_CONFIG_PATH'] = root_dir + '/tests/client.cfg' 8 | 9 | import unittest 10 | import mock 11 | from luigi.mock import MockTarget 12 | 13 | 14 | class HdfsFile(MockTarget): 15 | pass 16 | 17 | 18 | class TestLuitiUtils(unittest.TestCase): 19 | 20 | def test_mr(self): 21 | from luiti import MRUtils 22 | 23 | item1 = {"class_id": 3, "uid": 7} 24 | self.assertEqual(MRUtils.mr_key(item1), "3@@7") 25 | self.assertEqual(MRUtils.mr_key(item1, "hid009"), "3@@7@@hid009") 26 | 27 | self.assertEqual(MRUtils.json_parse("{\"你好\":\"世界\"}"), {u"你好": u"世界"}) 28 | 29 | self.assertFalse(MRUtils.is_mr_line("[1,2]")) 30 | self.assertTrue(MRUtils.is_mr_line("hello\t{framework:luigi}")) 31 | self.assertTrue(MRUtils.is_mr_line("1@@" + "2" * 40 + "\t[world]")) 32 | 33 | self.assertEqual( 34 | MRUtils.unicode_value({u"hello": u"世界"}, "hello"), u"世界") 35 | 36 | self.assertEqual( 37 | MRUtils.split_mr_kv("hello\t[1,2,3,4]"), ["hello", [1, 2, 3, 4]]) 38 | 39 | self.assertEqual( 40 | MRUtils.merge_keys_in_dict([{"a": 1}, {"a": 2}], ["a"]), {"a": 3}) 41 | 42 | self.assertEqual( 43 | MRUtils.split_prefix_keys("1@@2@@other"), ["1", "2", "other"]) 44 | 45 | prefix_str1 = 
"232@@8923802@@afenti" 46 | prefix_str2 = "\"" + prefix_str1 47 | self.assertEqual( 48 | MRUtils.select_prefix_keys(prefix_str1, [0, 1]), "232@@8923802") 49 | self.assertEqual( 50 | MRUtils.select_prefix_keys(prefix_str2, [0, 1]), "232@@8923802") 51 | 52 | self.assertEqual( 53 | MRUtils.str_dump({"hello": u"世界"}), """{"hello": "世界"}""") 54 | 55 | self.assertEqual( 56 | MRUtils.filter_dict( 57 | {"hello": "world", "foobar": "barfoo"}, "hello"), 58 | {"hello": "world"}) 59 | 60 | def test_math(self): 61 | from luiti import MathUtils 62 | 63 | self.assertEqual(MathUtils.percent(5, 10), 0.5) 64 | self.assertEqual(MathUtils.percent(5, 0), 0) 65 | self.assertEqual(MathUtils.percent(5, None), 0) 66 | self.assertEqual(MathUtils.percent(None, 1), 0) 67 | 68 | def test_date(self): 69 | from luiti import DateUtils 70 | import arrow 71 | 72 | arrow1 = DateUtils.arrow.get("2014-10-01 12:01:01") 73 | arrow2 = DateUtils.arrow.get("2014-10-15 12:01:01") 74 | 75 | self.assertEqual(DateUtils.arrow_str(arrow1), "2014-10-01") 76 | 77 | self.assertEqual(len(DateUtils.days_in_week(arrow1)), 7) 78 | self.assertTrue(arrow1.floor('day') in DateUtils.days_in_week(arrow1)) 79 | 80 | self.assertEqual(len(DateUtils.weeks_in_range(arrow1, arrow2)), 3) 81 | 82 | self.assertEqual( 83 | len(DateUtils.fixed_weeks_in_range("2014-10-01-2014-10-15")), 1) 84 | self.assertEqual( 85 | len(DateUtils.fixed_weeks_in_range("2014-09-29-2014-10-15")), 2) 86 | 87 | self.assertEqual( 88 | DateUtils.date_value_by_type_in_last("2014-09-01", "week"), 89 | arrow.get("2014-08-25")) 90 | 91 | def test_ext(self): 92 | from etl_utils import cached_property 93 | from luiti.utils import ExtUtils 94 | import inspect 95 | 96 | class Foobar(ExtUtils.ExtendClass): 97 | 98 | def method_1(self): 99 | return "method_1" 100 | 101 | @property 102 | def property_1(self): 103 | return "property_1" 104 | 105 | @cached_property 106 | def cached_property_1(self): 107 | return "cached_property_1" 108 | 109 | fb1 = Foobar() 110 | 
self.assertEqual(fb1.method_1(), "method_1") 111 | self.assertEqual(fb1.property_1, "property_1") 112 | self.assertEqual(fb1.cached_property_1, "cached_property_1") 113 | 114 | self.assertTrue(inspect.ismethod(Foobar.method_1)) 115 | self.assertTrue(isinstance(Foobar.property_1, property)) 116 | self.assertTrue(isinstance(Foobar.cached_property_1, cached_property), Foobar.cached_property_1) 117 | 118 | Foobar.extend({ 119 | 'not_exist_str': "not_exist_str", 120 | 'method_1': lambda self: "method_2", 121 | 'property_1': lambda self: "property_2", 122 | 'cached_property_1': lambda self: "cached_property_2", 123 | }) 124 | 125 | fb2 = Foobar() 126 | self.assertEqual(fb2.method_1(), "method_2") 127 | self.assertEqual(fb2.property_1, "property_2") 128 | self.assertEqual(fb2.cached_property_1, "cached_property_2") 129 | 130 | self.assertTrue(isinstance(Foobar.not_exist_str, str)) 131 | self.assertTrue(inspect.ismethod(Foobar.method_1)) 132 | self.assertTrue(isinstance(Foobar.property_1, property)) 133 | self.assertTrue(isinstance(Foobar.cached_property_1, cached_property), Foobar.cached_property_1) 134 | 135 | @mock.patch("luigi.hdfs.exists") 136 | @mock.patch("luigi.hdfs.remove") 137 | def test_IOUtils(self, remove, exists): 138 | remove.return_value = True 139 | exists.return_value = True 140 | 141 | from luiti.utils import IOUtils 142 | 143 | self.assertEqual(IOUtils.json_dump({}), "{}") 144 | self.assertEqual(IOUtils.json_dump([{}]), "[{}]") 145 | 146 | f1 = HdfsFile("writor") 147 | self.assertEqual(IOUtils.write_json_to_output({}, f1), 0) 148 | 149 | f2 = HdfsFile("writor") 150 | with f2.open("w") as w2: 151 | w2.write("""{"foo":"bar"}""") 152 | self.assertEqual(IOUtils.read_json_from_output(f2), {"foo": "bar"}) 153 | 154 | f3 = HdfsFile("writor_error") 155 | with f3.open("w") as w3: 156 | w3.write("""{"foo":"bar"}\n{}""") # two lines 157 | self.assertRaises(ValueError, lambda: IOUtils.read_json_from_output(f3)) 158 | 159 | self.assertTrue(IOUtils.remove_files("f1", 
"f2"), True) 160 | 161 | def test_TargetUtils(self): 162 | from luiti.utils import TargetUtils 163 | 164 | def mock_test_file(filename, data): 165 | f = HdfsFile(filename) 166 | with f.open("w") as w: 167 | w.write(data) 168 | 169 | return f 170 | g1 = TargetUtils.line_read(mock_test_file("g1", """\nline one\nline two\n \n""")) 171 | self.assertTrue(list(g1), [u"line one", u"line two"]) 172 | 173 | g2 = TargetUtils.json_read(mock_test_file("g1", """\n{"a": 1}\n[1, "b"] \n \n""")) 174 | self.assertTrue(list(g2), [{"a": 1}, [1, "b"]]) 175 | 176 | @mock.patch("luiti.utils.HDFSUtils.hdfs_cli") 177 | @mock.patch("luiti.utils.CommandUtils.execute") 178 | @mock.patch("luiti.utils.HDFSUtils.copyToLocal") 179 | @mock.patch("os.path.isdir") 180 | @mock.patch("luiti.utils.HDFSUtils.exists") 181 | def test_CompressUtils(self, hdfs_exists, os_path_isdir, copyToLocal, execute, hdfs_cli): 182 | """ a rough test ... """ 183 | hdfs_exists.return_value = True 184 | os_path_isdir.return_value = False 185 | copyToLocal.return_value = True 186 | execute.return_value = True 187 | hdfs_cli.return_value = "hdfs" 188 | 189 | from luiti.utils import CompressUtils 190 | self.assertTrue(CompressUtils.unzip_with_upload( 191 | "orig", "dist", 192 | tmp_dir="/tmp", 193 | tmp_name="foobar")) 194 | 195 | @mock.patch("os.system") 196 | def test_CommandUtils(self, os_system): 197 | os_system.return_value = 0 198 | 199 | from luiti.utils import CommandUtils 200 | self.assertEqual(CommandUtils.execute("ls"), 0) 201 | self.assertEqual(CommandUtils.execute("ls", dry=True), 0) 202 | 203 | 204 | if __name__ == '__main__': 205 | unittest.main() 206 | -------------------------------------------------------------------------------- /tests/webui_packages/README.markdown: -------------------------------------------------------------------------------- 1 | Test webui visualizer. 2 | ======================= 3 | 4 | 5 | Package relations. 
6 | ----------------------- 7 | ```text 8 | Hierarchical data warehouse 9 | 10 | / dump 11 | | || 12 | | \/ 13 | / clean 14 | Data Flow || 15 | \ \/ 16 | | middle 17 | | || 18 | | \/ 19 | \ summary 20 | 21 | ``` 22 | 23 | 24 | Some overwritten configuration. 25 | ----------------------- 26 | See it at `luiti_webui_tests/__init__.py` 27 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/README.markdown: -------------------------------------------------------------------------------- 1 | Luiti Clean 2 | ======================= 3 | 4 | TODO ... -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/luiti_clean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_clean/luiti_clean/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/luiti_clean/luiti_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_clean/luiti_clean/luiti_tasks/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/luiti_clean/luiti_tasks/__init_luiti.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["WebuiDay", "luigi"] 4 | 5 | 6 | from luiti_webui_tests import WebuiDay, luigi 7 | luigi.plug_packages("luiti_dump", "luiti_clean", "luiti_middle", "luiti_summary") 8 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/luiti_clean/luiti_tasks/clean_web_log_day.py: 
-------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay, luigi 4 | 5 | 6 | @luigi.ref_tasks("DumpWebLogDay") 7 | class CleanWebLogDay(WebuiDay): 8 | """ 9 | Clean web log 10 | """ 11 | 12 | def requires(self): 13 | return self.DumpWebLogDay_task 14 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/setup.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="luiti_clean", 7 | version="0.0.1", 8 | packages=[ 9 | "luiti_clean", 10 | "luiti_clean/luiti_tasks", ], 11 | zip_safe=False, 12 | ) -------------------------------------------------------------------------------- /tests/webui_packages/luiti_clean/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | 8 | import unittest 9 | from luiti import MrTestCase 10 | 11 | 12 | @MrTestCase 13 | class TestMapReduce(unittest.TestCase): 14 | mr_task_names = [ 15 | ] 16 | 17 | if __name__ == '__main__': 18 | unittest.main() -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/README.markdown: -------------------------------------------------------------------------------- 1 | Luiti Dump 2 | ======================= 3 | 4 | TODO ... 
-------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/luiti_dump/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_dump/luiti_dump/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/luiti_dump/luiti_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_dump/luiti_dump/luiti_tasks/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/luiti_dump/luiti_tasks/__init_luiti.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["WebuiDay", "luigi"] 4 | 5 | 6 | from luiti_webui_tests import WebuiDay, luigi 7 | luigi.plug_packages("luiti_dump", "luiti_clean", "luiti_middle", "luiti_summary") 8 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/luiti_dump/luiti_tasks/dump_browser_map_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay 4 | from etl_utils import cached_property 5 | 6 | 7 | class DumpBrowserMapDay(WebuiDay): 8 | """ 9 | Mimic dump {int: name} format data from MySQL relational database. 10 | """ 11 | 12 | @cached_property 13 | def cached_data(self): 14 | """ 15 | Actually need to read data from self.output(). 
16 | """ 17 | return { 18 | "Google Chrome": 1, 19 | "Mozilla Firefox": 2, 20 | "IE": 3, 21 | } 22 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/luiti_dump/luiti_tasks/dump_web_log_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay 4 | 5 | 6 | class DumpWebLogDay(WebuiDay): 7 | """ 8 | Dump web log from other database/storage. 9 | """ 10 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/setup.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="luiti_dump", 7 | version="0.0.1", 8 | packages=[ 9 | "luiti_dump", 10 | "luiti_dump/luiti_tasks", ], 11 | zip_safe=False, 12 | ) -------------------------------------------------------------------------------- /tests/webui_packages/luiti_dump/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | 8 | import unittest 9 | from luiti import MrTestCase 10 | 11 | 12 | @MrTestCase 13 | class TestMapReduce(unittest.TestCase): 14 | mr_task_names = [ 15 | ] 16 | 17 | if __name__ == '__main__': 18 | unittest.main() -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/README.markdown: -------------------------------------------------------------------------------- 1 | Luiti Middle 2 | ======================= 3 | 4 | TODO ... 
-------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_middle/luiti_middle/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/__init_luiti.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["WebuiDay", "luigi"] 4 | 5 | 6 | from luiti_webui_tests import WebuiDay, luigi 7 | luigi.plug_packages("luiti_dump", "luiti_clean", "luiti_middle", "luiti_summary") 8 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/counter_visitor_by_browser_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay, luigi 4 | 5 | 6 | @luigi.ref_tasks("CleanWebLogDay", "DumpBrowserMapDay") 7 | class CounterVisitorByBrowserDay(WebuiDay): 8 | """ 9 | I'm 10 | Counter 11 | Visitor 12 | By 13 | Browser 14 | Day. 
15 | """ 16 | 17 | def requires(self): 18 | return [self.CleanWebLogDay_task, self.DumpBrowserMapDay_task] 19 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/counter_visitor_by_region_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay, luigi 4 | 5 | 6 | @luigi.ref_tasks("CleanWebLogDay") 7 | class CounterVisitorByRegionDay(WebuiDay): 8 | 9 | def requires(self): 10 | return self.CleanWebLogDay_task 11 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/luiti_middle/luiti_tasks/counter_visitor_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay, luigi 4 | 5 | 6 | @luigi.ref_tasks("CounterVisitorByBrowserDay") 7 | class CounterVisitorDay(WebuiDay): 8 | 9 | def requires(self): 10 | return self.CounterVisitorByBrowserDay_task 11 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/setup.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="luiti_middle", 7 | version="0.0.1", 8 | packages=[ 9 | "luiti_middle", 10 | "luiti_middle/luiti_tasks", ], 11 | zip_safe=False, 12 | ) -------------------------------------------------------------------------------- /tests/webui_packages/luiti_middle/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | 8 | import unittest 9 | from luiti import MrTestCase 10 | 11 | 12 | 
@MrTestCase 13 | class TestMapReduce(unittest.TestCase): 14 | mr_task_names = [ 15 | ] 16 | 17 | if __name__ == '__main__': 18 | unittest.main() -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/README.markdown: -------------------------------------------------------------------------------- 1 | Luiti Summary 2 | ======================= 3 | 4 | TODO ... -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/luiti_summary/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_summary/luiti_summary/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/luiti_summary/luiti_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/webui_packages/luiti_summary/luiti_summary/luiti_tasks/__init__.py -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/luiti_summary/luiti_tasks/__init_luiti.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | __all__ = ["WebuiDay", "luigi"] 4 | 5 | 6 | from luiti_webui_tests import WebuiDay, luigi, VisualiserEnvTemplate 7 | luigi.plug_packages("luiti_dump", "luiti_clean", "luiti_middle", "luiti_summary") 8 | 9 | 10 | # Plug more packages to stay compatible with old tests, without migrating them to webui_packages entirely. 
11 | luigi.plug_packages("project_A", "project_B", "zip_package_by_luiti") 12 | 13 | 14 | luiti_visualiser_env = VisualiserEnvTemplate({ 15 | "file_web_url_prefix": lambda: "http://HUE/filebrowser/#/", 16 | "date_begin": "2014-09-01", 17 | "additional_task_parameters": { 18 | "language": { 19 | "values": ["Chinese", "English"], 20 | "default": "English", 21 | } 22 | }, 23 | "package_config": { 24 | "defaults": ["luiti_dump", "luiti_clean", "luiti_middle", "luiti_summary", ], 25 | } 26 | }) 27 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/luiti_summary/luiti_tasks/beta_report_day.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from .__init_luiti import WebuiDay, luigi 4 | 5 | 6 | @luigi.ref_tasks("CounterVisitorByBrowserDay", "CounterVisitorByRegionDay", "CounterVisitorDay") 7 | class BetaReportDay(WebuiDay): 8 | """ 9 | Beta report day's document. 
10 | """ 11 | 12 | def requires(self): 13 | return [self.CounterVisitorByBrowserDay_task, 14 | self.CounterVisitorByRegionDay_task, 15 | self.CounterVisitorDay_task] 16 | -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/setup.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="luiti_summary", 7 | version="0.0.1", 8 | packages=[ 9 | "luiti_summary", 10 | "luiti_summary/luiti_tasks", ], 11 | zip_safe=False, 12 | ) -------------------------------------------------------------------------------- /tests/webui_packages/luiti_summary/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | sys.path.insert(0, root_dir) 7 | 8 | import unittest 9 | from luiti import MrTestCase 10 | 11 | 12 | @MrTestCase 13 | class TestMapReduce(unittest.TestCase): 14 | mr_task_names = [ 15 | ] 16 | 17 | if __name__ == '__main__': 18 | unittest.main() -------------------------------------------------------------------------------- /tests/webui_packages/luiti_webui_tests/luiti_webui_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | """ 4 | Provide test environment for webui_packages. 5 | """ 6 | 7 | import os 8 | from etl_utils import cached_property 9 | from luiti import luigi, TaskDay, VisualiserEnvTemplate 10 | from luigi.mock import MockTarget 11 | 12 | 13 | @cached_property 14 | def root_dir(self): 15 | return os.path.join("/webui_packages", self.package_name) 16 | 17 | 18 | def data_file(self): 19 | return os.path.join(self.root_dir, self.task_clsname, self.date_str) 20 | 21 | 22 | def mock_output(self): 23 | """ Use luigi's feature. 
""" 24 | return MockTarget(self.data_file) 25 | 26 | 27 | class WebuiDay(TaskDay): 28 | """ 29 | Don't overwrite TaskDay or TaskBase, or will fail other tests files. 30 | """ 31 | pass 32 | 33 | 34 | WebuiDay.extend({ 35 | "root_dir": root_dir, 36 | "data_file": data_file, 37 | "output": mock_output, 38 | }) 39 | 40 | 41 | __all__ = ["luigi", "WebuiDay", "VisualiserEnvTemplate"] 42 | -------------------------------------------------------------------------------- /tests/zip_package_by_luiti/setup.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8-*- 2 | 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='zip_package_by_luiti', 8 | version='0.0.1', 9 | packages=[ 10 | 'zip_package_by_luiti', 11 | 'zip_package_by_luiti/subfold', ], 12 | zip_safe=True, 13 | ) 14 | -------------------------------------------------------------------------------- /tests/zip_package_by_luiti/zip_package_by_luiti/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/zip_package_by_luiti/zip_package_by_luiti/__init__.py -------------------------------------------------------------------------------- /tests/zip_package_by_luiti/zip_package_by_luiti/subfold/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/tests/zip_package_by_luiti/zip_package_by_luiti/subfold/__init__.py -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. 
To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | 7 | [tox] 8 | envlist = py{27}-{cdh}, pep8 9 | skipsdist = True 10 | 11 | [testenv] 12 | usedevelop = True 13 | deps= 14 | coverage>=3.6,<3.999 15 | coveralls 16 | nose 17 | mock 18 | setenv = 19 | COVERAGE_PROCESS_START={toxinidir}/.coveragerc 20 | FULL_COVERAGE=true 21 | commands = 22 | python --version 23 | python setup.py install 24 | nosetests --with-coverage --cover-inclusive --cover-package=luiti 25 | coverage combine 26 | coveralls 27 | 28 | [testenv:clean] 29 | commands= 30 | coverage erase 31 | 32 | [testenv:stats] 33 | commands= 34 | coverage report 35 | coverage html 36 | --------------------------------------------------------------------------------