├── .coveragerc
├── .gitignore
├── .travis.yml
├── MANIFEST.in
├── README.markdown
├── README.zh_CN.markdown
├── TODO.markdown
├── bin
└── luiti
├── changelog.markdown
├── example_webui_run.py
├── install-dependencies.sh
├── luiti
├── __init__.py
├── daemon
│ ├── __init__.py
│ ├── graph.py
│ ├── ptm.py
│ ├── query_engine
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── create_task.py
│ │ └── params.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── cache.py
│ │ ├── string.py
│ │ ├── task_storage.py
│ │ └── template.py
│ └── web
│ │ ├── __init__.py
│ │ ├── assets.py
│ │ ├── code_render.py
│ │ ├── handlers.py
│ │ └── server.py
├── java
│ └── MultipleTextFiles.java
├── luigi_decorators
│ ├── __init__.py
│ ├── as_a_luiti_task.py
│ ├── check_date_range.py
│ ├── check_runtime_range.py
│ ├── mr_local.py
│ ├── multiple_text_files.py
│ ├── persist_files.py
│ ├── plug_packages.py
│ └── ref_tasks.py
├── luigi_extensions
│ ├── __init__.py
│ ├── create_python_package.py
│ ├── hadoop_ext.py
│ ├── luigi_root_context.py
│ ├── manage_decorators.py
│ ├── parameter.py
│ ├── root_task.py
│ ├── task_base.py
│ └── task_init.py
├── manager
│ ├── __init__.py
│ ├── active_packages.py
│ ├── cli.py
│ ├── config.py
│ ├── dep.py
│ ├── files.py
│ ├── generate_from_templates.py
│ ├── lazy_data.py
│ ├── loader.py
│ ├── package_map.py
│ ├── sys_argv.py
│ └── table.py
├── schedule
│ ├── __init__.py
│ └── sensor_schedule.py
├── task_templates
│ ├── __init__.py
│ ├── other
│ │ ├── __init__.py
│ │ ├── hive_task.py
│ │ ├── mongo_import_task.py
│ │ └── static_file.py
│ └── time
│ │ ├── __init__.py
│ │ ├── task_biweekly.py
│ │ ├── task_biweekly_hadoop.py
│ │ ├── task_day.py
│ │ ├── task_day_hadoop.py
│ │ ├── task_hour.py
│ │ ├── task_hour_hadoop.py
│ │ ├── task_month.py
│ │ ├── task_month_hadoop.py
│ │ ├── task_quarter.py
│ │ ├── task_quarter_hadoop.py
│ │ ├── task_range.py
│ │ ├── task_range_hadoop.py
│ │ ├── task_week.py
│ │ ├── task_week_hadoop.py
│ │ ├── task_year.py
│ │ └── task_year_hadoop.py
├── tests
│ ├── __init__.py
│ ├── mr_test_case.py
│ └── setup_luiti_packages.py
├── utils
│ ├── __init__.py
│ ├── command_utils.py
│ ├── compress_utils.py
│ ├── date_utils.py
│ ├── ext_utils.py
│ ├── hdfs_utils.py
│ ├── io_utils.py
│ ├── math_utils.py
│ ├── mr_utils.py
│ ├── target_utils.py
│ └── visualiser_env_template.py
└── webui
│ ├── INSTALL.markdown
│ ├── assets
│ ├── javascripts
│ │ └── luiti.js
│ ├── jsx
│ │ └── luiti.jsx
│ └── stylesheets
│ │ └── luiti.css
│ ├── bower.json
│ └── index.html
├── requirements.txt
├── screenshots
├── README.markdown
├── luiti_code_show.png
├── luiti_webui_list.png
└── luiti_webui_show.png
├── setup.py
├── tests
├── client.cfg
├── jsons_data
│ └── mr_local.json
├── project_A
│ ├── __init__.py
│ └── luiti_tasks
│ │ ├── __init__.py
│ │ ├── __init_luiti.py
│ │ ├── a_day.py
│ │ ├── b_day.py
│ │ ├── c_day.py
│ │ ├── d_day.py
│ │ ├── foobar_day.py
│ │ ├── import_packages_day.py
│ │ └── multiple_dependent_day.py
├── project_B
│ ├── __init__.py
│ └── luiti_tasks
│ │ ├── __init__.py
│ │ ├── __init_luiti.py
│ │ └── h_day.py
├── test_daemon.py
├── test_luigi_decorators.py
├── test_main.py
├── test_manager.py
├── test_mr_test_case.py
├── test_schedule.py
├── test_task.py
├── test_task_templates.py
├── test_utils.py
├── webui_packages
│ ├── README.markdown
│ ├── luiti_clean
│ │ ├── README.markdown
│ │ ├── luiti_clean
│ │ │ ├── __init__.py
│ │ │ └── luiti_tasks
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init_luiti.py
│ │ │ │ └── clean_web_log_day.py
│ │ ├── setup.py
│ │ └── tests
│ │ │ └── test_main.py
│ ├── luiti_dump
│ │ ├── README.markdown
│ │ ├── luiti_dump
│ │ │ ├── __init__.py
│ │ │ └── luiti_tasks
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init_luiti.py
│ │ │ │ ├── dump_browser_map_day.py
│ │ │ │ └── dump_web_log_day.py
│ │ ├── setup.py
│ │ └── tests
│ │ │ └── test_main.py
│ ├── luiti_middle
│ │ ├── README.markdown
│ │ ├── luiti_middle
│ │ │ ├── __init__.py
│ │ │ └── luiti_tasks
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init_luiti.py
│ │ │ │ ├── counter_visitor_by_browser_day.py
│ │ │ │ ├── counter_visitor_by_region_day.py
│ │ │ │ └── counter_visitor_day.py
│ │ ├── setup.py
│ │ └── tests
│ │ │ └── test_main.py
│ ├── luiti_summary
│ │ ├── README.markdown
│ │ ├── luiti_summary
│ │ │ ├── __init__.py
│ │ │ └── luiti_tasks
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __init_luiti.py
│ │ │ │ └── beta_report_day.py
│ │ ├── setup.py
│ │ └── tests
│ │ │ └── test_main.py
│ └── luiti_webui_tests
│ │ └── luiti_webui_tests
│ │ └── __init__.py
└── zip_package_by_luiti
│ ├── setup.py
│ └── zip_package_by_luiti
│ ├── __init__.py
│ └── subfold
│ └── __init__.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | omit =
3 | */python?.?/*
4 | */site-packages/nose/*
5 | *__init__*
6 | */__init__.py
7 | */*/__init__.py
8 | luiti/utils/__init__.py
9 | tests/*
10 | */setup.py
11 |
12 | [run]
13 | parallel = True
14 | source = luiti
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 |
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 |
37 | # Translations
38 | *.mo
39 |
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 |
45 | # Rope
46 | .ropeproject
47 |
48 | # Django stuff:
49 | *.log
50 | *.pot
51 |
52 | # Sphinx documentation
53 | docs/_build/
54 |
55 | *.yml
56 | *.jar
57 | *.coverage.*
58 |
59 | bower_components
60 | node_modules
61 | .idea/
62 | .DS_Store
63 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | env:
4 | global:
5 | - PIP_DOWNLOAD_CACHE=$HOME/.pip-cache
6 | matrix:
7 | - TOXENV=pep8
8 | - TOXENV=docs
9 | - TOXENV=py27-nonhdfs
10 | - TOXENV=py33-nonhdfs
11 | - TOXENV=py34-nonhdfs
12 | - TOXENV=py27-cdh
13 | - TOXENV=py33-cdh
14 | - TOXENV=py34-cdh
15 |
16 | sudo: false
17 |
18 | cache:
19 | - $HOME/.pip-cache
20 |
21 | install:
22 | - pip install coveralls
23 | - pip install tox
24 |
25 | before_script:
26 | # allow ssh loopback
27 | - ssh-keygen -t rsa -N '' -C '' -f ~/.ssh/id_rsa
28 | - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
29 | - ssh -o StrictHostKeyChecking=no localhost true
30 |
31 | - ./install-dependencies.sh
32 |
33 | script:
34 | - nosetests
35 | - coverage run --source=luiti setup.py test
36 |
37 | after_failure:
38 | - cat /home/travis/build/luiti/luiti/.tox/cdh/log/cdh-1.log
39 |
40 | after_success:
41 | - coveralls
42 |
43 | branches:
44 | only:
45 | - master
46 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include changelog.markdown
2 | include README.markdown
3 |
4 | include luiti/java/*.java
5 |
6 | include luiti/webui/assets/*/**
7 |
8 | include luiti/webui/*.html
9 | recursive-include luiti/webui/bower_components *
10 |
--------------------------------------------------------------------------------
/TODO.markdown:
--------------------------------------------------------------------------------
1 | 1. Separate MapReduce's requires, one is used to input, another is used
2 | to dict.
3 | 2. Clean /tmp/sjfljslfjs after package task related files into a tar.
4 | 3. Support without current package.
5 |
6 |
7 | ## WebUI
8 | 1. OPTIMIZE task dep infos.
9 | 2. Add daemon tests.
10 | 3. Add webui tests.
11 |
--------------------------------------------------------------------------------
/bin/luiti:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Console entry point: pass the raw command line to luiti's ``Cli`` and run it."""

import sys
from luiti.manager import Cli

# Build the command line interface from the process arguments and run it.
Cli(sys.argv).run()
8 |
--------------------------------------------------------------------------------
/changelog.markdown:
--------------------------------------------------------------------------------
1 | ### 0.2.2 - Nov 10, 2015
2 | * Add lots of test cases
3 | * Document wording
4 |
5 | ### 0.2.1 - July 15, 2015
6 | * Add SensorSchedule to wait external task to finish lazily.
7 | * Add WebUI screenshots.
8 | * Lots of bug fixes.
9 |
10 | ### 0.2.0 - July 7, 2015
11 | * Add WebUI and daemon.
12 | * Lots of bug fixes and refactor.
13 |
14 | ### 0.1.4 - May 10, 2015
15 | * Add English README
16 |
17 | ### 0.1.3 - April 20, 2015
18 | * All code conforms to PEP8 style.
19 | * Add @luigi.multiple_text_files decorator
20 |
21 | ### 0.1.2 - April 20, 2015
22 | * Project is more solid, add services such as travis, etc.
23 |
24 | ### 0.1.0 - March 24, 2015
25 | * Stable version, compatible with luigi==1.0.19 and snakebite==1.3.8,
26 | is already validated in a production environment.
27 |
--------------------------------------------------------------------------------
/example_webui_run.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""
Start luiti's WebUI server on localhost:8082 against the bundled test packages
and print two example URLs to open in a browser.
"""

import os
import sys

try:
    # Try load installed version first.
    import luiti
    luiti  # bare reference, keeps the import from looking unused
except ImportError:
    # Not installed: fall back to the source checkout this script lives in.
    # Only ImportError is handled, so Ctrl-C or SystemExit raised while
    # importing are no longer swallowed.
    root_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, root_dir)

import logging
logger = logging.getLogger("luiti.server")

# link webui_packages path
from luiti.tests import SetupLuitiPackages
config = SetupLuitiPackages.config
from luiti.daemon import Server


task_list_url = "http://localhost:8082/luiti/dag_visualiser?date_value=2015-07-09T00%3A00%3A00%2B08%3A00&language=English&luiti_package=luiti_summary&luiti_package=luiti_clean&luiti_package=luiti_dump&luiti_package=luiti_middle&luiti_package=project_A&luiti_package=project_B"
# Same query as the list view, narrowed down to a single task class.
task_show_url = task_list_url + "&task_cls=BetaReportDay"

# generated from http://www.network-science.de/ascii/
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("Welcome to luiti's test webui example!")
print("")
print(" Open below two urls in your favourite browser.")
print("")
print(" task_list_url:  " + task_list_url)
print(" task_show_url:  " + task_show_url)
print("")

Server("localhost", 8082).run()
37 |
--------------------------------------------------------------------------------
/install-dependencies.sh:
--------------------------------------------------------------------------------
1 | # Travis had already installed Node.js with npm.
2 | npm install bower -g
3 | cd luiti/webui; bower install; cd -;
4 |
5 | # Install eggs dependencies.
6 |
7 | # Fix => Reading http://pyparsing.wikispaces.com/ error: timed out
8 | pip install pyparsing --retries 10 --timeout 60
9 | python setup.py install
10 |
--------------------------------------------------------------------------------
/luiti/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['luigi', 'config', "VisualiserEnvTemplate",
4 |
5 | 'TaskBase',
6 | "TaskHour",
7 | "TaskHourHadoop",
8 | "TaskDay",
9 | "TaskDayHadoop",
10 | "TaskWeek",
11 | "TaskWeekHadoop",
12 | "TaskBiweekly",
13 | "TaskBiweeklyHadoop",
14 | "TaskMonth",
15 | "TaskMonthHadoop",
16 | "TaskQuarter",
17 | "TaskQuarterHadoop",
18 | "TaskYear",
19 | "TaskYearHadoop",
20 | "TaskRange",
21 | "TaskRangeHadoop",
22 |
23 | 'RootTask',
24 |
25 | 'StaticFile',
26 | 'MongoImportTask',
27 | 'HiveTask',
28 |
29 | 'HadoopExt',
30 |
31 | 'manager',
32 |
33 | 'IOUtils', 'DateUtils', 'TargetUtils', 'HDFSUtils',
34 | 'MRUtils', 'MathUtils', 'CommandUtils',
35 | 'CompressUtils',
36 |
37 | 'ArrowParameter',
38 |
39 | 'os', 're', 'sys', 'defaultdict', 'json', 'cached_property',
40 | 'arrow',
41 |
42 | 'MrTestCase', ]
43 |
44 | import os
45 | import sys
46 | import re
47 | from collections import defaultdict
48 | import json
49 | from etl_utils import cached_property
50 |
51 | from .luigi_extensions import luigi
52 |
53 | from .task_templates import TaskHour, TaskDay, TaskWeek, TaskBiweekly, TaskMonth, TaskQuarter, TaskYear, TaskRange
54 | from .task_templates import TaskHourHadoop, TaskDayHadoop, TaskWeekHadoop, TaskBiweeklyHadoop, TaskMonthHadoop, TaskQuarterHadoop, TaskYearHadoop, TaskRangeHadoop
55 | from .task_templates import StaticFile, MongoImportTask, HiveTask
56 |
57 |
58 | from . import manager
59 | from .utils import IOUtils, DateUtils, TargetUtils, HDFSUtils
60 | from .utils import MRUtils, MathUtils, CommandUtils, CompressUtils
61 |
62 | import arrow
63 | from .luigi_extensions import RootTask, TaskBase, ArrowParameter, HadoopExt
64 |
65 | from .utils.visualiser_env_template import VisualiserEnvTemplate
66 |
67 | from .tests import MrTestCase
68 |
69 |
70 | config = manager.luiti_config
71 |
--------------------------------------------------------------------------------
/luiti/daemon/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Server", ]
4 |
5 |
6 | from .web import Server
7 |
--------------------------------------------------------------------------------
/luiti/daemon/graph.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Graph"]
4 |
5 | from copy import deepcopy
6 |
7 | from .utils import Template, stringify, TaskStorageSet, TaskStorageDict
8 |
9 |
class Graph(object):
    """
    Analyse the dependency relations between task instances (graph nodes).
    """

    @staticmethod
    def analysis_dependencies_between_nodes(task_instances, selected_packages):
        """
        Build the direct and transitive dependency maps of `task_instances`.

        Based on Data:
        1. Task_instances
        2. Their `requires` informations.

        Only dependencies that are themselves in `task_instances` and whose
        package is in `selected_packages` are kept.

        Returns a dict with two views of the same data, "python" (the raw
        storage objects) and "json" (passed through `stringify`), each shaped
        as {"requires": {"direct", "total"}, "upons": {"direct", "total"}}.

        Related function is luiti.manager.dep.Dep.find_dep_on_tasks
        """
        uniq_set = TaskStorageSet(task_instances)

        # 1. raw `requires` and `invert` informations.
        # TODO TaskStorageDict are already String, modify back to real Python objects.
        # NOTE(review): the code below calls `.add` on keys that were never
        # assigned, so TaskStorageDict presumably supplies a set-like default.
        task_instances_to_their_direct_requires = TaskStorageDict()
        task_instances_to_their_direct_upons = TaskStorageDict()

        for task_instance in task_instances:
            deps = Utils.read_requires_from_task(task_instance, selected_packages)
            selected_deps = [d1 for d1 in deps if d1 in uniq_set]
            task_instances_to_their_direct_requires[task_instance] = TaskStorageSet(selected_deps)
            # Record the inverted edge: `dep1` is depended upon by `task_instance`.
            for dep1 in selected_deps:
                task_instances_to_their_direct_upons[dep1].add(task_instance)

        # 2. unfold `requires` and `invert` informations.
        task_instances_to_their_total_requires = TaskStorageDict()
        task_instances_to_their_total_upons = TaskStorageDict()

        for task_instance in task_instances:
            Utils.add_total_deps(task_instances_to_their_total_requires, task_instances_to_their_direct_requires, task_instance)
            Utils.add_total_deps(task_instances_to_their_total_upons, task_instances_to_their_direct_upons, task_instance)

        def generate_result(_type="python"):
            """
            provide two versions of graph infos.

            1. one for front-end javascript.
            2. another for API python.
            """
            def wrap(obj):
                if _type == "python":
                    return obj
                if _type == "json":
                    return stringify(obj)

            return {
                "requires": {
                    "direct": wrap(task_instances_to_their_direct_requires),
                    "total": wrap(task_instances_to_their_total_requires),
                },
                "upons": {
                    "direct": wrap(task_instances_to_their_direct_upons),
                    "total": wrap(task_instances_to_their_total_upons),
                },
            }

        return {
            "python": generate_result("python"),
            "json": generate_result("json"),
        }

    @staticmethod
    def split_edges_into_groups(edges, nodes, task_instances):
        """
        Put linked task instances into a group.

        Every edge is a mapping with "from" and "to" entries. The result is a
        list of sets, one per connected component, ordered by decreasing
        size. `nodes` is accepted for interface compatibility but not used.
        The caller's `edges` list is not modified.
        """
        edges = deepcopy(edges)
        groups = list()  # element is set; groups are kept pairwise disjoint

        # make sure every node appear, even has not link to other tasks.
        for ti in task_instances:
            edges.append(Template.an_edge(ti, ti))

        # Merge as we go: an edge joins every existing group that contains
        # one of its endpoints. Because the groups stay disjoint, each node
        # ends up in exactly one group, including chains that only become
        # connected by a later edge.
        for edge in edges:
            endpoints = set([edge["from"], edge["to"]])
            merged = None
            kept = list()
            for group in groups:
                if group.isdisjoint(endpoints):
                    kept.append(group)
                elif merged is None:
                    # First touched group absorbs the edge and keeps its slot.
                    merged = group
                    merged |= endpoints
                    kept.append(merged)
                else:
                    # Any further touched group is folded into the first one.
                    merged |= group
            if merged is None:
                kept.append(endpoints)
            groups = kept

        result = sorted(groups, key=lambda i1: (-len(i1), i1))
        return result
114 |
115 |
class Utils(object):
    """ only for this file """

    @staticmethod
    def read_requires_from_task(task_instance, selected_packages):
        """
        Return the direct requirements of `task_instance` as a list.

        `requires()` may hand back a single task, a list, a tuple/set or a
        dict of tasks; all of them are normalised to a flat list. Entries
        without a `package_name` attribute, or whose package is not in
        `selected_packages`, are dropped.
        """
        deps = task_instance.requires()
        if isinstance(deps, dict):
            deps = list(deps.values())
        elif isinstance(deps, (tuple, set, frozenset)):
            deps = list(deps)
        elif not isinstance(deps, list):
            deps = [deps]

        # 1. make sure it's a valid luiti task
        # 2. filter is very important, or can't find dict data.
        return [i1 for i1 in deps
                if hasattr(i1, "package_name") and i1.package_name in selected_packages]

    @staticmethod
    def add_total_deps(store, tree, store_node, fetch_node=None):
        """ add all recursive dependencies.
        1. `store_node` used to store in a result store.
        2. `fetch_node` used to fetch dependencies from a tree.

        Every node reachable from `fetch_node` through `tree` is added to
        `store[store_node]`, except `store_node` itself. The walk is
        iterative and skips nodes already collected, so it terminates even
        if `tree` contains a cycle. `store[store_node]` is only touched when
        there is something to add.
        """
        fetch_node = fetch_node or store_node

        pending = [fetch_node]
        while pending:
            current = pending.pop()
            for d1 in tree[current]:
                if d1 == store_node:
                    continue
                if d1 in store[store_node]:
                    continue  # already collected, its own deps are handled
                store[store_node].add(d1)
                pending.append(d1)
147 |
--------------------------------------------------------------------------------
/luiti/daemon/ptm.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["PTM"]
4 |
5 |
6 | import sys
7 | from etl_utils import singleton, cached_property
8 | import importlib
9 | import inspect
10 |
11 | from .. import manager
12 | from ..utils import VisualiserEnvTemplate
13 |
14 |
@singleton()
class PackageTaskManagementClass(object):
    """
    Manage packages and tasks.

    When the webui daemon starts, these values are read once and are not
    modified afterwards, i.e. they are static. Every attribute is a
    `cached_property` that delegates to `luiti.manager`.
    """

    @cached_property
    def current_package_name(self):
        """ Name of the current project, as reported by `manager.luiti_config`. """
        return manager.luiti_config.get_curr_project_name()

    @cached_property
    def current_init_luiti(self):
        """ The imported `<current package>.luiti_tasks.__init_luiti` module. """
        self.current_package_path  # insert package into sys.path
        __init_luiti = self.current_package_name + ".luiti_tasks.__init_luiti"
        return importlib.import_module(__init_luiti)

    @cached_property
    def current_package_path(self):
        """
        Path of the current project.

        Side effect: the path is put at the front of `sys.path` the first
        time this property is read.
        """
        p1 = manager.luiti_config.get_curr_project_path()
        sys.path.insert(0, p1)
        return p1

    @cached_property
    def current_luiti_visualiser_env(self):
        """
        The `data` of the current package's `luiti_visualiser_env`, falling
        back to a default `VisualiserEnvTemplate()` when the package's
        `__init_luiti` module does not define one.
        """
        env = getattr(self.current_init_luiti, "luiti_visualiser_env", VisualiserEnvTemplate())
        assert isinstance(env, VisualiserEnvTemplate), env
        return env.data

    @cached_property
    def load_all_tasks_result(self):
        """ Raw result of `manager.load_all_tasks()`; read below via its "success" entry. """
        return manager.load_all_tasks()

    @cached_property
    def task_classes(self):
        """ Task classes that were loaded successfully. """
        return [i1["task_cls"] for i1 in self.load_all_tasks_result["success"]]

    @cached_property
    def task_class_names(self):
        """ Sorted names of `task_classes`. """
        return sorted([i1.__name__ for i1 in self.task_classes])

    @cached_property
    def task_clsname_to_package(self):
        """ Mapping taken from `manager.PackageMap.task_clsname_to_package`. """
        return manager.PackageMap.task_clsname_to_package

    @cached_property
    def task_clsname_to_source_file(self):
        """ Map every task class name to the `.py` file that defines it. """
        def get_pyfile(task_cls):
            f1 = inspect.getfile(task_cls)
            # Point at the source instead of the compiled file. Only the
            # suffix is rewritten, so a ".pyc" elsewhere in the path is
            # left alone.
            if f1.endswith(".pyc"):
                f1 = f1[:-1]
            return f1

        return {task_cls.__name__: get_pyfile(task_cls) for task_cls in self.task_classes}

    @cached_property
    def task_clsname_to_package_name(self):
        """ Like `task_clsname_to_package`, but with package `__name__`s as values. """
        return {t1: p1.__name__ for t1, p1 in self.task_clsname_to_package.iteritems()}

    @cached_property
    def task_package_names(self):
        """ Sorted, de-duplicated names of all packages that own a task class. """
        return sorted([p1.__name__ for p1 in set(self.task_clsname_to_package.values())])

    @cached_property
    def package_to_task_clsnames(self):
        """ Map every package name to the sorted list of its task class names. """
        return {package.__name__: sorted(list(task_clsnames)) for package, task_clsnames
                in manager.PackageMap.package_to_task_clsnames.iteritems()}


PTM = PackageTaskManagementClass()
84 |
--------------------------------------------------------------------------------
/luiti/daemon/query_engine/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Query"]
4 |
5 | from .builder import QueryBuilder
6 |
7 |
class Query(object):
    """
    Use params to query some data from luiti.
    """

    # Shared by all instances: stringified sorted params -> built result.
    cache = dict()

    def __init__(self, ptm):
        self.ptm = ptm  # global task and package data.

    def get_env(self, raw_params=None):
        """
        Generate all data needed.

        `raw_params` is the dict of query parameters; when omitted a fresh
        empty dict is used for every call. A missing "date_value" is filled
        in with yesterday, and is written into the dict passed by the caller
        as before. Returns `QueryBuilder(...).result`, built once per
        distinct set of params and served from `Query.cache` afterwards.
        """
        # A new dict per call: a shared default would keep the first
        # computed "date_value" forever.
        if raw_params is None:
            raw_params = dict()

        # Compact with yesterday and today are the same cache key.
        if "date_value" not in raw_params:
            raw_params["date_value"] = unicode(QueryBuilder.yesterday())

        # TODO cache maybe replaced by a decorator, such as @functools.lru_cache
        cache_key = unicode(sorted(raw_params.items()))  # A simple cache

        result = self.cache.get(cache_key, None)
        if result is None:
            # Build once and store that very object.
            result = QueryBuilder(self.ptm, raw_params).result
            self.cache[cache_key] = result

        return result
34 |
--------------------------------------------------------------------------------
/luiti/daemon/query_engine/builder.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["QueryBuilder"]
4 |
5 | import arrow
6 | from etl_utils import cached_property
7 | from copy import deepcopy
8 |
9 | from ...luigi_extensions import ArrowParameter
10 | from ..graph import Graph
11 | from ..utils import stringify, Template, TaskStorageSet
12 | from .params import Params
13 | from .create_task import CreateTask
14 |
15 |
class QueryBuilder(object):
    """
    Construct a query builder.

    All properties are generated lazily by using `cached_property`, as in a **DAG**.

    Several properties deliberately write into other cached values or into
    the shared visualiser env (see `date_end`, `default_query` and
    `selected_task_cls_names`), so the order in which they are first read
    matters.
    """

    def __init__(self, ptm, raw_params):
        """
        `ptm` provides the global task and package data; `raw_params` must
        be a dict of the query parameters given by the user.
        """
        assert isinstance(raw_params, dict), raw_params

        self.raw_params = raw_params
        self.ptm = ptm

    @cached_property
    def date_begin(self):
        """ "date_begin" entry of the current visualiser env. """
        return self.ptm.current_luiti_visualiser_env["date_begin"]

    @cached_property
    def date_end(self):
        """
        "date_end" entry of the current visualiser env, defaulting to
        yesterday. The resolved value is written back into the env.
        """
        date_end = self.ptm.current_luiti_visualiser_env.get("date_end", self.yesterday_str)
        self.ptm.current_luiti_visualiser_env["date_end"] = date_end
        return date_end

    @staticmethod
    def yesterday():
        """ `ArrowParameter.now()` moved back one day and floored to the day. """
        return ArrowParameter.now().replace(days=-1).floor("day")

    @cached_property
    def yesterday_str(self):
        """ `yesterday()` formatted as "YYYY-MM-DD". """
        return QueryBuilder.yesterday().format("YYYY-MM-DD")

    @cached_property
    def accepted_params(self):
        """
        Comes from current luiti that selected.

        This is the "additional_task_parameters" entry of the visualiser env.
        """
        return self.ptm.current_luiti_visualiser_env["additional_task_parameters"]

    @cached_property
    def accepted_query_params(self):
        """
        provide to visualSearch.js, used for autocomplete.

        user query via URL search.

        autocomplete params key/value.

        Note: `default_query` adds one more key per additional task
        parameter to the dict returned here.
        """
        # date range related.
        days_range = arrow.Arrow.range("day",
                                       ArrowParameter.get(self.date_begin),
                                       ArrowParameter.get(self.date_end))
        accepted_date_values = sorted(map(str, days_range))

        # result
        return {
            "date_value": accepted_date_values,
            "task_cls": self.ptm.task_class_names,
            "luiti_package": self.ptm.task_package_names,
        }

    @cached_property
    def default_query(self):
        """
        Query provide by user config.

        Side effect: registers the accepted values of every additional task
        parameter in `accepted_query_params`.
        """
        # assign default params
        default_query = {
            "date_value": str(QueryBuilder.yesterday()),
            # to insert more key-value
        }

        # get config from current package's luiti_visualiser_env
        for task_param, task_param_opt in self.accepted_params.iteritems():
            self.accepted_query_params[task_param] = task_param_opt["values"]
            default_query[task_param] = task_param_opt["default"]

        return default_query

    @cached_property
    def selected_query(self):
        """
        The effective query: defaults, overridden by the accepted entries of
        `raw_params` (plus "date_value"), with "luiti_package" set to
        `selected_packages`. `selected_task_cls_names` later adds a
        "task_cls" key to this same dict.
        """
        selected_query = {k1: v1 for k1, v1 in self.raw_params.iteritems() if k1 in self.accepted_params or k1 == "date_value"}
        selected_query["luiti_package"] = self.selected_packages
        # Later items win, so the user's selection overrides the defaults.
        selected_query = dict(self.default_query.items() + selected_query.items())

        return selected_query

    @cached_property
    def default_packages(self):
        """ user provided. """
        return self.ptm.current_luiti_visualiser_env["package_config"].get("defaults", [])

    @cached_property
    def selected_packages(self):
        """
        Packages from `raw_params`, else the configured defaults, else (when
        that is empty) every known package name.
        """
        result = self.raw_params.get("luiti_package", self.default_packages)
        result = result or self.ptm.task_package_names
        return result

    @cached_property
    def selected_task_cls_names(self):
        """
        current selected.

        Returned as a set; NOTE(review): assumes `raw_params["task_cls"]` is
        a list of names, confirm against the request handler.
        """
        result = set(self.raw_params.get("task_cls", []))

        # modify other cached_property
        self.selected_query["task_cls"] = list(result)

        return result

    @cached_property
    def total_task_instances(self):
        """
        Total task instances.

        One instance per task class and per parameter combination,
        de-duplicated and sorted.
        """
        # 1. build possible params.
        # **remove** luiti_package and task_cls query str
        params_array = Params.build_params_array(self.default_query, self.selected_query)

        # 2. and generate task instances.
        total_task_instances = list()
        for ti in self.ptm.task_classes:
            # TODO why below two lines exist before.
            # if ti.__name__ not in self.selected_task_cls_names:
            # continue

            for _params in params_array:
                task_instance = CreateTask.new(ti, _params)
                total_task_instances.append(task_instance)

        result = sorted(list(set(total_task_instances)))
        return result

    @cached_property
    def selected_task_instances(self):
        """ nodes that drawed in vis.js """
        # filter by package
        result = sorted(list(set(self.total_task_instances)))
        result = filter(lambda ti: ti.package_name in self.selected_packages,
                        result)

        # To avoid only self is in the graph.
        # If select task class, then to find linked task instances.
        if not self.selected_task_cls_names:
            return result

        pure_selected_task_instances = [ti for ti in result if ti.task_clsname in self.selected_task_cls_names]
        # Direct neighbours (both directions) of the explicitly selected tasks.
        pure_linked = TaskStorageSet()
        for ti in pure_selected_task_instances:
            for t2 in self.graph_infos_python["requires"]["direct"][ti]:
                pure_linked.add(t2)
            for t2 in self.graph_infos_python["upons"]["direct"][ti]:
                pure_linked.add(t2)

        # filter that tasks are linked, in current task_classes.
        result = [ti for ti in result if ti in pure_linked]
        result.extend(pure_selected_task_instances)
        result = list(set(result))
        return result

    @cached_property
    def graph_infos_data(self):
        """ Dependency analysis of all task instances, in "python" and "json" form. """
        return Graph.analysis_dependencies_between_nodes(self.total_task_instances,
                                                         self.selected_packages)

    @cached_property
    def graph_infos_python(self):
        """ The "python" view of `graph_infos_data`. """
        return self.graph_infos_data["python"]

    @cached_property
    def nodes(self):
        """ One `Template.a_node` per selected task instance. """
        return [Template.a_node(ti) for ti in self.selected_task_instances]

    @cached_property
    def edges(self):
        """ Edges between the selected task instances, built by `Template`. """
        return Template.edges_from_nodes(self.selected_task_instances)

    @cached_property
    def nodes_groups(self):
        """ Selected task instances split into groups of linked nodes. """
        return Graph.split_edges_into_groups(self.edges,
                                             self.nodes,
                                             self.selected_task_instances)

    @cached_property
    def nodes_groups_in_view(self):
        """ `nodes_groups` with every group turned into a sorted list. """
        return [sorted(list(nodes_set)) for nodes_set in self.nodes_groups]

    @cached_property
    def task_instance_repr_to_info(self):
        """
        Map `str(task_instance)` to its task class name and stringified
        parameters; the "pool" parameter is left out.
        """
        result = dict()
        for ti in self.total_task_instances:
            # Work on a copy so the task's own param_kwargs stay untouched.
            param_kwargs = deepcopy(ti.param_kwargs)
            if "pool" in param_kwargs:
                del param_kwargs["pool"]
            result[str(ti)] = {"task_cls": ti.task_clsname, "param_kwargs": stringify(param_kwargs)}
        return result

    @cached_property
    def result(self):
        """
        The complete payload of the query: title, query parameters, package
        and task data, the node/edge graph, and task loading errors.
        """
        return {
            "title": "Luiti WebUI, a DAG timely visualiser.",

            "queryparams": {
                "accepted": self.accepted_query_params,
                "selected_query": self.selected_query,
                "default_query": self.default_query,
                "luiti_visualiser_env": self.ptm.current_luiti_visualiser_env,
            },

            "ptm": {
                "task_class_names": self.ptm.task_class_names,
                "task_package_names": self.ptm.task_package_names,
                "task_clsname_to_package_name": self.ptm.task_clsname_to_package_name,
                "package_to_task_clsnames": self.ptm.package_to_task_clsnames,
                "task_instance_repr_to_info": self.task_instance_repr_to_info,
            },

            "nodeedge": {
                "nodes": self.nodes,
                "edges": self.edges,
                "nodes_groups": self.nodes_groups_in_view,
                "graph_infos": self.graph_infos_data["json"],
            },

            "errors": {
                "load_tasks": self.ptm.load_all_tasks_result["failure"],
            }
        }
241 |
--------------------------------------------------------------------------------
/luiti/daemon/query_engine/create_task.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 |
4 | __all__ = ["CreateTask"]
5 |
6 | import luigi
7 | from ..utils import CacheByDictKey
8 |
9 |
class CreateTask(object):
    """ Factory for task instances, with one instance cache per task class. """

    # task class -> TaskInstanceCache for that class
    task_clsname_cache = dict()

    @staticmethod
    def new(task_cls, _params):
        """ Initialize a task instance, with filter invalid params. """
        registry = CreateTask.task_clsname_cache
        if registry.get(task_cls, None) is None:
            # First request for this class: give it its own instance cache.
            registry[task_cls] = TaskInstanceCache(task_cls)

        return registry[task_cls][_params]
23 |
24 |
class TaskInstanceCache(object):
    """
    To avoid create duplicated task instances.

    Lookups are delegated to a `CacheByDictKey` wrapped around `process`.
    """

    def __init__(self, task_cls):
        self.task_cls = task_cls
        self.cache = CacheByDictKey(self.process)

    def __getitem__(self, _params):
        return self.cache[_params]

    def process(self, _params):
        """
        Instantiate `task_cls` with the entries of `_params` whose name is
        declared on the class as a `luigi.Parameter`; other entries are
        ignored.
        """
        accepted = dict()
        for name, value in _params.iteritems():
            # `None` stands in for a missing attribute and is never a Parameter.
            declared = getattr(self.task_cls, name, None)
            if isinstance(declared, luigi.Parameter):
                accepted[name] = value
        return self.task_cls(**accepted)
46 |
--------------------------------------------------------------------------------
/luiti/daemon/query_engine/params.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Params"]
4 |
5 | from ...luigi_extensions import ArrowParameter
6 | import itertools
7 |
8 |
class Params(object):
    """ Expands a web query into the list of concrete task parameter dicts. """

    @staticmethod
    def build_params_array(default_query, selected_query):
        """
        Build every combination of the selected values, each merged over
        `default_query`.

        `selected_query` maps a key to one value or a list of values. Keys
        with an empty list are ignored. The cartesian product over the
        remaining keys is taken, each combination overrides
        `default_query`, and the resulting dicts are returned sorted.
        """
        def convert(key, raw_value):
            # Already overwrited params type and luigi.Task#__eq__ in luiti.
            # See more details at task_templates.time.task_base.py
            if key == "date_value":
                return ArrowParameter.get(raw_value)
            return unicode(raw_value)

        choices_per_key = []
        for key, raw_values in selected_query.iteritems():
            # A bare value is treated as a one-element list of values.
            values = raw_values if isinstance(raw_values, list) else [raw_values]
            if len(values) == 0:
                continue  # ignore key that no value.
            choices_per_key.append([(key, convert(key, one)) for one in values])

        params_array = []
        for combination in itertools.product(*choices_per_key):
            selected = dict(combination)
            # Selected values win over the defaults because they come last.
            params_array.append(dict(default_query.items() + selected.items()))

        return sorted(params_array)
47 |
--------------------------------------------------------------------------------
/luiti/daemon/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["stringify",
4 | "TaskStorageSet", "TaskStorageDict",
5 | "Template",
6 | "CacheByDictKey", ]
7 |
8 |
9 | from .string import stringify
10 | from .task_storage import TaskStorageSet, TaskStorageDict
11 | from .template import Template
12 | from .cache import CacheByDictKey
13 |
--------------------------------------------------------------------------------
/luiti/daemon/utils/cache.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["CacheByDictKey"]
4 |
5 | # TODO cache maybe replaced by a decorator, such as @functools.lru_cache
6 | # 1. https://pypi.python.org/pypi/py_lru_cache/0.1.4 is slow, 100 ms, but simple dict cache is only 1 ms.
7 | # 2. https://github.com/tkem/cachetools dont support dict parameters.
8 |
9 |
class CacheByDictKey(object):
    """
    Memoize a one-argument function whose argument is a dict.

    Only the `cache[query]` operation is supported. Two dicts with equal
    items share one cache entry, regardless of insertion order.
    """

    def __init__(self, func):
        assert callable(func)
        self.func = func
        self.store = dict()

    def __getitem__(self, query):
        """ Return `func(query)`, computing it only on the first lookup. """
        cache_key = self.generate_cache_key(query)

        # Test membership instead of comparing the stored value with None,
        # so a `func` that legitimately returns None is not re-run on every
        # lookup.
        if cache_key not in self.store:
            self.store[cache_key] = self.func(query)
        return self.store[cache_key]

    def generate_cache_key(self, query):
        """
        Turn a dict into a hashable key: the text form of its sorted items.

        `repr` of a list is the same text `unicode(list)` renders, without
        the ASCII decode step (and it also exists on Python 3).
        """
        assert isinstance(query, dict)
        return repr(sorted(query.items()))
35 |
--------------------------------------------------------------------------------
/luiti/daemon/utils/string.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["stringify"]
4 |
5 |
def stringify(default_dict):
    """
    Make a flat mapping serializable by JSON.

    Only the first level is converted: a list or set value becomes a list
    of `str(item)`, any other value becomes `str(value)`, and every key is
    passed through `str` as well.

    This function is not general, just for luiti.daemon .
    """
    result = dict()
    for key, value in default_dict.items():
        # only wrap first level, such as ArrowParameter
        if isinstance(value, (list, set)):
            # Build a real list: a lazy `map` object is not JSON-serializable.
            value = [str(item) for item in value]
        else:
            value = str(value)
        result[str(key)] = value
    return result
21 |
--------------------------------------------------------------------------------
/luiti/daemon/utils/task_storage.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["TaskStorageSet", "TaskStorageDict"]
4 |
5 | from UserDict import UserDict
6 |
7 | """
8 | Task#__hash isn't consistent when one is from task_instances, and another is from `requires`.
9 |
10 | Here we use #task_id to compare that if two tasks are the same one.
11 | """
12 |
13 |
class TaskStorageSet(set):
    """
    A set of tasks keyed by `task_id`.

    hash(luigi.Task) don't work well, so use `luigi.Task.task_id` fix it
    temporarily. All items live in `self.store` (task_id -> task); only the
    operations defined here go through that store.
    """

    def __init__(self, task_list=None):
        # `None` instead of a shared mutable default list.
        self.store = dict()

        if task_list is not None:
            for t1 in task_list:
                self.add(t1)

    def __contains__(self, t1):
        return t1.task_id in self.store

    def add(self, t1):
        # A task with an already known task_id replaces the stored one.
        self.store[t1.task_id] = t1

    def remove(self, t1):
        """ Remove the task; raises KeyError when its task_id is unknown. """
        del self.store[t1.task_id]

    def __repr__(self):
        return repr(list(self.store.keys()))

    def __len__(self):
        return len(self.store)

    def __iter__(self):
        # Iterate over a snapshot, so tasks may be removed while looping.
        return iter(list(self.store.values()))
42 |
43 |
class TaskStorageDict(UserDict):
    """
    A dict addressed by task instances but stored under their `task_id`.

    Reading a task that has no entry yet creates, stores and returns an
    empty TaskStorageSet for it (see `__missing__`).
    """

    def __contains__(self, ti):
        # Entries are stored under task_id, so membership has to use the
        # same key; the inherited check would compare the task object itself
        # against the task_id keys. Plain keys (already a task_id) still work.
        return getattr(ti, "task_id", ti) in self.data

    def __getitem__(self, ti):
        if ti.task_id in self.data:
            return self.data[ti.task_id]
        # `__missing__` is always defined on this class, so a missing key
        # never raises KeyError here.
        return self.__missing__(ti)

    def __setitem__(self, ti, item):
        self.data[ti.task_id] = item

    def __delitem__(self, ti):
        del self.data[ti.task_id]

    def __missing__(self, ti):
        s1 = TaskStorageSet()
        self.data[ti.task_id] = s1
        return s1
63 |
--------------------------------------------------------------------------------
/luiti/daemon/utils/template.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Template"]
4 |
5 | import luigi
6 |
7 |
class Template(object):
    """
    Turns task instances into the plain node / edge dicts of the DAG view.
    """

    @staticmethod
    def task_doc(ti):
        """ Stripped docstring of the task's class ("" when it has none). """
        text = ti.task_class.__doc__
        if not text:
            text = ""
        text = text.strip()
        if isinstance(text, str):
            text = text.decode("UTF-8")
        return text

    @staticmethod
    def a_node(ti):
        """ Describe one task instance as a graph node. """
        task_cls = ti.task_class
        package = ti.package_name
        return {
            "id": ti.task_id,
            "label": task_cls.__name__,
            "group": package,

            "detail": str(ti),
            "data_file": ti.data_file,
            "task_doc": Template.task_doc(ti),
            "task_file": task_cls.__module__.replace(".", "/") + ".py",
            "package_name": package,

            "size": 20,
        }

    @staticmethod
    def edges_from_nodes(nodes):
        """
        Build the edges between the given task instances only: a required
        task that is not itself in `nodes` produces no edge.
        """
        # 1. check input is valid (only the first element is type-checked)
        assert isinstance(nodes, list)
        if nodes:
            assert isinstance(nodes[0], luigi.Task)

        edges = []
        for downstream in nodes:
            required = downstream.requires()
            if not isinstance(required, list):
                required = [required]
            edges.extend(
                Template.an_edge(upstream, downstream)
                for upstream in required
                # None means "dep on none tasks"
                if upstream is not None and upstream in nodes)
        return edges

    @staticmethod
    def an_edge(from_task, to_task):
        """ Describe the dependency `from_task` -> `to_task` as a graph edge. """
        is_loop = (from_task == to_task)
        source_id = from_task.task_id
        target_id = to_task.task_id
        return {
            "id": source_id + " " + target_id,  # id is uniq.
            "from": source_id,
            "source_name": from_task.task_class.__name__,
            "to": target_id,
            "target_name": to_task.task_class.__name__,
            "strength": 1.0,
            "arrows": "self_to_self" if is_loop else "to",
        }
75 |
--------------------------------------------------------------------------------
/luiti/daemon/web/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["Server"]
4 |
5 |
6 | from .server import Server
7 |
--------------------------------------------------------------------------------
/luiti/daemon/web/assets.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["assets_main_dir", "assets_thirdparty_dir"]
4 |
5 |
6 | import os
7 |
8 |
# Walk up from this file to the directory three levels above it.
luiti_dir = os.path.abspath(__file__)   # .../luiti/daemon/web/assets.py
luiti_dir = os.path.dirname(luiti_dir)  # .../luiti/daemon/web
luiti_dir = os.path.dirname(luiti_dir)  # .../luiti/daemon
luiti_dir = os.path.dirname(luiti_dir)  # .../luiti

# Each asset directory must exist when this module is imported.
assets_main_dir = os.path.join(luiti_dir, "webui/assets")
assert os.path.isdir(assets_main_dir), "%s is not exists!" % assets_main_dir

assets_thirdparty_dir = os.path.join(luiti_dir, "webui/bower_components")
assert os.path.isdir(assets_thirdparty_dir), "%s is not exists!" % assets_thirdparty_dir
16 |
--------------------------------------------------------------------------------
/luiti/daemon/web/code_render.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["CodeRender"]
4 |
5 | from etl_utils import cached_property
6 |
7 |
class CodeRender(dict):
    """
    Highlight luiti task code written in Python.

    Used as `render[source_file]`: every lookup goes through `__missing__`,
    which reads the file and returns a complete HTML page. Nothing is stored
    in the dict itself, so the file is re-read on each lookup.
    """

    @cached_property
    def highlight(self):
        """ Lazy load pygments, so user dont need to load all daemon code. """
        import pygments
        from pygments.lexers import PythonLexer
        lexer = PythonLexer()

        return lambda source_code: pygments.highlight(source_code, lexer, self.formatter)

    @cached_property
    def formatter(self):
        from pygments.formatters import HtmlFormatter
        return HtmlFormatter(linenos=True)

    @cached_property
    def css_html(self):
        """ The formatter's CSS rules for `.highlight`, wrapped in a style tag. """
        # The template needs a `%s` slot: formatting an empty template with
        # one argument raises TypeError.
        return u"""<style>%s</style>""" % self.formatter.get_style_defs('.highlight')

    def __missing__(self, source_file):
        """ Render the file at path `source_file` as a highlighted HTML page. """
        # Close the file handle deterministically instead of leaking it.
        with open(source_file) as source_fd:
            source_code = source_fd.read()

        path_html = u"""<p>source_file: %s</p>""" % source_file
        code_html = self.highlight(source_code)

        body_html = path_html + code_html + self.css_html
        title = source_file.split("/")[-1]

        return u"""
<html>
<head>
<title>%s</title>
</head>
<body>
%s
</body>
</html>
""" % (title, body_html)
48 |
--------------------------------------------------------------------------------
/luiti/daemon/web/handlers.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["web_handlers"]
4 |
5 | from etl_utils import cached_property
6 | import pkg_resources
7 | import tornado.web
8 |
9 | from .assets import assets_main_dir, assets_thirdparty_dir
10 | from ..ptm import PTM
11 | from ..query_engine import Query
12 | from .code_render import CodeRender
13 |
14 |
class IndexHandler(tornado.web.RequestHandler):
    """ Renders the `index.html` template of the web UI. """

    def get_template_path(self):
        """ Template directory: `../../webui`, resolved relative to this module. """
        webui_dir = pkg_resources.resource_filename(__name__, "../../webui")
        return webui_dir

    def get(self):
        # one query key has multiple values
        self.render("index.html")
23 |
24 |
class InitDataHandler(tornado.web.RequestHandler):
    """ Writes what the query engine returns for the request's query arguments. """

    @cached_property
    def query_engine(self):
        engine = Query(PTM)
        return engine

    def get(self):
        self.write(self.query_engine.get_env(self.request.query_arguments))
36 |
37 |
class CodeShowHandler(tornado.web.RequestHandler):
    """ Shows the highlighted source file of one task class. """

    @cached_property
    def code_render(self):
        return CodeRender()

    def get(self, package_name, task_cls_name):
        """
        Write the rendered source of `task_cls_name`.

        Both arguments are captured from the request URL. Unknown names are
        answered with 404.
        """
        # Validate with explicit errors rather than `assert`: asserts are
        # stripped under `python -O` and otherwise surface as a 500.
        if package_name not in PTM.task_package_names:
            raise tornado.web.HTTPError(404, "unknown package: %s", package_name)
        if task_cls_name not in PTM.task_clsname_to_package:
            raise tornado.web.HTTPError(404, "unknown task class: %s", task_cls_name)

        source_file = PTM.task_clsname_to_source_file[task_cls_name]
        source_code = self.code_render[source_file]
        self.write(source_code)
52 |
53 |
# URL routes of the web UI, as (pattern, handler class, handler kwargs).
web_handlers = [
    # make a static HTML vis URL
    (r'/luiti/bower_components/(.*)', tornado.web.StaticFileHandler, dict(path=assets_thirdparty_dir)),
    (r'/luiti/assets/(.*)', tornado.web.StaticFileHandler, dict(path=assets_main_dir)),

    # Two path segments: package name, then task class name.
    (r'/luiti/code/([^/]+)/([^/]+)', CodeShowHandler, dict()),
    (r'/luiti/dag_visualiser', IndexHandler, dict()),
    (r'/luiti/init_data.json', InitDataHandler, dict()),
    # The root URL redirects to the visualiser page.
    (r'/', tornado.web.RedirectHandler, dict(url="/luiti/dag_visualiser")),
]
64 |
--------------------------------------------------------------------------------
/luiti/daemon/web/server.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | """
4 | A DAG timely visualiser.
5 |
6 | Draw DAG tasks under selected parameters.
7 | """
8 |
9 | from __future__ import unicode_literals
10 |
11 | __all__ = ["Server"]
12 |
13 | from etl_utils import cached_property
14 | import tornado.httpclient
15 | import tornado.httpserver
16 | import tornado.ioloop
17 | import tornado.netutil
18 | import tornado.web
19 | import tornado.escape
20 | from tornado.log import enable_pretty_logging
21 | enable_pretty_logging()
22 |
23 |
24 | import logging
25 | logger = logging.getLogger("luiti.server")
26 |
27 |
28 | # 1. Setup business package env
29 | # list current package's related tasks, group by package name.
30 | from .handlers import web_handlers
31 |
32 |
33 | class Server(object):
34 | """ A tornado server. """
35 |
36 | welcome_doc = u"""
37 | ( \ |\ /|\__ __/\__ __/\__ __/
38 | | ( | ) ( | ) ( ) ( ) (
39 | | | | | | | | | | | | |
40 | | | | | | | | | | | | |
41 | | | | | | | | | | | | |
42 | | (____/\| (___) |___) (___ | | ___) (___
43 | (_______/(_______)\_______/ )_( \_______/
44 | """
45 |
46 | def __init__(self, host, port):
47 | self.host = host
48 | self.port = port
49 |
50 | # Fix cant open http://0.0.0.0 on browser.
51 | self.url = "http://%s:%s" % (self.host.replace("0.0.0.0", "localhost"), self.port)
52 |
53 | print self.welcome_doc
54 | print "Luiti WebUI is mounted on %s" % self.url
55 |
56 | def run(self):
57 | """
58 | Runs one instance of the API server.
59 | """
60 | api_sockets = tornado.netutil.bind_sockets(self.port, address=self.host)
61 | server = tornado.httpserver.HTTPServer(self.app)
62 | server.add_sockets(api_sockets)
63 |
64 | logger.info("Scheduler starting up")
65 | tornado.ioloop.IOLoop.instance().start()
66 |
67 | @cached_property
68 | def app(self):
69 | """ return a API app instance. """
70 | settings = {
71 | "unescape": tornado.escape.xhtml_unescape,
72 | # "autoreload": True
73 | }
74 |
75 | return tornado.web.Application(web_handlers, **settings)
76 |
--------------------------------------------------------------------------------
/luiti/java/MultipleTextFiles.java:
--------------------------------------------------------------------------------
1 | package com.voxlearning.bigdata.MrOutput;
2 |
3 | import org.apache.hadoop.fs.Path;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
6 |
7 | public class MultipleTextFiles extends MultipleTextOutputFormat {
8 | /**
9 | * Currently, the `reducer` function in luiti use below data format.
10 | * yield "", "{"json key": "json value"}"
11 | * If need multiple file output, then we use the unused yield key.
12 | *
13 | * Ref code: http://blog.csdn.net/lmc_wy/article/details/7532213
14 | */
15 |
16 | protected String generateFileNameForKeyValue(Text key, Text value, String name)
17 | {
18 | String outputName = key.toString(); // Get the current filename
19 | key.set(""); // We just need the value, so remove the unneeded key.
20 | return new Path(outputName, name).toString(); // 参考 https://github.com/klbostee/feathers
21 | }
22 |
23 | }
24 |
25 |
26 | /*
27 | * deploy ref: https://github.com/klbostee/feathers/blob/master/build.sh
28 | */
29 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | """
4 | This folder contains functions only. Please make sure dont make any complex `import` statements.
5 |
6 | See import logic at luiti/luigi_extensions/manage_decorators.py
7 | """
8 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/as_a_luiti_task.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["as_a_luiti_task"]
4 |
5 | import luigi
6 | from ..luigi_extensions import TaskBase, TaskInit
7 | from ..utils import ExtUtils
8 |
9 | # Extensions to luigi.Task
# Sorted names of TaskBase's own attributes, without dunder names and
# without the `_abc*` names.
task_base_members = [
    k1 for k1 in sorted(TaskBase.__dict__.keys())
    if not (k1.startswith("__") or k1.startswith("_abc"))
]
""" member list, see details at TaskBase
>>> ['_persist_files', '_ref_tasks', 'data_dir', 'data_file', 'data_name', 'date_str', 'date_type', 'date_value', 'date_value_by_type_in_begin', 'date_value_by_type_in_end', 'date_value_by_type_in_last', 'errput', 'instances_by_date_range', 'is_external', 'is_reach_the_edge', 'output', 'package_name', 'pre_task_by_self', 'requires', 'reset_date', 'root_dir', 'run', 'run_mode', 'task_class', 'task_clsname', 'task_namespace']
"""
15 |
16 |
def as_a_luiti_task(**opts):  # Decorator
    """
    Reuse a luigi task class (e.g. from luigi.contrib) as a luiti task.

    Usage:

        @luigi.as_a_luiti_task()
        class AnotherHiveDay(HiveQueryTask):
            pass

    The keyword options are accepted but not read by this function.

    https://github.com/spotify/luigi/tree/master/luigi/contrib
    """

    def func(task_cls):
        """ Main reason is to fix not overwrite `__init__` function. """
        # Make sure it's a luigi.contrib
        assert issubclass(task_cls, luigi.Task), task_cls

        # Copy every TaskBase member the target class lacks, or only has as
        # a NotImplemented / NotImplementedError placeholder.
        placeholders = [NotImplementedError, NotImplemented]
        for member in task_base_members:
            base_val = getattr(TaskBase, member)
            current_val = getattr(task_cls, member, NotImplementedError)
            if current_val in placeholders:
                setattr(task_cls, member, base_val)

        # let `isinstance` works for this wrap task class
        class wrap_cls(task_cls, TaskBase, ExtUtils.ExtendClass):
            def __init__(self, *args, **kwargs):
                super(wrap_cls, self).__init__(*args, **kwargs)
                TaskInit.setup(self)

        # The wrapper presents itself under the identity of the wrapped class.
        for attribute in ("__doc__", "__module__", "__name__"):
            setattr(wrap_cls, attribute, getattr(task_cls, attribute))

        return wrap_cls
    return func
56 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/check_date_range.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["check_date_range"]
4 |
5 | from ..luigi_extensions import ArrowParameter
6 |
7 |
def check_date_range():  # decorator
    """
    Refuse to run a task before its date range is over.

    When exporting data from a database, all data inside the time range
    must already be complete. Exporting while still inside the range means
    exporting too early, and the late data is lost.

    For example, exporting this week's related data on Saturday leaves out
    Sunday's data; the export should only start from next Monday on.
    """
    def func(cls):
        orig_run = cls.run

        def new_run(self):
            # The range has not ended yet, so leave without running.
            if ArrowParameter.now() < self.date_value_by_type_in_end:
                return False
            return orig_run(self)

        cls.run = new_run
        return cls
    return func
27 | # TODO support Hadoop
28 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/check_runtime_range.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["check_runtime_range"]
4 |
5 | from ..luigi_extensions import ArrowParameter
6 |
7 |
def check_runtime_range(**opts_1):  # decorator
    """
    Only let `run` execute inside the allowed hours / weekdays.

    Support hour/weekday indexed range.

    Optional params:
    1. hour_num     allowed hours, compared with the "H" format token of
                    the current time. Defaults to every hour.
    2. weekday_num  allowed weekdays, compared with the "d" format token
                    of the current time. Defaults to 1..7.
    3. now          NOTE(review): listed here but never read; the current
                    time always comes from ArrowParameter.now().

    When the current time is outside the range, the wrapped `run` prints an
    info line and returns False without running the task.
    """
    def decorator(orig_run):
        def new_run(self):
            opts = {
                # Presumably "H" yields 0..23 (arrow's documented range), so
                # hour 0 has to be part of the default; otherwise a task
                # never runs between 00:00 and 00:59. 24 is kept so the
                # previous default stays a subset.
                "hour_num": range(0, 25),
                "weekday_num": range(1, 8),
            }
            opts.update(opts_1)

            now = ArrowParameter.now()  # get current time
            hour_24 = int(now.format("H"))
            day_of_week_7 = int(now.format("d"))

            in_range = (hour_24 in opts['hour_num'] and
                        day_of_week_7 in opts['weekday_num'])
            if not in_range:
                # One pre-joined string keeps the output identical whether
                # `print` is a statement or a function.
                print(" ".join(str(part) for part in (
                    "[info]", now, " is not in ", opts,
                    ", so the task exited.")))
                return False

            return orig_run(self)
        return new_run

    def func(cls):
        cls.run = decorator(cls.run)
        return cls
    return func
46 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/mr_local.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["mr_local"]
4 |
5 | from collections import defaultdict
6 | from etl_utils import process_notifier
7 | from ..utils import TargetUtils
8 |
9 |
def mr_local(**opts):
    """
    Sometimes Hadoop streaming sucks, so we only use the solid HDFS, and turn
    MapReduce job into local mode.

    And `mr_local` is optimized by a fixed chunk write operation.

    Every keyword option is set as an attribute on the decorated class.
    `chunk_size` (default 100) is the number of reducer values that are
    buffered before they are written out.
    """

    def mr_run(self):
        """ Overwrite BaseHadoopJobTask#run function. """
        # TODO maybe model cache
        map_kv_dict = defaultdict(list)

        # Map phase: group every mapper value in memory by its key.
        inputs = self.input()
        if not isinstance(inputs, list):
            inputs = [inputs]
        for input_hdfs_1 in inputs:
            for line2 in TargetUtils.line_read(input_hdfs_1):
                for map_key_3, map_val_3 in self.mapper(line2):
                    map_kv_dict[map_key_3].append(map_val_3)

        def write_chunk(output1, chunk):
            # Never write an empty chunk: it would add a blank line to the
            # output file.
            if chunk:
                output1.write("\n".join(chunk) + "\n")

        # Reduce phase: only the reducer's value is written, one per line.
        with self.output().open("w") as output1:
            fixed_chunk = list()
            # Iterate over a copy of the keys, as entries are dropped from
            # the dict while looping.
            for reduce_key_2 in process_notifier(list(map_kv_dict.keys())):
                # pop releases the grouped values once they are handed over.
                reduce_vals_2 = map_kv_dict.pop(reduce_key_2)
                for _, reduce_val_2 in self.reducer(
                        reduce_key_2, reduce_vals_2):
                    fixed_chunk.append(reduce_val_2)

                    if len(fixed_chunk) >= self.chunk_size:
                        write_chunk(output1, fixed_chunk)
                        fixed_chunk = list()
            write_chunk(output1, fixed_chunk)

    def wrap(cls):
        cls.run = mr_run
        cls.run_mode = "mr_local"

        settings = dict(opts)
        settings.setdefault("chunk_size", 100)
        for k1, v1 in settings.items():
            setattr(cls, k1, v1)

        return cls
    return wrap
55 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/multiple_text_files.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["multiple_text_files"]
4 |
5 | import os
6 | import commands
7 | from etl_utils import cached_property
8 | from ..utils import CommandUtils
9 | import luigi
10 |
11 |
def multiple_text_files(opts=None):
    """
    Let current task class's result can support outputing into multiple files.

    Usage:

    ```python
    @luigi.multiple_text_files
    class ManAndWomanDay(TaskDayHadoop):
        def mapper(self, line1):
            item1 = MRUtils.json_parse(line1)
            yield item1['uid'], item1

        def reducer(self, uid1, vals_1):
            for item1 in vals_1:
                yield item1["gender"], MRUtils.str_dump(item1)
    ```

    So above code separate man and woman into two files. File name such as
    1. man_and_woman_day.json/man
    2. man_and_woman_day.json/woman

    But not the default one
    1. man_and_woman_day.json/part-00000

    WARN:
    when use `@luigi.multiple_text_files`, consider to wrap subfolders with
    StaticFile task class.

    `opts` is either omitted / a dict (the call returns the decorator), or
    the task class itself (the class is decorated directly). Anything else
    raises ValueError.
    """
    def func(task_cls):
        cjc = CompileJavaCode()

        def compile_java_code(self):
            """ compile java code dynamically. """
            if not os.path.exists(cjc.target_jar):
                CommandUtils.execute(cjc.compile_cmd)

        setattr(task_cls, "output_format", cjc.output_format)
        setattr(task_cls, "libjars", [cjc.target_jar, ])
        setattr(task_cls, "compile_java_code", compile_java_code)
        return task_cls

    # Compatible with old API.
    if opts is None or isinstance(opts, dict):
        return func
    # Check for a class first: `issubclass` raises TypeError on a non-class,
    # which would hide the ValueError below.
    if isinstance(opts, type) and issubclass(opts, luigi.Task):
        return func(opts)
    raise ValueError(opts)
60 |
61 |
class CompileJavaCode(object):
    """
    Names, paths and the shell script needed to assemble the output-format
    jar.
    """

    java_namespace = "com.voxlearning.bigdata.MrOutput"
    java_lib = "MultipleTextFiles"
    # Fully qualified class name: namespace + "." + class.
    output_format = ".".join([java_namespace, java_lib])
    # Two directory levels above this file.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    @cached_property
    def java_file(self):
        return "%s.java" % self.java_lib

    @cached_property
    def target_class(self):
        return "%s.class" % self.java_lib

    @cached_property
    def target_jar(self):
        jar_name = "%s.jar" % self.java_lib
        return os.path.join(self.root_dir, "java", jar_name)

    @cached_property
    def compile_cmd(self):
        """
        Shell script (steps joined by ";\\n") that compiles the java file
        and packs the class into `target_jar`. Tool locations and the
        classpath are looked up by running shell commands.
        """
        classes_dir = self.java_namespace.replace(".", "/")
        javac_cmd = commands.getoutput("which javac")
        java_classpath = commands.getoutput("hadoop classpath")
        jar_cmd = commands.getoutput("which jar")

        steps = list()
        # no absolute path, compact with java namespace.
        steps.append("cd %s/java" % self.root_dir)
        steps.append("""%s -classpath "%s" %s""" % (
            javac_cmd, java_classpath, self.java_file))
        # Recreate the namespace directory and move the class into it.
        steps.append("rm -rf %s" % classes_dir)
        steps.append("mkdir -p %s" % classes_dir)
        steps.append("cp %s %s" % (self.target_class, classes_dir))
        steps.append("%s cvf %s %s/*.class" % (
            jar_cmd, self.target_jar, classes_dir))
        return ";\n".join(steps)
103 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/persist_files.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["persist_files"]
4 |
5 | import os
6 | from luigi import Event
7 | from ..utils import IOUtils
8 |
9 |
10 | # NOTE deprecated
def persist_files(*files):  # decorator
    """
    Declare several data files with a small DSL and bind them to
    event_handler(Event.FAILURE).

    Each name in `files` becomes a property returning
    `<data_dir>/<name>.json`; when the task fails, these files are removed.
    """
    def make_getter(file1):
        # A factory, so every getter keeps its own `file1` instead of
        # sharing the loop variable.
        def _file(self):
            return os.path.join(self.data_dir, file1 + ".json")
        return _file

    def func(cls):
        # 1. Set the persisted-file attributes.
        setattr(cls, "__persist_files", files)
        for file1 in files:
            setattr(cls, file1, property(make_getter(file1)))

        # 2. Remove these files when the task fails.
        def clean_tmp(task, exception):
            for file1 in files:
                IOUtils.remove_files(getattr(task, file1))
            # IOUtils.remove_files(task.data_file)
            # NOTE it seems Hadoop handles the output files of failed tasks
            # by itself; otherwise the task would stay in running through
            # N retries.
        cls.event_handler(Event.FAILURE)(clean_tmp)

        return cls

    return func
35 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/plug_packages.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["plug_packages"]
4 |
5 | from ..manager import luiti_config
6 |
7 |
def plug_packages(*package_names):
    """
    Let luigi know which packages should be attached, and can send to
    YARN, etc.

    Package format can be any valid Python package name, such as "project_B" or
    "project_C==0.0.2", etc.

    Usage: use `active_packages` decorator to notice luigi that these packages
    should include.

    Empty names (None, "") are skipped; every other name is added to
    `luiti_config.attached_package_names`.
    """
    for package_name in filter(None, package_names):
        # load all packages's depended pacakges.
        luiti_config.attached_package_names.add(package_name)
    # TODO why should do `luigi.hadoop.attach` in `active_packages`
24 |
--------------------------------------------------------------------------------
/luiti/luigi_decorators/ref_tasks.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["ref_tasks"]
4 |
5 | from ..manager import load_a_task_by_name, luiti_config
6 |
7 |
def ref_tasks(*tasks):  # decorator
    """
    Link the tasks a class depends on, so they can be reached as attributes.

    For every name two properties are added: `<name>` (the task class) and
    `<name>_task` (that class called with `self.date_value`). Both values
    are cached in the instance `__dict__` after the first access.

    Example:

    ```python
    @ref_tasks("TaskB", "TaskC")
    class TaskA(TaskWeekBase):
        pass

    TaskA().TaskB == TaskB
    TaskA().TaskC == TaskC
    ```
    """
    def cached(instance, slot, build):
        # Shared lazy lookup: build once, then serve from the instance dict.
        value = instance.__dict__.get(slot, None)
        if value is None:
            value = build()
            instance.__dict__[slot] = value
        return value

    def wrap_cls(ref_task_name):
        def _func(self):
            return cached(self, ref_task_name,
                          lambda: load_a_task_by_name(ref_task_name))
        return _func

    def wrap_instance(ref_task_name, task_name):
        def _func(self):
            return cached(self, task_name,
                          lambda: getattr(self, ref_task_name)(self.date_value))
        return _func

    # Fix pickle dump, but it maybe unneeded.
    def __getstate__(self):
        """ Fix luiti_tasks module namespace conflicts. """
        for cname in self._ref_tasks:  # class name
            iname = cname + "_task"  # instance name
            # Drop the cached values; the properties rebuild them lazily.
            self.__dict__.pop(cname, None)
            self.__dict__.pop(iname, None)
        return self.__dict__

    # NOTE(review): defined but never attached to the class below; confirm
    # whether that is intended.
    def __setstate__(self, d1):
        # 1. default
        self.__dict__.update(d1)
        # 2. plug other package in `.__init_luiti`
        luiti_config.curr_project_name = self.package_name
        luiti_config.link_packages()

    # cached_property could not capture the ref_task_name variable (it ended
    # up bound to a single name); property captures it correctly.
    def func(cls):
        cls._ref_tasks = tasks
        for ref_task_name in cls._ref_tasks:
            class_getter = property(wrap_cls(ref_task_name))
            setattr(cls, ref_task_name, class_getter)

            # TODO return according to the current date.
            task_name = "%s_%s" % (ref_task_name, "task")
            instance_getter = property(wrap_instance(ref_task_name, task_name))
            setattr(cls, task_name, instance_getter)

        # clear ref task info when pickle.dump
        cls.__getstate__ = __getstate__
        return cls
    return func
77 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["TaskInit", "ArrowParameter", "TaskBase", "HadoopExt", "RootTask", "luigi"]
4 |
5 |
6 | from .task_init import TaskInit
7 | from .parameter import ArrowParameter
8 | from .task_base import TaskBase
9 | from .hadoop_ext import HadoopExt
10 | from .root_task import RootTask
11 |
12 | from .create_python_package import luigi
13 | from .manage_decorators import ManageDecorators
14 | ManageDecorators.bind_to(luigi)
15 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/create_python_package.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["create_packages_archive_with_support_egg"]
4 |
5 | import os
6 | from .luigi_root_context import luigi
7 |
orig_create_packages_archive = luigi.hadoop.create_packages_archive


def create_packages_archive_with_support_egg(packages, filename):
    """
    Fix original luigi's `create_packages_archive` cannt attach egg packages
    (zip file type) to tarfile, Cause it's coping file mechanism by absolute
    path.

    Runs the original function first, then appends to the tar at `filename`:
    the non-.pyc content of every zip file that contains one of `packages`,
    and `client.cfg` from the current working directory when it exists.
    """
    import shutil
    import tarfile
    import tempfile
    import zipfile

    # 1. original create tar file
    orig_create_packages_archive(packages, filename)

    logger = luigi.hadoop.logger
    fake_exists_path = "/"  # root is awlays exists

    def get_parent_zip_file_within_absolute_path(path1):
        """ Walk up from `path1` until a parent that is a regular file. """
        path2 = path1
        while path2 != fake_exists_path:
            parent = os.path.dirname(path2)
            if parent == path2:
                break  # cannot go further up (e.g. "//"); avoid looping forever
            path2 = parent
            if os.path.isfile(path2):
                return True, path2
        return False, path2

    # 2. append python egg packages that 1. not covered
    tar = tarfile.open(filename, "a")  # Force append
    try:
        def add(src, dst):
            logger.debug('adding to tar: %s -> %s', src, dst)
            tar.add(src, dst)

        added_zip_files = set()
        for package1 in packages:
            path2 = (list(getattr(package1, "__path__", [])) + [fake_exists_path])[0]
            if os.path.exists(path2):
                continue  # so luigi can import it.
            if not path2.startswith("/"):
                continue  # we only care about libraries.

            is_success, zipfilename3 = \
                get_parent_zip_file_within_absolute_path(path2)
            if not is_success:
                continue
            if zipfilename3 in added_zip_files:
                continue  # several packages can live in the same egg
            added_zip_files.add(zipfilename3)

            tmp_dir3 = tempfile.mkdtemp()
            try:
                with zipfile.ZipFile(zipfilename3) as egg:
                    egg.extractall(tmp_dir3)

                for root4, dirs4, files4 in os.walk(tmp_dir3):
                    for file5 in files4:
                        if file5.endswith(".pyc"):
                            continue
                        add(
                            os.path.join(root4, file5),
                            os.path.join(
                                root4.replace(tmp_dir3, "").lstrip("/"), file5))
            finally:
                # The files were added to the tar above, so the extracted
                # copy is no longer needed.
                shutil.rmtree(tmp_dir3, ignore_errors=True)

        client_cfg = os.path.join(os.getcwd(), "client.cfg")
        if os.path.exists(client_cfg):
            tar.add(client_cfg, "client.cfg")
    finally:
        # Close the archive even when adding a file fails.
        tar.close()

luigi.hadoop.create_packages_archive = create_packages_archive_with_support_egg  # wrap old function
71 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/hadoop_ext.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from __future__ import print_function
4 |
5 | __all__ = ['HadoopExt']
6 |
7 | import sys
8 | import luigi.hadoop
9 | from luigi.hadoop import flatten
10 | from itertools import groupby
11 | from etl_utils import cached_property
12 |
13 | from ..utils import ExtUtils, TargetUtils
14 | from .task_init import TaskInit
15 |
16 | # See benchmark at https://gist.github.com/mvj3/02dca2bcc8b0ef1bbfb5
17 | # force to use faster ujson, or it's meaningless to use JSON format with no performance gained.
18 | import ujson as json
19 | import jsonpickle
20 |
21 |
class LuitiHadoopJobRunner(luigi.hadoop.HadoopJobRunner):
    """
    Job runner used in place of luigi's DefaultHadoopJobRunner.

    The streaming jar is read from the `[hadoop] streaming-jar` entry of
    luigi's configuration; the remaining options come from the caller.
    """

    # params are copied from HadoopJobRunner
    def __init__(self, libjars=None, output_format=None):
        streaming_jar = luigi.hadoop.configuration.get_config().get(
            'hadoop', 'streaming-jar')
        super(LuitiHadoopJobRunner, self).__init__(
            streaming_jar=streaming_jar,
            output_format=output_format,
            libjars=libjars,
        )
34 |
35 |
# Serializer table, keyed by the value of `HadoopExt.data_interchange_format`.
# Each entry provides three callables:
#   "serialize"          - applied to final output items (see `HadoopExt.writer`)
#   "internal_serialize" - applied to items passed between map and reduce
#   "deserialize"        - the inverse, applied when reading items back
DataInterchange = {
    "python": {"serialize": str,
               "internal_serialize": repr,
               # NOTE(review): `eval` executes whatever text it is handed;
               # only safe while the data was produced by this same job.
               "deserialize": eval},
    "json": {"serialize": json.dumps,
             "internal_serialize": json.dumps,
             "deserialize": json.loads},
    "jsonpickle": {"serialize": jsonpickle.dumps,
                   "internal_serialize": jsonpickle.dumps,
                   "deserialize": jsonpickle.loads}
}
47 |
48 |
class HadoopExt(luigi.hadoop.JobTask, ExtUtils.ExtendClass):
    """
    luigi Hadoop JobTask with luiti's date handling and pluggable
    (de)serialization of the records exchanged between map and reduce.
    """

    # Key into `DataInterchange`; selects the (de)serializers used below.
    data_interchange_format = "python"

    run_mode = "mr_distribute"
    # Visible in the output: the number of part-0000N files equals the
    # number of reducers.
    n_reduce_tasks = 1

    # Single-output format is the default. Alternatives:
    #   "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat"
    #       single output, but this version has problems.
    #   "org.apache.hadoop.mapred.lib.MultipleTextOutputFormat"
    #       multiple outputs.
    output_format = "org.apache.hadoop.mapred.TextOutputFormat"
    output_format_default = output_format
    libjars = []

    def __init__(self, *args, **kwargs):
        """ Mirrors TaskBase, so subclasses keep TaskBase's date rewriting. """
        super(HadoopExt, self).__init__(*args, **kwargs)
        TaskInit.setup(self)

    @cached_property
    def serialize(self):
        codecs = DataInterchange[self.data_interchange_format]
        return codecs['serialize']

    @cached_property
    def internal_serialize(self):
        codecs = DataInterchange[self.data_interchange_format]
        return codecs['internal_serialize']

    @cached_property
    def deserialize(self):
        codecs = DataInterchange[self.data_interchange_format]
        return codecs['deserialize']

    def writer(self, outputs, stdout, stderr=sys.stderr):
        """
        Format the reducer's output records and print them to `stdout`,
        one tab separated line per record.

        On any failure the offending record is printed to `stderr` and the
        exception is re-raised.
        """
        for output in outputs:
            try:
                output = flatten(output)
                if self.data_interchange_format != "json":
                    # JSON records are already serialized, every other format
                    # still has to be.
                    serialize = self.serialize
                    output = [serialize(item) for item in output]
                else:
                    # Only dump one json string, and skip the other one
                    # (maybe the key, maybe the value).
                    output = [item for item in output
                              if item not in ["", None]]
                line = "\t".join(str(item) for item in output)
                print(line, file=stdout)
            except:
                print(output, file=stderr)
                raise

    def _reduce_input(self, inputs, reducer, final=NotImplemented):
        """
        Group consecutive inputs by their serialized key and feed every group
        to `reducer`; afterwards yield whatever `final` produces, if given.
        """
        def serialized_key(record):
            return self.internal_serialize(record[0])

        for raw_key, group in groupby(inputs, key=serialized_key):
            key = self.deserialize(raw_key)
            values = (record[1] for record in group)
            for output in reducer(key, values):
                yield output

        if final != NotImplemented:
            for output in final():
                yield output
        self._flush_batch_incr_counter()

    def internal_reader(self, input_stream):
        """
        Deserialize every tab separated field of every input line.
        Yields one list of objects per line.
        """
        for input_line in input_stream:
            deserialize = self.deserialize
            yield [deserialize(field) for field in input_line.split("\t")]

    def internal_writer(self, outputs, stdout):
        """
        Print every record as a tab separated line of internally
        serialized items.
        """
        for output in outputs:
            fields = map(self.internal_serialize, output)
            print("\t".join(fields), file=stdout)

    # overwrite
    def job_runner(self):
        """ will be wrapped in the `run` function. """
        # Auto compile java code when a non-default output format is used.
        uses_default_format = self.output_format == self.output_format_default
        if not uses_default_format:
            self.compile_java_code()

        return LuitiHadoopJobRunner(
            libjars=self.libjars, output_format=self.output_format)

    def output(self):
        return TargetUtils.hdfs(self.data_file)

    def jobconfs_opts(self):
        reduce_tasks_opt = 'mapred.reduce.tasks=%s' % self.n_reduce_tasks
        return ["mapreduce.framework.name=yarn", reduce_tasks_opt]

    def jobconfs(self):
        # Starts the lookup *after* luigi.hadoop.JobTask in the MRO, then
        # appends luiti's own options.
        jcs = super(luigi.hadoop.JobTask, self).jobconfs()
        jcs.extend(self.jobconfs_opts())
        return jcs

    # TestCase related attrs
    def mrtest_input(self):
        raise NotImplementedError

    def mrtest_output(self):
        raise NotImplementedError

    def mrtest_attrs(self):
        return {}

    def reader(self, input_stream):
        """
        Overwrite luigi's reader: strip every line and skip blank ones.
        Yields 1-tuples.
        """
        for raw_line in input_stream:
            stripped = raw_line.strip()
            if not stripped:
                continue
            yield (stripped,)
174 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/luigi_root_context.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["luigi"]
4 |
5 | """
6 | Bind all things to `luigi` root namespace.
7 | """
8 |
9 |
# Importing the submodules makes them reachable as attributes of `luigi`;
# the self-assignments below only spell that intent out.
import luigi.hdfs
luigi.hdfs = luigi.hdfs  # just make a link

import luigi.hadoop
luigi.hadoop = luigi.hadoop  # just make a ref

from .hadoop_ext import HadoopExt
luigi.hadoop.HadoopExt = HadoopExt  # write back
# NOTE keeps luigi.hadoop compatible with "track the job: "

# Debug switch, read by TaskBase.reset_date.
luigi.debug = False

luigi.tmp_dir = "/tmp"  # default one

# TODO lazily
from ..utils import TargetUtils
# It has to read global configuration anyway, so it is simply bound onto the
# luigi namespace.
luigi.HDFS = TargetUtils.hdfs


from ..manager import luiti_config, active_packages
luigi.ensure_active_packages = lambda: active_packages  # make a wrap
luigi.luiti_config = luiti_config
# Back reference, so luiti_config can reach this patched `luigi`.
luiti_config.linked_luigi = luigi
33 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/manage_decorators.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | import glob
5 |
6 |
class ManageDecorators(object):
    """ Attaches every luiti decorator to the `luigi` namespace. """

    @staticmethod
    def bind_to(luigi, decorator_dir=None, package="luiti.luigi_decorators"):
        """
        Import every decorator module and set it as an attribute of `luigi`.

        The decorator name MUST be the same as its filename: for each
        `<name>.py` in `decorator_dir` (files starting with `__` are
        skipped) the module `<package>.<name>` is imported and its attribute
        `<name>` is bound as `luigi.<name>`.

        :param luigi: object receiving the decorators (normally the luigi
            module).
        :param decorator_dir: directory to scan; defaults to the
            `luigi_decorators` directory next to this package directory.
        :param package: dotted package the decorator modules are imported
            from.
        :returns: the same `luigi` object.
        :raises AssertionError: when no decorator file is found.
        :raises SystemExit: with status 1 when a decorator module cannot be
            imported.
        """
        if decorator_dir is None:
            root_dir = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            decorator_dir = os.path.join(root_dir, "luigi_decorators")

        files = glob.glob(os.path.join(decorator_dir, "*.py"))
        # basename() instead of split("/") keeps this portable; sorting makes
        # the binding order deterministic.
        decorator_names = sorted(
            name for name in
            (os.path.basename(path).split(".")[0] for path in files)
            if not name.startswith("__"))
        assert len(decorator_names) > 0, decorator_names

        for name in decorator_names:
            try:
                mod = __import__(package + "." + name, fromlist=[name])
            except ImportError as err:
                print("[Import error decorator name] %s" % name)
                print("%s" % (err,))
                # Exit with a failure status; a bare exit() reported success.
                raise SystemExit(1)
            setattr(luigi, name, getattr(mod, name))

        return luigi
30 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/parameter.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['ArrowParameter', "arrow"]
4 |
5 | import luigi
6 | import arrow
7 | from dateutil import tz
8 |
9 |
class ArrowParameter(luigi.DateParameter):

    """
    Date parameter whose values are Arrow objects.

    Accepted string forms:
        "2014-11-24T00:00:00+00:00"  # => len 25
        "2014-11-24"                 # => len 10
    """

    arrow = arrow  # make a ref

    def parse(self, s):
        """ Overwrite the default implementation: parse `s` via `get`. """
        text = str(s)  # ensure we work on a str
        valid_lengths = [25, 10]
        assert len(text) in valid_lengths, \
            "Date format must be 2014-11-24T00:00:00+00:00 or 2014-11-24 !"
        return ArrowParameter.get(text)

    @staticmethod
    def get(*strs):
        """ Like `arrow.get`, with tzinfo replaced by the local timezone. """
        parsed = arrow.get(*strs)
        return parsed.replace(tzinfo=tz.tzlocal())

    @staticmethod
    def now():
        """ The current time, passed through `get`. """
        current = arrow.now()
        return ArrowParameter.get(current)
36 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/root_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import luigi
5 | from luigi import LocalTarget
6 |
7 |
class RootTask(luigi.Task):
    """ Terminal dependency whose output is this source file itself. """

    def output(self):
        # This module's own file: it exists for ever.
        this_file = os.path.realpath(__file__)
        return LocalTarget(this_file)
12 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/task_base.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskBase']
4 |
5 | import os
6 | import arrow
7 | from inflector import Inflector
8 | from etl_utils import cached_property
9 |
10 | from .luigi_root_context import luigi
11 | from .root_task import RootTask
12 | from ..utils import DateUtils, ExtUtils, IOUtils
13 | from ..manager import luiti_config
14 |
15 | from .parameter import ArrowParameter
16 | from .task_init import TaskInit
17 |
18 |
class TaskBase(luigi.Task, ExtUtils.ExtendClass):
    """
    Base class of luiti tasks.

    The name of a subclass MUST end with its **date type**, e.g. Day,
    Week, ...
    """

    # One of "local", "mr_distribute", "mr_local".
    run_mode = "local"

    # One **unified** date type, which prevents running several tasks at once.
    date_value = ArrowParameter()

    # Will be overwritten by @decorator.
    # Must not start with **two underscores**, otherwise Python treats them
    # as private variables that cannot be inherited.
    # TODO "private variable" may be the wrong explanation.
    _persist_files = []
    _ref_tasks = []

    # Mark current task as an External Task, same as luigi.ExternalTask.
    is_external = False

    # Placeholders; TaskInit.setup asserts that `root_dir` was replaced.
    root_dir = NotImplementedError
    run = NotImplementedError

    def __init__(self, *args, **kwargs):
        # Fix the date_value type, however it was passed in.
        if "date_value" in kwargs:
            kwargs["date_value"] = ArrowParameter.get(kwargs["date_value"])
        if len(args) == 1:  # just the luiti's date_value parameter
            (only_arg, ) = args
            args = (ArrowParameter.get(only_arg), )

        super(TaskBase, self).__init__(*args, **kwargs)
        TaskInit.setup(self)

    # Default one, always return True
    def requires(self):
        return RootTask()

    def output(self):
        return IOUtils.local_target(self.data_file)

    def errput(self):
        return IOUtils.local_target(self.data_file + ".err")

    @cached_property
    def data_dir(self):
        assert self.root_dir, "self.root_dir should not be None!"
        return os.path.join(self.root_dir, self.date_str)

    @cached_property
    def data_file(self):
        basename = self.data_name + ".json"
        return os.path.join(self.data_dir, basename)

    @cached_property
    def data_name(self):
        clsname = self.__class__.__name__
        return Inflector().underscore(clsname)

    @cached_property
    def date_str(self):
        return self.date_value.strftime("%Y-%m-%d")

    @cached_property
    def date_type(self):
        clsname = self.__class__.__name__
        return luiti_config.get_date_type(clsname)

    @cached_property
    def date_value_by_type_in_last(self):
        return DateUtils.date_value_by_type_in_last(
            self.date_value, self.date_type)

    @cached_property
    def date_value_by_type_in_begin(self):
        current = ArrowParameter.get(self.date_value)
        return current.floor(self.date_type)

    @cached_property
    def date_value_by_type_in_end(self):
        current = ArrowParameter.get(self.date_value)
        return current.ceil(self.date_type)

    @cached_property
    def pre_task_by_self(self):
        """ If two periods are crossed there is no previous data file. """
        if self.is_reach_the_edge:
            return RootTask()
        return self.__class__(self.date_value_by_type_in_last)

    @cached_property
    def is_reach_the_edge(self):
        return False  # default. e.g. add semester

    def reset_date(self):
        # **Force** one unified (arrow) date format, so luigi does not run
        # two tasks at the same time.
        self.date_value = ArrowParameter.get(self.date_value)

        if self.date_type == 'range':
            return

        orig_date = self.date_value
        new_date = orig_date.floor(self.date_type)
        if orig_date != new_date:
            if luigi.debug:
                print("[reset date by %s] from %s to %s" %
                      (self.date_type, orig_date, new_date))
            self.date_value = new_date

    @classmethod
    def instances_by_date_range(cls, first_date, last_date):
        """ Return all instances of this task that belong to the period. """
        assert isinstance(first_date, arrow.Arrow)
        assert isinstance(last_date, arrow.Arrow)

        if "Range" in cls.__name__:
            # return head and tail directly
            return list(set([cls(first_date), cls(last_date)]))

        date_type = luiti_config.get_date_type(cls.__name__)
        dates = arrow.Arrow.range(date_type, first_date, last_date)
        return [cls(date1.datetime) for date1 in dates]

    @cached_property
    def task_class(self):
        return self.__class__

    @cached_property
    def task_clsname(self):
        return self.task_class.__name__

    @cached_property
    def package_name(self):
        # First component of the dotted module path of the task class.
        module_name = self.task_class.__module__
        return module_name.split(".")[0]
142 |
--------------------------------------------------------------------------------
/luiti/luigi_extensions/task_init.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from dateutil import tz
4 | from .parameter import ArrowParameter
5 |
6 |
class TaskInit(object):
    """ Shared initialisation that turns a luigi Task into a luiti task. """

    @staticmethod
    def setup(task_instance):
        """
        Let luigi's Task support luiti's operations.

        You need to call this function if you want to extend luigi.

        Sets `orig_date_value`, resets the date to the beginning of the
        task's date type, forces `data_file` and `package_name` to load, and
        rebuilds `param_args`, `task_id` and the cached hash from
        `param_kwargs`.

        :param task_instance: an initialised task with `date_value`,
            `param_kwargs`, `task_family`, `root_dir` and `reset_date()`.
        :raises AssertionError: when `root_dir` is still the
            NotImplementedError placeholder.
        """
        self = task_instance

        # Used when a period is crossed, to decide which days of the week
        # belong to it. E.g. if the semester starts on 2015-02-17 (Tuesday),
        # that week's data only covers 02-17..02-22; while during the winter
        # vacation (running the task for 2015-02-16) the week only contains
        # 02-16.
        d1 = ArrowParameter.get(self.date_value).replace(tzinfo=tz.tzlocal())
        self.orig_date_value = d1  # exists only if this `setup` executed.

        # reset date to the beginning of current date type here
        self.reset_date()

        assert task_instance.root_dir is not NotImplementedError, \
            [task_instance, task_instance.root_dir]
        self.data_file  # force load it now, or `output` still load it.
        self.package_name  # force load it now, use to serialize

        # Fix luigi.Task#__eq__, which compares `param_args`:
        #
        #   def __eq__(self, other):
        #       return self.__class__ == other.__class__ and \
        #           self.param_args == other.param_args
        #
        # so `param_args` is rebuilt from the normalised, stringified values,
        # e.g. ['2015-06-23T00:00:00+08:00'].
        self.param_kwargs["date_value"] = ArrowParameter.get(
            self.param_kwargs["date_value"])
        self.param_args = tuple(sorted(
            str(value) for value in self.param_kwargs.values()))

        # NOTE below codes are copied from luigi's Task
        # Build up task id
        task_id_parts = ["%s=%s" % (k1, v1)
                         for k1, v1 in self.param_kwargs.items()
                         if k1 not in ["pool"]]
        self.task_id = '%s(%s)' % (self.task_family, ', '.join(task_id_parts))

        task_hash = hash(self.task_id)
        # A `self.__hash` written inside a class named `Task` is stored as
        # `_Task__hash`, and that is what its `__hash__` reads. Written from
        # this class the same spelling is mangled to `_TaskInit__hash`, so the
        # attribute has to be named explicitly to refresh the hash.
        self._Task__hash = task_hash
        # Kept for backward compatibility with the previous attribute name.
        self.__hash = task_hash
49 |
--------------------------------------------------------------------------------
/luiti/manager/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = [
4 | "ld",
5 |
6 | "load_a_task_by_name",
7 | "print_all_tasks",
8 | "new_a_project",
9 | "generate_a_task",
10 | "find_dep_on_tasks",
11 |
12 | "active_packages",
13 |
14 | "luiti_config",
15 |
16 | "Cli",
17 | "PackageMap",
18 | ]
19 |
20 | from .loader import Loader
21 | from .table import Table
22 | from .dep import Dep
23 | from .files import Files
24 |
25 | from .config import luiti_config
26 | from .package_map import PackageMap
27 | from .active_packages import active_packages
28 |
29 |
30 | from .generate_from_templates import GenerateFromTemplates
31 |
32 | from .cli import Cli
33 |
34 |
# API list
# Module-level aliases, so callers can use `luiti.manager.<name>` directly
# instead of going through the helper classes imported above.
find_dep_on_tasks = Dep.find_dep_on_tasks
get_all_date_file_to_task_instances = Files.get_all_date_file_to_task_instances
soft_delete_files = Files.soft_delete_files
load_all_tasks = Loader.load_all_tasks
load_a_task_by_name = Loader.load_a_task_by_name
print_all_tasks = Table.print_all_tasks
print_files_by_task_cls_and_date_range = \
    Table.print_files_by_task_cls_and_date_range
new_a_project = GenerateFromTemplates.new_a_project
generate_a_task = GenerateFromTemplates.generate_a_task
46 |
47 |
48 | from .lazy_data import ld
49 |
--------------------------------------------------------------------------------
/luiti/manager/active_packages.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | from .config import luiti_config as lc
5 |
# Packages (bare name, and the original spec) that were already activated.
processed_package_names = set([])


def active_packages(orig_func):
    """
    Decorator: make sure all attached packages are activated before calling
    `orig_func`.

    called by `PackageMap.task_clsname_to_package`

    On every call of the wrapped function, each entry of
    `lc.attached_package_names` not processed yet is required through
    pkg_resources, imported, attached to luigi.hadoop and, when its directory
    contains `luiti_tasks`, added to `lc.luiti_tasks_packages`.
    """
    def new_func(*args, **kwargs):
        # 1. Setup env
        lc.link_packages()

        # 2. Load related packages.
        import pkg_resources
        import luigi.hadoop
        import re

        # "<name><version spec>", e.g. "package_b==4.2"
        package_spec_re = re.compile(r"(^[a-z0-9_]+)(.*)", re.IGNORECASE)

        # iterate over a copy: fix Set changed size during iteration
        for p1 in list(lc.attached_package_names):
            package2, version2 = package_spec_re.match(p1).groups()
            if package2 in processed_package_names:
                continue

            # Pip cant manage versions packages, only exist one version at
            # one time.
            try:
                if version2:
                    pkg_resources.require(p1)
            except Exception:
                # The exact version is not available: fall back to any one.
                pkg_resources.require(package2)

            # TODO luiti needs the version before copying but not after it;
            # when distributed, checking the packages directory is enough.

            # Let luigi know it.
            package2_lib = lc.import2(package2)
            luigi.hadoop.attach(package2_lib)

            # Add valid package which has .luiti_tasks
            # compatible with a package that is a plain python file.
            try:
                path = (package2_lib.__path__ + [""])[0]
            except Exception:
                print("[package2_lib load error] %s" % (package2_lib,))
                path = "/package/load/error"
            # TODO support the egg zip format: look for luiti_tasks inside
            # it, then hint to add zip_safe=False
            if os.path.exists(path + "/luiti_tasks"):
                # .__init_luiti Maybe not exists, so execute this first
                lc.luiti_tasks_packages.add(package2_lib)

            # The check above uses the bare name, so that is what has to be
            # recorded; the original spec is kept as well.
            processed_package_names.add(package2)
            processed_package_names.add(p1)
        return orig_func(*args, **kwargs)  # call it at last.
    new_func.__name__ = orig_func.__name__
    return new_func
61 |
--------------------------------------------------------------------------------
/luiti/manager/config.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | import sys
5 | from inflector import Inflector
6 | from etl_utils import singleton, cached_property
7 | import arrow
8 |
9 |
@singleton()
class LuitiConfigClass(object):

    """
    Process-wide luiti configuration.

    Make sure init variables only once. Most helpers below work on the
    module-level `luiti_config` object rather than on `self`.
    """
    # arrow.Arrow._ATTRS = ['year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond']
    DateTypes = ["range", "week", "biweekly", "quarter"] + arrow.Arrow._ATTRS

    # Filled in by `link_packages` unless set beforehand.
    curr_project_name = None
    curr_project_dir = None

    linked_luigi = None

    @cached_property
    def attached_package_names(self):
        # Package specs to activate; starts with luiti itself.
        return set(['luiti'])

    @cached_property
    def luiti_tasks_packages(self):
        # Imported packages that provide luiti tasks; starts empty.
        return set([])

    @staticmethod
    def import2(a_package):
        # A non-empty `fromlist` makes __import__ return the named (sub)module
        # itself rather than the top-level package.
        return __import__(a_package, None, None, 'non_empty')

    @staticmethod
    def get_date_type(name1):
        """ Inherit class must be in TaskBase{Day,Week,Month,Range} style. """
        assert isinstance(name1, (str, unicode))
        # The date type is the last underscore separated word of the name.
        str1 = Inflector().underscore(name1).split("_")[-1].lower()
        assert str1 in luiti_config.DateTypes, [str1, luiti_config.DateTypes]
        return str1

    @staticmethod
    def get_time_task(name1):
        """ return e.g. TaskDay """
        type2 = luiti_config.get_date_type(name1)
        return "Task" + Inflector().camelize(type2)

    @staticmethod
    def link_packages():
        """
        called by `active_packages`

        Works out the current project directory and name, and imports the
        packages that contain luiti tasks.
        """
        is_in_luigi_distributed = False

        # 1. abnormal task class
        if luiti_config.curr_project_name == "__main__":
            return False

        # 2. setup current project as root
        if luiti_config.curr_project_dir is None:
            luiti_config.curr_project_dir = os.getcwd()  # auto from current class
        luiti_config.fix_project_dir()

        def exists(filename1):
            # Existence check relative to the current project directory.
            return os.path.exists(os.path.join(luiti_config.curr_project_dir, filename1))

        # These files are created by luigi.
        if exists("job-instance.pickle") and exists("job.jar") and \
                exists("packages.tar") and exists("luigi"):
            is_in_luigi_distributed = True

        # compatible with a non-luiti project
        is_a_luiti_project = exists("luiti_tasks")

        if luiti_config.curr_project_name is None:
            if is_in_luigi_distributed:
                for item1 in os.listdir(luiti_config.curr_project_dir):
                    # is a valid python package
                    if exists(item1 + "/__init__.py") and \
                            exists(item1 + "/luiti_tasks"):
                        luiti_config.luiti_tasks_packages.add(luiti_config.import2(item1))
            else:
                # "project_A"
                curr_project_name = luiti_config.get_curr_project_name()
                luiti_config.curr_project_name = curr_project_name

                # project_A/
                curr_project_syspath = os.path.dirname(luiti_config.curr_project_dir)
                if curr_project_syspath not in sys.path:
                    sys.path.insert(0, curr_project_syspath)

                luiti_config.luiti_tasks_packages.add(luiti_config.import2(luiti_config.curr_project_name))

        # 3. ensure other luiti tasks packages can be loaded.
        if is_a_luiti_project:
            luiti_config.import2(
                luiti_config.curr_project_name + ".luiti_tasks.__init_luiti")

    def get_curr_project_path(self):
        curr_package_name = self.get_curr_project_name()
        curr_path = luiti_config.curr_project_dir
        dir1 = curr_path.rstrip("/")
        # The name appearing twice means the project_A/project_A layout:
        # step one level up.
        if dir1.split("/").count(curr_package_name) == 2:
            dir1 = os.path.dirname(dir1)
        return dir1

    def get_curr_project_name(self):
        """ a valid Python package path. """
        assert isinstance(luiti_config.curr_project_dir, str), luiti_config.curr_project_dir
        return os.path.basename(luiti_config.curr_project_dir)

    def fix_project_dir(self):
        """ Fix project_A/project_A/luiti_tasks dir """
        _try_dir = os.path.join(
            luiti_config.curr_project_dir,
            os.path.basename(luiti_config.curr_project_dir))
        if os.path.exists(_try_dir):  # cause of the same name
            luiti_config.curr_project_dir = _try_dir


# The shared object referenced by the static helpers above.
luiti_config = LuitiConfigClass()
122 |
--------------------------------------------------------------------------------
/luiti/manager/dep.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from collections import defaultdict
4 |
5 |
class Dep(object):
    """ Dependency queries over luiti task classes. """

    @staticmethod
    def find_dep_on_tasks(curr_task_1, task_classes_1):
        """
        Return all task classes that depend on `curr_task_1`, directly or
        through any number of intermediate tasks.

        A class depends on another one when the other's class name is listed
        in its `_ref_tasks`.

        :param curr_task_1: the task class whose dependents are wanted.
        :param task_classes_1: list of candidate task classes.
        :returns: list of task classes, `curr_task_1` itself excluded, each
            class once, in discovery order.
        """
        # The DAG libraries that were tried (e.g. dagger) lack this feature
        # or are awkward to use, so it is implemented here.
        task_name_to_class = {task_cls.__name__: task_cls
                              for task_cls in
                              (task_classes_1 + [curr_task_1])}

        linked_dict = defaultdict(list)  # dep_task => next_task
        for task_2 in task_classes_1:
            for ref_task_name_3 in task_2._ref_tasks:
                linked_dict[ref_task_name_3].append(task_2.__name__)

        # Walk the "is depended on by" links until nothing new shows up.
        # `visited` also protects against reference cycles.
        curr_name = curr_task_1.__name__
        visited = set([curr_name])
        found_names = []
        pending = [curr_name]
        while pending:
            task_name = pending.pop()
            # .get() so that looking up a leaf does not grow the defaultdict
            for next_task_name in linked_dict.get(task_name, ()):
                if next_task_name in visited:
                    continue
                visited.add(next_task_name)
                found_names.append(next_task_name)
                pending.append(next_task_name)

        return [task_name_to_class[name_1] for name_1 in found_names]
46 |
--------------------------------------------------------------------------------
/luiti/manager/files.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from ..luigi_extensions import ArrowParameter
4 | import luigi.hdfs
5 | from datetime import datetime
6 |
7 |
class Files(object):

    """ Look up and soft-delete the outputs generated by luiti tasks. """

    @staticmethod
    def get_all_date_file_to_task_instances(date_range, task_classes):
        """
        Map every output file to the task instance producing it, for all
        instances of `task_classes` inside `date_range`
        (formatted like "20140901-20140905").
        """
        assert_msg = "[error] correct format is \"20140901-20140905\", " \
            "but the input is %s" % date_range
        assert len(date_range) == 17, assert_msg

        first_date = ArrowParameter.get(date_range[0:8], "YYYYMMDD")
        last_date = ArrowParameter.get(date_range[9:], "YYYYMMDD")

        file_to_instance = dict()
        for task_cls in task_classes:
            instances = task_cls.instances_by_date_range(
                first_date, last_date)
            for instance in instances:
                for path in instance._persist_files + [instance.data_file]:
                    file_to_instance[path] = instance
        return file_to_instance

    @staticmethod
    def soft_delete_files(*files):
        """
        Rename every existing file by appending a "-deleted-at-<timestamp>"
        suffix; files that do not exist are only reported. Returns 0.
        """
        suffix = datetime.now().strftime("-deleted-at-%Y%m%d-%H%M%S")

        for path in sorted(files):
            print("%s %s" % ("[delete file]", path))
            if not luigi.hdfs.clients.exists(path):
                print("[err] doesnt exist!")
                continue
            luigi.hdfs.clients.rename(path, path + suffix)
            print("")

        print("\nDone!")
        return 0
44 |
--------------------------------------------------------------------------------
/luiti/manager/generate_from_templates.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 | # :PEP8 -E221 -W603
3 |
4 | __all__ = ['GenerateFromTemplates']
5 |
6 | import os
7 | from inflector import Inflector
8 | from .config import luiti_config
9 |
10 | join = os.path.join
11 | exists = os.path.exists
12 |
13 |
class GenerateFromTemplates(object):
    """ Generate project and task skeleton files from the templates below. """

    @staticmethod
    def new_a_project(project_name):
        """
        Create the skeleton of a new project under `./<project_name>/`.

        Returns the paths of the important generated files.
        """
        project_name = Inflector().underscore(project_name)

        package_dir = join(project_name, project_name)
        tests_dir = join(project_name, "tests")

        readme_path = join(project_name, "README.markdown")
        setup_path = join(project_name, "setup.py")
        package_init = join(package_dir, "__init__.py")
        luiti_tasks_init = join(package_dir, "luiti_tasks/__init__.py")
        luiti_tasks_luiti = join(package_dir, "luiti_tasks/__init_luiti.py")
        tests_test_main = join(tests_dir, "test_main.py")

        # The order matters: each write creates at most one missing directory.
        write_content_to_file(a_project_readme(project_name), readme_path)
        write_content_to_file(a_project_setup(project_name), setup_path)
        for empty_init in (package_init, luiti_tasks_init):
            write_content_to_file(u"", empty_init)
        write_content_to_file(a_project_init_luiti(), luiti_tasks_luiti)
        write_content_to_file(
            a_project_test_main(project_name), tests_test_main)

        # important files
        return [readme_path, setup_path, luiti_tasks_luiti, tests_test_main]

    @staticmethod
    def generate_a_task(task_name, project_dir=None,):
        """
        Write a task skeleton to `luiti_tasks/<task_name>.py`, optionally
        below `project_dir`, and return the written content.
        """
        filename = Inflector().underscore(task_name) + ".py"
        path = join('luiti_tasks', filename)
        if project_dir:
            path = join(project_dir, path)

        task_source = a_task_template(Inflector().classify(task_name))
        return write_content_to_file(task_source, path)
52 |
53 |
""" 1. Project """
# README of a new project: the titleized project name as heading.
a_project_readme = lambda project_name: u"""
%s
=======================

TODO ...
""".strip() % (Inflector().titleize(project_name), )

# setup.py of a new project; the name is used for the distribution and for
# both listed packages.
a_project_setup = lambda project_name: u"""
# -*-coding:utf-8-*-

from setuptools import setup

setup(
name="%s",
version="0.0.1",
packages=[
"%s",
"%s/luiti_tasks", ],
zip_safe=False,
)
""".strip() % (project_name, project_name, project_name, )

""" has bugs ...
from setuptools import setup, find_packages
packages=find_packages("%s"),
package_dir = {"": "%s"},
"""


# luiti_tasks/__init_luiti.py of a new project.
a_project_init_luiti = lambda: u"""
# -*-coding:utf-8-*-

from luiti import *
luigi.plug_packages("package_a", "package_b==4.2")
""".strip()


# tests/test_main.py of a new project; `project_name` is accepted but not
# interpolated into the template.
a_project_test_main = lambda project_name: u"""
# -*- coding: utf-8 -*-

import os
import sys
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, root_dir)

import unittest
from luiti import MrTestCase


@MrTestCase
class TestMapReduce(unittest.TestCase):
mr_task_names = [
]

if __name__ == '__main__':
unittest.main()
""".strip()


""" 2. Task """
# Source of a new task module: a class named `task_clsname` deriving from the
# time task that matches the date type at the end of that name.
a_task_template = lambda task_clsname: u"""
# -*-coding:utf-8-*-

from .__init_luiti import *


@luigi.ref_tasks()
class %s(%s):

root_dir = "/foobar"
""".strip() % (task_clsname, luiti_config.get_time_task(task_clsname), )
126 |
127 |
def write_content_to_file(content, path):
    """
    Write `content`, UTF-8 encoded, to a new file at `path`.

    Missing parent directories are created. A path without a directory part
    is written into the current working directory.

    :param content: text to write.
    :param path: destination file; must not exist yet.
    :returns: `content`, unchanged.
    :raises ValueError: when `path` already exists.
    """
    if os.path.exists(path):
        raise ValueError("path [%s] is already exists!" % path)

    dir1 = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.mkdir("") would fail.
    if dir1 and not os.path.exists(dir1):
        os.makedirs(dir1)

    # Binary mode: the encoded bytes are written as they are.
    with open(path, 'wb') as f1:
        f1.write(content.encode("UTF-8"))

    print("[info] generate %s file." % path)

    return content
143 |
--------------------------------------------------------------------------------
/luiti/manager/lazy_data.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["ld"]
4 |
5 |
6 | from etl_utils import singleton, cached_property
7 |
8 | from .loader import Loader
9 | from .dep import Dep
10 | from .table import Table
11 |
12 |
@singleton()
class LazyData(object):
    """Cached access to the outcome of loading every luiti task."""

    @cached_property
    def result(self):
        """Whatever `Loader.load_all_tasks()` returns, computed once."""
        return Loader.load_all_tasks()

    @cached_property
    def all_task_classes(self):
        """List of the `task_cls` values found under `result['success']`."""
        task_classes = []
        for entry in self.result['success']:
            task_classes.append(entry['task_cls'])
        return task_classes
23 |
# Module-level LazyData object exported as `ld`. It is also attached to Dep
# and Table as their `ld` attribute, so those classes can read the loaded
# task classes through it.
ld = LazyData()
Dep.ld = ld
Table.ld = ld
27 |
--------------------------------------------------------------------------------
/luiti/manager/loader.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import sys
4 | import traceback
5 | from inflector import Inflector
6 |
7 | from .config import luiti_config as lc
8 | from .active_packages import active_packages
9 | from .package_map import PackageMap
10 |
11 |
class Loader(object):
    """Imports luiti task classes by name from the packages in PackageMap."""

    @staticmethod
    @active_packages
    def load_all_tasks():
        """
        Try to load every task class name known to PackageMap, in sorted order.

        Returns a dict with two lists:
          'success': items shaped {"task_cls": <class>}
          'failure': items shaped {"err": <text>, "task_clsname": <name>},
                     where the text is the exception type, message and
                     formatted traceback.
        """
        loaded = list()
        broken = list()

        for clsname in sorted(PackageMap.task_clsname_to_package.keys()):
            try:
                task_cls = Loader.load_a_task_by_name(clsname)
            except Exception:
                exc_type, exc_value, exc_tb = sys.exc_info()
                tb_text = "".join(traceback.format_tb(exc_tb))
                err = str(exc_type) + ": " + str(exc_value) + "\n" + tb_text
                broken.append({"err": err, "task_clsname": clsname})
            else:
                loaded.append({"task_cls": task_cls})

        return {"success": loaded, "failure": broken}

    @staticmethod
    @active_packages
    def load_a_task_by_name(s1):
        """
        Import and return the task class named by `s1`.

        `s1` is normalised with Inflector: `classify` gives the class name,
        `underscore` gives the module file name under `<package>.luiti_tasks`.
        Asserts that the class name is present in PackageMap.
        """
        task_clsname_1 = Inflector().classify(s1)  # force convert
        task_filename_1 = Inflector().underscore(s1)  # force convert

        assert task_clsname_1 in PackageMap.task_clsname_to_package, u"""
        "%s" cannt be found. Auto converted class name is "%s", file name
        is "luiti_tasks/%s.py", please check it carefully.

        Already loaded PackageMap.task_clsname_to_package is %s.
        """ % (s1, task_clsname_1, task_filename_1, PackageMap.task_clsname_to_package)

        owner_package = PackageMap.task_clsname_to_package[task_clsname_1]
        package_path = owner_package.__name__ + ".luiti_tasks." + task_filename_1
        task_lib = lc.import2(package_path)
        return getattr(task_lib, task_clsname_1)
58 |
--------------------------------------------------------------------------------
/luiti/manager/package_map.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | import glob
5 | from inflector import Inflector
6 | from etl_utils import singleton, cached_property
7 | from collections import defaultdict
8 |
9 | from .config import luiti_config as lc
10 | from .active_packages import active_packages
11 |
12 |
@singleton()
class PackageMapClass(object):
    """Lookup tables between task class names and the packages holding them."""

    @cached_property
    @active_packages
    def task_clsname_to_package(self):
        """
        dict of task class name -> package module.

        Built by globbing `luiti_tasks/[a-z]*.py` inside the first `__path__`
        entry of every package in `lc.luiti_tasks_packages`; each file name is
        turned into a class name with `Inflector().classify`.

        Raises Exception when a package directory has no `__init__.py` on disk.
        """
        assert lc.luiti_tasks_packages, "At least have one project!"

        mapping = dict()
        for package in lc.luiti_tasks_packages:
            package_dir = package.__path__[0]

            # A normal package directory (not a zip file) has an __init__.py.
            init_file = os.path.join(package_dir, "__init__.py")
            if not os.path.exists(init_file):
                raise Exception(
                    """[setup.py format error] make sure """
                    """project "%s" zip_safe=False option exists!"""
                    % package.__name__)

            pattern = os.path.join(package_dir, "luiti_tasks/[a-z]*.py")
            for task_file in glob.glob(pattern):
                module_name = os.path.basename(task_file).rsplit(".", 1)[0]
                clsname = Inflector().classify(module_name)
                mapping[clsname] = package
        return mapping

    @cached_property
    def package_to_task_clsnames(self):
        """defaultdict of package module -> set of its task class names."""
        grouped = defaultdict(set)
        clsname_to_package = self.task_clsname_to_package
        for clsname in clsname_to_package:
            grouped[clsname_to_package[clsname]].add(clsname)
        return grouped
50 |
# Module-level PackageMapClass object that other modules import as `PackageMap`.
PackageMap = PackageMapClass()
52 |
--------------------------------------------------------------------------------
/luiti/manager/sys_argv.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["SysArgv"]
4 |
5 |
class SysArgv(object):
    """
    Modify sys.argv to fix luigi's command interface.
    """

    @staticmethod
    def convert_to_luigi_accepted_argv(subparsers, argv):
        """
        Return a copy of `argv` without the arguments that only luiti knows.

        :param subparsers: the argparse sub-parsers action; its sub-command
            names and every option string of every sub-parser count as
            luiti-only, except the options listed in `luigi_keep_opts`.
        :param argv: list of command line arguments; it is not modified.
        :returns: a new list with the luiti-only arguments removed.

        Removal rules, applied to each argument in order:
          1. a luiti-only word that does not start with "--" (sub-command
             names, short options) is dropped;
          2. "name=value" is dropped when `name` is luiti-only;
          3. a luiti-only "--option" is dropped together with the argument
             that follows it.
        """
        luigi_keep_opts = set(["--date-value"])

        # Collect into a set: works on both Python 2 and 3 (`keys() + list`
        # is Python 2 only) and makes the membership tests below O(1).
        luiti_only_opts = set(subparsers.choices.keys())
        for parser1 in subparsers._name_parser_map.values():
            luiti_only_opts.update(parser1._option_string_actions.keys())
        luiti_only_opts -= luigi_keep_opts

        delete_argv_idxes = set()
        for idx1, arg1 in enumerate(argv):
            if idx1 in delete_argv_idxes:
                continue  # already consumed as the value of a previous option
            # 1. remove tasks, files, run, etc.
            if (not arg1.startswith("--")) and (arg1 in luiti_only_opts):
                delete_argv_idxes.add(idx1)
                continue
            # 2. process --task-name and more params
            if "=" in arg1:
                arg2 = arg1.split("=", 1)[0]
                if arg2 in luiti_only_opts:
                    delete_argv_idxes.add(idx1)
            elif arg1 in luiti_only_opts:
                # the option and the value that follows it
                delete_argv_idxes.add(idx1)
                delete_argv_idxes.add(idx1 + 1)

        return [arg1 for idx1, arg1 in enumerate(argv)
                if idx1 not in delete_argv_idxes]
45 |
--------------------------------------------------------------------------------
/luiti/manager/table.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | from .dep import Dep
5 | import luigi
6 |
7 |
class Table(object):
    """
    Print task and package info as text tables.

    `Table.ld` is not defined in this class; it has to be assigned from
    outside before `print_task_info` is called.
    """

    @staticmethod
    def puts(task_body, task_headers, **opts):
        """
        Render the rows with `tabulate`, print the table between two blank
        lines and return the rendered text. Extra keyword options are passed
        on to `tabulate`.
        """
        from tabulate import tabulate
        result = tabulate(task_body, task_headers, **opts)
        print("")
        print(result)
        print("")
        return result

    @staticmethod
    def print_all_tasks(result):
        """
        Print every successfully loaded task, then any load failures.

        :param result: input from Loader.load_all_tasks, a dict with a
            'success' list of {"task_cls": cls} items and a 'failure' list of
            {"err": text, "task_clsname": name} items.
        :returns: tuple (task_body, task_headers) as passed to `puts`.
        """

        def task_inspect(task_cls, order):
            return [
                order,
                task_cls.__name__,
                task_cls.__module__.split(".")[0]
            ]

        # Sort by class name: sorting the item dicts themselves has no
        # meaningful order on Python 2 and is a TypeError on Python 3.
        success_items = sorted(
            result['success'], key=lambda item1: item1['task_cls'].__name__)

        task_headers = ["", "All Tasks", "luiti_package"]
        task_body = [task_inspect(item1['task_cls'], idx1 + 1)
                     for idx1, item1 in enumerate(success_items)]
        task_body.append(["total", len(result['success']), ""])

        Table.puts(task_body, task_headers, tablefmt="grid")

        if result['failure']:
            print("")
            print("[warn] failure parsed files")
            print("")
            for failure1 in result['failure']:
                print("[task_file]  %s" % (failure1['task_clsname'], ))
                print("[err]  %s" % (failure1['err'], ))
                print("")
        return (task_body, task_headers)

    @staticmethod
    def print_files_by_task_cls_and_date_range(curr_task, args, opts=None):
        """
        Print the run settings and the files related to a task/date range.

        :param curr_task: not used by this method.
        :param args: object with `task_name`, `date_range`, `dry` and `dep`
            attributes.
        :param opts: dict that must contain 'task_classes_count',
            'dep_file_to_task_instances' (file path -> task instance) and
            'dep_tasks_on_curr_task' (iterable of tasks with `root_dir`).
        :returns: tuple (file_table, file_headers) as passed to `puts`.
        """
        opts = opts or dict()
        # Print the dependent classes and the execution settings.
        task_headers = ["Current Env Key", "Current Env Value"]
        task_body = [
            ["task name", args.task_name],
            ["task date range", args.date_range],
            ["task execute mode", "DRY=" + str(args.dry)],
            ["task dep mode", "DEP=" + str(args.dep)],
            ["related task classes total count", opts['task_classes_count']],
        ]
        print("")
        print("Tasks related infos")
        Table.puts(task_body, task_headers, tablefmt="grid")

        # Print the list of files to delete.
        file_headers = ["Generated from task", "Storage",
                        "Date value", "Filename"]

        dep_file_to_task_instances = opts['dep_file_to_task_instances']
        file_table = [
            [dep_file_to_task_instances[f1].__class__.__name__,
             'HDFS', dep_file_to_task_instances[f1].date_str,
             os.path.basename(f1), ]
            for f1 in sorted(dep_file_to_task_instances.keys())]
        file_table.append(
            ['', '', '', "Total count %s" % len(dep_file_to_task_instances)])
        file_table.append(['', '', '', ''])
        file_uniq_root_dir = set(
            [t1.root_dir for t1 in opts['dep_tasks_on_curr_task']])
        file_table.append(
            ['All root dirs', '', '',
             'Total count %s' % len(file_uniq_root_dir)])
        for dir1 in file_uniq_root_dir:
            file_table.append(['', '', '', dir1])

        print("")
        print("Files related infos")
        Table.puts(file_table, file_headers, tablefmt="grid")
        print("\n" * 3)
        return (file_table, file_headers)

    @staticmethod
    def print_task_info(curr_task):
        """
        Print which tasks `curr_task` depends on and which depend on it.

        :param curr_task: a luigi.Task subclass that has `_ref_tasks`.
        :returns: tuple (task_content, task_headers) as passed to `puts`.
        """
        assert issubclass(curr_task, luigi.Task)

        dep_tasks_on_curr_task = Dep.find_dep_on_tasks(
            curr_task, Table.ld.all_task_classes)

        task_headers = ["Task name", curr_task.__name__]
        task_content = [
            ["Tasks self dep on", str(list(curr_task._ref_tasks))],
            ["Tasks dep on self",
             str(sorted([t2.__name__ for t2 in dep_tasks_on_curr_task]))],
        ]
        Table.puts(task_content, task_headers, tablefmt="grid")
        return (task_content, task_headers)
108 |
--------------------------------------------------------------------------------
/luiti/schedule/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["SensorSchedule"]
4 |
5 | from .sensor_schedule import SensorSchedule
6 |
--------------------------------------------------------------------------------
/luiti/task_templates/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["TaskHour",
4 | "TaskHourHadoop",
5 | "TaskDay",
6 | "TaskDayHadoop",
7 | "TaskWeek",
8 | "TaskWeekHadoop",
9 | "TaskBiweekly",
10 | "TaskBiweeklyHadoop",
11 | "TaskMonth",
12 | "TaskMonthHadoop",
13 | "TaskQuarter",
14 | "TaskQuarterHadoop",
15 | "TaskYear",
16 | "TaskYearHadoop",
17 | "TaskRange",
18 | "TaskRangeHadoop",
19 |
20 | "StaticFile",
21 | "HiveTask",
22 | "MongoImportTask", ]
23 |
24 |
25 | from .time.task_hour import TaskHour
26 | from .time.task_day import TaskDay
27 | from .time.task_week import TaskWeek
28 | from .time.task_biweekly import TaskBiweekly
29 | from .time.task_month import TaskMonth
30 | from .time.task_quarter import TaskQuarter
31 | from .time.task_year import TaskYear
32 | from .time.task_range import TaskRange
33 |
34 | from .time.task_hour_hadoop import TaskHourHadoop
35 | from .time.task_day_hadoop import TaskDayHadoop
36 | from .time.task_week_hadoop import TaskWeekHadoop
37 | from .time.task_biweekly_hadoop import TaskBiweeklyHadoop
38 | from .time.task_month_hadoop import TaskMonthHadoop
39 | from .time.task_quarter_hadoop import TaskQuarterHadoop
40 | from .time.task_year_hadoop import TaskYearHadoop
41 | from .time.task_range_hadoop import TaskRangeHadoop
42 |
43 | from .other.static_file import StaticFile
44 | from .other.mongo_import_task import MongoImportTask
45 | from .other.hive_task import HiveTask
46 |
--------------------------------------------------------------------------------
/luiti/task_templates/other/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dchentech/luiti/11a5c62b265a92910a1d4c82431e3697b8b06814/luiti/task_templates/other/__init__.py
--------------------------------------------------------------------------------
/luiti/task_templates/other/hive_task.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["HiveTask"]
4 |
5 |
6 | from etl_utils import cached_property
7 | from luigi.contrib.hive import HiveQueryTask
8 |
9 | from ...utils import TargetUtils
10 | from ...luigi_extensions import luigi, TaskBase
11 |
12 |
@luigi.as_a_luiti_task()
class HiveTask(HiveQueryTask, TaskBase):
    """
    Hive SQL template that follows luiti's `date_value` date mode.


    Implement:
      1. use_hive_db (`hive_db` is the deprecated name for the same setting)
      2. sql_main

    Example:
        from luiti.task_templates import HiveTask

        class AnotherHiveDay(HiveTask):
            root_dir = "/another/hive/result/"
            use_hive_db = "main_hive_database"

            @cached_property
            def sql_main(self):
                return "select * from example_table;"

    """

    run_mode = "mr_distribute"

    def output(self):
        """ A Hive query's default output directory has no _SUCCESS file and its chunk filenames are not MR style; see more details at `TargetUtils.hdfs_dir` . """
        # Guard against a path that was built from a stringified ValueError.
        assert "ValueError" not in self.data_file, self.data_file
        return TargetUtils.hdfs_dir(self.data_file)

    def query(self):
        """
        Build the one-line Hive statement: `USE <use_hive_db>;` followed by
        `INSERT OVERWRITE DIRECTORY "<data_file>"` and the stripped `sql_main`.
        The SQL is printed when `run_mode` is "mr_distribute".
        """
        sql = u"""
        USE %s;
        INSERT OVERWRITE DIRECTORY "%s" %s
        """.replace("\n", " ") % (self.use_hive_db, self.data_file, self.sql_main.strip())

        if self.run_mode == "mr_distribute":
            print "[info.luiti] run Hive SQL := %s" % sql

        return sql.strip()

    @cached_property
    def data_root(self):
        """ Old API; always raises ValueError pointing at `root_dir`. """
        raise ValueError("Old API. Please use luiti's standard property `root_dir` instead.")

    @cached_property
    def root_dir(self):
        """
        Fallback for subclasses that do not set `root_dir`: delegates to the
        old `data_root` API, otherwise raises ValueError.
        """
        # or a cached_property
        # NOTE(review): `self.__class__.data_root` is looked up on the class,
        # so its value is whatever `cached_property` yields for class access;
        # confirm this comparison can ever be false for a property.
        if self.__class__.data_root not in [NotImplementedError, ValueError]:
            return self.data_root  # from instance
        raise ValueError

    @cached_property
    def use_hive_db(self):
        """ Hive database name; falls back to the deprecated `hive_db`, raises ValueError when that is unset. """
        if self.hive_db is not NotImplementedError:
            return self.hive_db
        raise ValueError

    # Deprecated API, use `use_hive_db` instead.
    hive_db = NotImplementedError

    @cached_property
    def sql_main(self):
        """
        The query body appended after INSERT OVERWRITE DIRECTORY.
        Must be implemented in the subclass; raises ValueError here.
        """
        raise ValueError
80 |
--------------------------------------------------------------------------------
/luiti/task_templates/other/mongo_import_task.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from etl_utils import process_notifier, cached_property
4 | import luigi
5 | import os
6 | import arrow
7 | import json
8 |
9 | from ...luigi_extensions import TaskBase
10 | from ...utils import CommandUtils, TargetUtils
11 |
12 |
class MongoImportTask(TaskBase):
    """
    Copy stat files to MongoDB.

    Steps:
      1. download file from HDFS.
      2. Make some indexes on MongoDB if needed.
      3. Run `mongoimport` to import data.
      4. Update report_status collection in MongoDB.

    Required:
      1. Must be JSON file

    Subclasses must provide `source_task`, `mongodb_connection`,
    `database_name`, `index_schema` and `report_name`.
    """

    # Collection that stores the single status document, and the key inside
    # that document under which per-report entries are written.
    report_status_collection_name = "report_status"
    report_status_namespace = "latestCollection"
    # Key of this report inside the status document; subclasses set it.
    report_name = NotImplementedError

    system_tmp = "/tmp"  # default

    @cached_property
    def mongodb_db(self):
        """ Database object: `mongodb_connection[database_name]`. """
        return self.mongodb_connection[self.database_name]

    @cached_property
    def mongodb_connection_address(self):
        """ e.g. ('192.168.20.111', 37001) """
        methods = dir(self.mongodb_connection)
        result = None

        # Compatible with several pymongo API shapes. The checks are not
        # exclusive: a later match overwrites an earlier result.
        if "address" in methods:
            result = getattr(self.mongodb_connection, "address")
        if "connection" in methods:
            result = getattr(self.mongodb_connection, "connection").address
        if ("port" in methods) and ("host" in methods):
            result = (self.mongodb_connection.host, self.mongodb_connection.port)
        if result:
            assert len(result) == 2, result
            return result
        else:
            raise ValueError(self.mongodb_connection)

    @cached_property
    def mongodb_connection_host(self):
        """ First element of `mongodb_connection_address`. """
        return self.mongodb_connection_address[0]

    @cached_property
    def mongodb_connection_port(self):
        """ Second element of `mongodb_connection_address`. """
        return self.mongodb_connection_address[1]

    @cached_property
    def report_status_collection_model(self):
        """ Collection object for `report_status_collection_name`. """
        return self.mongodb_db[self.report_status_collection_name]

    @cached_property
    def data_file_collection_model(self):
        """ Collection object for `collection_name` (the import target). """
        return self.mongodb_db[self.collection_name]

    # 1. config
    @cached_property
    def source_task(self):
        """ Task class whose `data_file` is imported; called with `date_value`. """
        raise NotImplementedError

    @cached_property
    def mongodb_connection(self):
        """ Connection object; indexed by database name and inspected for its address. """
        raise NotImplementedError

    @cached_property
    def database_name(self):
        """ Name of the target MongoDB database. """
        raise NotImplementedError

    @cached_property
    def index_schema(self):
        """ Index spec passed to `ensureIndex`; a string, or a value that `json.dumps` can serialise. """
        raise NotImplementedError

    def run_before_hook(self):
        """ Called at the very start of `run`; no-op by default. """
        pass

    def run_after_hook(self):
        """ Called at the end of a `run` that performed the import; no-op by default. """
        pass

    # 2. common
    def requires(self):
        """ One instance per name in `_ref_tasks`, each built with `date_value`. """
        return [getattr(self, _ref_task_1)(self.date_value)
                for _ref_task_1 in self._ref_tasks]

    def run(self):
        """
        Import the source task's data file into MongoDB.

        Returns False (without running `run_after_hook`) when the target
        collection already has documents, True after a completed import.
        """
        self.run_before_hook()

        # 1. check is already done.
        if self.is_collection_exists():
            print "[info] %s already exists!" % (self.data_file_collection_model, )
            return False

        # 2. check report status collection is valid
        if self.report_status_collection_model.count() == 0:
            self.report_status_collection_model.insert(
                {self.report_status_namespace: {}})
        # The assertion message says: there must be exactly one status record.
        assert self.report_status_collection_model.count() == 1, "更新纪录 只能有一条!"

        # 3. copy the source lines into a local temporary file
        data_file1 = self.source_task_instance.data_file
        source1 = luigi.HDFS(data_file1)
        tmp_file1 = open(self.tmp_filepath, 'w')

        for line1 in process_notifier(
                TargetUtils.line_read(source1), u"[read lines] %s" % source1):
            tmp_file1.write(line1 + "\n")
        tmp_file1.close()

        # 4. upload to mongodb (both values are shell command strings)
        CommandUtils.execute(self.mongo_ensure_index)
        CommandUtils.execute(self.mongoimport_command)

        # 5. clean tmp
        CommandUtils.execute("rm -f %s" % self.tmp_filepath)

        # 6. update report status
        item1 = self.report_status_collection_model.find()[0]
        del item1['_id']  # `_id` is removed so it is not part of the $set
        item1[self.report_status_namespace][self.report_name] = {
            'collection_name': self.collection_name,
            'updated_at': arrow.now().datetime,
        }
        self.report_status_collection_model.find_and_modify(
            query={},
            update={"$set": item1},
            full_response=True
        )

        self.run_after_hook()

        return True

    def is_collection_exists(self):
        """ True when the target collection already holds at least one document. """
        return self.data_file_collection_model.count() > 0

    @cached_property
    def source_task_instance(self):
        """ `source_task` instantiated with this task's `date_value`. """
        return self.source_task(self.date_value)

    @cached_property
    def mongoimport_command(self):
        """ Shell command string that runs /usr/bin/mongoimport on `tmp_filepath`. """
        return "/usr/bin/mongoimport " + \
            ("--host %s " % self.mongodb_connection_host) + \
            ("--port %s " % self.mongodb_connection_port) + \
            ("--db %s " % self.database_name) + \
            ("--collection %s " % self.collection_name) + \
            ("--file %s" % self.tmp_filepath)

    @cached_property
    def mongo_ensure_index(self):
        """ Shell command string that calls `ensureIndex` on the target collection. """
        # A non-string schema is replaced, on the instance, by its JSON text.
        if not isinstance(self.index_schema, basestring):
            self.index_schema = json.dumps(self.index_schema)
        js_str = "db.%s.ensureIndex(%s)" % \
            (self.collection_name, self.index_schema)
        return self.mongo_eval(js_str)

    def mongo_eval(self, js_str):
        """ Shell command string that runs `js_str` through /usr/bin/mongo --eval. """
        # NOTE(review): js_str is wrapped in double quotes without escaping.
        return "/usr/bin/mongo " + \
            ("%s:%s/%s " % (self.mongodb_connection_host, self.mongodb_connection_port, self.database_name)) + \
            ("--eval \"%s\" " % js_str)

    @cached_property
    def collection_name(self):
        """ e.g. redmine5954_parent_report_week_20140901 """
        return self.data_name + "_" + self.date_value.strftime("%Y%m%d")

    @cached_property
    def tmp_filepath(self):
        """ Local temporary file: `<tmp_dir>/<date_value as YYYYMMDD>`. """
        return self.tmp_dir + "/" + self.date_value.strftime("%Y%m%d")

    @cached_property
    def tmp_dir(self):
        """ `<system_tmp>/<task class name>`; created with `mkdir -p` on first access. """
        dir1 = os.path.join(self.system_tmp, self.task_class.__name__)
        os.system("mkdir -p %s" % dir1)
        return dir1
191 |
--------------------------------------------------------------------------------
/luiti/task_templates/other/static_file.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 |
4 | from etl_utils import cached_property
5 | from ...luigi_extensions import luigi
6 | from ...utils import TargetUtils
7 |
8 |
class StaticFile(luigi.ExternalTask):
    """
    A task whose output is produced by an outside system.

    Plain luigi cannot wait for such outputs, so luiti schedules the task DAG
    itself and lets a task wait before it is submitted to `luigid`. See
    luiti.schedule for details.
    """

    is_external = True  # see more documents at TaskBase
    data_file = None  # The same as luiti.TaskBase
    filepath = None  # Deprecated

    # Available target factories; the first one (HDFS) is the default.
    io_devices = [TargetUtils.hdfs, luigi.LocalTarget]

    # Mimic default luigi.ExternalTask
    def run(self):
        pass

    def complete(self):
        return True

    @cached_property
    def IODevice(self):
        return self.io_devices[0]  # default is HDFS

    def output(self):
        """Target for `data_file`, built with `IODevice`."""
        # Compatible with old API `filepath`
        data_file_unset = self.data_file in [NotImplementedError, None]
        if data_file_unset and isinstance(self.filepath, basestring):
            self.data_file = self.filepath

        assert self.data_file, u"Please assign `data_file` !"
        make_target = self.IODevice
        return make_target(self.data_file)
41 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The API is listed in the parent package's __init__.py.
3 |
4 |
5 | Example:
6 | class TaskDayHadoop(luigi.hadoop.HadoopExt, TaskDay):
7 | pass
8 |
9 | TaskDay.__init__ will overwrite luigi.hadoop.HadoopExt's.
10 |
11 |
12 | NOTE: luigi.hadoop.HadoopExt will overwrite TaskDay
13 |
14 | """
15 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_biweekly.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskBiweekly']
4 |
5 | from ...luigi_extensions import TaskBase
6 |
7 |
class TaskBiweekly(TaskBase):
    """Biweekly task template; defines nothing beyond what TaskBase provides."""
    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_biweekly_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_biweekly import TaskBiweekly
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskBiweeklyHadoop(luigi.hadoop.HadoopExt, TaskBiweekly):
    """
    TaskBiweekly combined with luigi.hadoop.HadoopExt. HadoopExt is listed
    first, so it precedes TaskBiweekly in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_day.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskDay']
4 |
5 | from ...luigi_extensions import TaskBase
6 | import arrow
7 | from etl_utils import cached_property
8 |
class TaskDay(TaskBase):
    """Day task template with cached trailing-day ranges ending at `date_value`."""

    @cached_property
    def latest_7_days(self):
        """Day range from `date_value.replace(days=-6)` up to `date_value`."""
        # presumably arrow's plural-unit `replace` shifts the date back;
        # confirm against the installed arrow version.
        first_day = self.date_value.replace(days=-6)
        last_day = self.date_value
        return arrow.Arrow.range('day', first_day, last_day)

    @cached_property
    def latest_30_days(self):
        """Day range from `date_value.replace(days=-29)` up to `date_value`."""
        first_day = self.date_value.replace(days=-29)
        last_day = self.date_value
        return arrow.Arrow.range('day', first_day, last_day)
24 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_day_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_day import TaskDay
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskDayHadoop(luigi.hadoop.HadoopExt, TaskDay):
    """
    TaskDay combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskDay in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_hour.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskHour']
4 |
5 | from ...luigi_extensions import TaskBase
6 |
7 |
class TaskHour(TaskBase):
    """Hourly task template; defines nothing beyond what TaskBase provides."""

    pass
11 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_hour_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_hour import TaskHour
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskHourHadoop(luigi.hadoop.HadoopExt, TaskHour):
    """
    TaskHour combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskHour in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_month.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskMonth']
4 |
5 | from etl_utils import cached_property
6 | from ...luigi_extensions import TaskBase
7 | import arrow
8 |
9 |
class TaskMonth(TaskBase):
    """Month task template."""

    @cached_property
    def days_in_month(self):
        """Day range from the floor to the ceil of `date_value`'s month."""
        month_start = self.date_value.floor('month')
        month_end = self.date_value.ceil('month')
        return arrow.Arrow.range('day', month_start, month_end)
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_month_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_month import TaskMonth
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskMonthHadoop(luigi.hadoop.HadoopExt, TaskMonth):
    """
    TaskMonth combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskMonth in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_quarter.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskQuarter']
4 |
5 | from ...luigi_extensions import TaskBase
6 |
7 |
class TaskQuarter(TaskBase):
    """Quarterly task template; defines nothing beyond what TaskBase provides."""

    pass
11 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_quarter_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_quarter import TaskQuarter
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskQuarterHadoop(luigi.hadoop.HadoopExt, TaskQuarter):
    """
    TaskQuarter combined with luigi.hadoop.HadoopExt. HadoopExt is listed
    first, so it precedes TaskQuarter in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_range.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskRange']
4 |
5 | from ...luigi_extensions import TaskBase
6 | from ...utils import DateUtils
7 |
8 |
class TaskRange(TaskBase):
    """
    Task template for a date range.

    The range is read from `date_range` as text shaped like
    "2014-10-01-2014-10-07" (two 10-character dates joined by one character).
    """

    # NOTE both date_value and date_range are required.
    # 1. date_value decides which date directory the output is written to.
    # 2. date_range specifies the date range that is depended on.

    def date_range(self):
        """Placeholder; subclasses must replace it with the range value."""
        raise ValueError("Overwrite Me!")
    # date_range = luigi.DateIntervalParameter()
    # date_range = luigi.Parameter()  # temporarily a str type for now

    @property
    def dates_in_range(self):
        """
        List of what `DateUtils.weeks_in_range` yields for the two dates of
        `date_range`.

        :raises ValueError: when `date_range` was not overwritten.
        """
        date_range = self.date_range
        # When `date_range` is still a method (the placeholder above, or a
        # subclass method), call it instead of slicing the method object;
        # the placeholder then raises its "Overwrite Me!" ValueError rather
        # than an unrelated TypeError.
        if callable(date_range):
            date_range = date_range()

        # method_1 = self.date_type + "s_in_range" # e.g. weeks_in_range
        method_1 = 'week' + "s_in_range"  # NOTE fixed to week for now, since this is a range.

        # s1 = "2014-10-01-2014-10-07"
        # s1[0:10] => '2014-10-01'
        # s1[11:21] => '2014-10-07'
        date_1, date_2 = date_range[0:10], date_range[11:21]

        return list(getattr(DateUtils, method_1)(date_1, date_2))
31 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_range_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskRangeHadoop']
4 |
5 | from .task_range import TaskRange
6 | from ...luigi_extensions import luigi
7 |
8 |
class TaskRangeHadoop(luigi.hadoop.HadoopExt, TaskRange):
    """
    TaskRange combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskRange in the method resolution order.
    """

    pass
12 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_week.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskWeek']
4 |
5 | from etl_utils import cached_property
6 | from ...luigi_extensions import TaskBase
7 | from ...utils import DateUtils
8 |
9 |
class TaskWeek(TaskBase):
    """Week task template."""

    @cached_property
    def days_in_week(self):
        """List of what `DateUtils.days_in_week` yields for `date_value`."""
        week_days = DateUtils.days_in_week(self.date_value)
        return list(week_days)

    def requires_with_prev_week(self, ref_task1):
        """ require days in current week, and stat data in previous week """
        total_tasks = []
        for date1 in self.days_in_week:
            total_tasks.append(ref_task1(date_value=date1))

        # Only add the previous task when it is an instance of this task's
        # own class, i.e. if it's not RootTask.
        previous_task = self.pre_task_by_self
        if isinstance(previous_task, self.task_class):
            total_tasks.append(previous_task)

        return total_tasks
25 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_week_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ["TaskWeekHadoop"]
4 |
5 | from .task_week import TaskWeek
6 | from ...luigi_extensions import luigi
7 |
8 |
class TaskWeekHadoop(luigi.hadoop.HadoopExt, TaskWeek):
    """
    TaskWeek combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskWeek in the method resolution order.
    """
    pass
11 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_year.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['TaskYear']
4 |
5 | from ...luigi_extensions import TaskBase
6 |
7 |
class TaskYear(TaskBase):
    """Yearly task template; defines nothing beyond what TaskBase provides."""

    pass
11 |
--------------------------------------------------------------------------------
/luiti/task_templates/time/task_year_hadoop.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .task_year import TaskYear
4 | from ...luigi_extensions import luigi
5 |
6 |
class TaskYearHadoop(luigi.hadoop.HadoopExt, TaskYear):
    """
    TaskYear combined with luigi.hadoop.HadoopExt. HadoopExt is listed first,
    so it precedes TaskYear in the method resolution order.
    """

    pass
10 |
--------------------------------------------------------------------------------
/luiti/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __all__ = ['MrTestCase', "SetupLuitiPackages", "date_begin"]
4 |
5 |
6 | from .mr_test_case import MrTestCase
7 | from .setup_luiti_packages import SetupLuitiPackages
8 |
# Date string exported for tests; it equals MrTestCase's default `date_value`.
date_begin = "2014-09-01"
10 |
--------------------------------------------------------------------------------
/luiti/tests/mr_test_case.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __all__ = ['MrTestCase']
4 |
5 |
6 | from collections import defaultdict
7 | import json
8 |
9 | from ..manager import Loader
10 |
11 |
def MrTestCase(cls, verbose=False, date_value="2014-09-01"):
    """
    Class decorator that integrates map-reduce test data into a test class.

    For every name in `cls.mr_task_names` a `test_<name>` method is added to
    `cls`. Each generated test instantiates the task with `date_value`, feeds
    the lines of `mrtest_input()` through `run_map_reduce` and compares the
    result with the sorted JSON lines of `mrtest_output()`.

    :param cls: test class that defines `mr_task_names`.
    :param verbose: print progress information when true.
    :param date_value: passed as `date_value` to every task class.
    :returns: `cls`, with `maxDiff` set to None and the test methods added.
    :raises AssertionError: if `cls` has no `mr_task_names`.
    """

    assert hasattr(cls, "mr_task_names"), \
        "%s must assgin some task names!" % cls

    cls.maxDiff = None  # compact large json diff

    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    def log(*parts):
        # Formatting is skipped entirely unless verbose output was requested.
        if verbose:
            print(" ".join([str(part1) for part1 in parts]))

    def map_lines(text):
        """Stripped, non-empty lines of `text`."""
        assert isinstance(text, text_type)
        stripped = [l1.strip() for l1 in text.split("\n")]
        return [l1 for l1 in stripped if l1]

    def generate_closure_function(mr_task_name1):
        # Loaded here, while the class is being decorated, and bound into the
        # closure so that each test keeps its own task class.
        task_cls = Loader.load_a_task_by_name(mr_task_name1)  # keep it!
        log("[task_cls]", task_cls)

        def test_mr(self):
            task_instance_1 = task_cls(date_value=date_value)
            log("[task_instance]", task_instance_1)

            task_instance_1.lines = map_lines(task_instance_1.mrtest_input())
            result_expect = sorted(
                [read_json_from_mrtest_output(i2, idx + 1) for idx, i2
                 in enumerate(map_lines(task_instance_1.mrtest_output()))])

            self.assertEqual(result_expect, run_map_reduce(task_instance_1))
        return test_mr

    for mr_task_name1 in cls.mr_task_names:
        test_method_name = "test_" + mr_task_name1
        log()
        log("[test_method_name]", test_method_name)

        setattr(
            cls, test_method_name, generate_closure_function(mr_task_name1))

    log()
    log()

    return cls
65 |
66 |
def run_map_reduce(task_instance_1):
    """
    Run a task's mapper and reducer in-process and return the sorted results.

    The task must provide ``mrtest_attrs()`` (a dict of attributes to set on
    the instance), ``lines`` (input lines), ``mapper(line)`` yielding
    ``(key, value)`` pairs and ``reducer(key, values)`` yielding
    ``(key, json_string)`` pairs.

    :param task_instance_1: the task under test; it is mutated by step 1.
    :return: sorted list of the JSON-decoded reducer values (reducer keys are
        discarded).  The values must be mutually orderable.
    """
    # 1. bind attrs
    # items() instead of the Python-2-only iteritems(); same pairs, works on
    # both major versions.
    for attr_name, attr_value in task_instance_1.mrtest_attrs().items():
        setattr(task_instance_1, attr_name, attr_value)

    # 2. map it! Group every mapped value under its key, like the shuffle
    # phase would.
    mapper_key_to_vals = defaultdict(list)
    for line1 in task_instance_1.lines:
        for key_1, val_1 in task_instance_1.mapper(line1.strip()):
            mapper_key_to_vals[key_1].append(val_1)

    # 3. reduce it! The reducer receives an iterator, not a list.
    result_list = []
    for key_1, vals_1 in mapper_key_to_vals.items():
        for _, val_2 in task_instance_1.reducer(key_1, iter(vals_1)):
            result_list.append(json.loads(val_2))
    return sorted(result_list)
85 |
86 |
def read_json_from_mrtest_output(line, num):
    """
    Parse one expected-output line as JSON.

    :param line: JSON text of a single fixture line.
    :param num: line number, only used in the diagnostic message.
    :return: the decoded JSON value.
    :raises Exception: whatever ``json.loads`` raised; the offending line and
        its number are printed first so the broken fixture is easy to find.
    """
    try:
        return json.loads(line)
    except Exception:
        print(u"[line#%s] %s" % (num, line))
        # Bare raise keeps the original traceback ("raise e" drops it on
        # Python 2).
        raise
94 |
--------------------------------------------------------------------------------
/luiti/tests/setup_luiti_packages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __all__ = ['SetupLuitiPackages']
4 |
5 | import os
6 | import sys
7 | from etl_utils import cached_property, singleton
8 |
9 |
@singleton()
class SetupLuitiPackagesClass(object):
    """Prepares ``sys.path`` and the luiti config for the test packages."""

    @cached_property
    def config(self):
        """
        Put the test packages on ``sys.path`` and return luiti's ``config``
        with ``curr_project_dir`` pointing at the ``luiti_summary`` package.
        """
        # The directory three levels above this file.
        root_dir = os.path.abspath(__file__)
        for _ in range(3):
            root_dir = os.path.dirname(root_dir)
        assert os.path.exists(root_dir), root_dir

        webui_root = os.path.join(root_dir, "tests/webui_packages")
        search_paths = [
            os.path.join(webui_root, "luiti_" + suffix)
            for suffix in ("dump", "clean", "middle", "summary", "webui_tests")
        ]
        for relative_dir in ("tests",
                             "tests/project_A",
                             "tests/project_B",
                             "tests/zip_package_by_luiti"):
            search_paths.append(os.path.join(root_dir, relative_dir))

        # Each path goes to the front, so the last one listed is searched first.
        for search_path in search_paths:
            sys.path.insert(0, search_path)

        # setup env
        from luiti import config
        config.curr_project_dir = os.path.join(
            root_dir, "tests/webui_packages/luiti_summary")

        return config

SetupLuitiPackages = SetupLuitiPackagesClass()
36 |
--------------------------------------------------------------------------------
/luiti/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = [
4 | "IOUtils",
5 | "TargetUtils",
6 | "MRUtils",
7 | "MathUtils",
8 | "HDFSUtils",
9 | "CommandUtils",
10 | "CompressUtils",
11 | "DateUtils",
12 | "ExtUtils",
13 | "VisualiserEnvTemplate"
14 | ]
15 |
16 | from .io_utils import IOUtils
17 | from .target_utils import TargetUtils
18 | from .mr_utils import MRUtils
19 | from .math_utils import MathUtils
20 | from .hdfs_utils import HDFSUtils
21 | from .command_utils import CommandUtils
22 | from .date_utils import DateUtils
23 | from .compress_utils import CompressUtils
24 | from .ext_utils import ExtUtils
25 | from .visualiser_env_template import VisualiserEnvTemplate
26 |
--------------------------------------------------------------------------------
/luiti/utils/command_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 |
5 |
class CommandUtils:
    """Helper for running shell commands."""

    @staticmethod
    def execute(command_str, dry=False, verbose=True):
        """
        Run ``command_str`` through the system shell.

        :param command_str: the complete shell command line.
        :param dry: when true, do not run anything and return False.
        :param verbose: when true, print the command before running it.
        :return: False for a dry run, otherwise the value of ``os.system``.
        """
        if verbose:
            # Single-argument form: prints the same text on Python 2 and 3.
            print("[command] %s" % (command_str,))
        if dry:
            return False

        # os.system lets the command write its logs straight to the console
        # in realtime instead of capturing them.
        return os.system(command_str)
17 |
--------------------------------------------------------------------------------
/luiti/utils/compress_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import os
4 | import glob
5 | from .command_utils import CommandUtils
6 | from .hdfs_utils import HDFSUtils
7 |
8 |
class CompressUtils:
    """Helpers for archives stored on HDFS."""

    @staticmethod
    def unzip_with_upload(orig_filepath, hdfs_filepath,
                          tmp_dir=NotImplementedError,
                          tmp_name=NotImplementedError):
        """
        Extract an archive stored on HDFS and upload its single payload.

        1. Download ``orig_filepath`` from HDFS to ``<tmp_dir>/<tmp_name>``.
        2. Extract it with ``tar xzvf`` into ``<tmp_dir>/unzip``.
        3. Walk down the extracted directories (every level must hold exactly
           one entry) and upload the file found to ``hdfs_filepath``.

        The local archive copy and the extraction directory are removed
        afterwards, also when one of the steps fails.

        :param orig_filepath: HDFS path of the archive.
        :param hdfs_filepath: HDFS path the extracted file is uploaded to.
        :param tmp_dir: local working directory (required).
        :param tmp_name: local file name for the downloaded archive (required).
        :return: True
        :raises ValueError: when ``tmp_dir``/``tmp_name`` are missing, the
            archive does not exist, or a directory level does not contain
            exactly one entry.
        """
        # The defaults are only "must be given" markers; fail with a clear
        # message instead of an obscure error inside os.path.join.
        if tmp_dir is NotImplementedError or tmp_name is NotImplementedError:
            raise ValueError("tmp_dir and tmp_name must be provided!")

        # 1. check
        if not HDFSUtils.exists(orig_filepath):
            raise ValueError("[hdfs] %s not exists!" % orig_filepath)

        tmp_local_target = os.path.join(tmp_dir, tmp_name)
        unzip_dir = tmp_dir + "/unzip"
        try:
            # 2. pull file from hdfs
            HDFSUtils.copyToLocal(orig_filepath, tmp_local_target)

            # 3. unzip
            CommandUtils.execute("mkdir -p %s" % unzip_dir)
            CommandUtils.execute(
                "tar xzvf %s -C %s" % (tmp_local_target, unzip_dir))

            # The archive may nest the payload in several directory levels.
            unzip_file = unzip_dir
            while os.path.isdir(unzip_file):
                next_dirs = glob.glob(unzip_file + "/*")
                if len(next_dirs) > 1:
                    raise ValueError(
                        "%s should only one dir in a zip file!" % unzip_file)
                if len(next_dirs) == 0:
                    raise ValueError(
                        "%s must always exists one file or one dir in a zip file, "
                        "but there are %s ." % (unzip_file, str(next_dirs)))
                unzip_file = next_dirs[0]

            # 4. push file to hdfs
            HDFSUtils.copyFromLocal(unzip_file, hdfs_filepath)
        finally:
            # Do not leave temporary data behind, whatever happened above.
            CommandUtils.execute("rm -rf %s" % unzip_dir)
            CommandUtils.execute("rm -rf %s" % tmp_local_target)
        return True
51 |
--------------------------------------------------------------------------------
/luiti/utils/date_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | __all__ = ['DateUtils']
4 |
5 | import arrow
6 |
7 |
class DateUtils:
    """Date helpers built on the ``arrow`` library."""

    # Expose the arrow module itself as DateUtils.arrow.
    arrow = arrow

    @staticmethod
    def arrow_str(arrow1):
        """Format any value ``arrow.get`` accepts as ``YYYY-MM-DD``."""
        moment = arrow.get(arrow1)
        return moment.datetime.strftime("%Y-%m-%d")

    @staticmethod
    def days_in_week(arrow1):
        """Return the days from the floor to the ceil of ``arrow1``'s week."""
        moment = arrow.get(arrow1)
        week_start = moment.floor('week')
        week_end = moment.ceil('week')
        return arrow.Arrow.range('day', week_start, week_end)

    @staticmethod
    def weeks_in_range(arrow1, arrow2):
        """
        Return the week steps from the start of ``arrow1``'s week to the end
        of ``arrow2``'s week.
        """
        range_start = arrow.get(arrow1).floor('week')
        range_end = arrow.get(arrow2).ceil('week')
        return arrow.Arrow.range('week', range_start, range_end)

    @staticmethod
    def fixed_weeks_in_range(date_range_str):
        """
        Return only the weeks that are fully covered by the range; in the
        worst case both the first and the last week are dropped.

        ``date_range_str`` looks like "2014-09-01-2014-11-19".
        """
        assert len(date_range_str) == 21  # e.g. "2014-09-01-2014-11-19"
        first_date = arrow.get(date_range_str[0:10])
        last_date = arrow.get(date_range_str[11:21])

        weeks = DateUtils.weeks_in_range(first_date, last_date)
        if len(weeks) == 0:
            return weeks

        # A range that does not end on a Sunday leaves its last week
        # incomplete, so that week is dropped.
        if last_date.weekday() != 6:  # 6 index is Sunday
            weeks = weeks[:-1]
        # Same for a range that does not start on a Monday.
        if first_date.weekday() != 0:  # 0 index is Monday
            weeks = weeks[1:]
        return weeks

    @staticmethod
    def date_value_by_type_in_last(date_value_1, date_type_1):
        """
        Return the floor of the previous ``date_type_1`` period (e.g. 'week')
        relative to ``date_value_1``.
        """
        moment = arrow.get(date_value_1)
        # NOTE(review): relies on replace() treating a plural unit keyword
        # (e.g. weeks=-1) as a relative offset - confirm against the pinned
        # arrow version.
        shifted = moment.replace(**{(date_type_1 + 's'): -1})
        return shifted.floor(date_type_1)
50 |
--------------------------------------------------------------------------------
/luiti/utils/ext_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from etl_utils import cached_property
4 |
5 |
class ExtUtils(object):

    class ExtendClass(object):

        """
        Mixin that lets a class be extended at runtime while keeping
        `property` and `cached_property` attributes callable in the same way
        as before.
        """

        @classmethod
        def extend(cls, attrs):
            """
            Set every ``name: value`` pair of ``attrs`` on ``cls``.

            When the attribute being replaced is a ``property`` (or a
            ``cached_property``) and the new value is not, the new value is
            wrapped in the same descriptor type first.
            """
            assert isinstance(attrs, dict), attrs

            def match_descriptor(current, value):
                # Convert the input to the type of the attribute it replaces.
                if isinstance(current, property) and \
                        (not isinstance(value, property)):
                    return property(value)
                if isinstance(current, cached_property) and \
                        (not isinstance(value, cached_property)):
                    return cached_property(value)
                return value

            for name, value in attrs.iteritems():
                current = getattr(cls, name, None)
                setattr(cls, name, match_descriptor(current, value))
33 |
--------------------------------------------------------------------------------
/luiti/utils/hdfs_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from .command_utils import CommandUtils
4 | from .target_utils import TargetUtils
5 |
6 |
class HDFSUtils:
    """
    Thin wrappers around the HDFS command line client.

    ``hdfs_cli`` must be set to the client command prefix before any of the
    command methods is used.  Every command is run through
    ``CommandUtils.execute``, which also prints it, and each command method
    returns what ``CommandUtils.execute`` returned.
    """

    # Command prefix of the HDFS client; stays NotImplemented until the
    # application configures it.
    hdfs_cli = NotImplemented

    @staticmethod
    def _run(arguments):
        """Run ``<hdfs_cli> <arguments>``; shared by all command methods."""
        if HDFSUtils.hdfs_cli is NotImplemented:
            raise NotImplementedError(
                "HDFSUtils.hdfs_cli must be configured before running "
                "HDFS commands!")
        # CommandUtils.execute prints "[command] ..." itself, so the command
        # is not printed here a second time.
        return CommandUtils.execute(HDFSUtils.hdfs_cli + " " + arguments)

    @staticmethod
    def exists(path1):
        """Return whether ``path1`` exists, as reported by TargetUtils."""
        return TargetUtils.exists(path1)

    @staticmethod
    def copy(path1, path2):
        """Run ``-cp path1 path2``."""
        return HDFSUtils._run("-cp %s %s" % (path1, path2))

    @staticmethod
    def copyFromLocal(path1, path2):
        """Run ``-copyFromLocal path1 path2``."""
        return HDFSUtils._run("-copyFromLocal %s %s" % (path1, path2))

    @staticmethod
    def copyToLocal(path1, path2):
        """Run ``-copyToLocal path1 path2``."""
        return HDFSUtils._run("-copyToLocal %s %s" % (path1, path2))

    @staticmethod
    def chown(path1, owner="primary_user"):
        """Run ``-chown -R <owner> path1``; the owner defaults to primary_user."""
        return HDFSUtils._run("-chown -R %s %s" % (owner, path1))

    @staticmethod
    def mkdir_p(dir1):
        """Run ``-mkdir -p dir1``."""
        return HDFSUtils._run("-mkdir -p " + dir1)

    @staticmethod
    def mkdir(dir1):
        """Run ``-mkdir dir1``."""
        return HDFSUtils._run("-mkdir " + dir1)

    @staticmethod
    def mv(src, dst):
        """Run ``-mv src dst``."""
        return HDFSUtils._run("-mv %s %s " % (src, dst))
60 |
--------------------------------------------------------------------------------
/luiti/utils/io_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import json
4 | import luigi
5 | import luigi.hdfs
6 | from luigi import LocalTarget
7 | from etl_utils import JsonUtils
8 | from .target_utils import TargetUtils
9 |
10 |
class IOUtils:
    """JSON and file helpers for task outputs."""

    SQL_RANGE_LIMIT = 1000

    @staticmethod
    def json_dump(o1):
        """
        Serialize ``o1`` to a JSON string.

        Lists and sets go through ``json.dumps`` because
        ``JsonUtils.unicode_dump`` does not support lists; everything else is
        dumped by ``JsonUtils.unicode_dump`` and encoded as UTF-8.
        """
        if isinstance(o1, (list, set,)):
            return json.dumps(list(o1))
        return JsonUtils.unicode_dump(o1).encode("UTF-8")

    @staticmethod
    def write_json_to_output(result, output1):
        """
        Write ``result`` to ``output1`` as one JSON document per line.

        ``result`` may be a single dict, or a non-empty list or set of dicts.
        Returns 0.
        """
        rows = [result] if isinstance(result, dict) else result
        if isinstance(rows, set):
            rows = list(rows)
        assert isinstance(rows, list), rows
        assert len(rows) > 0, rows
        assert isinstance(rows[0], dict), rows

        with output1.open('w') as sink:
            for row in rows:
                sink.write(IOUtils.json_dump(row) + "\n")
        return 0
    write_jsons_to_output = write_json_to_output  # make a alias

    @staticmethod
    def read_json_from_output(output1):
        """
        Return the single JSON item stored in ``output1`` (None when it holds
        no item); raise ValueError as soon as a second item is read.
        """
        only_item = None
        for position, parsed in enumerate(TargetUtils.json_read(output1)):
            if position >= 1:
                raise ValueError(
                    "[multiple line error] %s should contain only one line!"
                    % output1)
            only_item = parsed
        return only_item

    @staticmethod
    def remove_files(*files):
        """
        Remove every given file that exists; used to clean up after a write
        that failed midway.  Returns True.
        """
        for candidate in files:
            if not luigi.hdfs.exists(candidate):
                continue
            luigi.hdfs.remove(candidate)
        return True

    @staticmethod
    def local_target(path1):
        """Wrap ``path1`` in a luigi ``LocalTarget``."""
        return LocalTarget(path1)
68 |
--------------------------------------------------------------------------------
/luiti/utils/math_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 |
class MathUtils:
    """Small numeric helpers."""

    @staticmethod
    def percent(a, b):
        """
        Return ``a / b`` as a float.

        Falsy operands (for example None) count as 0, and a zero divisor
        gives 0.0 instead of raising.
        """
        numerator = a if a else 0
        denominator = b if b else 0

        if denominator == 0:
            return 0.0

        # The result is deliberately not rounded: values are stored with
        # full precision (as requested by @连华).  The former rounding was
        # int(round(result * 10000)) / 10000.0
        return numerator / float(denominator)
21 |
--------------------------------------------------------------------------------
/luiti/utils/mr_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import json
4 | from etl_utils import JsonUtils
5 |
6 |
class MRUtils:
    """Helpers for building and parsing MapReduce keys and lines."""

    map_key_split = u"@@"  # separator between the parts of a composite map key
    map_key_escape = u"\""  # string map keys are JSON-dumped, hence the quote
    mr_separator = u"\t"  # separator between key and value of an MR line

    @staticmethod
    def mr_key(item1, postfix=''):
        """ example is "104017@@37771707" """
        # TODO: business-specific code, should be moved out of this class.
        class_id = item1.get('class_id', 0)
        uid = item1.get('uid', 0)
        key = u"%s%s%s" % (class_id, MRUtils.map_key_split, uid,)
        if not postfix:
            return key
        return key + (MRUtils.map_key_split + unicode(postfix))

    @staticmethod
    def json_parse(line1):
        """Strip ``line1``, decode byte strings as UTF-8 and parse the JSON."""
        text = line1.strip()
        if isinstance(text, str):
            text = text.decode("UTF-8")
        return json.loads(text)

    @staticmethod
    def is_mr_line(line1):
        """Tell whether ``line1`` looks like a "<map key><TAB><JSON>" line."""
        head = line1[0:30]
        checks = (
            # 1. the current standard MapReduce output has a key separator
            #    near the start of the line
            (MRUtils.map_key_split in head) or (MRUtils.mr_separator in head),
            # 2. the value must end with } or ]
            line1.endswith("}") or line1.endswith("]"),
            # 3. line-by-line JSON written by external Python programs has
            #    no map key, so such lines start with the JSON itself
            (not line1.startswith("{")) and (not line1.startswith("[")),
        )
        return all(checks)

    @staticmethod
    def unicode_value(item1, key1):
        """Return ``item1[key1]`` (default u""), decoding byte strings."""
        value = item1.get(key1, u"")
        return value.decode("UTF-8") if isinstance(value, str) else value

    @staticmethod
    def split_mr_kv(line1):
        """ Return the parsed [key, value] list of one MR line. """
        text = line1.decode("UTF-8") if isinstance(line1, str) else line1
        k_str, v_str = text.split(MRUtils.mr_separator, 1)

        key = MRUtils.select_prefix_keys(k_str)
        value = json.loads(v_str)
        return [key, value]

    # key related
    @staticmethod
    def merge_keys_in_dict(vals_1, keys_1):
        """ Sum the integer values of several keys over all dicts. """
        totals = dict.fromkeys(keys_1, 0)
        for row in vals_1:
            for name in keys_1:
                totals[name] += row[name]
        return totals

    @staticmethod
    def concat_prefix_keys(*keys):
        """Join the unicode form of ``keys`` with the map key separator."""
        return MRUtils.map_key_split.join(unicode(part) for part in keys)

    @staticmethod
    def split_prefix_keys(line_part_a):
        """ return list """
        normalized = MRUtils.select_prefix_keys(line_part_a)
        return normalized.split(MRUtils.map_key_split)

    @staticmethod
    def select_prefix_keys(line_part_a, idxes=None):
        """
        Build a new map key from the parts at the given indexes.
        e.g. select_prefix_keys("232@@8923802@@afenti", [0,1])
        # => "232@@8923802"

        Without ``idxes`` the normalized key is returned unchanged.
        """
        key_text = line_part_a
        if isinstance(key_text, str):
            key_text = key_text.decode("UTF-8")

        quote = MRUtils.map_key_escape
        # Tolerate a malformed JSON key: opening quote without a closing one.
        if key_text.startswith(quote) and (not key_text.endswith(quote)):
            key_text = key_text[1:]
        if key_text.startswith(quote):  # is a json
            key_text = json.loads(key_text)

        if idxes is None:
            return key_text

        parts = key_text.split(MRUtils.map_key_split)
        return MRUtils.map_key_split.join([parts[idx] for idx in idxes])

    @staticmethod
    def str_dump(result_dict):
        """Dump with ``JsonUtils.unicode_dump`` and encode as UTF-8."""
        return JsonUtils.unicode_dump(result_dict).encode("UTF-8")

    @staticmethod
    def filter_dict(d1, keys):
        """Return a dict with only ``keys`` (a list or one key) of ``d1``."""
        wanted = keys if isinstance(keys, list) else [keys]
        return dict((name, d1[name]) for name in wanted)
117 |
--------------------------------------------------------------------------------
/luiti/utils/target_utils.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | import json
4 | import luigi
5 | from etl_utils import singleton, cached_property
6 |
7 |
@singleton()
class TargetUtilsClass(object):
    """Helpers for reading luigi targets and building HDFS targets."""

    def line_read(self, hdfs1):
        """Yield the non-blank lines of ``hdfs1``, UTF-8 decoded and stripped."""
        with hdfs1.open('r') as stream:
            for raw_line in stream:
                text = raw_line.decode("UTF-8").strip()
                if text:  # filter blank line
                    yield text

    def json_read(self, hdfs1):
        """Yield every non-blank line of ``hdfs1`` parsed as JSON."""
        for text in TargetUtils.line_read(hdfs1):
            yield json.loads(text)

    def hdfs(self, data_file1):
        """
        Return an ``HdfsTarget`` for ``data_file1``.

        When the path exists, lists more than one entry and contains a
        ``_SUCCESS`` or ``part-00000`` child, the target is created with the
        ``PlainDir`` format instead of the default one.
        """
        # Existence is checked through a target first: snakebite misbehaves
        # when testing a directory that does not exist (or this is caused by
        # switching from the hadoop user to primary_user).
        target = luigi.hdfs.HdfsTarget(data_file1)
        if not target.exists():
            return target

        # luigi/hdfs.py implements no isdir, so the listing is used instead.
        entries = list(target.fs.listdir(data_file1))
        if len(entries) <= 1:
            return target

        # There's no part-000 when use multiple text output in streaming,
        # hence the additional _SUCCESS marker.
        markers = [
            luigi.hdfs.HdfsTarget(data_file1 + child).exists()
            for child in ("/_SUCCESS", "/part-00000")
        ]
        if any(markers):
            return luigi.hdfs.HdfsTarget(data_file1,
                                         format=luigi.hdfs.PlainDir)
        return target

    def hdfs_dir(self, path1):
        """
        Always build a ``PlainDir`` target; for directories whose files use
        the 000000_0 naming style instead of the default MR part-00000.
        """
        return luigi.hdfs.HdfsTarget(path1, format=luigi.hdfs.PlainDir)

    def isdir(self, path1):
        """Ask the client's snakebite handle whether ``path1`` is a directory."""
        bite = self.client.get_bite()
        return bite.test(path1, directory=True)

    def exists(self, path1):
        """Return whether ``path1`` exists according to the HDFS client."""
        return self.client.exists(path1)

    @cached_property
    def client(self):
        """The shared HDFS client, taken from ``HdfsClient``."""
        return HdfsClient.client

TargetUtils = TargetUtilsClass()
62 |
63 |
@singleton()
class HdfsClientClass(object):
    """Lazily resolves the HDFS client object provided by luigi."""
    # TODO use delegate

    @cached_property
    def client(self):
        # Imported on first use rather than at module import time.
        from luigi import hdfs as luigi_hdfs
        return luigi_hdfs.clients
HdfsClient = HdfsClientClass()
TargetUtils.HdfsClient = HdfsClient
74 |
--------------------------------------------------------------------------------
/luiti/utils/visualiser_env_template.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from etl_utils import cached_property
4 | from ..luigi_extensions import ArrowParameter
5 |
6 |
class VisualiserEnvTemplate(object):
    """
    Setup luiti webui.

    Overwrite the attributes listed under "API list" below, either by
    subclassing or by passing a dict to the constructor; see their keys and
    examples in `data`.
    """
    def __init__(self, kwargs=None):
        """
        :param kwargs: optional dict of attribute overrides.  Every key must
            name an attribute that already exists on the instance.
        :raises ValueError: when a key is not an existing attribute.
        """
        if kwargs is None:
            kwargs = dict()
        assert isinstance(kwargs, dict), kwargs

        for k1, v1 in kwargs.items():
            if not hasattr(self, k1):
                # Both values must go into the format tuple; passing k1 as a
                # second argument made the formatting itself fail.
                raise ValueError(
                    "%s dont has attribute \"%s\"" % (self, k1))
            setattr(self, k1, v1)

    @cached_property
    def data(self):
        """
        Return the settings as a dict.  Attributes may be plain values or
        callables; callables are called to obtain the value.
        """
        def maybe_call(o1):
            if callable(o1):
                o1 = o1()
            return o1

        result = {
            "file_web_url_prefix": maybe_call(self.file_web_url_prefix),
            "date_begin": maybe_call(self.date_begin),
            "additional_task_parameters": maybe_call(self.additional_task_parameters),
            "package_config": maybe_call(self.package_config),
        }

        # check data valid: only the first parameter description is inspected
        assert isinstance(result["additional_task_parameters"], dict)
        if len(result["additional_task_parameters"]) > 0:
            val = next(iter(result["additional_task_parameters"].values()))
            assert "values" in val
            assert "default" in val

        return result

    def __getitem__(self, k1):
        """Dict-style access to the entries of `data`."""
        return self.data[k1]

    # API list
    file_web_url_prefix = ""
    # Evaluated once, when this class body is executed.
    # NOTE(review): replace(weeks=-1) is presumably a relative shift of one
    # week back - confirm against the pinned arrow version.
    date_begin = ArrowParameter.now().replace(weeks=-1).format("YYYY-MM-DD")

    def additional_task_parameters(self):
        """
        Example is

        {
            "subject": {
                "values": ["english", "math"],
                "default": "english",
            }
        }
        """
        return dict()

    def package_config(self):
        """Return the package settings; no package is selected by default."""
        return {
            "default_selected": []
        }
68 |
--------------------------------------------------------------------------------
/luiti/webui/INSTALL.markdown:
--------------------------------------------------------------------------------
Install front-end dependencies with Bower (http://bower.io/)
2 | ==============
3 | ```bash
4 | bower install
5 | ```
6 |
--------------------------------------------------------------------------------
/luiti/webui/assets/javascripts/luiti.js:
--------------------------------------------------------------------------------
1 | (function() {
2 | 'use strict';
3 |
// Node colors for the task network. In the code visible here, "self" is given
// to nodes whose label is part of the selected task_cls query and "requires"
// to every other node; "upons" is not referenced in this part of the file.
var colors = {
  "requires": "lime",
  "self": "#7BE141",
  "upons": "green",
};
10 |
// Draw the task graph with vis.js inside the element matched by
// `container_id` and call `click_event` for every click on the network.
// Each node object is given a `color` first (the passed-in objects are
// modified in place).
var render_network = function(nodes, edges, container_id, click_event) {
  var colored_nodes = _.map(nodes, function(node) {
    var is_selected = _.contains(queryparams.selected_query.task_cls, node.label);
    node.color = is_selected ? colors.self : colors.requires;
    return node;
  });

  // NOTE: original code is http://visjs.org/examples/network/nodeStyles/customGroups.html
  var container = $(container_id)[0];
  var node_style = {
    shape: 'dot',
    size: 20,
    font: {
      size: 15,
      color: '#000000'
    },
    borderWidth: 2
  };
  var edge_style = {
    width: 2
  };

  var network = new vis.Network(
    container,
    {nodes: colored_nodes, edges: edges},
    {nodes: node_style, edges: edge_style}
  );
  network.on("click", click_event);
};
45 |
46 |
// Build the VisualSearch box inside `container_id`.
//   default_query      - its keys are offered as "Params" facets
//   selected_query     - initial query; merged with the query string of the
//                        current URL
//   vs_accepted_params - facet name -> array of values offered for completion
// Returns the VisualSearch instance, extended with `current_query` and
// `setValue`.
var render_visualSearch = function(container_id, default_query, selected_query, vs_accepted_params) {
  var to_facets = function(labels, category) {
    return _.map(labels, function(label) {
      return {"label": label, "category": category};
    });
  };
  var namespace_facets = to_facets(["task_cls", "luiti_package"], "Namespaces");
  var param_facets = to_facets(_.keys(default_query), "Params");
  var facet_values = param_facets.concat(namespace_facets);

  // Collect the facets currently in the search box as {key: [value, ...]}.
  var get_current_query = function(visualSearch) {
    return _.reduce(visualSearch.searchQuery.facets(), function(query, facet) {
      var pair = _.pairs(facet)[0];
      var key = pair[0];
      if (!_.has(query, key)) {
        query[key] = [];
      }
      query[key].push(pair[1]);
      return query;
    }, {});
  };

  var callbacks = {
    search: function(query, searchCollection) {
      return false;
    },
    facetMatches: function(callback) {
      callback(facet_values);
    },
    valueMatches: function(facet, searchTerm, callback) {
      // support smart match, from any position of strs.
      var needle = searchTerm.toLowerCase();
      var matches = _.filter(vs_accepted_params[facet], function(candidate) {
        return s.contains(candidate.toLowerCase(), needle);
      });
      // dont work, see more details at search_fact.js#autocompleteValues
      return callback(matches);
    },
    blur: function() {
      var current = get_current_query(visualSearch);

      // Update a React view.
      group_summary.setState({"selected_luiti_packages": current["luiti_package"]});
    }
  };

  // Example format is: visualSearch.searchBox.value("Country: US State: \"New York\" Key: Value")
  var load_params = function(query_opts) {
    // support same key with multiple values.
    var pieces = [];
    _.each(query_opts, function(opt_values, opt_key) {
      _.each(opt_values, function(opt_value) {
        pieces.push(JSON.stringify(opt_key) + ": " + JSON.stringify(opt_value));
      });
    });
    return pieces.join(" ");
  };

  // Run it!
  var visualSearch = VS.init({
    container: $(container_id),
    query: '',
    autosearch: true,
    callbacks: callbacks
  });

  // Start from the selected query, overridden by the URL's query string;
  // every value is wrapped in an Array.
  var url_query = URI.parseQuery(URI(window.location)._parts.query);
  var merged_query = _.extend({}, selected_query, url_query);
  _.each(_.keys(merged_query), function(key) {
    if (!_.isArray(merged_query[key])) {
      merged_query[key] = [merged_query[key]];
    }
  });
  visualSearch.current_query = merged_query;

  visualSearch.setValue = function(opts) {
    return visualSearch.searchBox.value(load_params(opts));
  };
  visualSearch.setValue(visualSearch.current_query);

  // support click query: the search icon reloads the page with the facets
  // as the URL query.
  var search_icon = visualSearch.options.container.find(".VS-icon-search");
  search_icon.click(function(event) {
    var url = URI(window.location);
    url._parts.query = "";
    url.setQuery(get_current_query(visualSearch));
    window.location = url.build();

    return false;
  });
  search_icon.css("cursor", "pointer");

  return visualSearch;
};
153 |
154 |
// Show `title` both as the document title and in the page header.
var render_header_title = function(title) {
  var title_selectors = ["head title", "body #header .title"];
  title_selectors.forEach(function(selector) {
    $(selector).html(title);
  });
};
159 |
// Render every view of the page from the loaded data.
// `env` receives the VisualSearch instance as `env.visualSearch`; the graph
// data, the query parameters and the title are read from the globals
// `nodeedge`, `queryparams` and `title`.
var render_all = function(env) {
  // 1. render network
  render_network(nodeedge.nodes,
                 nodeedge.edges,
                 "#network",
                 function (params) {
                   console.log("[click a node on #network]", params);
                   var task_id = params["nodes"][0]; // only one task can be clicked.
                   if (task_id === undefined) {
                     // The click did not hit a node, so there is no task
                     // detail to show.
                     return;
                   }
                   // Delegate to show TaskDetailView
                   $("#nodes_groups").find('.nodes_group li[data-task-id="' + task_id + '"]').click();
                 });

  // 2. render visualSearch
  env.visualSearch = render_visualSearch(".visual_search", queryparams.default_query, queryparams.selected_query, queryparams.accepted);

  // Other views.
  render_header_title(title);
};
178 |
179 | var init_data_url = "init_data.json" + location.search;
180 |
181 | $.getJSON(init_data_url, function(data) {
182 | // bind env's first level key to global `window` object.
183 | _.each(data, function(value, key) {
184 | window[key] = value;
185 | });
186 | window.env = data;
187 | console.log("load data", env);
188 |
189 | // transform data
190 | nodeedge.nodeid_to_node_dict = _.reduce(nodeedge.nodes, function(dict, node) {
191 | dict[node.id] = node;
192 | return dict;
193 | }, {});
194 |
195 | render_all(env);
196 |
197 | // orig is
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
39 |
40 |
51 |
52 |
53 |
54 |
55 |