├── .dockerignore
├── .gitignore
├── README.md
├── docker-compose.yml
└── docker
    ├── airflow
        ├── airflow.cfg
        ├── dockerfile
        └── entrypoint.sh
    └── hadoop
        ├── conf
            ├── beeline-log4j2.properties
            ├── core-site.xml
            ├── hive-env.sh
            ├── hive-exec-log4j2.properties
            ├── hive-log4j2.properties
            ├── hive-site.xml
            ├── ivysettings.xml
            ├── llap-daemon-log4j2.properties
            ├── sqoop-env.sh
            └── yarn-site.xml
        ├── dockerfile
        └── entrypoint.sh


/.dockerignore:
--------------------------------------------------------------------------------
1 | .git


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_store
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Customized airflow docker
 2 | 
 3 | ## Dependencies
 4 | 
 5 | - `airflow==1.10.14`
 6 | - `hadoop==3.2.1`
 7 | - `hive==3.1.2`
 8 | - `spark==3.1.1`
 9 | - `sqoop==1.4.7`
10 | 
11 | ## How to run
12 | 
13 | You can clone this repository and run the following command to start the airflow webserver.
14 | 
15 | ```
16 | docker compose up -d
17 | ```
18 | 
19 | You can access the airflow webserver with http://localhost:8080
20 | 
21 | It's already mount the `dags` and `plugins` parts to the docker volume, please feel free to change the configuration in the `docker-compose.yml` for you preference. 
22 | 
23 | To close the airflow container
24 | ```
25 | docker compose down -v
26 | ```
27 | 
28 | ## References
29 | 
30 | - Airflow docker based : https://github.com/puckel/docker-airflow
31 | - Hadoop based configuration : https://github.com/pavank/docker-bigdata-cluster
32 | 
33 | ## Todos
34 | 
35 | - [ ] build a `hive`, `spark`, and `sqoop` cluster for testing the `airflow` operators.
36 | 
37 | # Project milestones
38 | 
39 | - [X] Build an airflow docker image with `Postgres, `Sqoop`, `Spark`, and `Hive` components.
40 | - [X] Publish to the docker hub for `arm64` architecture contribution.
41 | - [ ] Used it in the following project to build a data engineer challenge pipeline.
42 | 
43 | # Learning objectives
44 | 
45 | ## Docker
46 | - Understanding how to build a docker image from other built images with `dockerfile` configuration.
47 |     - understand the parameter difference in the docker file (`ENV`, `RUN`, `ARG`, `CMD`, etc.).
48 | - Able to change or modify the parameter from the existing built image.
49 |     - Successfully modify and build the image for the airflow container.
50 | - Learn how to structure a docker project. e.g., `.dockerignore`, `docker-compose.yml`
51 | 
52 | ## Hadoop
53 | - Understanding the basic need for configuring the Hadoop ecosystem. e.g., configuration files `core-site.xml`, `hdfs-site.xml`, etc. 
54 | - Be able to work around the dependency issues between Hadoop components. For example, which Hive version should we use with the Hadoop 3.2.1?
55 | 
56 | # Notes
57 | 
58 | 1. The tricky part of this project is not a docker or Hadoop ecosystem. But it's to make all the components dependencies working together. For example, you have to understand why you have to use the `python:3.6-stretch version for building the Hadoop-based image instead of `python:3.7-slim-buster` provided in the original docker image.
59 |     - Quick answer is that the `slim-buster` version doesn't support the JAVA 8, which we have to use for installing the Hadoop components.
60 | 
61 | 2. You will face many dependency problems not only from the Linux-based but also from the python and pip environment. Almost all of the time, you have to find the workaround in the stack overflow and trust me you are not the first to face the issues.
62 |     - Trial and error help you a lot in fixing each issue. Please don't give up that's all I learned from this process.
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.7'
 2 | services:
 3 |     postgres:
 4 |         container_name: airflow-database
 5 |         image: postgres:9.6
 6 |         environment:
 7 |             - POSTGRES_USER=airflow
 8 |             - POSTGRES_PASSWORD=airflow
 9 |             - POSTGRES_DB=airflow
10 |         logging:
11 |             options:
12 |                 max-size: 10m
13 |                 max-file: "3"
14 | 
15 |     webserver:
16 |         container_name: airflow-webserver
17 |         image: ppatcoding/airflow:latest
18 |         restart: always
19 |         depends_on:
20 |             - postgres
21 |         environment:
22 |             - LOAD_EX=n
23 |             - EXECUTOR=Local
24 |         logging:
25 |             options:
26 |                 max-size: 10m
27 |                 max-file: "3"
28 |         volumes:
29 |             - ./ariflow/dags:/usr/local/airflow/dags
30 |             - ./airflow/plugins:/usr/local/airflow/plugins
31 |         ports:
32 |             - "8080:8080"
33 |         command: webserver
34 |         healthcheck:
35 |             test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
36 |             interval: 30s
37 |             timeout: 30s
38 |             retries: 3
39 | 


--------------------------------------------------------------------------------
/docker/airflow/airflow.cfg:
--------------------------------------------------------------------------------
  1 | [core]
  2 | # The folder where your airflow pipelines live, most likely a
  3 | # subfolder in a code repository. This path must be absolute.
  4 | dags_folder = /usr/local/airflow/dags
  5 | 
  6 | # The folder where airflow should store its log files
  7 | # This path must be absolute
  8 | base_log_folder = /usr/local/airflow/logs
  9 | 
 10 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search.
 11 | # Set this to True if you want to enable remote logging.
 12 | remote_logging = False
 13 | 
 14 | # Users must supply an Airflow connection id that provides access to the storage
 15 | # location.
 16 | remote_log_conn_id =
 17 | remote_base_log_folder =
 18 | encrypt_s3_logs = False
 19 | 
 20 | # Logging level
 21 | logging_level = INFO
 22 | 
 23 | # Logging level for Flask-appbuilder UI
 24 | fab_logging_level = WARN
 25 | 
 26 | # Logging class
 27 | # Specify the class that will specify the logging configuration
 28 | # This class has to be on the python classpath
 29 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
 30 | logging_config_class =
 31 | 
 32 | # Flag to enable/disable Colored logs in Console
 33 | # Colour the logs when the controlling terminal is a TTY.
 34 | colored_console_log = True
 35 | 
 36 | # Log format for when Colored logs is enabled
 37 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
 38 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
 39 | 
 40 | # Format of Log line
 41 | log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
 42 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
 43 | 
 44 | # Log filename format
 45 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log
 46 | log_processor_filename_template = {{ filename }}.log
 47 | dag_processor_manager_log_location = /usr/local/airflow/logs/dag_processor_manager/dag_processor_manager.log
 48 | 
 49 | # Name of handler to read task instance logs.
 50 | # Default to use task handler.
 51 | task_log_reader = task
 52 | 
 53 | # Hostname by providing a path to a callable, which will resolve the hostname.
 54 | # The format is "package:function".
 55 | #
 56 | # For example, default value "socket:getfqdn" means that result from getfqdn() of "socket"
 57 | # package will be used as hostname.
 58 | #
 59 | # No argument should be required in the function specified.
 60 | # If using IP address as hostname is preferred, use value ``airflow.utils.net:get_host_ip_address``
 61 | hostname_callable = socket:getfqdn
 62 | 
 63 | # Default timezone in case supplied date times are naive
 64 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam)
 65 | default_timezone = utc
 66 | 
 67 | # The executor class that airflow should use. Choices include
 68 | # SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor, KubernetesExecutor
 69 | executor = SequentialExecutor
 70 | 
 71 | # The SqlAlchemy connection string to the metadata database.
 72 | # SqlAlchemy supports many different database engine, more information
 73 | # their website
 74 | # sql_alchemy_conn = sqlite:////tmp/airflow.db
 75 | 
 76 | # The encoding for the databases
 77 | sql_engine_encoding = utf-8
 78 | 
 79 | # If SqlAlchemy should pool database connections.
 80 | sql_alchemy_pool_enabled = True
 81 | 
 82 | # The SqlAlchemy pool size is the maximum number of database connections
 83 | # in the pool. 0 indicates no limit.
 84 | sql_alchemy_pool_size = 5
 85 | 
 86 | # The maximum overflow size of the pool.
 87 | # When the number of checked-out connections reaches the size set in pool_size,
 88 | # additional connections will be returned up to this limit.
 89 | # When those additional connections are returned to the pool, they are disconnected and discarded.
 90 | # It follows then that the total number of simultaneous connections the pool will allow
 91 | # is pool_size + max_overflow,
 92 | # and the total number of "sleeping" connections the pool will allow is pool_size.
 93 | # max_overflow can be set to -1 to indicate no overflow limit;
 94 | # no limit will be placed on the total number of concurrent connections. Defaults to 10.
 95 | sql_alchemy_max_overflow = 10
 96 | 
 97 | # The SqlAlchemy pool recycle is the number of seconds a connection
 98 | # can be idle in the pool before it is invalidated. This config does
 99 | # not apply to sqlite. If the number of DB connections is ever exceeded,
100 | # a lower config value will allow the system to recover faster.
101 | sql_alchemy_pool_recycle = 1800
102 | 
103 | # Check connection at the start of each connection pool checkout.
104 | # Typically, this is a simple statement like "SELECT 1".
105 | # More information here:
106 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic
107 | sql_alchemy_pool_pre_ping = True
108 | 
109 | # The schema to use for the metadata database.
110 | # SqlAlchemy supports databases with the concept of multiple schemas.
111 | sql_alchemy_schema =
112 | 
113 | # The amount of parallelism as a setting to the executor. This defines
114 | # the max number of task instances that should run simultaneously
115 | # on this airflow installation
116 | parallelism = 32
117 | 
118 | # The number of task instances allowed to run concurrently by the scheduler
119 | dag_concurrency = 16
120 | 
121 | # Are DAGs paused by default at creation
122 | dags_are_paused_at_creation = True
123 | 
124 | # The maximum number of active DAG runs per DAG
125 | max_active_runs_per_dag = 16
126 | 
127 | # Whether to load the examples that ship with Airflow. It's good to
128 | # get started, but you probably want to set this to False in a production
129 | # environment
130 | load_examples = True
131 | 
132 | # Where your Airflow plugins are stored
133 | plugins_folder = /usr/local/airflow/plugins
134 | 
135 | # Secret key to save connection passwords in the db
136 | fernet_key = $FERNET_KEY
137 | 
138 | # Whether to disable pickling dags
139 | donot_pickle = False
140 | 
141 | # How long before timing out a python file import
142 | dagbag_import_timeout = 30
143 | 
144 | # How long before timing out a DagFileProcessor, which processes a dag file
145 | dag_file_processor_timeout = 50
146 | 
147 | # The class to use for running task instances in a subprocess
148 | task_runner = StandardTaskRunner
149 | 
150 | # If set, tasks without a ``run_as_user`` argument will be run with this user
151 | # Can be used to de-elevate a sudo user running Airflow when executing tasks
152 | default_impersonation =
153 | 
154 | # What security module to use (for example kerberos)
155 | security =
156 | 
157 | # If set to False enables some unsecure features like Charts and Ad Hoc Queries.
158 | # In 2.0 will default to True.
159 | secure_mode = False
160 | 
161 | # Turn unit test mode on (overwrites many configuration options with test
162 | # values at runtime)
163 | unit_test_mode = False
164 | 
165 | # Whether to enable pickling for xcom (note that this is insecure and allows for
166 | # RCE exploits). This will be deprecated in Airflow 2.0 (be forced to False).
167 | enable_xcom_pickling = True
168 | 
169 | # When a task is killed forcefully, this is the amount of time in seconds that
170 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED
171 | killed_task_cleanup_time = 60
172 | 
173 | # Whether to override params with dag_run.conf. If you pass some key-value pairs
174 | # through ``airflow dags backfill -c`` or
175 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
176 | dag_run_conf_overrides_params = False
177 | 
178 | # Worker initialisation check to validate Metadata Database connection
179 | worker_precheck = False
180 | 
181 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``.
182 | dag_discovery_safe_mode = True
183 | 
184 | # The number of retries each task is going to have by default. Can be overridden at dag or task level.
185 | default_task_retries = 0
186 | 
187 | # Whether to serialises DAGs and persist them in DB.
188 | # If set to True, Webserver reads from DB instead of parsing DAG files
189 | # More details: https://airflow.apache.org/docs/stable/dag-serialization.html
190 | store_serialized_dags = False
191 | 
192 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
193 | min_serialized_dag_update_interval = 30
194 | 
195 | # On each dagrun check against defined SLAs
196 | check_slas = True
197 | 
198 | [cli]
199 | # In what way should the cli access the API. The LocalClient will use the
200 | # database directly, while the json_client will use the api running on the
201 | # webserver
202 | api_client = airflow.api.client.local_client
203 | 
204 | # If you set web_server_url_prefix, do NOT forget to append it here, ex:
205 | # ``endpoint_url = http://localhost:8080/myroot``
206 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
207 | endpoint_url = http://localhost:8080
208 | 
209 | [debug]
210 | # Used only with DebugExecutor. If set to True DAG will fail with first
211 | # failed task. Helpful for debugging purposes.
212 | fail_fast = False
213 | 
214 | [api]
215 | # How to authenticate users of the API
216 | auth_backend = airflow.api.auth.backend.default
217 | 
218 | [lineage]
219 | # what lineage backend to use
220 | backend =
221 | 
222 | [atlas]
223 | sasl_enabled = False
224 | host =
225 | port = 21000
226 | username =
227 | password =
228 | 
229 | [operators]
230 | # The default owner assigned to each new operator, unless
231 | # provided explicitly or passed via ``default_args``
232 | default_owner = airflow
233 | default_cpus = 1
234 | default_ram = 512
235 | default_disk = 512
236 | default_gpus = 0
237 | 
238 | [hive]
239 | # Default mapreduce queue for HiveOperator tasks
240 | default_hive_mapred_queue =
241 | 
242 | [webserver]
243 | # The base url of your website as airflow cannot guess what domain or
244 | # cname you are using. This is used in automated emails that
245 | # airflow sends to point links to the right web server
246 | base_url = http://localhost:8080
247 | 
248 | # The ip specified when starting the web server
249 | web_server_host = 0.0.0.0
250 | 
251 | # The port on which to run the web server
252 | web_server_port = 8080
253 | 
254 | # Paths to the SSL certificate and key for the web server. When both are
255 | # provided SSL will be enabled. This does not change the web server port.
256 | web_server_ssl_cert =
257 | 
258 | # Paths to the SSL certificate and key for the web server. When both are
259 | # provided SSL will be enabled. This does not change the web server port.
260 | web_server_ssl_key =
261 | 
262 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond
263 | web_server_master_timeout = 120
264 | 
265 | # Number of seconds the gunicorn webserver waits before timing out on a worker
266 | web_server_worker_timeout = 120
267 | 
268 | # Number of workers to refresh at a time. When set to 0, worker refresh is
269 | # disabled. When nonzero, airflow periodically refreshes webserver workers by
270 | # bringing up new ones and killing old ones.
271 | worker_refresh_batch_size = 1
272 | 
273 | # Number of seconds to wait before refreshing a batch of workers.
274 | worker_refresh_interval = 30
275 | 
276 | # Secret key used to run your flask app
277 | # It should be as random as possible
278 | secret_key = \xc5\xe3\x9f\xf2JK\xb9q\xec\xda\x98\x846\xebCm
279 | 
280 | # Number of workers to run the Gunicorn web server
281 | workers = 4
282 | 
283 | # The worker class gunicorn should use. Choices include
284 | # sync (default), eventlet, gevent
285 | worker_class = sync
286 | 
287 | # Log files for the gunicorn webserver. '-' means log to stderr.
288 | access_logfile = -
289 | 
290 | # Log files for the gunicorn webserver. '-' means log to stderr.
291 | error_logfile = -
292 | 
293 | # Expose the configuration file in the web server
294 | expose_config = True
295 | 
296 | # Expose hostname in the web server
297 | expose_hostname = True
298 | 
299 | # Expose stacktrace in the web server
300 | expose_stacktrace = True
301 | 
302 | # Set to true to turn on authentication:
303 | # https://airflow.apache.org/security.html#web-authentication
304 | authenticate = False
305 | 
306 | # Filter the list of dags by owner name (requires authentication to be enabled)
307 | filter_by_owner = False
308 | 
309 | # Filtering mode. Choices include user (default) and ldapgroup.
310 | # Ldap group filtering requires using the ldap backend
311 | #
312 | # Note that the ldap server needs the "memberOf" overlay to be set up
313 | # in order to user the ldapgroup mode.
314 | owner_mode = user
315 | 
316 | # Default DAG view. Valid values are:
317 | # tree, graph, duration, gantt, landing_times
318 | dag_default_view = tree
319 | 
320 | # "Default DAG orientation. Valid values are:"
321 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top)
322 | dag_orientation = LR
323 | 
324 | # Puts the webserver in demonstration mode; blurs the names of Operators for
325 | # privacy.
326 | demo_mode = False
327 | 
328 | # The amount of time (in secs) webserver will wait for initial handshake
329 | # while fetching logs from other worker machine
330 | log_fetch_timeout_sec = 5
331 | 
332 | # Time interval (in secs) to wait before next log fetching.
333 | log_fetch_delay_sec = 2
334 | 
335 | # Distance away from page bottom to enable auto tailing.
336 | log_auto_tailing_offset = 30
337 | 
338 | # Animation speed for auto tailing log display.
339 | log_animation_speed = 1000
340 | 
341 | # By default, the webserver shows paused DAGs. Flip this to hide paused
342 | # DAGs by default
343 | hide_paused_dags_by_default = False
344 | 
345 | # Consistent page size across all listing views in the UI
346 | page_size = 100
347 | 
348 | # Use FAB-based webserver with RBAC feature
349 | rbac = False
350 | 
351 | # Define the color of navigation bar
352 | navbar_color = #007A87
353 | 
354 | # Default dagrun to show in UI
355 | default_dag_run_display_number = 25
356 | 
357 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy
358 | enable_proxy_fix = False
359 | 
360 | # Number of values to trust for ``X-Forwarded-For``.
361 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/
362 | proxy_fix_x_for = 1
363 | 
364 | # Number of values to trust for ``X-Forwarded-Proto``
365 | proxy_fix_x_proto = 1
366 | 
367 | # Number of values to trust for ``X-Forwarded-Host``
368 | proxy_fix_x_host = 1
369 | 
370 | # Number of values to trust for ``X-Forwarded-Port``
371 | proxy_fix_x_port = 1
372 | 
373 | # Number of values to trust for ``X-Forwarded-Prefix``
374 | proxy_fix_x_prefix = 1
375 | 
376 | # Set secure flag on session cookie
377 | cookie_secure = False
378 | 
379 | # Set samesite policy on session cookie
380 | cookie_samesite =
381 | 
382 | # Default setting for wrap toggle on DAG code and TI log views.
383 | default_wrap = False
384 | 
385 | # Allow the UI to be rendered in a frame
386 | x_frame_enabled = True
387 | 
388 | # Send anonymous user activity to your analytics tool
389 | # choose from google_analytics, segment, or metarouter
390 | # analytics_tool =
391 | 
392 | # Unique ID of your account in the analytics tool
393 | # analytics_id =
394 | 
395 | # Update FAB permissions and sync security manager roles
396 | # on webserver startup
397 | update_fab_perms = True
398 | 
399 | # Minutes of non-activity before logged out from UI
400 | # 0 means never get forcibly logged out
401 | force_log_out_after = 0
402 | 
403 | # The UI cookie lifetime in days
404 | session_lifetime_days = 30
405 | 
406 | [email]
407 | email_backend = airflow.utils.email.send_email_smtp
408 | 
409 | [smtp]
410 | 
411 | # If you want airflow to send emails on retries, failure, and you want to use
412 | # the airflow.utils.email.send_email_smtp function, you have to configure an
413 | # smtp server here
414 | smtp_host = localhost
415 | smtp_starttls = True
416 | smtp_ssl = False
417 | # Example: smtp_user = airflow
418 | # smtp_user =
419 | # Example: smtp_password = airflow
420 | # smtp_password =
421 | smtp_port = 25
422 | smtp_mail_from = airflow@example.com
423 | 
424 | [sentry]
425 | 
426 | # Sentry (https://docs.sentry.io) integration
427 | sentry_dsn =
428 | 
429 | [celery]
430 | 
431 | # This section only applies if you are using the CeleryExecutor in
432 | # ``[core]`` section above
433 | # The app name that will be used by celery
434 | celery_app_name = airflow.executors.celery_executor
435 | 
436 | # The concurrency that will be used when starting workers with the
437 | # ``airflow celery worker`` command. This defines the number of task instances that
438 | # a worker will take, so size up your workers based on the resources on
439 | # your worker box and the nature of your tasks
440 | worker_concurrency = 16
441 | 
442 | # The maximum and minimum concurrency that will be used when starting workers with the
443 | # ``airflow celery worker`` command (always keep minimum processes, but grow
444 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency
445 | # Pick these numbers based on resources on worker box and the nature of the task.
446 | # If autoscale option is available, worker_concurrency will be ignored.
447 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
448 | # Example: worker_autoscale = 16,12
449 | worker_autoscale = 16,12
450 | 
451 | # When you start an airflow worker, airflow starts a tiny web server
452 | # subprocess to serve the workers local log files to the airflow main
453 | # web server, who then builds pages and sends them to users. This defines
454 | # the port on which the logs are served. It needs to be unused, and open
455 | # visible from the main web server to connect into the workers.
456 | worker_log_server_port = 8793
457 | 
458 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
459 | # a sqlalchemy database. Refer to the Celery documentation for more
460 | # information.
461 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings
462 | broker_url = redis://redis:6379/1
463 | 
464 | # The Celery result_backend. When a job finishes, it needs to update the
465 | # metadata of the job. Therefore it will post a message on a message bus,
466 | # or insert it into a database (depending of the backend)
467 | # This status is used by the scheduler to update the state of the task
468 | # The use of a database is highly recommended
469 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings
470 | result_backend = db+postgresql://airflow:airflow@postgres/airflow
471 | 
472 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
473 | # it ``airflow flower``. This defines the IP that Celery Flower runs on
474 | flower_host = 0.0.0.0
475 | 
476 | # The root URL for Flower
477 | # Example: flower_url_prefix = /flower
478 | flower_url_prefix =
479 | 
480 | # This defines the port that Celery Flower runs on
481 | flower_port = 5555
482 | 
483 | # Securing Flower with Basic Authentication
484 | # Accepts user:password pairs separated by a comma
485 | # Example: flower_basic_auth = user1:password1,user2:password2
486 | flower_basic_auth =
487 | 
488 | # Default queue that tasks get assigned to and that worker listen on.
489 | default_queue = default
490 | 
491 | # How many processes CeleryExecutor uses to sync task state.
492 | # 0 means to use max(1, number of cores - 1) processes.
493 | sync_parallelism = 0
494 | 
495 | # Import path for celery configuration options
496 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
497 | 
498 | # In case of using SSL
499 | ssl_active = False
500 | ssl_key =
501 | ssl_cert =
502 | ssl_cacert =
503 | 
504 | # Celery Pool implementation.
505 | # Choices include: prefork (default), eventlet, gevent or solo.
506 | # See:
507 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency
508 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html
509 | pool = prefork
510 | 
511 | # The number of seconds to wait before timing out ``send_task_to_executor`` or
512 | # ``fetch_celery_task_state`` operations.
513 | operation_timeout = 2
514 | 
515 | [celery_broker_transport_options]
516 | 
517 | # This section is for specifying options which can be passed to the
518 | # underlying celery broker transport. See:
519 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options
520 | # The visibility timeout defines the number of seconds to wait for the worker
521 | # to acknowledge the task before the message is redelivered to another worker.
522 | # Make sure to increase the visibility timeout to match the time of the longest
523 | # ETA you're planning to use.
524 | # visibility_timeout is only supported for Redis and SQS celery brokers.
525 | # See:
526 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options
527 | # Example: visibility_timeout = 21600
528 | # visibility_timeout =
529 | 
530 | [dask]
531 | 
532 | # This section only applies if you are using the DaskExecutor in
533 | # [core] section above
534 | # The IP address and port of the Dask cluster's scheduler.
535 | cluster_address = 127.0.0.1:8786
536 | 
537 | # TLS/ SSL settings to access a secured Dask scheduler.
538 | tls_ca =
539 | tls_cert =
540 | tls_key =
541 | 
542 | [scheduler]
543 | # Task instances listen for external kill signal (when you clear tasks
544 | # from the CLI or the UI), this defines the frequency at which they should
545 | # listen (in seconds).
546 | job_heartbeat_sec = 5
547 | 
548 | # The scheduler constantly tries to trigger new tasks (look at the
549 | # scheduler section in the docs for more information). This defines
550 | # how often the scheduler should run (in seconds).
551 | scheduler_heartbeat_sec = 5
552 | 
553 | # After how much time should the scheduler terminate in seconds
554 | # -1 indicates to run continuously (see also num_runs)
555 | run_duration = -1
556 | 
557 | # The number of times to try to schedule each DAG file
558 | # -1 indicates unlimited number
559 | num_runs = -1
560 | 
561 | # The number of seconds to wait between consecutive DAG file processing
562 | processor_poll_interval = 1
563 | 
564 | # after how much time (seconds) a new DAGs should be picked up from the filesystem
565 | min_file_process_interval = 0
566 | 
567 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
568 | dag_dir_list_interval = 300
569 | 
570 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats
571 | print_stats_interval = 30
572 | 
573 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold
574 | # ago (in seconds), scheduler is considered unhealthy.
575 | # This is used by the health check in the "/health" endpoint
576 | scheduler_health_check_threshold = 30
577 | child_process_log_directory = /usr/local/airflow/logs/scheduler
578 | 
579 | # Local task jobs periodically heartbeat to the DB. If the job has
580 | # not heartbeat in this many seconds, the scheduler will mark the
581 | # associated task instance as failed and will re-schedule the task.
582 | scheduler_zombie_task_threshold = 300
583 | 
584 | # Turn off scheduler catchup by setting this to False.
585 | # Default behavior is unchanged and
586 | # Command Line Backfills still work, but the scheduler
587 | # will not do scheduler catchup if this is False,
588 | # however it can be set on a per DAG basis in the
589 | # DAG definition (catchup)
590 | catchup_by_default = True
591 | 
592 | # This changes the batch size of queries in the scheduling main loop.
593 | # If this is too high, SQL query performance may be impacted by one
594 | # or more of the following:
595 | # - reversion to full table scan
596 | # - complexity of query predicate
597 | # - excessive locking
598 | # Additionally, you may hit the maximum allowable query length for your db.
599 | # Set this to 0 for no limit (not advised)
600 | max_tis_per_query = 512
601 | 
602 | # Statsd (https://github.com/etsy/statsd) integration settings
603 | statsd_on = False
604 | statsd_host = localhost
605 | statsd_port = 8125
606 | statsd_prefix = airflow
607 | 
608 | # If you want to avoid send all the available metrics to StatsD,
609 | # you can configure an allow list of prefixes to send only the metrics that
610 | # start with the elements of the list (e.g: scheduler,executor,dagrun)
611 | statsd_allow_list =
612 | 
613 | # The scheduler can run multiple threads in parallel to schedule dags.
614 | # This defines how many threads will run.
615 | max_threads = 2
616 | authenticate = False
617 | 
618 | # Turn off scheduler use of cron intervals by setting this to False.
619 | # DAGs submitted manually in the web UI or with trigger_dag will still run.
620 | use_job_schedule = True
621 | 
622 | # Allow externally triggered DagRuns for Execution Dates in the future
623 | # Only has effect if schedule_interval is set to None in DAG
624 | allow_trigger_in_future = False
625 | 
626 | [ldap]
627 | # set this to ldaps://<your.ldap.server>:<port>
628 | uri =
629 | user_filter = objectClass=*
630 | user_name_attr = uid
631 | group_member_attr = memberOf
632 | superuser_filter =
633 | data_profiler_filter =
634 | bind_user = cn=Manager,dc=example,dc=com
635 | bind_password = insecure
636 | basedn = dc=example,dc=com
637 | cacert = /etc/ca/ldap_ca.crt
638 | search_scope = LEVEL
639 | 
640 | # This setting allows the use of LDAP servers that either return a
641 | # broken schema, or do not return a schema.
642 | ignore_malformed_schema = False
643 | 
644 | [mesos]
645 | # Mesos master address which MesosExecutor will connect to.
646 | master = localhost:5050
647 | 
648 | # The framework name which Airflow scheduler will register itself as on mesos
649 | framework_name = Airflow
650 | 
651 | # Number of cpu cores required for running one task instance using
652 | # 'airflow run <dag_id> <task_id> <execution_date> --local -p <pickle_id>'
653 | # command on a mesos slave
654 | task_cpu = 1
655 | 
656 | # Memory in MB required for running one task instance using
657 | # 'airflow run <dag_id> <task_id> <execution_date> --local -p <pickle_id>'
658 | # command on a mesos slave
659 | task_memory = 256
660 | 
661 | # Enable framework checkpointing for mesos
662 | # See http://mesos.apache.org/documentation/latest/slave-recovery/
663 | checkpoint = False
664 | 
665 | # Failover timeout in milliseconds.
666 | # When checkpointing is enabled and this option is set, Mesos waits
667 | # until the configured timeout for
668 | # the MesosExecutor framework to re-register after a failover. Mesos
669 | # shuts down running tasks if the
670 | # MesosExecutor framework fails to re-register within this timeframe.
671 | # Example: failover_timeout = 604800
672 | # failover_timeout =
673 | 
674 | # Enable framework authentication for mesos
675 | # See http://mesos.apache.org/documentation/latest/configuration/
676 | authenticate = False
677 | 
678 | # Mesos credentials, if authentication is enabled
679 | # Example: default_principal = admin
680 | # default_principal =
681 | # Example: default_secret = admin
682 | # default_secret =
683 | 
684 | # Optional Docker Image to run on slave before running the command
685 | # This image should be accessible from mesos slave i.e mesos slave
686 | # should be able to pull this docker image before executing the command.
687 | # Example: docker_image_slave = puckel/docker-airflow
688 | # docker_image_slave =
689 | 
690 | [kerberos]
691 | ccache = /tmp/airflow_krb5_ccache
692 | 
693 | # gets augmented with fqdn
694 | principal = airflow
695 | reinit_frequency = 3600
696 | kinit_path = kinit
697 | keytab = airflow.keytab
698 | 
699 | [github_enterprise]
700 | api_rev = v3
701 | 
702 | [admin]
703 | # UI to hide sensitive variable fields when set to True
704 | hide_sensitive_variable_fields = True
705 | 
706 | [elasticsearch]
707 | # Elasticsearch host
708 | host =
709 | 
710 | # Format of the log_id, which is used to query for a given tasks logs
711 | log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}}
712 | 
713 | # Used to mark the end of a log stream for a task
714 | end_of_log_mark = end_of_log
715 | 
716 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id
717 | # Code will construct log_id using the log_id template from the argument above.
718 | # NOTE: The code will prefix the https:// automatically, don't include that here.
719 | frontend =
720 | 
721 | # Write the task logs to the stdout of the worker, rather than the default files
722 | write_stdout = False
723 | 
724 | # Instead of the default log formatter, write the log lines as JSON
725 | json_format = False
726 | 
727 | # Log fields to also attach to the json output, if enabled
728 | json_fields = asctime, filename, lineno, levelname, message
729 | 
730 | [elasticsearch_configs]
731 | use_ssl = False
732 | verify_certs = True
733 | 
734 | [kubernetes]
735 | # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run
736 | worker_container_repository =
737 | worker_container_tag =
738 | worker_container_image_pull_policy = IfNotPresent
739 | 
740 | # If True (default), worker pods will be deleted upon termination
741 | delete_worker_pods = True
742 | 
743 | # Number of Kubernetes Worker Pod creation calls per scheduler loop
744 | worker_pods_creation_batch_size = 1
745 | 
746 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
747 | namespace = default
748 | 
749 | # The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file)
750 | # Example: airflow_configmap = airflow-configmap
751 | airflow_configmap =
752 | 
753 | # The name of the Kubernetes ConfigMap containing ``airflow_local_settings.py`` file.
754 | #
755 | # For example:
756 | #
757 | # ``airflow_local_settings_configmap = "airflow-configmap"`` if you have the following ConfigMap.
758 | #
759 | # ``airflow-configmap.yaml``:
760 | #
761 | # .. code-block:: yaml
762 | #
763 | #   ---
764 | #   apiVersion: v1
765 | #   kind: ConfigMap
766 | #   metadata:
767 | #     name: airflow-configmap
768 | #   data:
769 | #     airflow_local_settings.py: |
770 | #         def pod_mutation_hook(pod):
771 | #             ...
772 | #     airflow.cfg: |
773 | #         ...
774 | # Example: airflow_local_settings_configmap = airflow-configmap
775 | airflow_local_settings_configmap =
776 | 
777 | # For docker image already contains DAGs, this is set to ``True``, and the worker will
778 | # search for dags in dags_folder,
779 | # otherwise use git sync or dags volume claim to mount DAGs
780 | dags_in_image = False
781 | 
782 | # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs
783 | dags_volume_subpath =
784 | 
785 | # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path)
786 | dags_volume_claim =
787 | 
788 | # For volume mounted logs, the worker will look in this subpath for logs
789 | logs_volume_subpath =
790 | 
791 | # A shared volume claim for the logs
792 | logs_volume_claim =
793 | 
794 | # For DAGs mounted via a hostPath volume (mutually exclusive with volume claim and git-sync)
795 | # Useful in local environment, discouraged in production
796 | dags_volume_host =
797 | 
798 | # A hostPath volume for the logs
799 | # Useful in local environment, discouraged in production
800 | logs_volume_host =
801 | 
802 | # A list of configMapsRefs to envFrom. If more than one configMap is
803 | # specified, provide a comma separated list: configmap_a,configmap_b
804 | env_from_configmap_ref =
805 | 
806 | # A list of secretRefs to envFrom. If more than one secret is
807 | # specified, provide a comma separated list: secret_a,secret_b
808 | env_from_secret_ref =
809 | 
810 | # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim)
811 | git_repo =
812 | git_branch =
813 | git_subpath =
814 | 
815 | # The specific rev or hash the git_sync init container will checkout
816 | # This becomes GIT_SYNC_REV environment variable in the git_sync init container for worker pods
817 | git_sync_rev =
818 | 
819 | # Use git_user and git_password for user authentication or git_ssh_key_secret_name
820 | # and git_ssh_key_secret_key for SSH authentication
821 | git_user =
822 | git_password =
823 | git_sync_root = /git
824 | git_sync_dest = repo
825 | 
826 | # Mount point of the volume if git-sync is being used.
827 | # i.e. /usr/local/airflow/dags
828 | git_dags_folder_mount_point =
829 | 
830 | # To get Git-sync SSH authentication set up follow this format
831 | #
832 | # ``airflow-secrets.yaml``:
833 | #
834 | # .. code-block:: yaml
835 | #
836 | #   ---
837 | #   apiVersion: v1
838 | #   kind: Secret
839 | #   metadata:
840 | #     name: airflow-secrets
841 | #   data:
842 | #     # key needs to be gitSshKey
843 | #     gitSshKey: <base64_encoded_data>
844 | # Example: git_ssh_key_secret_name = airflow-secrets
845 | git_ssh_key_secret_name =
846 | 
847 | # To get Git-sync SSH authentication set up follow this format
848 | #
849 | # ``airflow-configmap.yaml``:
850 | #
851 | # .. code-block:: yaml
852 | #
853 | #   ---
854 | #   apiVersion: v1
855 | #   kind: ConfigMap
856 | #   metadata:
857 | #     name: airflow-configmap
858 | #   data:
859 | #     known_hosts: |
860 | #         github.com ssh-rsa <...>
861 | #     airflow.cfg: |
862 | #         ...
863 | # Example: git_ssh_known_hosts_configmap_name = airflow-configmap
864 | git_ssh_known_hosts_configmap_name =
865 | 
866 | # To give the git_sync init container credentials via a secret, create a secret
867 | # with two fields: GIT_SYNC_USERNAME and GIT_SYNC_PASSWORD (example below) and
868 | # add ``git_sync_credentials_secret = <secret_name>`` to your airflow config under the
869 | # ``kubernetes`` section
870 | #
871 | # Secret Example:
872 | #
873 | # .. code-block:: yaml
874 | #
875 | #   ---
876 | #   apiVersion: v1
877 | #   kind: Secret
878 | #   metadata:
879 | #     name: git-credentials
880 | #   data:
881 | #     GIT_SYNC_USERNAME: <base64_encoded_git_username>
882 | #     GIT_SYNC_PASSWORD: <base64_encoded_git_password>
883 | git_sync_credentials_secret =
884 | 
885 | # For cloning DAGs from git repositories into volumes: https://github.com/kubernetes/git-sync
886 | git_sync_container_repository = k8s.gcr.io/git-sync
887 | git_sync_container_tag = v3.1.1
888 | git_sync_init_container_name = git-sync-clone
889 | git_sync_run_as_user = 65533
890 | 
891 | # The name of the Kubernetes service account to be associated with airflow workers, if any.
892 | # Service accounts are required for workers that require access to secrets or cluster resources.
893 | # See the Kubernetes RBAC documentation for more:
894 | # https://kubernetes.io/docs/admin/authorization/rbac/
895 | worker_service_account_name =
896 | 
897 | # Any image pull secrets to be given to worker pods, If more than one secret is
898 | # required, provide a comma separated list: secret_a,secret_b
899 | image_pull_secrets =
900 | 
901 | # GCP Service Account Keys to be provided to tasks run on Kubernetes Executors
902 | # Should be supplied in the format: key-name-1:key-path-1,key-name-2:key-path-2
903 | gcp_service_account_keys =
904 | 
905 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster.
906 | # It's intended for clients that expect to be running inside a pod running on kubernetes.
907 | # It will raise an exception if called from a process not running in a kubernetes environment.
908 | in_cluster = True
909 | 
910 | # When running with in_cluster=False change the default cluster_context or config_file
911 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has.
912 | # cluster_context =
913 | # config_file =
914 | 
915 | # Affinity configuration as a single line formatted JSON object.
916 | # See the affinity model for top-level key names (e.g. ``nodeAffinity``, etc.):
917 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core
918 | affinity =
919 | 
920 | # A list of toleration objects as a single line formatted JSON array
921 | # See:
922 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core
923 | tolerations =
924 | 
925 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods
926 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string.
927 | # List of supported params are similar for all core_v1_apis, hence a single config
928 | # variable for all apis.
929 | # See:
930 | # https://raw.githubusercontent.com/kubernetes-client/python/master/kubernetes/client/apis/core_v1_api.py
931 | # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely
932 | # for kubernetes api responses, which will cause the scheduler to hang.
933 | # The timeout is specified as [connect timeout, read timeout]
934 | kube_client_request_args = {{"_request_timeout" : [60,60] }}
935 | 
936 | # Specifies the uid to run the first process of the worker pods containers as
937 | run_as_user =
938 | 
939 | # Specifies a gid to associate with all containers in the worker pods
940 | # if using a git_ssh_key_secret_name use an fs_group
941 | # that allows for the key to be read, e.g. 65533
942 | fs_group =
943 | 
944 | [kubernetes_node_selectors]
945 | 
946 | # The Key-value pairs to be given to worker pods.
947 | # The worker pods will be scheduled to the nodes of the specified key-value pairs.
948 | # Should be supplied in the format: key = value
949 | 
950 | [kubernetes_annotations]
951 | 
952 | # The Key-value annotations pairs to be given to worker pods.
953 | # Should be supplied in the format: key = value
954 | 
955 | [kubernetes_environment_variables]
956 | 
957 | # The scheduler sets the following environment variables into your workers. You may define as
958 | # many environment variables as needed and the kubernetes launcher will set them in the launched workers.
959 | # Environment variables in this section are defined as follows
960 | # ``<environment_variable_key> = <environment_variable_value>``
961 | #
962 | # For example if you wanted to set an environment variable with value `prod` and key
963 | # ``ENVIRONMENT`` you would follow the following format:
964 | # ENVIRONMENT = prod
965 | #
966 | # Additionally you may override worker airflow settings with the ``AIRFLOW__<SECTION>__<KEY>``
967 | # formatting as supported by airflow normally.
968 | 
969 | [kubernetes_secrets]
970 | 
971 | # The scheduler mounts the following secrets into your workers as they are launched by the
972 | # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the
973 | # defined secrets and mount them as secret environment variables in the launched workers.
974 | # Secrets in this section are defined as follows
975 | # ``<environment_variable_mount> = <kubernetes_secret_object>=<kubernetes_secret_key>``
976 | #
977 | # For example if you wanted to mount a kubernetes secret key named ``postgres_password`` from the
978 | # kubernetes secret object ``airflow-secret`` as the environment variable ``POSTGRES_PASSWORD`` into
979 | # your workers you would follow the following format:
980 | # ``POSTGRES_PASSWORD = airflow-secret=postgres_credentials``
981 | #
982 | # Additionally you may override worker airflow settings with the ``AIRFLOW__<SECTION>__<KEY>``
983 | # formatting as supported by airflow normally.
984 | 
985 | [kubernetes_labels]
986 | 
987 | # The Key-value pairs to be given to worker pods.
988 | # The worker pods will be given these static labels, as well as some additional dynamic labels
989 | # to identify the task.
990 | # Should be supplied in the format: ``key = value``
991 | 


--------------------------------------------------------------------------------
/docker/airflow/dockerfile:
--------------------------------------------------------------------------------
 1 | # VERSION 1.10.14
 2 | # AUTHOR: Pathairush S.
 3 | # DESCRIPTION: Airflow with sqoop, spark, hive components
 4 | # BUILD: docker build --rm -t ppatcoding/docker-airflow .
 5 | # BASED ON: https://github.com/puckel/docker-airflow
 6 | 
 7 | FROM ppatcoding/hadoop-base:0.1
 8 | LABEL maintainer="Pat"
 9 | 
10 | # Never prompt the user for choices on installation/configuration of packages
11 | ENV DEBIAN_FRONTEND noninteractive
12 | ENV TERM linux
13 | 
14 | # Airflow
15 | ARG AIRFLOW_VERSION=1.10.14
16 | ARG AIRFLOW_USER_HOME=/usr/local/airflow
17 | ARG AIRFLOW_DEPS=""
18 | ARG PYTHON_DEPS=""
19 | ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME}
20 | 
21 | # Define en_US.
22 | ENV LANGUAGE en_US.UTF-8
23 | ENV LANG en_US.UTF-8
24 | ENV LC_ALL en_US.UTF-8
25 | ENV LC_CTYPE en_US.UTF-8
26 | ENV LC_MESSAGES en_US.UTF-8
27 | 
28 | # Disable noisy "Handling signal" log messages:
29 | ENV GUNICORN_CMD_ARGS --log-level WARNING
30 | 
31 | RUN set -ex \
32 |     && buildDeps=' \
33 |         build-essential \
34 |         libblas-dev \
35 |         libatlas-base-dev \
36 |         freetds-dev \
37 |         libkrb5-dev \
38 |         libsasl2-dev \
39 |         libssl-dev \
40 |         libffi-dev \
41 |         libpq-dev \
42 |         git \
43 |     ' \
44 |     && apt-get update -yqq \
45 |     && apt-get upgrade -yqq \
46 |     && apt-get install -yqq --no-install-recommends \
47 |         $buildDeps \
48 |         freetds-bin \
49 |         build-essential \
50 |         default-libmysqlclient-dev \
51 |         apt-utils \
52 |         curl \
53 |         rsync \
54 |         netcat \
55 |         locales \
56 |         gcc \
57 |         libpq5 \
58 |     && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
59 |     && locale-gen \
60 |     && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
61 |     && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \
62 |     && pip install -U pip setuptools wheel \
63 |     && pip install pytz \
64 |     && pip install pyOpenSSL \
65 |     && pip install ndg-httpsclient \
66 |     && pip install pyasn1 \
67 |     && pip install cython \
68 |     && pip install apache-airflow[crypto,celery,postgres,hive,spark,sqoop,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \
69 |     && pip install 'redis==3.2' \
70 |     && pip install SQLAlchemy==1.3.23 Flask-SQLAlchemy==2.4.4 \
71 |     && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \
72 |     && apt-get purge --auto-remove -yqq $buildDeps \
73 |     && apt-get autoremove -yqq --purge \
74 |     && apt-get clean \
75 |     && rm -rf \
76 |         /var/lib/apt/lists/* \
77 |         /tmp/* \
78 |         /var/tmp/* \
79 |         /usr/share/man \
80 |         /usr/share/doc \
81 |         /usr/share/doc-base
82 | 
83 | ADD entrypoint.sh /entrypoint.sh
84 | ADD airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg
85 | 
86 | RUN chown -R airflow: ${AIRFLOW_USER_HOME}
87 | 
88 | EXPOSE 8080 5555 8793
89 | 
90 | USER airflow
91 | WORKDIR ${AIRFLOW_USER_HOME}
92 | ENTRYPOINT ["/entrypoint.sh"]
93 | CMD ["webserver"]


--------------------------------------------------------------------------------
/docker/airflow/entrypoint.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # User-provided configuration must always be respected.
  4 | #
  5 | # Therefore, this script must only derives Airflow AIRFLOW__ variables from other variables
  6 | # when the user did not provide their own configuration.
  7 | 
  8 | TRY_LOOP="20"
  9 | 
 10 | # Global defaults and back-compat
 11 | : "${AIRFLOW_HOME:="/usr/local/airflow"}"
 12 | : "${AIRFLOW__CORE__FERNET_KEY:=${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}}"
 13 | : "${AIRFLOW__CORE__EXECUTOR:=${EXECUTOR:-Sequential}Executor}"
 14 | 
 15 | # Load DAGs examples (default: Yes)
 16 | if [[ -z "$AIRFLOW__CORE__LOAD_EXAMPLES" && "${LOAD_EX:=n}" == n ]]; then
 17 |   AIRFLOW__CORE__LOAD_EXAMPLES=False
 18 | fi
 19 | 
 20 | export \
 21 |   AIRFLOW_HOME \
 22 |   AIRFLOW__CORE__EXECUTOR \
 23 |   AIRFLOW__CORE__FERNET_KEY \
 24 |   AIRFLOW__CORE__LOAD_EXAMPLES \
 25 | 
 26 | # Install custom python package if requirements.txt is present
 27 | if [ -e "/requirements.txt" ]; then
 28 |     $(command -v pip) install --user -r /requirements.txt
 29 | fi
 30 | 
 31 | wait_for_port() {
 32 |   local name="$1" host="$2" port="$3"
 33 |   local j=0
 34 |   while ! nc -z "$host" "$port" >/dev/null 2>&1 < /dev/null; do
 35 |     j=$((j+1))
 36 |     if [ $j -ge $TRY_LOOP ]; then
 37 |       echo >&2 "$(date) - $host:$port still not reachable, giving up"
 38 |       exit 1
 39 |     fi
 40 |     echo "$(date) - waiting for $name... $j/$TRY_LOOP"
 41 |     sleep 5
 42 |   done
 43 | }
 44 | 
 45 | # Other executors than SequentialExecutor drive the need for an SQL database, here PostgreSQL is used
 46 | if [ "$AIRFLOW__CORE__EXECUTOR" != "SequentialExecutor" ]; then
 47 |   # Check if the user has provided explicit Airflow configuration concerning the database
 48 |   if [ -z "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" ]; then
 49 |     # Default values corresponding to the default compose files
 50 |     : "${POSTGRES_HOST:="postgres"}"
 51 |     : "${POSTGRES_PORT:="5432"}"
 52 |     : "${POSTGRES_USER:="airflow"}"
 53 |     : "${POSTGRES_PASSWORD:="airflow"}"
 54 |     : "${POSTGRES_DB:="airflow"}"
 55 |     : "${POSTGRES_EXTRAS:-""}"
 56 | 
 57 |     AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}"
 58 |     export AIRFLOW__CORE__SQL_ALCHEMY_CONN
 59 | 
 60 |     # Check if the user has provided explicit Airflow configuration for the broker's connection to the database
 61 |     if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then
 62 |       AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}"
 63 |       export AIRFLOW__CELERY__RESULT_BACKEND
 64 |     fi
 65 |   else
 66 |     if [[ "$AIRFLOW__CORE__EXECUTOR" == "CeleryExecutor" && -z "$AIRFLOW__CELERY__RESULT_BACKEND" ]]; then
 67 |       >&2 printf '%s\n' "FATAL: if you set AIRFLOW__CORE__SQL_ALCHEMY_CONN manually with CeleryExecutor you must also set AIRFLOW__CELERY__RESULT_BACKEND"
 68 |       exit 1
 69 |     fi
 70 | 
 71 |     # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user
 72 |     POSTGRES_ENDPOINT=$(echo -n "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" | cut -d '/' -f3 | sed -e 's,.*@,,')
 73 |     POSTGRES_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1)
 74 |     POSTGRES_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2)
 75 |   fi
 76 | 
 77 |   wait_for_port "Postgres" "$POSTGRES_HOST" "$POSTGRES_PORT"
 78 | fi
 79 | 
 80 | # CeleryExecutor drives the need for a Celery broker, here Redis is used
 81 | if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then
 82 |   # Check if the user has provided explicit Airflow configuration concerning the broker
 83 |   if [ -z "$AIRFLOW__CELERY__BROKER_URL" ]; then
 84 |     # Default values corresponding to the default compose files
 85 |     : "${REDIS_PROTO:="redis://"}"
 86 |     : "${REDIS_HOST:="redis"}"
 87 |     : "${REDIS_PORT:="6379"}"
 88 |     : "${REDIS_PASSWORD:=""}"
 89 |     : "${REDIS_DBNUM:="1"}"
 90 | 
 91 |     # When Redis is secured by basic auth, it does not handle the username part of basic auth, only a token
 92 |     if [ -n "$REDIS_PASSWORD" ]; then
 93 |       REDIS_PREFIX=":${REDIS_PASSWORD}@"
 94 |     else
 95 |       REDIS_PREFIX=
 96 |     fi
 97 | 
 98 |     AIRFLOW__CELERY__BROKER_URL="${REDIS_PROTO}${REDIS_PREFIX}${REDIS_HOST}:${REDIS_PORT}/${REDIS_DBNUM}"
 99 |     export AIRFLOW__CELERY__BROKER_URL
100 |   else
101 |     # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user
102 |     REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,')
103 |     REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1)
104 |     REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2)
105 |   fi
106 | 
107 |   wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT"
108 | fi
109 | 
110 | case "$1" in
111 |   webserver)
112 |     airflow initdb
113 |     if [ "$AIRFLOW__CORE__EXECUTOR" = "LocalExecutor" ] || [ "$AIRFLOW__CORE__EXECUTOR" = "SequentialExecutor" ]; then
114 |       # With the "Local" and "Sequential" executors it should all run in one container.
115 |       airflow scheduler &
116 |     fi
117 |     exec airflow webserver
118 |     ;;
119 |   worker|scheduler)
120 |     # Give the webserver time to run initdb.
121 |     sleep 10
122 |     exec airflow "$@"
123 |     ;;
124 |   flower)
125 |     sleep 10
126 |     exec airflow "$@"
127 |     ;;
128 |   version)
129 |     exec airflow "$@"
130 |     ;;
131 |   *)
132 |     # The command is something like bash, not an airflow subcommand. Just run it in the right environment.
133 |     exec "$@"
134 |     ;;
135 | esac
136 | 


--------------------------------------------------------------------------------
/docker/hadoop/conf/beeline-log4j2.properties:
--------------------------------------------------------------------------------
 1 | status = INFO
 2 | name = BeelineLog4j2
 3 | packages = org.apache.hadoop.hive.ql.log
 4 | 
 5 | # list of properties
 6 | property.hive.log.level = WARN
 7 | property.hive.root.logger = console
 8 | 
 9 | # list of all appenders
10 | appenders = console
11 | 
12 | # console appender
13 | appender.console.type = Console
14 | appender.console.name = console
15 | appender.console.target = SYSTEM_ERR
16 | appender.console.layout.type = PatternLayout
17 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
18 | 
19 | # list of all loggers
20 | loggers = HiveConnection
21 | 
22 | # HiveConnection logs useful info for dynamic service discovery
23 | logger.HiveConnection.name = org.apache.hive.jdbc.HiveConnection
24 | logger.HiveConnection.level = INFO
25 | 
26 | # root logger
27 | rootLogger.level = ${sys:hive.log.level}
28 | rootLogger.appenderRefs = root
29 | rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}


--------------------------------------------------------------------------------
/docker/hadoop/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 | </configuration>


--------------------------------------------------------------------------------
/docker/hadoop/conf/hive-env.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pathairush/airflow_hive_spark_sqoop/79c435fe997846dba48e277bba151a4080137166/docker/hadoop/conf/hive-env.sh


--------------------------------------------------------------------------------
/docker/hadoop/conf/hive-exec-log4j2.properties:
--------------------------------------------------------------------------------
 1 | status = INFO
 2 | name = HiveExecLog4j2
 3 | packages = org.apache.hadoop.hive.ql.log
 4 | 
 5 | # list of properties
 6 | property.hive.log.level = INFO
 7 | property.hive.root.logger = FA
 8 | property.hive.query.id = hadoop
 9 | property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
10 | property.hive.log.file = ${sys:hive.query.id}.log
11 | 
12 | # list of all appenders
13 | appenders = console, FA
14 | 
15 | # console appender
16 | appender.console.type = Console
17 | appender.console.name = console
18 | appender.console.target = SYSTEM_ERR
19 | appender.console.layout.type = PatternLayout
20 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
21 | 
22 | # simple file appender
23 | appender.FA.type = File
24 | appender.FA.name = FA
25 | appender.FA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
26 | appender.FA.layout.type = PatternLayout
27 | appender.FA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
28 | 
29 | # list of all loggers
30 | loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
31 | 
32 | logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
33 | logger.NIOServerCnxn.level = WARN
34 | 
35 | logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
36 | logger.ClientCnxnSocketNIO.level = WARN
37 | 
38 | logger.DataNucleus.name = DataNucleus
39 | logger.DataNucleus.level = ERROR
40 | 
41 | logger.Datastore.name = Datastore
42 | logger.Datastore.level = ERROR
43 | 
44 | logger.JPOX.name = JPOX
45 | logger.JPOX.level = ERROR
46 | 
47 | # root logger
48 | rootLogger.level = ${sys:hive.log.level}
49 | rootLogger.appenderRefs = root
50 | rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}


--------------------------------------------------------------------------------
/docker/hadoop/conf/hive-log4j2.properties:
--------------------------------------------------------------------------------
 1 | status = INFO
 2 | name = HiveLog4j2
 3 | packages = org.apache.hadoop.hive.ql.log
 4 | 
 5 | # list of properties
 6 | property.hive.log.level = INFO
 7 | property.hive.root.logger = DRFA
 8 | property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
 9 | property.hive.log.file = hive.log
10 | 
11 | # list of all appenders
12 | appenders = console, DRFA
13 | 
14 | # console appender
15 | appender.console.type = Console
16 | appender.console.name = console
17 | appender.console.target = SYSTEM_ERR
18 | appender.console.layout.type = PatternLayout
19 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
20 | 
21 | # daily rolling file appender
22 | appender.DRFA.type = RollingFile
23 | appender.DRFA.name = DRFA
24 | appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
25 | # Use %pid in the filePattern to append <process-id>@<host-name> to the filename if you want separate log files for different CLI session
26 | appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd}
27 | appender.DRFA.layout.type = PatternLayout
28 | appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
29 | appender.DRFA.policies.type = Policies
30 | appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy
31 | appender.DRFA.policies.time.interval = 1
32 | appender.DRFA.policies.time.modulate = true
33 | appender.DRFA.strategy.type = DefaultRolloverStrategy
34 | appender.DRFA.strategy.max = 30
35 | 
36 | # list of all loggers
37 | loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
38 | 
39 | logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
40 | logger.NIOServerCnxn.level = WARN
41 | 
42 | logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
43 | logger.ClientCnxnSocketNIO.level = WARN
44 | 
45 | logger.DataNucleus.name = DataNucleus
46 | logger.DataNucleus.level = ERROR
47 | 
48 | logger.Datastore.name = Datastore
49 | logger.Datastore.level = ERROR
50 | 
51 | logger.JPOX.name = JPOX
52 | logger.JPOX.level = ERROR
53 | 
54 | # root logger
55 | rootLogger.level = ${sys:hive.log.level}
56 | rootLogger.appenderRefs = root
57 | rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}


--------------------------------------------------------------------------------
/docker/hadoop/conf/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 | </configuration>


--------------------------------------------------------------------------------
/docker/hadoop/conf/ivysettings.xml:
--------------------------------------------------------------------------------
 1 | <!--
 2 |    Licensed to the Apache Software Foundation (ASF) under one or more
 3 |    contributor license agreements.  See the NOTICE file distributed with
 4 |    this work for additional information regarding copyright ownership.
 5 |    The ASF licenses this file to You under the Apache License, Version 2.0
 6 |    (the "License"); you may not use this file except in compliance with
 7 |    the License.  You may obtain a copy of the License at
 8 |        http://www.apache.org/licenses/LICENSE-2.0
 9 |    Unless required by applicable law or agreed to in writing, software
10 |    distributed under the License is distributed on an "AS IS" BASIS,
11 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |    See the License for the specific language governing permissions and
13 |    limitations under the License.
14 |    -->
15 | 
16 | <!--This file is used by grapes to download dependencies from a maven repository.
17 |     This is just a template and can be edited to add more repositories.
18 | -->
19 | 
20 | <ivysettings>
21 |   <!--name of the defaultResolver should always be 'downloadGrapes'. -->
22 |   <settings defaultResolver="downloadGrapes"/>
23 |   <!-- Only set maven.local.repository if not already set -->
24 |   <property name="maven.local.repository" value="${user.home}/.m2/repository" override="false" />
25 |   <property name="m2-pattern"
26 |             value="file:${maven.local.repository}/[organisation]/[module]/[revision]/[module]-[revision](-[classifier]).[ext]"
27 |             override="false"/>
28 |   <resolvers>
29 |     <!-- more resolvers can be added here -->
30 |     <chain name="downloadGrapes">
31 |       <!-- This resolver uses ibiblio to find artifacts, compatible with maven2 repository -->
32 |       <ibiblio name="central" m2compatible="true"/>
33 |       <url name="local-maven2" m2compatible="true">
34 |         <artifact pattern="${m2-pattern}"/>
35 |       </url>
36 |       <!-- File resolver to add jars from the local system. -->
37 |       <filesystem name="test" checkmodified="true">
38 |         <artifact pattern="/tmp/[module]-[revision](-[classifier]).jar"/>
39 |       </filesystem>
40 | 
41 |     </chain>
42 |   </resolvers>
43 | </ivysettings>


--------------------------------------------------------------------------------
/docker/hadoop/conf/llap-daemon-log4j2.properties:
--------------------------------------------------------------------------------
 1 | status = INFO
 2 | name = LlapDaemonLog4j2
 3 | packages = org.apache.hadoop.hive.ql.log
 4 | 
 5 | # list of properties
 6 | property.llap.daemon.log.level = INFO
 7 | property.llap.daemon.root.logger = console
 8 | property.llap.daemon.log.dir = .
 9 | property.llap.daemon.log.file = llapdaemon.log
10 | property.llap.daemon.historylog.file = llapdaemon_history.log
11 | property.llap.daemon.log.maxfilesize = 256MB
12 | property.llap.daemon.log.maxbackupindex = 20
13 | 
14 | # list of all appenders
15 | appenders = console, RFA, HISTORYAPPENDER
16 | 
17 | # console appender
18 | appender.console.type = Console
19 | appender.console.name = console
20 | appender.console.target = SYSTEM_ERR
21 | appender.console.layout.type = PatternLayout
22 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t%x] %p %c{2} : %m%n
23 | 
24 | # rolling file appender
25 | appender.RFA.type = RollingFile
26 | appender.RFA.name = RFA
27 | appender.RFA.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}
28 | appender.RFA.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}_%i
29 | appender.RFA.layout.type = PatternLayout
30 | appender.RFA.layout.pattern = %d{ISO8601} %-5p [%t%x]: %c{2} (%F:%M(%L)) - %m%n
31 | appender.RFA.policies.type = Policies
32 | appender.RFA.policies.size.type = SizeBasedTriggeringPolicy
33 | appender.RFA.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
34 | appender.RFA.strategy.type = DefaultRolloverStrategy
35 | appender.RFA.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
36 | 
37 | # history file appender
38 | appender.HISTORYAPPENDER.type = RollingFile
39 | appender.HISTORYAPPENDER.name = HISTORYAPPENDER
40 | appender.HISTORYAPPENDER.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}
41 | appender.HISTORYAPPENDER.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}_%i
42 | appender.HISTORYAPPENDER.layout.type = PatternLayout
43 | appender.HISTORYAPPENDER.layout.pattern = %m%n
44 | appender.HISTORYAPPENDER.policies.type = Policies
45 | appender.HISTORYAPPENDER.policies.size.type = SizeBasedTriggeringPolicy
46 | appender.HISTORYAPPENDER.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
47 | appender.HISTORYAPPENDER.strategy.type = DefaultRolloverStrategy
48 | appender.HISTORYAPPENDER.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
49 | 
50 | # list of all loggers
51 | loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, HistoryLogger
52 | 
53 | logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
54 | logger.NIOServerCnxn.level = WARN
55 | 
56 | logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
57 | logger.ClientCnxnSocketNIO.level = WARN
58 | 
59 | logger.DataNucleus.name = DataNucleus
60 | logger.DataNucleus.level = ERROR
61 | 
62 | logger.Datastore.name = Datastore
63 | logger.Datastore.level = ERROR
64 | 
65 | logger.JPOX.name = JPOX
66 | logger.JPOX.level = ERROR
67 | 
68 | logger.HistoryLogger.name = org.apache.hadoop.hive.llap.daemon.HistoryLogger
69 | logger.HistoryLogger.level = INFO
70 | logger.HistoryLogger.additivity = false
71 | logger.HistoryLogger.appenderRefs = HistoryAppender
72 | logger.HistoryLogger.appenderRef.HistoryAppender.ref = HISTORYAPPENDER
73 | 
74 | # root logger
75 | rootLogger.level = ${sys:llap.daemon.log.level}
76 | rootLogger.appenderRefs = root
77 | rootLogger.appenderRef.root.ref = ${sys:llap.daemon.root.logger}


--------------------------------------------------------------------------------
/docker/hadoop/conf/sqoop-env.sh:
--------------------------------------------------------------------------------
 1 | # Set Hadoop-specific environment variables here.
 2 | 
 3 | #Set path to where bin/hadoop is available
 4 | export HADOOP_VERSION=3.2.1
 5 | export HADOOP_COMMON_HOME=/opt/hadoop-$HADOOP_VERSION
 6 | 
 7 | #Set path to where hadoop-*-core.jar is available
 8 | export HADOOP_MAPRED_HOME=/opt/hadoop-$HADOOP_VERSION
 9 | 
10 | #Set the path to where bin/hive is available
11 | export HIVE_HOME=/opt/hive


--------------------------------------------------------------------------------
/docker/hadoop/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 | </configuration>


--------------------------------------------------------------------------------
/docker/hadoop/dockerfile:
--------------------------------------------------------------------------------
  1 | FROM python:3.6-stretch
  2 | 
  3 | # --------------------------------------------------------
  4 | # JAVA
  5 | # --------------------------------------------------------
  6 | RUN apt-get update && apt-get install -y --no-install-recommends \
  7 |     openjdk-8-jdk
  8 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-arm64/
  9 | 
 10 | # --------------------------------------------------------
 11 | # HADOOP
 12 | # --------------------------------------------------------
 13 | ENV HADOOP_VERSION=3.2.1
 14 | ENV HADOOP_URL=https://downloads.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
 15 | ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
 16 | ENV HADOOP_CONF_DIR=/etc/hadoop
 17 | ENV MULTIHOMED_NETWORK=1
 18 | ENV USER=root
 19 | ENV PATH $HADOOP_PREFIX/bin/:$PATH
 20 | 
 21 | RUN set -x \
 22 |     && curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \
 23 |     && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
 24 |     && rm /tmp/hadoop.tar.gz*
 25 | 
 26 | RUN ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop
 27 | RUN mkdir /opt/hadoop-$HADOOP_VERSION/logs
 28 | RUN mkdir /hadoop-data
 29 | 
 30 | # --------------------------------------------------------
 31 | # HIVE
 32 | # --------------------------------------------------------
 33 | 
 34 | ENV HIVE_VERSION=3.1.2
 35 | ENV POSTGRES_JAR_VERSION=42.2.20
 36 | ENV HIVE_HOME=/opt/hive
 37 | ENV PATH=$HIVE_HOME/bin:$PATH
 38 | ENV HADOOP_HOME=/opt/hadoop-$HADOOP_VERSION
 39 | 
 40 | WORKDIR /opt
 41 | 
 42 | RUN apt-get update && apt-get install -y wget procps && \
 43 | 	wget http://www-us.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz && \
 44 | 	tar -xzvf apache-hive-$HIVE_VERSION-bin.tar.gz && \
 45 | 	mv apache-hive-$HIVE_VERSION-bin hive && \
 46 | 	wget https://jdbc.postgresql.org/download/postgresql-$POSTGRES_JAR_VERSION.jar -O $HIVE_HOME/lib/postgresql-jdbc.jar && \
 47 | 	rm apache-hive-$HIVE_VERSION-bin.tar.gz && \
 48 | 	apt-get --purge remove -y wget && \
 49 | 	apt-get clean && \
 50 | 	rm -rf /var/lib/apt/lists/*
 51 | 
 52 | ADD conf/hive-site.xml $HIVE_HOME/conf
 53 | ADD conf/beeline-log4j2.properties $HIVE_HOME/conf
 54 | ADD conf/hive-exec-log4j2.properties $HIVE_HOME/conf
 55 | ADD conf/hive-log4j2.properties $HIVE_HOME/conf
 56 | ADD conf/llap-daemon-log4j2.properties $HIVE_HOME/conf
 57 | ADD conf/ivysettings.xml $HIVE_HOME/conf
 58 | ADD conf/hive-env.sh $HIVE_HOME/conf
 59 | 
 60 | # Work around for guava dependency [HIVE-22915 issues]
 61 | RUN rm /opt/hive/lib/guava-19.0.jar
 62 | RUN cp /opt/hadoop-$HADOOP_VERSION/share/hadoop/hdfs/lib/guava-27.0-jre.jar /opt/hive/lib/
 63 | 
 64 | # --------------------------------------------------------
 65 | # SQOOP
 66 | # --------------------------------------------------------
 67 | 
 68 | ENV SQOOP_VERSION=1.4.7
 69 | ENV SQOOP_HOME=/opt/sqoop-${SQOOP_VERSION}.bin__hadoop-2.6.0
 70 | ENV PATH=$PATH:${SQOOP_HOME}/bin
 71 | 
 72 | RUN curl -fSL https://downloads.apache.org/sqoop/${SQOOP_VERSION}/sqoop-${SQOOP_VERSION}.bin__hadoop-2.6.0.tar.gz -o /tmp/sqoop.tar.gz \
 73 |     && tar -xvf /tmp/sqoop.tar.gz -C /opt/ \
 74 |     && rm /tmp/sqoop.tar.gz
 75 | 
 76 | ADD conf/sqoop-env.sh ${SQOOP_HOME}/conf
 77 | RUN cp $HIVE_HOME/lib/postgresql-jdbc.jar ${SQOOP_HOME}/lib/postgresql-jdbc.jar
 78 | 
 79 | # --------------------------------------------------------
 80 | # SPARK
 81 | # --------------------------------------------------------
 82 | 
 83 | ENV SPARK_VERSION=spark-3.1.1-bin-hadoop3.2
 84 | ENV SPARK_URL https://www.apache.org/dist/spark/spark-3.1.1/${SPARK_VERSION}.tgz
 85 | ENV SPARK_HOME=/opt/$SPARK_VERSION
 86 | ENV PATH $SPARK_HOME/bin:$PATH
 87 | ENV HADOOP_CONF_DIR=$SPARK_HOME/conf
 88 | ENV PYSPARK_PYTHON=python3
 89 | ENV PYTHONHASHSEED=1
 90 | 
 91 | RUN set -x \
 92 |     && curl -fSL "${SPARK_URL}" -o /tmp/spark.tar.gz \
 93 |     && tar -xvzf /tmp/spark.tar.gz -C /opt/ \
 94 |     && rm /tmp/spark.tar.gz*
 95 | 
 96 | ADD conf/hive-site.xml $SPARK_HOME/conf
 97 | ADD conf/core-site.xml $SPARK_HOME/conf
 98 | ADD conf/yarn-site.xml $SPARK_HOME/conf
 99 | 
100 | # --------------------------------------------------------
101 | # ENVIRONMENT FIELD CONFIGURATION
102 | # --------------------------------------------------------
103 | 
104 | ADD entrypoint.sh /entrypoint.sh
105 | RUN chmod a+x /entrypoint.sh
106 | 
107 | ENTRYPOINT ["/entrypoint.sh"]


--------------------------------------------------------------------------------
/docker/hadoop/entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | function addProperty() {
 4 |   local path=$1
 5 |   local name=$2
 6 |   local value=$3
 7 | 
 8 |   local entry="<property><name>$name</name><value>${value}</value></property>"
 9 |   local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
10 |   sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
11 | }
12 | 
13 | function configure() {
14 |    local path=$1
15 |    local module=$2
16 |    local envPrefix=$3
17 | 
18 |    echo "Configuring $module"
19 |    for c in `printenv | grep "^$3.*"`
20 |    do
21 |       echo " Found env $c"
22 |       env_name=${c%=*}
23 |       env_value=${c#*=}
24 |       config_name=${env_name#$3_}
25 |       config_value=$env_value
26 |       echo "  config_name: $config_name   config_value: $config_value"
27 |       property_name=`echo $config_name | sed 's/___/-/g' | sed 's/__/_/g' | sed 's/_/./g'`
28 |       property_value=$config_value
29 |       echo "  -setting config $property_name=$property_value in $path"
30 |       addProperty $path $property_name $property_value
31 |    done
32 | }
33 | 
34 | # --------------------------------------------------------
35 | # HADOOP
36 | # --------------------------------------------------------
37 | configure /etc/hadoop/core-site.xml core CORE_CONF
38 | configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
39 | configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
40 | configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF
41 | 
42 | if [ "$MULTIHOMED_NETWORK" = "1" ]; then
43 |     echo "Configuring for multihomed network"
44 | 
45 |     # HDFS
46 |     addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
47 |     addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
48 |     addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
49 |     addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
50 |     addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
51 |     addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
52 | 
53 |     # YARN
54 |     addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
55 |     addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
56 |     addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
57 |     addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
58 | 
59 |     # MAPRED
60 |     addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
61 | fi
62 | 
63 | # --------------------------------------------------------
64 | # HIVE
65 | # --------------------------------------------------------
66 | configure /opt/hive/conf/hive-site.xml hive HIVE_SITE_CONF
67 | 
68 | # --------------------------------------------------------
69 | # SPARK
70 | # --------------------------------------------------------
71 | configure $SPARK_HOME/conf/core-site.xml core CORE_CONF
72 | configure $SPARK_HOME/conf/yarn-site.xml yarn YARN_CONF
73 | configure $SPARK_HOME/conf/hive-site.xml hive HIVE_SITE_CONF
74 | 
75 | exec $@


--------------------------------------------------------------------------------