├── _config.yml
├── googled57bdb220576a44a.html
├── code
│   └── apps
│       ├── livy-server
│       │   ├── Dockerfile
│       │   └── conf
│       │       ├── livy-env.sh
│       │       └── livy.conf
│       ├── spark
│       │   ├── spark-defaults.conf
│       │   └── conf
│       │       └── log4j.properties
│       └── docker-compose.yaml
├── LICENSE
├── .gitignore
├── README.md
└── notebooks
    └── test_livy.ipynb
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/googled57bdb220576a44a.html:
--------------------------------------------------------------------------------
1 | google-site-verification: googled57bdb220576a44a.html
--------------------------------------------------------------------------------
/code/apps/livy-server/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/bitnami/spark:2
2 |
3 | USER root
4 | ENV LIVY_HOME /opt/bitnami/livy
5 | WORKDIR /opt/bitnami/
6 |
7 | RUN install_packages unzip \
8 | && curl "https://downloads.apache.org/incubator/livy/0.7.1-incubating/apache-livy-0.7.1-incubating-bin.zip" -O \
9 | && unzip "apache-livy-0.7.1-incubating-bin" \
10 | && rm -rf "apache-livy-0.7.1-incubating-bin.zip" \
11 | && mv "apache-livy-0.7.1-incubating-bin" $LIVY_HOME \
12 | && mkdir $LIVY_HOME/logs \
13 | && chown -R 1001:1001 $LIVY_HOME
14 |
15 | USER 1001
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Ramses Alexander Coraspe Valdez
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/code/apps/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Default system properties included when running spark-submit.
19 | # This is useful for setting default environmental settings.
20 |
21 | # Example:
22 | # spark.master spark://master:7077
23 | # spark.eventLog.enabled true
24 | # spark.eventLog.dir hdfs://namenode:8021/directory
25 | # spark.serializer org.apache.spark.serializer.KryoSerializer
26 | # spark.driver.memory 2g
27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
28 |
29 |
30 | #spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar
31 | #spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar
32 |
33 |
--------------------------------------------------------------------------------
/code/apps/livy-server/conf/livy-env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one or more
4 | # contributor license agreements. See the NOTICE file distributed with
5 | # this work for additional information regarding copyright ownership.
6 | # The ASF licenses this file to You under the Apache License, Version 2.0
7 | # (the "License"); you may not use this file except in compliance with
8 | # the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | # LIVY ENVIRONMENT VARIABLES
19 | #
20 | # - JAVA_HOME Java runtime to use. By default use "java" from PATH.
21 | # - HADOOP_CONF_DIR Directory containing the Hadoop / YARN configuration to use.
22 | # - SPARK_HOME Spark which you would like to use in Livy.
23 | # - SPARK_CONF_DIR Optional directory where the Spark configuration lives.
24 | # (Default: $SPARK_HOME/conf)
25 | # - LIVY_LOG_DIR Where log files are stored. (Default: ${LIVY_HOME}/logs)
26 | # - LIVY_PID_DIR Where the pid file is stored. (Default: /tmp)
27 | # - LIVY_SERVER_JAVA_OPTS Java Opts for running livy server (You can set jvm related setting here,
28 | # like jvm memory/gc algorithm and etc.)
29 | # - LIVY_IDENT_STRING A name that identifies the Livy server instance, used to generate log file
30 | # names. (Default: name of the user starting Livy).
31 | # - LIVY_MAX_LOG_FILES Max number of log file to keep in the log directory. (Default: 5.)
32 | # - LIVY_NICENESS Niceness of the Livy server process when running in the background. (Default: 0.)
33 |
34 | export SPARK_HOME=/opt/bitnami/spark/
35 |
--------------------------------------------------------------------------------
/code/apps/spark/conf/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=DEBUG, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
24 |
25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
26 | # log level for this class is used to overwrite the root logger's log level, so that
27 | # the user can have different defaults for the shell and regular Spark apps.
28 | log4j.logger.org.apache.spark.repl.Main=DEBUG
29 |
30 | # Settings to quiet third party logs that are too verbose
31 | log4j.logger.org.spark_project.jetty=WARN
32 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=DEBUG
34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=DEBUG
35 | log4j.logger.org.apache.parquet=ERROR
36 | log4j.logger.parquet=ERROR
37 |
38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/code/apps/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 | spark-master:
5 | image: docker.io/bitnami/spark:2
6 | environment:
7 | - SPARK_MODE=master
8 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
9 | - SPARK_RPC_ENCRYPTION_ENABLED=no
10 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
11 | - SPARK_SSL_ENABLED=no
12 | user: root
13 | volumes:
14 | - type: bind
15 | source: ./spark/conf/log4j.properties
16 | target: /opt/bitnami/spark/conf/log4j.properties
17 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
18 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z
19 | ports:
20 | - '8080:8080'
21 | - '7077:7077'
22 | networks:
23 | - net
24 |
25 | spark-worker1:
26 | image: docker.io/bitnami/spark:2
27 | environment:
28 | - SPARK_MODE=worker
29 | - SPARK_MASTER_URL=spark://spark-master:7077
30 | - SPARK_WORKER_MEMORY=1G
31 | - SPARK_WORKER_CORES=1
32 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
33 | - SPARK_RPC_ENCRYPTION_ENABLED=no
34 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
35 | - SPARK_SSL_ENABLED=no
36 | user: root
37 | volumes:
38 | - type: bind
39 | source: ./spark/conf/log4j.properties
40 | target: /opt/bitnami/spark/conf/log4j.properties
41 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
42 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z
43 | ports:
44 | - '8081:8081'
45 | networks:
46 | - net
47 | depends_on:
48 | - spark-master
49 |
50 | spark-worker2:
51 | image: docker.io/bitnami/spark:2
52 | environment:
53 | - SPARK_MODE=worker
54 | - SPARK_MASTER_URL=spark://spark-master:7077
55 | - SPARK_WORKER_MEMORY=1G
56 | - SPARK_WORKER_CORES=1
57 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
58 | - SPARK_RPC_ENCRYPTION_ENABLED=no
59 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
60 | - SPARK_SSL_ENABLED=no
61 | user: root
62 | volumes:
63 | - type: bind
64 | source: ./spark/conf/log4j.properties
65 | target: /opt/bitnami/spark/conf/log4j.properties
66 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
67 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z
68 | ports:
69 | - '8082:8082'
70 | networks:
71 | - net
72 | depends_on:
73 | - spark-master
74 |
75 | livy-server:
76 | container_name: livy_server
77 | build: ./livy-server/
78 | command: ["sh", "-c", "/opt/bitnami/livy/bin/livy-server"]
79 | user: root
80 | volumes:
81 | - type: bind
82 | source: ./livy-server/conf/
83 | target: /opt/bitnami/livy/conf/
84 | - type: bind
85 | source: ./livy-server/target/
86 | target: /target/
87 | - type: bind
88 | source: ./livy-server/data/
89 | target: /data/
90 | ports:
91 | - '8998:8998'
92 | networks:
93 | - net
94 | depends_on:
95 | - spark-master
96 | - spark-worker1
97 | - spark-worker2
98 |
99 | db:
100 | container_name: pg_container
101 | image: postgres
102 | restart: always
103 | environment:
104 | POSTGRES_USER: "postgres"
105 | POSTGRES_PASSWORD: "12345"
106 | POSTGRES_DB: "db"
107 | POSTGRES_HOST_AUTH_METHOD: trust
108 | ports:
109 | - "5432:5432"
110 | networks:
111 | - net
112 |
113 | volumes:
114 | pg_data:
115 |
116 | networks:
117 | net:
118 | driver: bridge
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # docker-livy
2 |
3 | ### Check the article here: Building Real-time communication with Apache Spark through Apache Livy
4 |
5 |
6 | ## Dockerizing and Consuming an Apache Livy environment
7 |
8 |
9 |
12 |
13 |
14 | To reproduce a realistic example, we need three components:
15 |
16 | - Apache Spark Cluster
17 | - Apache Livy Server
18 | - Apache Livy Client
19 |
20 | As additional components, I add Docker for a faster setup and a PostgreSQL database server to simulate an external data source available to Apache Spark.
21 |
22 | ## To reproduce the experiment, follow these steps:
23 |
24 | - Install Docker Desktop on Windows; it installs Docker Compose as well, which allows you to run multi-container applications.
25 | - Install Git Bash for Windows. Once installed, open Git Bash and clone this repository; this downloads the docker-compose.yaml file and the other files needed.
26 |
27 | ```linux
28 | ramse@DESKTOP-K6K6E5A MINGW64 /c
29 | $ git clone https://github.com/Wittline/docker-livy.git
30 | ```
31 |
32 | - Once all the needed files have been downloaded from the repository, let's run everything. Using Git Bash again, go to the code/apps folder inside docker-livy and run the Docker Compose command:
33 |
34 | ```linux
35 | ramse@DESKTOP-K6K6E5A MINGW64 /c
36 | $ cd docker-livy
37 |
38 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy
39 | $ cd code
40 |
41 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy/code
42 | $ cd apps
43 |
44 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy/code/apps
45 | $ docker-compose up -d --build
46 | ```
47 |
48 | - Wait a few minutes; when the last command finishes, use the following command to check the status of all the containers.
49 |
50 | ```linux
51 | docker ps
52 | ```
53 |
54 |
55 |
58 |
59 |
60 | - If everything is up and running, we can move on to the next steps and interact with the Apache Livy interface using livyc.
61 | - Set up a local runtime for Google Colab.
62 | - Go to Git Bash and run the following command:
63 |
64 | ```linux
65 | ramse@DESKTOP-K6K6E5A MINGW64 ~
66 | jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0
67 | ```
68 |
69 | - After running the last command, copy the localhost URL; you will need it for Colab.
70 | - Go to Google Colab
71 | - Create a new Notebook
72 | - Go to -> Connect -> "Connect to local runtime" -> paste the URL copied in the previous step into Backend URL -> connect
73 | - Upload the file notebooks/test_livy.ipynb and use it in your Colab local environment.
74 | - Run each cell and check the results of each step.
75 |
76 | Note that among the actions performed in the test_livy.ipynb notebook, the PostgreSQL database is populated with data from an external .csv file, so that Apache Spark has data to interact with.
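
For reference, the heart of that load in the notebook is a plain psycopg2 COPY from the CSV into the `staging` table. A minimal sketch, assuming the default credentials from docker-compose.yaml and a sample CSV of your own (the file is not part of this repository):

```python
import psycopg2

# Stream a local CSV into the staging table created by the notebook.
# Connection values mirror docker-compose.yaml; the CSV path is a placeholder.
conn = psycopg2.connect(host="localhost", port="5432", database="db",
                        user="postgres", password="12345")
columns = ("first_name,last_name,company_name,address,city,state,zip,"
           "phone1,phone2,email,department")
with conn, conn.cursor() as cur, open("sample.csv") as f:
    next(f)  # skip the header row
    cur.copy_expert(f"COPY staging({columns}) FROM STDIN (FORMAT csv)", f)
```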
77 |
78 | - If you want to monitor all the sessions created on the Apache Livy server, go to: http://localhost:8998/ui
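
The same information is exposed through Livy's REST API, so you can also check sessions programmatically. A minimal sketch with requests, assuming the default 8998 port mapping from docker-compose.yaml:

```python
import requests

# List Livy sessions and their states (the same data the web UI renders).
resp = requests.get("http://localhost:8998/sessions")
resp.raise_for_status()
for s in resp.json().get("sessions", []):
    print(s["id"], s["state"])
```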
79 |
80 |
81 |
84 |
85 |
86 |
87 | The Python package livyc works well for submitting PySpark scripts dynamically and asynchronously to the Apache Livy server, which in turn interacts with the Apache Spark cluster transparently. Check out this project, and remember to review all the files before interacting with the Jupyter notebook.
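
Condensed, the flow in notebooks/test_livy.ipynb boils down to a handful of livyc calls. A sketch (the one-line PySpark script here is a stand-in for the JDBC read the notebook actually submits):

```python
from livyc import livyc

# Livy endpoint plus the PostgreSQL JDBC driver to resolve for the session.
data_livy = {
    "livy_server_url": "localhost",
    "port": "8998",
    "jars": ["org.postgresql:postgresql:42.3.1"]
}

# Any PySpark snippet works; `spark` is predefined inside the Livy session.
pyspark_script = "n_rows = spark.range(100).count()"

lvy = livyc.LivyC(data_livy)
session = lvy.create_session()               # start an interactive session
lvy.run_script(session, pyspark_script)      # submit the script asynchronously
print(lvy.read_variable(session, "n_rows"))  # read a variable back from the session
```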
88 |
89 | ## Contributing and Feedback
90 | Any ideas or feedback about this repository? Help me improve it.
91 |
92 | ## Authors
93 | - Created by Ramses Alexander Coraspe Valdez
94 | - Created in 2022
95 |
96 | ## License
97 | This project is licensed under the terms of the MIT License.
98 |
99 |
--------------------------------------------------------------------------------
/code/apps/livy-server/conf/livy.conf:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Use this keystore for the SSL certificate and key.
18 | # livy.keystore =
19 |
20 | # Specify the keystore password.
21 | # livy.keystore.password =
22 | #
23 | # Specify the key password.
24 | # livy.key-password =
25 |
26 | # Hadoop Credential Provider Path to get "livy.keystore.password" and "livy.key-password".
27 | # Credential Provider can be created using command as follow:
28 | # hadoop credential create "livy.keystore.password" -value "secret" -provider jceks://hdfs/path/to/livy.jceks
29 | # livy.hadoop.security.credential.provider.path =
30 |
31 | # What host address to start the server on. By default, Livy will bind to all network interfaces.
32 | livy.server.host = 0.0.0.0
33 |
34 | # What port to start the server on.
35 | livy.server.port = 8998
36 |
37 | # What base path ui should work on. By default UI is mounted on "/".
38 | # E.g.: livy.ui.basePath = /my_livy - result in mounting UI on /my_livy/
39 | # livy.ui.basePath = ""
40 |
41 | # What spark master Livy sessions should use.
42 | livy.spark.master = spark://spark-master:7077
43 |
44 | # What spark deploy mode Livy sessions should use.
45 | livy.spark.deploy-mode = client
46 |
47 | # Configure Livy server http request and response header size.
48 | # livy.server.request-header.size = 131072
49 | # livy.server.response-header.size = 131072
50 |
51 | # Enabled to check whether timeout Livy sessions should be stopped.
52 | livy.server.session.timeout-check = true
53 | #
54 | # Whether or not to skip timeout check for a busy session
55 | livy.server.session.timeout-check.skip-busy = false
56 |
57 | # Time in milliseconds on how long Livy will wait before timing out an inactive session.
58 | # Note that the inactive session could be busy running jobs.
59 | livy.server.session.timeout = 5m
60 | #
61 | # How long a finished session state should be kept in LivyServer for query.
62 | livy.server.session.state-retain.sec = 60s
63 |
64 | # If livy should impersonate the requesting users when creating a new session.
65 | # livy.impersonation.enabled = false
66 |
67 | # Logs size livy can cache for each session/batch. 0 means don't cache the logs.
68 | # livy.cache-log.size = 200
69 |
70 | # Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation
71 | # directory every time a session is started. By caching these files in HDFS, for example, startup
72 | # time of sessions on YARN can be reduced.
73 | # livy.rsc.jars =
74 |
75 | # Comma-separated list of Livy REPL jars. By default Livy will upload jars from its installation
76 | # directory every time a session is started. By caching these files in HDFS, for example, startup
77 | # time of sessions on YARN can be reduced. Please list all the repl dependencies including
78 | # Scala version-specific livy-repl jars, Livy will automatically pick the right dependencies
79 | # during session creation.
80 | # livy.repl.jars =
81 |
82 | # Location of PySpark archives. By default Livy will upload the file from SPARK_HOME, but
83 | # by caching the file in HDFS, startup time of PySpark sessions on YARN can be reduced.
84 | # livy.pyspark.archives =
85 |
86 | # Location of the SparkR package. By default Livy will upload the file from SPARK_HOME, but
87 | # by caching the file in HDFS, startup time of R sessions on YARN can be reduced.
88 | # livy.sparkr.package =
89 |
90 | # List of local directories from where files are allowed to be added to user sessions. By
91 | # default it's empty, meaning users can only reference remote URIs when starting their
92 | # sessions.
93 | livy.file.local-dir-whitelist = /target/
94 |
95 | # Whether to enable csrf protection, by default it is false. If it is enabled, client should add
96 | # http-header "X-Requested-By" in request if the http method is POST/DELETE/PUT/PATCH.
97 | # livy.server.csrf-protection.enabled =
98 |
99 | # Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected
100 | # on user request and then livy server classpath automatically.
101 | # livy.repl.enable-hive-context =
102 |
103 | # Recovery mode of Livy. Possible values:
104 | # off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions.
105 | # recovery: Livy persists session info to the state store. When Livy restarts, it recovers
106 | # previous sessions from the state store.
107 | # Must set livy.server.recovery.state-store and livy.server.recovery.state-store.url to
108 | # configure the state store.
109 | # livy.server.recovery.mode = off
110 |
111 | # Where Livy should store state to for recovery. Possible values:
112 | # <empty>: Default. State store disabled.
113 | # filesystem: Store state on a file system.
114 | # zookeeper: Store state in a Zookeeper instance.
115 | # livy.server.recovery.state-store =
116 |
117 | # For filesystem state store, the path of the state store directory. Please don't use a filesystem
118 | # that doesn't support atomic rename (e.g. S3). e.g. file:///tmp/livy or hdfs:///.
119 | # For zookeeper, the address to the Zookeeper servers. e.g. host1:port1,host2:port2
120 | # livy.server.recovery.state-store.url =
121 |
122 | # If Livy can't find the yarn app within this time, consider it lost.
123 | # livy.server.yarn.app-lookup-timeout = 120s
124 | # When the cluster is busy, we may fail to launch yarn app in app-lookup-timeout, then it would
125 | # cause session leakage, so we need to check session leakage.
126 | # How long to check livy session leakage
127 | # livy.server.yarn.app-leakage.check-timeout = 600s
128 | # how often to check livy session leakage
129 | # livy.server.yarn.app-leakage.check-interval = 60s
130 |
131 | # How often Livy polls YARN to refresh YARN app state.
132 | # livy.server.yarn.poll-interval = 5s
133 | #
134 | # Days to keep Livy server request logs.
135 | # livy.server.request-log-retain.days = 5
136 |
137 | # If the Livy Web UI should be included in the Livy Server. Enabled by default.
138 | # livy.ui.enabled = true
139 |
140 | # Whether to enable Livy server access control, if it is true then all the income requests will
141 | # be checked if the requested user has permission.
142 | # livy.server.access-control.enabled = false
143 |
144 | # Allowed users to access Livy, by default any user is allowed to access Livy. If user want to
145 | # limit who could access Livy, user should list all the permitted users with comma separated.
146 | # livy.server.access-control.allowed-users = *
147 |
148 | # A list of users with comma separated has the permission to change other user's submitted
149 | # session, like submitting statements, deleting session.
150 | # livy.server.access-control.modify-users =
151 |
152 | # A list of users with comma separated has the permission to view other user's infomation, like
153 | # submitted session state, statement results.
154 | # livy.server.access-control.view-users =
155 | #
156 | # Authentication support for Livy server
157 | # Livy has a built-in SPnego authentication support for HTTP requests with below configurations.
158 | # livy.server.auth.type = kerberos
159 | # livy.server.auth.kerberos.principal =
160 | # livy.server.auth.kerberos.keytab =
161 | # livy.server.auth.kerberos.name-rules = DEFAULT
162 | #
163 | # If user wants to use custom authentication filter, configurations are:
164 | # livy.server.auth.type = <custom-auth-filter>
165 | # livy.server.auth.<custom-auth-filter>.class = <class of custom auth filter>
166 | # livy.server.auth.<custom-auth-filter>.param.<foo> = <bar>
167 | # livy.server.auth.<custom-auth-filter>.param.<foo> = <bar>
--------------------------------------------------------------------------------
/notebooks/test_livy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "-hHsmlGN0ETj"
7 | },
8 | "source": [
9 | "# **Installing livyc** 🔧"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 83,
15 | "metadata": {
16 | "colab": {
17 | "base_uri": "https://localhost:8080/"
18 | },
19 | "id": "z9HFmSWHybp8",
20 | "outputId": "5a9cd433-f9ef-4519-cf8e-e63c1db3655d"
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "Collecting livyc==0.0.13Note: you may need to restart the kernel to use updated packages.\n",
28 | " Downloading livyc-0.0.13-py3-none-any.whl (4.9 kB)\n",
29 | "Installing collected packages: livyc\n",
30 | " Attempting uninstall: livyc\n",
31 | " Found existing installation: livyc 0.0.12\n",
32 | " Uninstalling livyc-0.0.12:\n",
33 | " Successfully uninstalled livyc-0.0.12\n"
34 | ]
35 | },
36 | {
37 | "name": "stderr",
38 | "output_type": "stream",
39 | "text": [
40 | "WARNING: You are using pip version 21.3.1; however, version 22.1.2 is available.\n",
41 | "You should consider upgrading via the 'c:\\python\\python37\\python.exe -m pip install --upgrade pip' command.\n"
42 | ]
43 | },
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "\n",
49 | "Successfully installed livyc-0.0.13\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "pip install livyc==0.0.13"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 84,
60 | "metadata": {
61 | "colab": {
62 | "base_uri": "https://localhost:8080/"
63 | },
64 | "id": "YB7JoZCj9luc",
65 | "outputId": "5fba4b3b-d468-42ec-ec08-a7c3597dac99"
66 | },
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "Requirement already satisfied: psycopg2 in c:\\python\\python37\\lib\\site-packages (2.8.6)\n",
73 | "Note: you may need to restart the kernel to use updated packages.\n"
74 | ]
75 | },
76 | {
77 | "name": "stderr",
78 | "output_type": "stream",
79 | "text": [
80 | "WARNING: You are using pip version 21.3.1; however, version 22.1.2 is available.\n",
81 | "You should consider upgrading via the 'c:\\python\\python37\\python.exe -m pip install --upgrade pip' command.\n"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "pip install psycopg2"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "id": "TsGFrrZG0Xgj"
93 | },
94 | "source": [
95 | "# **Importing livyc library** ⚡"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 86,
101 | "metadata": {
102 | "id": "svZULhLhy7yo"
103 | },
104 | "outputs": [],
105 | "source": [
106 | "from livyc import livyc"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {
112 | "id": "nUdzhm0D1ABb"
113 | },
114 | "source": [
115 | "# **Setting livy configuration** ✍"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 88,
121 | "metadata": {
122 | "id": "utAaRVFq0b8q"
123 | },
124 | "outputs": [],
125 | "source": [
126 | "data_livy = {\n",
127 | " \"livy_server_url\": \"localhost\",\n",
128 | " \"port\": \"8998\",\n",
129 | " \"jars\": [\"org.postgresql:postgresql:42.3.1\"]\n",
130 | "}"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {
136 | "id": "Ru1gpwWu13hJ"
137 | },
138 | "source": [
139 | "# **Populate PostgreSQL DB with data** 🗄"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 136,
145 | "metadata": {
146 | "colab": {
147 | "base_uri": "https://localhost:8080/"
148 | },
149 | "id": "86xKmv_43nd_",
150 | "outputId": "eb127be2-3453-4fe6-9211-3ab4383217cf"
151 | },
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "Connecting to the PostgreSQL database\n",
158 | "PostgreSQL database version:\n",
159 | "('PostgreSQL 14.3 (Debian 14.3-1.pgdg110+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit',)\n",
160 | "Dropping tables\n",
161 | "Tables dropped\n",
162 | "Creating created\n",
163 | "Tables created\n",
164 | "Copying data from .csv to staging zone\n",
165 | "Staging ready\n",
166 | "Table staging has 106 records\n",
167 | "Database connection closed\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "# DROP TABLES\n",
173 | "staging_table_drop = \"DROP TABLE IF EXISTS staging\"\n",
174 | "\n",
175 | "# CREATE TABLES\n",
176 | "\n",
177 | "staging_table_create = (\"\"\"\n",
178 | "CREATE TABLE IF NOT EXISTS staging(\n",
179 | " id serial PRIMARY KEY NOT NULL,\n",
180 | " first_name varchar,\n",
181 | " last_name varchar,\n",
182 | " company_name varchar,\n",
183 | " address varchar,\n",
184 | " city varchar,\n",
185 | " state varchar,\n",
186 | " zip varchar,\n",
187 | " phone1 varchar,\n",
188 | " phone2 varchar,\n",
189 | " email varchar,\n",
190 | " department varchar\n",
191 | ");\n",
192 | "\"\"\")\n",
193 | "\n",
194 | "create_table_queries = [staging_table_create]\n",
195 | "drop_table_queries = [staging_table_drop]\n",
196 | "\n",
197 | "\n",
198 | "import psycopg2\n",
199 | "import pandas as pd\n",
200 | "import os\n",
201 | "\n",
202 | "\n",
203 | "def create_connection(params):\n",
204 | " \"\"\"\n",
205 | " create a new connection with the postgreSQL \n",
206 | " database and return the cur and conn object\n",
207 | " :param params: connection string \n",
208 | " \"\"\"\n",
209 | " conn = None\n",
210 | "\n",
211 | " try:\n",
212 | " print('Connecting to the PostgreSQL database')\n",
213 | " conn = psycopg2.connect(**params)\n",
214 | " conn.set_session(autocommit=True)\n",
215 | "\n",
216 | " cur = conn.cursor()\n",
217 | "\n",
218 | " print('PostgreSQL database version:')\n",
219 | " cur.execute('SELECT version()')\n",
220 | "\n",
221 | " db_version = cur.fetchone()\n",
222 | " print(db_version) \n",
223 | " return cur, conn\n",
224 | " except (Exception, psycopg2.DatabaseError) as error:\n",
225 | " print(error)\n",
226 | "\n",
227 | "\n",
228 | "def close_connection(cur, conn):\n",
229 | " \"\"\"\n",
230 | " close the connection with the postgreSQL database \n",
231 | " :param cur: cursor\n",
232 | " :param conn: connection object\n",
233 | " \"\"\"\n",
234 | " try:\n",
235 | " cur.close()\n",
236 | " if conn is not None:\n",
237 | " conn.close()\n",
238 | " print('Database connection closed') \n",
239 | " except (Exception, psycopg2.DatabaseError) as error:\n",
240 | " print(error)\n",
241 | "\n",
242 | "\n",
243 | "def drop_tables(cur, conn):\n",
244 | " \"\"\"\n",
245 | " drop all the tables in the example \n",
246 | " :param cur: cursor\n",
247 | " :param conn: connection object\n",
248 | " \"\"\"\n",
249 | " print(\"Dropping tables\")\n",
250 | " for query in drop_table_queries: \n",
251 | " cur.execute(query)\n",
252 | " conn.commit()\n",
253 | " print(\"Tables dropped\")\n",
254 | "\n",
255 | "\n",
256 | "def create_tables(cur, conn):\n",
257 | " \"\"\"\n",
258 | " create all the tables in the example \n",
259 | " :param cur: cursor\n",
260 | " :param conn: connection object\n",
261 | " \"\"\"\n",
262 | " print(\"Creating created\")\n",
263 | " for query in create_table_queries:\n",
264 | " cur.execute(query)\n",
265 | " conn.commit()\n",
266 | " print(\"Tables created\")\n",
267 | "\n",
268 | "\n",
269 | "def check_data(cur, conn, tables):\n",
270 | " \"\"\"\n",
271 | " Check count of records in tables\n",
272 | " :param cur: cursor\n",
273 | " :param conn: connection object\n",
274 | " :param tables: tables to check\n",
275 | " \"\"\"\n",
276 | "\n",
277 | " count_values = {}\n",
278 | "\n",
279 | " for table in tables:\n",
280 | " query_count = \"SELECT COUNT(*) FROM {0}\".format(table)\n",
281 | "\n",
282 | " try:\n",
283 | " cur = conn.cursor()\n",
284 | " cur.execute(query_count)\n",
285 | " count_values[table] = cur.fetchone()[0] \n",
286 | " except (Exception, psycopg2.DatabaseError) as error:\n",
287 | " print(\"Error: %s\" % error)\n",
288 | " raise\n",
289 | "\n",
290 | " return count_values\n",
291 | "\n",
292 | "def set_staging(cur, conn, staging_file, columns):\n",
293 | "\n",
294 | " print(\"Copying data from .csv to staging zone\")\n",
295 | "\n",
296 | " try:\n",
297 | " copy_cmd = f\"copy staging({','.join(columns)}) from stdout (format csv)\"\n",
298 | " with open(staging_file, 'r') as f:\n",
299 | " next(f)\n",
300 | " cur.copy_expert(copy_cmd, f) \n",
301 | " conn.commit()\n",
302 | " print(\"Staging ready\")\n",
303 | " except (psycopg2.Error) as e:\n",
304 | " print(e)\n",
305 | "\n",
306 | " \n",
307 | " \n",
308 | "class Pipeline:\n",
309 | "\n",
310 | " def __init__(self, params, staging_file):\n",
311 | " self.params = params\n",
312 | " self.staging_file = staging_file\n",
313 | "\n",
314 | " def run(self):\n",
315 | " tables = ['staging']\n",
316 | " columns_staging = ['first_name','last_name','company_name','address','city','state','zip','phone1','phone2','email','department']\n",
317 | " cur, conn = create_connection(self.params)\n",
318 | " drop_tables(cur, conn)\n",
319 | " create_tables(cur, conn)\n",
320 | " set_staging(cur, conn, self.staging_file, columns_staging)\n",
321 | " count_tables = check_data(cur, conn, tables)\n",
322 | " for k, v in count_tables.items():\n",
323 | " print(\"Table {0} has {1} records\".format(k, v))\n",
324 | " close_connection(cur, conn)\n",
325 | "\n",
326 | "\n",
327 | "params = {\"host\": \"localhost\", \"port\":\"5432\", \"database\": \"db\", \"user\": \"postgres\", \"password\": \"pg12345\"}\n",
328 | "\n",
329 | "\n",
330 | "staging_file = \"./Documents/sample.csv\"\n",
331 | "pipeline = Pipeline(params, staging_file)\n",
332 | "pipeline.run()\n"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {
338 | "id": "eHoROghy3m9o"
339 | },
340 | "source": [
341 | "# **Let's try launch a pySpark script to Apache Livy Server** 🤓"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 137,
347 | "metadata": {
348 | "colab": {
349 | "base_uri": "https://localhost:8080/"
350 | },
351 | "id": "GlujsSQK4ZIo",
352 | "outputId": "a9d562e8-27c6-490b-f8ec-11ab0109aa1e"
353 | },
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "'\\n\\n from pyspark.sql.functions import udf, col, explode\\n from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType\\n from pyspark.sql import Row\\n from pyspark.sql import SparkSession\\n\\n\\n df = spark.read.format(\"jdbc\") .option(\"url\", \"jdbc:postgresql://pg_container:5432/db\") .option(\"driver\", \"org.postgresql.Driver\") .option(\"dbtable\", \"staging\") .option(\"user\", \"postgres\") .option(\"password\", \"pg12345\") .load()\\n \\n n_rows = df.count()\\n\\n spark.stop()\\n'"
359 | ]
360 | },
361 | "execution_count": 137,
362 | "metadata": {},
363 | "output_type": "execute_result"
364 | }
365 | ],
366 | "source": [
367 | "\n",
368 | "params[\"table\"] = \"staging\"\n",
369 | "\n",
370 | "pyspark_script = \"\"\"\n",
371 | "\n",
372 | " from pyspark.sql.functions import udf, col, explode\n",
373 | " from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType\n",
374 | " from pyspark.sql import Row\n",
375 | " from pyspark.sql import SparkSession\n",
376 | "\n",
377 | "\n",
378 | " df = spark.read.format(\"jdbc\") \\\n",
379 | " .option(\"url\", \"jdbc:postgresql://pg_container:{port}/{database}\") \\\n",
380 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n",
381 | " .option(\"dbtable\", \"{table}\") \\\n",
382 | " .option(\"user\", \"{user}\") \\\n",
383 | " .option(\"password\", \"{password}\") \\\n",
384 | " .load()\n",
385 | " \n",
386 | " n_rows = df.count()\n",
387 | "\n",
388 | " spark.stop()\n",
389 | "\"\"\"\n",
390 | "\n",
391 | "pyspark_script = pyspark_script.format(**params)\n",
392 | "pyspark_script\n"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 157,
398 | "metadata": {
399 | "id": "MEcMsmKh12FK"
400 | },
401 | "outputs": [],
402 | "source": [
403 | "lvy = livyc.LivyC(data_livy)"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 161,
409 | "metadata": {
410 | "id": "65aOoDKv2J0S"
411 | },
412 | "outputs": [],
413 | "source": [
414 | "session = lvy.create_session()"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 162,
420 | "metadata": {
421 | "colab": {
422 | "base_uri": "https://localhost:8080/"
423 | },
424 | "id": "Uv8PYIBPykd7",
425 | "outputId": "5292aff7-3de1-42e9-cd8b-cc6687f79c9e"
426 | },
427 | "outputs": [
428 | {
429 | "data": {
430 | "text/plain": [
431 | "''"
432 | ]
433 | },
434 | "execution_count": 162,
435 | "metadata": {},
436 | "output_type": "execute_result"
437 | }
438 | ],
439 | "source": [
440 | "lvy.run_script(session, pyspark_script)"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 163,
446 | "metadata": {
447 | "colab": {
448 | "base_uri": "https://localhost:8080/"
449 | },
450 | "id": "g8HL2K4fypX7",
451 | "outputId": "c2626dc0-5eec-4342-ac9e-b0c51c5a6a7b"
452 | },
453 | "outputs": [
454 | {
455 | "data": {
456 | "text/plain": [
457 | "106"
458 | ]
459 | },
460 | "execution_count": 163,
461 | "metadata": {},
462 | "output_type": "execute_result"
463 | }
464 | ],
465 | "source": [
466 | "lvy.read_variable(session, \"n_rows\")"
467 | ]
468 | }
469 | ],
470 | "metadata": {
471 | "colab": {
472 | "collapsed_sections": [],
473 | "name": "test-livy.ipynb",
474 | "provenance": []
475 | },
476 | "kernelspec": {
477 | "display_name": "Python 3",
478 | "language": "python",
479 | "name": "python3"
480 | },
481 | "language_info": {
482 | "codemirror_mode": {
483 | "name": "ipython",
484 | "version": 3
485 | },
486 | "file_extension": ".py",
487 | "mimetype": "text/x-python",
488 | "name": "python",
489 | "nbconvert_exporter": "python",
490 | "pygments_lexer": "ipython3",
491 | "version": "3.7.9"
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 1
496 | }
497 |
--------------------------------------------------------------------------------