├── _config.yml ├── googled57bdb220576a44a.html ├── code └── apps │ ├── livy-server │ ├── Dockerfile │ └── conf │ │ ├── livy-env.sh │ │ └── livy.conf │ ├── spark │ ├── spark-defaults.conf │ └── conf │ │ └── log4j.properties │ └── docker-compose.yaml ├── LICENSE ├── .gitignore ├── README.md └── notebooks └── test_livy.ipynb /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /googled57bdb220576a44a.html: -------------------------------------------------------------------------------- 1 | google-site-verification: googled57bdb220576a44a.html -------------------------------------------------------------------------------- /code/apps/livy-server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:2 2 | 3 | USER root 4 | ENV LIVY_HOME /opt/bitnami/livy 5 | WORKDIR /opt/bitnami/ 6 | 7 | RUN install_packages unzip \ 8 | && curl "https://downloads.apache.org/incubator/livy/0.7.1-incubating/apache-livy-0.7.1-incubating-bin.zip" -O \ 9 | && unzip "apache-livy-0.7.1-incubating-bin" \ 10 | && rm -rf "apache-livy-0.7.1-incubating-bin.zip" \ 11 | && mv "apache-livy-0.7.1-incubating-bin" $LIVY_HOME \ 12 | && mkdir $LIVY_HOME/logs \ 13 | && chown -R 1001:1001 $LIVY_HOME 14 | 15 | USER 1001 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ramses Alexander Coraspe Valdez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/apps/spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 2g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | 30 | #spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar 31 | #spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar 32 | 33 | -------------------------------------------------------------------------------- /code/apps/livy-server/conf/livy-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # LIVY ENVIRONMENT VARIABLES 19 | # 20 | # - JAVA_HOME Java runtime to use. By default use "java" from PATH. 21 | # - HADOOP_CONF_DIR Directory containing the Hadoop / YARN configuration to use. 22 | # - SPARK_HOME Spark which you would like to use in Livy. 23 | # - SPARK_CONF_DIR Optional directory where the Spark configuration lives. 24 | # (Default: $SPARK_HOME/conf) 25 | # - LIVY_LOG_DIR Where log files are stored. (Default: ${LIVY_HOME}/logs) 26 | # - LIVY_PID_DIR Where the pid file is stored. (Default: /tmp) 27 | # - LIVY_SERVER_JAVA_OPTS Java Opts for running livy server (You can set jvm related setting here, 28 | # like jvm memory/gc algorithm and etc.) 29 | # - LIVY_IDENT_STRING A name that identifies the Livy server instance, used to generate log file 30 | # names. (Default: name of the user starting Livy). 31 | # - LIVY_MAX_LOG_FILES Max number of log file to keep in the log directory. (Default: 5.) 32 | # - LIVY_NICENESS Niceness of the Livy server process when running in the background. (Default: 0.) 
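# Point Livy at the Spark installation shipped in the bitnami/spark base image
# used by code/apps/livy-server/Dockerfile; Livy uses this Spark to launch sessions.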
33 | 34 | export SPARK_HOME=/opt/bitnami/spark/ 35 | -------------------------------------------------------------------------------- /code/apps/spark/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=DEBUG, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=DEBUG 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark_project.jetty=WARN 32 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=DEBUG 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=DEBUG 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /code/apps/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | spark-master: 5 | image: docker.io/bitnami/spark:2 6 | environment: 7 | - SPARK_MODE=master 8 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 9 | - SPARK_RPC_ENCRYPTION_ENABLED=no 10 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 11 | - SPARK_SSL_ENABLED=no 12 | user: root 13 | volumes: 14 | - type: bind 15 | source: ./spark/conf/log4j.properties 16 | target: /opt/bitnami/spark/conf/log4j.properties 17 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 18 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z 19 | ports: 20 | - '8080:8080' 21 | - '7077:7077' 22 | networks: 23 | - net 24 | 25 | spark-worker1: 26 | image: docker.io/bitnami/spark:2 27 | environment: 28 | - SPARK_MODE=worker 29 | - SPARK_MASTER_URL=spark://spark-master:7077 30 | - SPARK_WORKER_MEMORY=1G 31 | - SPARK_WORKER_CORES=1 32 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 33 | - SPARK_RPC_ENCRYPTION_ENABLED=no 34 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 35 | - SPARK_SSL_ENABLED=no 36 | user: root 37 | volumes: 38 | - type: bind 39 | source: ./spark/conf/log4j.properties 40 | target: /opt/bitnami/spark/conf/log4j.properties 41 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 42 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z 43 | ports: 44 | - '8081:8081' 45 | networks: 46 | - net 47 | depends_on: 48 | - spark-master 49 | 50 | spark-worker2: 51 | image: docker.io/bitnami/spark:2 52 | environment: 53 | - SPARK_MODE=worker 54 | - SPARK_MASTER_URL=spark://spark-master:7077 55 | 
- SPARK_WORKER_MEMORY=1G 56 | - SPARK_WORKER_CORES=1 57 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 58 | - SPARK_RPC_ENCRYPTION_ENABLED=no 59 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 60 | - SPARK_SSL_ENABLED=no 61 | user: root 62 | volumes: 63 | - type: bind 64 | source: ./spark/conf/log4j.properties 65 | target: /opt/bitnami/spark/conf/log4j.properties 66 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 67 | - ./spark/jars_dir:/opt/bitnami/spark/ivy:z 68 | ports: 69 | - '8082:8082' 70 | networks: 71 | - net 72 | depends_on: 73 | - spark-master 74 | 75 | livy-server: 76 | container_name: livy_server 77 | build: ./livy-server/ 78 | command: ["sh", "-c", "/opt/bitnami/livy/bin/livy-server"] 79 | user: root 80 | volumes: 81 | - type: bind 82 | source: ./livy-server/conf/ 83 | target: /opt/bitnami/livy/conf/ 84 | - type: bind 85 | source: ./livy-server/target/ 86 | target: /target/ 87 | - type: bind 88 | source: ./livy-server/data/ 89 | target: /data/ 90 | ports: 91 | - '8998:8998' 92 | networks: 93 | - net 94 | depends_on: 95 | - spark-master 96 | - spark-worker1 97 | - spark-worker2 98 | 99 | db: 100 | container_name: pg_container 101 | image: postgres 102 | restart: always 103 | environment: 104 | POSTGRES_USER: "postgres" 105 | POSTGRES_PASSWORD: "12345" 106 | POSTGRES_DB: "db" 107 | POSTGRES_HOST_AUTH_METHOD: trust 108 | ports: 109 | - "5432:5432" 110 | networks: 111 | - net 112 | 113 | volumes: 114 | pg_data: 115 | 116 | networks: 117 | net: 118 | driver: bridge -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # docker-livy 2 | 3 | ### Check the article here: Building Real-time communication with Apache Spark through Apache Livy 4 | 5 | 6 | ## Dockerizing and Consuming an Apache Livy environment 7 | 8 |


13 | 14 | As you can see, in order to reproduce a real example we need three components: 15 | 16 | - Apache Spark Cluster 17 | - Apache Livy Server 18 | - Apache Livy Client 19 | 20 | As additional components, I would add Docker for a faster implementation and a PostgreSQL database server to simulate an external data source available to Apache Spark. 21 | 22 | ## In order to reproduce the experiment, follow these steps: 23 | 24 | - Install Docker Desktop on Windows; it installs Docker Compose as well, which lets you run multi-container applications. 25 | - Install Git Bash for Windows. Once installed, open Git Bash and clone this repository; this downloads the docker-compose.yaml file and the other files needed. 26 | 27 | ```linux 28 | ramse@DESKTOP-K6K6E5A MINGW64 /c 29 | $ git clone https://github.com/Wittline/docker-livy.git 30 | ``` 31 | 32 | - Once all the needed files have been downloaded from the repository, let's run everything. Using Git Bash again, go to the docker-livy folder and run the Docker Compose command: 33 | 34 | ```linux 35 | ramse@DESKTOP-K6K6E5A MINGW64 /c 36 | $ cd docker-livy 37 | 38 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy 39 | $ cd code 40 | 41 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy/code 42 | $ cd apps 43 | 44 | ramse@DESKTOP-K6K6E5A MINGW64 /c/docker-livy/code/apps 45 | $ docker-compose up -d --build 46 | ``` 47 | 48 | - Wait a few minutes; when the last command finishes, use the following command to check the status of all the containers. 49 | 50 | ```linux 51 | docker ps 52 | ``` 53 | 54 |
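- (Optional) You can also confirm that the Livy REST API is reachable before moving on. Livy listens on port 8998 (see code/apps/livy-server/conf/livy.conf and the port mapping in docker-compose.yaml), and a request to its sessions endpoint should return a small JSON document with an empty session list on a fresh server:

```linux
curl http://localhost:8998/sessions
```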


59 | 60 | - If everything is up and running, we can move forward with the remaining steps and interact with the Apache Livy interface using livyc (a condensed sketch of the livyc calls is shown below this list) 61 | - Setting up a local environment with Google Colab 62 | - Go to Git Bash and run the following command: 63 | 64 | ```linux 65 | ramse@DESKTOP-K6K6E5A MINGW64 ~ 66 | jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0 67 | ``` 68 | 69 | - After running the last command, copy the localhost URL; you will need it for Colab 70 | - Go to Google Colab 71 | - Create a new Notebook 72 | - Go to -> Connect -> "Connect to local runtime" -> Paste the URL copied from the last step into Backend URL -> connect 73 | - Upload the file test_livy.ipynb and use it in your Colab local environment 74 | - Run each cell and check the results at each step. 75 | 76 | Note that among the actions performed in the test_livy.ipynb file, the Postgres database is populated with data from an external .csv file, so that Apache Spark has data to interact with. 77 | 78 | - If you want to monitor all the sessions created in the Apache Livy Server, go to: http://localhost:8998/ui 79 | 80 |
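For reference, here is a minimal sketch of the livyc calls performed in notebooks/test_livy.ipynb (connection values are taken from that notebook and from code/apps/docker-compose.yaml). It assumes the docker-compose stack is already running and is not a substitute for the notebook, which also creates and populates the staging table first:

```python
# Minimal livyc sketch, mirroring notebooks/test_livy.ipynb.
# Assumes Livy answers on localhost:8998 and Postgres runs as the pg_container service.
from livyc import livyc

data_livy = {
    "livy_server_url": "localhost",
    "port": "8998",
    "jars": ["org.postgresql:postgresql:42.3.1"]  # Maven coordinates of the Postgres JDBC driver
}

# PySpark statements executed remotely inside the Livy session;
# the `spark` object is provided by Livy.
pyspark_script = """
df = spark.read.format("jdbc") \
    .option("url", "jdbc:postgresql://pg_container:5432/db") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "staging") \
    .option("user", "postgres") \
    .option("password", "pg12345") \
    .load()
n_rows = df.count()
spark.stop()
"""

lvy = livyc.LivyC(data_livy)                 # client pointed at the Livy server
session = lvy.create_session()               # new interactive session on the Spark cluster
lvy.run_script(session, pyspark_script)      # submit the script asynchronously
print(lvy.read_variable(session, "n_rows"))  # read a variable back from the session
```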


85 | 86 | 87 | The Python package livyc works well for submitting PySpark scripts dynamically and asynchronously to the Apache Livy server, which in turn interacts with the Apache Spark cluster in a transparent way. Check out this project, and remember to review all the files before interacting with the Jupyter notebook. 88 | 89 | ## Contributing and Feedback 90 | Any ideas or feedback about this repository? Help me to improve it. 91 | 92 | ## Authors 93 | - Created by Ramses Alexander Coraspe Valdez 94 | - Created in 2022 95 | 96 | ## License 97 | This project is licensed under the terms of the MIT License. 98 | 99 | -------------------------------------------------------------------------------- /code/apps/livy-server/conf/livy.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Use this keystore for the SSL certificate and key. 18 | # livy.keystore = 19 | 20 | # Specify the keystore password. 21 | # livy.keystore.password = 22 | # 23 | # Specify the key password. 24 | # livy.key-password = 25 | 26 | # Hadoop Credential Provider Path to get "livy.keystore.password" and "livy.key-password". 27 | # Credential Provider can be created using command as follow: 28 | # hadoop credential create "livy.keystore.password" -value "secret" -provider jceks://hdfs/path/to/livy.jceks 29 | # livy.hadoop.security.credential.provider.path = 30 | 31 | # What host address to start the server on. By default, Livy will bind to all network interfaces. 32 | livy.server.host = 0.0.0.0 33 | 34 | # What port to start the server on. 35 | livy.server.port = 8998 36 | 37 | # What base path ui should work on. By default UI is mounted on "/". 38 | # E.g.: livy.ui.basePath = /my_livy - result in mounting UI on /my_livy/ 39 | # livy.ui.basePath = "" 40 | 41 | # What spark master Livy sessions should use. 42 | livy.spark.master = spark://spark-master:7077 43 | 44 | # What spark deploy mode Livy sessions should use. 45 | livy.spark.deploy-mode = client 46 | 47 | # Configure Livy server http request and response header size. 48 | # livy.server.request-header.size = 131072 49 | # livy.server.response-header.size = 131072 50 | 51 | # Enabled to check whether timeout Livy sessions should be stopped. 52 | livy.server.session.timeout-check = true 53 | # 54 | # Whether or not to skip timeout check for a busy session 55 | livy.server.session.timeout-check.skip-busy = false 56 | 57 | # Time in milliseconds on how long Livy will wait before timing out an inactive session. 58 | # Note that the inactive session could be busy running jobs. 59 | livy.server.session.timeout = 5m 60 | # 61 | # How long a finished session state should be kept in LivyServer for query. 
62 | livy.server.session.state-retain.sec = 60s 63 | 64 | # If livy should impersonate the requesting users when creating a new session. 65 | # livy.impersonation.enabled = false 66 | 67 | # Logs size livy can cache for each session/batch. 0 means don't cache the logs. 68 | # livy.cache-log.size = 200 69 | 70 | # Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation 71 | # directory every time a session is started. By caching these files in HDFS, for example, startup 72 | # time of sessions on YARN can be reduced. 73 | # livy.rsc.jars = 74 | 75 | # Comma-separated list of Livy REPL jars. By default Livy will upload jars from its installation 76 | # directory every time a session is started. By caching these files in HDFS, for example, startup 77 | # time of sessions on YARN can be reduced. Please list all the repl dependencies including 78 | # Scala version-specific livy-repl jars, Livy will automatically pick the right dependencies 79 | # during session creation. 80 | # livy.repl.jars = 81 | 82 | # Location of PySpark archives. By default Livy will upload the file from SPARK_HOME, but 83 | # by caching the file in HDFS, startup time of PySpark sessions on YARN can be reduced. 84 | # livy.pyspark.archives = 85 | 86 | # Location of the SparkR package. By default Livy will upload the file from SPARK_HOME, but 87 | # by caching the file in HDFS, startup time of R sessions on YARN can be reduced. 88 | # livy.sparkr.package = 89 | 90 | # List of local directories from where files are allowed to be added to user sessions. By 91 | # default it's empty, meaning users can only reference remote URIs when starting their 92 | # sessions. 93 | livy.file.local-dir-whitelist = /target/ 94 | 95 | # Whether to enable csrf protection, by default it is false. If it is enabled, client should add 96 | # http-header "X-Requested-By" in request if the http method is POST/DELETE/PUT/PATCH. 97 | # livy.server.csrf-protection.enabled = 98 | 99 | # Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected 100 | # on user request and then livy server classpath automatically. 101 | # livy.repl.enable-hive-context = 102 | 103 | # Recovery mode of Livy. Possible values: 104 | # off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions. 105 | # recovery: Livy persists session info to the state store. When Livy restarts, it recovers 106 | # previous sessions from the state store. 107 | # Must set livy.server.recovery.state-store and livy.server.recovery.state-store.url to 108 | # configure the state store. 109 | # livy.server.recovery.mode = off 110 | 111 | # Where Livy should store state to for recovery. Possible values: 112 | # : Default. State store disabled. 113 | # filesystem: Store state on a file system. 114 | # zookeeper: Store state in a Zookeeper instance. 115 | # livy.server.recovery.state-store = 116 | 117 | # For filesystem state store, the path of the state store directory. Please don't use a filesystem 118 | # that doesn't support atomic rename (e.g. S3). e.g. file:///tmp/livy or hdfs:///. 119 | # For zookeeper, the address to the Zookeeper servers. e.g. host1:port1,host2:port2 120 | # livy.server.recovery.state-store.url = 121 | 122 | # If Livy can't find the yarn app within this time, consider it lost. 
123 | # livy.server.yarn.app-lookup-timeout = 120s 124 | # When the cluster is busy, we may fail to launch yarn app in app-lookup-timeout, then it would 125 | # cause session leakage, so we need to check session leakage. 126 | # How long to check livy session leakage 127 | # livy.server.yarn.app-leakage.check-timeout = 600s 128 | # how often to check livy session leakage 129 | # livy.server.yarn.app-leakage.check-interval = 60s 130 | 131 | # How often Livy polls YARN to refresh YARN app state. 132 | # livy.server.yarn.poll-interval = 5s 133 | # 134 | # Days to keep Livy server request logs. 135 | # livy.server.request-log-retain.days = 5 136 | 137 | # If the Livy Web UI should be included in the Livy Server. Enabled by default. 138 | # livy.ui.enabled = true 139 | 140 | # Whether to enable Livy server access control, if it is true then all the income requests will 141 | # be checked if the requested user has permission. 142 | # livy.server.access-control.enabled = false 143 | 144 | # Allowed users to access Livy, by default any user is allowed to access Livy. If user want to 145 | # limit who could access Livy, user should list all the permitted users with comma separated. 146 | # livy.server.access-control.allowed-users = * 147 | 148 | # A list of users with comma separated has the permission to change other user's submitted 149 | # session, like submitting statements, deleting session. 150 | # livy.server.access-control.modify-users = 151 | 152 | # A list of users with comma separated has the permission to view other user's infomation, like 153 | # submitted session state, statement results. 154 | # livy.server.access-control.view-users = 155 | # 156 | # Authentication support for Livy server 157 | # Livy has a built-in SPnego authentication support for HTTP requests with below configurations. 158 | # livy.server.auth.type = kerberos 159 | # livy.server.auth.kerberos.principal = 160 | # livy.server.auth.kerberos.keytab = 161 | # livy.server.auth.kerberos.name-rules = DEFAULT 162 | # 163 | # If user wants to use custom authentication filter, configurations are: 164 | # livy.server.auth.type = 165 | # livy.server.auth..class = 166 | # livy.server.auth..param. = 167 | # livy.server.auth..param. 
= -------------------------------------------------------------------------------- /notebooks/test_livy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "-hHsmlGN0ETj" 7 | }, 8 | "source": [ 9 | "# **Installing livyc** 🔧" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 83, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "z9HFmSWHybp8", 20 | "outputId": "5a9cd433-f9ef-4519-cf8e-e63c1db3655d" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Collecting livyc==0.0.13Note: you may need to restart the kernel to use updated packages.\n", 28 | " Downloading livyc-0.0.13-py3-none-any.whl (4.9 kB)\n", 29 | "Installing collected packages: livyc\n", 30 | " Attempting uninstall: livyc\n", 31 | " Found existing installation: livyc 0.0.12\n", 32 | " Uninstalling livyc-0.0.12:\n", 33 | " Successfully uninstalled livyc-0.0.12\n" 34 | ] 35 | }, 36 | { 37 | "name": "stderr", 38 | "output_type": "stream", 39 | "text": [ 40 | "WARNING: You are using pip version 21.3.1; however, version 22.1.2 is available.\n", 41 | "You should consider upgrading via the 'c:\\python\\python37\\python.exe -m pip install --upgrade pip' command.\n" 42 | ] 43 | }, 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "\n", 49 | "Successfully installed livyc-0.0.13\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "pip install livyc==0.0.13" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 84, 60 | "metadata": { 61 | "colab": { 62 | "base_uri": "https://localhost:8080/" 63 | }, 64 | "id": "YB7JoZCj9luc", 65 | "outputId": "5fba4b3b-d468-42ec-ec08-a7c3597dac99" 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Requirement already satisfied: psycopg2 in c:\\python\\python37\\lib\\site-packages (2.8.6)\n", 73 | "Note: you may need to restart the kernel to use updated packages.\n" 74 | ] 75 | }, 76 | { 77 | "name": "stderr", 78 | "output_type": "stream", 79 | "text": [ 80 | "WARNING: You are using pip version 21.3.1; however, version 22.1.2 is available.\n", 81 | "You should consider upgrading via the 'c:\\python\\python37\\python.exe -m pip install --upgrade pip' command.\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "pip install psycopg2" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "TsGFrrZG0Xgj" 93 | }, 94 | "source": [ 95 | "# **Importing livyc library** ⚡" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 86, 101 | "metadata": { 102 | "id": "svZULhLhy7yo" 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "from livyc import livyc" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "nUdzhm0D1ABb" 113 | }, 114 | "source": [ 115 | "# **Setting livy configuration** ✍" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 88, 121 | "metadata": { 122 | "id": "utAaRVFq0b8q" 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "data_livy = {\n", 127 | " \"livy_server_url\": \"localhost\",\n", 128 | " \"port\": \"8998\",\n", 129 | " \"jars\": [\"org.postgresql:postgresql:42.3.1\"]\n", 130 | "}" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "Ru1gpwWu13hJ" 137 | }, 138 | "source": [ 139 | "# **Populate PostgreSQL DB with data** 
🗄" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 136, 145 | "metadata": { 146 | "colab": { 147 | "base_uri": "https://localhost:8080/" 148 | }, 149 | "id": "86xKmv_43nd_", 150 | "outputId": "eb127be2-3453-4fe6-9211-3ab4383217cf" 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "Connecting to the PostgreSQL database\n", 158 | "PostgreSQL database version:\n", 159 | "('PostgreSQL 14.3 (Debian 14.3-1.pgdg110+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit',)\n", 160 | "Dropping tables\n", 161 | "Tables dropped\n", 162 | "Creating created\n", 163 | "Tables created\n", 164 | "Copying data from .csv to staging zone\n", 165 | "Staging ready\n", 166 | "Table staging has 106 records\n", 167 | "Database connection closed\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "# DROP TABLES\n", 173 | "staging_table_drop = \"DROP TABLE IF EXISTS staging\"\n", 174 | "\n", 175 | "# CREATE TABLES\n", 176 | "\n", 177 | "staging_table_create = (\"\"\"\n", 178 | "CREATE TABLE IF NOT EXISTS staging(\n", 179 | " id serial PRIMARY KEY NOT NULL,\n", 180 | " first_name varchar,\n", 181 | " last_name varchar,\n", 182 | " company_name varchar,\n", 183 | " address varchar,\n", 184 | " city varchar,\n", 185 | " state varchar,\n", 186 | " zip varchar,\n", 187 | " phone1 varchar,\n", 188 | " phone2 varchar,\n", 189 | " email varchar,\n", 190 | " department varchar\n", 191 | ");\n", 192 | "\"\"\")\n", 193 | "\n", 194 | "create_table_queries = [staging_table_create]\n", 195 | "drop_table_queries = [staging_table_drop]\n", 196 | "\n", 197 | "\n", 198 | "import psycopg2\n", 199 | "import pandas as pd\n", 200 | "import os\n", 201 | "\n", 202 | "\n", 203 | "def create_connection(params):\n", 204 | " \"\"\"\n", 205 | " create a new connection with the postgreSQL \n", 206 | " database and return the cur and conn object\n", 207 | " :param params: connection string \n", 208 | " \"\"\"\n", 209 | " conn = None\n", 210 | "\n", 211 | " try:\n", 212 | " print('Connecting to the PostgreSQL database')\n", 213 | " conn = psycopg2.connect(**params)\n", 214 | " conn.set_session(autocommit=True)\n", 215 | "\n", 216 | " cur = conn.cursor()\n", 217 | "\n", 218 | " print('PostgreSQL database version:')\n", 219 | " cur.execute('SELECT version()')\n", 220 | "\n", 221 | " db_version = cur.fetchone()\n", 222 | " print(db_version) \n", 223 | " return cur, conn\n", 224 | " except (Exception, psycopg2.DatabaseError) as error:\n", 225 | " print(error)\n", 226 | "\n", 227 | "\n", 228 | "def close_connection(cur, conn):\n", 229 | " \"\"\"\n", 230 | " close the connection with the postgreSQL database \n", 231 | " :param cur: cursor\n", 232 | " :param conn: connection object\n", 233 | " \"\"\"\n", 234 | " try:\n", 235 | " cur.close()\n", 236 | " if conn is not None:\n", 237 | " conn.close()\n", 238 | " print('Database connection closed') \n", 239 | " except (Exception, psycopg2.DatabaseError) as error:\n", 240 | " print(error)\n", 241 | "\n", 242 | "\n", 243 | "def drop_tables(cur, conn):\n", 244 | " \"\"\"\n", 245 | " drop all the tables in the example \n", 246 | " :param cur: cursor\n", 247 | " :param conn: connection object\n", 248 | " \"\"\"\n", 249 | " print(\"Dropping tables\")\n", 250 | " for query in drop_table_queries: \n", 251 | " cur.execute(query)\n", 252 | " conn.commit()\n", 253 | " print(\"Tables dropped\")\n", 254 | "\n", 255 | "\n", 256 | "def create_tables(cur, conn):\n", 257 | " \"\"\"\n", 258 | " create all 
the tables in the example \n", 259 | " :param cur: cursor\n", 260 | " :param conn: connection object\n", 261 | " \"\"\"\n", 262 | " print(\"Creating created\")\n", 263 | " for query in create_table_queries:\n", 264 | " cur.execute(query)\n", 265 | " conn.commit()\n", 266 | " print(\"Tables created\")\n", 267 | "\n", 268 | "\n", 269 | "def check_data(cur, conn, tables):\n", 270 | " \"\"\"\n", 271 | " Check count of records in tables\n", 272 | " :param cur: cursor\n", 273 | " :param conn: connection object\n", 274 | " :param tables: tables to check\n", 275 | " \"\"\"\n", 276 | "\n", 277 | " count_values = {}\n", 278 | "\n", 279 | " for table in tables:\n", 280 | " query_count = \"SELECT COUNT(*) FROM {0}\".format(table)\n", 281 | "\n", 282 | " try:\n", 283 | " cur = conn.cursor()\n", 284 | " cur.execute(query_count)\n", 285 | " count_values[table] = cur.fetchone()[0] \n", 286 | " except (Exception, psycopg2.DatabaseError) as error:\n", 287 | " print(\"Error: %s\" % error)\n", 288 | " raise\n", 289 | "\n", 290 | " return count_values\n", 291 | "\n", 292 | "def set_staging(cur, conn, staging_file, columns):\n", 293 | "\n", 294 | " print(\"Copying data from .csv to staging zone\")\n", 295 | "\n", 296 | " try:\n", 297 | " copy_cmd = f\"copy staging({','.join(columns)}) from stdout (format csv)\"\n", 298 | " with open(staging_file, 'r') as f:\n", 299 | " next(f)\n", 300 | " cur.copy_expert(copy_cmd, f) \n", 301 | " conn.commit()\n", 302 | " print(\"Staging ready\")\n", 303 | " except (psycopg2.Error) as e:\n", 304 | " print(e)\n", 305 | "\n", 306 | " \n", 307 | " \n", 308 | "class Pipeline:\n", 309 | "\n", 310 | " def __init__(self, params, staging_file):\n", 311 | " self.params = params\n", 312 | " self.staging_file = staging_file\n", 313 | "\n", 314 | " def run(self):\n", 315 | " tables = ['staging']\n", 316 | " columns_staging = ['first_name','last_name','company_name','address','city','state','zip','phone1','phone2','email','department']\n", 317 | " cur, conn = create_connection(self.params)\n", 318 | " drop_tables(cur, conn)\n", 319 | " create_tables(cur, conn)\n", 320 | " set_staging(cur, conn, self.staging_file, columns_staging)\n", 321 | " count_tables = check_data(cur, conn, tables)\n", 322 | " for k, v in count_tables.items():\n", 323 | " print(\"Table {0} has {1} records\".format(k, v))\n", 324 | " close_connection(cur, conn)\n", 325 | "\n", 326 | "\n", 327 | "params = {\"host\": \"localhost\", \"port\":\"5432\", \"database\": \"db\", \"user\": \"postgres\", \"password\": \"pg12345\"}\n", 328 | "\n", 329 | "\n", 330 | "staging_file = \"./Documents/sample.csv\"\n", 331 | "pipeline = Pipeline(params, staging_file)\n", 332 | "pipeline.run()\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "eHoROghy3m9o" 339 | }, 340 | "source": [ 341 | "# **Let's try launch a pySpark script to Apache Livy Server** 🤓" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 137, 347 | "metadata": { 348 | "colab": { 349 | "base_uri": "https://localhost:8080/" 350 | }, 351 | "id": "GlujsSQK4ZIo", 352 | "outputId": "a9d562e8-27c6-490b-f8ec-11ab0109aa1e" 353 | }, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "'\\n\\n from pyspark.sql.functions import udf, col, explode\\n from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType\\n from pyspark.sql import Row\\n from pyspark.sql import SparkSession\\n\\n\\n df = spark.read.format(\"jdbc\") .option(\"url\", 
\"jdbc:postgresql://pg_container:5432/db\") .option(\"driver\", \"org.postgresql.Driver\") .option(\"dbtable\", \"staging\") .option(\"user\", \"postgres\") .option(\"password\", \"pg12345\") .load()\\n \\n n_rows = df.count()\\n\\n spark.stop()\\n'" 359 | ] 360 | }, 361 | "execution_count": 137, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "\n", 368 | "params[\"table\"] = \"staging\"\n", 369 | "\n", 370 | "pyspark_script = \"\"\"\n", 371 | "\n", 372 | " from pyspark.sql.functions import udf, col, explode\n", 373 | " from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType\n", 374 | " from pyspark.sql import Row\n", 375 | " from pyspark.sql import SparkSession\n", 376 | "\n", 377 | "\n", 378 | " df = spark.read.format(\"jdbc\") \\\n", 379 | " .option(\"url\", \"jdbc:postgresql://pg_container:{port}/{database}\") \\\n", 380 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n", 381 | " .option(\"dbtable\", \"{table}\") \\\n", 382 | " .option(\"user\", \"{user}\") \\\n", 383 | " .option(\"password\", \"{password}\") \\\n", 384 | " .load()\n", 385 | " \n", 386 | " n_rows = df.count()\n", 387 | "\n", 388 | " spark.stop()\n", 389 | "\"\"\"\n", 390 | "\n", 391 | "pyspark_script = pyspark_script.format(**params)\n", 392 | "pyspark_script\n" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 157, 398 | "metadata": { 399 | "id": "MEcMsmKh12FK" 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "lvy = livyc.LivyC(data_livy)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 161, 409 | "metadata": { 410 | "id": "65aOoDKv2J0S" 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "session = lvy.create_session()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 162, 420 | "metadata": { 421 | "colab": { 422 | "base_uri": "https://localhost:8080/" 423 | }, 424 | "id": "Uv8PYIBPykd7", 425 | "outputId": "5292aff7-3de1-42e9-cd8b-cc6687f79c9e" 426 | }, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "''" 432 | ] 433 | }, 434 | "execution_count": 162, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "lvy.run_script(session, pyspark_script)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 163, 446 | "metadata": { 447 | "colab": { 448 | "base_uri": "https://localhost:8080/" 449 | }, 450 | "id": "g8HL2K4fypX7", 451 | "outputId": "c2626dc0-5eec-4342-ac9e-b0c51c5a6a7b" 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "106" 458 | ] 459 | }, 460 | "execution_count": 163, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "lvy.read_variable(session, \"n_rows\")" 467 | ] 468 | } 469 | ], 470 | "metadata": { 471 | "colab": { 472 | "collapsed_sections": [], 473 | "name": "test-livy.ipynb", 474 | "provenance": [] 475 | }, 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.9" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | 
--------------------------------------------------------------------------------