├── .DS_Store ├── .gitignore ├── Dockerfile ├── Dockerfile.md ├── LICENSE ├── License.md ├── README.md ├── aerospike ├── aerospike.conf ├── aerospike.template.conf ├── bin └── build-docker-image.sh ├── binder-pre └── binder_run_first.ipynb ├── entrypoint.sh ├── features.conf ├── grep ├── jupyter_notebook_config.py ├── logo.png ├── notebooks ├── .DS_Store ├── README.md ├── java │ ├── README.md │ ├── SimplePutGetExample.ipynb │ ├── add_namespace.sh │ ├── async_ops.ipynb │ ├── batch_ops.ipynb │ ├── cdt_indexing.ipynb │ ├── doc_api.ipynb │ ├── doc_api_example_store.json │ ├── doc_api_example_tommyleejones.json │ ├── expressions.ipynb │ ├── hello_world.ipynb │ ├── java-advanced_collection_data_types.ipynb │ ├── java-intro_to_data_modeling.ipynb │ ├── java-intro_to_transactions.ipynb │ ├── java-modeling_using_lists.ipynb │ ├── java-modeling_using_maps.ipynb │ ├── java-working_with_lists.ipynb │ ├── java-working_with_maps.ipynb │ ├── look_aside_cache_mongo.ipynb │ ├── nobel_prizes.json │ ├── object_mapper.ipynb │ ├── query_splits.ipynb │ ├── query_streams.ipynb │ ├── query_udf.ipynb │ ├── space_companies.json │ ├── sql_aggregates_1.ipynb │ ├── sql_aggregates_2.ipynb │ ├── sql_select.ipynb │ ├── sql_update.ipynb │ └── tweetaspike.ipynb ├── presto │ ├── AerospikePrestoDemo.ipynb │ └── AerospikePython.ipynb ├── python │ ├── README.md │ ├── basic_operations.ipynb │ ├── hello_world.ipynb │ ├── local_cache.ipynb │ ├── look_aside_cache.ipynb │ ├── query.ipynb │ ├── readme_tips.ipynb │ ├── simple_put_get_example.ipynb │ └── transactions_rmw_pattern.ipynb ├── readme_tips.ipynb ├── spark │ ├── .gitignore │ ├── AerospikeSparkPython.ipynb │ ├── AerospikeSparkScala.ipynb │ ├── feature-store-feature-eng.ipynb │ ├── feature-store-model-serving.ipynb │ ├── feature-store-model-training.ipynb │ ├── other_notebooks │ │ ├── AerospikeSparkH2ODemo.ipynb │ │ ├── AerospikeSparkMLDemo.ipynb │ │ ├── AerospikeSparkMLLinearRegression.ipynb │ │ ├── AerospikeSparkPythonJSONSQL.ipynb │ │ ├── AerospikeSparkPythonParquet.ipynb │ │ ├── AerospikeSparkSQLSyntaxDemo.ipynb │ │ └── nested_data.json │ ├── resources │ │ ├── creditcard_small.csv │ │ ├── fs-arch.jpg │ │ ├── fs-model-ws.py │ │ ├── fs_model_rf.tar.gz │ │ ├── install.txt │ │ ├── nested_data.json │ │ └── pushdown-expressions.ipynb │ └── simple-load-store.ipynb └── udf │ ├── aggregate_fns.lua │ └── update_example.lua ├── update.sh └── update_readme.sh /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-examples/interactive-notebooks/0e582d4305974f6cadd390e2086e8550f1b3ecf7/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.swp 9 | .DS_Store 10 | .cache 11 | #Spark related stuff 12 | spark-warehouse/ 13 | .ipynb_checkpoints/ 14 | #Idea realted stuff 15 | .idea/ 16 | .idea_modules/ 17 | target 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Aerospike Server Dockerfile 3 | # 4 | # http://github.com/aerospike/aerospike-server.docker 5 | # 6 | # This docker file is compatible with Aerospike Community Edition. It provides Java and Python environments and access to the Aerospike DB. 
7 | FROM jupyter/base-notebook:python-3.8.6 8 | 9 | USER root 10 | 11 | ENV AEROSPIKE_VERSION 7.2.0.4 12 | ENV AEROSPIKE_SHA256 f742ad19d6a75901134e8a6a9a8c9bba9830b019c06053145ad56d8d1b189af8 13 | ENV LOGFILE /var/log/aerospike/aerospike.log 14 | ARG AEROSPIKE_TOOLS_VERSION=11.1.1 15 | 16 | ARG NB_USER=jovyan 17 | ARG NB_UID=1000 18 | ENV USER ${NB_USER} 19 | ENV NB_UID ${NB_UID} 20 | ENV HOME /home/${NB_USER} 21 | USER root 22 | RUN chown -R ${NB_UID} ${HOME} 23 | 24 | # spark notebook 25 | RUN mkdir /opt/spark-nb; cd /opt/spark-nb\ 26 | && wget -qO- "https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3-scala2.13.tgz" | tar -xvz \ 27 | && ln -s spark-3.4.0-bin-hadoop3-scala2.13 spark-dir-link \ 28 | && pip install findspark numpy pandas matplotlib sklearn \ 29 | && wget "https://download.aerospike.com/artifacts/aerospike-spark/4.5.1/aerospike-spark-4.5.1-spark3.4-scala2.13-clientunshaded.jar" \ 30 | && ln -s aerospike-spark-4.5.1-spark3.4-scala2.13-clientunshaded.jar aerospike-jar-link 31 | 32 | # install jupyter notebook extensions, and enable these extensions by default: table of content, collapsible headers, and scratchpad 33 | RUN pip install jupyter_contrib_nbextensions\ 34 | && jupyter contrib nbextension install --sys-prefix\ 35 | && jupyter nbextension enable toc2/main --sys-prefix\ 36 | && jupyter nbextension enable collapsible_headings/main --sys-prefix\ 37 | && jupyter nbextension enable scratchpad/main --sys-prefix 38 | 39 | RUN mkdir /var/run/aerospike\ 40 | && apt-get update -y \ 41 | && apt-get install software-properties-common dirmngr gpg-agent -y --no-install-recommends\ 42 | && apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0xB1998361219BD9C9 \ 43 | && apt-add-repository 'deb http://repos.azulsystems.com/ubuntu stable main' \ 44 | && apt-get install -y --no-install-recommends build-essential wget lua5.2 gettext-base libldap-dev curl unzip python python3-pip python3-dev python3 zulu-11\ 45 | && wget "https://www.aerospike.com/artifacts/aerospike-server-enterprise/${AEROSPIKE_VERSION}/aerospike-server-enterprise_${AEROSPIKE_VERSION}_tools-${AEROSPIKE_TOOLS_VERSION}_ubuntu20.04_x86_64.tgz" -O aerospike-server.tgz \ 46 | && echo "$AEROSPIKE_SHA256 *aerospike-server.tgz" | sha256sum -c - \ 47 | && wget "https://github.com/aerospike/aerospike-loader/releases/download/4.0.3/aerospike-load-4.0.3-jar-with-dependencies.jar" \ 48 | && mkdir aerospike \ 49 | && tar xzf aerospike-server.tgz --strip-components=1 -C aerospike \ 50 | && dpkg -i aerospike/aerospike-server*.deb \ 51 | && dpkg -i aerospike/aerospike-tools*.deb \ 52 | && mkdir -p /opt/aerospike/lib/java \ 53 | && mv aerospike-load-*-jar-with-dependencies.jar /opt/aerospike/lib/java/ \ 54 | && pip install --no-cache-dir aerospike\ 55 | && pip install --no-cache-dir pymongo\ 56 | && wget "https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip" -O ijava-kernel.zip\ 57 | && unzip ijava-kernel.zip -d ijava-kernel \ 58 | && python3 ijava-kernel/install.py --sys-prefix\ 59 | && rm ijava-kernel.zip\ 60 | && rm -rf aerospike-server.tgz aerospike /var/lib/apt/lists/* \ 61 | && apt-get purge -y \ 62 | && apt autoremove -y \ 63 | && mkdir -p /var/log/aerospike 64 | 65 | COPY aerospike /etc/init.d/ 66 | RUN usermod -a -G aerospike ${NB_USER} 67 | 68 | # Add the Aerospike configuration specific to this dockerfile 69 | COPY aerospike.template.conf /etc/aerospike/aerospike.template.conf 70 | COPY aerospike.conf /etc/aerospike/aerospike.conf 71 | COPY features.conf 
/etc/aerospike/features.conf 72 | 73 | RUN chown -R ${NB_UID} /etc/aerospike 74 | RUN chown -R ${NB_UID} /opt/aerospike 75 | RUN chown -R ${NB_UID} /var/log/aerospike 76 | RUN chown -R ${NB_UID} /var/run/aerospike 77 | 78 | #RUN fix-permissions /etc/aerospike/ 79 | #RUN fix-permissions /var/log/aerospike 80 | 81 | COPY notebooks* /home/${NB_USER}/notebooks 82 | RUN echo "Versions:" > /home/${NB_USER}/notebooks/README.md 83 | RUN python -V >> /home/${NB_USER}/notebooks/README.md 84 | RUN java -version 2>> /home/${NB_USER}/notebooks/README.md 85 | RUN asd --version >> /home/${NB_USER}/notebooks/README.md 86 | RUN echo -e "Aerospike Python Client `pip show aerospike|grep Version|sed -e 's/Version://g'`" >> /home/${NB_USER}/notebooks/README.md 87 | #RUN echo -e "Aerospike Java Client 5.0.0" >> /home/${NB_USER}/notebooks/README.md 88 | 89 | COPY jupyter_notebook_config.py /home/${NB_USER}/ 90 | RUN fix-permissions /home/${NB_USER}/ 91 | 92 | # I don't know why this has to be like this 93 | # rather than overiding 94 | COPY entrypoint.sh /usr/local/bin/start-notebook.sh 95 | WORKDIR /home/${NB_USER}/notebooks 96 | USER ${NB_USER} 97 | -------------------------------------------------------------------------------- /Dockerfile.md: -------------------------------------------------------------------------------- 1 | ## Aerospike Development Notebooks Dockerfile 2 | 3 | This repository contains the Dockerfile for building a Docker image for running [Aerospike](http://aerospike.com). 4 | 5 | ## Installation 6 | 7 | 1. Install [Docker](https://www.docker.io/). 8 | 9 | 2. Download from public [Docker Registry](https://index.docker.io/): 10 | 11 | docker pull aerospike-examples/interactive-notebooks 12 | 13 | _Alternatively, you can build an image from Dockerfile:_ 14 | 15 | docker build -t="aerospike-examples/interactive-notebooks" github.com/aerospike-examples/interactive-notebooks 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 aerospike-examples/interactive-notebooks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | # Aerospike Evaluation License Agreement 2 | PLEASE READ THIS EVALUATION LICENSE AGREEMENT (THE “AGREEMENT”) CAREFULLY BEFORE USING THE SOFTWARE AND SERVICES OFFERED BY AEROSPIKE, INC. (“AEROSPIKE”). BY DOWNLOADING THE AEROSPIKE SOFTWARE ON A TRIAL OR EVALUATION BASIS (AN “EVALUATION”), YOU OR THE ENTITY THAT YOU REPRESENT (“LICENSEE”) ARE UNCONDITIONALLY CONSENTING TO BE BOUND BY AND ARE BECOMING A PARTY TO THE MASTER LICENSE AGREEMENT CONSISTING OF THIS PARAGRAPH AND THE FOLLOWING TERMS. PROVISION OF THE PRODUCT IS CONDITIONED ON, AND LICENSEE’S INSTALLATION OR USE OF THE PRODUCT SHALL CONSTITUTE, LICENSEE’S ASSENT TO THE TERMS OF THIS AGREEMENT TO THE EXCLUSION OF ALL OTHER TERMS. 3 | 4 | 1. Grant of License and Restrictions. Subject to the terms hereof and any applicable user/use limitations, Aerospike grants Licensee a personal, nonsublicensable, nonexclusive, limited right to use the licensed product downloaded for an Evaluation in object code form only (the “Product”) subject to any of the limitations herein and only in accordance with Aerospike’s applicable user documentation. Licensee may possess only the number of copies of the Product that you download for an Evaluation (an “Evaluation Product”), may be used only during the Evaluation Term (defined below) and only for purposes of internal evaluation, and not for any production use. Aerospike retains ownership of all Products and rights therein and Licensee will maintain the copyright notice and any other notices that appear on the Product on any copies and any media. Licensee will not (and will not allow any third party to) (i) reverse engineer or attempt to discover any source code or underlying ideas or algorithms of any Product (except to the extent that applicable law prohibits reverse engineering restrictions), (ii) provide, lease, lend, disclose, use for timesharing or service bureau purposes, or otherwise use or allow others to use for the benefit of any third party, any Product (except as expressly and specifically authorized by Aerospike), (iii) possess or use any Product, or allow the transfer, transmission, export, or re-export of any Product or portion thereof in violation of any export control laws or regulations administered by the U.S. Commerce Department, U.S. Treasury Department’s Office of Foreign Assets Control, or any other government agency, (iv) disclose to any third party any benchmarking or comparative study involving any Product, (v) modify any Product, or (vi) run any production instance of an enterprise edition version of a Product with any community edition version of a Product. Prior to disposing of any media or apparatus containing any part of the Product, Licensee shall completely destroy any Product contained therein. All the limitations and restrictions on Products in this Agreement also apply to documentation. 5 | 2. Support and Maintenance. Aerospike will use reasonable commercial efforts to provide the support and maintenance services for the Evaluation Product as and to the extent described in Aerospike’s then-current Support and Maintenance Terms. Licensee shall not use any Support Services for any unsupported application, including, without limitation, any open source or community edition of any Aerospike product, without paying Aerospike’s then-current enterprise subscription fees. 
Aerospike shall be entitled to invoice Licensee (and Licensee shall pay) Aerospike’s then-current enterprise subscription fees for every instance of any Aerospike product (including, without limitation, any open source or community edition thereof) in connection with which Licensee uses (or otherwise benefits from) any Support Services. 6 | 3. Indemnification. Aerospike shall defend, indemnify and hold Licensee harmless from liability to third parties resulting from infringement by a Product of any United States patent or any copyright or misappropriation of any trade secret, provided Aerospike is promptly notified of any and all threats, claims and proceedings related thereto and given reasonable assistance and the opportunity to assume sole control over defense and settlement; Aerospike will not be responsible for any settlement it does not approve. The foregoing obligations do not apply with respect to a Product or portions or components thereof to the extent (i) not created by Aerospike, (ii) made in whole or in part in accordance to Licensee specifications, (iii) that are modified after delivery by Aerospike, (iv) combined with other products, processes or materials where the alleged infringement relates to such combination, (v) where Licensee continues allegedly infringing activity after being notified thereof or after being informed of modifications that would have avoided the alleged infringement, or (vi) where Licensee’s use of such Product is not strictly in accordance herewith. Licensee will indemnify Aerospike from all damages, costs, settlements, attorneys’ fees and expenses related to any claim of infringement or misappropriation excluded from Aerospike’s indemnity obligation by the preceding sentence. 7 | 4. Embedded Reporting/Compliance Routine; Data Access and Use; Feedback. Licensee acknowledges that Products (excluding the Enterprise Edition of the Product) may contain automated reporting routines that will automatically identify and analyze certain aspects of use and performance of Products and/or the systems on which they are installed, as well as the operator and operating environment (including problems and issues that arise in connection therewith), and provide e-mail and other reports to Aerospike; this includes, without limitation, information on usage that Aerospike uses for billing purposes. Aerospike will be entitled to inspect the installation and configuration of such Products and systems from time to time on reasonable notice. Provided it does not identify Licensee, Aerospike will be free to use for development, diagnostic and corrective purposes any data and information it so collects relating to diagnosis, problems, systems, performance, use or functionality, and may allow others to do so. Notwithstanding anything else, Licensee agrees that Aerospike may freely exploit and make available any and all feedback, suggestions, ideas, enhancement requests, recommendations or other information provided by Licensee any other party relating to the Products. 8 | 5. No Warranty. ALL PRODUCTS AND SERVICES (INCLUDING, WITHOUT LIMITATION, EVALUATION PRODUCTS) ARE PROVIDED “AS IS” WITHOUT WARRANTY OF ANY KIND FROM ANYONE, INCLUDING WITHOUT LIMITATION, ANY WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. FURTHER, AEROSPIKE DOES NOT WARRANT RESULTS OF USE OR THAT THE PRODUCTS ARE BUG FREE OR THAT THE PRODUCT’S USE WILL BE UNINTERRUPTED. 9 | 6. Limitation of Liability. 
NOTWITHSTANDING ANYTHING ELSE HEREIN OR OTHERWISE, AND EXCEPT FOR BODILY INJURY, NEITHER AEROSPIKE NOR ANY LICENSOR SHALL BE LIABLE OR OBLIGATED WITH RESPECT TO THE SUBJECT MATTER HEREOF OR UNDER ANY CONTRACT, NEGLIGENCE, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY (I) FOR ANY DOLLAR AMOUNTS OR (II) FOR ANY COST OF PROCUREMENT OF SUBSTITUTE GOODS, TECHNOLOGY, SERVICES OR RIGHTS; (III) FOR ANY INCIDENTAL OR CONSEQUENTIAL DAMAGES; (IV) FOR INTERRUPTION OF USE OR LOSS OR CORRUPTION OF DATA; OR (V) FOR ANY MATTER BEYOND ITS REASONABLE CONTROL. THE PRODUCT IS NOT DESIGNED, MANUFACTURED, OR INTENDED FOR USE IN HAZARDOUS ENVIRONMENTS REQUIRING FAIL-SAFE PERFORMANCE WHERE THE FAILURE OF THE PRODUCT COULD LEAD DIRECTLY TO DEATH, PERSONAL INJURY, OR SIGNIFICANT PHYSICAL OR ENVIRONMENTAL DAMAGE (“HIGH RISK ACTIVITIES”). USE OF THE PRODUCT IN HIGH RISK ACTIVITIES IS NOT AUTHORIZED. THE PARTIES AGREE THAT THIS SECTION 8 REPRESENTS A REASONABLE ALLOCATION OF RISK AND THAT AEROSPIKE WOULD NOT PROCEED IN THE ABSENCE OF SUCH ALLOCATION. 10 | 7. Confidentiality. Licensee understands that Company has disclosed or may disclose information relating to Company’s technology or business, including, without limitation, the Products and any other software, documentation, updates, modifications, or new releases thereof and the existence, terms and conditions of this Agreement (hereinafter collectively referred to as “Proprietary Information”). Licensee agrees: (i) not to divulge to any third person any such Proprietary Information, (ii) to give access to such Proprietary information solely to those employees with a need to have access thereto for purposes of this Agreement, and (iii) to take the same security precautions to protect against disclosure or unauthorized use of such Proprietary information that Licensee takes with its own proprietary information, but in no event will Licensee apply less than reasonable precautions to protect such Proprietary Information. Nothing in this Agreement will prevent Licensee from disclosing the Proprietary Information pursuant to any judicial or governmental order, provided that Licensee gives Company reasonable prior notice of such disclosure to contest such order. 11 | 8. Miscellaneous. Neither this Agreement nor the licenses granted hereunder are assignable or transferable (and any attempt to do so shall be void); provided that either party may assign and transfer the foregoing to a successor to substantially all of (i) in the case of Aerospike, Aerospike’s Product business or assets or, (ii) in the case of Licensee, Licensee’s business for which Products are licensed (but if the authorized use is not limited, the assignee is not licensed to expand use beyond Licensee’s bona fide pre-assignment use plus reasonably expected growth assuming the assignment and related transactions had not occurred). The provisions hereof are for the benefit of the parties only and not for any other person or entity. No failure or delay in exercising any right hereunder will operate as a waiver thereof, nor will any partial exercise of any right or power hereunder preclude further exercise. If any provision shall be adjudged by any court of competent jurisdiction to be unenforceable or invalid, that provision shall be limited or eliminated to the minimum extent necessary so that this Agreement shall otherwise remain in full force and effect and enforceable. 
This Agreement shall be deemed to have been made in, and shall be construed pursuant to the laws of the State of California and the United States without regard to conflicts of laws provisions thereof, and without regard to the United Nations Convention on the International Sale of Goods or the Uniform Computer Information Transactions Act. This Agreement is the complete and exclusive statement of the mutual understanding of the parties and supersedes and cancels all previous written and oral agreements and communications relating to the subject matter hereof and any waivers or amendments shall be effective only if made in a writing executed by authorized representatives of both parties; however, any pre-printed or standard terms of any Licensee purchase order, confirmation, or similar form, even if signed by the parties after the effective date hereof, shall have no force or effect. The substantially prevailing party in any action to enforce this agreement will be entitled to recover its attorney’s fees and costs in connection with such action. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains the build information for the Docker Image used at https://hub.docker.com/r/aerospike/intro-notebooks. 2 | 3 | # Aerospike Interactive Notebooks 4 | 5 | Aerospike is a distributed database designed to serve global applications with low latency, fast throughput, and resilience to failure. 6 | 7 | The Docker Image in this repo contains a complete [Aerospike](https://www.aerospike.com) development environment and two categories of Jupyter Notebooks: 8 | - **Aerospike Client Tutorials:** Notebooks that provide interactive examples of Java and Python client use of Aerospike Database. 9 | - **Spark Notebooks:** Notebooks that show how Aerospike can be used in conjunction with Spark. 10 | 11 | Useful links: 12 | - [Run the **Aerospike Java and Python Client Development Environment** locally in Docker](#run-client-notebooks-in-docker-container). 13 | - How-to set up and run Spark on [Linux (CentOS)](#set-up-spark-notebooks-on-linux-centos) or [MacOS X](#set-up-spark-notebooks-on-macos-x) to work with the Spark notebooks in the development environment. 14 | - [Notebook list](notebooks/README.md#notebooks) 15 | - [Software contents list](#software-contents) 16 | 17 | Documentation for Aerospike is available at [https://aerospike.com/docs](https://aerospike.com/docs), 18 | and Docker Desktop installation at [https://docs.docker.com/desktop/](https://docs.docker.com/desktop/). 19 | 20 | The download and use of this Aerospike software is governed by [Aerospike Evaluation License Agreement](https://www.aerospike.com/forms/evaluation-license-agreement/). 21 | 22 | 23 | ## Run Client Notebooks in Docker Container 24 | Notebooks for Java and Python clients are currently avaialble. Some Spark notebooks can currently run in the container. 25 | 26 | 1. Install [Docker](https://www.docker.com). 27 | 28 | 1. Get [the Intro Notebooks image](https://hub.docker.com/r/aerospike/intro-notebooks) from [Docker Hub](https://hub.docker.com/u/aerospike): 29 | ``` 30 | docker pull aerospike/intro-notebooks 31 | ``` 32 | [Alternatively] If building the image: 33 | 1. Git clone image repo: 34 | ``` 35 | git clone https://github.com/aerospike-examples/interactive-notebooks.docker.git 36 | ``` 37 | 1. 
cd to "interactive-notebooks.docker" and build from Dockerfile: 38 | ``` 39 | docker build -t aerospike/intro-notebooks . 40 | ``` 41 | 1. Run the image and expose port 8888: 42 | ``` 43 | docker run --name aero-nb -p 8888:8888 aerospike/intro-notebooks 44 | ``` 45 | [Optional alternative] Use the LOGFILE environment variable to specify a log file path in the image: 46 | ``` 47 | docker run -e "LOGFILE=/opt/aerospike/aerospike.log" --name aero-nb -p 8888:8888 aerospike/intro-notebooks 48 | ``` 49 | 1. Point your browser at the url with token which should be printed on the output. By default it should be: 50 | ``` 51 | http://127.0.0.1:8888/?token= 52 | ``` 53 | 54 | Example: 55 | ```text 56 | $ docker run --name aero-nb -p 8888:8888 aerospike/intro-notebooks 57 | 58 | link eth0 state up 59 | link eth0 state up in 0 60 | Set username to: jovyan 61 | usermod: no changes 62 | Executing the command: jupyter notebook 63 | [I 05:28:34.202 NotebookApp] Writing notebook server cookie secret to /home/jovyan/.local/share/jupyter/runtime/notebook_cookie_secret 64 | [I 05:28:34.954 NotebookApp] JupyterLab extension loaded from /opt/conda/lib/python3.8/site-packages/jupyterlab 65 | [I 05:28:34.954 NotebookApp] JupyterLab application directory is /opt/conda/share/jupyter/lab 66 | [I 05:28:34.957 NotebookApp] Serving notebooks from local directory: /home/jovyan/notebooks 67 | [I 05:28:34.957 NotebookApp] Jupyter Notebook 6.1.4 is running at: 68 | [I 05:28:34.957 NotebookApp] http://6a374afd9f00:8888/?token=c45783e6631e305c97f6919905250e61f09049e750813cf6 69 | [I 05:28:34.957 NotebookApp] or http://127.0.0.1:8888/?token=c45783e6631e305c97f6919905250e61f09049e750813cf6 70 | [I 05:28:34.957 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 71 | 72 | ``` 73 | 74 | ## Spark Notebooks 75 | Spark notebooks can run on Linux (CentOS) and MacOS X. 76 | 77 | Some Spark notebooks can fully run in the container by following the specific setup provided in the respective notebook. To run any Spark notebook with an external Spark, Aerospike, and/or Jupyter server, follow the instructions below. 78 | 79 | ### Set up Spark Notebooks on Linux (CentOS) 80 | 81 | yum installer used below - use dbpkg/rpm/other if your Linux distribution does not support yum 82 | 83 | ``` bash 84 | sudo yum -y install gcc zlib-devel openssl-devel libffi-devel sqlite-devel bzip2-devel bzip2 xz-devel screen wget 85 | ``` 86 | 87 | Get your own local copy of Python 3.7 (ignore if you have it already). 
Below we install to ~/.localpython 88 | 89 | ``` bash 90 | PYTHON_VERSION=3.7.1 91 | wget http://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz 92 | tar zxvf Python-${PYTHON_VERSION}.tgz 93 | cd Python-${PYTHON_VERSION} 94 | mkdir ~/.localpython 95 | ./configure --prefix=$HOME/.localpython 96 | make 97 | make install 98 | ``` 99 | 100 | Set up a virtual Python environment - this is a sandbox which avoids you making system wide changes 101 | 102 | ``` bash 103 | # Install virtualenv tool 104 | ~/.localpython/bin/pip3 install virtualenv 105 | # Create on-disk representation of virtual environment at ~/spark-venv 106 | ~/.localpython/bin/virtualenv ~/spark-venv 107 | # Activate virtual environment 108 | source ~/spark-venv/bin/activate 109 | ``` 110 | 111 | Use of a virtual environment is indicated in the command line string - the name of the virtual environment - spark-env is added to the command line prompt - e.g., 112 | 113 | ``` 114 | (spark-venv) [ec2-user@ip-10-0-0-248 Python-3.7.1]$ 115 | ``` 116 | 117 | You can return to the system enviroment by typing ```deactivate``` and reactivate using ```source ~/spark-venv/bin/activate``` 118 | 119 | Get rid of annoying messages concerning pip upgrade 120 | 121 | ``` 122 | pip install --upgrade pip 123 | ``` 124 | 125 | Note at this point, all our Python related tooling is local to our virtual environment. So ```which pip``` will give 126 | 127 | ``` 128 | ~/spark-venv/bin/pip 129 | ``` 130 | 131 | Install required Python dependencies 132 | 133 | ``` 134 | pip install jupyter PySpark findspark numpy pandas matplotlib sklearn 135 | ``` 136 | 137 | If you plan on using scala in your workbooks you need to install the spylon kernel - some care is needed with Python versioning 138 | ``` 139 | pip install spylon_kernel 140 | PYTHON=$(which python) 141 | sudo $PYTHON -m spylon_kernel install 142 | ``` 143 | 144 | Install Spark and set ```$SPARK_HOME```. Note you may need to change the SPARK_VERSION if you get a 404 following the wget. 145 | 146 | ``` bash 147 | SPARK_VERSION=2.4.7 148 | HADOOP_VERSION=2.7 149 | cd /tmp 150 | wget https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz 151 | tar xvfz spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz 152 | sudo mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/ 153 | export SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 154 | export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH 155 | cd ~ 156 | ``` 157 | 158 | Use of the Aerospike Spark Connector requires a valid feature key. The notebooks assume this is located at ```/etc/aerospike/features.conf```. Make sure your feature key is locally available, and if it is not located as above, modify the ```AS_FEATURE_KEY_PATH``` variable at the head of the notebook. You may need to run 159 | 160 | ``` bash 161 | sudo mkdir /etc/aerospike 162 | sudo chmod 777 /etc/aerospike 163 | ``` 164 | 165 | Make sure you have the interactive-notebooks repository locally. 166 | 167 | ``` 168 | git clone https://github.com/aerospike-examples/interactive-notebooks 169 | ``` 170 | Finally start Jupyter. Change the IP in the string below - it can be localhost, but if you want to access from a remote host, choose the IP of one of your ethernet interfaces. You could replace with $(hostname -I | awk '{print $1}') 171 | 172 | Note I set the notebook-dir to point to the directory containing the notebooks in this repository. 
You also will need SPARK_HOME and PYTHONPATH set correctly (reproducing the former from the above). 173 | 174 | ``` 175 | SPARK_VERSION=2.4.7 176 | HADOOP_VERSION=2.7 177 | export SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} 178 | export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH 179 | jupyter notebook --no-browser --ip= --port=8888 --notebook-dir=~/interactive-notebooks/spark/ 180 | ``` 181 | 182 | You will see output similar to 183 | 184 | ``` 185 | [I 09:36:52.202 NotebookApp] Writing notebook server cookie secret to /home/ec2-user/.local/share/jupyter/runtime/notebook_cookie_secret 186 | [I 09:36:52.370 NotebookApp] Serving notebooks from local directory: /home/ec2-user/interactive-notebooks/spark 187 | [I 09:36:52.370 NotebookApp] Jupyter Notebook 6.1.4 is running at: 188 | [I 09:36:52.370 NotebookApp] http://10.0.0.248:8888/?token=5bf2910a2527567346323e0a4735e94136e1c70d392b561f 189 | [I 09:36:52.370 NotebookApp] or http://127.0.0.1:8888/?token=5bf2910a2527567346323e0a4735e94136e1c70d392b561f 190 | [I 09:36:52.371 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 191 | [C 09:36:52.373 NotebookApp] 192 | ``` 193 | 194 | You will need to use the URLs in the output to access jupyter - as the security token is expected. 195 | 196 | You can omit this step by omitting the --no-browser flag - in that case jupyter will open a browser window local to itself, and request the Notebook app URL above. 197 | 198 | You may wish to run the jupyter startup command from a [screen](https://linuxize.com/post/how-to-use-linux-screen/) so it will stay running if your session terminates. We installed screen at the outset to allow for this. 199 | 200 | #### pyenv / Linux 201 | 202 | You can go down the pyenv route on Linux as per the instructions for Mac. You install pyenv differently 203 | 204 | ``` 205 | sudo yum -y install gcc git zlib-devel openssl-devel libffi-devel sqlite-devel bzip2-devel bzip2 xz-devel screen 206 | git clone http://github.com/pyenv/pyenv .pyenv 207 | export PATH=$PATH:~/.pyenv/bin 208 | ``` 209 | 210 | but once done, just pick up the MacOS instructions at ```pyenv install 3.7.3``` 211 | 212 | ### Set Up Spark Notebooks on MacOS X 213 | The main challenge is getting a sufficiently up to date version of Python installed and set as your working version. You mustn't mess with your existing version of Python (see [xkcd](https://xkcd.com/1987/)). 214 | 215 | [pyenv](https://github.com/pyenv/pyenv) is the tool to help with this. 216 | 217 | First you'll need **brew** the package manager for macOS. From [instructions](https://brew.sh) 218 | ``` 219 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 220 | ``` 221 | 222 | Next install pyenv 223 | ``` 224 | brew install pyenv 225 | ``` 226 | and finally we can install our required python version. The subsequent 'global' command sets 3.7.3 as our selected version 227 | ``` 228 | pyenv install 3.7.3 229 | pyenv global 3.7.3 230 | ``` 231 | The command below sets up our path so the required version of Python is used. Once done, do ```python --version``` to check. 232 | ``` 233 | eval "$(pyenv init -)" 234 | ``` 235 | You can now set up your virtual environment - this is a sandbox which avoids you making system wide changes. Note this is the same as the steps above for Linux, except we don't have to give explicit paths to pip, virtualenv. 
236 | 237 | ``` bash 238 | # Install virtualenv tool 239 | pip install virtualenv 240 | # Create on-disk representation of virtual environment at ~/spark-venv 241 | virtualenv ~/spark-venv 242 | # Activate virtual environment 243 | source ~/spark-venv/bin/activate 244 | ``` 245 | 246 | You can now follow the Linux instructions from 247 | ``` 248 | pip install jupyter PySpark findspark numpy pandas matplotlib sklearn 249 | ``` 250 | 251 | onwards. 252 | 253 | 254 | ## Software Contents 255 | 256 | - [Aerospike](https://www.aerospike.com) development software: 257 | - Aerospike Database 258 | - Aerospike Java and Python client libraries 259 | - Aerospike Tools 260 | - [Jupyter Notebook Server](https://jupyter.org/) 261 | -------------------------------------------------------------------------------- /aerospike: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Start/stop the aerospike daemon. 3 | # 4 | ### BEGIN INIT INFO 5 | # Provides: aerospike 6 | # Required-Start: $remote_fs $syslog $time 7 | # Required-Stop: $remote_fs $syslog $time 8 | # Should-Start: $network $named slapd autofs ypbind nscd nslcd winbind 9 | # Should-Stop: $network $named slapd autofs ypbind nscd nslcd winbind 10 | # Default-Start: 2 3 4 5 11 | # Default-Stop: 12 | # Short-Description: Aerospike 13 | # Description: Aerospike 14 | ### END INIT INFO 15 | 16 | PATH=/bin:/usr/bin:/sbin:/usr/sbin 17 | DESC="aerospike daemon" 18 | NAME=aerospike 19 | DAEMON=/usr/bin/asd 20 | PIDFILE=/var/run/aerospike/asd.pid 21 | SCRIPTNAME=/etc/init.d/"$NAME" 22 | 23 | test -f $DAEMON || exit 0 24 | 25 | . /lib/lsb/init-functions 26 | 27 | case "$1" in 28 | start) log_success_msg "Starting aerospike" "aerospike" 29 | start_daemon -p $PIDFILE $DAEMON --config-file /etc/aerospike/aerospike.conf $EXTRA_OPTS 30 | log_success_msg $? 31 | ;; 32 | stop) log_success_msg "Stopping aerospike" "aerospike" 33 | pkill asd 34 | sleep 1 35 | if [ $? -ne 0 ]; then pkill -9 asd; fi 36 | log_success_msg $RETVAL 37 | ;; 38 | restart) log_success_msg "Restarting aerospike" "aerospike" 39 | $0 stop 40 | $0 start 41 | ;; 42 | coldstart) log_success_msg "Starting aerospike" "aerospike" 43 | start_daemon -p $PIDFILE $DAEMON --cold-start $EXTRA_OPTS 44 | log_success_msg $? 45 | ;; 46 | status) 47 | status_of_proc -p $PIDFILE $DAEMON $NAME && exit 0 || exit $? 48 | ;; 49 | *) log_success_msg "Usage: /etc/init.d/aerospike {start|stop|status|restart|coldstart}" 50 | exit 2 51 | ;; 52 | esac 53 | exit 0 54 | -------------------------------------------------------------------------------- /aerospike.conf: -------------------------------------------------------------------------------- 1 | # Aerospike database configuration file. 2 | 3 | # This stanza must come first. 4 | service { 5 | user jovyan 6 | #group aerospike 7 | pidfile /var/run/aerospike/asd.pid 8 | # service-threads 4 # cpu x 5 in 4.7 9 | # transaction-queues 4 # obsolete in 4.7 10 | # transaction-threads-per-queue 4 # obsolete in 4.7 11 | proto-fd-max 15000 12 | } 13 | 14 | logging { 15 | 16 | # Log file must be an absolute path. 17 | file /var/log/aerospike/aerospike.log { 18 | context any info 19 | } 20 | 21 | # Send log messages to stdout 22 | console { 23 | context any info 24 | } 25 | } 26 | 27 | network { 28 | service { 29 | address any 30 | port 3000 31 | 32 | # Uncomment the following to set the `access-address` parameter to the 33 | # IP address of the Docker host. 
This will the allow the server to correctly 34 | # publish the address which applications and other nodes in the cluster to 35 | # use when addressing this node. 36 | # access-address 37 | } 38 | 39 | heartbeat { 40 | 41 | address any 42 | # mesh is used for environments that do not support multicast 43 | mode mesh 44 | port 3002 45 | 46 | # use asinfo -v 'tip:host=;port=3002' to inform cluster of 47 | # other mesh nodes 48 | 49 | interval 150 50 | timeout 10 51 | } 52 | 53 | fabric { 54 | address any 55 | port 3001 56 | } 57 | 58 | info { 59 | address any 60 | port 3003 61 | } 62 | } 63 | 64 | namespace test { 65 | replication-factor 2 66 | # memory-size 1G 67 | default-ttl 30d # 5 days, use 0 to never expire/evict. 68 | nsup-period 120 69 | 70 | storage-engine memory { 71 | file /opt/aerospike/data/test.dat 72 | filesize 4G 73 | # data-in-memory true # Store data in memory in addition to file. 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /aerospike.template.conf: -------------------------------------------------------------------------------- 1 | # Aerospike database configuration file. 2 | 3 | # This stanza must come first. 4 | service { 5 | user jovyan 6 | 7 | pidfile /var/run/aerospike/asd.pid 8 | # service-threads ${SERVICE_THREADS} # cpu x 5 in 4.7 9 | # transaction-queues ${TRANSACTION_QUEUES} # obsolete in 4.7 10 | # transaction-threads-per-queue ${TRANSACTION_THREADS_PER_QUEUE} # obsolete in 4.7 11 | proto-fd-max 15000 12 | } 13 | 14 | logging { 15 | 16 | # Log file must be an absolute path. 17 | file ${LOGFILE} { 18 | context any info 19 | } 20 | 21 | # Send log messages to stdout 22 | console { 23 | context any info 24 | } 25 | } 26 | 27 | network { 28 | service { 29 | address ${SERVICE_ADDRESS} 30 | port ${SERVICE_PORT} 31 | 32 | # Uncomment the following to set the `access-address` parameter to the 33 | # IP address of the Docker host. This will the allow the server to correctly 34 | # publish the address which applications and other nodes in the cluster to 35 | # use when addressing this node. 36 | # access-address 37 | } 38 | 39 | heartbeat { 40 | 41 | address ${HB_ADDRESS} 42 | # mesh is used for environments that do not support multicast 43 | mode mesh 44 | port ${HB_PORT} 45 | 46 | # use asinfo -v 'tip:host=;port=3002' to inform cluster of 47 | # other mesh nodes 48 | 49 | interval 150 50 | timeout 10 51 | } 52 | 53 | fabric { 54 | address ${FABRIC_ADDRESS} 55 | port ${FABRIC_PORT} 56 | } 57 | 58 | info { 59 | address ${INFO_ADDRESS} 60 | port ${INFO_PORT} 61 | } 62 | } 63 | 64 | namespace ${NAMESPACE} { 65 | replication-factor ${REPL_FACTOR} 66 | # memory-size ${MEM_GB}G 67 | default-ttl ${DEFAULT_TTL} # 5 days, use 0 to never expire/evict. 68 | nsup-period ${NSUP_PERIOD} 69 | 70 | storage-engine memory { 71 | file /opt/aerospike/data/${NAMESPACE}.dat 72 | filesize ${STORAGE_GB}G 73 | # data-in-memory true # Store data in memory in addition to file. 74 | } 75 | } 76 | 77 | -------------------------------------------------------------------------------- /bin/build-docker-image.sh: -------------------------------------------------------------------------------- 1 | #rm -rf target 2 | #mkdir target 3 | #cp -r docker/* target 4 | #cp -r notebooks/java target/notebooks/ 5 | #cp -r notebooks/python target/notebooks/ 6 | 7 | docker build --no-cache -t ${1:-aerospike/intro-notebooks} . 
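# A minimal usage sketch (an addition, not part of the original script; it assumes Docker is
# installed and the script is invoked from the repository root, where the Dockerfile lives;
# the optional first argument overrides the image tag):
#   ./bin/build-docker-image.sh                       # builds the default tag aerospike/intro-notebooks
#   ./bin/build-docker-image.sh myorg/my-notebooks    # hypothetical custom tag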
8 | -------------------------------------------------------------------------------- /binder-pre/binder_run_first.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# This notebook sets some prereqs if running in some environments such as https://mybinder.org\n", 8 | "\n", 9 | "*** Important: If in Binder, this notebook will auto close if unused for 10 minutes. If that occurs reload from the initial url. ***\n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## In a binder environment, asd process must be started\n", 18 | "\n", 19 | "If running enterprise edition, feture key must be set as well" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import subprocess\n", 30 | "import pprint\n", 31 | "import time\n", 32 | "\n", 33 | "features = \"\"\"\n", 34 | "COPY FEATURE FILE CONTENTS HERE\n", 35 | "\"\"\"\n", 36 | "\n", 37 | "if features.strip() == \"\" or features.strip() == \"COPY FEATURE FILE CONTENTS HERE\":\n", 38 | " features_file = open(\"/etc/aerospike/features.conf\", \"w\")\n", 39 | " n = features_file.write(features)\n", 40 | " features_file.close()\n", 41 | "\n", 42 | "\n", 43 | "os.system(\"asd\")\n" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.8.2-final" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 4 68 | } -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -m 3 | 4 | export CORES=$(grep -c ^processor /proc/cpuinfo) 5 | export SERVICE_THREADS=${SERVICE_THREADS:-$CORES} 6 | export TRANSACTION_QUEUES=${TRANSACTION_QUEUES:-$CORES} 7 | export TRANSACTION_THREADS_PER_QUEUE=${TRANSACTION_THREADS_PER_QUEUE:-4} 8 | export LOGFILE=${LOGFILE:-/dev/null} 9 | export SERVICE_ADDRESS=${SERVICE_ADDRESS:-any} 10 | export SERVICE_PORT=${SERVICE_PORT:-3000} 11 | export HB_ADDRESS=${HB_ADDRESS:-any} 12 | export HB_PORT=${HB_PORT:-3002} 13 | export FABRIC_ADDRESS=${FABRIC_ADDRESS:-any} 14 | export FABRIC_PORT=${FABRIC_PORT:-3001} 15 | export INFO_ADDRESS=${INFO_ADDRESS:-any} 16 | export INFO_PORT=${INFO_PORT:-3003} 17 | export NAMESPACE=${NAMESPACE:-test} 18 | export REPL_FACTOR=${REPL_FACTOR:-2} 19 | export MEM_GB=${MEM_GB:-1} 20 | export DEFAULT_TTL=${DEFAULT_TTL:-30d} 21 | export STORAGE_GB=${STORAGE_GB:-4} 22 | export NSUP_PERIOD=${NSUP_PERIOD:-120} 23 | export USER=${USER:-jovyan} 24 | export MEMORY_SIZE=${MEMORY_SIZE:-128} 25 | export INDEX_STAGE_SIZE=${INDEX_STAGE_SIZE:-128} 26 | 27 | # Fill out conffile with above values 28 | if [ -f /etc/aerospike/aerospike.template.conf ]; then 29 | envsubst < /etc/aerospike/aerospike.template.conf > /etc/aerospike/aerospike.conf 30 | fi 31 | 32 | NETLINK=${NETLINK:-eth0} 33 | 34 | # we will wait a bit for the network link to be up. 
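# (Added note, inferred from the loop below: it polls /sys/class/net/${NETLINK}/operstate,
#  sleeping 0.1s per attempt and giving up after 20 attempts, roughly 2 seconds, so container
#  startup is never blocked indefinitely waiting for the interface to come up.)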
35 | NETLINK_UP=0 36 | NETLINK_COUNT=0 37 | echo "link $NETLINK state $(cat /sys/class/net/${NETLINK}/operstate)" 38 | while [ $NETLINK_UP -eq 0 ] && [ $NETLINK_COUNT -lt 20 ]; do 39 | if grep -q "up" /sys/class/net/${NETLINK}/operstate; then 40 | NETLINK_UP=1 41 | else 42 | sleep 0.1 43 | let NETLINK_COUNT=NETLINK_COUNT+1 44 | fi 45 | done 46 | echo "link $NETLINK state $(cat /sys/class/net/${NETLINK}/operstate) in ${NETLINK_COUNT}" 47 | 48 | service aerospike restart 49 | 50 | ##### 51 | # Jupiter stuff 52 | ##### 53 | 54 | wrapper="" 55 | if [[ "${RESTARTABLE}" == "yes" ]]; then 56 | wrapper="run-one-constantly" 57 | fi 58 | 59 | if [[ ! -z "${JUPYTERHUB_API_TOKEN}" ]]; then 60 | # launched by JupyterHub, use single-user entrypoint 61 | exec /usr/local/bin/start-singleuser.sh "$@" 62 | elif [[ ! -z "${JUPYTER_ENABLE_LAB}" ]]; then 63 | . /usr/local/bin/start.sh $wrapper jupyter lab "$@" 64 | else 65 | . /usr/local/bin/start.sh $wrapper jupyter notebook "$@" 66 | fi 67 | -------------------------------------------------------------------------------- /features.conf: -------------------------------------------------------------------------------- 1 | # generated 2024-05-24 19:44:42 2 | 3 | feature-key-version 2 4 | serial-number 136523944 5 | 6 | account-name Aerospike 7 | account-ID Aerospike_Eval_ver.7.2 8 | 9 | valid-until-version 7.2 10 | 11 | asdb-change-notification true 12 | asdb-cluster-nodes-limit 1 13 | asdb-compression true 14 | asdb-encryption-at-rest true 15 | asdb-flash-index true 16 | asdb-ldap true 17 | asdb-pmem true 18 | asdb-rack-aware true 19 | asdb-secrets true 20 | asdb-strong-consistency true 21 | asdb-vault true 22 | asdb-xdr true 23 | database-recovery true 24 | elasticsearch-connector true 25 | graph-service true 26 | mesg-jms-connector true 27 | mesg-kafka-connector true 28 | presto-connector true 29 | pulsar-connector true 30 | spark-connector true 31 | 32 | ----- SIGNATURE ------------------------------------------------ 33 | MEYCIQDykHgLkd9N7xIzOV80QHpMfmwtu6rtFV/E/9wUcdb5PgIhALKE7QQakgsp 34 | EyjZJtoGtyO1UnXLioru9cY6uIizUozmJA== 35 | ----- END OF SIGNATURE ----------------------------------------- 36 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-examples/interactive-notebooks/0e582d4305974f6cadd390e2086e8550f1b3ecf7/logo.png -------------------------------------------------------------------------------- /notebooks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-examples/interactive-notebooks/0e582d4305974f6cadd390e2086e8550f1b3ecf7/notebooks/.DS_Store -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | This area is for [Jupyter notebooks](https://jupyter.org/) in .ipynb format. Python and Java notebooks are currently supported by the kernel. 4 | 5 | The list of notebooks below has links to browse each notebook in the viewer and to launch it in interactive mode in Binder. This repository also provides a Docker container that you can install (see the [instructions](../README.md)) to run the notebooks locally. 
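As a quick reference, the commands below are a minimal sketch of running the notebooks locally with that container; they mirror the pull/run steps in the top-level README (adjust the published port if 8888 is already in use):

``` bash
docker pull aerospike/intro-notebooks
docker run --name aero-nb -p 8888:8888 aerospike/intro-notebooks
# then open the http://127.0.0.1:8888/?token=... URL printed in the output
```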
6 | 7 | 8 | All Notebooks | [View All](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks) | [Launch in Binder](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=) 9 | :-------- | ---- | ------ 10 |   Aerospike Notebooks Readme/Tips | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/readme_tips.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=readme_tips.ipynb) 11 | | | | | 12 | **Java Notebooks** | [View All](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java) | [Launch in Binder](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java) 13 | | | | | 14 |   A Simple Put-Get Example | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/SimplePutGetExample.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/SimplePutGetExample.ipynb) 15 |   Understanding Asynchronous Operations in Aerospike | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/async_ops.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/async_ops.ipynb) 16 |   Aerospike Document API for JSON Documents | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/doc_api.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/doc_api.ipynb) 17 |   Understanding Expressions in Aerospike | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/expressions.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/expressions.ipynb) 18 |   Aerospike Hello World! 
| [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/hello_world.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/hello_world.ipynb) 19 |   Aerospike Java Client – Advanced Collection Data Types | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-advanced_collection_data_types.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-advanced_collection_data_types.ipynb) 20 |   Aerospike Java Client – Introduction to Data Modeling | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-intro_to_data_modeling.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-intro_to_data_modeling.ipynb) 21 |   Introduction to Transactions with Aerospike | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-intro_to_transactions.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-intro_to_transactions.ipynb) 22 |   Modeling Using Lists | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-modeling_using_lists.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-modeling_using_lists.ipynb) 23 |   Modeling Using Maps | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-modeling_using_maps.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-modeling_using_maps.ipynb) 24 |   Aerospike Java Client – Reading and Updating Lists | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-working_with_lists.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-working_with_lists.ipynb) 25 |   Aerospike Java Client – Reading and Updating Maps | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/java-working_with_maps.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/java-working_with_maps.ipynb) 26 |   Look-Aside Cache for MongoDB | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/look_aside_cache_mongo.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/look_aside_cache_mongo.ipynb) 27 |   Java Object Mapper | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/object_mapper.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/object_mapper.ipynb) 28 |   Aerospike Query and UDF | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/query_udf.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/query_udf.ipynb) 29 |   Implementing SQL Operations: Aggregates (Part 1) | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/sql_aggregates_1.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/sql_aggregates_1.ipynb) 30 |   Implementing SQL Operations: Aggregates (Part 2) | 
[View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/sql_aggregates_2.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/sql_aggregates_2.ipynb) 31 |   Implementing SQL Operations: SELECT | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/sql_select.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/sql_select.ipynb) 32 |   Implementing SQL Operations: CREATE, UPDATE, DELETE | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/sql_update.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/sql_update.ipynb) 33 |   Tweetaspike: A Simple Application | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/java/tweetaspike.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=java/tweetaspike.ipynb) 34 | | | | | 35 | **Python Notebooks** | [View All](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python) | [Launch in Binder](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python) 36 | | | | | 37 |   Aerospike Basic Operations | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/basic_operations.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/basic_operations.ipynb) 38 |   Aerospike Hello World! | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/hello_world.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/hello_world.ipynb) 39 |   Local Cache | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/local_cache.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/local_cache.ipynb) 40 |   Look-Aside Cache for MongoDB | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/look_aside_cache.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/look_aside_cache.ipynb) 41 |   Aerospike Queries in Python | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/query.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/query.ipynb) 42 |   Aerospike Notebooks Readme/Tips | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/readme_tips.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/readme_tips.ipynb) 43 |   A Simple Put-Get Example | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/simple_put_get_example.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/simple_put_get_example.ipynb) 44 |   Implementing Read-Write Transactions with R-M-W Pattern | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/python/transactions_rmw_pattern.ipynb) | [Launch](https://mybinder.org/v2/gh/aerospike-examples/interactive-notebooks/main?filepath=python/transactions_rmw_pattern.ipynb) 45 | | 
| | | 46 | **Spark Notebooks** | [View All](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/spark) 47 | | | | | 48 |   Aerospike Connect for Spark Tutorial for Python | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/spark/AerospikeSparkPython.ipynb) 49 |   Aerospike Spark Connector Tutorial for Scala | [View](https://github.com/aerospike-examples/interactive-notebooks/tree/main/notebooks/spark/AerospikeSparkScala.ipynb) 50 | 51 | -------------------------------------------------------------------------------- /notebooks/java/README.md: -------------------------------------------------------------------------------- 1 | This area is for Java Jupyter notebooks in .ipynb format. 2 | 3 | Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then in the notebook interface click on File->Open, and select Upload. 4 | -------------------------------------------------------------------------------- /notebooks/java/SimplePutGetExample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A Simple Put-Get Example\n", 8 | "\n", 9 | "A simple example of `put` and `get` calls in Aerospike.\n", 10 | "\n", 11 | "This notebook requires the Aerospike Database running locally with Java kernel and Aerospike Java Client. To create a Docker container that satisfies the requirements and holds a copy of Aerospike notebooks, visit the [Aerospike Notebooks Repo](https://github.com/aerospike-examples/interactive-notebooks)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "hide_input": false 18 | }, 19 | "source": [ 20 | "# Use magics to load Aerospike Client from POM" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "%%loadFromPOM\n", 30 | "\n", 31 | " \n", 32 | " com.aerospike\n", 33 | " aerospike-client\n", 34 | " 5.0.0\n", 35 | " \n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Use client to write a record to Aerospike DB and read it back" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import com.aerospike.client.AerospikeClient;\n", 53 | "import com.aerospike.client.policy.WritePolicy;\n", 54 | "import com.aerospike.client.Bin;\n", 55 | "import com.aerospike.client.Key;\n", 56 | "import com.aerospike.client.Record;\n", 57 | "import com.aerospike.client.Value;\n", 58 | "\n", 59 | "public class Test{\n", 60 | " public static void putRecordGetRecord () {\n", 61 | " AerospikeClient client = new AerospikeClient(\"localhost\", 3000);\n", 62 | "\n", 63 | " Key key = new Key(\"test\", \"demo\", \"putgetkey\");\n", 64 | " Bin bin1 = new Bin(\"bin1\", \"value1\");\n", 65 | " Bin bin2 = new Bin(\"bin2\", \"value2\");\n", 66 | "\n", 67 | " // Write a record\n", 68 | " client.put(null, key, bin1, bin2);\n", 69 | "\n", 70 | " // Read a record\n", 71 | " Record record = client.get(null, key);\n", 72 | " client.close(); \n", 73 | " System.out.println(\"Record values are:\");\n", 74 | " System.out.println(record);\n", 75 | " }\n", 76 | "}\n", 77 | "\n", 78 | "Test.putRecordGetRecord()\n", 79 | "\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## You can also skip the java boilerplate" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "import com.aerospike.client.AerospikeClient;\n", 96 | "import com.aerospike.client.policy.WritePolicy;\n", 97 | "import com.aerospike.client.Bin;\n", 98 | "import com.aerospike.client.Key;\n", 99 | "import com.aerospike.client.Record;\n", 100 | "import com.aerospike.client.Value;\n", 101 | "\n", 102 | "AerospikeClient client = new AerospikeClient(\"localhost\", 3000);\n", 103 | "\n", 104 | "Key key = new Key(\"test\", \"demo\", \"putgetkey\");\n", 105 | "Bin bin1 = new Bin(\"bin1\", \"value1\");\n", 106 | "Bin bin2 = new Bin(\"bin2\", \"value2\");\n", 107 | "\n", 108 | "// Write a record\n", 109 | "client.put(null, key, bin1, bin2);\n", 110 | "\n", 111 | "// Read a record\n", 112 | "Record record = client.get(null, key);\n", 113 | "client.close(); \n", 114 | "System.out.println(\"Record values are:\");\n", 115 | "System.out.println(record);\n" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Java", 122 | "language": "java", 123 | "name": "java" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": "java", 127 | "file_extension": ".jshell", 128 | "mimetype": "text/x-java-source", 129 | "name": "Java", 130 | "pygments_lexer": "java", 131 | "version": "11.0.8+10-LTS" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 2 136 | } 137 | -------------------------------------------------------------------------------- /notebooks/java/add_namespace.sh: -------------------------------------------------------------------------------- 1 
| #!/usr/bin/sh 2 | cp /etc/aerospike/aerospike.conf ~/notebooks/java/aerospike.conf 3 | sed -i '/paxos-single-replica-limit/d' ~/notebooks/java/aerospike.conf 4 | echo "namespace $1 {\n memory-size 1G \n}" >> ~/notebooks/java/aerospike.conf 5 | pkill asd; asd --config-file ~/notebooks/java/aerospike.conf 6 | -------------------------------------------------------------------------------- /notebooks/java/doc_api_example_store.json: -------------------------------------------------------------------------------- 1 | { 2 | "store": { 3 | "book": [ 4 | { 5 | "category": "reference", 6 | "author": "Nigel Rees", 7 | "title": "Sayings of the Century", 8 | "price": 8.95, 9 | "ref": [1,2] 10 | }, 11 | { 12 | "category": "fiction", 13 | "author": "Evelyn Waugh", 14 | "title": "Sword of Honour", 15 | "price": 12.99, 16 | "ref": [2,4,16] 17 | }, 18 | { 19 | "category": "fiction", 20 | "author": "Herman Melville", 21 | "title": "Moby Dick", 22 | "isbn": "0-553-21311-3", 23 | "price": 8.99, 24 | "ref": [1,3,5] 25 | }, 26 | { 27 | "category": "fiction", 28 | "author": "J. R. R. Tolkien", 29 | "title": "The Lord of the Rings", 30 | "isbn": "0-395-19395-8", 31 | "price": 22.99, 32 | "ref": [1,2,7] 33 | } 34 | ], 35 | "bicycle": { 36 | "color": "red", 37 | "price": 19.95 38 | } 39 | }, 40 | "expensive": 10 41 | } -------------------------------------------------------------------------------- /notebooks/java/doc_api_example_tommyleejones.json: -------------------------------------------------------------------------------- 1 | { 2 | "forenames": [ 3 | "Tommy", 4 | "Lee" 5 | ], 6 | "surname": "Jones", 7 | "date_of_birth": { 8 | "day": 15, 9 | "month": 9, 10 | "year": 1946 11 | }, 12 | "selected_filmography":{ 13 | "2012":["Lincoln","Men In Black 3"], 14 | "2007":["No Country For Old Men"], 15 | "2002":["Men in Black 2"], 16 | "1997":["Men in Black","Volcano"], 17 | "1994":["Natural Born Killers","Cobb"], 18 | "1991":["JFK"], 19 | "1980":["Coal Miner's Daughter","Barn Burning"] 20 | }, 21 | "imdb_rank":{ 22 | "source":"https://www.imdb.com/list/ls050274118/", 23 | "rank":51 24 | }, 25 | "best_films_ranked": [ 26 | { 27 | "source": "http://www.rottentomatoes.com", 28 | "films": ["The Fugitive","No Country For Old Men","Men In Black","Coal Miner's Daughter","Lincoln"] 29 | }, 30 | { 31 | "source":"https://medium.com/the-greatest-films-according-to-me/10-greatest-films-of-tommy-lee-jones-97426103e3d6", 32 | "films":["The Three Burials of Melquiades Estrada","The Homesman","No Country for Old Men","In the Valley of Elah","Coal Miner's Daughter"] 33 | } 34 | ] 35 | } -------------------------------------------------------------------------------- /notebooks/java/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "
Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Aerospike Hello World!\n", 18 | "\n", 19 | "Hello World! in Java with Aerospike.\n", 20 | "This notebook requires Aerospike datbase running locally and that Java kernel has been installed. Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) for additional details and the docker container." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Ensure database is running\n", 28 | "This notebook requires that Aerospike datbase is running." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import io.github.spencerpark.ijava.IJava;\n", 38 | "import io.github.spencerpark.jupyter.kernel.magic.common.Shell;\n", 39 | "IJava.getKernelInstance().getMagics().registerMagics(Shell.class);\n", 40 | "%sh asd" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "hide_input": false 47 | }, 48 | "source": [ 49 | "## Download Aerospike client from POM" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "%%loadFromPOM\n", 59 | "\n", 60 | " \n", 61 | " com.aerospike\n", 62 | " aerospike-client\n", 63 | " 5.0.0\n", 64 | " \n", 65 | "" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Import the modules\n", 73 | "\n", 74 | "Import the client library and other modules." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "Client modules imported.\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "import com.aerospike.client.AerospikeClient;\n", 92 | "import com.aerospike.client.policy.WritePolicy;\n", 93 | "import com.aerospike.client.Bin;\n", 94 | "import com.aerospike.client.Key;\n", 95 | "import com.aerospike.client.Record;\n", 96 | "import com.aerospike.client.Value;\n", 97 | "System.out.println(\"Client modules imported.\");" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Initialize the client\n", 105 | "\n", 106 | "Initialize the client and connect to the cluster. The configuration is for Aerospike database running on port 3000 of localhost which is the default. Modify config if your environment is different (Aerospike database running on a different host or different port).\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Initialized the client and connected to the cluster.\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "AerospikeClient client = new AerospikeClient(\"localhost\", 3000);\n", 124 | "System.out.println(\"Initialized the client and connected to the cluster.\");" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Understand records are addressable via a tuple of (namespace, set, userkey) \n", 132 | "\n", 133 | "The three components namespace, set, and userkey (with set being optional) form the Primary Key (PK) or simply key, of the record. The key serves as a handle to the record, and using it, a record can be read or written. 
By default userkey is not stored on server, only a hash (a byte array, the fourth component in the output below) which is the internal representation of the key is stored. For a detailed description of the data model see the [Data Model overview](https://www.aerospike.com/docs/architecture/data-model.html)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Working with record key:\n", 146 | "test:demo:foo:f57ec18335f7100c0458f8a644bcbc766d93471e\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "Key key = new Key(\"test\", \"demo\", \"foo\");\n", 152 | "System.out.println(\"Working with record key:\");\n", 153 | "System.out.println(key);" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## Write a record\n", 161 | "\n", 162 | "Aerospike is schema-less and records may be written without any other setup. Here the bins or fields: name, age and greeting, are being written to a record with the key as defined above. " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "Successfully written the record.\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "Bin bin1 = new Bin(\"name\", \"John Doe\");\n", 180 | "Bin bin2 = new Bin(\"age\", 32);\n", 181 | "Bin bin3 = new Bin(\"greeting\", \"Hello World!\");\n", 182 | "\n", 183 | "// Write a record\n", 184 | "client.put(null, key, bin1, bin2, bin3);\n", 185 | "System.out.println(\"Successfully written the record.\");" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Read a record\n", 193 | "\n", 194 | "The record can be retrieved using the same key." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 7, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Read back the record.\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "// Read the record\n", 212 | "Record record = client.get(null, key);\n", 213 | "System.out.println(\"Read back the record.\");" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Display result\n", 221 | "\n", 222 | "Print the record that was just retrieved. We are printing: \n", 223 | "\n", 224 | "1. The metadata with the record's generation (or version) and expiration time. \n", 225 | "1. The actual value of the record's bins. " 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 8, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stdout", 235 | "output_type": "stream", 236 | "text": [ 237 | "Record values are:\n", 238 | "(gen:3),(exp:351567215),(bins:(name:John Doe),(age:32),(gpa:4.3),(greeting:Hello World!))\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "System.out.println(\"Record values are:\");\n", 244 | "System.out.println(record);" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## Clean up\n", 252 | "Finally close the client connection." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 9, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "Connection closed.\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "client.close(); \n", 270 | "System.out.println(\"Connection closed.\");" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "## All code in Java boilerplate\n", 278 | "All the above code can also be written in the Java boilerplate format and run in a cell." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 10, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "Record values are:\n", 291 | "(gen:1),(exp:351567216),(bins:(bin1:value1),(bin2:value2))\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "import com.aerospike.client.AerospikeClient;\n", 297 | "import com.aerospike.client.policy.WritePolicy;\n", 298 | "import com.aerospike.client.Bin;\n", 299 | "import com.aerospike.client.Key;\n", 300 | "import com.aerospike.client.Record;\n", 301 | "import com.aerospike.client.Value;\n", 302 | "\n", 303 | "public class Test{\n", 304 | " public static void putRecordGetRecord () {\n", 305 | " AerospikeClient client = new AerospikeClient(\"localhost\", 3000);\n", 306 | "\n", 307 | " Key key = new Key(\"test\", \"demo\", \"putgetkey\");\n", 308 | " Bin bin1 = new Bin(\"bin1\", \"value1\");\n", 309 | " Bin bin2 = new Bin(\"bin2\", \"value2\");\n", 310 | "\n", 311 | " // Write a record\n", 312 | " client.put(null, key, bin1, bin2);\n", 313 | "\n", 314 | " // Read a record\n", 315 | " Record record = client.get(null, key);\n", 316 | " client.close(); \n", 317 | " System.out.println(\"Record values are:\");\n", 318 | " System.out.println(record);\n", 319 | " }\n", 320 | "}\n", 321 | "\n", 322 | "Test.putRecordGetRecord()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Next steps\n", 330 | "\n", 331 | "Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then click on File->Open, and select Upload." 
332 | ] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Java", 338 | "language": "java", 339 | "name": "java" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": "java", 343 | "file_extension": ".jshell", 344 | "mimetype": "text/x-java-source", 345 | "name": "Java", 346 | "pygments_lexer": "java", 347 | "version": "11.0.8+10-LTS" 348 | }, 349 | "toc": { 350 | "base_numbering": 1, 351 | "nav_menu": {}, 352 | "number_sections": true, 353 | "sideBar": true, 354 | "skip_h1_title": false, 355 | "title_cell": "Table of Contents", 356 | "title_sidebar": "Contents", 357 | "toc_cell": true, 358 | "toc_position": {}, 359 | "toc_section_display": true, 360 | "toc_window_display": false 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 2 365 | } 366 | -------------------------------------------------------------------------------- /notebooks/java/look_aside_cache_mongo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Look-Aside Cache for MongoDB\n", 8 | "### This is a sample notebook for using Aerospike as a read/look-aside cache\n", 9 | "\n", 10 | "- This notebook demonstrates the use of Aerospike as a cache using Mongo as another primary datastore\n", 11 | "- It is required to run Mongo as a separate container using `docker run --name some-mongo -d mongo:latest`\n", 12 | "\n", 13 | "To test: Run the `cache.getData(\"id\", \"data\");` method once - to fetch from Mongo and populate Aerospike\n", 14 | "\n", 15 | "Another run will fetch the data from Aerospike cache\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### Ensure that Aerospike Database is running" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import io.github.spencerpark.ijava.IJava;\n", 32 | "import io.github.spencerpark.jupyter.kernel.magic.common.Shell;\n", 33 | "IJava.getKernelInstance().getMagics().registerMagics(Shell.class);\n", 34 | "%sh asd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "#### Load Aerospike and Mongo dependencies from POM" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 6, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "%%loadFromPOM\n", 51 | "\n", 52 | " \n", 53 | " com.aerospike\n", 54 | " aerospike-client\n", 55 | " 5.0.0\n", 56 | " \n", 57 | " \n", 58 | " org.mongodb\n", 59 | " mongo-java-driver\n", 60 | " 3.12.7\n", 61 | " \n", 62 | "" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 7, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import com.aerospike.client.AerospikeClient;\n", 72 | "import com.aerospike.client.policy.WritePolicy;\n", 73 | "import com.aerospike.client.Bin;\n", 74 | "import com.aerospike.client.Key;\n", 75 | "import com.aerospike.client.Record;\n", 76 | "import com.aerospike.client.Value;\n", 77 | "\n", 78 | "import com.mongodb.client.MongoDatabase;\n", 79 | "import com.mongodb.client.MongoCollection;\n", 80 | "import com.mongodb.MongoClient; \n", 81 | "import com.mongodb.MongoCredential; \n", 82 | "import org.bson.Document;\n", 83 | "import com.mongodb.client.model.Filters;\n", 84 | "import java.util.Set;" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Configure the clients\n", 92 | 
"\n", 93 | "The configuration is for \n", 94 | " - Aerospike database running on port 3000 of localhost (IP 127.0.0.1) which is the default. \n", 95 | " - Mongo running in a separate container whose IP can be found by `docker inspect | grep -i ipaddress`\n", 96 | "\n", 97 | "\n", 98 | "Modify config if your environment is different (Aerospike database running on a different host or different port)." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "public class Cache{\n", 108 | " //Database Constants\n", 109 | " public static final String AEROSPIKE_HOST = \"0.0.0.0\";\n", 110 | " public static final String MONGO_HOST = \"172.17.0.3\";\n", 111 | " public static final int AEROSPIKE_PORT = 3000;\n", 112 | " public static final int MONGO_PORT = 27017;\n", 113 | " \n", 114 | " public static final String AEROSPIKE_NAMESPACE = \"test\";\n", 115 | " public static final String AEROSPIKE_SET = \"demo\";\n", 116 | " public static final String MONGO_USER = \"sampleUser\";\n", 117 | " public static final String MONGO_PASSWORD = \"password\";\n", 118 | " public static final String MONGO_DB = \"myDb\";\n", 119 | " public static final String MONGO_COLLECTION = \"sampleCollection\";\n", 120 | " \n", 121 | " private AerospikeClient client;\n", 122 | " private MongoClient mongo;\n", 123 | " private MongoCredential credential;\n", 124 | " private MongoDatabase database;\n", 125 | " \n", 126 | " public Cache() {\n", 127 | " client = new AerospikeClient(AEROSPIKE_HOST, AEROSPIKE_PORT);\n", 128 | " mongo = new MongoClient(MONGO_HOST , MONGO_PORT);\n", 129 | " credential = MongoCredential.createCredential(MONGO_USER, MONGO_DB, \n", 130 | " MONGO_PASSWORD.toCharArray());\n", 131 | " database = mongo.getDatabase(MONGO_DB);\n", 132 | " }\n", 133 | " \n", 134 | " private boolean collectionExists(final String collectionName) {\n", 135 | " // Check and return if the collection exists in Mongo\n", 136 | " return database.listCollectionNames()\n", 137 | " .into(new ArrayList()).contains(collectionName);\n", 138 | " }\n", 139 | "\n", 140 | " public void populateMongoData(String id, String data) {\n", 141 | " // Populate Mongodb first\n", 142 | " Document document = new Document(id, data);\n", 143 | " if (! 
collectionExists(MONGO_COLLECTION)) {\n", 144 | " database.createCollection(MONGO_COLLECTION);\n", 145 | " } else {\n", 146 | " MongoCollection collection = database.getCollection(MONGO_COLLECTION);\n", 147 | " collection.insertOne(document);\n", 148 | " }\n", 149 | " Key key = new Key(AEROSPIKE_NAMESPACE, AEROSPIKE_SET, id);\n", 150 | " client.delete(null, key);\n", 151 | " }\n", 152 | " \n", 153 | " public String getData(String id, String data) {\n", 154 | " // This is just an example code that exhibits a cache fetch for a String id with String data\n", 155 | " \n", 156 | " Key key = new Key(AEROSPIKE_NAMESPACE, AEROSPIKE_SET, id);\n", 157 | " String BIN_NAME = \"value\";\n", 158 | " Record record = client.get(null,key);\n", 159 | " if ( record == null ) {\n", 160 | " System.out.println(\"First Fetch Record does not exist in Aerospike cache\");\n", 161 | " MongoCollection collection = database.getCollection(MONGO_COLLECTION);\n", 162 | " Document document = collection.find(Filters.eq(id, data)).first();\n", 163 | " //System.out.println(\"Document \" + document.get(id));\n", 164 | " String json = document.get(id).toString();\n", 165 | " client.put(null, key, new Bin(BIN_NAME,json));\n", 166 | " return client.get(null, key).toString(); \n", 167 | " \n", 168 | " } else {\n", 169 | " System.out.println(\"Data retrieved from Aerospike cache\");\n", 170 | " return record.toString();\n", 171 | " \n", 172 | " }\n", 173 | " }\n", 174 | "}" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "Cache cache = new Cache();\n", 184 | "cache.populateMongoData(\"id\", \"data\");" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "First Fetch Record does not exist in Aerospike cache\n" 197 | ] 198 | }, 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "(gen:1),(exp:350708590),(bins:(value:data))" 203 | ] 204 | }, 205 | "execution_count": 10, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "cache.getData(\"id\", \"data\");\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Java", 225 | "language": "java", 226 | "name": "java" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": "java", 230 | "file_extension": ".jshell", 231 | "mimetype": "text/x-java-source", 232 | "name": "Java", 233 | "pygments_lexer": "java", 234 | "version": "11.0.8+10-LTS" 235 | }, 236 | "toc": { 237 | "base_numbering": 1, 238 | "nav_menu": {}, 239 | "number_sections": true, 240 | "sideBar": true, 241 | "skip_h1_title": false, 242 | "title_cell": "Table of Contents", 243 | "title_sidebar": "Contents", 244 | "toc_cell": false, 245 | "toc_position": {}, 246 | "toc_section_display": true, 247 | "toc_window_display": false 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 4 252 | } 253 | -------------------------------------------------------------------------------- /notebooks/presto/AerospikePrestoDemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Aerospike Connect for Presto Tutorial for Python\n", 8 | 
"## Tested with Python 3.7, Java 11, Presto 343, Presto Connector (Beta), and PyHive 0.6.3" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from pyhive import presto\n", 18 | "presto_conn = presto.connect(\n", 19 | " host='localhost',\n", 20 | " port=8080,\n", 21 | " catalog='aerospike',\n", 22 | " schema='test'\n", 23 | ")\n", 24 | "presto_cursor=presto_conn.cursor()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "presto_cursor.execute('SELECT * FROM test.write_set LIMIT 3')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "(234, 'Individual: 234', 33.568431516802406, 66363)\n", 46 | "(13, 'Individual: 013', 25.752921531369164, 48610)\n", 47 | "(79, 'Individual: 079', 25.16109674428971, 60357)\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "records = presto_cursor.fetchall()\n", 53 | "for row in records:\n", 54 | " print(row)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "presto_cursor.execute('select name from test.write_set where age>40 and age<45')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "records1 = presto_cursor.fetchall()\n", 73 | "for row in records1:\n", 74 | " print(row)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Refer to https://www.aerospike.com/docs/connect/access/presto/examples.html for more examples." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.7.5" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /notebooks/python/README.md: -------------------------------------------------------------------------------- 1 | This area is for Python Jupyter notebooks in .ipynb format. 2 | 3 | Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then in the notebook interface click on File->Open, and select Upload. 4 | -------------------------------------------------------------------------------- /notebooks/python/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "
Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Aerospike Hello World!\n", 18 | "\n", 19 | "Hello, World! in Python with Aerospike.\n", 20 | "
\n", 21 | "This notebook requires Aerospike datbase running on localhost and that python and the Aerospike python client have been installed (`pip install aerospike`). Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) for additional details and the docker container." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Ensure database is running\n", 29 | "This notebook requires that Aerospike datbase is running." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Aerospike database is running!\r\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "!asd >& /dev/null\n", 47 | "!pgrep -x asd >/dev/null && echo \"Aerospike database is running!\" || echo \"**Aerospike database is not running!**\"" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Import the module\n", 55 | "\n", 56 | "Import the client library." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Client module imported\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "import aerospike\n", 74 | "print(\"Client module imported\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Configure the client\n", 82 | "\n", 83 | "The configuration is for Aerospike database running on port 3000 of localhost (IP 127.0.0.1) which is the default. Modify config if your environment is different (Aerospike database running on a different host or different port)." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "Configuring with seed host: [('127.0.0.1', 3000)]\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "config = {\n", 101 | " 'hosts': [ ('127.0.0.1', 3000) ]\n", 102 | "}\n", 103 | "print(\"Configuring with seed host:\", config['hosts'])" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Create client object and connect to the cluster" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Connected to the cluster\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "try:\n", 128 | " client = aerospike.client(config).connect()\n", 129 | "except:\n", 130 | " import sys\n", 131 | " print(\"Failed to connect to the cluster with\", config['hosts'])\n", 132 | " sys.exit(1)\n", 133 | "print(\"Connected to the cluster\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Understand records are addressable via a tuple of (namespace, set, userkey) \n", 141 | "\n", 142 | "The three components namespace, set, and userkey (with set being optional) form the Primary Key (PK) or simply key, of the record. The key serves as a handle to the record, and using it, a record can be read or written. 
For a detailed description of the data model see the [Data Model overview](https://www.aerospike.com/docs/architecture/data-model.html)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Working with record key ('test', 'demo', 'foo')\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "key = ('test', 'demo', 'foo')\n", 160 | "print('Working with record key ', key)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Write a record\n", 168 | "\n", 169 | "Aerospike is schema-less and records may be written without any other setup. Here the bins or fields: name, age and greeting, are being written to a record with the key as defined above. " 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "Successfully written the record\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "try:\n", 187 | " # Write a record\n", 188 | " client.put(key, {\n", 189 | " 'name': 'John Doe',\n", 190 | " 'age': 32,\n", 191 | " 'greeting': 'Hello, World!'\n", 192 | " })\n", 193 | "except Exception as e:\n", 194 | " import sys\n", 195 | " print(\"error: {0}\".format(e), file=sys.stderr)\n", 196 | " sys.exit(1)\n", 197 | "print('Successfully written the record')" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Read a record\n", 205 | "\n", 206 | "The record may be retrieved using the same key." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "Read back the record\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "(key, metadata, record) = client.get(key)\n", 224 | "print('Read back the record')" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Display result\n", 232 | "\n", 233 | "Print the record that was just retrieved. We are also printing: \n", 234 | "\n", 235 | "1. The components of the key which are: namespace, set, and userkey. By default userkey is not stored on server, only a hash (appearing as bytearray in the output below) which is the internal representation of the key is stored.\n", 236 | "1. The metadata with the time-to-live and the record's generation or version. \n", 237 | "1. The actual value of the record's bins. " 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 8, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "Record contents are {'name': 'John Doe', 'age': 32, 'gpa': 4.3, 'greeting': 'Hello, World!'}\n", 250 | "Key's components are ('test', 'demo', None, bytearray(b'\\xf5~\\xc1\\x835\\xf7\\x10\\x0c\\x04X\\xf8\\xa6D\\xbc\\xbcvm\\x93G\\x1e'))\n", 251 | "Metadata is {'ttl': 2592000, 'gen': 2}\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "print(\"Record contents are\", record)\n", 257 | "print(\"Key's components are\", key)\n", 258 | "print(\"Metadata is\", metadata)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Clean up\n", 266 | "Finally close the client we created at the beginning." 
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 9, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "Connection closed.\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "# Close the connection to the Aerospike cluster\n", 284 | "client.close()\n", 285 | "print('Connection closed.')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "## Next steps\n", 293 | "\n", 294 | "Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then click on File->Open, and select Upload.\n" 295 | ] 296 | } 297 | ], 298 | "metadata": { 299 | "file_extension": ".py", 300 | "kernelspec": { 301 | "display_name": "Python 3", 302 | "language": "python", 303 | "name": "python3" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.8.6" 316 | }, 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "npconvert_exporter": "python", 320 | "pygments_lexer": "ipython3", 321 | "toc": { 322 | "base_numbering": 1, 323 | "nav_menu": {}, 324 | "number_sections": true, 325 | "sideBar": true, 326 | "skip_h1_title": false, 327 | "title_cell": "Table of Contents", 328 | "title_sidebar": "Contents", 329 | "toc_cell": true, 330 | "toc_position": {}, 331 | "toc_section_display": true, 332 | "toc_window_display": false 333 | }, 334 | "version": 3 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 2 338 | } 339 | -------------------------------------------------------------------------------- /notebooks/python/look_aside_cache.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Look-Aside Cache for MongoDB\n", 8 | "### This is a sample notebook for using Aerospike as a read/look-aside cache\n", 9 | "\n", 10 | "- This notebook demonstrates the use of Aerospike as a cache using Mongo as another primary datastore\n", 11 | "- It is required to run Mongo as a separte container using `docker run --name some-mongo -d mongo:latest`\n", 12 | "\n", 13 | "To test: Run the `get_data(key, value)` method once - to fetch from Mongo and populate Aerospike\n", 14 | "\n", 15 | "Another run will fetch the data from Aerospike cache\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### Ensure that the Aerospike Database is running" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Aerospike database is running!\r\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "!asd >& /dev/null\n", 42 | "!pgrep -x asd >/dev/null && echo \"Aerospike database is running!\" || echo \"**Aerospike database is not running!**\"" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "#### Import all dependencies" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "metadata": {}, 56 | 
"outputs": [], 57 | "source": [ 58 | "import aerospike\n", 59 | "import pymongo\n", 60 | "from pymongo import MongoClient\n", 61 | "import sys" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Configure the clients\n", 69 | "\n", 70 | "The configuration is for \n", 71 | " - Aerospike database running on port 3000 of localhost (IP 127.0.0.1) which is the default. \n", 72 | " - Mongo running in a separate container whose IP can be found by `docker inspect | grep -i ipaddress`\n", 73 | "\n", 74 | "\n", 75 | "Modify config if your environment is different (Aerospike database running on a different host or different port)." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 17, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Define a few constants\n", 85 | "\n", 86 | "AEROSPIKE_HOST = \"0.0.0.0\"\n", 87 | "AEROSPIKE_PORT = 3000\n", 88 | "AEROSPIKE_NAMESPACE = \"test\"\n", 89 | "AEROSPIKE_SET = \"demo\"\n", 90 | "MONGO_HOST = \"172.17.0.3\"\n", 91 | "MONGO_PORT = 27017\n", 92 | "MONGO_DB = \"test-database\"\n", 93 | "MONGO_COLLECTION = \"test-collection\"" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 18, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "Connected to Aerospike\n", 106 | "Connected to Mongo\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "#Aerospike configuration\n", 112 | "aero_config = {\n", 113 | " 'hosts': [ (AEROSPIKE_HOST, AEROSPIKE_PORT) ]\n", 114 | "}\n", 115 | "try:\n", 116 | " aero_client = aerospike.client(aero_config).connect()\n", 117 | "except:\n", 118 | " print(\"Failed to connect to the cluster with\", aero_config['hosts'])\n", 119 | " sys.exit(1)\n", 120 | "print(\"Connected to Aerospike\")\n", 121 | "\n", 122 | "#Mongo configuration\n", 123 | "try:\n", 124 | " mongo_client = MongoClient(MONGO_HOST, MONGO_PORT)\n", 125 | " print(\"Connected to Mongo\")\n", 126 | "except:\n", 127 | " print(\"Failed to connect to Mongo\")\n", 128 | " sys.exit(1)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "#### Store data in Mongo and clear the keys in Aerospike if any" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 20, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "db = mongo_client[MONGO_DB]\n", 145 | "collection = db[MONGO_COLLECTION]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 21, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def store_data(data_id, data):\n", 155 | " m_data = {data_id: data}\n", 156 | " collection.drop()\n", 157 | " aero_key = ('test', 'demo', data_id)\n", 158 | " #aero_client.remove(aero_key)\n", 159 | " post_id = collection.insert_one(m_data)\n", 160 | "store_data(\"key\", \"value\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "#### Fetch the data. 
In this instance we are using a simple key value pair.\n", 168 | "If the data exists in the cache it is returned, if not data is read from Mongo, put in the cache and then returned" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 23, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Data retrieved from Aerospike cache\n", 181 | "Record::: key value\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "def get_data(data_id, data):\n", 187 | " aero_key = (AEROSPIKE_NAMESPACE, AEROSPIKE_SET, data_id)\n", 188 | " #aero_client.remove(aero_key)\n", 189 | " data_check = aero_client.exists(aero_key)\n", 190 | " if data_check[1]:\n", 191 | " (key, metadata, record) = aero_client.get(aero_key)\n", 192 | " print(\"Data retrieved from Aerospike cache\")\n", 193 | " print(\"Record::: {} {}\".format(data_id, record['value']))\n", 194 | " else:\n", 195 | " mongo_data = collection.find_one({data_id: data})\n", 196 | " print(\"Data not present in Aerospike cache, retrieved from mongo {}\".format(mongo_data))\n", 197 | " aero_client.put(aero_key, {'value': mongo_data[data_id]})\n", 198 | "get_data(\"key\", \"value\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.8.6" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 4 230 | } 231 | -------------------------------------------------------------------------------- /notebooks/python/query.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "
Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Aerospike Queries in Python\n", 18 | "Intoduction to Aerospike queries in Python.\n", 19 | "
\n", 20 | "This notebook requires Aerospike datbase running on localhost and that python and the Aerospike python client have been installed (`pip install aerospike`). Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) for additional details and the docker container." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Ensure database is running\n", 28 | "This notebook requires that Aerospike datbase is running." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Aerospike database is running!\r\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "!asd >& /dev/null\n", 46 | "!pgrep -x asd >/dev/null && echo \"Aerospike database is running!\" || echo \"**Aerospike database is not running!**\"" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Connect to database and populate test data\n", 54 | "The test data has ten records with user-key \"id1-10\", two bins (fields) \"name\" and \"age\", in the namespace \"test\" and set \"demo\". " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Test data populated.\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "# import the module\n", 72 | "from __future__ import print_function\n", 73 | "import aerospike\n", 74 | "\n", 75 | "# Configure the client\n", 76 | "config = {\n", 77 | " 'hosts': [ ('127.0.0.1', 3000) ],\n", 78 | " 'policy' : {'key': aerospike.POLICY_KEY_SEND}\n", 79 | "}\n", 80 | "\n", 81 | "# Create a client and connect it to the cluster\n", 82 | "try:\n", 83 | " client = aerospike.client(config).connect()\n", 84 | "except:\n", 85 | " import sys\n", 86 | " print(\"failed to connect to the cluster with\", config['hosts'])\n", 87 | " sys.exit(1)\n", 88 | "\n", 89 | "# Records are addressable via a tuple of (namespace, set, key)\n", 90 | "people = [ {'id':1, 'name':'John Doe', 'age': 53},\n", 91 | " {'id':2, 'name':'Brian Yu', 'age': 21},\n", 92 | " {'id':3, 'name':'Will Kim', 'age': 34},\n", 93 | " {'id':4, 'name':'Dorothy Smith', 'age': 48},\n", 94 | " {'id':5, 'name':'Sara Poe', 'age': 29},\n", 95 | " {'id':6, 'name':'Kim Knott', 'age': 56},\n", 96 | " {'id':7, 'name':'Joe Miller', 'age': 30},\n", 97 | " {'id':8, 'name':'Jeff Nye', 'age': 32},\n", 98 | " {'id':9, 'name':'Jane Doe', 'age': 44},\n", 99 | " {'id':10, 'name':'Emily Tuck', 'age': 22} ]\n", 100 | "try:\n", 101 | " for i in range(10):\n", 102 | " # Write the records\n", 103 | " client.put(('test', 'demo', 'id'+str(people[i]['id'])), people[i])\n", 104 | "except Exception as e:\n", 105 | " import sys\n", 106 | " print(\"error: {0}\".format(e), file=sys.stderr)\n", 107 | "\n", 108 | "print('Test data populated.')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Create secondary index\n", 116 | "To use the query API, a secondary index must exist on the query field. We will create an integer secondary index on the \"age\" bin." 
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 3, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Secondary index created.\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "# Must create an index to query on a bin\n", 134 | "from aerospike import exception as ex\n", 135 | "try:\n", 136 | " client.index_integer_create(\"test\", \"demo\", \"age\", \"test_demo_number_idx\")\n", 137 | "except ex.IndexFoundError:\n", 138 | " pass\n", 139 | "\n", 140 | "print('Secondary index created.')" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Querying with secondary indexes\n", 148 | "\n", 149 | "In addition to retrieving records with the primary index using the key-value store APIs, the Aerospike Python client provides an API to query records using secondary indexes. To use the query API, a secondary index must exist on the query field.\n", 150 | "\n", 151 | "Use the Query APIs to query the database using secondary indexes." 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Create a query\n", 159 | "The API client.query() takes the namespace (required) and set (optional) arguments. The parameter set can be omitted or None, in which case records in the namespace that are outside any set are returned. The return value is a new aerospike.Query class instance.\n", 160 | "\n", 161 | "This example creates a query on the test namespace, demo set." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Query object created.\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "query = client.query('test', 'demo')\n", 179 | "print('Query object created.')" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Project bins\n", 187 | "Project (or select) bins using select() on the Query class instance. The select() API accepts one or many bin names (strings).\n", 188 | "\n", 189 | "This example selects \"name\" and \"age\" bins from the specified records." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "Bins name and age selected.\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "query.select('name', 'age')\n", 207 | "print('Bins name and age selected.')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Add query predicate\n", 215 | "Define predicates using the where() API on the Query class instance. The where() API accepts a predicate created using one of the functions in aerospike.predicates including:\n", 216 | "\n", 217 | "- equals(bin, value) — Find records containing the bin with the specified value (integer or string).\n", 218 | "- between(bin, min, max) — Find records containing the bin with a value in the min and max range (integer only).\n", 219 | "\n", 220 | "This example adds the between() predicate to a query." 
221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 6, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Predicate defined.\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "from aerospike import predicates as p\n", 238 | "query.where( p.between('age', 14, 25) )\n", 239 | "print('Predicate defined.')" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "## Define foreach function\n", 247 | "In order to executer the query and read the results, we need to use the foreach() API in the Query class instance. The foreach() API accepts a callback function for each record read from the query. The callback function must accept a single argument as a tuple:\n", 248 | "\n", 249 | "- key tuple — The tuple to identify the record.\n", 250 | "- metadata — The dict containing the record metadata (TTL and generation).\n", 251 | "- record — The dict containing the record bins.\n", 252 | "\n", 253 | "If the callback returns False, the client stops reading results.\n", 254 | "\n", 255 | "This examples executes the query and prints results as they are read.\n", 256 | "\n", 257 | "To print the records as they are read, we define a print_result function." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 7, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "Foreach function defined.\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "def print_result(result_tuple):\n", 275 | " print(result_tuple)\n", 276 | " \n", 277 | "print('Foreach function defined.')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## Execute query and foreach\n", 285 | "Now we are ready to execute the query by passing in the print_result that will be called for each record. Based on the data we populated earlier, we expect 2 results between ages 14 and 25." 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 8, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "Executing query and printing results:\n", 298 | "(('test', 'demo', None, bytearray(b'\\xb2\\x13X\\x1dI\\xd8\\xba`\\xab\\x96\\xa2\\xf0\\xd9\\x8b\\x19\\xf9DZug')), {'ttl': 2591998, 'gen': 1}, {'name': 'Brian Yu', 'age': 21})\n", 299 | "(('test', 'demo', None, bytearray(b'\\x0bR\\xbc\\xa1\\x02`SF?\\x01\\xe7\\xd3`\\x8d[F\\xcb\\xd71V')), {'ttl': 2591998, 'gen': 1}, {'name': 'Emily Tuck', 'age': 22})\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "print(\"Executing query and printing results:\")\n", 305 | "query.foreach(print_result)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Explore other query capabilities\n", 313 | "Please feel free to play with the \"equals\" predicate, adding secondary indexes on other fields, populating more test data to the \"null\" set and querying those records, and so on." 
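A possible starting point for those experiments — a hedged sketch rather than part of the original notebook: it reuses the client and print_result defined above, the string index name test_demo_name_idx is invented here, and an equals() match on the 'name' bin is just one of the suggestions:

```python
from aerospike import predicates as p
from aerospike import exception as ex

# A string secondary index is needed to query on the 'name' bin
# (the index name is arbitrary / assumed).
try:
    client.index_string_create('test', 'demo', 'name', 'test_demo_name_idx')
except ex.IndexFoundError:
    pass

# Exact-match query with the equals() predicate; 'John Doe' is in the test data.
name_query = client.query('test', 'demo')
name_query.select('name', 'age')
name_query.where(p.equals('name', 'John Doe'))
name_query.foreach(print_result)
```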
314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "## Clean up" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "Connection closed.\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "# Close the connection to the Aerospike cluster\n", 338 | "client.close()\n", 339 | "print('Connection closed.')" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Next steps\n", 347 | "\n", 348 | "Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then click on File->Open, and select Upload." 349 | ] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.8.6" 369 | }, 370 | "toc": { 371 | "base_numbering": 1, 372 | "nav_menu": {}, 373 | "number_sections": true, 374 | "sideBar": true, 375 | "skip_h1_title": false, 376 | "title_cell": "Table of Contents", 377 | "title_sidebar": "Contents", 378 | "toc_cell": true, 379 | "toc_position": {}, 380 | "toc_section_display": true, 381 | "toc_window_display": false 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 4 386 | } 387 | -------------------------------------------------------------------------------- /notebooks/python/simple_put_get_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A Simple Put-Get Example\n", 8 | "\n", 9 | "A simple example of `put` and `get` calls in Aerospike.\n", 10 | "\n", 11 | "This notebook requires Aerospike datbase running locally and that python and the Aerospike python client have been installed (`pip install aerospike`). Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) for additional details and the docker container." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Import the module\n", 19 | "\n", 20 | "The Aerospike client must be imported." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import aerospike" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Configure the client\n", 37 | "\n", 38 | "This configuration is for aerospike running on port 3000 of localhost which is the default. 
If your environment is different (Aerospike server running on a different host or different port, etc.), update the hosts entry in the configuration below accordingly." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "config = {\n", 48 | " 'hosts': [ ('127.0.0.1', 3000) ]\n", 49 | "}" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Create a client and connect it to the cluster" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "try:\n", 66 | " client = aerospike.client(config).connect()\n", 67 | "except:\n", 68 | " import sys\n", 69 | " print(\"failed to connect to the cluster with\", config['hosts'])\n", 70 | " sys.exit(1)\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Records are addressable via a tuple of (namespace, set, key) \n", 78 | "\n", 79 | "These three components (with set being optional) form the key. Using this key, records may be read or written. For a detailed description of the data model see the [Data Model overview](https://www.aerospike.com/docs/architecture/data-model.html)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "key = ('test', 'demo', 'foo')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Writing a record\n", 96 | "\n", 97 | "Aerospike is schema-less and records may be written without any other setup. Here a record with two bins (name and age) is written using the key defined above. " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "\n", 107 | "try:\n", 108 | " # Write a record\n", 109 | " client.put(key, {\n", 110 | " 'name': 'John Doe',\n", 111 | " 'age': 32\n", 112 | " })\n", 113 | "except Exception as e:\n", 114 | " import sys\n", 115 | " print(\"error: {0}\".format(e), file=sys.stderr)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## Reading a record\n", 123 | "\n", 124 | "This same record may be retrieved using the same key." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "(key, metadata, record) = client.get(key)\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Display result\n", 141 | "\n", 142 | "Print the record that was just retrieved. We are also printing: \n", 143 | "\n", 144 | "1. The components of the key, which are: the namespace, the set, a user key (by default there is no user key; the sketch below shows how to store and return it), and a hash, which is the internal representation of the key.\n", 145 | "1. The metadata with the time to live and the record's generation. \n", 146 | "1. The actual value of the record with two bins. \n", 147 | "\n", 148 | "Lastly, it is important to clean up the client we created at the beginning."
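As noted above, the user key is not returned by default because only its hash (digest) is stored with the record. Below is a minimal sketch of sending the key along with the write so that a subsequent get() returns it; the POLICY_KEY_SEND setting is the same key policy used in the transactions notebook later in this repository, and the bin values simply repeat the example above.

    import sys
    import aerospike

    try:
        # Store the user key ('foo') with the record by using the key policy POLICY_KEY_SEND
        client.put(key, {'name': 'John Doe', 'age': 32},
                   policy={'key': aerospike.POLICY_KEY_SEND})
        (key_out, metadata, record) = client.get(key)
        # The key tuple now contains the user key 'foo' alongside namespace, set, and digest
        print("key components are", key_out)
    except Exception as e:
        print("error: {0}".format(e), file=sys.stderr)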
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "print(\"record contents are\", record)\n", 158 | "print(\"key components are\", key)\n", 159 | "print(\"metadata is\", metadata)\n", 160 | "# Close the connection to the Aerospike cluster\n", 161 | "client.close()" 162 | ] 163 | } 164 | ], 165 | "metadata": { 166 | "file_extension": ".py", 167 | "kernelspec": { 168 | "display_name": "Python 3.7.9 64-bit", 169 | "language": "python", 170 | "name": "python37964bit728b6d8a91f74e8c9f0db525f58accf3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.7.9" 183 | }, 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "npconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": 3 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /notebooks/python/transactions_rmw_pattern.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents
\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Implementing Read-Write Transactions with R-M-W Pattern \n", 18 | "This tutorial explains how to use the Read-Modify-Write pattern in order to ensure atomicity and isolation for read-write single-record transactions. \n", 19 | "\n", 20 | "This notebook requires an Aerospike database running on localhost and that Python and the Aerospike Python client have been installed (`pip install aerospike`). Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) for additional details and the docker container." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Introduction\n", 28 | "In Aerospike, the transactional boundaries are \"single request, single record\". While multiple operations may be specified in a single request on a single record, each such operation can involve a single bin and only certain write operations are allowed. Therefore, neither updates involving multiple bins (e.g., \"a=a+b\") nor general logic (e.g., \"concatenate alternate letters and append\") are possible as server-side operations. UDFs do allow complex logic in a transactional update of a single record; however, they are not suitable for all situations, for reasons such as performance and ease of development. Therefore most updates entail the R-M-W pattern: Reading the record, Modifying bins on the client side, and then Writing the record updates back to the server. \n", 29 | "\n", 30 | "The tutorial first demonstrates how read-write operations can result in lost writes in a concurrent multi-client environment. \n", 31 | "\n", 32 | "Then we show how to specify conditional writes with a version check to address the problem by disallowing interleaved read-writes and thus protecting against lost writes." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Prerequisites\n", 40 | "This tutorial assumes familiarity with the following topics:\n", 41 | "\n", 43 | "- [Hello World](hello_world.ipynb)\n", 44 | "- [Aerospike Basic Operations](basic_operations.ipynb)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Initialization" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Ensure database is running\n", 59 | "This notebook requires that the Aerospike database is running." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 1, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-12-29T20:48:49.695739Z", 69 | "start_time": "2020-12-29T20:48:49.447020Z" 70 | } 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Aerospike database is running!\r\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "!asd >& /dev/null\n", 83 | "!pgrep -x asd >/dev/null && echo \"Aerospike database is running!\" || echo \"**Aerospike database is not running!**\"" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Connect to database."
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "metadata": { 97 | "ExecuteTime": { 98 | "end_time": "2020-12-29T20:48:51.190060Z", 99 | "start_time": "2020-12-29T20:48:51.110597Z" 100 | } 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Client successfully connected to the database.\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# import the modules\n", 113 | "import sys\n", 114 | "import aerospike\n", 115 | "\n", 116 | "# Configure the client\n", 117 | "config = {\n", 118 | " 'hosts': [ ('127.0.0.1', 3000) ],\n", 119 | " 'policy' : {'key': aerospike.POLICY_KEY_SEND}\n", 120 | "}\n", 121 | "\n", 122 | "# Create a client and connect it to the cluster\n", 123 | "try:\n", 124 | " client = aerospike.client(config).connect()\n", 125 | "except:\n", 126 | " print(\"failed to connect to the cluster with\", config['hosts'])\n", 127 | " sys.exit(1)\n", 128 | "print('Client successfully connected to the database.')" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Populate database with test data.\n", 136 | "We create one record with an integer bin \"gen-times-2\" (the names will become clear below), initialized to 1." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "ExecuteTime": { 144 | "end_time": "2020-12-29T20:48:52.195181Z", 145 | "start_time": "2020-12-29T20:48:52.189787Z" 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Test data populated.\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "namespace = 'test'\n", 159 | "tutorial_set = 'rmw-tutorial-set'\n", 160 | "user_key = 'id-1'\n", 161 | "# Records are addressable via a tuple of (namespace, set, user_key)\n", 162 | "rec_key = (namespace, tutorial_set, user_key)\n", 163 | "rmw_bin = 'gen-times-2'\n", 164 | "try:\n", 165 | " # Create the record\n", 166 | " client.put(rec_key, {rmw_bin: 1})\n", 167 | "except Exception as e:\n", 168 | " print(\"error: {0}\".format(e), file=sys.stderr)\n", 169 | "\n", 170 | "print('Test data populated.')" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# The Problem of Lost Writes\n", 178 | "In a concurrent setting, multiple clients may be performaing Read-Modify-Write on the same record in a way that get in each other's way. Since various R-M-W transactions can interleave, a transaction can be lost, if another client updates the record without reading the transaction's update.\n", 179 | "\n", 180 | "To demonstrate this, we make use of a record's \"generation\" or version, that is available as the record metadata, and is automatically incremented on each successful update of the record.\n", 181 | "\n", 182 | "The integer bin “gen-times-2” holds the value that is 2 times the value of the current generation of the record. A client first reads the current generation of the record, and then updates the bin value 2 times that value.\n", 183 | "\n", 184 | "In the case of a single client, there are no issues in maintaining the semantics of the bin. However when there are multiple clients, the interleaving of reads and writes of different transactions can violate the semantics. 
By updating the bin using an older generation value, it may not be 2 times the current generation, which is the constraint that we want to preserve.\n", 185 | "\n", 186 | "First, we will show how transaction writes are lost in a simple concurrent case by observing whether the relationship between record's current generation and the bin value is maintained. Then we will show how the problem is solved using a conditional write with version check.\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Test Framework\n", 194 | "We spawn multiple (num_threads) threads to simulate concurrent access. Each thread repeatedly (num_txns) does the following:\n", 195 | "- waits for a random duration (with average of txn_wait_ms) \n", 196 | "- executes a passed-in R-M-W function that returns the failure type (string, null if success).\n", 197 | "\n", 198 | "At the end the thread prints out the aggregate counts for each error type. In aggregate, they signify the likelihood of a read-write transaction failing." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "import threading\n", 208 | "import time\n", 209 | "import random\n", 210 | "\n", 211 | "num_txns = 10\n", 212 | "txn_wait_ms = 500\n", 213 | "\n", 214 | "def thread_fn(thread_id, rmw_fn):\n", 215 | " random.seed(thread_id)\n", 216 | " lost_writes_count = 0\n", 217 | " failures = {}\n", 218 | " for i in range(num_txns):\n", 219 | " failure = rmw_fn()\n", 220 | " if failure:\n", 221 | " if not failure in failures:\n", 222 | " failures[failure] = 1\n", 223 | " else: \n", 224 | " failures[failure] += 1 \n", 225 | " print('\\tThead {0} failures: {1}'.format(thread_id, failures))\n", 226 | " return\n", 227 | " \n", 228 | " \n", 229 | "def run_test(num_threads, rmw_fn):\n", 230 | " threads = list()\n", 231 | " print('{0} threads, {1} transcations per thread:'.format(num_threads, num_txns))\n", 232 | " for thread_index in range(num_threads):\n", 233 | " thread = threading.Thread(target=thread_fn, args=(thread_index, rmw_fn))\n", 234 | " threads.append(thread)\n", 235 | " thread.start()\n", 236 | " for thread in threads:\n", 237 | " thread.join()\n", 238 | " return" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "## Simple RMW Function\n", 246 | "Next we implement a simple RMW function simple_rmw_fn to pass into the above framework. The function: \n", 247 | "- Reads the record.\n", 248 | "- Computes new value of gen_times_2 (= 2 * read generation). Then waits for a random duration, with average of write_wait_ms average to simulate the application computation time between read and write.\n", 249 | "- Writes the new bin value. In the same (multi-op) request, reads back the record for the record's new generation value.\n", 250 | "- Returns \"lost writes\" if the updated value of gen_times_2/2 is smaller than the new gen. If they are the same, it returns None." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "import aerospike_helpers.operations.operations as op_helpers\n", 260 | "\n", 261 | "write_wait_ms = 50\n", 262 | "\n", 263 | "def rmw_simple():\n", 264 | " #read\n", 265 | " _, meta, bins = client.get(rec_key)\n", 266 | " # wait before write to simulate computation time\n", 267 | " time.sleep(random.uniform(0,2*write_wait_ms/1000.0))\n", 268 | " # modify \n", 269 | " read_gen = meta['gen']\n", 270 | " new_rmw_bin_value = 2*(read_gen+1)\n", 271 | " # write and read back bin_inc to compare\n", 272 | " ops = [op_helpers.write(rmw_bin, new_rmw_bin_value),\n", 273 | " op_helpers.read(rmw_bin)]\n", 274 | " try:\n", 275 | " _, meta, bins = client.operate(rec_key, ops)\n", 276 | " except Exception as e:\n", 277 | " print(\"error: {0}\".format(e), file=sys.stderr)\n", 278 | " exit(-1)\n", 279 | " # compare new_rmw_bin_value//2 and new gen; if different return 'lost writes'\n", 280 | " new_gen = meta['gen']\n", 281 | " if new_rmw_bin_value//2 != new_gen: \n", 282 | " #print('gen: {0}, bin: {1}, lost: {2}'.format(new_gen, new_rmw_bin_value//2, new_gen-new_rmw_bin_value//2))\n", 283 | " return 'lost writes'\n", 284 | " return None" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## Test Results\n", 292 | "For various values of concurrency (num_threads), we can see that with greater concurrent updates, a larger percentage of read-write transactions are lost, meaning greater likelihood of the semantics of the gen_times_2 bin not being preserved." 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 11, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "1 threads, 10 transcations per thread:\n", 305 | "\tThead 0 failures: {}\n", 306 | "2 threads, 10 transcations per thread:\n", 307 | "\tThead 0 failures: {'lost writes': 5}\n", 308 | "\tThead 1 failures: {'lost writes': 6}\n", 309 | "3 threads, 10 transcations per thread:\n", 310 | "\tThead 0 failures: {'lost writes': 4}\n", 311 | "\tThead 1 failures: {'lost writes': 8}\n", 312 | "\tThead 2 failures: {'lost writes': 7}\n", 313 | "4 threads, 10 transcations per thread:\n", 314 | "\tThead 0 failures: {'lost writes': 9}\n", 315 | "\tThead 3 failures: {'lost writes': 8}\n", 316 | "\tThead 1 failures: {'lost writes': 8}\n", 317 | "\tThead 2 failures: {'lost writes': 8}\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "run_test(num_threads=1, rmw_fn=rmw_simple)\n", 323 | "run_test(num_threads=2, rmw_fn=rmw_simple)\n", 324 | "run_test(num_threads=3, rmw_fn=rmw_simple)\n", 325 | "run_test(num_threads=4, rmw_fn=rmw_simple)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "# Using Generation Check\n", 333 | "To solve the problem of lost writes, the simple R-M-W is modified with how the Write is done: by making it conditional on the record not having been modified since the Read. It is a \"check-and-set (CAS)\" like operation that succeeds if the record generation (version) is still the same as at the time of Read. Otherwise it fails, and the client must retry the whole R-M-W pattern. The syntax and usage is shown in the code below." 
334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## RMW Function with Version Check and Retries\n", 341 | "In the rmw_with_gen_check function below, a failed read-write due to a generation mismatch is retried for up to max_retries attempts or until the write is successful. Each retry is attempted after an exponential backoff wait of (2 ** retry_number) * retry_wait_ms, that is, 40 ms, 80 ms, and 160 ms with retry_wait_ms = 20.\n", 342 | "\n", 343 | "A write can still fail after max_retries attempts, and the client can suitably handle it. However, no writes are overwritten or lost, and the intended semantics of the gen-times-2 bin are always preserved.\n", 344 | "\n", 345 | "We perform the same concurrent test with the version check at Write. We expect no lost writes to be reported in any thread." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 12, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "from aerospike_helpers.operations import operations as op_helpers\n", 355 | "from aerospike import exception as ex\n", 356 | "\n", 357 | "max_retries = 3\n", 358 | "retry_wait_ms = 20\n", 359 | "\n", 360 | "def rmw_with_gen_check():\n", 361 | " retryRMWCount = 0\n", 362 | " done = False\n", 363 | " while (not done):\n", 364 | " #read\n", 365 | " _, meta, bins = client.get(rec_key)\n", 366 | " # wait before write to simulate computation time\n", 367 | " time.sleep(random.uniform(0,2*write_wait_ms/1000.0))\n", 368 | " # modify \n", 369 | " read_gen = meta['gen']\n", 370 | " new_rmw_bin_value = 2*(read_gen+1)\n", 371 | " # write and read back rmw_bin to compare\n", 372 | " ops = [op_helpers.write(rmw_bin, new_rmw_bin_value),\n", 373 | " op_helpers.read(rmw_bin)]\n", 374 | " write_policy = { 'gen': aerospike.POLICY_GEN_EQ }\n", 375 | " try:\n", 376 | " _, meta, bins = client.operate(rec_key, ops, meta={'gen': read_gen}, policy=write_policy)\n", 377 | " except ex.RecordGenerationError as e:\n", 378 | " if retryRMWCount < max_retries:\n", 379 | " retryRMWCount += 1\n", 380 | " time.sleep((2**retryRMWCount)*retry_wait_ms/1000.0) \n", 381 | " else:\n", 382 | " return 'max retries exceeded' \n", 383 | " except Exception as e:\n", 384 | " print(\"error: {0}\".format(e), file=sys.stderr)\n", 385 | " exit(-1)\n", 386 | " else:\n", 387 | " done = True \n", 388 | " # compare new_rmw_bin_value//2 and new gen; if different, the write was lost \n", 389 | " new_gen = meta['gen']\n", 390 | " if new_rmw_bin_value//2 != new_gen: \n", 391 | " return 'lost writes'\n", 392 | " return None" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "## Test Results\n", 400 | "Let's execute for various levels of concurrency and see the results. We expect to see no lost writes. Even when max-retries are exceeded, transaction and database integrity is preserved."
401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 13, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "2 threads, 10 transcations per thread:\n", 413 | "\tThead 1 failures: {}\n", 414 | "\tThead 0 failures: {}\n", 415 | "3 threads, 10 transcations per thread:\n", 416 | "\tThead 1 failures: {}\n", 417 | "\tThead 0 failures: {}\n", 418 | "\tThead 2 failures: {}\n", 419 | "4 threads, 10 transcations per thread:\n", 420 | "\tThead 0 failures: {}\n", 421 | "\tThead 3 failures: {'max retries exceeded': 1}\n", 422 | "\tThead 2 failures: {'max retries exceeded': 1}\n", 423 | "\tThead 1 failures: {'max retries exceeded': 2}\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "run_test(num_threads=2, rmw_fn=rmw_with_gen_check)\n", 429 | "run_test(num_threads=3, rmw_fn=rmw_with_gen_check)\n", 430 | "run_test(num_threads=4, rmw_fn=rmw_with_gen_check)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "# Takeaways\n", 438 | "In the tutorial we showed:\n", 439 | "- the need for read-write transactions in Aerospike to use the R-M-W pattern \n", 440 | "- how writes can be overwritten and lost in a concurrent environment if performed simply\n", 441 | "- how the developer can ensure atomicity and isolation of a read-write transaction by using version check logic and syntax." 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "# Clean up\n", 449 | "Remove data and close connection." 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 14, 455 | "metadata": { 456 | "ExecuteTime": { 457 | "end_time": "2020-12-29T20:49:21.100931Z", 458 | "start_time": "2020-12-29T20:49:21.095318Z" 459 | } 460 | }, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Removed tutorial data. Connection closed.\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "client.truncate(namespace, tutorial_set, 0)\n", 472 | "# Close the connection to the Aerospike cluster\n", 473 | "client.close()\n", 474 | "print('Removed tutorial data. Connection closed.')" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# Further Exploration and Resources\n", 482 | "For further exploration of transactions support in Aerospike, check out the following resources:\n", 483 | "\n", 484 | "- Blog posts\n", 485 | " - [Developers: Understanding Aerospike Transactions](https://www.aerospike.com/blog/developers-understanding-aerospike-transactions/)\n", 486 | " - [Twelve Do's of Consistency in Aerospike](https://www.aerospike.com/blog/twelve-dos-of-consistency-in-aerospike/)\n", 487 | "- Video\n", 488 | " - [Strong Consistency in Databases. What does it actually guarantee?](https://www.aerospike.com/resources/videos/strong-consistency-in-databases-what-does-it-actually-guarantee/)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "## Next steps\n", 496 | "\n", 497 | "Visit [Aerospike notebooks repo](https://github.com/aerospike-examples/interactive-notebooks) to run additional Aerospike notebooks. To run a different notebook, download the notebook from the repo to your local machine, and then click on File->Open, and select Upload." 
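As a closing aside to the Introduction's point that only certain write operations can run server-side: when an update can be expressed as one of those built-in single-bin operations, such as an integer increment, the R-M-W round trip and the generation check are not needed at all, because the server applies the operation atomically. A minimal sketch, reusing the namespace and tutorial_set variables from this notebook (the 'id-2' key and 'count' bin are illustrative, and it assumes the client connection from earlier, so it would have to run before the Clean up step):

    from aerospike_helpers.operations import operations as op_helpers

    counter_key = (namespace, tutorial_set, 'id-2')
    client.put(counter_key, {'count': 0})

    # Increment and read back in one atomic, single-record request
    ops = [op_helpers.increment('count', 1), op_helpers.read('count')]
    _, meta, bins = client.operate(counter_key, ops)
    print(bins['count'])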
498 | ] 499 | } 500 | ], 501 | "metadata": { 502 | "kernelspec": { 503 | "display_name": "Python 3", 504 | "language": "python", 505 | "name": "python3" 506 | }, 507 | "language_info": { 508 | "codemirror_mode": { 509 | "name": "ipython", 510 | "version": 3 511 | }, 512 | "file_extension": ".py", 513 | "mimetype": "text/x-python", 514 | "name": "python", 515 | "nbconvert_exporter": "python", 516 | "pygments_lexer": "ipython3", 517 | "version": "3.8.6" 518 | }, 519 | "toc": { 520 | "base_numbering": 1, 521 | "nav_menu": {}, 522 | "number_sections": true, 523 | "sideBar": true, 524 | "skip_h1_title": false, 525 | "title_cell": "Table of Contents", 526 | "title_sidebar": "Contents", 527 | "toc_cell": true, 528 | "toc_position": {}, 529 | "toc_section_display": true, 530 | "toc_window_display": false 531 | } 532 | }, 533 | "nbformat": 4, 534 | "nbformat_minor": 4 535 | } 536 | -------------------------------------------------------------------------------- /notebooks/spark/.gitignore: -------------------------------------------------------------------------------- 1 | aerospike-spark-assembly*.jar -------------------------------------------------------------------------------- /notebooks/spark/other_notebooks/AerospikeSparkH2ODemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Aerospike Connect for Spark - H2O Tutorial for Python\n", 8 | "## Tested with Java 8, Spark 2.4.0, H2O 3.30.1.2, h2o_pysparkling_2.4, Python 3.7, and Aerospike Spark Connector 2.5" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Setup\n", 16 | "\n", 17 | "Below, a seed address for your Aerospike database cluster is required\n", 18 | "\n", 19 | "Check the given namespace is available, and your feature key is located as per AS_FEATURE_KEY_PATH\n", 20 | "\n", 21 | "Finally, review https://www.aerospike.com/enterprise/download/connectors/ to ensure AEROSPIKE_SPARK_JAR_VERSION is correct" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 4, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# IP Address or DNS name for one host in your Aerospike cluster\n", 31 | "AS_HOST =\"127.0.0.1\"\n", 32 | "# Name of one of your namespaces. Type 'show namespaces' at the aql prompt if you are not sure\n", 33 | "AS_NAMESPACE = \"test\" \n", 34 | "AS_FEATURE_KEY_PATH = \"/etc/aerospike/features.conf\"\n", 35 | "AEROSPIKE_SPARK_JAR_VERSION=\"2.5.0\"\n", 36 | "\n", 37 | "AS_PORT = 3000 # Usually 3000, but change here if not\n", 38 | "AS_CONNECTION_STRING = AS_HOST + \":\"+ str(AS_PORT)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Next we locate the Spark installation - this will be found using the SPARK_HOME environment variable that you will have set \n", 48 | "# if you followed the repository README\n", 49 | "\n", 50 | "import findspark\n", 51 | "findspark.init()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 1, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "Checking whether there is an H2O instance running at http://localhost:54321 . connected.\n" 64 | ] 65 | }, 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "\n", 90 | "\n", 91 | "\n", 92 | "\n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "\n", 98 | "\n", 99 | "\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | "-------------------------- ------------------------------------------------------------------\n", 104 | "H2O_cluster_uptime: 24 days 15 hours 18 mins\n", 105 | "H2O_cluster_timezone: America/Los_Angeles\n", 106 | "H2O_data_parsing_timezone: UTC\n", 107 | "H2O_cluster_version: 3.30.1.2\n", 108 | "H2O_cluster_version_age: 1 month and 11 days\n", 109 | "H2O_cluster_name: H2O_from_python_kmatty_mnldpz\n", 110 | "H2O_cluster_total_nodes: 1\n", 111 | "H2O_cluster_free_memory: 3.057 Gb\n", 112 | "H2O_cluster_total_cores: 16\n", 113 | "H2O_cluster_allowed_cores: 16\n", 114 | "H2O_cluster_status: locked, healthy\n", 115 | "H2O_connection_url: http://localhost:54321\n", 116 | "H2O_connection_proxy: {\"http\": null, \"https\": null}\n", 117 | "H2O_internal_security: False\n", 118 | "H2O_API_Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4\n", 119 | "Python_version: 3.7.5 final\n", 120 | "-------------------------- ------------------------------------------------------------------" 121 | ] 122 | }, 123 | "metadata": {}, 124 | "output_type": "display_data" 125 | } 126 | ], 127 | "source": [ 128 | "import h2o\n", 129 | "h2o.init()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "aerospike-spark-assembly-2.5.0.jar already downloaded\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "# Here we download the Aerospike Spark jar\n", 147 | "import urllib\n", 148 | "import os\n", 149 | "\n", 150 | "def aerospike_spark_jar_download_url(version=AEROSPIKE_SPARK_JAR_VERSION):\n", 151 | " DOWNLOAD_PREFIX=\"https://www.aerospike.com/enterprise/download/connectors/aerospike-spark/\"\n", 152 | " DOWNLOAD_SUFFIX=\"/artifact/jar\"\n", 153 | " AEROSPIKE_SPARK_JAR_DOWNLOAD_URL = DOWNLOAD_PREFIX+AEROSPIKE_SPARK_JAR_VERSION+DOWNLOAD_SUFFIX\n", 154 | " return AEROSPIKE_SPARK_JAR_DOWNLOAD_URL\n", 155 | "\n", 156 | "def download_aerospike_spark_jar(version=AEROSPIKE_SPARK_JAR_VERSION):\n", 157 | " JAR_NAME=\"aerospike-spark-assembly-\"+AEROSPIKE_SPARK_JAR_VERSION+\".jar\"\n", 158 | " if(not(os.path.exists(JAR_NAME))) :\n", 159 | " urllib.request.urlretrieve(aerospike_spark_jar_download_url(),JAR_NAME)\n", 160 | " else :\n", 161 | " print(JAR_NAME+\" already downloaded\")\n", 162 | " return os.path.join(os.getcwd(),JAR_NAME)\n", 163 | "\n", 164 | "AEROSPIKE_JAR_PATH=download_aerospike_spark_jar()\n", 165 | "os.environ[\"PYSPARK_SUBMIT_ARGS\"] = '--jars ' + AEROSPIKE_JAR_PATH + ' pyspark-shell'" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "import pyspark\n", 175 | "from pyspark.context import SparkContext\n", 176 | "from pyspark.sql.context import SQLContext\n", 177 | "from pyspark.sql.session import SparkSession\n", 178 | "from pyspark.sql.types import StringType, StructField, StructType, ArrayType, IntegerType, MapType, LongType, DoubleType\n", 179 | "from pysparkling import *" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Get a spark session object and set required Aerospike configuration properties" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "Set up spark and point aerospike db to AS_HOST" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | 
"metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "sc = SparkContext.getOrCreate()\n", 203 | "spark = SparkSession(sc)\n", 204 | "sqlContext = SQLContext(sc)\n", 205 | "spark.conf.set(\"aerospike.namespace\",AS_NAMESPACE)\n", 206 | "spark.conf.set(\"aerospike.seedhost\",AS_CONNECTION_STRING)\n", 207 | "spark.conf.set(\"aerospike.keyPath\",AS_FEATURE_KEY_PATH )" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 12, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Connecting to H2O server at http://192.168.1.6:54321 ... successful.\n" 220 | ] 221 | }, 222 | { 223 | "data": { 224 | "text/html": [ 225 | "
\n", 226 | "\n", 227 | "\n", 228 | "\n", 229 | "\n", 230 | "\n", 231 | "\n", 232 | "\n", 233 | "\n", 234 | "\n", 235 | "\n", 236 | "\n", 237 | "\n", 238 | "\n", 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "\n", 245 | "\n", 246 | "\n", 247 | "\n", 248 | "\n", 249 | "\n", 250 | "\n", 251 | "\n", 252 | "\n", 253 | "\n", 254 | "\n", 255 | "\n", 256 | "
" 257 | ], 258 | "text/plain": [ 259 | "-------------------------- -------------------------------------------------------------------------------------------------------\n", 260 | "H2O_cluster_uptime: 22 secs\n", 261 | "H2O_cluster_timezone: America/Los_Angeles\n", 262 | "H2O_data_parsing_timezone: UTC\n", 263 | "H2O_cluster_version: 3.30.1.2\n", 264 | "H2O_cluster_version_age: 1 month and 11 days\n", 265 | "H2O_cluster_name: sparkling-water-kmatty_local-1602784872166\n", 266 | "H2O_cluster_total_nodes: 1\n", 267 | "H2O_cluster_free_memory: 794 Mb\n", 268 | "H2O_cluster_total_cores: 16\n", 269 | "H2O_cluster_allowed_cores: 16\n", 270 | "H2O_cluster_status: locked, healthy\n", 271 | "H2O_connection_url: http://192.168.1.6:54321\n", 272 | "H2O_connection_proxy: null\n", 273 | "H2O_internal_security: False\n", 274 | "H2O_API_Extensions: XGBoost, Algos, Amazon S3, Sparkling Water REST API Extensions, AutoML, Core V3, TargetEncoder, Core V4\n", 275 | "Python_version: 3.7.5 final\n", 276 | "-------------------------- -------------------------------------------------------------------------------------------------------" 277 | ] 278 | }, 279 | "metadata": {}, 280 | "output_type": "display_data" 281 | }, 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "\n", 287 | "Sparkling Water Context:\n", 288 | " * Sparkling Water Version: 3.30.1.2-1-2.4\n", 289 | " * H2O name: sparkling-water-kmatty_local-1602784872166\n", 290 | " * cluster size: 1\n", 291 | " * list of used nodes:\n", 292 | " (executorId, host, port)\n", 293 | " ------------------------\n", 294 | " (0,192.168.1.6,54321)\n", 295 | " ------------------------\n", 296 | "\n", 297 | " Open H2O Flow in browser: http://192.168.1.6:54323 (CMD + click in Mac OSX)\n", 298 | "\n", 299 | " \n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "h2oContext = H2OContext.getOrCreate()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Create Sample Data and load it into Aerospike" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "Data created\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "# We create age vs salary data, using three different Gaussian distributions\n", 329 | "import numpy as np\n", 330 | "import matplotlib.pyplot as plt\n", 331 | "import pandas as pd\n", 332 | "import math\n", 333 | "\n", 334 | "# Create covariance matrix from std devs + correlation\n", 335 | "def covariance_matrix(std_dev_1,std_dev_2,correlation):\n", 336 | " return [[std_dev_1 ** 2, correlation * std_dev_1 * std_dev_2], \n", 337 | " [correlation * std_dev_1 * std_dev_2, std_dev_2 ** 2]]\n", 338 | "\n", 339 | "# Return a bivariate sample given means/std dev/correlation\n", 340 | "def age_salary_sample(distribution_params,sample_size):\n", 341 | " mean = [distribution_params[\"age_mean\"], distribution_params[\"salary_mean\"]]\n", 342 | " cov = covariance_matrix(distribution_params[\"age_std_dev\"],distribution_params[\"salary_std_dev\"],\n", 343 | " distribution_params[\"age_salary_correlation\"])\n", 344 | " return np.random.multivariate_normal(mean, cov, sample_size).T\n", 345 | "\n", 346 | "# Define the characteristics of our age/salary distribution\n", 347 | "age_salary_distribution_1 = {\"age_mean\":25,\"salary_mean\":50000,\n", 348 | " \"age_std_dev\":1,\"salary_std_dev\":5000,\"age_salary_correlation\":0.3}\n", 349 
| "\n", 350 | "age_salary_distribution_2 = {\"age_mean\":45,\"salary_mean\":80000,\n", 351 | " \"age_std_dev\":4,\"salary_std_dev\":10000,\"age_salary_correlation\":0.7}\n", 352 | "\n", 353 | "age_salary_distribution_3 = {\"age_mean\":35,\"salary_mean\":70000,\n", 354 | " \"age_std_dev\":2,\"salary_std_dev\":9000,\"age_salary_correlation\":0.1}\n", 355 | "\n", 356 | "distribution_data = [age_salary_distribution_1,age_salary_distribution_2,age_salary_distribution_3]\n", 357 | "\n", 358 | "# Sample age/salary data for each distributions\n", 359 | "group_1_ages,group_1_salaries = age_salary_sample(age_salary_distribution_1,sample_size=100)\n", 360 | "group_2_ages,group_2_salaries = age_salary_sample(age_salary_distribution_2,sample_size=120)\n", 361 | "group_3_ages,group_3_salaries = age_salary_sample(age_salary_distribution_3,sample_size=80)\n", 362 | "\n", 363 | "ages=np.concatenate([group_1_ages,group_2_ages,group_3_ages])\n", 364 | "salaries=np.concatenate([group_1_salaries,group_2_salaries,group_3_salaries])\n", 365 | "\n", 366 | "print(\"Data created\")" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 14, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "# Turn the above records into a Data Frame\n", 376 | "# First of all, create an array of arrays\n", 377 | "inputBuf = []\n", 378 | "\n", 379 | "for i in range(0, len(ages)) :\n", 380 | " id = i + 1 # Avoid counting from zero\n", 381 | " name = \"Individual: {:03d}\".format(id)\n", 382 | " # Note we need to make sure values are typed correctly\n", 383 | " # salary will have type numpy.float64 - if it is not cast as below, an error will be thrown\n", 384 | " age = float(ages[i])\n", 385 | " salary = int(salaries[i])\n", 386 | " inputBuf.append((id, name,age,salary))\n", 387 | "\n", 388 | "# Convert to an RDD \n", 389 | "inputRDD = spark.sparkContext.parallelize(inputBuf)\n", 390 | " \n", 391 | "# Convert to a data frame using a schema\n", 392 | "schema = StructType([\n", 393 | " StructField(\"id\", IntegerType(), True),\n", 394 | " StructField(\"name\", StringType(), True),\n", 395 | " StructField(\"age\", DoubleType(), True),\n", 396 | " StructField(\"salary\",IntegerType(), True)\n", 397 | "])\n", 398 | "\n", 399 | "inputDF=spark.createDataFrame(inputRDD,schema)\n", 400 | "\n", 401 | "#Write the data frame to Aerospike, the id field is used as the primary key\n", 402 | "inputDF \\\n", 403 | ".write \\\n", 404 | ".mode('overwrite') \\\n", 405 | ".format(\"com.aerospike.spark.sql\") \\\n", 406 | ".option(\"aerospike.set\", \"salary_data\")\\\n", 407 | ".option(\"aerospike.updateByKey\", \"id\") \\\n", 408 | ".save()" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "## Step 1: Load data into a DataFrame using user specified schema " 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 15, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "+---+---------------+-----------------+------+\n", 428 | "| id| name| age|salary|\n", 429 | "+---+---------------+-----------------+------+\n", 430 | "|239|Individual: 239|31.83300818606226| 74975|\n", 431 | "|101|Individual: 101|43.01299505505053| 73747|\n", 432 | "|194|Individual: 194|40.82834439786344| 63853|\n", 433 | "| 31|Individual: 031|25.38038331484876| 52375|\n", 434 | "|139|Individual: 139|47.62537494799876| 80100|\n", 435 | "+---+---------------+-----------------+------+\n", 436 | "only showing 
top 5 rows\n", 437 | "\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "# If we explicitly set the schema, using the previously created schema object\n", 443 | "# we effectively type the rows in the Data Frame\n", 444 | "\n", 445 | "loadedDFWithSchema=spark \\\n", 446 | ".read \\\n", 447 | ".format(\"com.aerospike.spark.sql\") \\\n", 448 | ".schema(schema) \\\n", 449 | ".option(\"aerospike.set\", \"salary_data\").load()\n", 450 | "\n", 451 | "loadedDFWithSchema.show(5)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "## Step 2: Load Data from Spark DataFrame into H2OFrame" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "#Save into an H2OFrame using a Key. A key is an entry in the H2O Key value store that maps to an object in H2O.\n", 468 | "loadedDFWithSchema.write.format(\"h2o\").option(\"key\", \"key_one\").save()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 20, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "#List the current contents of the H2O cluster, you can use the h2o.ls.\n", 478 | "h2o.ls()\n", 479 | "\n", 480 | "h2oframe = h2o.get_frame(\"key_one\")" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "## Step 3: Create a model using H2O libraries" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 21, 493 | "metadata": {}, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/html": [ 498 | "\n", 499 | "\n", 500 | "\n", 501 | "\n", 502 | "\n", 503 | "\n", 504 | "\n", 505 | "\n", 506 | "\n", 507 | "\n", 508 | "\n", 509 | "\n", 510 | "\n", 511 | "\n", 512 | "\n", 513 | "\n", 514 | "\n", 515 | "\n", 516 | "\n", 517 | "\n", 518 | "\n", 519 | "\n", 520 | "\n", 521 | "
         id                   name             age                  salary
type     int                  string           real                 int
mins     1.0                  NaN              22.405590847347618   37748.0
mean     150.5                NaN              35.593540086982685   67127.00666666667
maxs     300.0                NaN              60.312589253321136   107261.0
sigma    86.74675786448738    NaN              8.788476744518679    15177.875046143428
zeros    0                    0                0                    0
missing  0                    0                0                    0
0        239.0                Individual: 239  31.83300818606226    74975.0
1        101.0                Individual: 101  43.01299505505053    73747.0
2        194.0                Individual: 194  40.82834439786344    63853.0
3        31.0                 Individual: 031  25.38038331484876    52375.0
4        139.0                Individual: 139  47.62537494799876    80100.0
5        14.0                 Individual: 014  25.41226437694945    50203.0
6        142.0                Individual: 142  35.49930947093095    66239.0
7        272.0                Individual: 272  32.59037083790934    51935.0
8        76.0                 Individual: 076  25.066279193638437   50236.0
9        147.0                Individual: 147  44.565530108647465   77111.0
" 522 | ] 523 | }, 524 | "metadata": {}, 525 | "output_type": "display_data" 526 | } 527 | ], 528 | "source": [ 529 | "h2oframe.summary()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [] 538 | } 539 | ], 540 | "metadata": { 541 | "kernelspec": { 542 | "display_name": "Python 3", 543 | "language": "python", 544 | "name": "python3" 545 | }, 546 | "language_info": { 547 | "codemirror_mode": { 548 | "name": "ipython", 549 | "version": 3 550 | }, 551 | "file_extension": ".py", 552 | "mimetype": "text/x-python", 553 | "name": "python", 554 | "nbconvert_exporter": "python", 555 | "pygments_lexer": "ipython3", 556 | "version": "3.7.5" 557 | } 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 2 561 | } 562 | -------------------------------------------------------------------------------- /notebooks/spark/other_notebooks/AerospikeSparkPythonParquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Convert Aerospike data into a Parquet file using Spark\n", 8 | "## Tested with Spark connector 3.1.0, Java 8, Apache Spark 3.0.2, Python 3.7" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### The purpose of this notebook is to walk you through how to convert Aerospike data into a Parquet file using [Spark APIs](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html). [Apache Parquet](https://parquet.apache.org/) is a columnar storage format that is extensively used as a format of choice for analysis in the big data ecosystem. " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# IP Address or DNS name for one host in your Aerospike cluster\n", 25 | "AS_HOST =\"127.0.0.1\"\n", 26 | "# Name of one of your namespaces. 
Type 'show namespaces' at the aql prompt if you are not sure\n", 27 | "AS_NAMESPACE = \"testNameSpace\" \n", 28 | "AEROSPIKE_SPARK_JAR_VERSION=\"3.1.0\"\n", 29 | "AS_PORT = 3000 # Usually 3000, but change here if not\n", 30 | "AS_CONNECTION_STRING = AS_HOST + \":\"+ str(AS_PORT)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# Next we locate the Spark installation - this will be found using the SPARK_HOME \n", 40 | "# environment variable that you will have set \n", 41 | "\n", 42 | "import findspark\n", 43 | "findspark.init()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "#### Please download the Aeropsike Connect for Spark from the [download page](https://enterprise.aerospike.com/enterprise/download/connectors/aerospike-spark/notes.html) and make sure you check the [interoperability page]( https://docs.aerospike.com/docs/connect/processing/spark/installation.html#prerequisites-for-using-the-spark-connector ).\n", 51 | "Set `AEROSPIKE_JAR_PATH` with path to the downloaded binary" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import os \n", 61 | "AEROSPIKE_JAR_PATH= \"aerospike-spark-assembly-\"+AEROSPIKE_SPARK_JAR_VERSION+\".jar\"\n", 62 | "os.environ[\"PYSPARK_SUBMIT_ARGS\"] = '--jars ' + AEROSPIKE_JAR_PATH + ' pyspark-shell'" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import pyspark\n", 72 | "from pyspark.context import SparkContext\n", 73 | "from pyspark.sql.context import SQLContext\n", 74 | "from pyspark.sql.session import SparkSession\n", 75 | "from pyspark.sql.types import StringType, StructField, StructType, ArrayType, IntegerType, MapType, LongType, DoubleType" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Configure Aerospike properties in the Spark Session object. Please visit [Configuring Aerospike Connect for Spark](https://docs.aerospike.com/docs/connect/processing/spark/configuration.html) for more information about the properties used on this page." 
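Equivalently, the same aerospike.* properties can be supplied when the Spark session is first built, rather than set on an existing context as the next cell does. A minimal sketch, not part of the original notebook; the application name is illustrative, and the other values are the variables defined above:

    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .appName("aerospike-parquet-demo")
             .config("spark.jars", AEROSPIKE_JAR_PATH)          # connector jar set up earlier
             .config("aerospike.namespace", AS_NAMESPACE)
             .config("aerospike.seedhost", AS_CONNECTION_STRING)
             .getOrCreate())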
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "sc = SparkContext.getOrCreate()\n", 92 | "conf=sc._conf.setAll([(\"aerospike.namespace\",AS_NAMESPACE),(\"aerospike.seedhost\",AS_CONNECTION_STRING)])\n", 93 | "sc.stop()\n", 94 | "sc = pyspark.SparkContext(conf=conf)\n", 95 | "spark = SparkSession(sc)\n", 96 | "sqlContext = SQLContext(sc)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Load data from Aerospike into a Spark DataFrame" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 15, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "+--------------+--------------------+---------+------------+-------+------------+--------------+-----------+-----------+--------------+------------+-----------+-----------+-------------+------------+--------------+-----------+----------+\n", 116 | "| __key| __digest| __expiry|__generation| __ttl|drate_100Kl7|conf_rate_100K|probable_dd|d_rate_100K| state_ter|total_deaths|total_cases|d_in_last_7|confirm_cases|crate_100Kl7|case_last_week|pbble_cases|confirm_dd|\n", 117 | "+--------------+--------------------+---------+------------+-------+------------+--------------+-----------+-----------+--------------+------------+-----------+-----------+-------------+------------+--------------+-----------+----------+\n", 118 | "|Virgin Islands|[2D 40 5A 16 9B 9...|377621369| 2|2591982| 0.3| 1342.0| 0| 21.0|Virgin Islands| 23| 1405| 2| 0| 3.7| 27| 0| 0|\n", 119 | "|North Carolina|[83 70 D3 0C A3 2...|377621369| 2|2591982| 0.3| 2825.0| 94| 44.0|North Carolina| 4607| 293339| 224| 280213| 22.9| 16647| 13126| 4513|\n", 120 | "| Indiana|[91 60 2C F4 F4 4...|377621369| 2|2591982| 0.6| 3144.0| 246| 69.0| Indiana| 4629| 210374| 265| 0| 60.3| 28266| 0| 4383|\n", 121 | "| Oklahoma|[EF 70 A8 4C 85 0...|377621369| 2|2591982| 0.4| 3720.0| 43| 36.0| Oklahoma| 1450| 146692| 98| 124671| 58.5| 16151| 22021| 1407|\n", 122 | "| Missouri|[0A 91 83 C6 45 D...|377621369| 2|2591982| 0.3| 3415.0| 0| 51.0| Missouri| 3153| 209197| 127| 0| 55.2| 23662| 0| 0|\n", 123 | "+--------------+--------------------+---------+------------+-------+------------+--------------+-----------+-----------+--------------+------------+-----------+-----------+-------------+------------+--------------+-----------+----------+\n", 124 | "only showing top 5 rows\n", 125 | "\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "#We will not specify the schema here, but rather use the schema inference capability of the Spark connector. 
\n", 131 | "as_df=spark \\\n", 132 | ".read \\\n", 133 | ".format(\"aerospike\") \\\n", 134 | ".option(\"aerospike.set\", \"covid_stats\") \\\n", 135 | ".option(\"aerospike.sendKey\", \"true\") \\\n", 136 | ".load() \n", 137 | "as_df.show(5)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 21, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "root\n", 150 | " |-- __key: string (nullable = true)\n", 151 | " |-- __digest: binary (nullable = true)\n", 152 | " |-- __expiry: integer (nullable = false)\n", 153 | " |-- __generation: integer (nullable = false)\n", 154 | " |-- __ttl: integer (nullable = false)\n", 155 | " |-- drate_100Kl7: double (nullable = true)\n", 156 | " |-- conf_rate_100K: double (nullable = true)\n", 157 | " |-- probable_dd: long (nullable = true)\n", 158 | " |-- d_rate_100K: double (nullable = true)\n", 159 | " |-- state_ter: string (nullable = true)\n", 160 | " |-- total_deaths: long (nullable = true)\n", 161 | " |-- total_cases: long (nullable = true)\n", 162 | " |-- d_in_last_7: long (nullable = true)\n", 163 | " |-- confirm_cases: long (nullable = true)\n", 164 | " |-- crate_100Kl7: double (nullable = true)\n", 165 | " |-- case_last_week: long (nullable = true)\n", 166 | " |-- pbble_cases: long (nullable = true)\n", 167 | " |-- confirm_dd: long (nullable = true)\n", 168 | "\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "as_df.printSchema()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Dump the DataFrame into a parquet file in your local FS, HDFS, or S3" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 17, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "as_df.write.parquet(\"proto.parquet\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "#### Notice that a directory \"proto.parquet\" is created in your current directory with a bunch of files" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### Read the parquet file from your data store for further analysis" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 22, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "#Read in the parquet file created above\n", 213 | "#Parquet files are self-describing so the schema is preserved\n", 214 | "#The result of loading a Parquet file is also a DataFrame\n", 215 | "parquetFileDF = spark.read.parquet(\"proto.parquet\")\n", 216 | "\n", 217 | "#Parquet files can also be used to create a temporary view and then used in SQL statements\n", 218 | "parquetFileDF.createOrReplaceTempView(\"parquetFile\")" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Analyze data" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 23, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stdout", 235 | "output_type": "stream", 236 | "text": [ 237 | "+--------------------+------------+\n", 238 | "| states|covid_deaths|\n", 239 | "+--------------------+------------+\n", 240 | "|Federated States ...| 0|\n", 241 | "|Republic of Marsh...| 0|\n", 242 | "|Northern Mariana ...| 2|\n", 243 | "|District of Columbia| 654|\n", 244 | "| North Carolina| 4607|\n", 245 | "| Virgin Islands| 23|\n", 246 | "| American Samoa| 0|\n", 247 | "| South Carolina| 4036|\n", 
248 | "| New Hampshire| 489|\n", 249 | "| West Virginia| 502|\n", 250 | "| Massachusetts| 10131|\n", 251 | "| New York City| 24086|\n", 252 | "| New Jersey| 16429|\n", 253 | "| Guam| 88|\n", 254 | "| Pennsylvania| 9020|\n", 255 | "| Rhode Island| 1224|\n", 256 | "| North Dakota| 639|\n", 257 | "| Arizona| 6164|\n", 258 | "| California| 17963|\n", 259 | "| Idaho| 686|\n", 260 | "+--------------------+------------+\n", 261 | "only showing top 20 rows\n", 262 | "\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "namesDF = spark.sql(\"SELECT state_ter as states, total_deaths as covid_deaths FROM parquetFile\").show()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 26, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "+--------------+\n", 280 | "| hot_zones|\n", 281 | "+--------------+\n", 282 | "|North Carolina|\n", 283 | "| Massachusetts|\n", 284 | "| New Jersey|\n", 285 | "| Pennsylvania|\n", 286 | "| Arizona|\n", 287 | "| California|\n", 288 | "| Georgia|\n", 289 | "| Tennessee|\n", 290 | "| Wisconsin|\n", 291 | "| Minnesota|\n", 292 | "| Colorado|\n", 293 | "| Kentucky|\n", 294 | "| Illinois|\n", 295 | "| Virginia|\n", 296 | "| Missouri|\n", 297 | "| New York|\n", 298 | "| Nebraska|\n", 299 | "| Oklahoma|\n", 300 | "| Michigan|\n", 301 | "| Florida|\n", 302 | "+--------------+\n", 303 | "only showing top 20 rows\n", 304 | "\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "namesDF = spark.sql(\"SELECT state_ter as hot_zones FROM parquetFile where case_last_week > 10000\").show()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.7.5" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /notebooks/spark/other_notebooks/AerospikeSparkSQLSyntaxDemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Aerospike Connect for Spark - SQL Syntax Tutorial for Python\n", 8 | "## Tested with Java 8, Spark 2.4.0, Python 3.7, and Aerospike Spark Connector 2.5" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Setup\n", 16 | "\n", 17 | "Below, a seed address for your Aerospike database cluster is required\n", 18 | "\n", 19 | "Check the given namespace is available, and your feature key is located as per AS_FEATURE_KEY_PATH\n", 20 | "\n", 21 | "Finally, review https://www.aerospike.com/enterprise/download/connectors/ to ensure AEROSPIKE_SPARK_JAR_VERSION is correct" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# IP Address or DNS name for one host in your Aerospike cluster\n", 31 | "AS_HOST =\"127.0.0.1\"\n", 32 | "# Name of one of your namespaces. 
Type 'show namespaces' at the aql prompt if you are not sure\n", 33 | "AS_NAMESPACE = \"test\" \n", 34 | "AS_FEATURE_KEY_PATH = \"/etc/aerospike/features.conf\"\n", 35 | "AEROSPIKE_SPARK_JAR_VERSION=\"2.5.0\"\n", 36 | "\n", 37 | "AS_PORT = 3000 # Usually 3000, but change here if not\n", 38 | "AS_CONNECTION_STRING = AS_HOST + \":\"+ str(AS_PORT)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Next we locate the Spark installation - this will be found using the SPARK_HOME environment variable that you will have set \n", 48 | "# if you followed the repository README\n", 49 | "\n", 50 | "import findspark\n", 51 | "findspark.init()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "aerospike-spark-assembly-2.5.0.jar already downloaded\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# Here we download the Aerospike Spark jar\n", 69 | "import urllib\n", 70 | "import os\n", 71 | "\n", 72 | "def aerospike_spark_jar_download_url(version=AEROSPIKE_SPARK_JAR_VERSION):\n", 73 | " DOWNLOAD_PREFIX=\"https://www.aerospike.com/enterprise/download/connectors/aerospike-spark/\"\n", 74 | " DOWNLOAD_SUFFIX=\"/artifact/jar\"\n", 75 | " AEROSPIKE_SPARK_JAR_DOWNLOAD_URL = DOWNLOAD_PREFIX+AEROSPIKE_SPARK_JAR_VERSION+DOWNLOAD_SUFFIX\n", 76 | " return AEROSPIKE_SPARK_JAR_DOWNLOAD_URL\n", 77 | "\n", 78 | "def download_aerospike_spark_jar(version=AEROSPIKE_SPARK_JAR_VERSION):\n", 79 | " JAR_NAME=\"aerospike-spark-assembly-\"+AEROSPIKE_SPARK_JAR_VERSION+\".jar\"\n", 80 | " if(not(os.path.exists(JAR_NAME))) :\n", 81 | " urllib.request.urlretrieve(aerospike_spark_jar_download_url(),JAR_NAME)\n", 82 | " else :\n", 83 | " print(JAR_NAME+\" already downloaded\")\n", 84 | " return os.path.join(os.getcwd(),JAR_NAME)\n", 85 | "\n", 86 | "AEROSPIKE_JAR_PATH=download_aerospike_spark_jar()\n", 87 | "os.environ[\"PYSPARK_SUBMIT_ARGS\"] = '--jars ' + AEROSPIKE_JAR_PATH + ' pyspark-shell'" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "import pyspark\n", 97 | "from pyspark.context import SparkContext\n", 98 | "from pyspark.sql.context import SQLContext\n", 99 | "from pyspark.sql.session import SparkSession\n", 100 | "from pyspark.sql.types import StringType, StructField, StructType, ArrayType, IntegerType, MapType, LongType, DoubleType" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Get a spark session object and set required Aerospike configuration properties" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Set up spark and point aerospike db to AS_HOST" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "sc = SparkContext.getOrCreate()\n", 124 | "spark = SparkSession(sc)\n", 125 | "sqlContext = SQLContext(sc)\n", 126 | "spark.conf.set(\"aerospike.namespace\",AS_NAMESPACE)\n", 127 | "spark.conf.set(\"aerospike.seedhost\",AS_CONNECTION_STRING)\n", 128 | "spark.conf.set(\"aerospike.keyPath\",AS_FEATURE_KEY_PATH )" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Create Sample Data and load it into Aerospike" 136 | ] 137 | }, 138 | { 139 | 
"cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Data created\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "# We create age vs salary data, using three different Gaussian distributions\n", 153 | "import numpy as np\n", 154 | "import matplotlib.pyplot as plt\n", 155 | "import pandas as pd\n", 156 | "import math\n", 157 | "\n", 158 | "# Create covariance matrix from std devs + correlation\n", 159 | "def covariance_matrix(std_dev_1,std_dev_2,correlation):\n", 160 | " return [[std_dev_1 ** 2, correlation * std_dev_1 * std_dev_2], \n", 161 | " [correlation * std_dev_1 * std_dev_2, std_dev_2 ** 2]]\n", 162 | "\n", 163 | "# Return a bivariate sample given means/std dev/correlation\n", 164 | "def age_salary_sample(distribution_params,sample_size):\n", 165 | " mean = [distribution_params[\"age_mean\"], distribution_params[\"salary_mean\"]]\n", 166 | " cov = covariance_matrix(distribution_params[\"age_std_dev\"],distribution_params[\"salary_std_dev\"],\n", 167 | " distribution_params[\"age_salary_correlation\"])\n", 168 | " return np.random.multivariate_normal(mean, cov, sample_size).T\n", 169 | "\n", 170 | "# Define the characteristics of our age/salary distribution\n", 171 | "age_salary_distribution_1 = {\"age_mean\":25,\"salary_mean\":50000,\n", 172 | " \"age_std_dev\":1,\"salary_std_dev\":5000,\"age_salary_correlation\":0.3}\n", 173 | "\n", 174 | "age_salary_distribution_2 = {\"age_mean\":45,\"salary_mean\":80000,\n", 175 | " \"age_std_dev\":4,\"salary_std_dev\":10000,\"age_salary_correlation\":0.7}\n", 176 | "\n", 177 | "age_salary_distribution_3 = {\"age_mean\":35,\"salary_mean\":70000,\n", 178 | " \"age_std_dev\":2,\"salary_std_dev\":9000,\"age_salary_correlation\":0.1}\n", 179 | "\n", 180 | "distribution_data = [age_salary_distribution_1,age_salary_distribution_2,age_salary_distribution_3]\n", 181 | "\n", 182 | "# Sample age/salary data for each distributions\n", 183 | "group_1_ages,group_1_salaries = age_salary_sample(age_salary_distribution_1,sample_size=100)\n", 184 | "group_2_ages,group_2_salaries = age_salary_sample(age_salary_distribution_2,sample_size=120)\n", 185 | "group_3_ages,group_3_salaries = age_salary_sample(age_salary_distribution_3,sample_size=80)\n", 186 | "\n", 187 | "ages=np.concatenate([group_1_ages,group_2_ages,group_3_ages])\n", 188 | "salaries=np.concatenate([group_1_salaries,group_2_salaries,group_3_salaries])\n", 189 | "\n", 190 | "print(\"Data created\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Turn the above records into a Data Frame\n", 200 | "# First of all, create an array of arrays\n", 201 | "inputBuf = []\n", 202 | "\n", 203 | "for i in range(0, len(ages)) :\n", 204 | " id = i + 1 # Avoid counting from zero\n", 205 | " name = \"Individual: {:03d}\".format(id)\n", 206 | " # Note we need to make sure values are typed correctly\n", 207 | " # salary will have type numpy.float64 - if it is not cast as below, an error will be thrown\n", 208 | " age = float(ages[i])\n", 209 | " salary = int(salaries[i])\n", 210 | " inputBuf.append((id, name,age,salary))\n", 211 | "\n", 212 | "# Convert to an RDD \n", 213 | "inputRDD = spark.sparkContext.parallelize(inputBuf)\n", 214 | " \n", 215 | "# Convert to a data frame using a schema\n", 216 | "schema = StructType([\n", 217 | " StructField(\"id\", IntegerType(), True),\n", 218 | " 
StructField(\"name\", StringType(), True),\n", 219 | " StructField(\"age\", DoubleType(), True),\n", 220 | " StructField(\"salary\",IntegerType(), True)\n", 221 | "])\n", 222 | "\n", 223 | "inputDF=spark.createDataFrame(inputRDD,schema)\n", 224 | "\n", 225 | "#Write the data frame to Aerospike, the id field is used as the primary key\n", 226 | "inputDF \\\n", 227 | ".write \\\n", 228 | ".mode('overwrite') \\\n", 229 | ".format(\"com.aerospike.spark.sql\") \\\n", 230 | ".option(\"aerospike.set\", \"salary_data\")\\\n", 231 | ".option(\"aerospike.updateByKey\", \"id\") \\\n", 232 | ".save()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## Step 1: Load data into a DataFrame using user specified schema " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 8, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "+---+---------------+------------------+------+\n", 252 | "| id| name| age|salary|\n", 253 | "+---+---------------+------------------+------+\n", 254 | "|239|Individual: 239|35.045714151242784| 64851|\n", 255 | "|101|Individual: 101| 48.94863100225242| 92233|\n", 256 | "|194|Individual: 194| 43.87904465057981| 76336|\n", 257 | "| 31|Individual: 031|25.419955216543517| 51542|\n", 258 | "|139|Individual: 139|39.658710069583876| 80585|\n", 259 | "+---+---------------+------------------+------+\n", 260 | "only showing top 5 rows\n", 261 | "\n" 262 | ] 263 | }, 264 | { 265 | "data": { 266 | "text/plain": [ 267 | "300" 268 | ] 269 | }, 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "# If we explicitly set the schema, using the previously created schema object\n", 277 | "# we effectively type the rows in the Data Frame\n", 278 | "\n", 279 | "loadedDFWithSchema=spark \\\n", 280 | ".read \\\n", 281 | ".format(\"com.aerospike.spark.sql\") \\\n", 282 | ".schema(schema) \\\n", 283 | ".option(\"aerospike.set\", \"salary_data\").load()\n", 284 | "\n", 285 | "loadedDFWithSchema.show(5)\n", 286 | "loadedDFWithSchema.count()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## Step 2: Register a Temp Table" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 9, 299 | "metadata": { 300 | "scrolled": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "loadedDFWithSchema.registerTempTable(\"myview\")" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Step 3a: Read Data" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 18, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "[Row(id=239, name='Individual: 239', age=36.31763988049552, salary=73121),\n", 323 | " Row(id=101, name='Individual: 101', age=42.131372446959624, salary=88392),\n", 324 | " Row(id=194, name='Individual: 194', age=45.67209291776493, salary=68430),\n", 325 | " Row(id=31, name='Individual: 031', age=25.369666877630568, salary=48846),\n", 326 | " Row(id=139, name='Individual: 139', age=43.51114009073862, salary=82116),\n", 327 | " Row(id=14, name='Individual: 014', age=26.58855120481238, salary=61593),\n", 328 | " Row(id=142, name='Individual: 142', age=43.170929881406686, salary=86203),\n", 329 | " Row(id=272, name='Individual: 272', age=38.43340146883269, salary=72691),\n", 330 | " Row(id=76, 
name='Individual: 076', age=24.93997559158264, salary=64180),\n", 331 | " Row(id=147, name='Individual: 147', age=52.175425376631246, salary=88246),\n", 332 | " Row(id=79, name='Individual: 079', age=24.65820831985479, salary=54088),\n", 333 | " Row(id=96, name='Individual: 096', age=25.518457474526, salary=49251),\n", 334 | " Row(id=132, name='Individual: 132', age=41.798677512668064, salary=84438),\n", 335 | " Row(id=10, name='Individual: 010', age=25.509944072858175, salary=45908),\n", 336 | " Row(id=141, name='Individual: 141', age=49.80648644002289, salary=87623),\n", 337 | " Row(id=140, name='Individual: 140', age=41.11269768838019, salary=78535),\n", 338 | " Row(id=160, name='Individual: 160', age=36.35698689416882, salary=61116),\n", 339 | " Row(id=112, name='Individual: 112', age=47.632639902424046, salary=78404),\n", 340 | " Row(id=120, name='Individual: 120', age=49.876620096920284, salary=94501),\n", 341 | " Row(id=34, name='Individual: 034', age=26.77243285030579, salary=46245)]" 342 | ] 343 | }, 344 | "execution_count": 18, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "spark.sql(\"select * from myview\").take(20)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Step 3b: Write data (Coming Soon!) \n", 358 | "#### Please note that Spark does not support the DELETE statement, so only INSERT INTO and INSERT statements will be supported" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 3", 372 | "language": "python", 373 | "name": "python3" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 3 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython3", 385 | "version": "3.7.5" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 2 390 | } 391 | -------------------------------------------------------------------------------- /notebooks/spark/resources/fs-arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-examples/interactive-notebooks/0e582d4305974f6cadd390e2086e8550f1b3ecf7/notebooks/spark/resources/fs-arch.jpg -------------------------------------------------------------------------------- /notebooks/spark/resources/fs-model-ws.py: -------------------------------------------------------------------------------- 1 | # fs-model-ws.py 2 | # This file implements the web service for a simple fraud prediction model from 3 | # the Jupyter notebook feature-store-model-serving.ipynb.
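# A hypothetical usage sketch: once initialize() has run and app.run() is serving on
# Flask's default http://127.0.0.1:5000/, a prediction for a transaction whose features
# are stored in Aerospike can be requested by passing its key as 'txnid', for example:
#   curl -X POST 'http://127.0.0.1:5000/' -d 'txnid=<transaction-id>'
# The service replies with JSON containing 'normal_prob', 'fraud_prob', and 'prediction',
# as assembled in predict() below.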
4 | 5 | from flask import Flask, jsonify 6 | from flask_restful import Resource, Api, reqparse 7 | 8 | app = Flask(__name__) 9 | api = Api(app) 10 | 11 | # globals 12 | client = None 13 | spark = None 14 | rf_model = None 15 | namespace = 'test' 16 | entity_set = 'cctxn-features' 17 | features = None 18 | schema = None 19 | 20 | class CCTxnModel(Resource): 21 | 22 | # predict() processes requests and returns predictions 23 | @app.route('/', methods=['GET', 'POST']) 24 | def predict(): 25 | global client, spark, rf_model, namespace, entity_set, features, schema 26 | 27 | # use parser to find txnid 28 | parser = reqparse.RequestParser() 29 | parser.add_argument('txnid') 30 | args = parser.parse_args() 31 | txnid = args['txnid'] 32 | 33 | # Retrieving Features 34 | record_key = (namespace, entity_set, txnid) 35 | try: 36 | (key, meta, bins) = client.select(record_key, features) 37 | except: 38 | import sys 39 | print('failed to get record') 40 | sys.exit(1) 41 | 42 | # create a input dataframe for the model 43 | featureBuf = [tuple([bins[f] for f in features])] 44 | featureRDD = spark.sparkContext.parallelize(featureBuf) 45 | featureDF = spark.createDataFrame(featureRDD, schema) 46 | 47 | # Construct Feature Vector 48 | from pyspark.ml.feature import VectorAssembler 49 | 50 | # create a feature vector from features 51 | assembler = VectorAssembler(inputCols=features, outputCol="fvector") 52 | featureVectorDF = assembler.transform(featureDF) 53 | 54 | # Predict 55 | from pyspark.ml.classification import RandomForestClassificationModel 56 | rf_prediction = rf_model.transform(featureVectorDF['fvector', ]) 57 | result = rf_prediction['probability', 'prediction'].collect()[0] 58 | 59 | return jsonify({'normal_prob': result[0][0], 60 | 'fraud_prob': result[0][1], 61 | 'prediction':'no fraud' if result[1] < 0.5 else 'fraud'}) 62 | 63 | # add resource for processing requests 64 | api.add_resource(CCTxnModel, '/') 65 | 66 | # initialization of client, spark, model 67 | def initialize(): 68 | global client, spark, rf_model, features, schema 69 | 70 | # Initialize Client 71 | # connect to the database 72 | import aerospike 73 | import sys 74 | config = { 75 | 'hosts': [ ('127.0.0.1', 3000) ] 76 | } 77 | try: 78 | client = aerospike.client(config).connect() 79 | except: 80 | print("failed to connect to the cluster with", config['hosts']) 81 | sys.exit(1) 82 | print('Client initialized and connected to database') 83 | 84 | # Initialize Spark 85 | # directory where spark notebook requisites are installed 86 | #SPARK_NB_DIR = '/home/jovyan/notebooks/spark' 87 | SPARK_NB_DIR = '/opt/spark-nb' 88 | SPARK_HOME = SPARK_NB_DIR + '/spark-3.0.3-bin-hadoop3.2' 89 | # IP Address or DNS name for one host in your Aerospike cluster 90 | AS_HOST ="localhost" 91 | # Name of one of your namespaces. 
Type 'show namespaces' at the aql prompt if you are not sure 92 | AS_NAMESPACE = "test" 93 | AEROSPIKE_SPARK_JAR_VERSION="3.2.0" 94 | AS_PORT = 3000 # Usually 3000, but change here if not 95 | AS_CONNECTION_STRING = AS_HOST + ":"+ str(AS_PORT) 96 | # Next we locate the Spark installation - this will be found using the SPARK_HOME environment 97 | # variable that you will have set 98 | import findspark 99 | findspark.init(SPARK_HOME) 100 | # Aerospike Spark Connector related settings 101 | import os 102 | AEROSPIKE_JAR_PATH= "aerospike-spark-assembly-"+AEROSPIKE_SPARK_JAR_VERSION+".jar" 103 | os.environ["PYSPARK_SUBMIT_ARGS"] = '--jars ' + SPARK_NB_DIR + '/' + AEROSPIKE_JAR_PATH + ' pyspark-shell' 104 | # imports 105 | import pyspark 106 | from pyspark.context import SparkContext 107 | from pyspark.sql.session import SparkSession 108 | from pyspark.sql.types import StructField, StructType, DoubleType 109 | 110 | sc = SparkContext.getOrCreate() 111 | conf=sc._conf.setAll([("aerospike.namespace",AS_NAMESPACE),("aerospike.seedhost",AS_CONNECTION_STRING)]) 112 | sc.stop() 113 | sc = pyspark.SparkContext(conf=conf) 114 | spark = SparkSession(sc) 115 | 116 | # Load Model 117 | from pyspark.ml.classification import RandomForestClassificationModel 118 | 119 | rf_model = RandomForestClassificationModel.read().load( 120 | "/home/jovyan/notebooks/spark/resources/fs_model_rf") 121 | print("Loaded Random Forest model.") 122 | 123 | # Initialize model features and schema 124 | features = ["CC1_V"+str(i) for i in range(1,29)] # need features CC1_V1-CC1_V28 125 | schema = StructType() 126 | for i in range(1,29): # all features are of type float or Double 127 | schema.add("CC1_V"+str(i), DoubleType(), True) 128 | return 129 | 130 | 131 | if __name__ == '__main__': 132 | initialize() 133 | app.run(debug=True) 134 | -------------------------------------------------------------------------------- /notebooks/spark/resources/fs_model_rf.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-examples/interactive-notebooks/0e582d4305974f6cadd390e2086e8550f1b3ecf7/notebooks/spark/resources/fs_model_rf.tar.gz -------------------------------------------------------------------------------- /notebooks/spark/resources/install.txt: -------------------------------------------------------------------------------- 1 | 2 | #Jupyter notebook installation for Mac 3 | 4 | brew install pyenv 5 | pyenv install 3.7.0 6 | 7 | pyenv global 3.7.0 //setting 3.7.0 as our python version 8 | echo -e 'if command -v pyenv 1>/dev/null 2>&1; then\n eval "$(pyenv init -)"\nfi' >> ~/.bash_profile 9 | source ~/.bash_profile 10 | 11 | pip install --upgrade pip 12 | pip install jupyter 13 | pip install spylon-kernel 14 | python -m spylon_kernel install 15 | jupyter kernelspec list // should display installed kernels => python3 and spylon-kernel -------------------------------------------------------------------------------- /notebooks/udf/aggregate_fns.lua: -------------------------------------------------------------------------------- 1 | -- aggregate_fns.lua - stream UDF functions to implement aggregates 2 | 3 | -- count and sum reducer 4 | local function add_values(val1, val2) 5 | return (val1 or 0) + (val2 or 0) 6 | end 7 | 8 | -- count mapper 9 | -- note closures are used to access aggregate parameters such as bin 10 | local function rec_to_count_closure(bin) 11 | local function rec_to_count(rec) 12 | -- if bin is specified: if bin exists in record return 1 else 
0; if no bin is specified, return 1 13 | return (not bin and 1) or ((rec[bin] and 1) or 0) 14 | end 15 | return rec_to_count 16 | end 17 | 18 | -- count 19 | function count(stream) 20 | return stream : map(rec_to_count_closure()) : reduce(add_values) 21 | end 22 | 23 | -- mapper for various single bin aggregates 24 | local function rec_to_bin_value_closure(bin) 25 | local function rec_to_bin_value(rec) 26 | -- if a numeric bin exists in record return its value; otherwise return nil 27 | local val = rec[bin] 28 | if (type(val) ~= "number") then val = nil end 29 | return val 30 | end 31 | return rec_to_bin_value 32 | end 33 | 34 | -- sum 35 | function sum(stream, bin) 36 | return stream : map(rec_to_bin_value_closure(bin)) : reduce(add_values) 37 | end 38 | 39 | 40 | -- range filter 41 | local function range_filter_closure(range_bin, range_low, range_high) 42 | local function range_filter(rec) 43 | -- if bin value is in [low,high] return true, false otherwise 44 | local val = rec[range_bin] 45 | if (not val or type(val) ~= "number") then val = nil end 46 | return (val and (val >= range_low and val <= range_high)) or false 47 | end 48 | return range_filter 49 | end 50 | 51 | -- sum of range: sum(sum_bin) where range_bin in [range_low, range_high] 52 | function sum_range(stream, sum_bin, range_bin, range_low, range_high) 53 | return stream : filter(range_filter_closure(range_bin, range_low, range_high)) 54 | : map(rec_to_bin_value_closure(sum_bin)) : reduce(add_values) 55 | end 56 | 57 | -- min reducer 58 | local function get_min(val1, val2) 59 | local min = nil 60 | if val1 then 61 | if val2 then 62 | if val1 < val2 then min = val1 else min = val2 end 63 | else min = val1 64 | end 65 | else 66 | if val2 then min = val2 end 67 | end 68 | return min 69 | end 70 | 71 | -- min 72 | function min(stream, bin) 73 | return stream : map(rec_to_bin_value_closure(bin)) : reduce(get_min) 74 | end 75 | 76 | -- max reducer 77 | local function get_max(val1, val2) 78 | local max = nil 79 | if val1 then 80 | if val2 then 81 | if val1 > val2 then max = val1 else max = val2 end 82 | else max = val1 83 | end 84 | else 85 | if val2 then max = val2 end 86 | end 87 | return max 88 | end 89 | 90 | -- max 91 | function max(stream, bin) 92 | return stream : map(rec_to_bin_value_closure(bin)) : reduce(get_max) 93 | end 94 | 95 | -- map function to comoute average and range 96 | local function compute_final_stats(stats) 97 | local ret = map(); 98 | ret['AVERAGE'] = stats["sum"] / stats["count"] 99 | ret['RANGE'] = stats["max"] - stats["min"] 100 | return ret 101 | end 102 | 103 | -- merge partial stream maps into one 104 | local function merge_stats(a, b) 105 | local ret = map() 106 | ret["sum"] = add_values(a["sum"], b["sum"]) 107 | ret["count"] = add_values(a["count"], b["count"]) 108 | ret["min"] = get_min(a["min"], b["min"]) 109 | ret["max"] = get_max(a["max"], b["max"]) 110 | return ret 111 | end 112 | 113 | -- aggregate operator to compute stream state for average_range 114 | local function aggregate_stats(agg, val) 115 | agg["count"] = (agg["count"] or 0) + ((val["bin_avg"] and 1) or 0) 116 | agg["sum"] = (agg["sum"] or 0) + (val["bin_avg"] or 0) 117 | agg["min"] = get_min(agg["min"], val["bin_range"]) 118 | agg["max"] = get_max(agg["max"], val["bin_range"]) 119 | return agg 120 | end 121 | 122 | -- average_range 123 | function average_range(stream, bin_avg, bin_range) 124 | local function rec_to_bins(rec) 125 | -- extract the values of the two bins in ret 126 | local ret = map() 127 | ret["bin_avg"] = 
rec[bin_avg] 128 | ret["bin_range"] = rec[bin_range] 129 | return ret 130 | end 131 | return stream : map(rec_to_bins) : aggregate(map(), aggregate_stats) : reduce(merge_stats) : map(compute_final_stats) 132 | end 133 | 134 | -- nested map merge for group-by sum/count; explicit map merge at each nested level 135 | local function merge_group_sum(a, b) 136 | local function merge_group(x, y) 137 | -- inner map merge 138 | return map.merge(x, y, add_values) 139 | end 140 | -- outer map merge 141 | return map.merge(a, b, merge_group) 142 | end 143 | 144 | -- aggregate for group-by sum 145 | -- creates a map for each distinct group value and adds the value tagged for a group to the group's sum 146 | local function group_sum(agg, groupval) 147 | if not agg[groupval["group"]] then agg[groupval["group"]] = map() end 148 | agg[groupval["group"]]["sum"] = (agg[groupval["group"]]["sum"] or 0) + (groupval["value"] or 0) 149 | return agg 150 | end 151 | 152 | -- group-by with sum 153 | function groupby_with_sum(stream, bin_grpby, bin_sum) 154 | local function rec_to_group_and_bin(rec) 155 | -- tag the group by bin_grpby value, return a map containing group and bin_sum value 156 | local ret = map() 157 | ret["group"] = rec[bin_grpby] 158 | local val = rec[bin_sum] 159 | if (not val or type(val) ~= "number") then val = 0 end 160 | ret["value"] = val 161 | return ret 162 | end 163 | return stream : map(rec_to_group_and_bin) : aggregate(map(), group_sum) : reduce(merge_group_sum) 164 | end 165 | 166 | -- aggregate for group-by count 167 | -- creates a map for each distinct group value and increments the tagged group's count 168 | local function group_count(agg, group) 169 | if not agg[group] then agg[group] = map() end 170 | agg[group]["count"] = (agg[group]["count"] or 0) + ((group and 1) or 0) 171 | return agg 172 | end 173 | 174 | -- map function for group-by processing 175 | local function rec_to_group_closure(bin_grpby) 176 | local function rec_to_group(rec) 177 | -- returns group-by bin value in a record 178 | return rec[bin_grpby] 179 | end 180 | return rec_to_group 181 | end 182 | 183 | -- group-by having example: count(*) having low <= count <= high 184 | function groupby_with_count_having(stream, bin_grpby, having_range_low, having_range_high) 185 | local function process_having(stats) 186 | -- filters groups with count in the range 187 | local ret = map() 188 | for key, value in map.pairs(stats) do 189 | if (key >= having_range_low and key <= having_range_high) then 190 | ret[key] = value 191 | end 192 | end 193 | return ret 194 | end 195 | return stream : map(rec_to_group_closure(bin_grpby)) : aggregate(map(), group_count) 196 | : reduce(merge_group_sum) : map(process_having) 197 | end 198 | 199 | -- group-by count(*) order-by count 200 | function groupby_with_count_orderby(stream, bin_grpby, bin_orderby) 201 | local function orderby(t, order) 202 | -- collect the keys 203 | local keys = {} 204 | for k in pairs(t) do keys[#keys+1] = k end 205 | -- sort by the order by passing the table and keys a, b, 206 | table.sort(keys, function(a,b) return order(t, a, b) end) 207 | -- return the iterator function 208 | local i = 0 209 | return function() 210 | i = i + 1 211 | if keys[i] then 212 | return keys[i], t[keys[i] ] 213 | end 214 | end 215 | end 216 | local function process_orderby(stats) 217 | -- uses lua table sort to sort aggregate map into a list 218 | -- list has k and v separately added for sorted entries 219 | local ret = list() 220 | local t = {} 221 | for k,v in map.pairs(stats) do t[k] = v 
end 222 | for k,v in orderby(t, function(t, a, b) return t[a][bin_orderby] < t[b][bin_orderby] end) do 223 | list.append(ret, k) 224 | list.append(ret, v) 225 | end 226 | return ret 227 | end 228 | return stream : map(rec_to_group_closure(bin_grpby)) : aggregate(map(), group_count) 229 | : reduce(merge_group_sum) : map(process_orderby) 230 | end 231 | 232 | -- return map keys in a list 233 | local function map_to_list(values) 234 | local ret = list() 235 | for k in map.keys(values) do list.append(ret, k) end 236 | return ret 237 | end 238 | 239 | -- merge partial aggregate maps 240 | local function merge_values(a, b) 241 | return map.merge(a, b, function(v1, v2) return ((v1 or v2) and 1) or nil end) 242 | end 243 | 244 | -- map for distinct; using map unique keys 245 | local function distinct_values(agg, value) 246 | if value then agg[value] = 1 end 247 | return agg 248 | end 249 | 250 | -- distinct 251 | function distinct(stream, bin) 252 | local function rec_to_bin_value(rec) 253 | -- simply return bin value in rec 254 | return rec[bin] 255 | end 256 | return stream : map(rec_to_bin_value) : aggregate(map(), distinct_values) 257 | : reduce(merge_values) : map(map_to_list) 258 | end 259 | 260 | -- limit 261 | function limit(stream, bin, max) 262 | local function list_limit(agg, rec) 263 | -- add to list if the list size is below the limit 264 | if list.size(agg) < max then 265 | local ret = map() 266 | ret[bin] = rec[bin] 267 | list.append(agg, ret) 268 | end 269 | return agg 270 | end 271 | local function list_merge_limit(a, b) 272 | local ret = list() 273 | list.concat(ret, list.take(a, max)) 274 | list.concat(ret, list.take(b, (max > list.size(ret) and max-list.size(ret)) or 0)) 275 | return ret 276 | end 277 | return stream : aggregate(list(), list_limit) : reduce(list_merge_limit) 278 | end 279 | 280 | -- top n 281 | function top_n(stream, bin, n) 282 | local function get_top_n(values) 283 | -- return top n values in a map as an ordered list 284 | -- uses lua table sort 285 | local t = {} 286 | local i = 1 287 | for k in map.keys(values) do 288 | t[i] = k 289 | i = i + 1 290 | end 291 | table.sort(t, function(a,b) return a > b end) 292 | local ret = list() 293 | local i = 0 294 | for k, v in pairs(t) do 295 | list.append(ret, v) 296 | i = i + 1 297 | if i == n then break end 298 | end 299 | return ret 300 | end 301 | local function top_n_values(agg, value) 302 | if value then agg[value] = 1 end 303 | -- if map size exceeds n*10, trim to top n 304 | if map.size(agg) > n*10 then 305 | local new_agg = map() 306 | local trimmed = trim_to_top_n(agg) 307 | for value in list.iterator(trimmed) do 308 | new_agg[value] = 1 309 | end 310 | agg = new_agg 311 | end 312 | return agg 313 | end 314 | return stream : map(rec_to_bin_value_closure(bin)) : aggregate(map(), top_n_values) 315 | : reduce(merge_values) : map(get_top_n) 316 | end 317 | -------------------------------------------------------------------------------- /notebooks/udf/update_example.lua: -------------------------------------------------------------------------------- 1 | -- update_example.lua 2 | 3 | function multiplyBy(rec, binName, factor) 4 | rec[binName] = rec[binName] * factor 5 | aerospike:update(rec) 6 | end 7 | 8 | function increment(rec, binName, value) 9 | rec[binName] = rec[binName] + value 10 | aerospike:update(rec) 11 | end 12 | 13 | function increment_and_get(rec, binName, value) 14 | local ret = map() -- Initialize the return value (a map) 15 | rec[binName] = rec[binName] + value 16 | ret[binName] = rec[binName] 17 
| aerospike:update(rec) 18 | return ret 19 | end 20 | 21 | -- update the specified bins by adding and appending the values provided 22 | function add_append(rec, binName1, addVal, binName2, appendVal) 23 | rec[binName1] = rec[binName1] + addVal 24 | rec[binName2] = rec[binName2] .. appendVal 25 | aerospike:update(rec) 26 | end -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ -z "$1" ]; then 5 | 6 | serverVer="$(curl -sSL 'https://download.aerospike.com/artifacts/aerospike-server-enterprise/' | grep -E '&2 93 | echo '"aerospike-examples/interactive-notebooks" repo before running this script.' >&2 94 | hints=$( find $HOME -type d -name notebooks 2>/dev/null | grep 'interactive-notebooks/notebooks' ) 95 | if [[ hints ]] ; then 96 | echo '' >&2 97 | echo 'Maybe try one of these:' >&2 98 | echo $hints >&2 99 | fi 100 | exit 1 101 | fi 102 | 103 | # Record md5sum so we can check whether we changed the file 104 | old_sum=$( md5sum $OUTPUT_FILE 2>/dev/null) 105 | 106 | { 107 | echo "${HEADER_TEXT}" 108 | echo "" 109 | 110 | echo "" 111 | view_url="${VIEWER_URL}" 112 | launch_url="${BINDER_URL}" 113 | echo "All Notebooks | [View All](${view_url}) | [Launch in Binder](${launch_url})" 114 | echo ":-------- | ---- | ------" 115 | nbs=$( get_notebooks . ) 116 | print_notebook_rows "$nbs" 117 | 118 | view_url="${VIEWER_URL}/java" 119 | launch_url="${BINDER_URL}java" 120 | echo " | | | | " 121 | echo "**Java Notebooks** | [View All](${view_url}) | [Launch in Binder](${launch_url})" 122 | echo " | | | | " 123 | nbs=$( get_notebooks ./java ) 124 | print_notebook_rows "$nbs" 125 | 126 | view_url="${VIEWER_URL}/python" 127 | launch_url="${BINDER_URL}python" 128 | echo " | | | | " 129 | echo "**Python Notebooks** | [View All](${view_url}) | [Launch in Binder](${launch_url})" 130 | echo " | | | | " 131 | nbs=$( get_notebooks ./python ) 132 | print_notebook_rows "$nbs" 133 | 134 | view_url="${VIEWER_URL}/spark" 135 | echo " | | | | " 136 | echo "**Spark Notebooks** | [View All](${view_url})" 137 | echo " | | | | " 138 | nbs=$( get_notebooks ./spark ) 139 | print_notebook_rows_nobinder "$nbs" 140 | 141 | echo "" 142 | } > ${OUTPUT_FILE} 143 | 144 | # Compute new md5sum and check whether it's different 145 | new_sum=$( md5sum $OUTPUT_FILE ) 146 | 147 | if [[ "${old_sum}" == "${new_sum}" ]] ; then 148 | echo "${OUTPUT_FILE} did not change" >&2 149 | else 150 | echo "${OUTPUT_FILE} changed" >&2 151 | git add ${OUTPUT_FILE} 152 | fi 153 | --------------------------------------------------------------------------------
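A minimal sketch of how the record UDFs in notebooks/udf/update_example.lua above can be registered and applied with the Aerospike Python client. The host address, namespace, set, key, and bin names ('test', 'demo', 'user1', 'counter', 'note') are illustrative assumptions, not values taken from this repository.

import aerospike

# Connect to a local single-node cluster (assumed address)
client = aerospike.client({'hosts': [('127.0.0.1', 3000)]}).connect()

# Register the Lua module with the server; the path is wherever the file sits on the client machine
client.udf_put('notebooks/udf/update_example.lua')

# Create a sample record, then apply the record UDFs from the 'update_example' module
key = ('test', 'demo', 'user1')
client.put(key, {'counter': 1, 'note': 'a'})

client.apply(key, 'update_example', 'multiplyBy', ['counter', 10])   # counter -> 10
result = client.apply(key, 'update_example', 'increment_and_get', ['counter', 5])
print(result)                                                        # e.g. {'counter': 15}
client.apply(key, 'update_example', 'add_append', ['counter', 1, 'note', 'b'])

client.close()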