├── ansible ├── data │ ├── group_vars │ │ └── .gitkeep │ ├── host_vars │ │ └── .gitkeep │ ├── roles │ │ ├── common │ │ │ ├── defaults │ │ │ │ └── .gitkeep │ │ │ ├── files │ │ │ │ └── .gitkeep │ │ │ ├── handlers │ │ │ │ └── .gitkeep │ │ │ ├── meta │ │ │ │ └── .gitkeep │ │ │ ├── vars │ │ │ │ └── main.yml │ │ │ ├── templates │ │ │ │ └── motd │ │ │ └── tasks │ │ │ │ ├── main.yml │ │ │ │ ├── motd.yml │ │ │ │ ├── package.yml │ │ │ │ └── oracle-jdk.yml │ │ ├── docker │ │ │ ├── meta │ │ │ │ └── main.yml │ │ │ └── tasks │ │ │ │ └── main.yml │ │ └── schema-registry │ │ │ ├── meta │ │ │ └── main.yml │ │ │ ├── defaults │ │ │ └── main.yml │ │ │ ├── img │ │ │ ├── ansible.png │ │ │ └── draw-io-ansible.xml │ │ │ ├── handlers │ │ │ └── main.yml │ │ │ ├── docker-compose-local.yml │ │ │ ├── files │ │ │ ├── log4j.properties │ │ │ └── schema-registry.properties │ │ │ ├── README.md │ │ │ └── tasks │ │ │ └── main.yml │ ├── hosts │ └── site.yml ├── destroy_ansible.sh ├── setup_share.sh ├── setup_ansible.sh └── Vagrantfile ├── aws └── emr │ ├── application │ ├── api │ │ ├── __init__.py │ │ ├── status_api.py │ │ ├── emr_api.py │ │ └── example_api.py │ ├── service │ │ ├── __init__.py │ │ ├── emr_service.py │ │ └── example_service.py │ ├── templates │ │ ├── page_not_found.html │ │ └── hello.html │ ├── static │ │ └── example.txt │ ├── __init__.py │ ├── configuration.py │ ├── main.py │ └── logger.py │ ├── setup.cfg │ ├── .dockerignore │ ├── MANIFEST.in │ ├── dev.sh │ ├── requirements.txt │ ├── README.md │ ├── Dockerfile │ ├── setup.py │ └── tests │ └── application_test.py ├── hadoop ├── example │ ├── map-reduce │ │ ├── src │ │ │ ├── test │ │ │ │ └── java │ │ │ │ │ └── .gitkeep │ │ │ └── main │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── github │ │ │ │ └── niqdev │ │ │ │ ├── IntSumReducer.java │ │ │ │ ├── TokenizerMapper.java │ │ │ │ └── WordCount.java │ │ ├── settings.gradle │ │ ├── README.md │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ ├── build.gradle │ │ ├── gradlew.bat │ │ └── gradlew │ └── spark │ │ ├── src │ │ ├── test │ │ │ └── scala │ │ │ │ └── .gitkeep │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── niqdev │ │ │ └── App.scala │ │ ├── project │ │ ├── build.properties │ │ └── Dependencies.scala │ │ └── build.sbt ├── file │ ├── hadoop │ │ ├── config │ │ │ ├── masters │ │ │ ├── slaves │ │ │ ├── core-site.xml │ │ │ ├── hdfs-site.xml │ │ │ ├── mapred-site.xml │ │ │ ├── fair-scheduler.xml │ │ │ └── yarn-site.xml │ │ └── profile-hadoop.sh │ ├── oozie │ │ ├── profile-oozie.sh │ │ └── config │ │ │ ├── oozie-env.sh │ │ │ └── oozie-site.xml │ ├── spark │ │ ├── profile-spark.sh │ │ └── config │ │ │ ├── spark-env.sh │ │ │ ├── spark-defaults.conf │ │ │ └── log4j.properties │ ├── ssh │ │ └── config │ ├── zeppelin │ │ ├── profile-zeppelin.sh │ │ └── config │ │ │ └── zeppelin-env.sh │ ├── hosts │ └── motd ├── script │ ├── setup_zeppelin.sh │ ├── bootstrap.sh │ ├── setup_hadoop.sh │ ├── setup_spark.sh │ ├── setup_ubuntu.sh │ └── setup_oozie.sh ├── Vagrantfile └── vagrant_hadoop.sh ├── docs ├── img │ ├── hdfs-read.png │ ├── spark-job.png │ ├── hdfs-write.png │ ├── kafka-topic.png │ ├── kafka-cluster.png │ ├── kafka-consumer.png │ ├── kafka-producer.png │ ├── kubernetes-run.png │ ├── map-reduce-job.png │ ├── yarn-scheduler.png │ ├── cassandra-memory.png │ ├── cassandra-query.png │ ├── kubernetes-rbac.png │ ├── yarn-application.png │ ├── cassandra-read-path.png │ ├── kubernetes-client.png │ ├── kubernetes-cluster.png │ ├── kubernetes-volume.png │ ├── 
spark-architecture.png │ ├── cassandra-token-ring.png │ ├── cassandra-write-path.png │ ├── kafka-consumer-group.png │ ├── kafka-rebalance-lost.png │ ├── kubernetes-deployment.png │ ├── map-reduce-data-flow.png │ ├── kubernetes-architecture.png │ ├── kubernetes-container-api.png │ └── kafka-rebalance-duplicate.png ├── jvm.md ├── scala.md ├── index.md ├── azure.md ├── other-resources.md ├── zookeeper.md ├── programming.md ├── cloud.md ├── operating-system.md ├── docker.md ├── ansible.md ├── toolbox.md ├── cassandra.md ├── system-design.md ├── kafka.md └── hadoop.md ├── .github ├── dependabot.yml └── workflows │ └── gh-pages.yml ├── miscellaneous ├── hello.c └── setup_k8s.sh ├── cassandra ├── docker-compose.yml ├── cql │ ├── all_users.csv │ ├── column_users.csv │ ├── example_create.cql │ └── example_query.cql └── docker-compose-cluster.yml ├── requirements.txt ├── base ├── supervisor.sed └── Dockerfile ├── .gitignore ├── zookeeper ├── supervisor.ini ├── zoo.cfg └── Dockerfile ├── kafka ├── supervisor-connect.ini ├── supervisor-kafka.ini ├── docker-compose.yml ├── Dockerfile └── docker-compose-hub.yml ├── docs-todo ├── _aws.md ├── _neo4j.md └── _spark.md ├── mkdocs.yml ├── README.md └── dev.txt /ansible/data/group_vars/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/host_vars/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/defaults/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/files/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/handlers/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/meta/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest -------------------------------------------------------------------------------- /aws/emr/.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/test/java/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/hadoop/example/spark/src/test/scala/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/templates/page_not_found.html: -------------------------------------------------------------------------------- 1 | D'oh! -------------------------------------------------------------------------------- /ansible/data/roles/common/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apt_cache: 3600 -------------------------------------------------------------------------------- /aws/emr/application/static/example.txt: -------------------------------------------------------------------------------- 1 | example-static-file 2 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/masters: -------------------------------------------------------------------------------- 1 | secondary-namenode.local 2 | -------------------------------------------------------------------------------- /aws/emr/MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft app/templates 2 | graft app/static 3 | -------------------------------------------------------------------------------- /hadoop/example/spark/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /ansible/data/roles/docker/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - common -------------------------------------------------------------------------------- /hadoop/example/map-reduce/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'map-reduce' 2 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/slaves: -------------------------------------------------------------------------------- 1 | node-1.local 2 | node-2.local 3 | node-3.local 4 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - common -------------------------------------------------------------------------------- /docs/img/hdfs-read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/hdfs-read.png -------------------------------------------------------------------------------- /docs/img/spark-job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/spark-job.png -------------------------------------------------------------------------------- /docs/img/hdfs-write.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/hdfs-write.png -------------------------------------------------------------------------------- /docs/img/kafka-topic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-topic.png -------------------------------------------------------------------------------- /docs/img/kafka-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-cluster.png -------------------------------------------------------------------------------- /docs/img/kafka-consumer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-consumer.png -------------------------------------------------------------------------------- /docs/img/kafka-producer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-producer.png -------------------------------------------------------------------------------- /docs/img/kubernetes-run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-run.png -------------------------------------------------------------------------------- /docs/img/map-reduce-job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/map-reduce-job.png -------------------------------------------------------------------------------- /docs/img/yarn-scheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/yarn-scheduler.png -------------------------------------------------------------------------------- /docs/img/cassandra-memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-memory.png -------------------------------------------------------------------------------- /docs/img/cassandra-query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-query.png -------------------------------------------------------------------------------- /docs/img/kubernetes-rbac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-rbac.png -------------------------------------------------------------------------------- /docs/img/yarn-application.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/yarn-application.png -------------------------------------------------------------------------------- /docs/img/cassandra-read-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-read-path.png -------------------------------------------------------------------------------- /docs/img/kubernetes-client.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-client.png -------------------------------------------------------------------------------- /docs/img/kubernetes-cluster.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-cluster.png -------------------------------------------------------------------------------- /docs/img/kubernetes-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-volume.png -------------------------------------------------------------------------------- /docs/img/spark-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/spark-architecture.png -------------------------------------------------------------------------------- /docs/jvm.md: -------------------------------------------------------------------------------- 1 | # JVM 2 | 3 | Moved to scala-fp 4 | -------------------------------------------------------------------------------- /docs/img/cassandra-token-ring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-token-ring.png -------------------------------------------------------------------------------- /docs/img/cassandra-write-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-write-path.png -------------------------------------------------------------------------------- /docs/img/kafka-consumer-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-consumer-group.png -------------------------------------------------------------------------------- /docs/img/kafka-rebalance-lost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-rebalance-lost.png -------------------------------------------------------------------------------- /docs/img/kubernetes-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-deployment.png -------------------------------------------------------------------------------- /docs/img/map-reduce-data-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/map-reduce-data-flow.png -------------------------------------------------------------------------------- /docs/scala.md: -------------------------------------------------------------------------------- 1 | # Scala 2 | 3 | Moved to scala-fp 4 | -------------------------------------------------------------------------------- /docs/img/kubernetes-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-architecture.png -------------------------------------------------------------------------------- /docs/img/kubernetes-container-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-container-api.png -------------------------------------------------------------------------------- 
/docs/img/kafka-rebalance-duplicate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-rebalance-duplicate.png -------------------------------------------------------------------------------- /hadoop/example/map-reduce/README.md: -------------------------------------------------------------------------------- 1 | # map-reduce-example 2 | 3 | ``` 4 | ./gradlew clean build 5 | ./gradlew jar 6 | ``` 7 | -------------------------------------------------------------------------------- /hadoop/file/oozie/profile-oozie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export OOZIE_HOME=/usr/local/oozie 4 | export PATH=${OOZIE_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /hadoop/file/spark/profile-spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export SPARK_HOME=/usr/local/spark 4 | export PATH=${SPARK_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | schema: 3 | registry: 4 | user: cp-schema-registry 5 | group: confluent 6 | -------------------------------------------------------------------------------- /hadoop/file/ssh/config: -------------------------------------------------------------------------------- 1 | Host * 2 | StrictHostKeyChecking no 3 | UserKnownHostsFile=/dev/null 4 | NoHostAuthenticationForLocalhost yes 5 | -------------------------------------------------------------------------------- /ansible/data/roles/common/templates/motd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo '\nHost: {{ ansible_nodename }}' 4 | echo 'Groups: {{ group_names | join(', ') }}' 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/img/ansible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/ansible/data/roles/schema-registry/img/ansible.png -------------------------------------------------------------------------------- /hadoop/file/zeppelin/profile-zeppelin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export ZEPPELIN_HOME=/usr/local/zeppelin 4 | export PATH=${ZEPPELIN_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /miscellaneous/hello.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | main() { 4 | 
printf("Hello, World.\n"); 5 | } 6 | 7 | // compile: cc -o hello hello.c 8 | // run: ./hello 9 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - import_tasks: package.yml 4 | tags: 5 | - package 6 | 7 | - import_tasks: motd.yml 8 | tags: 9 | - motd 10 | -------------------------------------------------------------------------------- /hadoop/file/oozie/config/oozie-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OOZIE_BASE_PATH=/vol/oozie 4 | export OOZIE_DATA=${OOZIE_BASE_PATH}/data 5 | export OOZIE_LOG=${OOZIE_BASE_PATH}/log 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # DevOps 2 | 3 | A collection of notes, resources, documentation and POCs mainly related to distributed systems for local development, learning purposes and quick prototyping. 4 | -------------------------------------------------------------------------------- /aws/emr/application/templates/hello.html: -------------------------------------------------------------------------------- 1 | 2 | Hello from Flask 3 | {% if name %} 4 |

<h1>Hello {{ name }}!</h1> 5 | {% else %} 6 | <h1>Hello, World!</h1>

7 | {% endif %} -------------------------------------------------------------------------------- /hadoop/file/spark/config/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPARK_LOG_DIR=/vol/spark/log 4 | # fix warning in spark-shell 5 | export SPARK_LOCAL_IP=$(hostname -i | sed 's/^127.0.0.1 //') 6 | -------------------------------------------------------------------------------- /cassandra/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | cassandra: 4 | container_name: devops-cassandra 5 | image: cassandra:3.11 6 | ports: 7 | - 9042:9042 8 | volumes: 9 | - ./cql:/cql 10 | -------------------------------------------------------------------------------- /aws/emr/dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -fr .eggs/ *.egg-info */__pycache__/ */*/__pycache__/ 4 | 5 | source venv/bin/activate 6 | 7 | pip install -e . 8 | 9 | export FLASK_APP=application 10 | export FLASK_DEBUG=1 11 | flask run 12 | -------------------------------------------------------------------------------- /aws/emr/application/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app = Flask(__name__) 4 | app.config.from_object('application.configuration.Config') 5 | #app.config.from_envvar('APPLICATION_SETTINGS', silent=True) 6 | 7 | import application.main 8 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: restart schema-registry 4 | systemd: 5 | name: "{{ schema.registry.service_name }}" 6 | state: restarted 7 | 8 | - name: reload systemd 9 | command: systemctl daemon-reload 10 | -------------------------------------------------------------------------------- /ansible/data/hosts: -------------------------------------------------------------------------------- 1 | [ansible] 2 | 192.168.100.10 3 | 4 | [cluster] 5 | #ip-192-168-100-11.local 6 | 192.168.100.11 7 | 192.168.100.12 8 | 192.168.100.13 9 | 10 | [docker] 11 | 192.168.100.11 12 | 192.168.100.12 13 | 14 | [schema-registry] 15 | 192.168.100.11 16 | -------------------------------------------------------------------------------- /hadoop/file/zeppelin/config/zeppelin-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ZEPPELLIN_BASE_PATH=/vol/zeppelin 4 | export ZEPPELIN_LOG_DIR=${ZEPPELLIN_BASE_PATH}/log 5 | export ZEPPELIN_NOTEBOOK_DIR=${ZEPPELLIN_BASE_PATH}/notebook 6 | 7 | export ZEPPELIN_MEM="-Xms1024m -Xmx1024m" 8 | -------------------------------------------------------------------------------- /aws/emr/requirements.txt: -------------------------------------------------------------------------------- 1 | astroid==1.6.1 2 | click==6.7 3 | Flask==1.1.1 4 | isort==4.3.4 5 | itsdangerous==0.24 6 | Jinja2==2.10.1 7 | lazy-object-proxy==1.3.1 8 | MarkupSafe==1.0 9 | mccabe==0.6.1 10 | pylint==1.8.2 11 | six==1.11.0 12 | Werkzeug==0.15.3 13 | wrapt==1.10.11 14 | -------------------------------------------------------------------------------- /aws/emr/README.md: -------------------------------------------------------------------------------- 1 | # aws-emr 2 | 3 | ### Development 4 | 5 | ``` 6 | # 
create 7 | virtualenv -p $(which python3) venv 8 | 9 | # activate virtualenv 10 | source venv/bin/activate 11 | 12 | # development script 13 | ./dev.sh 14 | 15 | # deactivate virtualenv 16 | deactivate 17 | ``` 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==7.1.2 2 | future==0.18.2 3 | Jinja2==2.11.3 4 | joblib==0.14.1 5 | livereload==2.6.1 6 | lunr==0.5.6 7 | Markdown==3.2.2 8 | MarkupSafe==1.1.1 9 | mkdocs==1.1 10 | nltk==3.5 11 | PyYAML==5.4 12 | regex==2020.5.7 13 | six==1.14.0 14 | tornado==6.0.4 15 | tqdm==4.46.0 16 | -------------------------------------------------------------------------------- /base/supervisor.sed: -------------------------------------------------------------------------------- 1 | s/logfile=\/tmp\/supervisord.log/logfile=\/var\/log\/supervisord.log/ 2 | s/pidfile=\/tmp\/supervisord.pid/pidfile=\/var\/run\/supervisord.pid/ 3 | s/nodaemon=false/nodaemon=true/ 4 | s/\;\[include\]/\[include\]/ 5 | s/\;files = relative\/directory\/\*.ini/files = \/etc\/supervisor\/conf.d\/\*/ 6 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Jan 23 20:22:38 GMT 2018 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.4.1-all.zip 7 | -------------------------------------------------------------------------------- /docs/azure.md: -------------------------------------------------------------------------------- 1 | # Azure 2 | 3 | * ARM template [documentation](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates) 4 | * Azure Automation [documentation](https://docs.microsoft.com/en-us/azure/automation) 5 | * Azure Security Center [documentation](https://docs.microsoft.com/en-us/azure/security-center) 6 | -------------------------------------------------------------------------------- /aws/emr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | WORKDIR /usr/src 4 | 5 | COPY requirements.txt ./ 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY ./application ./application 9 | 10 | COPY setup.py setup.cfg MANIFEST.in ./ 11 | RUN pip install --editable . 
12 | 13 | CMD [ "python", "./application/main.py" ] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | 4 | */.vagrant 5 | 6 | ansible/.share 7 | ansible/data/site.retry 8 | aws/*/logs/ 9 | cassandra/.cassandra 10 | hadoop/.data/ 11 | 12 | .gradle/ 13 | build/ 14 | 15 | __pycache__ 16 | *.pyc 17 | .pytest_cache/ 18 | venv/ 19 | .eggs/ 20 | *.egg-info 21 | 22 | *.iml 23 | .idea/ 24 | target/ 25 | .vscode/ 26 | *.log 27 | 28 | site 29 | -------------------------------------------------------------------------------- /aws/emr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='aws-emr', 5 | version='0.1', 6 | packages=['application'], 7 | include_package_data=True, 8 | install_requires=[ 9 | 'flask', 10 | ], 11 | setup_requires=[ 12 | 'pytest-runner', 13 | ], 14 | tests_require=[ 15 | 'pytest', 16 | ], 17 | ) 18 | -------------------------------------------------------------------------------- /aws/emr/tests/application_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class ApplicationTestCase(unittest.TestCase): 4 | 5 | def setUp(self): 6 | print('test setUp') 7 | 8 | def tearDown(self): 9 | print('test tearDown') 10 | 11 | def test_example(self): 12 | assert 'aaa' in 'aaa' 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /zookeeper/supervisor.ini: -------------------------------------------------------------------------------- 1 | [program:zookeeper] 2 | command=/opt/zookeeper/bin/zkServer.sh start-foreground 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/zookeeper/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/zookeeper/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /hadoop/example/spark/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies.{V, allDependencies} 2 | 3 | lazy val root = (project in file(".")). 
4 | settings( 5 | inThisBuild(List( 6 | organization := "com.github.niqdev", 7 | scalaVersion := V.scala, 8 | version := "0.1.0-SNAPSHOT" 9 | )), 10 | name := "spark-github", 11 | libraryDependencies ++= allDependencies 12 | ) 13 | -------------------------------------------------------------------------------- /kafka/supervisor-connect.ini: -------------------------------------------------------------------------------- 1 | [program:connect] 2 | command=/opt/kafka/bin/connect-distributed.sh /opt/kafka/config/connect-distributed.properties 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/connect/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/connect/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /hadoop/file/spark/config/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | # TODO spark.yarn.jars hdfs://namenode.local:9000/user/spark/share/lib/*.jar 3 | # TODO spark.yarn.archive hdfs://namenode.local:9000/user/spark/share/spark-archive.zip 4 | 5 | # history server 6 | spark.eventLog.enabled true 7 | spark.eventLog.dir hdfs://namenode.local:9000/user/spark/log 8 | spark.history.fs.logDirectory hdfs://namenode.local:9000/user/spark/log 9 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java-library' 2 | apply plugin: 'application' 3 | 4 | repositories { 5 | jcenter() 6 | } 7 | 8 | mainClassName = "com.github.niqdev.WordCount" 9 | 10 | jar { 11 | manifest { 12 | attributes 'Main-Class': "$mainClassName" 13 | } 14 | } 15 | 16 | dependencies { 17 | compile group: 'org.apache.hadoop', name: 'hadoop-client', version: '2.7.5' 18 | } 19 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/profile-hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HADOOP_HOME=/usr/local/hadoop 4 | export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${PATH} 5 | 6 | export HADOOP_LOG_PATH=/vol/hadoop/log 7 | export HADOOP_LOG_DIR=${HADOOP_LOG_PATH}/hadoop 8 | export YARN_LOG_DIR=${HADOOP_LOG_PATH}/yarn 9 | export HADOOP_MAPRED_LOG_DIR=${HADOOP_LOG_PATH}/mapred 10 | 11 | # required by spark 12 | export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 13 | -------------------------------------------------------------------------------- /kafka/supervisor-kafka.ini: -------------------------------------------------------------------------------- 1 | [program:kafka] 2 | command=/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties --override zookeeper.connect="%(ENV_ZOOKEEPER_HOSTS)s" 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/kafka/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/kafka/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: github-pages 2 | on: 3 | push: 4 | branches: 5 | - master 6 | 7 | jobs: 8 | build: 9 | name: Deploy docs 
10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout main 13 | uses: actions/checkout@v3.2.0 14 | 15 | - name: Deploy docs 16 | uses: mhausenblas/mkdocs-deploy-gh-pages@nomaterial 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /aws/emr/application/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class DefaultConfig(object): 4 | APP_NAME = 'aws-emr' 5 | LOG_PATH = 'logs/application.log' 6 | ENVIRONMENT = 'DEFAULT' 7 | DEBUG = False 8 | HTTP_HOST = '127.0.0.1' 9 | HTTP_PORT = 5000 10 | 11 | class Config(DefaultConfig): 12 | # docker doesn't forward 127.0.0.1 13 | HTTP_HOST = os.getenv('HTTP_HOST', '0.0.0.0') 14 | HTTP_PORT = int(os.getenv('HTTP_PORT', 5000)) 15 | -------------------------------------------------------------------------------- /aws/emr/application/api/status_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | from flask import jsonify 4 | 5 | @app.route('/status') 6 | def status(): 7 | app.logger.debug('status') 8 | return jsonify({ 9 | 'status': 'OK' 10 | }) 11 | 12 | @app.route('/info') 13 | def info(): 14 | app.logger.debug('info') 15 | return jsonify({ 16 | 'application': app.config['APP_NAME'], 17 | 'env': app.config['ENVIRONMENT'] 18 | }) 19 | -------------------------------------------------------------------------------- /hadoop/file/hosts: -------------------------------------------------------------------------------- 1 | # hadoop hosts 2 | 172.16.0.10 master master.local namenode.local secondary-namenode.local resource-manager.local web-proxy.local history.local 3 | 172.16.0.10 spark.local spark-history.local zeppelin.local postgres.local oozie.local 4 | 172.16.0.101 node-1 node-1.local datanode-1.local node-manager-1.local 5 | 172.16.0.102 node-2 node-2.local datanode-2.local node-manager-2.local 6 | 172.16.0.103 node-3 node-3.local datanode-3.local node-manager-3.local 7 | -------------------------------------------------------------------------------- /miscellaneous/setup_k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 \ 4 | && chmod +x minikube \ 5 | && sudo mv minikube /usr/local/bin/ 6 | 7 | curl -Lo kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \ 8 | && chmod +x kubectl \ 9 | && sudo mv kubectl /usr/local/bin/ 10 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/motd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # custom banner 4 | # https://ownyourbits.com/2017/04/05/customize-your-motd-login-message-in-debian-and-ubuntu/ 5 | 6 | - name: remove help banner from motd 7 | become: yes 8 | file: 9 | path: /etc/update-motd.d/10-help-text 10 | state: absent 11 | 12 | - name: add custom banner to motd 13 | become: yes 14 | template: 15 | src: motd 16 | dest: /etc/update-motd.d/10-custom-text 17 | mode: 0755 18 | -------------------------------------------------------------------------------- /ansible/destroy_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 
| 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] destroy ansible" 12 | 13 | read -p "Are you sure? [y/n]" -n 1 -r 14 | echo 15 | if [[ $REPLY =~ ^[Yy]$ ]] 16 | then 17 | vagrant destroy -f 18 | 19 | rm -frv \ 20 | .vagrant \ 21 | .share 22 | fi 23 | 24 | echo "[-] destroy ansible" 25 | -------------------------------------------------------------------------------- /ansible/data/site.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: test 4 | hosts: all 5 | tasks: 6 | - name: test uptime 7 | shell: uptime 8 | tags: 9 | - test 10 | 11 | - name: common setup 12 | hosts: cluster 13 | roles: 14 | - common 15 | tags: 16 | - common 17 | 18 | - name: docker setup 19 | hosts: docker 20 | roles: 21 | - docker 22 | tags: 23 | - docker 24 | 25 | - name: schema registry setup 26 | hosts: schema-registry 27 | roles: 28 | - schema-registry 29 | tags: 30 | - schema-registry 31 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://namenode.local:9000 7 | 8 | 9 | 10 | 11 | hadoop.proxyuser.hadoop.hosts 12 | * 13 | 14 | 15 | hadoop.proxyuser.hadoop.groups 16 | * 17 | 18 | 19 | -------------------------------------------------------------------------------- /hadoop/file/motd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "\nHostname: \033[1;31m$(hostname -s)\033[0m" 4 | echo "Uptime:$(uptime)\n" 5 | 6 | echo '* master: 172.16.0.10' 7 | echo '* node-1: 172.16.0.101\n' 8 | 9 | echo '* NameNode: http://namenode.local:50070' 10 | echo '* ResourceManager: http://resource-manager.local:8088' 11 | echo '* MapReduce Job History Server: http://history.local:19888' 12 | echo '* DataNode/NodeManager (1): http://node-1.local:8042/node\n' 13 | 14 | echo '* Spark: http://spark.local:4040' 15 | echo '* Zeppelin: http://zeppelin.local:8080' 16 | echo '* Oozie: http://oozie.local:11000' 17 | -------------------------------------------------------------------------------- /ansible/setup_share.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] setup share" 12 | 13 | SHARE_PATH="$CURRENT_PATH/.share" 14 | SSH_PATH="$SHARE_PATH/ssh" 15 | 16 | echo "share path: $SHARE_PATH" 17 | 18 | rm -fr ${SHARE_PATH} 19 | mkdir -p ${SHARE_PATH}/node-{1,2,3} ${SSH_PATH} 20 | 21 | ssh-keygen -t rsa -b 4096 -C "ansible" -N "" -f "$SSH_PATH/ansible_rsa" 22 | 23 | echo "[-] setup share" 24 | -------------------------------------------------------------------------------- /kafka/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | zookeeper: 5 | container_name: devops-zookeeper 6 | build: 7 | context: ../zookeeper 8 | args: 9 | - VERSION=3.4.12 10 | ports: 11 | - 12181:2181 12 | networks: 13 | - devops_network 14 | kafka: 15 | container_name: devops-kafka 16 | build: . 
17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - 19092:9092 21 | networks: 22 | - devops_network 23 | environment: 24 | - ZOOKEEPER_HOSTS="zookeeper:2181" 25 | 26 | networks: 27 | devops_network: 28 | -------------------------------------------------------------------------------- /zookeeper/zoo.cfg: -------------------------------------------------------------------------------- 1 | # http://zookeeper.apache.org/doc/current/zookeeperAdmin.html 2 | 3 | # The number of milliseconds of each tick 4 | tickTime=2000 5 | # The number of ticks that the initial synchronization phase can take 6 | initLimit=10 7 | # The number of ticks that can pass between sending a request and getting an acknowledgement 8 | syncLimit=5 9 | # The directory where the snapshot is stored 10 | dataDir=/var/lib/zookeeper/data 11 | # The port at which the clients will connect 12 | clientPort=2181 13 | # Write the transaction log to the dataLogDir rather than the dataDir 14 | dataLogDir=/var/log/zookeeper 15 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.namenode.name.dir 6 | file:///vol/hadoop/namenode 7 | 8 | 9 | dfs.namenode.checkpoint.dir 10 | file:///vol/hadoop/secondary 11 | 12 | 13 | dfs.datanode.data.dir 14 | file:///vol/hadoop/datanode 15 | 16 | 17 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/package.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: update & upgrade 4 | become: yes 5 | apt: 6 | update_cache: yes 7 | cache_valid_time: "{{ apt_cache }}" 8 | upgrade: dist 9 | 10 | - name: install common packages 11 | become: yes 12 | apt: 13 | name: 14 | - jq 15 | - tree 16 | - httpie 17 | state: present 18 | update_cache: yes 19 | cache_valid_time: "{{ apt_cache }}" 20 | 21 | - import_tasks: oracle-jdk.yml 22 | tags: 23 | - oracle-jdk 24 | 25 | - name: cleanup 26 | become: yes 27 | apt: 28 | autoclean: yes 29 | autoremove: yes 30 | -------------------------------------------------------------------------------- /aws/emr/application/main.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.logger import Logger 3 | 4 | Logger().init() 5 | 6 | # api 7 | import application.api.status_api 8 | import application.api.example_api 9 | import application.api.emr_api 10 | 11 | # if run with cli this is NOT executed 12 | if __name__ == '__main__': 13 | app.logger.info('start application: [{0}] @ {1}:{2} in DEBUG={3}'.format( 14 | app.config['APP_NAME'], app.config['HTTP_HOST'], app.config['HTTP_PORT'], app.config['DEBUG'])) 15 | app.run(host=app.config['HTTP_HOST'], port=app.config['HTTP_PORT'], debug=app.config['DEBUG']) 16 | -------------------------------------------------------------------------------- /hadoop/example/spark/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | 5 | lazy val N = new { 6 | val spark = "org.apache.spark" 7 | } 8 | 9 | lazy val V = new { 10 | val scala = "2.11.12" 11 | 12 | val spark = "2.2.1" 13 | 14 | val scalatest = "3.0.5" 15 | } 16 | 17 | lazy val libDependencies = Seq( 18 | N.spark %% "spark-core" % V.spark % Provided, 19 | N.spark %% "spark-sql" % V.spark % Provided 20 | ) 21 | 22 | lazy 
val testDependencies = Seq( 23 | "org.scalatest" %% "scalatest" % V.scalatest % Test 24 | ) 25 | 26 | lazy val allDependencies = libDependencies ++ testDependencies 27 | 28 | } 29 | -------------------------------------------------------------------------------- /aws/emr/application/service/emr_service.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | class EmrService(object): 4 | 5 | def create_cluster(self): 6 | app.logger.debug('TODO create_cluster') 7 | return { 8 | 'instance_id': 'TODO_INSTANCE_ID' 9 | } 10 | 11 | def destroy_cluster(self): 12 | app.logger.debug('TODO destroy_cluster') 13 | return { 14 | 'instance_id': 'TODO_INSTANCE_ID' 15 | } 16 | 17 | def info_cluster(self): 18 | app.logger.debug('TODO info_cluster') 19 | return { 20 | 'instance_id': 'TODO_INSTANCE_ID', 21 | 'name': 'TODO_NAME' 22 | } 23 | -------------------------------------------------------------------------------- /cassandra/cql/all_users.csv: -------------------------------------------------------------------------------- 1 | firstNameCsvAll1;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv1a@example.com'; 'csv1b@example.com'}";True;;lastNameCsv1;; 2 | firstNameCsvAll2;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv2a@example.com'; 'csv2b@example.com'}";True;;lastNameCsv2;; 3 | firstNameCsvAll3;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv3a@example.com'; 'csv3b@example.com'}";False;;lastNameCsv3;; 4 | firstNameCsvAll4;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv4a@example.com'; 'csv4b@example.com'}";False;;lastNameCsv4;; 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/docker-compose-local.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | zookeeper: 6 | container_name: my-local-zookeeper 7 | image: niqdev/zookeeper:3.4.13 8 | ports: 9 | - 2181:2181 10 | hostname: zookeeper 11 | networks: 12 | - my_local_network 13 | 14 | kafka: 15 | container_name: my-local-kafka 16 | image: niqdev/kafka:2.0.0 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - 9092:9092 21 | - 8083:8083 22 | hostname: kafka 23 | networks: 24 | - my_local_network 25 | environment: 26 | - ZOOKEEPER_HOSTS="zookeeper:2181" 27 | 28 | networks: 29 | my_local_network: 30 | -------------------------------------------------------------------------------- /ansible/setup_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] setup ansible" 12 | 13 | sudo apt-add-repository ppa:ansible/ansible 14 | sudo apt-get update 15 | 16 | sudo apt-get install -y \ 17 | software-properties-common \ 18 | ansible 19 | 20 | # http://docs.ansible.com/ansible/latest/intro_getting_started.html#host-key-checking 21 | sudo sed -i -r "s/#host_key_checking = False/host_key_checking = False/" /etc/ansible/ansible.cfg 22 | 23 | echo "[-] setup ansible" 24 | -------------------------------------------------------------------------------- /docs-todo/_aws.md: 
-------------------------------------------------------------------------------- 1 | # AWS 2 | 3 | > TODO 4 | 5 | Documentation 6 | 7 | * [Boto 3](https://boto3.readthedocs.io/en/latest/reference/services/index.html) 8 | 9 | ## CLI 10 | 11 | TODO 12 | 13 | ## Setup 14 | 15 | Build `devops/aws-emr` image 16 | ```bash 17 | # change path 18 | cd devops/aws/emr 19 | 20 | # build image 21 | docker build -t devops/aws-emr . 22 | 23 | # start temporary container [port=HOST:CONTAINER] 24 | docker run \ 25 | --rm \ 26 | -e HTTP_PORT=8080 \ 27 | -p 5000:8080 \ 28 | --name aws-emr \ 29 | devops/aws-emr:latest 30 | 31 | # access container 32 | docker exec -it aws-emr bash 33 | ``` 34 | 35 | ### S3 36 | 37 | TODO 38 | 39 | ### EMR 40 | 41 | TODO 42 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/oracle-jdk.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: add java repository 4 | tags: 5 | - oracle-jdk 6 | become: yes 7 | apt_repository: 8 | repo: 'ppa:linuxuprising/java' 9 | state: present 10 | 11 | - name: accept oracle license 12 | tags: 13 | - oracle-jdk 14 | become: yes 15 | debconf: name='oracle-java11-installer' question='shared/accepted-oracle-license-v1-2' value='true' vtype='select' 16 | 17 | - name: install java 18 | tags: 19 | - oracle-jdk 20 | become: yes 21 | apt: 22 | name: "{{ packages }}" 23 | state: latest 24 | vars: 25 | packages: 26 | - oracle-java11-installer 27 | - oracle-java11-set-default 28 | -------------------------------------------------------------------------------- /cassandra/cql/column_users.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,addresses,emails,enable 2 | firstNameCsv1,lastNameCsv1,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv1a@example.com', 'csv1b@example.com'}",True 3 | firstNameCsv2,lastNameCsv2,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv2a@example.com', 'csv2b@example.com'}",True 4 | firstNameCsv3,lastNameCsv3,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv3a@example.com', 'csv3b@example.com'}",False 5 | firstNameCsv4,lastNameCsv4,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv4a@example.com', 'csv4b@example.com'}",False 6 | -------------------------------------------------------------------------------- /ansible/data/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: create docker group 4 | become: yes 5 | group: 6 | name: docker 7 | state: present 8 | 9 | - name: create docker user 10 | become: yes 11 | user: 12 | name: docker 13 | shell: /bin/bash 14 | groups: docker,sudo 15 | append: yes 16 | 17 | - name: install docker 18 | become: yes 19 | #become_user: docker 20 | command: 'bash -c "curl -fsSL https://get.docker.com/ | sh"' 21 | 22 | - name: install docker-compose 23 | become: yes 24 | #become_user: docker 25 | get_url: 26 | url: "https://github.com/docker/compose/releases/download/1.22.0/docker-compose-Linux-x86_64" 27 | dest: /usr/local/bin/docker-compose 28 | mode: +x 29 | -------------------------------------------------------------------------------- /zookeeper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM devops/base:latest 2 | #FROM niqdev/phusion-base:latest 
3 | 4 | ARG VERSION=3.5.5 5 | 6 | ENV ZOOKEEPER_HOME "/opt/zookeeper" 7 | ENV PATH "$ZOOKEEPER_HOME/bin:$PATH" 8 | 9 | RUN apt-get install -y \ 10 | telnet \ 11 | netcat && \ 12 | apt-get clean 13 | 14 | RUN curl https://www-eu.apache.org/dist/zookeeper/zookeeper-${VERSION}/apache-zookeeper-${VERSION}-bin.tar.gz | tar -xzf - -C /opt && \ 15 | mv /opt/apache-zookeeper-${VERSION}-bin /opt/zookeeper-${VERSION} && \ 16 | ln -s /opt/zookeeper-${VERSION} /opt/zookeeper && \ 17 | mkdir -p /var/log/zookeeper /var/lib/zookeeper/data 18 | 19 | ADD zoo.cfg /opt/zookeeper/conf/zoo.cfg 20 | ADD supervisor.ini /etc/supervisor/conf.d/zookeeper.conf 21 | -------------------------------------------------------------------------------- /base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM phusion/baseimage:latest-amd64 2 | 3 | RUN apt-get update && apt-get upgrade -y 4 | RUN add-apt-repository ppa:openjdk-r/ppa -y 5 | 6 | RUN apt-get update && apt-get install -y \ 7 | iputils-ping \ 8 | python2.7 \ 9 | python-pip \ 10 | httpie \ 11 | jq \ 12 | openjdk-8-jdk && \ 13 | apt-get clean 14 | 15 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 16 | 17 | RUN pip install --upgrade pip wheel setuptools supervisor 18 | 19 | ADD supervisor.sed /tmp/supervisor.sed 20 | 21 | RUN echo_supervisord_conf > /etc/supervisord.conf && \ 22 | sed -i -r -f /tmp/supervisor.sed /etc/supervisord.conf && \ 23 | mkdir -p /etc/supervisor/conf.d 24 | 25 | CMD ["supervisord", "-c", "/etc/supervisord.conf", "-n"] 26 | -------------------------------------------------------------------------------- /docs-todo/_neo4j.md: -------------------------------------------------------------------------------- 1 | # Neo4j 2 | 3 | > TODO 4 | 5 | * [Graph Databases](TODO) (2015) by Ian Robinson, Jim Webber, and Emil Eifrem (Book) 6 | 7 | Graph databases help leveraging complex and dynamic relationships in highly connected data to generate insight and competitive advantage. Connected data is data whose interpretation and value requires users first to understand the ways in which its constituent elements are related. 8 | 9 | > https://github.com/iansrobinson/graph-databases-use-cases 10 | 11 | **What Is a Graph?** 12 | 13 | A graph is just a collection of vertices and edges or, in different words, a set of nodes and the relationships that connect them. Graphs represent entities as nodes and the ways in which those entities relate to the world as relationships. 
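As a rough sketch of that model (plain Python data structures, not Neo4j or its driver API; the node names, labels and properties below are invented for illustration): nodes hold labels and properties, and each relationship is a typed, directed edge that can carry its own properties.

```python
# Property-graph sketch: entities as nodes, connections as typed relationships.
# Purely illustrative -- node names, labels and properties are made up.
nodes = {
    "alice":  {"labels": ["User"], "props": {"name": "Alice"}},
    "bob":    {"labels": ["User"], "props": {"name": "Bob"}},
    "devops": {"labels": ["Repo"], "props": {"name": "devops"}},
}

# (start, type, end, properties) -- relationships are first-class data, like edges in a graph database
relationships = [
    ("alice", "FOLLOWS", "bob", {}),
    ("alice", "STARRED", "devops", {"since": 2018}),
    ("bob", "CONTRIBUTES_TO", "devops", {"commits": 42}),
]

# answering "who is connected to the devops repo, and how?" is a traversal over edges
for start, rel_type, end, props in relationships:
    if end == "devops":
        print(f"{nodes[start]['props']['name']} -[:{rel_type} {props}]-> devops")
```

In a graph database the same question is expressed as a pattern match over relationships (e.g. Cypher's `MATCH`) rather than an explicit loop or a relational join.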
14 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/IntSumReducer.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | public class IntSumReducer extends Reducer { 10 | 11 | private IntWritable result = new IntWritable(); 12 | 13 | @Override 14 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 15 | int sum = 0; 16 | for (IntWritable value : values) { 17 | sum += value.get(); 18 | } 19 | result.set(sum); 20 | context.write(key, result); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: DevOps 2 | site_author: niqdev 3 | repo_url: https://github.com/niqdev/devops 4 | theme: readthedocs 5 | google_analytics: ['UA-68888222-4', 'niqdev.github.io'] 6 | 7 | nav: 8 | - Linux: linux.md 9 | - Docker: docker.md 10 | - Ansible: ansible.md 11 | - Cassandra: cassandra.md 12 | - ZooKeeper: zookeeper.md 13 | - Kafka: kafka.md 14 | - Hadoop: hadoop.md 15 | - Cloud: cloud.md 16 | - Kubernetes: kubernetes.md 17 | - System Design: system-design.md 18 | - Operating System: operating-system.md 19 | - Programming: programming.md 20 | - Other Resources: other-resources.md 21 | - Toolbox: toolbox.md 22 | - JVM (OLD): jvm.md 23 | - Scala (OLD): scala.md 24 | 25 | # disable search plugin 26 | #plugins: [] 27 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | 9 | 10 | 11 | mapreduce.jobhistory.address 12 | history.local:10020 13 | 14 | 15 | mapreduce.jobhistory.webapp.address 16 | history.local:19888 17 | 18 | 19 | 20 | yarn.app.mapreduce.am.staging-dir 21 | /mr-history 22 | 23 | 24 | -------------------------------------------------------------------------------- /aws/emr/application/logger.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | import os 4 | import logging 5 | from logging.handlers import TimedRotatingFileHandler 6 | 7 | class Logger(object): 8 | 9 | def __init__(self): 10 | self.log_path = app.config['LOG_PATH'] 11 | 12 | def init(self): 13 | # create directory if doesn't exist 14 | os.makedirs(os.path.dirname(self.log_path), exist_ok=True) 15 | 16 | formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(pathname)s:%(lineno)d] %(message)s") 17 | handler = TimedRotatingFileHandler(self.log_path, when='midnight', interval=1, backupCount=5) 18 | handler.setLevel(logging.DEBUG) 19 | handler.setFormatter(formatter) 20 | 21 | app.logger.addHandler(handler) 22 | app.logger.setLevel(logging.DEBUG) 23 | app.logger.debug('init logger') 24 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 60.0 11 | 0.8 12 | 120 13 | 14 | 15 | 40.0 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 
-------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/TokenizerMapper.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | import java.io.IOException; 8 | import java.util.StringTokenizer; 9 | 10 | public class TokenizerMapper extends Mapper { 11 | 12 | private final static IntWritable one = new IntWritable(1); 13 | private Text word = new Text(); 14 | 15 | @Override 16 | protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { 17 | StringTokenizer iterator = new StringTokenizer(value.toString()); 18 | 19 | while (iterator.hasMoreTokens()) { 20 | word.set(iterator.nextToken()); 21 | context.write(word, one); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /kafka/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM devops/base:latest 2 | #FROM niqdev/phusion-base:latest 3 | 4 | ARG SCALA_VERSION=2.12 5 | ARG KAFKA_VERSION=2.3.0 6 | 7 | ENV ZOOKEEPER_HOSTS="localhost:2181" 8 | ENV KAFKA_HOME "/opt/kafka" 9 | ENV PATH "$KAFKA_HOME/bin:$PATH" 10 | 11 | RUN apt-get install -y \ 12 | kafkacat && \ 13 | apt-get clean 14 | 15 | RUN curl https://www-eu.apache.org/dist/kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz | tar -xzf - -C /opt && \ 16 | ln -s /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} /opt/kafka && \ 17 | # bash expansion not working /var/log/{kafka,connect} 18 | mkdir -p /var/log/kafka /var/log/connect 19 | 20 | # update data directory 21 | RUN sed -i -r ' \ 22 | s/log.dirs=\/tmp\/kafka-logs/log.dirs=\/var\/lib\/kafka\/data/; \ 23 | ' /opt/kafka/config/server.properties 24 | 25 | ADD supervisor-kafka.ini /etc/supervisor/conf.d/kafka.conf 26 | ADD supervisor-connect.ini /etc/supervisor/conf.d/connect.conf 27 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | log4j.rootLogger=INFO, stdout, file 4 | 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c:%L)%n 8 | 9 | log4j.logger.kafka=ERROR, stdout 10 | log4j.logger.org.apache.zookeeper=ERROR, stdout 11 | log4j.logger.org.apache.kafka=ERROR, stdout 12 | log4j.logger.org.I0Itec.zkclient=ERROR, stdout 13 | log4j.additivity.kafka.server=false 14 | log4j.additivity.kafka.consumer.ZookeeperConsumerConnector=false 15 | 16 | log4j.appender.file=org.apache.log4j.RollingFileAppender 17 | log4j.appender.file.maxBackupIndex=10 18 | log4j.appender.file.maxFileSize=100MB 19 | log4j.appender.file.File=${schema-registry.log.dir}/schema-registry.log 20 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 21 | log4j.appender.file.layout.ConversionPattern=[%d] %p %m (%c)%n 22 | -------------------------------------------------------------------------------- /hadoop/example/spark/src/main/scala/com/github/niqdev/App.scala: -------------------------------------------------------------------------------- 1 | package com.github.niqdev 2 | 3 | import 
org.apache.spark.sql.SparkSession 4 | 5 | object App { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val spark = SparkSession.builder 9 | .appName("spark-github") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | val homeDir = System.getenv("HOME") 16 | val inputPath = s"file:$homeDir/github-archive/*.json" 17 | val outputDir = s"file:$homeDir/github-archive/output" 18 | val githubLog = spark.read.json(inputPath) 19 | val pushes = githubLog.filter("type = 'PushEvent'") 20 | 21 | pushes.printSchema 22 | println(s"all events: ${githubLog.count}") 23 | println(s"only pushes: ${pushes.count}") 24 | pushes.show(5) 25 | 26 | val grouped = pushes.groupBy("actor.login").count 27 | grouped.show(5) 28 | val ordered = grouped.orderBy(grouped("count").desc) 29 | ordered.show(5) 30 | 31 | ordered.write.format("json").save(outputDir) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /aws/emr/application/service/example_service.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | from datetime import datetime 4 | from flask import jsonify 5 | 6 | class ExampleService(object): 7 | 8 | def get_tasks(self, request): 9 | tasks = [ 10 | { 11 | 'id': 1, 12 | 'title': u'Buy groceries', 13 | 'description': u'Milk, Cheese, Pizza, Fruit, Tylenol', 14 | 'done': False 15 | }, 16 | { 17 | 'id': 2, 18 | 'title': u'Learn Python', 19 | 'description': u'Need to find a good Python tutorial on the web', 20 | 'done': False 21 | } 22 | ] 23 | app.logger.debug(request.method) 24 | app.logger.debug(request.url) 25 | #app.logger.debug('\n'.join('{}: {}'.format(k, v) for k, v in request.headers.items())) 26 | #app.logger.debug(request.body) 27 | return jsonify({ 28 | 'href': request.url, 29 | 'createdAt': datetime.utcnow().isoformat(), 30 | 'modifiedAt': datetime.utcnow().isoformat(), 31 | 'tasks': tasks 32 | }) 33 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/img/draw-io-ansible.xml: -------------------------------------------------------------------------------- 1 | 5ZhBk9ogFMc/TY7bkRCjHqu17aE9eej2yCaYUEmeg7jqfvq+JCSREDs7s1m1rQcH/vCA9wMeb+LRRXb8otg2/Q4xl54/io8e/eT5/nQ2wv9COFXCeDaphESJuJJIK6zECzeisUv2IuY7q6MGkFpsbTGCPOeRtjSmFBzsbmuQ9qxblnBHWEVMuuoPEevUuOVPWv0rF0laz0zCWdXyxKJNomCfm/k8n67LX9WcsXos4+guZTEcziS69OhCAeiqlB0XXBZoa2yV3ecLrc26Fc/1awxqi2cm97xecrkwfaphlO7wwmDk0fkhFZqvtiwqWg+4+6ilOpNYI1g0w3Gl+fHimkjjKR4gDhnX6oRdjMHYsDnZ1UO7EeOp0dKzTQjqjsxsftKM3ALAgmFwAeDk/niQkQ2E+C6RIOwhQskARKaO/zzGu2KqoHQKCeRMLlt1bhM6o8GPQj+elX8WXT6Mi1qO63o0FmWlbfvFtT6ZIMH2GlBq5/0GsLVYF8v7M2n0BvYqMr3MBmumEl6D698PxSXT4tke/S1sZ7dlO7kJ3PGV4JLwPzy5ZHQlur4TJ1m+E08I8ebhktjh0p+44bIJoefhMhwgWlIHS4650cPtH5GGwi2ojPup0LujQnuSjXejUp/Te8o1ukAIdYH4fdlXQ+lNRNxsdBelPGMPiidiV44WSpxx/qSwlOjG6V5k5DrIph1k17xZxI3DMUQbru4QVOB3QtDolaDoEKACB9SGrTfsL+BEQpdTb7o/BCY3VL8A3COk7q27JqSpA0lCxKQDBd3TtucYw2DDFyBBoZJDXiSdayFlR2JSJDlWI8SBl5nOC1gC5/hoGjIRx2XG2ofafjUGoN3NqYKeqxv00B7kUZg5tMvsgfy7uJ0nJXg33FhtvzaVbWdf9OjyNw== -------------------------------------------------------------------------------- /hadoop/file/spark/config/log4j.properties: -------------------------------------------------------------------------------- 1 | # set global logging severity to INFO (and upwards: WARN, ERROR, FATAL) 2 | log4j.rootCategory=INFO, console, file 3 | 4 | # console config (restrict only to ERROR and FATAL) 5 | 
log4j.appender.console=org.apache.log4j.ConsoleAppender 6 | log4j.appender.console.target=System.err 7 | log4j.appender.console.threshold=ERROR 8 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 10 | 11 | # file config 12 | log4j.appender.file=org.apache.log4j.RollingFileAppender 13 | log4j.appender.file.File=/vol/spark/log/info.log 14 | log4j.appender.file.MaxFileSize=5MB 15 | log4j.appender.file.MaxBackupIndex=10 16 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 18 | 19 | # settings to quiet third party logs that are too verbose 20 | log4j.logger.org.apache.spark.repl.Main=WARN 21 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 22 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 23 | log4j.logger.org.apache.spark=WARN 24 | log4j.logger.org.apache.hadoop=WARN 25 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * http://hadoop.apache.org/docs/r2.7.5/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html 15 | */ 16 | public class WordCount { 17 | 18 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 19 | Configuration conf = new Configuration(); 20 | Job job = Job.getInstance(conf, "word-count"); 21 | 22 | job.setJarByClass(WordCount.class); 23 | job.setMapperClass(TokenizerMapper.class); 24 | job.setCombinerClass(IntSumReducer.class); 25 | job.setReducerClass(IntSumReducer.class); 26 | 27 | job.setOutputKeyClass(Text.class); 28 | job.setOutputValueClass(IntWritable.class); 29 | 30 | FileInputFormat.addInputPath(job, new Path(args[0])); 31 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 32 | 33 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /cassandra/cql/example_create.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS example; 2 | 3 | CREATE KEYSPACE IF NOT EXISTS example WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}; 4 | 5 | USE example; 6 | 7 | CREATE TABLE IF NOT EXISTS messages ( 8 | id uuid PRIMARY KEY, 9 | body text, 10 | created_at timestamp, 11 | updated_at timestamp 12 | ); 13 | 14 | ALTER TABLE messages ADD title text; 15 | 16 | CREATE TABLE IF NOT EXISTS example.counters ( 17 | id timeuuid PRIMARY KEY, 18 | total counter 19 | ); 20 | 21 | CREATE TABLE IF NOT EXISTS example.users ( 22 | first_name text PRIMARY KEY, 23 | last_name text, 24 | last_ip inet, 25 | any_value blob, 26 | enable boolean 27 | ); 28 | 29 | ALTER TABLE example.users ADD emails set; 30 | 31 | ALTER TABLE example.users ADD phone_numbers list; 32 | 33 | ALTER TABLE example.users ADD login_sessions map; 34 | 35 | CREATE TYPE example.address ( 36 | street text, 37 | city text, 38 | state text, 39 | zip_code int 40 | ); 41 | 42 | -- frozen: user-defined data type is considered a collection 43 | ALTER TABLE example.users ADD addresses map>; 44 | 45 | -- avoid secondary indexes 46 | CREATE INDEX users_last_name_idx ON example.users ( last_name ); 47 | 48 | CREATE CUSTOM INDEX users_last_name_sasi_idx ON example.users ( last_name ) 49 | USING 'org.apache.cassandra.index.sasi.SASIIndex'; 50 | 51 | CREATE INDEX ON example.users ( emails ); 52 | -------------------------------------------------------------------------------- /kafka/docker-compose-hub.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | zookeeper: 6 | container_name: devops-zookeeper 7 | image: niqdev/zookeeper:latest 8 | ports: 9 | - 2181:2181 10 | hostname: zookeeper 11 | networks: 12 | - devops_network 13 | 14 | kafka: 15 | container_name: devops-kafka 16 | image: niqdev/kafka:latest 17 | ports: 18 | - 9092:9092 19 | - 8083:8083 20 | hostname: kafka 21 | networks: 22 | - devops_network 23 | environment: 24 | - ZOOKEEPER_HOSTS="zookeeper:2181" 25 | 26 | schema-registry: 27 | container_name: devops-schema-registry 28 | image: confluentinc/cp-schema-registry 29 | depends_on: 30 | - kafka 31 | ports: 32 | - 8081:8081 33 | hostname: schema-registry 34 | networks: 35 | - devops_network 36 | environment: 37 | - SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181 38 | - SCHEMA_REGISTRY_HOST_NAME=schema-registry 39 | - SCHEMA_REGISTRY_LISTENERS=http://schema-registry:8081 40 | 41 | schema-registry-ui: 42 | container_name: devops-schema-registry-ui 43 | image: landoop/schema-registry-ui 44 | depends_on: 45 | - schema-registry 46 | ports: 47 | - 8082:8000 48 | hostname: schema-registry-ui 49 | networks: 50 | - devops_network 51 | environment: 52 | - SCHEMAREGISTRY_URL=http://schema-registry:8081 53 | - PROXY=true 54 | 55 | networks: 56 | devops_network: 57 | -------------------------------------------------------------------------------- /cassandra/docker-compose-cluster.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | 3 | services: 4 | 5 | cassandra-seed: 6 | container_name: devops-cassandra-seed 7 | image: cassandra:3.11 8 | restart: unless-stopped 9 | ports: 10 | - 19042:9042 11 | networks: 12 | - network_cluster 13 | volumes: 14 | - ./cql:/cql 15 | #- 
.cassandra/cassandra-seed/data:/var/lib/cassandra 16 | #- .cassandra/cassandra-seed/log:/var/log/cassandra 17 | 18 | cassandra-node-1: 19 | container_name: devops-cassandra-node-1 20 | image: cassandra:3.11 21 | depends_on: 22 | - cassandra-seed 23 | restart: unless-stopped 24 | ports: 25 | - 19043:9042 26 | networks: 27 | - network_cluster 28 | environment: 29 | CASSANDRA_SEEDS: "devops-cassandra-seed" 30 | volumes: 31 | - ./cql:/cql 32 | #- .cassandra/cassandra-node-1/data:/var/lib/cassandra 33 | #- .cassandra/cassandra-node-1/log:/var/log/cassandra 34 | 35 | cassandra-node-2: 36 | container_name: devops-cassandra-node-2 37 | image: cassandra:3.11 38 | depends_on: 39 | - cassandra-seed 40 | restart: unless-stopped 41 | ports: 42 | - 19044:9042 43 | networks: 44 | - network_cluster 45 | environment: 46 | CASSANDRA_SEEDS: "devops-cassandra-seed" 47 | volumes: 48 | - ./cql:/cql 49 | #- .cassandra/cassandra-node-2/data:/var/lib/cassandra 50 | #- .cassandra/cassandra-node-2/log:/var/log/cassandra 51 | 52 | networks: 53 | network_cluster: 54 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/README.md: -------------------------------------------------------------------------------- 1 | # schema-registry 2 | 3 | ```bash 4 | cd ansible 5 | 6 | # setup 7 | ./setup_share.sh 8 | vagrant up 9 | 10 | # setup docker 11 | vagrant ssh ansible 12 | ansible-playbook /ansible/site.yml -t docker 13 | 14 | # (local) copy docker compose manually 15 | cp data/roles/schema-registry/docker-compose-local.yml .share/node-1/docker-compose-local.yml 16 | 17 | vagrant ssh node-1 18 | # update hosts 19 | echo -e "# docker images\n127.0.1.1 zookeeper\n127.0.1.1 kafka\n" | sudo tee -a /etc/hosts 20 | # start docker 21 | sudo -i -u docker 22 | docker-compose -f /data/docker-compose-local.yml up 23 | 24 | # setup schema registry 25 | vagrant ssh ansible 26 | ansible-playbook /ansible/site.yml -t schema-registry 27 | 28 | # verify schema registry 29 | vagrant ssh node-1 30 | sudo systemctl start confluent-schema-registry 31 | sudo systemctl status confluent-schema-registry 32 | sudo journalctl -u confluent-schema-registry -b 33 | sudo journalctl -ru confluent-schema-registry --no-pager 34 | ll /etc/schema-registry/ 35 | ll /var/log/confluent/schema-registry/ 36 | ll /home/cp-schema-registry/logs/ 37 | less +G /var/log/confluent/schema-registry/schema-registry.log 38 | tail -F /var/log/confluent/schema-registry/schema-registry.log 39 | 40 | # (local) examples 41 | http -v 192.168.100.11:8081/subjects 42 | 43 | # check running services 44 | sudo netstat -ltp 45 | 46 | # check user 47 | ps -ef | grep schema 48 | cat /etc/passwd 49 | 50 | # verify zookeeper 51 | docker exec -it my-local-zookeeper bash 52 | zkCli.sh 53 | get /brokers/ids/0 54 | ``` 55 | -------------------------------------------------------------------------------- /aws/emr/application/api/emr_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.service.emr_service import EmrService 3 | 4 | import json 5 | from datetime import datetime 6 | from flask import jsonify, request 7 | 8 | emr_service = EmrService() 9 | 10 | # TODO env|region|config-name (yaml) 11 | 12 | @app.route('/v1/emr/clusters/create', methods=['POST']) 13 | def route_clusters_create(): 14 | data = emr_service.create_cluster() 15 | return __build_response(request, data) 16 | 17 | @app.route('/v1/emr/clusters/destroy', methods=['POST']) 18 | 
def route_clusters_destroy(): 19 | data = emr_service.destroy_cluster() 20 | return __build_response(request, data) 21 | 22 | @app.route('/v1/emr/clusters/info') 23 | def route_clusters_info(): 24 | data = emr_service.info_cluster() 25 | return __build_response(request, data) 26 | 27 | def __build_response(request, data = {}, debug = True): 28 | """ 29 | Build Response 30 | """ 31 | 32 | if debug: 33 | data_request = { 34 | 'url': request.url, 35 | 'method': request.method, 36 | 'headers': dict(request.headers), 37 | 'params': request.args 38 | } 39 | data_response = { 40 | #'params': request.params, 41 | #'body': request.body, 42 | 'data': data 43 | } 44 | return jsonify({ 45 | 'timestamp': datetime.utcnow().isoformat(), 46 | 'request': data_request, 47 | 'response': data_response 48 | }) 49 | else: 50 | return jsonify(data) 51 | -------------------------------------------------------------------------------- /hadoop/file/oozie/config/oozie-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | oozie.service.HadoopAccessorService.hadoop.configurations 7 | *=/usr/local/hadoop/etc/hadoop 8 | 9 | 10 | 11 | 45 | 46 | -------------------------------------------------------------------------------- /aws/emr/application/api/example_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.service.example_service import ExampleService 3 | 4 | from flask import render_template, request, abort, redirect, url_for 5 | 6 | example_service = ExampleService() 7 | 8 | # http://127.0.0.1:5000/static/example.txt 9 | 10 | # http://127.0.0.1:5000 11 | @app.route('/') 12 | def index(): 13 | app.logger.debug('A value for debugging') 14 | app.logger.warning('A warning occurred (%d apples)', 42) 15 | app.logger.error('An error occurred') 16 | return 'Hello, World!' 
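# usage sketch (illustration only, not part of the original module): assuming the app
# is served locally on port 5000 (the exact entrypoint, e.g. main.py or dev.sh, is not
# shown here), the routes in this module behave roughly as follows with httpie:
#
#   http :5000/                  ->  Hello, World!
#   http ':5000/query?key=aaa'   ->  METHOD aaa
#   http :5000/hello/flask       ->  hello.html rendered with name=flask
#   http :5000/error             ->  401 raised via abort()
#   http :5000/v1/tasks          ->  JSON payload built by ExampleService.get_tasks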
17 | 18 | # default is GET only 19 | # http://127.0.0.1:5000/query?key=aaa 20 | @app.route('/query', methods=['GET', 'POST']) 21 | def query_param(): 22 | return 'METHOD %s' % request.args.get('key', '') 23 | 24 | # http://127.0.0.1:5000/path/TODO/hello 25 | @app.route('/path//hello') 26 | @app.route('/path//hello/') 27 | def path_param(param): 28 | return 'param %s' % param 29 | 30 | # http://127.0.0.1:5000/hello/name 31 | @app.route('/hello/') 32 | @app.route('/hello/') 33 | def hello(name=None): 34 | return render_template('hello.html', name=name) 35 | 36 | # http://127.0.0.1:5000/redirect 37 | @app.route('/redirect') 38 | def my_redirect(): 39 | return redirect(url_for('error')) 40 | 41 | # http://127.0.0.1:5000/error 42 | @app.route('/error') 43 | def error(): 44 | abort(401) 45 | 46 | # http://127.0.0.1:5000/xxx 47 | @app.errorhandler(404) 48 | def page_not_found(error): 49 | return render_template('page_not_found.html'), 404 50 | 51 | @app.teardown_appcontext 52 | def teardown(error): 53 | app.logger.debug('after each request') 54 | 55 | # http://127.0.0.1:5000/v1/tasks 56 | @app.route('/v1/tasks', methods=['GET']) 57 | def get_tasks(): 58 | return example_service.get_tasks(request) 59 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | yarn.nodemanager.aux-services 6 | mapreduce_shuffle 7 | 8 | 9 | 10 | 11 | yarn.resourcemanager.scheduler.class 12 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler 13 | 14 | 15 | yarn.scheduler.fair.preemption 16 | true 17 | 18 | 19 | 20 | 21 | yarn.log-aggregation-enable 22 | true 23 | 24 | 25 | yarn.nodemanager.log-dirs 26 | /vol/hadoop/log/yarn 27 | 28 | 29 | 30 | yarn.nodemanager.remote-app-log-dir 31 | /yarn/app 32 | 33 | 34 | yarn.nodemanager.remote-app-log-dir-suffix 35 | logs 36 | 37 | 38 | 39 | 40 | yarn.log.server.url 41 | http://history.local:19888/jobhistory/logs 42 | 43 | 44 | 45 | yarn.resourcemanager.hostname 46 | resource-manager.local 47 | 48 | 49 | 50 | yarn.web-proxy.address 51 | web-proxy.local:8100 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /docs-todo/_spark.md: -------------------------------------------------------------------------------- 1 | # Spark 2 | 3 | Spark wasn't made with Online Transaction Processing (OLTP) applications in mind (fast, numerous, atomic transactions). 4 | It's better suited for Online Analytical Processing (OLAP): batch jobs and data mining. 5 | 6 | MapReduce job results need to be stored in HDFS before they can be used by another job. 7 | For this reason, MapReduce is inherently bad with iterative algorithms. 8 | Furthermore, many kinds of problems don’t easily fit MapReduce’s two-step paradigm. 9 | 10 | There are two types of RDD operations: transformations and actions. 11 | Transformations (for example, filter or map) are operations that produce a new RDD by performing some useful data manipulation on another RDD. 12 | Actions (for example, count or foreach) trigger a computation in order to return the result to the calling program or to perform some actions on an RDD's elements. 13 | 14 | It's important to understand that transformations are evaluated lazily, meaning computation doesn't take place until you invoke an action. 15 | 16 | Data partitioning is Spark’s mechanism for dividing data between multiple nodes in a cluster. 
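To make the distinction between lazy transformations, actions and partitions concrete, here is a minimal `spark-shell` sketch (illustration only; it assumes a local Spark installation and is independent of the cluster setup elsewhere in this repo):

```bash
# run everything on one machine and feed a short Scala snippet to the REPL
spark-shell --master 'local[*]' <<'EOF'
val nums    = sc.parallelize(1 to 1000, 4)   // RDD split into 4 partitions
val evens   = nums.filter(_ % 2 == 0)        // transformation: nothing runs yet
val squares = evens.map(n => n * n)          // transformation: still lazy
println(squares.count())                     // action: triggers the computation
println(squares.getNumPartitions)            // 4
val fewer   = squares.repartition(2)         // moving data between partitions -> shuffle
println(fewer.getNumPartitions)              // 2
EOF
```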
17 | 18 | Physical movement of data between partitions is called shuffling. 19 | It occurs when data from multiple partitions needs to be combined in order to build partitions for a new RDD. 20 | When grouping elements by key, for example, Spark needs to examine all of the RDD's partitions, find elements with the same key, and then physically group them, thus forming new partitions. 21 | 22 | The spark.shuffle.consolidateFiles parameter specifies whether to consolidate intermediate files created during a shuffle. 23 | For performance reasons, we recommend that you change this to true (the default value is false ) if you’re using an ext4 or XFS filesystem. 24 | 25 | Coalesce is used for either reducing or increasing the number of partitions and force shuffling 26 | -------------------------------------------------------------------------------- /ansible/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | BOX_IMAGE = "ubuntu/bionic64" 5 | NODE_COUNT = 3 6 | PATH_SHARE = ".share" 7 | 8 | Vagrant.configure("2") do |config| 9 | 10 | config.vm.box = BOX_IMAGE 11 | 12 | config.vm.provider "virtualbox" do |vb| 13 | vb.memory = 2048 14 | vb.cpus = 2 15 | end 16 | 17 | config.vm.define "ansible" do |ansible| 18 | ansible.vm.hostname = "ansible" 19 | ansible.vm.network "private_network", ip: "192.168.100.10" 20 | ansible.vm.synced_folder ENV['HOME'], "/local" 21 | ansible.vm.provision "file", source: PATH_SHARE + "/ssh/ansible_rsa", destination: "$HOME/.ssh/id_rsa" 22 | ansible.vm.provision "shell", path: "setup_ansible.sh" 23 | ansible.vm.synced_folder "data/", "/ansible" 24 | ansible.vm.provision "shell", inline: <<-SHELL 25 | # default syncronized inventory 26 | ln -sf /ansible/hosts /etc/ansible/hosts 27 | SHELL 28 | end 29 | 30 | (1..NODE_COUNT).each do |i| 31 | config.vm.define "node-#{i}" do |node| 32 | node.vm.hostname = "ip-192-168-100-#{i + 10}" 33 | node.vm.network :private_network, ip: "192.168.100.#{i + 10}" 34 | node.vm.synced_folder PATH_SHARE + "/node-#{i}", "/data" 35 | end 36 | end 37 | 38 | # give ssh access to each machine 39 | config.vm.provision "file", source: PATH_SHARE + "/ssh/ansible_rsa.pub", destination: "/tmp/ansible_rsa.pub" 40 | config.vm.provision "shell", inline: <<-SHELL 41 | # append ansible key to avoid lose vagrant key with copy 42 | cat /tmp/ansible_rsa.pub >> .ssh/authorized_keys 43 | rm /tmp/ansible_rsa.pub 44 | SHELL 45 | 46 | config.vm.provision "shell", inline: <<-SHELL 47 | # update 48 | apt-get update 49 | # required python2 missing on ubuntu-18 50 | apt-get install -y python2.7 python-pip 51 | # dns 52 | apt-get install -y avahi-daemon libnss-mdns 53 | SHELL 54 | end 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DevOps 2 | 3 | [![github-pages](https://github.com/niqdev/devops/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/niqdev/devops/actions/workflows/gh-pages.yml) 4 | 5 | A collection of notes, resources, documentation and POCs mainly related to distributed systems for local development, learning purposes and quick prototyping. 
6 | 7 | * [Linux](https://niqdev.github.io/devops/linux) 8 | * [Docker](https://niqdev.github.io/devops/docker) 9 | * [Ansible](https://niqdev.github.io/devops/ansible) 10 | * [Cassandra](https://niqdev.github.io/devops/cassandra) 11 | * [ZooKeeper](https://niqdev.github.io/devops/zookeeper) 12 | * [Kafka](https://niqdev.github.io/devops/kafka) 13 | * [Hadoop](https://niqdev.github.io/devops/hadoop) 14 | * [HDFS and MapReduce](https://niqdev.github.io/devops/hadoop/#hdfs-and-mapreduce) 15 | * [Spark](https://niqdev.github.io/devops/hadoop/#spark) 16 | * [Zeppelin](https://niqdev.github.io/devops/hadoop/#zeppelin) 17 | * [Oozie](https://niqdev.github.io/devops/hadoop/#oozie) 18 | * [Cloud](https://niqdev.github.io/devops/cloud) 19 | * [Kubernetes](https://niqdev.github.io/devops/kubernetes) 20 | * [System Design](https://niqdev.github.io/devops/system-design) 21 | * [Operating System](https://niqdev.github.io/devops/operating-system) 22 | * [Programming](https://niqdev.github.io/devops/programming) 23 | * [Other Resources](https://niqdev.github.io/devops/other-resources) 24 | * [Toolbox](https://niqdev.github.io/devops/toolbox) 25 | 26 | ## Development 27 | 28 | Ubuntu 29 | 30 | ```bash 31 | # install pip3 32 | sudo apt install -y python3-pip 33 | 34 | # install virtualenv globally 35 | sudo pip3 install virtualenv 36 | 37 | # create virtualenv 38 | virtualenv -p $(which python3) venv 39 | 40 | # how-to activate virtualenv 41 | source venv/bin/activate 42 | 43 | # verify virtualenv 44 | which python 45 | python --version 46 | 47 | # how-to deactivate virtualenv 48 | deactivate 49 | 50 | # install new package 51 | pip install mkdocs 52 | 53 | # update requirements 54 | pip freeze > requirements.txt 55 | 56 | # run locally 57 | # http://localhost:8000 58 | mkdocs serve 59 | ``` 60 | -------------------------------------------------------------------------------- /hadoop/script/setup_zeppelin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | ZEPPELIN_VERSION="0.7.3" 18 | ZEPPELIN_NAME="zeppelin-$ZEPPELIN_VERSION-bin-all" 19 | 20 | ############################## 21 | 22 | function download_dist { 23 | local ZEPPELIN_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/zeppelin/zeppelin-$ZEPPELIN_VERSION/$ZEPPELIN_NAME.tgz" 24 | echo "[*] download dist" 25 | wget -q -P $DATA_PATH $ZEPPELIN_MIRROR_DOWNLOAD 26 | } 27 | 28 | function setup_dist { 29 | local ZEPPELIN_DIST_PATH="$DATA_PATH/$ZEPPELIN_NAME*" 30 | echo "[*] setup dist" 31 | 32 | if [ ! 
-e $ZEPPELIN_DIST_PATH ]; then 33 | download_dist 34 | fi 35 | 36 | tar -xf $ZEPPELIN_DIST_PATH -C /opt 37 | ln -s /opt/$ZEPPELIN_NAME /usr/local/zeppelin 38 | chown -R $USER_NAME:$USER_NAME /opt/$ZEPPELIN_NAME 39 | } 40 | 41 | function setup_config { 42 | local DATA_PATH_GUEST="/vol/zeppelin" 43 | local ZEPPELIN_BASE_PATH="/usr/local/zeppelin" 44 | local CONFIG_PATH="$ZEPPELIN_BASE_PATH/conf" 45 | local FILES=( "zeppelin-env.sh" ) 46 | 47 | echo "[*] create directories" 48 | mkdir -pv \ 49 | $DATA_PATH_GUEST/{log,notebook} 50 | 51 | for FILE in "${FILES[@]}" 52 | do 53 | echo "[*] update config: $FILE" 54 | # backup only if exists 55 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 56 | cp $FILE_PATH/zeppelin/config/$FILE $CONFIG_PATH/$FILE 57 | done 58 | 59 | echo "[*] update permissions" 60 | chown -R $USER_NAME:$USER_NAME \ 61 | $ZEPPELIN_BASE_PATH/ 62 | 63 | echo "[*] update env" 64 | cp $FILE_PATH/zeppelin/profile-zeppelin.sh /etc/profile.d/profile-zeppelin.sh && \ 65 | source /etc/profile.d/profile-zeppelin.sh 66 | } 67 | 68 | function main { 69 | echo "[+] setup zeppelin" 70 | setup_dist 71 | setup_config 72 | echo "[-] setup zeppelin" 73 | } 74 | 75 | main 76 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/files/schema-registry.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Confluent Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # >>>>>>>>>> https://docs.confluent.io/current/schema-registry/docs/config.html 16 | 17 | # The address the socket server listens on. 18 | # FORMAT: 19 | # listeners = listener_name://host_name:port 20 | # EXAMPLE: 21 | # listeners = PLAINTEXT://your.host.name:9092 22 | listeners=http://0.0.0.0:8081 23 | 24 | # Zookeeper connection string for the Zookeeper cluster used by your Kafka cluster 25 | # (see zookeeper docs for details). 26 | # This is a comma separated host:port pairs, each corresponding to a zk 27 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 28 | #kafkastore.connection.url=localhost:2181 29 | 30 | # Alternatively, Schema Registry can now operate without Zookeeper, handling all coordination via 31 | # Kafka brokers. Use this setting to specify the bootstrap servers for your Kafka cluster and it 32 | # will be used both for selecting the master schema registry instance and for storing the data for 33 | # registered schemas. 34 | # (Note that you cannot mix the two modes; use this mode only on new deployments or by shutting down 35 | # all instances, switching to the new configuration, and then starting the schema registry 36 | # instances again.) 
37 | kafkastore.bootstrap.servers=PLAINTEXT://localhost:9092 38 | 39 | # The name of the topic to store schemas in 40 | kafkastore.topic=_schemas 41 | 42 | # If true, API requests that fail will include extra debugging information, including stack traces 43 | debug=true 44 | 45 | kafkastore.topic.replication.factor=1 46 | -------------------------------------------------------------------------------- /hadoop/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | BOX_IMAGE = "ubuntu/bionic64" 5 | NODE_COUNT = 3 6 | 7 | DATA_PATH_HOST = ".data" 8 | DATA_PATH_GUEST = "/vol" 9 | KEY_PATH = DATA_PATH_HOST + "/hadoop_rsa" 10 | HADOOP_ID = "1101" 11 | 12 | VAGRANT_COMMAND = ARGV[0] 13 | Vagrant.configure("2") do |config| 14 | 15 | config.vm.box = BOX_IMAGE 16 | 17 | if VAGRANT_COMMAND == "ssh" 18 | config.ssh.username = "hadoop" 19 | config.ssh.private_key_path = KEY_PATH 20 | end 21 | 22 | config.vm.define "master" do |node| 23 | node.vm.hostname = "master" 24 | 25 | node.vm.provider "virtualbox" do |vb| 26 | vb.memory = 4096 27 | vb.cpus = 4 28 | end 29 | 30 | node.vm.network "private_network", ip: "172.16.0.10" 31 | # NameNode 32 | node.vm.network "forwarded_port", guest: 50070, host: 50070 33 | # ResourceManager 34 | node.vm.network "forwarded_port", guest: 8088, host: 8088 35 | # MapReduce Job History server 36 | node.vm.network "forwarded_port", guest: 19888, host: 19888 37 | # Spark 38 | node.vm.network "forwarded_port", guest: 4040, host: 4040 39 | # Oozie 40 | node.vm.network "forwarded_port", guest: 11000, host: 11000 41 | 42 | node.vm.synced_folder ENV['HOME'], "/local" 43 | # synced_folder permission issue https://github.com/hashicorp/vagrant/issues/936 44 | node.vm.synced_folder DATA_PATH_HOST + "/master", DATA_PATH_GUEST, 45 | mount_options: ["uid=" + HADOOP_ID, "gid=" + HADOOP_ID] 46 | end 47 | 48 | (1..NODE_COUNT).each do |i| 49 | config.vm.define "node-#{i}" do |node| 50 | node.vm.hostname = "node-#{i}" 51 | 52 | node.vm.provider "virtualbox" do |vb| 53 | vb.memory = 1024 54 | vb.cpus = 1 55 | end 56 | 57 | node.vm.network :private_network, ip: "172.16.0.#{i + 100}" 58 | 59 | node.vm.synced_folder DATA_PATH_HOST + "/node-#{i}", DATA_PATH_GUEST, 60 | mount_options: ["uid=" + HADOOP_ID, "gid=" + HADOOP_ID] 61 | end 62 | end 63 | 64 | config.vm.provision "shell", path: "./script/setup_ubuntu.sh" 65 | config.vm.provision "shell", path: "./script/setup_hadoop.sh" 66 | config.vm.provision "shell", path: "./script/setup_spark.sh" 67 | config.vm.provision "shell", run: "always", inline: <<-SHELL 68 | su --login hadoop /vagrant/script/bootstrap.sh 69 | SHELL 70 | end 71 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # https://docs.confluent.io/current/installation/installing_cp/deb-ubuntu.html 4 | # https://docs.confluent.io/current/schema-registry/docs/index.html 5 | 6 | - name: create group 7 | become: yes 8 | group: 9 | name: "{{ schema.registry.group }}" 10 | 11 | - name: create user 12 | become: yes 13 | user: 14 | name: "{{ schema.registry.user }}" 15 | comment: "Schema Registry User" 16 | system: yes 17 | group: "{{ schema.registry.group }}" 18 | 19 | - name: create and update directories 20 | become: yes 21 | file: 22 | path: "{{ item }}" 23 | state: directory 24 | owner: "{{ schema.registry.user }}" 25 
| group: "{{ schema.registry.group }}" 26 | recurse: yes 27 | with_items: 28 | - "/home/{{ schema.registry.user }}" 29 | - "/home/{{ schema.registry.user }}/logs" 30 | - /var/log/confluent/schema-registry 31 | 32 | - name: install confluent public key 33 | become: yes 34 | apt_key: 35 | url: https://packages.confluent.io/deb/5.0/archive.key 36 | state: present 37 | 38 | - name: add confluent repository 39 | become: yes 40 | apt_repository: 41 | repo: 'deb [arch=amd64] https://packages.confluent.io/deb/5.0 stable main' 42 | state: present 43 | 44 | - name: install confluent schema registry 45 | become: yes 46 | apt: 47 | name: confluent-schema-registry 48 | update_cache: yes 49 | 50 | - name: symlink /var/log/confluent/schema-registry to /home/{{ schema.registry.user }}/logs 51 | become: yes 52 | #become_user: "{{ schema.registry.user }}" 53 | file: 54 | src: /var/log/confluent/schema-registry 55 | dest: /home/{{ schema.registry.user }}/logs 56 | state: link 57 | force: yes 58 | owner: "{{ schema.registry.user }}" 59 | group: "{{ schema.registry.group }}" 60 | 61 | - name: copy configs 62 | become: yes 63 | copy: 64 | src: "{{ item }}" 65 | dest: "/etc/schema-registry/{{ item }}" 66 | owner: "{{ schema.registry.user }}" 67 | group: "{{ schema.registry.group }}" 68 | mode: u=rwx,g=r,o=r 69 | with_items: 70 | - log4j.properties 71 | - schema-registry.properties 72 | # notify: 73 | # - restart schema-registry 74 | 75 | - name: start confluent schema registry 76 | become: yes 77 | #become_user: "{{ schema.registry.user }}" 78 | systemd: 79 | state: started 80 | name: confluent-schema-registry 81 | -------------------------------------------------------------------------------- /hadoop/script/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | USER_NAME="hadoop" 14 | 15 | PARAM_SERVICE_NAME=${1:-"all"} 16 | 17 | ############################## 18 | 19 | function start_hadoop { 20 | local HOSTNAME=$(hostname) 21 | echo "[*] start hadoop on [$HOSTNAME]" 22 | hadoop version 23 | 24 | case $HOSTNAME in 25 | "master") 26 | hadoop-daemon.sh --script hdfs start namenode 27 | hadoop-daemon.sh --script hdfs start secondarynamenode 28 | yarn-daemon.sh start resourcemanager 29 | yarn-daemon.sh start proxyserver 30 | mr-jobhistory-daemon.sh start historyserver 31 | ;; 32 | *) 33 | hadoop-daemons.sh --script hdfs start datanode 34 | yarn-daemons.sh start nodemanager 35 | ;; 36 | esac 37 | 38 | jps 39 | } 40 | 41 | function start_spark { 42 | echo "[*] start spark" 43 | spark-shell --version 44 | 45 | # set "spark.eventLog.enabled" to "true" 46 | # History Server expects to find a file named APPLICATION_COMPLETE 47 | # in the applicationSs directory (/tmp/spark-events/ by default) 48 | /usr/local/spark/sbin/start-history-server.sh 49 | } 50 | 51 | function start_zeppelin { 52 | # check if exists 53 | if [ -x "$(command -v zeppelin.sh)" ]; then 54 | echo "[*] start zeppelin" 55 | zeppelin-daemon.sh start 56 | fi 57 | } 58 | 59 | function start_oozie { 60 | # check if exists 61 | if [ -x "$(command -v oozie)" ]; then 62 | echo "[*] start oozie" 63 | oozied.sh start 64 | oozie admin -oozie http://oozie.local:11000/oozie -status 65 | fi 66 | } 67 | 68 | function start_all { 69 | start_hadoop 
70 | start_spark 71 | start_zeppelin 72 | start_oozie 73 | } 74 | 75 | function main { 76 | echo "[+] boostrap" 77 | local SERVICE_NAME=$(echo "${PARAM_SERVICE_NAME}" | awk '{print toupper($0)}') 78 | 79 | case $SERVICE_NAME in 80 | "ZEPPELIN") 81 | start_zeppelin 82 | ;; 83 | "OOZIE") 84 | start_oozie 85 | ;; 86 | "ALL") 87 | start_all 88 | ;; 89 | *) 90 | echo "[-] invalid parameters" 91 | ;; 92 | esac 93 | echo "[-] boostrap" 94 | } 95 | 96 | if [ $USER_NAME == "$(whoami)" ]; then 97 | main 98 | else 99 | echo "[-] execute as [$USER_NAME] user only" 100 | fi 101 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /hadoop/script/setup_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | HADOOP_VERSION="2.7.6" 18 | HADOOP_NAME="hadoop-$HADOOP_VERSION" 19 | 20 | ############################## 21 | 22 | function download_dist { 23 | local HADOOP_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/hadoop/common/$HADOOP_NAME/$HADOOP_NAME.tar.gz" 24 | echo "[*] download dist" 25 | wget -q -P $DATA_PATH $HADOOP_MIRROR_DOWNLOAD 26 | } 27 | 28 | function setup_dist { 29 | local HADOOP_DIST_PATH="$DATA_PATH/$HADOOP_NAME*" 30 | echo "[*] setup dist" 31 | 32 | if [ ! -e $HADOOP_DIST_PATH ]; then 33 | download_dist 34 | fi 35 | 36 | tar -xzf $HADOOP_DIST_PATH -C /opt 37 | ln -s /opt/$HADOOP_NAME /usr/local/hadoop 38 | chown -R $USER_NAME:$USER_NAME /opt/$HADOOP_NAME 39 | } 40 | 41 | function setup_config { 42 | local DATA_PATH_GUEST="/vol/hadoop" 43 | local HADOOP_BASE_PATH="/usr/local/hadoop" 44 | local CONFIG_PATH="$HADOOP_BASE_PATH/etc/hadoop" 45 | local FILES=( "core-site.xml" "hdfs-site.xml" "mapred-site.xml" "yarn-site.xml" "fair-scheduler.xml" "masters" "slaves" ) 46 | 47 | echo "[*] create directories" 48 | mkdir -pv \ 49 | $DATA_PATH_GUEST/{namenode,secondary,datanode} \ 50 | $DATA_PATH_GUEST/log/{hadoop,yarn,mapred} 51 | 52 | for FILE in "${FILES[@]}" 53 | do 54 | echo "[*] update config: $FILE" 55 | # backup only if exists 56 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 57 | cp $FILE_PATH/hadoop/config/$FILE $CONFIG_PATH/$FILE 58 | done 59 | 60 | echo "[*] update permissions" 61 | # important final slash to be recursive 62 | chown -R $USER_NAME:$USER_NAME \ 63 | $HADOOP_BASE_PATH/ \ 64 | $DATA_PATH_GUEST/ 65 | 66 | echo "[*] update env" 67 | cp $FILE_PATH/hadoop/profile-hadoop.sh /etc/profile.d/profile-hadoop.sh && \ 68 | source /etc/profile.d/profile-hadoop.sh 69 | } 70 | 71 | function init_hdfs { 72 | local HOSTNAME=$(hostname) 73 | echo "[*] init hdfs: $HOSTNAME" 74 | hadoop version 75 | 76 | case $HOSTNAME in 77 | "master") 78 | sudo -i -u $USER_NAME hdfs namenode -format 79 | ;; 80 | *) 81 | # nothing to do 82 | ;; 83 | esac 84 | } 85 | 86 | function main { 87 | echo "[+] setup hadoop" 88 | setup_dist 89 | setup_config 90 | init_hdfs 91 | echo "[-] setup hadoop" 92 | } 93 | 94 | main 95 | -------------------------------------------------------------------------------- /docs/other-resources.md: -------------------------------------------------------------------------------- 1 | # Other resources 2 | 3 | ## Computer Science 4 | 5 | * [CS 101: Introduction to Computing Principles](https://web.stanford.edu/class/cs101/) 6 | * [Stanford CS Education Library](http://cslibrary.stanford.edu) 7 | * [Foundations of Computer Science](http://infolab.stanford.edu/~ullman/focs.html) 8 | * [Computer Networks From Scratch](https://www.networksfromscratch.com) 9 | * [Code With Engineering Playbook](https://microsoft.github.io/code-with-engineering-playbook) 10 
| * [Which programs are faster?](https://benchmarksgame-team.pages.debian.net/benchmarksgame) 11 | * [Addison-Wesley Professional Computing Series](https://informit.com/series/professionalcomputing) 12 | 13 | ## Machine Learning 14 | 15 | * [Machine Learning](https://www.coursera.org/learn/machine-learning) (Course) 16 | * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course) (Course) 17 | * [Amazon's Machine Learning University](https://www.amazon.science/latest-news/machine-learning-course-free-online-from-amazon-machine-learning-university) (Course) 18 | * [Making Things Think](https://www.holloway.com/g/making-things-think) (Book) 19 | * [Machine Learning from Scratch](https://dafriedman97.github.io/mlbook/content/introduction.html) (Book) 20 | * [What is Natural Language Processing?](https://blog.algorithmia.com/introduction-natural-language-processing-nlp) 21 | * [Scipy Lecture Notes](http://scipy-lectures.org/index.html) 22 | * [Neural Networks](https://aegeorge42.github.io) 23 | * [An Introduction to Tensor Calculus](https://grinfeld.org/books/An-Introduction-To-Tensor-Calculus) 24 | * [Neural Network From Scratch](https://sirupsen.com/napkin/neural-net) 25 | * [The latest in Machine Learning](https://paperswithcode.com) (Papers) 26 | 27 | ## Book collections 28 | 29 | * [The Online Books Page](https://onlinebooks.library.upenn.edu) 30 | * [A collection of free books from Springer](https://hnarayanan.github.io/springer-books) 31 | * [E-Books Directory](http://www.e-booksdirectory.com) 32 | * [OpenStax](https://openstax.org/subjects) 33 | * [Mark Watson: author of 20+ books](https://markwatson.com/#books) 34 | * [LibriVox](https://librivox.org) (audiobook) 35 | * [Textbooks](https://textbooks.cs.ksu.edu/) 36 | * [Global Grey](https://www.globalgreyebooks.com/index.html) 37 | * [Pirate Library Mirror](http://pilimi.org) 38 | * [freeread.org: For the human right to read](https://freeread.org) 39 | 40 | ## Random 41 | 42 | * [suckless](http://suckless.org) 43 | * [Biohacking Lite](https://karpathy.github.io/2020/06/11/biohacking-lite) 44 | 45 | ## Hacker News 46 | 47 | * [Ask HN: Great Blogs by Programmers](https://news.ycombinator.com/item?id=30245247) 48 | * [Ask HN: Can I see your cheatsheet?](https://news.ycombinator.com/item?id=31928736) 49 | * [Ask HN: What are the major open source alternatives to Auth0?](https://news.ycombinator.com/item?id=29392517) 50 | 51 |
52 | -------------------------------------------------------------------------------- /hadoop/vagrant_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | DATA_PATH=".data" 14 | KEY_NAME="hadoop_rsa" 15 | BOX_NAME="master" 16 | 17 | ############################## 18 | 19 | # param #1: 20 | function verify_requirement { 21 | local BIN=$1 22 | echo "[*] verify requirement: $BIN" 23 | command -v $BIN >/dev/null 2>&1 || (echo "[-] error: $BIN not found" && exit 1) 24 | } 25 | 26 | # param #1: 27 | # param #2: 28 | function init_key_pair { 29 | local NAME=$1 30 | local BASE_PATH=$2 31 | local KEY_PATH="$BASE_PATH/$NAME" 32 | 33 | if [ ! -e $KEY_PATH ]; then 34 | mkdir -p $BASE_PATH 35 | ssh-keygen -t rsa -b 4096 -C $NAME -N "" -f $KEY_PATH 36 | echo "[*] new ssh key pair generated: $KEY_PATH" 37 | else 38 | echo "[*] ssh key pair found: $KEY_PATH" 39 | fi 40 | } 41 | 42 | # param #1: 43 | function start_vagrant { 44 | local NAME=$1 45 | local STATUS=$(vagrant status | grep -m 1 $NAME | awk '{ print toupper($2) }') 46 | echo -e "[*] start vagrant: name=$NAME | status=$STATUS" 47 | 48 | case $STATUS in 49 | # not created | poweroff | aborted 50 | "NOT"|"POWEROFF"|"ABORTED"|"SAVED") 51 | #vagrant up --debug &> .vagrant/debug.log 52 | vagrant up && vagrant ssh $NAME 53 | ;; 54 | # running 55 | "RUNNING") 56 | vagrant ssh $NAME 57 | ;; 58 | *) 59 | echo "[-] error: vagrant status unknown" 60 | ;; 61 | esac 62 | } 63 | 64 | explosion() { 65 | cat<<"EOT" 66 | ____ 67 | __,-~~/~ `---. 68 | _/_,---( , ) 69 | __ / < / ) \___ 70 | - ------===;;;'====------------------===;;;===----- - - 71 | \/ ~"~"~"~"~"~\~"~)~"/ 72 | (_ ( \ ( > \) 73 | \_( _ < >_>' 74 | ~ `-i' ::>|--" 75 | I;|.|.| 76 | <|i::|i|`. 77 | (` ^'"`-' ") 78 | ------------------------------------------------------------------ 79 | 80 | EOT 81 | } 82 | 83 | ############################## 84 | 85 | function init_folder { 86 | echo "[*] init folder" 87 | mkdir -pv \ 88 | ${DATA_PATH}/$BOX_NAME \ 89 | ${DATA_PATH}/node-{1,2,3} 90 | } 91 | 92 | function hadoop-start { 93 | verify_requirement vagrant 94 | verify_requirement ssh-keygen 95 | 96 | init_folder 97 | init_key_pair $KEY_NAME $DATA_PATH 98 | start_vagrant $BOX_NAME 99 | } 100 | 101 | function hadoop-destroy { 102 | read -p "Are you sure? 
[y/n]" -n 1 -r 103 | echo 104 | if [[ $REPLY =~ ^[Yy]$ ]] 105 | then 106 | vagrant destroy -f 107 | rm -frv \ 108 | .vagrant \ 109 | ${DATA_PATH}/$KEY_NAME* \ 110 | ${DATA_PATH}/$BOX_NAME \ 111 | ${DATA_PATH}/node-{1,2,3} 112 | explosion 113 | fi 114 | } 115 | -------------------------------------------------------------------------------- /docs/zookeeper.md: -------------------------------------------------------------------------------- 1 | # ZooKeeper 2 | 3 | > **ZooKeeper** is a centralized service for maintaining configuration information, naming, providing distributed synchronization, and providing group services 4 | 5 | Resources 6 | 7 | * [Documentation](https://zookeeper.apache.org) 8 | 9 | * [Curator](https://curator.apache.org) 10 | 11 | ## Setup 12 | 13 | Requirements 14 | 15 | * [Base](docker/#base-image) image 16 | 17 | Build `devops/zookeeper` image 18 | ```bash 19 | # change path 20 | cd devops/zookeeper 21 | 22 | # build image 23 | docker build -t devops/zookeeper:latest . 24 | # build image with specific version - see Dockerfile for version 3.5.x 25 | docker build -t devops/zookeeper:3.4.10 --build-arg VERSION=3.4.10 . 26 | 27 | # temporary container [host:container] 28 | docker run --rm --name zookeeper -p 12181:2181 devops/zookeeper 29 | # access container 30 | docker exec -it zookeeper bash 31 | 32 | # paths 33 | /opt/zookeeper 34 | /var/log/zookeeper 35 | /var/lib/zookeeper 36 | /var/log/supervisord.log 37 | 38 | # logs 39 | tail -F /var/log/supervisord.log 40 | # check service status 41 | supervisorctl status 42 | supervisorctl restart zookeeper 43 | ``` 44 | 45 | Example 46 | ```bash 47 | docker exec -it zookeeper bash 48 | 49 | # (option 1) check zookeeper status 50 | echo ruok | nc localhost 2181 51 | 52 | # (option 2) check zookeeper status 53 | telnet localhost 2181 54 | # expect answer imok 55 | > ruok 56 | 57 | zkCli.sh -server 127.0.0.1:2181 58 | help 59 | # list znodes 60 | ls / 61 | # create znode and associate value 62 | create /zk_test my_data 63 | # verify data 64 | get /zk_test 65 | # change value 66 | set /zk_test junk 67 | # delete znode 68 | delete /zk_test 69 | ``` 70 | 71 | ## The four-letter words 72 | 73 | | Category | Command | Description | 74 | | -------- |:-------:| ----------- | 75 | | Server status | **ruok** | Prints *imok* if the server is running and not in an error state | 76 | | | **conf** | Prints the server configuration (from zoo.cfg) | 77 | | | **envi** | Prints the server environment, including ZooKeeper version, Java version, and other system properties | 78 | | | **srvr** | Prints server statistics, including latency statistics, the number of znodes, and the server mode (standalone, leader, or follower) | 79 | | | **stat** | Prints server statistics and connected clients | 80 | | | **srst** | Resets server statistics | 81 | | | **isro** | Shows whether the server is in read-only ( ro ) mode (due to a network partition) or read/write mode (rw) | 82 | | Client connections | **dump** | Lists all the sessions and ephemeral znodes for the ensemble. 
You must connect to the leader (see srvr) for this command | 83 | | | **cons** | Lists connection statistics for all the server's clients | 84 | | | **crst** | Resets connection statistics | 85 | | Watches | **wchs** | Lists summary information for the server's watches | 86 | | | **wchc** | Lists all the server's watches by connection, may impact server performance for a large number of watches | 87 | | | **wchp** | Lists all the server’s watches by znode path, may impact server performance for a large number of watches | 88 | | Monitoring | **mntr** | Lists server statistics in Java properties format, suitable as a source for monitoring systems such as Ganglia and Nagios | 89 | 90 |
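All of these are plain strings sent over the client port, so they can be exercised the same way as the `ruok` check above (note: recent ZooKeeper releases restrict the four-letter words through `4lw.commands.whitelist` in `zoo.cfg`; if a command returns nothing, it most likely needs to be whitelisted first):

```bash
# query a running server over the client port
echo srvr | nc localhost 2181   # role (standalone/leader/follower), latency, znode count
echo stat | nc localhost 2181   # srvr output plus the list of connected clients
echo mntr | nc localhost 2181   # key/value metrics, easy to scrape for monitoring
echo cons | nc localhost 2181   # per-connection statistics

# example whitelist entry in zoo.cfg
# 4lw.commands.whitelist=ruok,srvr,stat,mntr,cons
```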
91 | -------------------------------------------------------------------------------- /hadoop/script/setup_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | SPARK_VERSION="2.2.1" 18 | HADOOP_VERSION="2.7" 19 | SPARK_NAME="spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" 20 | 21 | ############################## 22 | 23 | function download_dist { 24 | local SPARK_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK_NAME.tgz" 25 | echo "[*] download dist" 26 | wget -q -P $DATA_PATH $SPARK_MIRROR_DOWNLOAD 27 | } 28 | 29 | function setup_dist { 30 | local SPARK_DIST_PATH="$DATA_PATH/$SPARK_NAME*" 31 | echo "[*] setup dist" 32 | 33 | if [ ! -e $SPARK_DIST_PATH ]; then 34 | download_dist 35 | fi 36 | 37 | tar -xf $SPARK_DIST_PATH -C /opt 38 | ln -s /opt/$SPARK_NAME /usr/local/spark 39 | chown -R $USER_NAME:$USER_NAME /opt/$SPARK_NAME 40 | } 41 | 42 | function setup_config { 43 | local DATA_PATH_GUEST="/vol/spark" 44 | local SPARK_BASE_PATH="/usr/local/spark" 45 | local CONFIG_PATH="$SPARK_BASE_PATH/conf" 46 | local HISTORY_PATH="/tmp/spark-events" 47 | local FILES=( "spark-env.sh" "log4j.properties" ) 48 | 49 | echo "[*] create directories" 50 | mkdir -pv \ 51 | $DATA_PATH_GUEST/log \ 52 | $HISTORY_PATH 53 | 54 | for FILE in "${FILES[@]}" 55 | do 56 | echo "[*] update config: $FILE" 57 | # backup only if exists 58 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 59 | cp $FILE_PATH/spark/config/$FILE $CONFIG_PATH/$FILE 60 | done 61 | 62 | echo "[*] update permissions" 63 | chown -R $USER_NAME:$USER_NAME \ 64 | $SPARK_BASE_PATH/ \ 65 | $HISTORY_PATH 66 | 67 | echo "[*] update env" 68 | cp $FILE_PATH/spark/profile-spark.sh /etc/profile.d/profile-spark.sh && \ 69 | source /etc/profile.d/profile-spark.sh 70 | 71 | # TODO config spark on yarn as default 72 | # verify jars/archive path, ports between nodes and memory issues 73 | # https://www.linode.com/docs/databases/hadoop/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ 74 | 75 | # spark-shell --master yarn --deploy-mode client 76 | # ERROR SparkContext: Error initializing SparkContext 77 | # org.apache.spark.SparkException: Yarn application has already ended! It might have been killed or unable to launch application master. 
78 | 79 | # @see spark-defaults.conf 80 | 81 | # hadoop fs -ls -h -R / 82 | # hdfs dfs -mkdir -p /user/spark/{log,share/lib} 83 | # hadoop fs -put /usr/local/spark/jars/*.jar /user/spark/share/lib/ 84 | 85 | # yarn-site.xml 86 | # 87 | # 88 | # yarn.resourcemanager.address 89 | # resource-manager.local:8032 90 | # 91 | 92 | # zip -j /vol/spark/log/spark-archive.zip /usr/local/spark/jars/*.jar 93 | # hadoop fs -put /vol/spark/log/spark-archive.zip /user/spark/share/spark-archive.zip 94 | } 95 | 96 | function main { 97 | echo "[+] setup spark" 98 | setup_dist 99 | setup_config 100 | echo "[-] setup spark" 101 | } 102 | 103 | main 104 | -------------------------------------------------------------------------------- /hadoop/script/setup_ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | KEY_NAME="hadoop_rsa" 16 | USER_NAME="hadoop" 17 | USER_ID=1101 18 | HOME_PATH="/home/$USER_NAME" 19 | 20 | ############################## 21 | 22 | function apt_update { 23 | echo "[*] apt update" 24 | apt-get -qq update && apt-get -qq upgrade -y 25 | } 26 | 27 | function setup_packages { 28 | local LOG_PATH="/tmp/apt-packages.log" 29 | echo "[*] setup packages" 30 | 31 | apt-get -qq update && apt-get install -y \ 32 | tree \ 33 | zip \ 34 | unzip \ 35 | jq \ 36 | httpie \ 37 | &> $LOG_PATH && \ 38 | apt-get clean 39 | } 40 | 41 | function setup_java { 42 | local LOG_PATH="/tmp/apt-java.log" 43 | echo "[*] setup java" 44 | 45 | add-apt-repository ppa:openjdk-r/ppa -y &> $LOG_PATH 46 | 47 | apt-get -qq update && apt-get install -y \ 48 | openjdk-8-jdk \ 49 | &> $LOG_PATH && \ 50 | apt-get clean 51 | 52 | java -version 53 | 54 | # https://askubuntu.com/questions/866161/setting-path-variable-in-etc-environment-vs-profile 55 | # /usr/lib/jvm/java-8-openjdk-amd64 56 | echo "JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")" | sudo tee --append /etc/environment && \ 57 | source /etc/environment 58 | } 59 | 60 | # param #1: 61 | # param #2: 62 | function create_user { 63 | local NAME=$1 64 | local ID=$2 65 | echo "[*] create user: $NAME" 66 | 67 | groupadd --gid $ID $NAME 68 | useradd --uid $ID --gid $ID --create-home --shell /bin/bash $NAME 69 | usermod --append --groups sudo,$NAME $NAME 70 | echo "$NAME ALL=(ALL:ALL) NOPASSWD: ALL" > /etc/sudoers.d/$NAME 71 | id $NAME 72 | groups $NAME 73 | } 74 | 75 | ############################## 76 | 77 | function config_ssh { 78 | local SSH_PATH="$HOME_PATH/.ssh" 79 | echo "[*] config ssh" 80 | 81 | mkdir -p $SSH_PATH 82 | # default name to avoid -i parameter 83 | cp $DATA_PATH/$KEY_NAME $SSH_PATH/id_rsa 84 | # passphraseless 85 | cat $DATA_PATH/$KEY_NAME.pub >> $SSH_PATH/authorized_keys 86 | # avoid prompt first time 87 | cp $FILE_PATH/ssh/config $SSH_PATH/config 88 | # update permissions 89 | chmod 0600 $SSH_PATH/id_rsa $SSH_PATH/authorized_keys 90 | chown -R $USER_NAME:$USER_NAME $SSH_PATH 91 | } 92 | 93 | function config_profile { 94 | echo "[*] config profile" 95 | sed -i -r "s/alias ll='ls -alF'/alias ll='ls -alh'/" $HOME_PATH/.bashrc 96 | source $HOME_PATH/.bashrc 97 | } 98 | 99 | function config_host { 100 | echo "[*] config host" 101 | cat $FILE_PATH/hosts >> /etc/hosts 102 | 
} 103 | 104 | function setup_motd { 105 | local MOTD_PATH="/etc/update-motd.d" 106 | echo "[*] setup motd" 107 | rm -fr $MOTD_PATH/10-help-text 108 | cp $FILE_PATH/motd $MOTD_PATH/10-custom-text 109 | chmod 0755 $MOTD_PATH/10-custom-text 110 | } 111 | 112 | function main { 113 | echo "[+] setup ubuntu" 114 | #apt_update 115 | setup_packages 116 | setup_java 117 | create_user $USER_NAME $USER_ID 118 | config_ssh 119 | config_profile 120 | config_host 121 | setup_motd 122 | echo "[-] setup ubuntu" 123 | } 124 | 125 | main 126 | -------------------------------------------------------------------------------- /cassandra/cql/example_query.cql: -------------------------------------------------------------------------------- 1 | SELECT * FROM example.messages; 2 | SELECT * FROM example.counters; 3 | SELECT * FROM example.users; 4 | 5 | SELECT COUNT(*) FROM example.messages; 6 | 7 | INSERT INTO example.messages(id, body, created_at, updated_at) 8 | VALUES(uuid(), 'message1', toTimestamp(now()), toTimestamp(now())); 9 | 10 | INSERT INTO example.messages(id, body, title, created_at, updated_at) 11 | VALUES(uuid(), 'message2', 'title2', toTimestamp(now()), toTimestamp(now())); 12 | 13 | -- no timestamp allowed on primary key 14 | SELECT id, body, WRITETIME(body) FROM example.messages; 15 | 16 | INSERT INTO example.users(first_name, last_name, last_ip, any_value, enable) 17 | VALUES('firstName1', 'lastName1', '127.0.0.1', textAsBlob('{"key1": "value1", "key2": "value2"}'), true); 18 | 19 | INSERT INTO example.users(first_name, last_name, last_ip, any_value, enable) 20 | VALUES('firstName2', 'lastName2', '0.0.0.0', textAsBlob('{"key": "value"}'), false); 21 | 22 | -- value from column 23 | SELECT blobAsText(0x7b226b6579223a202276616c7565227d) FROM example.users; 24 | 25 | -- timestamp in microsecond - old timestamp are ignored 26 | UPDATE example.users USING TIMESTAMP 1434373756626000 27 | SET last_name = 'lastName2' WHERE first_name = 'firstName1'; 28 | 29 | UPDATE example.users USING TIMESTAMP 2034373756626000 30 | SET last_name = 'lastName3' WHERE first_name = 'firstName1'; 31 | 32 | -- time to live TTL - stored on a per-column level no row 33 | -- 60 seconds 34 | UPDATE example.users USING TTL 60 35 | SET last_name = 'lastName4' WHERE first_name = 'firstName2'; 36 | 37 | SELECT first_name, last_name, TTL(last_name) FROM example.users; 38 | 39 | -- insert not allowed 40 | UPDATE example.counters SET total = total + 2 41 | WHERE id = now(); 42 | 43 | SELECT * FROM example.counters; 44 | 45 | UPDATE example.users SET emails = { 'hello@example.com' } 46 | WHERE first_name = 'firstName1'; 47 | 48 | UPDATE example.users SET emails = emails + { 'world@example.com' } 49 | WHERE first_name = 'firstName1'; 50 | 51 | UPDATE example.users SET emails = emails - { 'world@example.com' } 52 | WHERE first_name = 'firstName1'; 53 | 54 | UPDATE example.users SET emails = {} 55 | WHERE first_name = 'firstName1'; 56 | 57 | UPDATE example.users SET emails = { 'hello@example.com', 'world@example.com' } 58 | WHERE first_name = 'firstName2'; 59 | 60 | UPDATE example.users SET phone_numbers = [ '1-800-999-9999' ] 61 | WHERE first_name = 'firstName1'; 62 | 63 | UPDATE example.users SET phone_numbers = phone_numbers + [ '480-111-1111' ] 64 | WHERE first_name = 'firstName1'; 65 | 66 | UPDATE example.users SET phone_numbers = [ '111-222-3333' ] + phone_numbers 67 | WHERE first_name = 'firstName1'; 68 | 69 | -- start from index 0 70 | UPDATE example.users SET phone_numbers[1] = '000-000-0000' 71 | WHERE first_name = 
'firstName1'; 72 | 73 | UPDATE example.users SET phone_numbers = phone_numbers - [ '000-000-0000' ] 74 | WHERE first_name = 'firstName1'; 75 | 76 | UPDATE example.users SET login_sessions = { now(): 13, now(): 18} 77 | WHERE first_name = 'firstName2'; 78 | 79 | -- use index 80 | SELECT * FROM example.users WHERE last_name = 'lastName3'; 81 | 82 | -- use sasi index 83 | SELECT * FROM example.users WHERE last_name LIKE 'last%'; 84 | 85 | UPDATE example.users SET addresses = addresses + 86 | {'home': { street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345} } 87 | WHERE first_name = 'firstName2'; 88 | 89 | DROP INDEX example.users_last_name_idx; 90 | 91 | DELETE phone_numbers[0] FROM example.users 92 | WHERE first_name = 'firstName1'; 93 | 94 | DELETE last_ip from example.users WHERE first_name = 'firstName2'; 95 | 96 | TRUNCATE example.messages; 97 | 98 | DROP TABLE example.messages; 99 | -------------------------------------------------------------------------------- /docs/programming.md: -------------------------------------------------------------------------------- 1 | # Programming 2 | 3 | ## Courses 4 | 5 | * 6.001: Structure and Interpretation of Computer Programs MIT [ [course](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-001-structure-and-interpretation-of-computer-programs-spring-2005) | [book](https://mitpress.mit.edu/sites/default/files/sicp/index.html) ] 6 | * [CS 6120: Advanced Compilers](https://www.cs.cornell.edu/courses/cs6120/2020fa/self-guided) 7 | * [History of Programming Languages](https://felleisen.org/matthias/7480-s21/index.html) 8 | 9 | ## Books 10 | 11 | * [Compilers: Principles, Techniques, and Tools](https://suif.stanford.edu/dragonbook) 12 | * [Writing an Interpreter and Compiler in Go](https://gumroad.com/l/waiig_wacig_bundle) 13 | * [Crafting Interpreters](https://craftinginterpreters.com) (online) 14 | * [Programming Languages: Application and Interpretation](https://cs.brown.edu/courses/cs173/2012/book) (online) 15 | * [Build Your Own Lisp](https://buildyourownlisp.com) (online) 16 | * [LispE: Lisp Elémentaire](https://github.com/naver/lispe/wiki) (online) 17 | 18 | ## Interpreter / Compiler 19 | 20 | * [awesome-compilers](https://github.com/aalhour/awesome-compilers) 21 | * [Compiler Explorer](https://godbolt.org) 22 | * [Let's Build a Compiler](https://xmonader.github.io/letsbuildacompiler-pretty/about.html) 23 | * [A C version of the "Let's Build a Compiler"](https://github.com/lotabout/Let-s-build-a-compiler) 24 | * [Let's write a compiler](https://briancallahan.net/blog/20210814.html) 25 | * [An Intro to Compilers](https://web.archive.org/web/20210111064441/https://nicoleorchard.com/blog/compilers) (archive) 26 | * [Tiny C Compiler](https://bellard.org/tcc) 27 | * [rui314/chibicc: A small C compiler](https://github.com/rui314/chibicc) 28 | * [The Super Tiny Compiler!](https://github.com/jamiebuilds/the-super-tiny-compiler) 29 | * [Obfuscated Tiny C Compiler](https://bellard.org/otcc) 30 | * [Tinylisp: Lisp in 99 lines of C and how to write one yourself](https://github.com/Robert-van-Engelen/tinylisp) 31 | * [Lessons from Writing a Compiler](https://borretti.me/article/lessons-writing-compiler) 32 | 33 | ## Parser 34 | 35 | * [Parsing Text with Nom](https://blog.adamchalmers.com/nom-chars) 36 | * [How to write a tree-sitter grammar in an afternoon](https://siraben.dev/2022/03/01/tree-sitter.html) 37 | * [Writing a Simple Parser in Rust](https://adriann.github.io/rust_parser.html) 38 | 39 | ## LLVM 40 | 41 | * [How 
to learn compilers: LLVM edition](https://lowlevelbits.org/how-to-learn-compilers-llvm-edition) 42 | * [A Complete Guide to LLVM for Programming Language Creators](https://mukulrathi.com/create-your-own-programming-language/llvm-ir-cpp-api-tutorial) 43 | 44 | ## Random 45 | 46 | * [Esolang](https://esolangs.org) 47 | * [Compile code into silicon](https://www.siliconcompiler.com) 48 | * [Make A Language](https://arzg.github.io/lang) (rust) 49 | * [Designing a programming language](http://ducklang.org/designing-a-programming-language-i) 50 | * [mirdaki/theforce: The Force - A Star Wars themed programming language](https://github.com/mirdaki/theforce) 51 | * [riicchhaarrd/ocean: Programming language that compiles into a x86 ELF executable](https://github.com/riicchhaarrd/ocean) 52 | * [adam-mcdaniel/oakc: An infinitely more portable alternative to the C programming language](http://github.com/adam-mcdaniel/oakc) 53 | * [Creating the Golfcart Programming Language](https://healeycodes.com/creating-the-golfcart-programming-language) 54 | * [I wrote a linker everyone can understand!](https://briancallahan.net/blog/20210609.html) 55 | * [spencertipping/jit-tutorial: How to write a JIT compiler](https://github.com/spencertipping/jit-tutorial) 56 | * [What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html) 57 | 58 |
59 | -------------------------------------------------------------------------------- /hadoop/script/setup_oozie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | HADOOP_VERSION="2.7.5" 18 | EXTJS_NAME="ext-2.2" 19 | OOZIE_VERSION="5.0.0-beta1" 20 | OOZIE_NAME="oozie-$OOZIE_VERSION" 21 | OOZIE_BASE_PATH="/usr/local/oozie" 22 | 23 | ############################## 24 | 25 | function setup_maven { 26 | local LOG_PATH="/tmp/apt-maven.log" 27 | echo "[*] setup maven" 28 | 29 | apt-get -qq update && apt-get install -y \ 30 | maven \ 31 | &> $LOG_PATH && \ 32 | apt-get clean 33 | 34 | mvn -version 35 | 36 | # environment variables 37 | #export M2_HOME=/usr/share/maven 38 | #export PATH=${M2_HOME}/bin:${PATH} 39 | 40 | # configuration path 41 | #/etc/maven 42 | } 43 | 44 | function download_oozie_dist { 45 | local OOZIE_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/oozie/$OOZIE_VERSION/$OOZIE_NAME.tar.gz" 46 | echo "[*] download oozie dist" 47 | wget -q -P $DATA_PATH $OOZIE_MIRROR_DOWNLOAD 48 | } 49 | 50 | function download_extjs_dist { 51 | local EXTJS_MIRROR_DOWNLOAD="http://archive.cloudera.com/gplextras/misc/$EXTJS_NAME.zip" 52 | echo "[*] download extjs dist" 53 | wget -q -P $DATA_PATH $EXTJS_MIRROR_DOWNLOAD 54 | } 55 | 56 | function setup_dist { 57 | local DATA_PATH_GUEST="/vol/oozie" 58 | local OOZIE_DIST_PATH="$DATA_PATH/$OOZIE_NAME*" 59 | local EXTJS_DIST_PATH="$DATA_PATH/$EXTJS_NAME*" 60 | local CONFIG_PATH="$OOZIE_BASE_PATH/conf" 61 | local FILES=( "oozie-site.xml" "oozie-env.sh" ) 62 | echo "[*] setup dist" 63 | 64 | if [ ! -e $OOZIE_DIST_PATH ]; then 65 | download_oozie_dist 66 | fi 67 | if [ ! 
-e $EXTJS_DIST_PATH ]; then 68 | download_extjs_dist 69 | fi 70 | 71 | echo "[*] create directories" 72 | mkdir -pv \ 73 | $DATA_PATH_GUEST/{log,data} 74 | 75 | echo "[*] build sources" 76 | tar -xzf $OOZIE_DIST_PATH -C /tmp 77 | /tmp/$OOZIE_NAME/bin/mkdistro.sh \ 78 | -DskipTests \ 79 | -Puber \ 80 | -Dhadoop.version=$HADOOP_VERSION 81 | tar -xzf /tmp/$OOZIE_NAME/distro/target/$OOZIE_NAME-distro.tar.gz -C /opt 82 | ln -s /opt/$OOZIE_NAME $OOZIE_BASE_PATH 83 | 84 | echo "[*] add ExtJS external lib" 85 | mkdir -p $OOZIE_BASE_PATH/libext 86 | cp $EXTJS_DIST_PATH $OOZIE_BASE_PATH/libext 87 | 88 | echo "[*] setup examples" 89 | tar -xzf $OOZIE_BASE_PATH/oozie-examples.tar.gz -C $DATA_PATH_GUEST 90 | 91 | for FILE in "${FILES[@]}" 92 | do 93 | echo "[*] update config: $FILE" 94 | # backup only if exists 95 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 96 | cp $FILE_PATH/oozie/config/$FILE $CONFIG_PATH/$FILE 97 | done 98 | 99 | echo "[*] update permissions" 100 | chown -R $USER_NAME:$USER_NAME \ 101 | $OOZIE_BASE_PATH/ \ 102 | $DATA_PATH_GUEST/ 103 | 104 | echo "[*] update env" 105 | cp $FILE_PATH/oozie/profile-oozie.sh /etc/profile.d/profile-oozie.sh && \ 106 | source /etc/profile.d/profile-oozie.sh 107 | } 108 | 109 | function init_oozie { 110 | echo "[*] init oozie" 111 | su --login $USER_NAME -c "$OOZIE_BASE_PATH/bin/oozie-setup.sh sharelib create -fs hdfs://namenode.local:9000" 112 | su --login $USER_NAME -c "$OOZIE_BASE_PATH/bin/ooziedb.sh create -sqlfile oozie.sql -run" 113 | } 114 | 115 | # only for development purpose 116 | function remove_oozie { 117 | echo "[*] remove oozie" 118 | hadoop fs -rm -f -R /user/$USER_NAME/examples 119 | 120 | rm -frv \ 121 | "/vol/oozie" \ 122 | "/usr/local/oozie" \ 123 | "/opt/oozie-*" \ 124 | "/tmp/oozie*" 125 | } 126 | 127 | function main { 128 | echo "[+] setup oozie" 129 | setup_maven 130 | setup_dist 131 | init_oozie 132 | echo "[-] setup oozie" 133 | } 134 | 135 | main 136 | #remove_oozie 137 | 138 | # http://www.thecloudavenue.com/2013/10/installation-and-configuration-of.html 139 | # https://www.edureka.co/blog/apache-oozie-tutorial/ 140 | -------------------------------------------------------------------------------- /docs/cloud.md: -------------------------------------------------------------------------------- 1 | # Cloud 2 | 3 | * [CNCF cloud native landscape](https://landscape.cncf.io) 4 | * [CloudSkew](https://www.cloudskew.com) - Draw cloud architecture diagrams 5 | * [Steampipe](https://steampipe.io) - `select * from cloud;` 6 | * [Infracost](https://www.infracost.io) - Cloud cost estimates for Terraform in pull requests 7 | * [Rover - Terraform Visualizer](https://github.com/im2nguyen/rover) 8 | 9 | ## AWS 10 | 11 | * [AWS diagrams & notes](https://www.awsgeek.com) 12 | * [cfn-diagram](https://github.com/mhlabs/cfn-diagram) - Visualise CloudFormation/SAM/CDK templates as diagrams 13 | * [CDK-Dia](https://github.com/pistazie/cdk-dia) - Automated diagrams for CDK infrastructure 14 | 15 | ## Kubernetes 16 | 17 | ### Resources 18 | 19 | * [Kube by Example](http://kubebyexample.com) 20 | * [Kubernetes The Hard Way](https://github.com/kelseyhightower/kubernetes-the-hard-way) 21 | * [Kubernetes Best Practices 101](https://github.com/diegolnasc/kubernetes-best-practices) 22 | * [Kubernetes Failure Stories](https://k8s.af) 23 | * [10 most common mistakes using kubernetes](https://blog.pipetail.io/posts/2020-05-04-most-common-mistakes-k8s) 24 | * [Container Training](https://container.training) 25 | * [15 Kubernetes 
Best Practices Every Developer Should Know](https://spacelift.io/blog/kubernetes-best-practices) 26 | 27 | ### Tools 28 | 29 | * [Kubetools](https://collabnix.github.io/kubetools) - A Curated List of Kubernetes Tools 30 | * [KEDA](https://keda.sh) - Kubernetes Event-driven Autoscaling 31 | * [Mizu](https://getmizu.io) - API Traffic Viewer for Kubernetes 32 | * [Sloop](https://github.com/salesforce/sloop) - Kubernetes History Visualization 33 | * [Atlas](https://greenops.io/atlas) - Effortless deployment pipelines for Kubernetes 34 | * [kube-chaos](https://github.com/Shogan/kube-chaos) 35 | * [minikube](https://minikube.sigs.k8s.io) 36 | * [K3s](https://k3s.io) - Lightweight Kubernetes 37 | * [devtron](https://github.com/devtron-labs/devtron) - Tool integration platform for Kubernetes 38 | * [arkade](https://github.com/alexellis/arkade) - Open Source Marketplace For Kubernetes 39 | * [Kubernetes YAML Generator](https://k8syaml.com) 40 | * [kind](https://kind.sigs.k8s.io) - Local Kubernetes clusters using Docker 41 | 42 | ### Cli 43 | 44 | * [Kustomize](https://kustomize.io) - Customization of Kubernetes YAML configurations 45 | * [Krew](https://github.com/kubernetes-sigs/krew) - Find and install kubectl plugins 46 | * [kubectx](https://ahmet.im/blog/kubectx) - A tool to switch between Kubernetes contexts 47 | * [Display the current kubectl context in the Bash prompt](https://pracucci.com/display-the-current-kubelet-context-in-the-bash-prompt.html) 48 | * [Kubie](https://github.com/sbstp/kubie) - A more powerful alternative to kubectx and kubens 49 | * [kube-ps1](https://github.com/jonmosco/kube-ps1) - Kubernetes prompt 50 | * [kube-prompt](https://github.com/c-bata/kube-prompt) - An interactive kubernetes client featuring auto-complete 51 | * [kubeprompt](https://github.com/jlesquembre/kubeprompt) - Isolated kubectl shells and prompt info 52 | * [kubefwd](https://kubefwd.com) - Kubernetes port forwarding for local development 53 | * [stern](https://github.com/wercker/stern) - Multi pod and container log tailing for Kubernetes (obsolete) 54 | * [kail](https://github.com/boz/kail) - kubernetes log viewer 55 | * [k9s](https://k9scli.io) - Kubernetes CLI To Manage Your Clusters In Style! 
56 | * [KDash](https://github.com/kdash-rs/kdash) - A fast and simple dashboard for Kubernetes 57 | * [Skaffold](https://skaffold.dev) - Local Kubernetes Development 58 | 59 | ### Homelab 60 | 61 | * [k8s@home](https://docs.k8s-at-home.com) 62 | * [k3sup](https://github.com/alexellis/k3sup) 63 | * [Dan Manners' Homelab](https://github.com/danmanners/homelab-kube-cluster) 64 | * [Khue's Homelab](https://github.com/khuedoan/homelab) 65 | * [Humble Project](https://github.com/locmai/humble) 66 | * [Truxnell's home k8s cluster](https://github.com/onedr0p/home-cluster) 67 | 68 | ### Raspberry Pi 69 | 70 | * [Setup Kubernetes on a Raspberry Pi Cluster easily the official way!](https://blog.hypriot.com/post/setup-kubernetes-raspberry-pi-cluster) 71 | * [Raspberry Pi Kubernetes Cluster](https://chrisshort.net/my-raspberry-pi-kubernetes-cluster) 72 | * [Building an ARM Kubernetes Cluster](https://itnext.io/building-an-arm-kubernetes-cluster-ef31032636f9) 73 | * [kube-arm](https://github.com/lahsivjar/kube-arm) 74 | 75 | ### Other 76 | 77 | * [Kubelist Podcast](https://kubelist.com/podcast) 78 | * [Kubernetes comic](https://cloud.google.com/kubernetes-engine/kubernetes-comic) 79 | -------------------------------------------------------------------------------- /dev.txt: -------------------------------------------------------------------------------- 1 | --- OLD 2 | 3 | # emr info on master node 4 | cat /mnt/var/lib/info/job-flow.json | jq 5 | 6 | * sla, percentile 7 | * add in linux: monit, nohup, screen, mc, vim keyboard shortcuts 8 | * command vs event api - use paste tense to name event 9 | 10 | * Avro **Avro** is a data serialization system 11 | * Parquet **Parquet** is a columnar storage format that can efficiently store nested data 12 | * Flume 13 | * Sqoop 14 | * Pig 15 | * Hive 16 | * Presto 17 | * Crunch 18 | * HBase 19 | * Flink 20 | * Ganglia **Ganglia** is a monitoring system for Hadoop 21 | * Zeppelin 22 | * Knox 23 | 24 | * elastic-search 25 | * kong 26 | * etcd 27 | * linux containers LXD 28 | 29 | --- 30 | 31 | # keytool 32 | https://www.digitalocean.com/community/tutorials/java-keytool-essentials-working-with-java-keystores#viewing-keystore-entries 33 | https://www.digitalocean.com/community/tutorials/openssl-essentials-working-with-ssl-certificates-private-keys-and-csrs 34 | 35 | # api 36 | https://hackernoon.com/restful-api-designing-guidelines-the-best-practices-60e1d954e7c9 37 | https://github.com/WhiteHouse/api-standards 38 | https://geemus.gitbooks.io/http-api-design/content/en/ 39 | 40 | # spark-in-action 41 | curl -O https://raw.githubusercontent.com/spark-in-action/first-edition/master/spark-in-action-box.json 42 | vagrant box add spark-in-action-box.json 43 | vagrant init manning/spark-in-action 44 | vagrant up 45 | vagrant halt 46 | vagrant destroy 47 | vagrant box remove manning/spark-in-action 48 | 49 | # hadoop 50 | https://dwbi.org/etl/bigdata/183-setup-hadoop-cluster 51 | https://www.linode.com/docs/databases/hadoop/how-to-install-and-set-up-hadoop-cluster 52 | https://www.linode.com/docs/databases/hadoop/install-configure-run-spark-on-top-of-hadoop-yarn-cluster 53 | 54 | # vagrant images 55 | https://github.com/martinprobson/vagrant-hadoop-hive-spark 56 | 57 | # back pressure 58 | https://mechanical-sympathy.blogspot.com/2012/05/apply-back-pressure-when-overloaded.html 59 | 60 | # IAC infrastructure-as-code 61 | https://blog.gruntwork.io/why-we-use-terraform-and-not-chef-puppet-ansible-saltstack-or-cloudformation-7989dad2865c 62 | 63 | # makefile 64 | 
https://diamantidis.github.io/tips/2020/07/01/list-makefile-targets 65 | `make -pRrq`, that is a very useful command to debug your makefiles, especially in a big project. The option `-p` prints the make data-base, -R and -r removes the implicit rules and variables, and -q indicates only asking make a question about current state, avoid executing anything 66 | 67 | # copy cd-dvd 68 | https://unix.stackexchange.com/questions/224277/is-it-better-to-use-cat-dd-pv-or-another-procedure-to-copy-a-cd-dvd/224314#224314 69 | 70 | # Serving up zero-knowledge proofs 71 | https://blog.trailofbits.com/2021/02/19/serving-up-zero-knowledge-proofs 72 | 73 | # windows 74 | https://arstechnica.com/gadgets/2021/01/dosbox-pure-for-retroarch-aims-to-simplify-classic-ms-dos-gaming 75 | 76 | # discord exporter 77 | https://github.com/Tyrrrz/DiscordChatExporter 78 | 79 | # build your own amazing illustrations 80 | https://iradesign.io 81 | 82 | # service mesh 83 | https://linkerd.io 84 | 85 | --- 86 | 87 | # torrent 88 | https://en.m.wikibooks.org/wiki/The_World_of_Peer-to-Peer_(P2P)/Networks_and_Protocols/BitTorrent 89 | https://github.com/arvidn/libtorrent 90 | https://github.com/johang/btfs 91 | https://blog.libtorrent.org/2020/09/bittorrent-v2 92 | # Kademlia: A Design Specification 93 | http://xlattice.sourceforge.net/components/protocol/kademlia/specs.html 94 | https://github.com/smmr-software/mabel 95 | 96 | # testing 97 | https://github.com/nakabonne/ali 98 | https://playwright.dev 99 | 100 | # tracing 101 | https://opentracing.io/specification 102 | https://blog.techlanika.com/distributed-tracing-the-why-what-and-how-ab9ca9e40081 103 | 104 | # aws dynamodb 105 | https://github.com/aws-samples/amazon-dynamodb-labs 106 | http://rh-web-bucket.s3.amazonaws.com/index.html 107 | https://aws.amazon.com/blogs/database/how-to-determine-if-amazon-dynamodb-is-appropriate-for-your-needs-and-then-plan-your-migration 108 | https://aws.amazon.com/blogs/database/amazon-dynamodb-auto-scaling-performance-and-cost-optimization-at-any-scale 109 | https://tech.nextroll.com/blog/dev/2019/02/05/dynamodb-managed-autoscaling.html 110 | 111 | # aws batch 112 | https://towardsdatascience.com/get-your-own-data-building-a-scalable-web-scraper-with-aws-654feb9fdad7 113 | 114 | # crypto/mininig 115 | https://monokh.com/posts/bitcoin-from-scratch-part-1 116 | https://github.com/smartcontracts/eth2-book 117 | https://cardano.org 118 | https://nano.org 119 | https://www.chia.net 120 | https://www.anchorage.com 121 | 122 | # lambda 123 | https://www.serverless.com 124 | https://www.openfaas.com 125 | https://dev.to/kumo/we-tested-the-best-serverless-monitoring-solutions-so-you-dont-have-to-121m 126 | https://jvns.ca/blog/2021/01/23/firecracker--start-a-vm-in-less-than-a-second 127 | https://www.talhoffman.com/2021/07/18/firecracker-internals 128 | -------------------------------------------------------------------------------- /docs/operating-system.md: -------------------------------------------------------------------------------- 1 | # Operating System 2 | 3 | ## Courses 4 | 5 | * [6.033: Computer System Engineering MIT](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-033-computer-system-engineering-spring-2018) 6 | * [6.S081: Operating System Engineering](https://pdos.csail.mit.edu/6.S081/2021/schedule.html) 7 | - [Xv6: A simple Unix-like teaching operating system](https://pdos.csail.mit.edu/6.828/2020/xv6.html) 8 | * [CS 377: Operating 
Systems](https://m.youtube.com/playlist?list=PLacuG5pysFbDQU8kKxbUh4K5c1iL5_k7k) (youtube) 9 | * [Operating System](https://m.youtube.com/playlist?list=PLBlnK6fEyqRiVhbXDGLXDk_OQAeuVcp2O) (youtube) 10 | * [CS 422/522: Design and Implementation of Operating Systems](https://flint.cs.yale.edu/cs422/index.html) 11 | * [CS 3210: Build an operating system in Rust programming language on Raspberry Pi 3](https://tc.gts3.org/cs3210/2020/spring/lab.html) 12 | * [ITSC 3181: Introduction to Computer Architecture](https://passlab.github.io/ITSC3181) 13 | * [LFD103: A Beginner's Guide to Linux Kernel Development](https://training.linuxfoundation.org/training/a-beginners-guide-to-linux-kernel-development-lfd103) 14 | 15 | ## Books 16 | 17 | * Operating System Concepts (10th) [ [book](https://www.os-book.com/OS10/index.html) | [slides](https://codex.cs.yale.edu/avi/courses/CS-423/slides/index.html) ] 18 | * [Lion's Commentary on UNIX with Source Code](https://www.bookdepository.com/Lion-s-Commentary-on-UNIX-with-Source-Code/9781573980135) 19 | * [UNIX Internals: The New Frontiers](https://www.amazon.co.uk/UNIX-Internals-Frontiers-Uresh-Vahalia/dp/013021034X) 20 | * [Linux From Scratch](https://www.linuxfromscratch.org) (online) 21 | * [The little book about OS development](https://littleosbook.github.io) 22 | * [OS01: Bootstrap yourself to write an OS from scratch](https://tuhdo.github.io/os01) (incomplete) 23 | * [Learning operating system development using Linux kernel and Raspberry Pi](https://s-matyukevich.github.io/raspberry-pi-os) (incomplete) 24 | * [Writing a "bare metal" operating system for Raspberry Pi 4](https://www.rpi4os.com) 25 | * [Writing a simple 16 bit VM in less than 125 lines of C](https://www.andreinc.net/2021/12/01/writing-a-simple-vm-in-less-than-125-lines-of-c) 26 | * [os-tutorial: How to create an OS from scratch](https://github.com/cfenollosa/os-tutorial) 27 | 28 | ## Kernel 29 | 30 | * [The Linux Kernel Archives](https://www.kernel.org/lore.html) 31 | * [OldLinux: Early Linux Kernel Analysis and Comments](http://www.oldlinux.org) 32 | * [Writing Your First Kernel Module](https://scottc130.medium.com/writing-your-first-kernel-module-98ae68edf0e) 33 | * [Biscuit: An OS kernel in a high-level language](https://pdos.csail.mit.edu/projects/biscuit.html) 34 | * [HermiTux: A binary-compatible unikernel](https://ssrg-vt.github.io/hermitux) 35 | * [The big idea around unikernels](https://changelog.com/posts/the-big-idea-around-unikernels) 36 | * [State of the art for Unikernels](https://github.com/seeker89/unikernels) 37 | * [Tiny Core Linux](http://www.tinycorelinux.net) 38 | 39 | ## Boot 40 | 41 | * [Bootloader basics](https://notes.eatonphil.com/bootloader-basics.html) 42 | * [Interactive x86 bootloader](https://blog.benjojo.co.uk/post/interactive-x86-bootloader-tutorial) 43 | * [A set of minimal dependency bootstrap binaries](https://github.com/oriansj/stage0) 44 | * [Writing an x86 bootloader in Rust that can launch vmlinux](https://vmm.dev/en/rust/krabs.md) 45 | 46 | ## Rust 47 | 48 | * [Writing an OS in Rust](https://os.phil-opp.com) 49 | * [Rust OS comparison](https://github.com/flosse/rust-os-comparison) 50 | * [Kerla: A new operating system kernel with Linux binary compatibility written in Rust](https://github.com/nuta/kerla) 51 | * [Redox: Redox is a Unix-like Operating System written in Rust](https://www.redox-os.org) 52 | * [Operating System development tutorials in Rust on the Raspberry Pi](https://github.com/rust-embedded/rust-raspberrypi-OS-tutorials) 53 | * [CrabOS: My 
hobby operating system written in Rust](https://github.com/haileys/crabos) (hobby) 54 | * [SnakeOS: Bootable x86 snake game in rust](https://github.com/trusch/snakeos) (hobby) 55 | 56 | ## Hobby 57 | 58 | * [SerenityOS](https://github.com/SerenityOS/serenity) 59 | * [Chicago95](https://github.com/grassmunk/Chicago95) 60 | * [ToaruOS](https://github.com/klange/toaruos) 61 | * [MenuetOS](https://www.menuetos.net) 62 | * [oasis](https://github.com/oasislinux/oasis) 63 | 64 | ## Alternative 65 | 66 | * [Nanos](https://nanos.org) 67 | * [Qubes OS](https://www.qubes-os.org) 68 | 69 | ## Random 70 | 71 | * [Awesome Operating System Stuff](https://github.com/jubalh/awesome-os) 72 | * [QEMU: A generic and open source machine emulator and virtualizer](https://www.qemu.org) 73 | * [Hypervisor From Scratch](https://rayanfam.com/topics/hypervisor-from-scratch-part-1) 74 | * [SCAMP CPU](https://github.com/jes/scamp-cpu) - A homebrew 16-bit CPU with a homebrew Unix-like-ish operating system 75 | * [Virtual Hackintosh](https://github.com/kholia/OSX-KVM) 76 | * [How To Write a Computer Emulator](https://fms.komkon.org/EMUL8/HOWTO.html) 77 | * [Linux x86 Program Start Up](http://dbp-consulting.com/tutorials/debugging/linuxProgramStartup.html) 78 | * [Floppinux: An Embedded Linux on a Single Floppy](https://bits.p1x.in/floppinux-an-embedded-linux-on-a-single-floppy) 79 | 80 |
81 | -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | > **Docker** is an open platform for developers and sysadmins to build, ship, and run distributed applications 4 | 5 | Resources 6 | 7 | * [Documentation](https://docs.docker.com) 8 | 9 | * [Docker in Action](https://amzn.to/2MxbJTt) (2016) by Jeff Nickoloff (Book) 10 | 11 | ## How-To 12 | 13 | Setup 14 | ```bash 15 | # install docker 16 | curl -fsSL get.docker.com -o get-docker.sh && \ 17 | chmod u+x $_ && \ 18 | ./$_ && \ 19 | sudo usermod -aG docker docker 20 | 21 | docker --version 22 | 23 | # install docker-compose 24 | sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` \ 25 | -o /usr/local/bin/docker-compose && \ 26 | sudo chmod +x /usr/local/bin/docker-compose 27 | 28 | docker-compose --version 29 | 30 | # install docker-machine (VirtualBox required) 31 | curl -L https://github.com/docker/machine/releases/download/v0.13.0/docker-machine-`uname -s`-`uname -m` >/tmp/docker-machine && \ 32 | sudo install /tmp/docker-machine /usr/local/bin/docker-machine 33 | 34 | docker-machine --version 35 | ``` 36 | 37 | Useful commands 38 | ```bash 39 | # list images 40 | docker images 41 | # list containers 42 | docker ps -a 43 | # list volumes 44 | docker volume ls 45 | 46 | # run temporary container 47 | docker run --rm --name phusion phusion/baseimage:latest 48 | # access container from another shell 49 | docker exec -it phusion bash 50 | 51 | # remove container by name 52 | docker ps -a -q -f name=CONTAINER_NAME | xargs --no-run-if-empty docker rm -f 53 | # delete dangling images 54 | docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi 55 | # delete dangling volumes 56 | docker volume ls -q -f dangling=true | xargs --no-run-if-empty docker volume rm 57 | ``` 58 | 59 | Docker Machine 60 | ```bash 61 | # create local machine 62 | docker-machine create --driver virtualbox default 63 | 64 | # list 65 | docker-machine ls 66 | docker-machine ls --filter name=default 67 | docker-machine ls --filter state=Running 68 | docker-machine ls --format "{{.Name}}: {{.DriverName}} - {{.State}}" 69 | 70 | # info 71 | docker-machine inspect default 72 | docker-machine inspect --format='{{.Driver.IPAddress}}' default 73 | docker-machine status default 74 | docker-machine ip default 75 | 76 | # management 77 | docker-machine start default 78 | docker-machine stop default 79 | docker-machine restart default 80 | docker-machine rm default 81 | 82 | # mount volume 83 | #https://docs.docker.com/machine/reference/mount 84 | 85 | # show command to connect to machine 86 | docker-machine env default 87 | # check if variables are set 88 | env | grep DOCKER 89 | 90 | # connect to machine 91 | eval "$(docker-machine env default)" 92 | docker ps -a 93 | 94 | # show command to disconnect from machine 95 | docker-machine env -u 96 | # unset all 97 | eval $(docker-machine env -u) 98 | 99 | # access 100 | docker-machine ssh default 101 | # execute command and exit 102 | docker-machine ssh default uptime 103 | # copy files from host to guest 104 | docker-machine scp -r /FROM default:/TO 105 | 106 | # start nginx on default machine 107 | docker run -d -p 8000:80 nginx 108 | # verify from host 109 | curl $(docker-machine ip default):8000 110 | # forward to port 8080 111 | docker-machine ssh default -L 8080:localhost:8000 112 | # verify tunnel from host 113 
| curl localhost:8080 114 | 115 | # disable error crash reporting 116 | mkdir -p ~/.docker/machine && touch ~/.docker/machine/no-error-report 117 | ``` 118 | 119 | ## Base image 120 | 121 | * [Supervisor](http://supervisord.org) 122 | 123 | Build `devops/base` image 124 | ```bash 125 | # change path 126 | cd devops/base 127 | 128 | # build image 129 | docker build -t devops/base . 130 | 131 | # temporary container 132 | docker run --rm --name devops-base devops/base 133 | # access container 134 | docker exec -it devops-base bash 135 | 136 | # configurations 137 | /etc/supervisor/conf.d 138 | 139 | # supervisor actions 140 | supervisorctl status 141 | supervisorctl start SERVICE_NAME 142 | supervisorctl stop SERVICE_NAME 143 | ``` 144 | 145 | ## Docker Hub 146 | 147 | * [niqdev/phusion-base](https://hub.docker.com/r/niqdev/phusion-base) 148 | * [niqdev/zookeeper](https://hub.docker.com/r/niqdev/zookeeper) 149 | * [niqdev/kafka](https://hub.docker.com/r/niqdev/kafka) 150 | 151 | ```bash 152 | docker login 153 | 154 | # phusion-base 155 | # https://github.com/phusion/baseimage-docker 156 | docker build -t devops/base:latest ./base 157 | docker tag devops/base niqdev/phusion-base:latest-amd64 158 | docker tag devops/base niqdev/phusion-base:latest 159 | docker push niqdev/phusion-base:latest-amd64 160 | docker push niqdev/phusion-base:latest 161 | 162 | # zookeeper 163 | docker build -t devops/zookeeper:latest ./zookeeper 164 | docker tag devops/zookeeper niqdev/zookeeper:3.5.5 165 | docker tag devops/zookeeper niqdev/zookeeper 166 | docker push niqdev/zookeeper:3.5.5 167 | docker push niqdev/zookeeper:latest 168 | 169 | # kafka 170 | docker build -t devops/kafka:latest ./kafka 171 | docker tag devops/kafka niqdev/kafka:2.3.0 172 | docker tag devops/kafka niqdev/kafka 173 | docker push niqdev/kafka:2.3.0 174 | docker push niqdev/kafka:latest 175 | 176 | docker-compose -f kafka/docker-compose-hub.yml up 177 | ``` 178 | 179 |
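A quick smoke test of a published image; a minimal sketch that assumes the `niqdev/zookeeper` image exposes the same client port used in the ZooKeeper guide, and the container name `zookeeper-hub` is only an example.
```bash
# pull and run the published image (same name and port mapping used elsewhere in these docs)
docker pull niqdev/zookeeper:latest
docker run --rm -d --name zookeeper-hub -p 12181:2181 niqdev/zookeeper:latest

# the service should answer on the mapped client port, if four-letter words are enabled
echo ruok | nc localhost 12181

# inspect what was published
docker image inspect --format '{{.Id}} {{.Created}}' niqdev/zookeeper:latest
docker history niqdev/zookeeper:latest

# cleanup
docker rm -f zookeeper-hub
```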
180 | -------------------------------------------------------------------------------- /docs/ansible.md: -------------------------------------------------------------------------------- 1 | # Ansible 2 | 3 | > **Ansible** is an open source automation platform that can help with config management, deployment and task automation 4 | 5 | Resources 6 | 7 | * [Documentation](http://docs.ansible.com/ansible/latest/index.html) 8 | 9 | * [Ansible - Up and Running](https://amzn.to/2IDtDSd) (2017) by Lorin Hochstein and Rene Moser (Book) 10 | 11 | * [Tutorial](https://serversforhackers.com/c/an-ansible2-tutorial) 12 | 13 | * [Playbook example](https://gist.github.com/marktheunissen/2979474) 14 | 15 | * [Ansible Tutorial for Beginners: Ultimate Playbook & Examples](https://spacelift.io/blog/ansible-tutorial) 16 | 17 | The following guide explains how to provision Ansible locally and play with it. Checkout the [Vagrantfile](https://github.com/niqdev/devops/blob/master/ansible/Vagrantfile) and the Vagrant [guide](toolbox/#vagrant) for more details. 18 | 19 | ### Setup 20 | 21 | Requirements 22 | 23 | * [Vagrant 2](https://www.vagrantup.com) 24 | * [VirtualBox 5](https://www.virtualbox.org) 25 | 26 | Directory structure 27 | ```bash 28 | tree -a ansible/ 29 | ansible/ 30 | ├── .share 31 | │   ├── node-1 32 | │   ├── node-2 33 | │   ├── node-3 34 | │   └── ssh 35 | │   ├── ansible_rsa 36 | │   └── ansible_rsa.pub 37 | ├── Vagrantfile 38 | ├── data 39 | │   ├── group_vars 40 | │   ├── host_vars 41 | │   ├── hosts 42 | │   ├── roles 43 | │   │   ├── common 44 | │   │   │   ├── defaults 45 | │   │   │   ├── files 46 | │   │   │   ├── handlers 47 | │   │   │   ├── meta 48 | │   │   │   ├── tasks 49 | │   │   │   │   ├── main.yml 50 | │   │   │   │   ├── motd.yml 51 | │   │   │   │   ├── oracle-jdk.yml 52 | │   │   │   │   └── package.yml 53 | │   │   │   ├── templates 54 | │   │   │   │   └── motd 55 | │   │   │   └── vars 56 | │   │   │   └── main.yml 57 | │   │   └── docker 58 | │   │   ├── meta 59 | │   │   │   └── main.yml 60 | │   │   └── tasks 61 | │   │   └── main.yml 62 | │   └── site.yml 63 | ├── destroy_ansible.sh 64 | ├── setup_ansible.sh 65 | └── setup_share.sh 66 | ``` 67 | 68 | The first time *only*, you have to setup the shared folders and generate the ssh key needed by ansible to access all nodes executing 69 | 70 | ```bash 71 | ./setup_share.sh 72 | ``` 73 | 74 | Start the boxes with 75 | ```bash 76 | vagrant up 77 | ``` 78 | *The first time it could take a while* 79 | 80 | Verify status of the boxes with 81 | ```bash 82 | vagrant status 83 | ``` 84 | 85 | Verify access to the boxes with 86 | ```bash 87 | vagrant ssh ansible 88 | vagrant ssh node-1 89 | ``` 90 | 91 | From inside the boxes you should be able to communicate with the others 92 | ```bash 93 | ping ansible.local 94 | ping ip-192-168-100-11.local 95 | ping 192.168.100.12 96 | ``` 97 | 98 | The following paths are shared with the boxes 99 | 100 | * `/vagrant` provision-tool 101 | * `/local` host $HOME 102 | * `/ansible` data *(ansible only)* 103 | * `/data` .share *(node only)* 104 | 105 | Cleanup 106 | ```bash 107 | ./destroy_ansible.sh 108 | ``` 109 | 110 | ## Ad-Hoc Commands 111 | 112 | Access the ansible box with 113 | ```bash 114 | vagrant ssh ansible 115 | ``` 116 | 117 | Below a list of examples 118 | ```bash 119 | 120 | # ping all nodes (default inventory /etc/ansible/hosts) 121 | ansible all -m ping 122 | ansible ansible -m ping 123 | ansible cluster -m ping 124 | 125 | # ping all nodes (specify inventory) 126 | ansible 
all -i "/vagrant/data/hosts" -m ping 127 | 128 | # gathering facts 129 | ansible all -m setup 130 | ansible ansible -m setup 131 | 132 | # specify host and user 133 | ansible ip-192-168-100-11.local -m ping -u vagrant 134 | 135 | # execute command 136 | ansible all -a "/bin/echo hello" 137 | ansible all -a "uptime" 138 | ansible all -a "/bin/date" 139 | # do NOT reboot vagrant through ansible (use vagrant reload) 140 | ansible cluster -a "/sbin/reboot" --become 141 | 142 | # shell module 143 | ansible all -m shell -a "pwd" 144 | # be carefull to quotes 145 | ansible all -m shell -a 'echo $HOME' 146 | 147 | # update && upgrade 148 | ansible all -m apt -a "update_cache=yes upgrade=dist" --become 149 | # restart after upgrade 150 | vagrant reload 151 | # install package 152 | ansible all -m apt -a "name=tree state=present" --become 153 | ``` 154 | 155 | ## Playbooks 156 | 157 | Access the ansible box with 158 | ```bash 159 | vagrant ssh ansible 160 | ``` 161 | 162 | Below a list of examples 163 | 164 | ```bash 165 | # test uptime on all node 166 | ansible-playbook /ansible/site.yml --tags=test --verbose 167 | 168 | # update & upgrade only on cluster nodes 169 | ansible-playbook /ansible/site.yml -t package --skip-tags=oracle-jdk --verbose 170 | 171 | # install oracle-jdk only on cluster nodes 172 | ansible-playbook /ansible/site.yml -t oracle-jdk 173 | 174 | # install all packages on cluster nodes 175 | ansible-playbook /ansible/site.yml -t package --verbose 176 | 177 | # run common task on cluster node 178 | ansible-playbook /ansible/site.yml -t common 179 | 180 | # setup docker 181 | ansible-playbook /ansible/site.yml -t docker 182 | # test docker 183 | vagrant ssh node-1 184 | sudo -i -u docker 185 | docker ps -a 186 | 187 | # custom banner 188 | ansible-playbook /ansible/site.yml -t motd 189 | 190 | # setup all infrastructure at once 191 | ansible-playbook /ansible/site.yml 192 | 193 | # dry run 194 | ansible-playbook -i /ansible/hosts /ansible/site.yml --check --diff 195 | ``` 196 | 197 |
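Before applying a playbook to every node, it can be validated and scoped first; a minimal sketch reusing `/ansible/site.yml` and the node name from the ad-hoc examples.
```bash
# validate the playbook without touching any host
ansible-playbook /ansible/site.yml --syntax-check

# preview what would run
ansible-playbook /ansible/site.yml --list-hosts
ansible-playbook /ansible/site.yml --list-tasks
ansible-playbook /ansible/site.yml --list-tags

# restrict a run to a single node
ansible-playbook /ansible/site.yml -t common --limit ip-192-168-100-11.local
```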
198 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /docs/toolbox.md: -------------------------------------------------------------------------------- 1 | # Toolbox 2 | 3 | ## Vagrant 4 | 5 | > **Vagrant** is a tool for building and managing virtual machine environments in a single workflow 6 | 7 | Resources 8 | 9 | * [Documentation](https://www.vagrantup.com/docs) 10 | * [VirtualBox](https://www.virtualbox.org/wiki/Downloads) 11 | 12 | Setup project creating a Vagrantfile 13 | ```bash 14 | vagrant init 15 | ``` 16 | 17 | Boot and connect to 
the default virtual machine 18 | ```bash 19 | vagrant up 20 | vagrant status 21 | vagrant ssh 22 | ``` 23 | 24 | Useful commands 25 | ```bash 26 | # shut down gracefully 27 | vagrant halt 28 | 29 | # reload (halt + up) + re-provision 30 | vagrant reload --provision 31 | 32 | # update box 33 | vagrant box update 34 | vagrant box list 35 | 36 | # delete virtual machine without prompt 37 | vagrant destroy -f 38 | ``` 39 | 40 |
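A few more commands that may be handy while iterating on the boxes in this repository; the snapshot name is only an example and snapshots depend on provider support (VirtualBox has it).
```bash
# list all Vagrant environments known on this host
vagrant global-status

# print the SSH parameters used by `vagrant ssh` (useful for scp/rsync)
vagrant ssh-config

# re-run only the provisioners on a running box
vagrant provision

# snapshot and restore a box while experimenting
vagrant snapshot save clean-state
vagrant snapshot restore clean-state
```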
41 | 42 | ## MkDocs 43 | 44 | > **MkDocs** is a static site generator 45 | 46 | Resources 47 | 48 | * [Documentation](http://www.mkdocs.org) 49 | 50 | Install 51 | ```bash 52 | pip install mkdocs 53 | sudo -H pip3 install mkdocs 54 | ``` 55 | 56 | Useful commands 57 | ```bash 58 | # setup in current directory 59 | mkdocs new . 60 | 61 | # start dev server with hot reload @ http://127.0.0.1:8000 62 | mkdocs serve 63 | 64 | # build static site 65 | mkdocs build --clean 66 | 67 | # deploy to github 68 | mkdocs gh-deploy 69 | ``` 70 | 71 |
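A couple of extra options that may be useful, assuming a recent MkDocs release.
```bash
# verify the installed version
mkdocs --version

# serve on a different address/port
mkdocs serve --dev-addr 0.0.0.0:8080

# treat warnings (e.g. broken links) as build failures
mkdocs build --clean --strict
```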
72 | 73 | ## Hugo 74 | 75 | > **Hugo** is a static site generator 76 | 77 | * [Documentation](https://gohugo.io/documentation) 78 | 79 | Useful commands 80 | ```bash 81 | # install 82 | snap install hugo 83 | # switch to extended Sass/SCSS version 84 | snap refresh hugo --channel=extended 85 | 86 | # create skeleton 87 | hugo new site docs 88 | # create skeleton in current non-empty folder 89 | hugo new site . --force 90 | 91 | # setup theme 92 | git submodule add https://github.com/alex-shpak/hugo-book themes/book 93 | echo 'theme = "book"' >> config.toml 94 | 95 | # start dev server 96 | hugo server -D 97 | ``` 98 | 99 |
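A possible next step after creating the skeleton, assuming a recent Hugo release; the post name is only an example.
```bash
# verify the installed version
hugo version

# add content using the default archetype
hugo new posts/my-first-post.md

# build the site into ./public (drafts excluded by default)
hugo
# build a minified production site
hugo --minify
```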
100 | 101 | ## SDKMAN! 102 | 103 | > **SDKMAN!** is a tool for managing parallel versions of multiple Software Development Kits on most Unix based systems 104 | 105 | Resources 106 | 107 | * [Documentation](http://sdkman.io) 108 | 109 | Setup 110 | ``` 111 | curl -s "https://get.sdkman.io" | bash 112 | source "$HOME/.sdkman/bin/sdkman-init.sh" 113 | sdk version 114 | ``` 115 | 116 | Gradle 117 | ```bash 118 | # setup 119 | sdk list gradle 120 | sdk install gradle 4.4.1 121 | gradle -version 122 | 123 | # create Gradle project 124 | mkdir -p PROJECT_NAME && cd $_ 125 | gradle init --type java-library 126 | 127 | ./gradlew clean build 128 | ``` 129 | 130 | Scala 131 | ```bash 132 | # setup sbt 133 | sdk list sbt 134 | sdk install sbt 135 | sbt sbtVersion 136 | sbt about 137 | 138 | # setup scala 139 | sdk list scala 140 | sdk install scala 2.11.8 141 | scala -version 142 | 143 | # sample project 144 | sbt new sbt/scala-seed.g8 145 | ``` 146 | 147 |
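Managing installed candidates, assuming the versions installed above.
```bash
# show the versions currently in use
sdk current

# switch version for the current shell only
sdk use gradle 4.4.1

# make a version the default
sdk default scala 2.11.8

# upgrade installed candidates and remove old versions
sdk upgrade
sdk uninstall gradle 4.4.1
```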
148 | 149 | ## Giter8 150 | 151 | > **Giter8** is a command line tool to generate files and directories from templates published on GitHub or any other git repository 152 | 153 | Resources 154 | 155 | * [Documentation](http://www.foundweekends.org/giter8) 156 | * [Templates](https://github.com/foundweekends/giter8/wiki/giter8-templates) 157 | 158 | Setup 159 | ```bash 160 | # install conscript 161 | curl https://raw.githubusercontent.com/foundweekends/conscript/master/setup.sh | sh 162 | source ~/.bashrc 163 | 164 | # install g8 165 | cs foundweekends/giter8 166 | ``` 167 | 168 | Example 169 | ```bash 170 | # interactive 171 | g8 sbt/scala-seed.g8 172 | # non-interactive 173 | g8 sbt/scala-seed.g8 --name=my-new-website 174 | ``` 175 | 176 |
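The seed template produces a plain sbt build, so it can presumably be compiled and tested right away; the directory name assumes the non-interactive example above.
```bash
# enter the generated project and run its tests
cd my-new-website
sbt test
```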
177 | 178 | ## Snap 179 | 180 | Resources 181 | 182 | * [Documentation](https://docs.snapcraft.io) 183 | 184 | Useful commands 185 | ```bash 186 | # search 187 | snap find gimp 188 | 189 | # info 190 | snap info gimp 191 | 192 | # install 193 | snap install gimp 194 | 195 | # list installed app 196 | snap list 197 | 198 | # update all packages 199 | snap refresh 200 | 201 | # remove 202 | snap remove gimp 203 | ``` 204 | 205 |
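A few more commands for day-to-day package management; the channel is only an example.
```bash
# update a single package
snap refresh gimp
# switch channel
snap refresh gimp --channel=edge

# roll back to the previously installed revision
snap revert gimp

# recent changes
snap changes
```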
206 | 207 | ## Python 208 | 209 | Resources 210 | 211 | * [pip](https://pip.pypa.io/en/stable/user_guide) 212 | * [virtualenv](https://virtualenv.pypa.io/en/stable/userguide) 213 | * [What is the difference between virtualenv | pyenv | virtualenvwrapper | venv ?](https://stackoverflow.com/questions/41573587/what-is-the-difference-between-venv-pyvenv-pyenv-virtualenv-virtualenvwrappe/41573588#41573588) 214 | 215 | Setup 216 | ```bash 217 | # search 218 | apt-get update && apt-cache search python | grep python2 219 | 220 | # setup python 221 | apt-get install -y python2.7 222 | apt-get install -y python3 223 | 224 | # install pip + setuptools 225 | curl https://bootstrap.pypa.io/get-pip.py | python2.7 - 226 | curl https://bootstrap.pypa.io/get-pip.py | python3 - 227 | apt install -y python-pip 228 | apt install -y python3-pip 229 | 230 | # upgrade pip 231 | pip install -U pip 232 | 233 | # install virtualenv globally 234 | pip install virtualenv 235 | ``` 236 | 237 | virtualenv 238 | ```bash 239 | # create virtualenv 240 | virtualenv venv 241 | virtualenv -p python3 venv 242 | virtualenv -p $(which python3) venv 243 | 244 | # activate virtualenv 245 | source venv/bin/activate 246 | 247 | # verify virtualenv 248 | which python 249 | python --version 250 | 251 | # deactivate virtualenv 252 | deactivate 253 | ``` 254 | 255 | pip 256 | ```bash 257 | # search package 258 | pip search 259 | 260 | # install new package 261 | pip install 262 | 263 | # update requirements with new packages 264 | pip freeze > requirements.txt 265 | 266 | # install all requirements 267 | pip install -r requirements.txt 268 | ``` 269 | 270 | Other 271 | ```bash 272 | # generate rc file 273 | pylint --generate-rcfile > .pylintrc 274 | 275 | # create module 276 | touch app/{__init__,main}.py 277 | ``` 278 | 279 |
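On Python 3 the standard library `venv` module can often replace the external virtualenv package; a minimal sketch.
```bash
# create and activate a virtualenv with the built-in venv module (Python 3 only)
python3 -m venv venv
source venv/bin/activate

# inspect installed packages
pip list
pip list --outdated

# remove the environment by deleting its folder
deactivate
rm -r venv
```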
280 | 281 | ## Git 282 | 283 | Resources 284 | 285 | * [git - the simple guide](https://rogerdudler.github.io/git-guide) 286 | * [git notes (1)](https://github.com/niqdev/git-notes/blob/master/git-real-1.md) 287 | * [git notes (2)](https://github.com/niqdev/git-notes/blob/master/git-real-2.md) 288 | 289 | Other 290 | 291 | * [Oh Shit, Git!?!](https://ohshitgit.com) 292 | * [Using Askgit](https://willschenk.com/articles/2020/using_askgit) 293 | * [`git filter-repo` is a versatile tool for rewriting history](https://github.com/newren/git-filter-repo) 294 | * [Merkle Tree](https://brilliant.org/wiki/merkle-tree) 295 | * [The Myers diff algorithm](https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1) 296 | 297 |
298 | 299 | ## Mercurial 300 | 301 | Resources 302 | 303 | * [A Guide to Branching in Mercurial](http://stevelosh.com/blog/2009/08/a-guide-to-branching-in-mercurial) 304 | 305 | ```bash 306 | # changes since last commit 307 | hg st 308 | 309 | # verify current branch 310 | hg branch 311 | 312 | # lists all branches 313 | hg branches 314 | 315 | # checkout default branch 316 | hg up default 317 | 318 | # pull latest changes 319 | hg pull -u 320 | 321 | # create new branch 322 | hg branch "branch-name" 323 | 324 | # track new file 325 | hg add . 326 | 327 | # track new files and untrack removed files 328 | hg addremove 329 | 330 | # commit all tracked files 331 | hg commit -m "my-comment" 332 | 333 | # commit specific files 334 | hg commit FILE_1 FILE_2 -m "my-comment" 335 | 336 | # commit and track/untrack files (i.e. addremove) 337 | hg commit -A -m "my-comment-with-addremove" 338 | 339 | # rename last unpushed commit message 340 | hg commit -m "bad-commit-message" 341 | hg commit --amend -m "good-commit-message" 342 | 343 | # discard untracked files 344 | hg purge 345 | 346 | # discard uncommitted local changes 347 | hg up -C 348 | 349 | # discard local uncommitted branch 350 | hg strip "branch-name" 351 | 352 | # push commits in all branches 353 | hg push 354 | 355 | # push commits in current branch 356 | hg push -b . 357 | 358 | # create a new branch and push commits in current branch (first time only) 359 | hg push -b . --new-branch 360 | 361 | # lists unpushed commit 362 | hg outgoing 363 | 364 | # change head to specific revision 365 | hg up -r 12345 366 | 367 | # merge default branch on current branch 368 | hg up default 369 | hg pull -u 370 | hg status 371 | hg up CURRENT-BRANCH 372 | hg merge default 373 | hg diff 374 | 375 | # remove all resolved conflicts 376 | rm **/*.orig 377 | 378 | # list stashes 379 | hg shelve --list 380 | 381 | # stash 382 | hg shelve -n "my-draft" 383 | 384 | # unstash 385 | hg unshelve "my-draft" 386 | 387 | # revert/undo last unpushed commit 388 | hg strip -r -1 --keep 389 | hg strip --keep --rev . 390 | 391 | # solve conflicts manually and then mark it as merged 392 | hg resolve -m FILE-NAME 393 | 394 | # lists commits 395 | hg log 396 | hg ls 397 | 398 | # pretty log 399 | hg history --graph --limit 10 400 | ``` 401 | 402 |
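The commands above combined into a rough feature-branch workflow; branch and commit names are only examples.
```bash
# start from an up-to-date default branch
hg up default
hg pull -u

# create a feature branch and commit
hg branch "feature-example"
hg addremove
hg commit -m "add feature"

# push the new branch (first time only)
hg push -b . --new-branch

# merge default back into the feature branch when needed
hg merge default
hg commit -m "merge default"
```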
403 | -------------------------------------------------------------------------------- /docs/cassandra.md: -------------------------------------------------------------------------------- 1 | # Cassandra 2 | 3 | > **Cassandra** is a distributed database for managing large amounts of structured data across many commodity servers, while providing highly available service and no single point of failure 4 | 5 | Resources 6 | 7 | * [Documentation](https://cassandra.apache.org) 8 | 9 | * [Cassandra: The Definitive Guide](https://amzn.to/2KvnEjY) (2016)(4th) by Eben Hewitt, Jeff Carpenter (Book) 10 | 11 | * [A Decentralized Structured Storage System](https://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf) (Paper) 12 | 13 | * [A Big Data Modeling Methodology for Apache Cassandra](http://www.cs.wayne.edu/andrey/papers/TR-BIGDATA-05-2015-CKL.pdf) (Paper) 14 | 15 | * [Facebook’s Cassandra paper](https://docs.datastax.com/en/articles/cassandra/cassandrathenandnow.html) 16 | 17 | * [Cassandra Data Modeling Best Practices](https://www.ebayinc.com/stories/blogs/tech/cassandra-data-modeling-best-practices-part-1) 18 | 19 | * [Difference between partition key, composite key and clustering key](https://stackoverflow.com/questions/24949676/difference-between-partition-key-composite-key-and-clustering-key-in-cassandra) 20 | 21 | * [Cassandra Cluster Manager](https://github.com/riptano/ccm) 22 | 23 | * [Netflix Priam](https://github.com/Netflix/Priam) 24 | 25 | * [cstar_perf](https://www.datastax.com/dev/blog/cassandra-performance-testing-with-cstar_perf) 26 | 27 | * [Amy's Cassandra 2.1 tuning guide](https://tobert.github.io/pages/als-cassandra-21-tuning-guide.html) 28 | 29 | * [Repair in Cassandra](https://www.datastax.com/dev/blog/repair-in-cassandra) 30 | 31 | 32 | 33 | Cassandra uses a tick-tock release model, even-numbered releases are feature releases, while odd-numbered releases are focused on bug fixes 34 | 35 | ## Architecture 36 | 37 | * A **rack** is a logical set of nodes in close proximity to each other 38 | 39 | * A **data center** is a logical set of racks 40 | 41 | * Cassandra uses a **gossip protocol** (called epidemic protocol) that allows each node to keep track of state information about the other nodes in the cluster implementing an algorithm called *Phi Accrual Failure Detection* instead of simple heartbeats 42 | 43 | * The job of a **snitch** is to determine relative host proximity for each node in a cluster, which is used to determine which nodes to read and write from 44 | 45 | * Cassandra represents the data managed by a cluster as a **ring**. Each node in the ring is assigned one or more ranges of data described by a **token**, which determines its position in the ring and is used to identify each partition 46 | 47 | ![cassandra-token-ring](img/cassandra-token-ring.png) 48 | 49 | * **virtual nodes** allow to break a token range and assign multiple tokens to a single physical node 50 | 51 | * A **partitioner** is a hash function for computing the token of a partition key and determines how a (wide) row or partition of data is distributed within the ring 52 | 53 | * The **replication factor** is the number of nodes in a cluster that will receive copies of the same row and the replication strategy is set independently for each keyspace 54 | 55 | * Cassandra provides tuneable **consistency** levels and must be specified on each read or write 56 | 57 | * A client may connect to any node in the cluster, named **coordinator node**, to initiate a read or write query. 
The coordinator identifies which nodes are replicas for the data and forwards the queries to them 58 | 59 | ![cassandra-query](img/cassandra-query.png) 60 | 61 | * When a write operation is performed, it's immediately written to a **commit log** to ensure that data is not lost. It is a crash-recovery mechanism only, clients never read from it 62 | 63 | * After it's written to the commit log, the value is written (already ordered) to a memory-resident data structure called the **memtable**, divided by Column Family (table) 64 | 65 | * When the number of objects stored in the memtable or in the commit log reaches a threshold, the contents of the memtable are flushed (non-blocking operation) to disk in a file called **SSTable** and a new memtable or commit log is then created/recycled 66 | 67 | * No reads or seeks of any kind are required for writing a value to Cassandra because all writes are append operations to immutable SSTables. However, periodic **compaction** operations in Cassandra are performed in order to support fast read performance: the keys are merged, columns are combined, tombstones are discarded, and a new index is created 68 | 69 | * The **key cache** stores a map of partition keys to row index entries, facilitating faster read access into SSTables stored on disk. The key cache is stored on the JVM heap 70 | 71 | * The **row cache** caches entire rows and can greatly speed up read access for frequently accessed rows, at the cost of more memory usage. The row cache is stored in off-heap memory 72 | 73 | * The **counter cache** is used to improve counter performance by reducing lock contention for the most frequently accessed counters 74 | 75 | * In a scenario in which a write request is sent to Cassandra, but a replica node where the write properly belongs is not available due to network partition, hardware failure, or some other reason, Cassandra implements a feature called **hinted handoff** to ensure general availability. The coordinator node will temporarily store the data until it detects that the node is available again 76 | 77 | *Write Path* 78 | ![cassandra-write-path](img/cassandra-write-path.png) 79 | 80 | *Read Path* 81 | ![cassandra-read-path](img/cassandra-read-path.png) 82 | 83 | * To provide *linearizable consistency* e.g. read-before-write, Cassandra supports a **lightweight transaction** or LWT. The implementation is based on *paxos* and is limited to a single partition (see the `cqlsh` sketch at the end of this page) 84 | 85 | * A **tombstone** is a deletion marker that is required to suppress older data in SSTables until compaction or garbage collection runs. Data is not immediately deleted but it's treated as an update operation 86 | 87 | * **Bloom filters** are very fast, non-deterministic algorithms for testing whether an element is a member of a set. They can return a false-positive read, but not a false-negative. When a read is performed, the filter is checked first before accessing disk: if it indicates that the element does not exist in the set, it certainly doesn't; if the filter thinks that the element is in the set, the disk is accessed to make sure 88 | 89 | * *Replica Synchronization (1)* Cassandra reads data from multiple replicas in order to achieve the requested consistency level and detects if any replicas have out of date values.
If an insufficient number of nodes have the latest value, a **read repair** is performed immediately to update the out of date replicas 90 | 91 | * *Replica Synchronization (2)* **Anti-entropy repair** is a manually initiated operation performed on nodes as part of a regular maintenance process executed with *nodetool* causing a *major compaction* during which a node exchange *Merkle trees* with neighboring nodes 92 | 93 | ## Setup 94 | 95 | Single Node Cluster 96 | ```bash 97 | # change path 98 | cd devops/cassandra 99 | 100 | # start single node 101 | docker-compose up 102 | 103 | # paths 104 | /etc/cassandra 105 | /var/lib/cassandra 106 | /var/log/cassandra 107 | 108 | # remove container and volume 109 | docker rm -fv devops-cassandra 110 | ``` 111 | 112 | Multi Node Cluster 113 | ```bash 114 | # change path 115 | cd devops/cassandra 116 | 117 | # start node 118 | docker-compose -f docker-compose-cluster.yml up 119 | 120 | # optional mounted volumes 121 | mkdir -p \ 122 | .cassandra/cassandra-seed/{data,log} \ 123 | .cassandra/cassandra-node-1/{data,log} \ 124 | .cassandra/cassandra-node-2/{data,log} 125 | tree .cassandra/ 126 | 127 | # ISSUES releated to host permissions 128 | # > Small commitlog volume detected at /var/lib/cassandra/commitlog 129 | # > There is insufficient memory for the Java Runtime Environment to continue 130 | (cassandra) /var/lib/cassandra 131 | (root) /var/log/cassandra 132 | ``` 133 | 134 | Access container 135 | ```bash 136 | # access container 137 | docker exec -it devops-cassandra bash 138 | docker exec -it devops-cassandra bash -c cqlsh 139 | docker exec -it devops-cassandra-seed bash 140 | docker exec -it devops-cassandra-node-1 bash 141 | 142 | # execute cql script from host 143 | (docker exec -i devops-cassandra bash \ 144 | -c "cat > example.cql; cqlsh -f example.cql") < cql/example_create.cql 145 | ``` 146 | 147 | ## CQL 148 | 149 | `cqlsh` script [examples](https://github.com/niqdev/devops/tree/master/cassandra/cql) 150 | 151 | ```bash 152 | # connect 153 | cqlsh localhost 9042 154 | cqlsh localhost 9042 -u cassandra -p cassandra 155 | 156 | # execute cql script 157 | cqlsh -f cql/example_create.cql 158 | 159 | # info 160 | SHOW VERSION; 161 | DESCRIBE CLUSTER; 162 | DESCRIBE KEYSPACES; 163 | DESCRIBE KEYSPACE example; 164 | DESCRIBE TABLE example.messages; 165 | 166 | # nice format 167 | EXPAND ON; 168 | # trace query 169 | TRACING ON; 170 | 171 | # bulk loading 172 | COPY example.users TO '/cql/users.csv' WITH HEADER=TRUE; 173 | COPY example.users FROM '/cql/all_users.csv' WITH DELIMITER = ';'; 174 | COPY example.users (first_name,last_name,addresses,emails,enable) FROM '/cql/column_users.csv' WITH HEADER=TRUE; 175 | 176 | # automatic paging 177 | PAGING; 178 | PAGING ON; 179 | PAGING 100; 180 | # limit 181 | SELECT * FROM example.users LIMIT 1; 182 | ``` 183 | 184 | * [Batch](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useBatch.html) 185 | * [User-Defined Type](https://docs.datastax.com/en/dse/6.0/cql/cql/cql_using/useCreateUDT.html) 186 | * [User-Defined Function](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useCreateUDF.html) 187 | * [User-Defined Aggregate Function](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useCreateUDA.html) 188 | 189 | Old `cassandra-cli` deprecated and removed in Cassandra 3.0 190 | 191 | ``` 192 | USE keyspace_name; 193 | LIST table_name; 194 | GET table_name["primary_key"]; 195 | SET table_name["primary_key"]["column_name"]; 196 | ``` 197 | 198 | ## nodetool 199 | 200 | ```bash 201 | # help 202 | 
nodetool 203 | 204 | # cluster information 205 | nodetool describecluster 206 | nodetool status 207 | 208 | # node information 209 | nodetool -h xxx.xxx.xxx.xxx info 210 | nodetool -h xxx.xxx.xxx.xxx statusgossip|statusthrift|statusbinary|statushandoff 211 | nodetool gossipinfo 212 | 213 | # ring information 214 | nodetool ring 215 | nodetool describering KEYSPACE 216 | 217 | # monitor network 218 | nodetool netstats 219 | 220 | # threadpool statistics 221 | nodetool tpstats 222 | 223 | # keyspace statistics 224 | nodetool tablestats KEYSPACE 225 | 226 | # dynamic logging via JMX 227 | nodetool getlogginglevels 228 | 229 | # force a flush of data from memtables to SSTables 230 | nodetool flush 231 | 232 | # gracefully shut down 233 | nodetool drain 234 | 235 | # discards any data that is no longer owned by the node 236 | # e.g. after changing replication factor or token range 237 | nodetool cleanup 238 | 239 | # anti-entropy repair or manual repair: reconcile data exchanging Merkle trees among nodes 240 | # maintenance: incremental parallel repair on the primary token range (run on each node) 241 | nodetool repair -pr 242 | 243 | # create snapshot 244 | nodetool snapshot 245 | nodetool listsnapshots 246 | 247 | # restore snapshot (create schema or truncate table before) 248 | # 1) same cluster and configuration 249 | # copy SSTable ".db" files into the data directory and on the running node execute refresh 250 | nodetool refresh 251 | # 2) different configuration (e.g. topology, token ranges, or replication) 252 | sstableloader 253 | 254 | # stress tool 255 | cassandra-stress write n=1000000 256 | cassandra-stress read n=200000 257 | ``` 258 | 259 |
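To tie together the replication factor, tunable consistency and lightweight transactions described above, a minimal `cqlsh` sketch; the `accounts` table and its values are placeholders, only the `example` keyspace name matches the scripts above
```bash
# inside cqlsh: keyspace with an explicit replication factor
CREATE KEYSPACE IF NOT EXISTS example
  WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};

# tunable consistency: set the level used by subsequent queries in this session
CONSISTENCY QUORUM;
CONSISTENCY;

# lightweight transaction (paxos based, limited to a single partition)
CREATE TABLE IF NOT EXISTS example.accounts (username text PRIMARY KEY, email text);
INSERT INTO example.accounts (username, email) VALUES ('alice', 'alice@example.com') IF NOT EXISTS;
UPDATE example.accounts SET email = 'alice@other.com' WHERE username = 'alice' IF email = 'alice@example.com';
```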
260 | -------------------------------------------------------------------------------- /docs/system-design.md: -------------------------------------------------------------------------------- 1 | # System Design 2 | 3 | ## Books 4 | 5 | * [Designing Data-Intensive Applications](https://amzn.to/2lKJMvU) (2017) by Martin Kleppmann 6 | * [Domain-Driven Design: Tackling Complexity in the Heart of Software](https://amzn.to/2VTvGYS) (2003) by Eric Evans 7 | * [Functional and Reactive Domain Modeling](https://www.manning.com/books/functional-and-reactive-domain-modeling) (2016) by Debasish Ghosh 8 | * [Versioning in an Event Sourced System](https://leanpub.com/esversioning/read) 9 | * [Exploring CQRS and Event Sourcing](https://docs.microsoft.com/en-us/previous-versions/msp-n-p/jj554200(v%3dpandp.10)) 10 | * [Database Internals - A Deep Dive into How Distributed Data Systems Work](https://www.databass.dev) 11 | * [The Architecture of Open Source Applications](http://aosabook.org/en/index.html) (free) 12 | 13 | ## Resources 14 | 15 | * [6.824 Distributed Systems MIT](https://www.youtube.com/playlist?list=PLrw6a1wE39_tb2fErI4-WkMbsvGQk9_UB) (course) 16 | * [Distributed Systems lecture series](https://www.youtube.com/playlist?list=PLeKd45zvjcDFUEv_ohr_HdUFe97RItdiB) by Martin Kleppmann (course) 17 | * [Software Architecture Monday](https://www.youtube.com/playlist?list=PLdsOZAx8I5umhnn5LLTNJbFgwA3xbycar) (videos) 18 | * [CQRS](https://www.martinfowler.com/bliki/CQRS.html) by Martin Fowler 19 | * [Clarified CQRS](http://udidahan.com/2009/12/09/clarified-cqrs) 20 | * [1 Year of Event Sourcing and CQRS](https://hackernoon.com/1-year-of-event-sourcing-and-cqrs-fb9033ccd1c6) 21 | * [Eventually Consistent - Revisited](https://www.allthingsdistributed.com/2008/12/eventually_consistent.html) 22 | * [How do CRDTs solve distributed data consistency challenges?](https://ably.com/blog/crdts-distributed-data-consistency-challenges) 23 | * [Are CRDTs suitable for shared editing?](https://blog.kevinjahns.de/are-crdts-suitable-for-shared-editing) 24 | * [On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton_html) 25 | * [There is No Now](https://queue.acm.org/detail.cfm?id=2745385) 26 | * [Online Event Processing](https://queue.acm.org/detail.cfm?id=3321612) 27 | * [The world beyond batch: Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) 28 | * [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture) 29 | * [The Difference between SLI, SLO, and SLA](https://enqueuezero.com/the-difference-between-sli-slo-and-sla.html) 30 | * [A review of consensus protocols](https://thomasvilhena.com/2020/10/a-review-of-consensus-protocols) 31 | * [How you could have come up with Paxos yourself](https://explain.yshui.dev/distributed%20system/2020/09/20/paxos.html) 32 | * [Implementing Raft's Leader Election in Rust](https://blog.laurocaetano.com/programming/2021/01/23/raft-leader-election-rust) 33 | * [Consensus Protocol](https://www.consul.io/docs/architecture/consensus) 34 | * [Implementing Raft for Browsers with Rust and WebRTC](https://eevans.co/blog/wraft) 35 | * [HTTP Feeds](https://www.http-feeds.org) 36 | * [Autopilot Pattern Applications](http://autopilotpattern.io) 37 | * [REST Hooks](https://resthooks.org) 38 | 39 | ## Blogs 40 | 41 | * [Jepsen](https://aphyr.com/tags/Jepsen) 42 | * [The Paper Trail](https://www.the-paper-trail.org) 43 | * [High 
Scalability](http://highscalability.com) 44 | * [InfoQ: Architecture & Design Content](https://www.infoq.com/architecture-design/presentations) 45 | 46 | ## CAP 47 | 48 | * [Brewer's CAP Theorem](http://www.julianbrowne.com/article/brewers-cap-theorem) 49 | * [CAP Twelve Years Later: How the "Rules" Have Changed](https://www.infoq.com/articles/cap-twelve-years-later-how-the-rules-have-changed) 50 | * [Please stop calling databases CP or AP](https://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html) 51 | * [The CAP FAQ](https://www.the-paper-trail.org/page/cap-faq) 52 | * [You Can't Sacrifice Partition Tolerance](https://codahale.com/you-cant-sacrifice-partition-tolerance) 53 | 54 | ## Papers 55 | 56 | * [Foundational distributed systems papers](https://muratbuffalo.blogspot.com/2021/02/foundational-distributed-systems-papers.html) (collection) 57 | * [Distributed Systems Reading List](https://dancres.github.io/Pages) (collection) 58 | * [Best Paper Awards in Computer Science](https://jeffhuang.com/best_paper_awards) (collection) 59 | * [Ask HN: Recommended books and papers on distributed systems?](https://news.ycombinator.com/item?id=25987664) (collection) 60 | * [The Google File System](https://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf) 61 | * [MapReduce: Simplified Data Processing on Large Clusters](https://static.googleusercontent.com/media/research.google.com/en//archive/mapreduce-osdi04.pdf) 62 | * [Raft: In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) 63 | * [Paxos Made Simple](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) 64 | * [Zab: A simple totally ordered broadcast protocol](http://diyhpl.us/~bryan/papers2/distributed/distributed-systems/zab.totally-ordered-broadcast-protocol.2008.pdf) 65 | * [The Chubby lock service for loosely-coupled distributed systems](https://static.googleusercontent.com/media/research.google.com/en//archive/chubby-osdi06.pdf) 66 | * [Spanner: Google's Globally-Distributed Database](https://static.googleusercontent.com/media/research.google.com/en//archive/spanner-osdi2012.pdf) 67 | * [Dynamo: Amazon’s Highly Available Key-value Store](https://s3.amazonaws.com/AllThingsDistributed/sosp/amazon-dynamo-sosp2007.pdf) 68 | * [HyperLogLog in Practice](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf) 69 | * [Dapper, a Large-Scale Distributed Systems Tracing Infrastructure](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36356.pdf) 70 | * [Large-scale cluster management at Google with Borg](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43438.pdf) 71 | * [Linearizability: A Correctness Condition for Concurrent Objects](https://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf) 72 | * [Harvest, Yield, and Scalable Tolerant Systems](https://s3.amazonaws.com/systemsandpapers/papers/FOX_Brewer_99-Harvest_Yield_and_Scalable_Tolerant_Systems.pdf) 73 | * [Life beyond Distributed Transactions](https://web.archive.org/web/20210303104924/https://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf) (webarchive) 74 | * [The ϕ Accrual Failure Detector](https://web.archive.org/web/20170517022242/http://fubica.lsd.ufcg.edu.br/hp/cursos/cfsc/papers/hayashibara04theaccrual.pdf) (webarchive) 75 | * [Conflict-free Replicated Data Types](https://hal.inria.fr/inria-00609399v1/document) 76 | * [FLP - Impossibility of Distributed Consensus 
with One Faulty Process](https://web.archive.org/web/20210211213256/http://macs.citadel.edu/rudolphg/csci604/ImpossibilityofConsensus.pdf) (webarchive) 77 | * [SEDA: An Architecture for Well-Conditioned, Scalable Internet Services](http://nms.lcs.mit.edu/~kandula/projects/killbots/killbots_files/seda-sosp01.pdf) 78 | * [Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf) 79 | * [Hashed and Hierarchical Timing Wheels](http://www.cs.columbia.edu/~nahum/w6998/papers/sosp87-timing-wheels.pdf) 80 | * [Merkle Hash Tree based Techniques for Data Integrity of Outsourced Data](http://ceur-ws.org/Vol-1366/paper13.pdf) 81 | * [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf) 82 | * [Fallacies of Distributed Computing Explained](https://web.archive.org/web/20201108163119/http://www.rgoarchitects.com/Files/fallacies.pdf) (webarchive) 83 | * [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 84 | * [A Dataset of Dockerfiles](https://arxiv.org/pdf/2003.12912.pdf) 85 | 86 |
87 | 88 | 160 | -------------------------------------------------------------------------------- /docs/kafka.md: -------------------------------------------------------------------------------- 1 | # Kafka 2 | 3 | > **Kafka** is a distributed streaming platform 4 | 5 | Resources 6 | 7 | * [Documentation](https://kafka.apache.org) 8 | 9 | * [Kafka: The Definitive Guide](https://amzn.to/2tQCryv) (2017) by Gwen Shapira, Neha Narkhede, Todd Palino (Book) 10 | 11 | * [Kafka Streams in Action](https://www.manning.com/books/kafka-streams-in-action) (2018) by William P. Bejeck Jr. (Book) 12 | 13 | * [Kafka: a Distributed Messaging System for Log Processing](http://notes.stephenholiday.com/Kafka.pdf) (Paper) 14 | 15 | * [The Internals of Kafka Streams](https://jaceklaskowski.gitbooks.io/mastering-kafka-streams) (Book) 16 | 17 | * [Gently down the stream](https://www.gentlydownthe.stream) (Kid's Book) 18 | 19 | * [Schema Registry](https://docs.confluent.io/current/schema-registry/docs/index.html) 20 | 21 | * [KafkaProducer javadocs](https://kafka.apache.org/20/javadoc/index.html?org/apache/kafka/clients/producer/KafkaProducer.html) 22 | 23 | * [KafkaConsumer javadocs](https://kafka.apache.org/20/javadoc/index.html?org/apache/kafka/clients/consumer/KafkaConsumer.html) 24 | 25 | * [Reactive Kafka](https://doc.akka.io/docs/akka-stream-kafka/current/home.html) 26 | 27 | ## Architecture 28 | 29 | * Kafka is a publish/subscribe messaging system often described as a *distributed commit log* or *distributing streaming platform* 30 | 31 | * The unit of data is called a **message**, which is simply an array of bytes and it can have a **key** used to assign partitions. A **batch** is a collection of messages, all of which are being produced to the same topic and partition 32 | 33 | * Messages are categorized into **topics** which are additionally broken down into a number of **partitions**. Each partition is splitted into **segments** for storage purposes and each segment is stored in a single data file which contains messages and their offsets 34 | 35 | * Messages are written in an append-only fashion and are read in order from beginning to end. As a topic typically has multiple partitions, there is no guarantee of message time-ordering across the entire topic, just within a single partition 36 | 37 | * In order to help brokers quickly locate the message for a given offset, Kafka maintains an **index** for each partition. The index maps offsets to segment files and positions within the file 38 | 39 | * A **stream** is considered to be a single topic of data, regardless of the number of partitions 40 | 41 | * **Producers**, publishers or writers, create new messages to a specific topic. By default, the producer does not care what partition a specific message is written to and will balance messages over all partitions of a topic evenly 42 | 43 | ![kafka-producer](img/kafka-producer.png) 44 | 45 | * **Consumers**, subscribers or readers, read messages. The consumer subscribes to one or more topics and reads the messages in the order in which they were produced. The consumer keeps track of which messages it has already consumed by keeping track of the **offset** of messages i.e. an integer value that continually 46 | increases. Each message in a given partition has a unique offset stored either in Zookeeper or in Kafka itself 47 | 48 | ![kafka-consumer](img/kafka-consumer.png) 49 | 50 | * Consumers work as part of a **consumer group**, which is one or more consumers that work together to consume a topic. 
The group assures that each partition is only consumed by one member. The mapping of a consumer to a partition is often called **ownership** of the partition by the consumer 51 | 52 | * When a new consumer is added to a group, or when a consumer shuts down or crashes leaving the group, it causes a reassignment of partitions to other consumers. Moving partition ownership from one consumer to another is called a **rebalance**, which provides high availability and scalability 53 | 54 | * Consumers maintain membership in a consumer group and ownership of the partitions assigned to them by sending **heartbeats** to a Kafka broker designated as the **group coordinator** 55 | 56 | * You can't have multiple consumers that belong to the same group in one thread and you can't have multiple threads safely use the same consumer 57 | 58 | ![kafka-consumer-group](img/kafka-consumer-group.png) 59 | 60 | * Consumers must keep polling or they will be considered dead and the partitions they are consuming will be handed to another consumer in the group to continue consuming. Consumers **commit** (track) their offset (position) in each partition to a special `__consumer_offsets` topic. If a consumer crashes or a new consumer joins the consumer group, this will trigger a rebalance. After a rebalance, each consumer may be assigned a different set of partitions than the one it processed before. In order to know where to pick up the work, the consumer will read the latest committed offset of each partition and continue from there 61 | 62 | ![kafka-rebalance-duplicate](img/kafka-rebalance-duplicate.png) 63 | ![kafka-rebalance-lost](img/kafka-rebalance-lost.png) 64 | 65 | * A single Kafka server is called a **broker**. The broker receives messages from producers, assigns offsets to them, and commits the messages to storage on disk. It also services consumers, responding to fetch requests for partitions with the messages that have been committed to disk 66 | 67 | * Kafka brokers are designed to operate as part of a **cluster**. A partition is owned by a single broker in the cluster and that broker is called the **leader** of the partition. A partition may be assigned to multiple brokers, which will result in the partition being replicated. All events are produced to and consumed from the *leader* replica. Other *follower* replicas just need to stay **in-sync** with the leader and replicate all the recent events on time 68 | 69 | * Kafka uses **Zookeeper** to maintain the list of brokers that are currently members of a cluster. Every time a broker process starts, it registers itself with a unique identifier by creating an [ephemeral node](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#Ephemeral+Nodes). Kafka uses Zookeeper's ephemeral node feature to elect a **controller**. The controller is responsible for electing leaders among the partitions and replicas whenever it notices that nodes join or leave the cluster 70 | 71 | ![kafka-cluster](img/kafka-cluster.png) 72 | 73 | * Data in Kafka is organized by topics. Each topic is partitioned and each partition can have multiple **replicas**. Those replicas are stored on brokers and each broker stores replicas belonging to different topics and partitions 74 | 75 | * A key feature is that of **retention**. Brokers are configured with a default retention setting for topics, either retaining messages for some period of *time* or until the topic reaches a certain *size* in bytes.
Once these limits are reached, messages are expired and deleted 76 | 77 | * **MirrorMaker** is a tool used to coordinate multiple clusters or datacenters and replicate data between them 78 | 79 | ## Details 80 | 81 | * The underlying technology of a Kafka topic is a **log**, which is a file: an append-only, totally ordered sequence of records ordered by time. Topics in Kafka are logs that are segregated by topic name 82 | 83 | * The configuration setting `log.dir` specifies where Kafka stores log data and each topic maps to a subdirectory. There will be as many subdirectories as there are topic partitions, with a format of `partition-name_partition-number`. Once the log files reach a certain size (either a number of records or size on disk), or when a configured time difference between message timestamps is reached, the log file is **rolled** and Kafka appends incoming messages to a new log 84 | 85 | * To manage the increasing size of the logs, Kafka rolls them into **segments**. The timing of log rolling is based on *timestamps* embedded in the messages. Kafka rolls a log when a new message arrives and its timestamp is greater than the timestamp of the first message in the log plus the `log.roll.ms` setting. At that point, the log is rolled and a new segment is created as the new active log. The previous active segment is still used to retrieve messages for consumers. Over time, the number of segments will continue to grow, and older segments will need to be deleted to make room for incoming data. To handle the deletion, you can specify how long to retain the segments by `log.retention` configurations 86 | 87 | * **Log compaction** ensures that Kafka will always retain at least the last known value for each message key within the log of data for a single topic partition. Instead of taking a coarse-grained approach and deleting entire segments based on time or size, compaction is more fine-grained and deletes old records per key in a log. A log cleaner (a pool of threads) runs in the background, recopying log-segment files and removing records if there's an occurrence later in the log with the same key. To use compaction for a topic, set the `log.cleanup.policy=compact` property when creating the topic (see the console sketch at the end of this page). With a compacted topic, deletion provides a `null` value for the given key, setting a tombstone marker 88 | 89 | * **Partitions** guarantee that data with the same keys will be sent to the same consumer and in order. Partitioning a topic essentially splits the data forwarded to a topic across parallel streams, and it's key for performance and high throughput. Each message has an **offset** number assigned to it. The order of messages across partitions isn't guaranteed, but the order of messages within each partition is guaranteed 90 | 91 | * Kafka works with data in key/value pairs. If the keys are `null`, the Kafka producer will write records to partitions chosen in a round-robin fashion, otherwise Kafka uses the formula `partition = hashCode(key) % numberOfPartitions` to determine which partition to send the key/value pair to. By using a deterministic approach to select a partition, records with the same key will always be sent to the same partition and in order 92 | 93 | * To determine the correct number of partitions, one of the key considerations is the amount of data flowing into a given topic. More data implies more partitions for higher throughput. On the other hand, increasing the number of partitions increases the number of TCP connections and open file handles.
Additionally, how long it takes to process an incoming record in a consumer will also determine throughput. If there is heavyweight processing in a consumer, adding more partitions may help, but ultimately the slower processing will hinder performance 94 | 95 | * Kafka has the notion of **leader** and **follower** brokers. In Kafka, for each topic partition, one broker is chosen as the leader for the other brokers (the followers). One of the chief duties of the leader is to assign [replication](http://kafka.apache.org/documentation/#design_replicatedlog) of topic partitions to the follower brokers. When producing messages, Kafka sends the record to the broker that is the leader for the record's partition. Brokers that follow a topic partition consume messages from the topic-partition leader and append those records to their log 96 | 97 | * Kafka uses *ZooKeeper* to **elect** the **controller** broker of the cluster. If the controlling broker fails or becomes unavailable for any reason, ZooKeeper elects a new controller from a set of brokers that are considered to be caught up with the leader (an in-sync replica **ISR**). The brokers that make up this set are dynamic, and ZooKeeper recognizes only brokers in this set for election as leader. If a Kafka node dies or is unresponsive (to ZooKeeper heartbeats), all of its assigned partitions (both leader and follower) are reassigned by the controller broker 98 | 99 | * Kafka Streams is a library that allows to perform per-event processing of records, without grouping data in microbatches 100 | 101 | * Kafka Streams is a graph (or **topology** or Directed Acyclic Graph) of processing nodes or **processors** that combine to provide powerful and complex stream processing. Each processing node performs its assigned **task** and then forwards the record to each of its child node. Records (a key/value pair) flow through the graph in a depth-first manner, which implies that there is no need to have backpressure 102 | 103 | * Treating an event stream as inserts, and events with keys as updates, is how to defined the relationship between **streams** and **tables**. If a stream of events is as a log, a stream of updates is as a changelog. Both a log and a changelog represent incoming records appended to the end of a file. In a log there are all the records; but in a changelog, there are only the latest record for any given key 104 | 105 | * A `KTable` is often described as being a materialized view of a `KStream`, a view of a stream is nothing but a **per-key aggregation** 106 | 107 | ## Setup 108 | 109 | Requirements 110 | 111 | * [Base](docker/#base-image) docker image 112 | * [ZooKeeper](zookeeper) docker image 113 | 114 | Build `devops/kafka` image 115 | ```bash 116 | # change path 117 | cd devops/kafka 118 | 119 | # build image 120 | docker build -t devops/kafka . 
121 | 122 | # create network 123 | docker network create --driver bridge my_network 124 | docker network ls 125 | docker network inspect my_network 126 | 127 | # start temporary zookeeper container [host:container] 128 | docker run --rm \ 129 | --name zookeeper \ 130 | -p 12181:2181 \ 131 | --network=my_network \ 132 | devops/zookeeper 133 | # access container 134 | docker exec -it zookeeper bash 135 | 136 | # start temporary kafka container [host:container] 137 | docker run --rm \ 138 | --name kafka \ 139 | -p 19092:9092 \ 140 | --network=my_network \ 141 | -e ZOOKEEPER_HOSTS="zookeeper:2181" \ 142 | devops/kafka 143 | # access container 144 | docker exec -it kafka bash 145 | 146 | # paths 147 | /opt/kafka 148 | /opt/kafka/logs 149 | /var/lib/kafka/data 150 | 151 | # supervisor logs 152 | /var/log/kafka 153 | /var/log/connect 154 | tail -F /var/log/kafka/stdout 155 | less +G /var/log/connect/stdout 156 | ``` 157 | 158 | Alternatively use `docker-compose` 159 | ```bash 160 | # change path 161 | cd devops/kafka 162 | 163 | # build base image 164 | docker build -t devops/base ../base 165 | # build + start zookeeper and kafka 166 | docker-compose up 167 | 168 | # access container 169 | docker exec -it devops-zookeeper bash 170 | docker exec -it devops-kafka bash 171 | ``` 172 | 173 | ## How-To 174 | 175 | Kafka 176 | ```bash 177 | docker exec -it devops-kafka bash 178 | 179 | # create topic 180 | kafka-topics.sh --zookeeper zookeeper:2181 \ 181 | --create --if-not-exists --replication-factor 1 --partitions 1 --topic test 182 | 183 | # view topic 184 | kafka-topics.sh --zookeeper zookeeper:2181 --list 185 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --topic test 186 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --under-replicated-partitions 187 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --unavailable-partitions 188 | 189 | # produce 190 | kafka-console-producer.sh --broker-list kafka:9092 --topic test 191 | # util 192 | kafkacat -P -b 0 -t test 193 | 194 | # consume 195 | kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic test --from-beginning 196 | # util 197 | kafkacat -C -b 0 -t test 198 | 199 | # list consumers 200 | kafka-consumer-groups.sh --bootstrap-server kafka:9092 --list 201 | # view lag (GROUP_NAME from previous command) 202 | kafka-consumer-groups.sh --bootstrap-server kafka:9092 --describe --group GROUP_NAME 203 | 204 | # delete 205 | kafka-topics.sh --zookeeper zookeeper:2181 --delete --topic test 206 | 207 | # verify log segment and index 208 | kafka-run-class.sh kafka.tools.DumpLogSegments \ 209 | --files /var/lib/kafka/data/test-0/00000000000000000000.log 210 | kafka-run-class.sh kafka.tools.DumpLogSegments \ 211 | --index-sanity-check \ 212 | --files /var/lib/kafka/data/test-0/00000000000000000000.index 213 | 214 | # inspect __consumer_offsets 215 | kafka-console-consumer.sh --bootstrap-server kafka:9092 \ 216 | --topic __consumer_offsets \ 217 | --formatter "kafka.coordinator.group.GroupMetadataManager\$OffsetsMessageFormatter" \ 218 | --max-messages 1 219 | ``` 220 | 221 | Connect 222 | ```bash 223 | docker exec -it devops-kafka bash 224 | 225 | # verify connect 226 | http :8083 227 | http :8083/connector-plugins 228 | 229 | # write file to topic 230 | http POST :8083/connectors \ 231 | name=load-kafka-config \ 232 | config:='{"connector.class":"FileStreamSource","file":"/opt/kafka/config/server.properties","topic":"kafka-config-topic"}' 233 | 234 | # verify topic 235 | kafka-console-consumer.sh --bootstrap-server=kafka:9092 \ 
236 | --topic kafka-config-topic --from-beginning 237 | 238 | # write topic to file 239 | http POST :8083/connectors \ 240 | name=dump-kafka-config \ 241 | config:='{"connector.class":"FileStreamSink","file":"/tmp/copy-of-server-properties","topics":"kafka-config-topic"}' 242 | 243 | # verify file 244 | vim /tmp/copy-of-server-properties 245 | 246 | # manage connectors 247 | http :8083/connectors 248 | http DELETE :8083/connectors/dump-kafka-config 249 | ``` 250 | 251 | ZooKeeper 252 | ```bash 253 | docker exec -it devops-zookeeper bash 254 | 255 | # start cli 256 | zkCli.sh 257 | 258 | # view ephemeral nodes 259 | ls /brokers/ids 260 | get /brokers/ids/0 261 | 262 | # view topics 263 | ls /brokers/topics 264 | get /brokers/topics/test 265 | ``` 266 | 267 | Schema Registry 268 | ```bash 269 | # docker-hub images 270 | docker-compose -f kafka/docker-compose-hub.yml up 271 | docker exec -it devops-schema-registry bash 272 | 273 | # register new schema 274 | http -v POST :8081/subjects/ExampleSchema/versions \ 275 | Accept:application/vnd.schemaregistry.v1+json \ 276 | schema='{"type":"string"}' 277 | 278 | # list subjects and schema 279 | http -v :8081/subjects \ 280 | Accept:application/vnd.schemaregistry.v1+json 281 | http -v :8081/subjects/ExampleSchema/versions \ 282 | Accept:application/vnd.schemaregistry.v1+json 283 | http -v :8081/subjects/ExampleSchema/versions/1 \ 284 | Accept:application/vnd.schemaregistry.v1+json 285 | 286 | # ui [mac|linux] 287 | [open|xdg-open] http://localhost:8082 288 | ``` 289 | 290 |
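To see the key based partitioning and log compaction described above from the console tools, a hedged sketch that follows the same setup; the `keyed-test` and `compacted-test` topic names are just examples
```bash
docker exec -it devops-kafka bash

# keyed messages: records with the same key always land in the same partition
kafka-topics.sh --zookeeper zookeeper:2181 \
  --create --if-not-exists --replication-factor 1 --partitions 3 --topic keyed-test
kafka-console-producer.sh --broker-list kafka:9092 --topic keyed-test \
  --property parse.key=true --property key.separator=:
kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic keyed-test \
  --from-beginning --property print.key=true

# compacted topic: after cleaning, only the latest record per key is retained
kafka-topics.sh --zookeeper zookeeper:2181 \
  --create --if-not-exists --replication-factor 1 --partitions 1 \
  --topic compacted-test --config cleanup.policy=compact
kafka-topics.sh --zookeeper zookeeper:2181 --describe --topic compacted-test
```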
291 | -------------------------------------------------------------------------------- /docs/hadoop.md: -------------------------------------------------------------------------------- 1 | # Hadoop 2 | 3 | The following guide explains how to provision a Multi Node Hadoop Cluster locally and play with it. Checkout the [Vagrantfile](https://github.com/niqdev/devops/blob/master/hadoop/Vagrantfile) and the Vagrant [guide](other/#vagrant) for more details. 4 | 5 | Resources 6 | 7 | * [Documentation](https://hadoop.apache.org) 8 | 9 | * [Hadoop: The Definitive Guide](https://amzn.to/2Kxc8bg) (2015)(4th) by Tom White (Book) 10 | 11 | * [The Hadoop Ecosystem Table](https://hadoopecosystemtable.github.io) 12 | 13 | * [Hadoop Internals](https://ercoppa.github.io/HadoopInternals) 14 | 15 | ### Setup 16 | 17 | Requirements 18 | 19 | * [Vagrant](https://www.vagrantup.com) 20 | * [VirtualBox](https://www.virtualbox.org) 21 | 22 | Directory structure 23 | ```bash 24 | tree -a hadoop/ 25 | hadoop/ 26 | ├── .data # mounted volume 27 | │   ├── hadoop_rsa 28 | │   ├── hadoop_rsa.pub 29 | │   ├── master 30 | │   │   ├── hadoop 31 | │   │   │   ├── log 32 | │   │   │   │   ├── hadoop 33 | │   │   │   │   ├── mapred 34 | │   │   │   │   └── yarn 35 | │   │   │   ├── namenode 36 | │   │   │   └── secondary 37 | │   │   ├── oozie 38 | │   │   │   ├── data 39 | │   │   │   └── log 40 | │   │   ├── spark 41 | │   │   │   └── log 42 | │   │   └── zeppelin 43 | │   │   ├── log 44 | │   │   └── notebook 45 | │   ├── node-1 46 | │   │   └── hadoop 47 | │   │   ├── datanode 48 | │   │   └── log 49 | │   │      ├── hadoop 50 | │   │      ├── mapred 51 | │   │      └── yarn 52 | │   ├── node-2 53 | │   ├── node-3 54 | ├── example 55 | │   ├── map-reduce 56 | │   └── spark 57 | ├── file 58 | │   ├── hadoop 59 | │   │   ├── config 60 | │   │   │   ├── core-site.xml 61 | │   │   │   ├── fair-scheduler.xml 62 | │   │   │   ├── hdfs-site.xml 63 | │   │   │   ├── mapred-site.xml 64 | │   │   │   ├── masters 65 | │   │   │   ├── slaves 66 | │   │   │   └── yarn-site.xml 67 | │   │   └── profile-hadoop.sh 68 | │   ├── hosts 69 | │   ├── motd 70 | │   ├── oozie 71 | │   │   ├── config 72 | │   │   │   ├── oozie-env.sh 73 | │   │   │   └── oozie-site.xml 74 | │   │   └── profile-oozie.sh 75 | │   ├── spark 76 | │   │   ├── config 77 | │   │   │   ├── log4j.properties 78 | │   │   │   └── spark-env.sh 79 | │   │   └── profile-spark.sh 80 | │   ├── ssh 81 | │   │   └── config 82 | │   └── zeppelin 83 | │   ├── config 84 | │   │   └── zeppelin-env.sh 85 | │   └── profile-zeppelin.sh 86 | ├── script 87 | │   ├── bootstrap.sh 88 | │   ├── setup_hadoop.sh 89 | │   ├── setup_oozie.sh 90 | │   ├── setup_spark.sh 91 | │   ├── setup_ubuntu.sh 92 | │   └── setup_zeppelin.sh 93 | ├── Vagrantfile 94 | └── vagrant_hadoop.sh 95 | ``` 96 | 97 | Import the script 98 | ```bash 99 | source vagrant_hadoop.sh 100 | ``` 101 | 102 | Create and start a Multi Node Hadoop Cluster 103 | ```bash 104 | hadoop-start 105 | ``` 106 | *The first time it might take a while* 107 | 108 | Access the cluster via ssh, check also the [/etc/hosts](https://github.com/niqdev/devops/blob/master/hadoop/file/hosts) file 109 | ```bash 110 | vagrant ssh master 111 | ssh hadoop@172.16.0.10 -i .data/hadoop_rsa 112 | 113 | # 3 nodes 114 | vagrant ssh node-1 115 | ssh hadoop@172.16.0.101 -i .data/hadoop_rsa 116 | ``` 117 | 118 | Destroy the cluster 119 | ```bash 120 | hadoop-destroy 121 | ``` 122 | 123 | For convenience add to the host machine 124 | ```bash 125 | cat 
hadoop/file/hosts | sudo tee --append /etc/hosts 126 | ``` 127 | 128 | Web UI links 129 | 130 | * NameNode: [http://namenode.local:50070](http://172.16.0.10:50070) 131 | * NameNode metrics: [http://namenode.local:50070/jmx](http://172.16.0.10:50070/jmx) 132 | * ResourceManager: [http://resource-manager.local:8088](http://172.16.0.10:8088) 133 | * Log Level: [http://resource-manager.local:8088/logLevel](http://172.16.0.10:8088/logLevel) 134 | * Web Application Proxy Server: [http://web-proxy.local:8100/proxy/application_XXX_0000](http://172.16.0.10:8100/proxy/application_XXX_0000) 135 | * MapReduce Job History Server: [http://history.local:19888](http://172.16.0.10:19888) 136 | * DataNode/NodeManager (1): [http://node-1.local:8042/node](http://172.16.0.101:8042/node) 137 | * DataNode/NodeManager (2): [http://node-2.local:8042/node](http://172.16.0.102:8042/node) 138 | * DataNode/NodeManager (3): [http://node-3.local:8042/node](http://172.16.0.103:8042/node) 139 | * Spark: [http://spark.local:4040](http://172.16.0.10:4040) 140 | * Spark History Server: [http://spark-history.local:18080](http://172.16.0.10:18080) 141 | * Zeppelin (*): [http://zeppelin.local:8080](http://172.16.0.10:8080) 142 | * Oozie (*): [http://oozie.local:11000](http://172.16.0.10:11000) 143 | 144 | *(\*) Not installed by default* 145 | 146 | ## HDFS and MapReduce 147 | 148 | > **HDFS** is a distributed file system that provides high-throughput access to application data 149 | 150 | > **YARN** is a framework for job scheduling and cluster resource management 151 | 152 | > **MapReduce** is a YARN-based system for parallel processing of large data sets 153 | 154 | Documentation 155 | 156 | * [Hadoop v2.7.6](http://hadoop.apache.org/docs/r2.7.6) 157 | * [Untangling Apache Hadoop YARN](http://blog.cloudera.com/blog/2015/09/untangling-apache-hadoop-yarn-part-1/) series 158 | 159 | ### Admin 160 | 161 | HDFS cli 162 | ```bash 163 | # help 164 | hdfs 165 | 166 | # filesystem statistics 167 | hdfs dfsadmin -report 168 | 169 | # filesystem check 170 | hdfs fsck / 171 | ``` 172 | 173 | YARN cli 174 | ```bash 175 | # help 176 | yarn 177 | 178 | # list yarn applications 179 | yarn application -list 180 | 181 | # list nodes 182 | yarn node -list 183 | 184 | # view application logs 185 | yarn logs -applicationId APPLICATION_ID 186 | 187 | # kill yarn application 188 | yarn application -kill APPLICATION_ID 189 | ``` 190 | 191 | Useful paths 192 | ```bash 193 | # data and logs 194 | devops/hadoop/.data/master/hadoop # host 195 | /vol/hadoop # guest 196 | 197 | # (guest) config 198 | /usr/local/hadoop/etc/hadoop 199 | 200 | # (hdfs) map-reduce history 201 | /mr-history/history/done_intermediate/hadoop 202 | 203 | # (hdfs) aggregated app logs 204 | /yarn/app/hadoop/logs/application_XXX 205 | ``` 206 | 207 | ### MapReduce WordCount Job 208 | 209 | ```bash 210 | # build jar on the host machine 211 | cd devops/hadoop/example/map-reduce 212 | ./gradlew clean build 213 | 214 | cd devops/hadoop 215 | vagrant ssh master 216 | 217 | # create base directory using hdfs 218 | hdfs dfs -mkdir -p /user/ubuntu 219 | 220 | # create example directory 221 | hadoop fs -mkdir -p /user/ubuntu/word-count/input 222 | 223 | # list directory 224 | hadoop fs -ls -h -R / 225 | hadoop fs -ls -h -R /user/ubuntu 226 | 227 | # create sample files 228 | echo "Hello World Bye World" > file01 229 | echo "Hello Hadoop Goodbye Hadoop" > file02 230 | 231 | # copy from local to hdfs 232 | hadoop fs -copyFromLocal file01 /user/ubuntu/word-count/input 233 | hadoop fs -put 
file02 /user/ubuntu/word-count/input 234 | 235 | # verify copied files 236 | hadoop fs -ls -h -R /user/ubuntu 237 | hadoop fs -cat /user/ubuntu/word-count/input/file01 238 | hadoop fs -cat /user/ubuntu/word-count/input/file02 239 | hadoop fs -cat /user/ubuntu/word-count/input/* 240 | 241 | # run application 242 | hadoop jar /vagrant/example/map-reduce/build/libs/map-reduce.jar \ 243 | /user/ubuntu/word-count/input \ 244 | /user/ubuntu/word-count/output 245 | 246 | # check output 247 | hadoop fs -cat /user/ubuntu/word-count/output/part-r-00000 248 | 249 | # delete directory to run it again 250 | hadoop fs -rm -R /user/ubuntu/word-count/output 251 | 252 | # run sample job in a different queue 253 | hadoop jar \ 254 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 255 | wordcount \ 256 | -Dmapreduce.job.queuename=root.priority_queue \ 257 | /user/ubuntu/word-count/input \ 258 | /user/ubuntu/word-count/output 259 | 260 | # well known WARN issue 261 | # https://issues.apache.org/jira/browse/HDFS-10429 262 | ``` 263 | 264 | ### Benchmarking MapReduce with TeraSort 265 | 266 | ```bash 267 | # generate random data 268 | hadoop jar \ 269 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 270 | teragen 1000 random-data 271 | 272 | # run terasort benchmark 273 | hadoop jar \ 274 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 275 | terasort random-data sorted-data 276 | 277 | # validate data 278 | hadoop jar \ 279 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 280 | teravalidate sorted-data report 281 | 282 | # useful commands 283 | hadoop fs -ls -h -R . 284 | hadoop fs -rm -r random-data 285 | hadoop fs -cat random-data/part-m-00000 286 | hadoop fs -cat sorted-data/part-r-00000 287 | ``` 288 | 289 |
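### Benchmarking HDFS with TestDFSIO

HDFS read/write throughput can be measured in a similar way with `TestDFSIO`, bundled in the jobclient tests jar; a hedged sketch, where the jar path and file sizes are assumptions to adapt to this cluster
```bash
# locate the tests jar (the version in the file name may differ)
ls $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar

# write benchmark: 4 files of 64MB each
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -write -nrFiles 4 -fileSize 64MB

# read benchmark on the same files
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -read -nrFiles 4 -fileSize 64MB

# throughput and IO rate are appended to a local TestDFSIO_results.log
cat TestDFSIO_results.log

# cleanup benchmark data on hdfs
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -clean
```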
290 | 291 | ## Spark 292 | 293 | > **Spark** is an open-source cluster-computing framework 294 | 295 | Resources 296 | 297 | * [Documentation](https://spark.apache.org/docs/latest) 298 | 299 | * [Spark in Action](https://amzn.to/2MzgHio) (2016) by Petar Zečević and Marko Bonaći (Book) 300 | 301 | * [Big Data Analysis with Scala and Spark](https://www.coursera.org/learn/scala-spark-big-data) (Course) 302 | 303 | * [How-to: Tune Your Apache Spark Jobs](http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-1) series 304 | 305 | * [Understanding Resource Allocation configurations for a Spark application](http://site.clairvoyantsoft.com/understanding-resource-allocation-configurations-spark-application) 306 | 307 | * [Apache Spark: Config Cheatsheet](http://c2fo.io/c2fo/spark/aws/emr/2016/07/06/apache-spark-config-cheatsheet) 308 | 309 | * [Mastering Apache Spark](https://legacy.gitbook.com/book/jaceklaskowski/mastering-apache-spark) 310 | 311 | * [Managing Spark Partitions with Coalesce and Repartition](https://hackernoon.com/managing-spark-partitions-with-coalesce-and-repartition-4050c57ad5c4) 312 | 313 | * [Understanding Apache Spark on YARN](http://sujithjay.com/2018/07/24/Understanding-Apache-Spark-on-YARN/) 314 | 315 | ![spark-architecture](img/spark-architecture.png) 316 | 317 | Spark application on YARN 318 | 319 | ![spark-job](img/spark-job.png) 320 | 321 | ```bash 322 | # start REPL 323 | spark-shell 324 | pyspark 325 | ``` 326 | 327 | ### Interactive Analysis example 328 | 329 | ```bash 330 | spark-shell 331 | # spark shell with yarn 332 | spark-shell --master yarn --deploy-mode client 333 | 334 | # view all configured parameters 335 | sc.getConf.getAll.foreach(x => println(s"${x._1}: ${x._2}")) 336 | 337 | val licenceLines = sc.textFile("file:/usr/local/spark/LICENSE") 338 | val lineCount = licenceLines.count 339 | val isBsd = (line: String) => line.contains("BSD") 340 | val bsdLines = licenceLines.filter(isBsd) 341 | bsdLines.count 342 | bsdLines.foreach(println) 343 | ``` 344 | 345 | ### Spark Job examples 346 | 347 | Example local 348 | ```bash 349 | # run SparkPi example 350 | spark-submit \ 351 | --class org.apache.spark.examples.SparkPi \ 352 | --master local[*] \ 353 | $SPARK_HOME/examples/jars/spark-examples_*.jar 10 354 | 355 | # GitHub event documentation 356 | # https://developer.github.com/v3/activity/events/types 357 | 358 | # build jar on the host machine 359 | cd devops/hadoop/example/spark 360 | sbt clean package 361 | 362 | cd devops/hadoop 363 | vagrant ssh master 364 | 365 | # sample dataset 366 | mkdir -p github-archive && \ 367 | cd $_ && \ 368 | wget http://data.githubarchive.org/2018-01-01-{0..10}.json.gz && \ 369 | gunzip -k * 370 | # sample line 371 | head -n 1 2018-01-01-0.json | jq '.' 372 | 373 | # run local job 374 | spark-submit \ 375 | --class "com.github.niqdev.App" \ 376 | --master local[*] \ 377 | /vagrant/example/spark/target/scala-2.11/spark-github_2.11-0.1.0-SNAPSHOT.jar 378 | ``` 379 | 380 | Example cluster 381 | ```bash 382 | # run job in YARN cluster-deploy mode 383 | spark-submit \ 384 | --class org.apache.spark.examples.SparkPi \ 385 | --master yarn \ 386 | --deploy-mode cluster \ 387 | --driver-memory 2g \ 388 | --executor-memory 1g \ 389 | --executor-cores 3 \ 390 | --queue default \ 391 | $SPARK_HOME/examples/jars/spark-examples*.jar \ 392 | 10 393 | 394 | # --conf "spark.yarn.jars=hdfs://namenode.local:9000/user/spark/share/lib/*.jar" 395 | ``` 396 | 397 |
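The same word-count input uploaded to HDFS in the MapReduce example can be processed interactively from the Spark shell; a minimal sketch, assuming the `fs.defaultFS` address configured for this cluster, with `output-spark` used only as an example path
```bash
# reuse the files created in the MapReduce WordCount example
spark-shell --master yarn --deploy-mode client

val lines = sc.textFile("hdfs://namenode.local:9000/user/ubuntu/word-count/input")
val counts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
counts.collect.foreach(println)
counts.saveAsTextFile("hdfs://namenode.local:9000/user/ubuntu/word-count/output-spark")
```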
398 | 399 | ## Zeppelin 400 | 401 | > **Zeppelin** is a web-based notebook that enables data-driven, interactive data analytics and collaborative documents with SQL, Scala and more 402 | 403 | Resources 404 | 405 | * [Documentation](https://zeppelin.apache.org) 406 | 407 | ### Setup 408 | 409 | Install and start Zeppelin 410 | ```bash 411 | # access master node 412 | vagrant ssh master 413 | 414 | # login as root 415 | sudo su - 416 | 417 | # install and init 418 | /vagrant/script/setup_zeppelin.sh 419 | 420 | # start manually (first time only) 421 | su --login hadoop /vagrant/script/bootstrap.sh zeppelin 422 | ``` 423 | 424 | ### Examples 425 | 426 | * [Learning Spark SQL with Zeppelin](https://hortonworks.com/tutorial/learning-spark-sql-with-zeppelin) 427 | 428 | ``` 429 | # markdown interpreter 430 | %md 431 | hello 432 | 433 | # shell interpreter 434 | %sh 435 | hadoop fs -ls -h -R / 436 | ``` 437 | 438 | Cluster issue: verify to have enough memory with `free -m` e.g. *Error: Cannot allocate memory* 439 | 440 |
441 | 442 | ## Oozie 443 | 444 | > **Oozie** is a workflow scheduler system to manage Hadoop jobs 445 | 446 | Resources 447 | 448 | * [Documentation](https://oozie.apache.org) 449 | 450 | ### Setup 451 | 452 | **Optional PostgreSQL configuration** - By default Oozie is configured to use Embedded Derby 453 | ```bash 454 | # access master node 455 | vagrant ssh master 456 | 457 | # install docker 458 | curl -fsSL get.docker.com -o get-docker.sh && \ 459 | chmod u+x $_ && \ 460 | ./$_ && \ 461 | sudo usermod -aG docker hadoop 462 | 463 | # logout and login again to verify docker installation 464 | exit 465 | vagrant ssh master 466 | whoami # hadoop 467 | docker ps -a 468 | 469 | # uncomment PostgreSQL configurations 470 | vim devops/hadoop/file/oozie/config/oozie-site.xml # from host 471 | vim /vagrant/file/oozie/config/oozie-site.xml # from guest 472 | 473 | # start postgres on guest machine 474 | docker run \ 475 | --detach \ 476 | --name oozie-postgres \ 477 | -p 5432:5432 \ 478 | -e POSTGRES_DB="oozie-db" \ 479 | -e POSTGRES_USER="postgres" \ 480 | -e POSTGRES_PASSWORD="password" \ 481 | postgres 482 | 483 | # permission issue 484 | # https://github.com/docker-library/postgres/issues/116 485 | # --volume /vol/postgres:/var/lib/postgresql/data 486 | 487 | # access container 488 | docker exec -it oozie-postgres bash 489 | psql --username=postgres 490 | # list all databases 491 | \list 492 | \connect oozie-db 493 | # list all tables 494 | \dt 495 | # describe table 496 | \d+ wf_jobs 497 | # list workflow 498 | select * from wf_jobs; 499 | ``` 500 | 501 | Install and start Oozie 502 | ```bash 503 | # access master node 504 | vagrant ssh master 505 | 506 | # login as root 507 | sudo su - 508 | 509 | # build, install and init 510 | /vagrant/script/setup_oozie.sh 511 | 512 | # start oozie manually (first time only) 513 | su --login hadoop /vagrant/script/bootstrap.sh oozie 514 | ``` 515 | *It might take a while to build the sources* 516 | 517 | Useful paths 518 | ```bash 519 | # data and logs 520 | devops/hadoop/.data/master/oozie # host 521 | /vol/oozie # guest 522 | 523 | # (guest) config 524 | /usr/local/oozie/conf 525 | 526 | # (hdfs) examples 527 | /user/hadoop/examples 528 | ``` 529 | 530 | ### Examples 531 | 532 | Run bundled examples within distribution 533 | ```bash 534 | # examples path 535 | .data/master/oozie/examples # host 536 | /vol/oozie/examples # guest 537 | 538 | # access master node as hadoop user 539 | vagrant ssh master 540 | 541 | export OOZIE_EXAMPLE_PATH=/vol/oozie/examples 542 | export OOZIE_HDFS_PATH=/user/$(whoami)/examples 543 | 544 | # open map-reduce job.properties 545 | vim $OOZIE_EXAMPLE_PATH/apps/map-reduce/job.properties 546 | 547 | # edit the following properties 548 | nameNode=hdfs://namenode.local:9000 # fs.defaultFS @ core-site.xml 549 | jobTracker=resource-manager.local:8032 # yarn.resourcemanager.address @ yarn-site.xml 550 | queueName=priority_queue # or default @ fair-scheduler.xml 551 | 552 | # upload all the examples 553 | hadoop fs -put $OOZIE_EXAMPLE_PATH $OOZIE_HDFS_PATH 554 | 555 | # verify uploaded files 556 | hadoop fs -ls -h -R /user/$(whoami) 557 | 558 | # run the map-reduce workflow example 559 | oozie job \ 560 | -oozie http://oozie.local:11000/oozie \ 561 | -config $OOZIE_EXAMPLE_PATH/apps/map-reduce/job.properties \ 562 | -run 563 | 564 | # verify status 565 | oozie job -oozie http://oozie.local:11000/oozie -info WORKFLOW_ID 566 | 567 | # verify result 568 | hadoop fs -cat $OOZIE_HDFS_PATH/output-data/map-reduce/part-00000 569 | 570 | # 
remove all the examples 571 | hadoop fs -rm -R $OOZIE_HDFS_PATH 572 | ``` 573 | 574 | ### Useful commands 575 | 576 | * Workflow requires `oozie.wf.application.path` property 577 | * Coordinator requires `oozie.coord.application.path` property 578 | 579 | ```bash 580 | # verify oozie status 581 | oozie admin \ 582 | -oozie http://oozie.local:11000/oozie \ 583 | -status 584 | 585 | # verify workflow or coordinator status 586 | oozie job \ 587 | -oozie http://oozie.local:11000/oozie \ 588 | -info JOB_ID \ 589 | -verbose 590 | 591 | # poll workflow or coordinator status 592 | oozie job \ 593 | -oozie http://oozie.local:11000/oozie \ 594 | -poll JOB_ID \ 595 | -interval 10 \ 596 | -timeout 60 \ 597 | -verbose 598 | 599 | # find running coordinator 600 | oozie jobs \ 601 | -oozie http://oozie.local:11000/oozie/ \ 602 | -filter status=RUNNING \ 603 | -jobtype coordinator 604 | 605 | # suspend|resume|kill coordinator 606 | oozie job \ 607 | -oozie http://oozie.local:11000/oozie/ \ 608 | [-suspend|-resume|-kill] \ 609 | XXX-C 610 | 611 | # re-run coordinator's workflow (action) 612 | oozie job \ 613 | -oozie http://oozie.local:11000/oozie/ \ 614 | -rerun XXX-C \ 615 | -action 1,2,3,N 616 | 617 | # kill workflow 618 | oozie job \ 619 | -oozie http://oozie.local:11000/oozie/ \ 620 | -kill \ 621 | XXX-W 622 | 623 | # re-run all workflow's actions 624 | oozie job \ 625 | -oozie http://oozie.local:11000/oozie/ \ 626 | -rerun \ 627 | XXX-W \ 628 | -Doozie.wf.rerun.failnodes=false 629 | ``` 630 | 631 |
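Beyond the bundled examples, a custom workflow only needs a `job.properties` pointing at `oozie.wf.application.path`; a hedged sketch reusing the addresses above, where `my-app` and its `workflow.xml` are placeholders
```bash
# minimal job.properties for a custom workflow
cat > job.properties <<'EOF'
nameNode=hdfs://namenode.local:9000
jobTracker=resource-manager.local:8032
queueName=default
oozie.wf.application.path=${nameNode}/user/hadoop/my-app
EOF

# upload the workflow definition to the application path on hdfs
hadoop fs -mkdir -p /user/hadoop/my-app
hadoop fs -put workflow.xml /user/hadoop/my-app/

# submit and run the workflow
oozie job \
  -oozie http://oozie.local:11000/oozie \
  -config job.properties \
  -run
```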
632 | --------------------------------------------------------------------------------