├── ansible ├── data │ ├── group_vars │ │ └── .gitkeep │ ├── host_vars │ │ └── .gitkeep │ ├── roles │ │ ├── common │ │ │ ├── defaults │ │ │ │ └── .gitkeep │ │ │ ├── files │ │ │ │ └── .gitkeep │ │ │ ├── handlers │ │ │ │ └── .gitkeep │ │ │ ├── meta │ │ │ │ └── .gitkeep │ │ │ ├── vars │ │ │ │ └── main.yml │ │ │ ├── templates │ │ │ │ └── motd │ │ │ └── tasks │ │ │ │ ├── main.yml │ │ │ │ ├── motd.yml │ │ │ │ ├── package.yml │ │ │ │ └── oracle-jdk.yml │ │ ├── docker │ │ │ ├── meta │ │ │ │ └── main.yml │ │ │ └── tasks │ │ │ │ └── main.yml │ │ └── schema-registry │ │ │ ├── meta │ │ │ └── main.yml │ │ │ ├── defaults │ │ │ └── main.yml │ │ │ ├── img │ │ │ ├── ansible.png │ │ │ └── draw-io-ansible.xml │ │ │ ├── handlers │ │ │ └── main.yml │ │ │ ├── docker-compose-local.yml │ │ │ ├── files │ │ │ ├── log4j.properties │ │ │ └── schema-registry.properties │ │ │ ├── README.md │ │ │ └── tasks │ │ │ └── main.yml │ ├── hosts │ └── site.yml ├── destroy_ansible.sh ├── setup_share.sh ├── setup_ansible.sh └── Vagrantfile ├── aws └── emr │ ├── application │ ├── api │ │ ├── __init__.py │ │ ├── status_api.py │ │ ├── emr_api.py │ │ └── example_api.py │ ├── service │ │ ├── __init__.py │ │ ├── emr_service.py │ │ └── example_service.py │ ├── templates │ │ ├── page_not_found.html │ │ └── hello.html │ ├── static │ │ └── example.txt │ ├── __init__.py │ ├── configuration.py │ ├── main.py │ └── logger.py │ ├── setup.cfg │ ├── .dockerignore │ ├── MANIFEST.in │ ├── dev.sh │ ├── requirements.txt │ ├── README.md │ ├── Dockerfile │ ├── setup.py │ └── tests │ └── application_test.py ├── hadoop ├── example │ ├── map-reduce │ │ ├── src │ │ │ ├── test │ │ │ │ └── java │ │ │ │ │ └── .gitkeep │ │ │ └── main │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── github │ │ │ │ └── niqdev │ │ │ │ ├── IntSumReducer.java │ │ │ │ ├── TokenizerMapper.java │ │ │ │ └── WordCount.java │ │ ├── settings.gradle │ │ ├── README.md │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ ├── build.gradle │ │ ├── gradlew.bat │ │ └── gradlew │ └── spark │ │ ├── src │ │ ├── test │ │ │ └── scala │ │ │ │ └── .gitkeep │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── niqdev │ │ │ └── App.scala │ │ ├── project │ │ ├── build.properties │ │ └── Dependencies.scala │ │ └── build.sbt ├── file │ ├── hadoop │ │ ├── config │ │ │ ├── masters │ │ │ ├── slaves │ │ │ ├── core-site.xml │ │ │ ├── hdfs-site.xml │ │ │ ├── mapred-site.xml │ │ │ ├── fair-scheduler.xml │ │ │ └── yarn-site.xml │ │ └── profile-hadoop.sh │ ├── oozie │ │ ├── profile-oozie.sh │ │ └── config │ │ │ ├── oozie-env.sh │ │ │ └── oozie-site.xml │ ├── spark │ │ ├── profile-spark.sh │ │ └── config │ │ │ ├── spark-env.sh │ │ │ ├── spark-defaults.conf │ │ │ └── log4j.properties │ ├── ssh │ │ └── config │ ├── zeppelin │ │ ├── profile-zeppelin.sh │ │ └── config │ │ │ └── zeppelin-env.sh │ ├── hosts │ └── motd ├── script │ ├── setup_zeppelin.sh │ ├── bootstrap.sh │ ├── setup_hadoop.sh │ ├── setup_spark.sh │ ├── setup_ubuntu.sh │ └── setup_oozie.sh ├── Vagrantfile └── vagrant_hadoop.sh ├── docs ├── img │ ├── hdfs-read.png │ ├── spark-job.png │ ├── hdfs-write.png │ ├── kafka-topic.png │ ├── kafka-cluster.png │ ├── kafka-consumer.png │ ├── kafka-producer.png │ ├── kubernetes-run.png │ ├── map-reduce-job.png │ ├── yarn-scheduler.png │ ├── cassandra-memory.png │ ├── cassandra-query.png │ ├── kubernetes-rbac.png │ ├── yarn-application.png │ ├── cassandra-read-path.png │ ├── kubernetes-client.png │ ├── kubernetes-cluster.png │ ├── kubernetes-volume.png │ ├── 
spark-architecture.png │ ├── cassandra-token-ring.png │ ├── cassandra-write-path.png │ ├── kafka-consumer-group.png │ ├── kafka-rebalance-lost.png │ ├── kubernetes-deployment.png │ ├── map-reduce-data-flow.png │ ├── kubernetes-architecture.png │ ├── kubernetes-container-api.png │ └── kafka-rebalance-duplicate.png ├── jvm.md ├── scala.md ├── index.md ├── azure.md ├── other-resources.md ├── zookeeper.md ├── programming.md ├── cloud.md ├── operating-system.md ├── docker.md ├── ansible.md ├── toolbox.md ├── cassandra.md ├── system-design.md ├── kafka.md └── hadoop.md ├── .github ├── dependabot.yml └── workflows │ └── gh-pages.yml ├── miscellaneous ├── hello.c └── setup_k8s.sh ├── cassandra ├── docker-compose.yml ├── cql │ ├── all_users.csv │ ├── column_users.csv │ ├── example_create.cql │ └── example_query.cql └── docker-compose-cluster.yml ├── requirements.txt ├── base ├── supervisor.sed └── Dockerfile ├── .gitignore ├── zookeeper ├── supervisor.ini ├── zoo.cfg └── Dockerfile ├── kafka ├── supervisor-connect.ini ├── supervisor-kafka.ini ├── docker-compose.yml ├── Dockerfile └── docker-compose-hub.yml ├── docs-todo ├── _aws.md ├── _neo4j.md └── _spark.md ├── mkdocs.yml ├── README.md └── dev.txt /ansible/data/group_vars/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/host_vars/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/defaults/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/files/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/handlers/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ansible/data/roles/common/meta/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest -------------------------------------------------------------------------------- /aws/emr/.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/test/java/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/hadoop/example/spark/src/test/scala/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aws/emr/application/templates/page_not_found.html: -------------------------------------------------------------------------------- 1 | D'oh! -------------------------------------------------------------------------------- /ansible/data/roles/common/vars/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | apt_cache: 3600 -------------------------------------------------------------------------------- /aws/emr/application/static/example.txt: -------------------------------------------------------------------------------- 1 | example-static-file 2 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/masters: -------------------------------------------------------------------------------- 1 | secondary-namenode.local 2 | -------------------------------------------------------------------------------- /aws/emr/MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft app/templates 2 | graft app/static 3 | -------------------------------------------------------------------------------- /hadoop/example/spark/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.1 2 | -------------------------------------------------------------------------------- /ansible/data/roles/docker/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - common -------------------------------------------------------------------------------- /hadoop/example/map-reduce/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'map-reduce' 2 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/slaves: -------------------------------------------------------------------------------- 1 | node-1.local 2 | node-2.local 3 | node-3.local 4 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dependencies: 3 | - common -------------------------------------------------------------------------------- /docs/img/hdfs-read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/hdfs-read.png -------------------------------------------------------------------------------- /docs/img/spark-job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/spark-job.png -------------------------------------------------------------------------------- /docs/img/hdfs-write.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/hdfs-write.png -------------------------------------------------------------------------------- /docs/img/kafka-topic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-topic.png -------------------------------------------------------------------------------- /docs/img/kafka-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-cluster.png -------------------------------------------------------------------------------- /docs/img/kafka-consumer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-consumer.png -------------------------------------------------------------------------------- /docs/img/kafka-producer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-producer.png -------------------------------------------------------------------------------- /docs/img/kubernetes-run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-run.png -------------------------------------------------------------------------------- /docs/img/map-reduce-job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/map-reduce-job.png -------------------------------------------------------------------------------- /docs/img/yarn-scheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/yarn-scheduler.png -------------------------------------------------------------------------------- /docs/img/cassandra-memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-memory.png -------------------------------------------------------------------------------- /docs/img/cassandra-query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-query.png -------------------------------------------------------------------------------- /docs/img/kubernetes-rbac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-rbac.png -------------------------------------------------------------------------------- /docs/img/yarn-application.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/yarn-application.png -------------------------------------------------------------------------------- /docs/img/cassandra-read-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-read-path.png -------------------------------------------------------------------------------- /docs/img/kubernetes-client.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-client.png -------------------------------------------------------------------------------- /docs/img/kubernetes-cluster.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-cluster.png -------------------------------------------------------------------------------- /docs/img/kubernetes-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-volume.png -------------------------------------------------------------------------------- /docs/img/spark-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/spark-architecture.png -------------------------------------------------------------------------------- /docs/jvm.md: -------------------------------------------------------------------------------- 1 | # JVM 2 | 3 | Moved to scala-fp 4 | -------------------------------------------------------------------------------- /docs/img/cassandra-token-ring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-token-ring.png -------------------------------------------------------------------------------- /docs/img/cassandra-write-path.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/cassandra-write-path.png -------------------------------------------------------------------------------- /docs/img/kafka-consumer-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-consumer-group.png -------------------------------------------------------------------------------- /docs/img/kafka-rebalance-lost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-rebalance-lost.png -------------------------------------------------------------------------------- /docs/img/kubernetes-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-deployment.png -------------------------------------------------------------------------------- /docs/img/map-reduce-data-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/map-reduce-data-flow.png -------------------------------------------------------------------------------- /docs/scala.md: -------------------------------------------------------------------------------- 1 | # Scala 2 | 3 | Moved to scala-fp 4 | -------------------------------------------------------------------------------- /docs/img/kubernetes-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-architecture.png -------------------------------------------------------------------------------- /docs/img/kubernetes-container-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kubernetes-container-api.png -------------------------------------------------------------------------------- 
/docs/img/kafka-rebalance-duplicate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/docs/img/kafka-rebalance-duplicate.png -------------------------------------------------------------------------------- /hadoop/example/map-reduce/README.md: -------------------------------------------------------------------------------- 1 | # map-reduce-example 2 | 3 | ``` 4 | ./gradlew clean build 5 | ./gradlew jar 6 | ``` 7 | -------------------------------------------------------------------------------- /hadoop/file/oozie/profile-oozie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export OOZIE_HOME=/usr/local/oozie 4 | export PATH=${OOZIE_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /hadoop/file/spark/profile-spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export SPARK_HOME=/usr/local/spark 4 | export PATH=${SPARK_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | schema: 3 | registry: 4 | user: cp-schema-registry 5 | group: confluent 6 | -------------------------------------------------------------------------------- /hadoop/file/ssh/config: -------------------------------------------------------------------------------- 1 | Host * 2 | StrictHostKeyChecking no 3 | UserKnownHostsFile=/dev/null 4 | NoHostAuthenticationForLocalhost yes 5 | -------------------------------------------------------------------------------- /ansible/data/roles/common/templates/motd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo '\nHost: {{ ansible_nodename }}' 4 | echo 'Groups: {{ group_names | join(', ') }}' 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/img/ansible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/ansible/data/roles/schema-registry/img/ansible.png -------------------------------------------------------------------------------- /hadoop/file/zeppelin/profile-zeppelin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export ZEPPELIN_HOME=/usr/local/zeppelin 4 | export PATH=${ZEPPELIN_HOME}/bin:${PATH} 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niqdev/devops/HEAD/hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /miscellaneous/hello.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | main() { 4 | 
printf("Hello, World.\n"); 5 | } 6 | 7 | // compile: cc -o hello hello.c 8 | // run: ./hello 9 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - import_tasks: package.yml 4 | tags: 5 | - package 6 | 7 | - import_tasks: motd.yml 8 | tags: 9 | - motd 10 | -------------------------------------------------------------------------------- /hadoop/file/oozie/config/oozie-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export OOZIE_BASE_PATH=/vol/oozie 4 | export OOZIE_DATA=${OOZIE_BASE_PATH}/data 5 | export OOZIE_LOG=${OOZIE_BASE_PATH}/log 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # DevOps 2 | 3 | A collection of notes, resources, documentation and POCs mainly related to distributed systems for local development, learning purposes and quick prototyping. 4 | -------------------------------------------------------------------------------- /aws/emr/application/templates/hello.html: -------------------------------------------------------------------------------- 1 | 2 | Hello from Flask 3 | {% if name %} 4 |

<h1>Hello {{ name }}!</h1> 5 | {% else %} 6 | <h1>Hello, World!</h1>

7 | {% endif %} -------------------------------------------------------------------------------- /hadoop/file/spark/config/spark-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export SPARK_LOG_DIR=/vol/spark/log 4 | # fix warning in spark-shell 5 | export SPARK_LOCAL_IP=$(hostname -i | sed 's/^127.0.0.1 //') 6 | -------------------------------------------------------------------------------- /cassandra/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | cassandra: 4 | container_name: devops-cassandra 5 | image: cassandra:3.11 6 | ports: 7 | - 9042:9042 8 | volumes: 9 | - ./cql:/cql 10 | -------------------------------------------------------------------------------- /aws/emr/dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -fr .eggs/ *.egg-info */__pycache__/ */*/__pycache__/ 4 | 5 | source venv/bin/activate 6 | 7 | pip install -e . 8 | 9 | export FLASK_APP=application 10 | export FLASK_DEBUG=1 11 | flask run 12 | -------------------------------------------------------------------------------- /aws/emr/application/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app = Flask(__name__) 4 | app.config.from_object('application.configuration.Config') 5 | #app.config.from_envvar('APPLICATION_SETTINGS', silent=True) 6 | 7 | import application.main 8 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: restart schema-registry 4 | systemd: 5 | name: "{{ schema.registry.service_name }}" 6 | state: restarted 7 | 8 | - name: reload systemd 9 | command: systemctl daemon-reload 10 | -------------------------------------------------------------------------------- /ansible/data/hosts: -------------------------------------------------------------------------------- 1 | [ansible] 2 | 192.168.100.10 3 | 4 | [cluster] 5 | #ip-192-168-100-11.local 6 | 192.168.100.11 7 | 192.168.100.12 8 | 192.168.100.13 9 | 10 | [docker] 11 | 192.168.100.11 12 | 192.168.100.12 13 | 14 | [schema-registry] 15 | 192.168.100.11 16 | -------------------------------------------------------------------------------- /hadoop/file/zeppelin/config/zeppelin-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ZEPPELLIN_BASE_PATH=/vol/zeppelin 4 | export ZEPPELIN_LOG_DIR=${ZEPPELLIN_BASE_PATH}/log 5 | export ZEPPELIN_NOTEBOOK_DIR=${ZEPPELLIN_BASE_PATH}/notebook 6 | 7 | export ZEPPELIN_MEM="-Xms1024m -Xmx1024m" 8 | -------------------------------------------------------------------------------- /aws/emr/requirements.txt: -------------------------------------------------------------------------------- 1 | astroid==1.6.1 2 | click==6.7 3 | Flask==1.1.1 4 | isort==4.3.4 5 | itsdangerous==0.24 6 | Jinja2==2.10.1 7 | lazy-object-proxy==1.3.1 8 | MarkupSafe==1.0 9 | mccabe==0.6.1 10 | pylint==1.8.2 11 | six==1.11.0 12 | Werkzeug==0.15.3 13 | wrapt==1.10.11 14 | -------------------------------------------------------------------------------- /aws/emr/README.md: -------------------------------------------------------------------------------- 1 | # aws-emr 2 | 3 | ### Development 4 | 5 | ``` 6 | # 
create 7 | virtualenv -p $(which python3) venv 8 | 9 | # activate virtualenv 10 | source venv/bin/activate 11 | 12 | # development script 13 | ./dev.sh 14 | 15 | # deactivate virtualenv 16 | deactivate 17 | ``` 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==7.1.2 2 | future==0.18.2 3 | Jinja2==2.11.3 4 | joblib==0.14.1 5 | livereload==2.6.1 6 | lunr==0.5.6 7 | Markdown==3.2.2 8 | MarkupSafe==1.1.1 9 | mkdocs==1.1 10 | nltk==3.5 11 | PyYAML==5.4 12 | regex==2020.5.7 13 | six==1.14.0 14 | tornado==6.0.4 15 | tqdm==4.46.0 16 | -------------------------------------------------------------------------------- /base/supervisor.sed: -------------------------------------------------------------------------------- 1 | s/logfile=\/tmp\/supervisord.log/logfile=\/var\/log\/supervisord.log/ 2 | s/pidfile=\/tmp\/supervisord.pid/pidfile=\/var\/run\/supervisord.pid/ 3 | s/nodaemon=false/nodaemon=true/ 4 | s/\;\[include\]/\[include\]/ 5 | s/\;files = relative\/directory\/\*.ini/files = \/etc\/supervisor\/conf.d\/\*/ 6 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Jan 23 20:22:38 GMT 2018 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.4.1-all.zip 7 | -------------------------------------------------------------------------------- /docs/azure.md: -------------------------------------------------------------------------------- 1 | # Azure 2 | 3 | * ARM template [documentation](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates) 4 | * Azure Automation [documentation](https://docs.microsoft.com/en-us/azure/automation) 5 | * Azure Security Center [documentation](https://docs.microsoft.com/en-us/azure/security-center) 6 | -------------------------------------------------------------------------------- /aws/emr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | WORKDIR /usr/src 4 | 5 | COPY requirements.txt ./ 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY ./application ./application 9 | 10 | COPY setup.py setup.cfg MANIFEST.in ./ 11 | RUN pip install --editable . 
12 | 13 | CMD [ "python", "./application/main.py" ] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | 4 | */.vagrant 5 | 6 | ansible/.share 7 | ansible/data/site.retry 8 | aws/*/logs/ 9 | cassandra/.cassandra 10 | hadoop/.data/ 11 | 12 | .gradle/ 13 | build/ 14 | 15 | __pycache__ 16 | *.pyc 17 | .pytest_cache/ 18 | venv/ 19 | .eggs/ 20 | *.egg-info 21 | 22 | *.iml 23 | .idea/ 24 | target/ 25 | .vscode/ 26 | *.log 27 | 28 | site 29 | -------------------------------------------------------------------------------- /aws/emr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='aws-emr', 5 | version='0.1', 6 | packages=['application'], 7 | include_package_data=True, 8 | install_requires=[ 9 | 'flask', 10 | ], 11 | setup_requires=[ 12 | 'pytest-runner', 13 | ], 14 | tests_require=[ 15 | 'pytest', 16 | ], 17 | ) 18 | -------------------------------------------------------------------------------- /aws/emr/tests/application_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class ApplicationTestCase(unittest.TestCase): 4 | 5 | def setUp(self): 6 | print('test setUp') 7 | 8 | def tearDown(self): 9 | print('test tearDown') 10 | 11 | def test_example(self): 12 | assert 'aaa' in 'aaa' 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /zookeeper/supervisor.ini: -------------------------------------------------------------------------------- 1 | [program:zookeeper] 2 | command=/opt/zookeeper/bin/zkServer.sh start-foreground 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/zookeeper/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/zookeeper/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /hadoop/example/spark/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies.{V, allDependencies} 2 | 3 | lazy val root = (project in file(".")). 
4 | settings( 5 | inThisBuild(List( 6 | organization := "com.github.niqdev", 7 | scalaVersion := V.scala, 8 | version := "0.1.0-SNAPSHOT" 9 | )), 10 | name := "spark-github", 11 | libraryDependencies ++= allDependencies 12 | ) 13 | -------------------------------------------------------------------------------- /kafka/supervisor-connect.ini: -------------------------------------------------------------------------------- 1 | [program:connect] 2 | command=/opt/kafka/bin/connect-distributed.sh /opt/kafka/config/connect-distributed.properties 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/connect/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/connect/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /hadoop/file/spark/config/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | # TODO spark.yarn.jars hdfs://namenode.local:9000/user/spark/share/lib/*.jar 3 | # TODO spark.yarn.archive hdfs://namenode.local:9000/user/spark/share/spark-archive.zip 4 | 5 | # history server 6 | spark.eventLog.enabled true 7 | spark.eventLog.dir hdfs://namenode.local:9000/user/spark/log 8 | spark.history.fs.logDirectory hdfs://namenode.local:9000/user/spark/log 9 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java-library' 2 | apply plugin: 'application' 3 | 4 | repositories { 5 | jcenter() 6 | } 7 | 8 | mainClassName = "com.github.niqdev.WordCount" 9 | 10 | jar { 11 | manifest { 12 | attributes 'Main-Class': "$mainClassName" 13 | } 14 | } 15 | 16 | dependencies { 17 | compile group: 'org.apache.hadoop', name: 'hadoop-client', version: '2.7.5' 18 | } 19 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/profile-hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HADOOP_HOME=/usr/local/hadoop 4 | export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${PATH} 5 | 6 | export HADOOP_LOG_PATH=/vol/hadoop/log 7 | export HADOOP_LOG_DIR=${HADOOP_LOG_PATH}/hadoop 8 | export YARN_LOG_DIR=${HADOOP_LOG_PATH}/yarn 9 | export HADOOP_MAPRED_LOG_DIR=${HADOOP_LOG_PATH}/mapred 10 | 11 | # required by spark 12 | export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 13 | -------------------------------------------------------------------------------- /kafka/supervisor-kafka.ini: -------------------------------------------------------------------------------- 1 | [program:kafka] 2 | command=/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties --override zookeeper.connect="%(ENV_ZOOKEEPER_HOSTS)s" 3 | redirect_stderr=false 4 | stdout_logfile=/var/log/kafka/stdout 5 | stdout_logfile_maxbytes=0 6 | stderr_logfile=/var/log/kafka/stderr 7 | stderr_logfile_maxbytes=0 8 | stopsignal=INT 9 | numprocs_start=1 10 | startsecs=2 11 | autostart=true 12 | autorestart=true 13 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: github-pages 2 | on: 3 | push: 4 | branches: 5 | - master 6 | 7 | jobs: 8 | build: 9 | name: Deploy docs 
10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout main 13 | uses: actions/checkout@v3.2.0 14 | 15 | - name: Deploy docs 16 | uses: mhausenblas/mkdocs-deploy-gh-pages@nomaterial 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /aws/emr/application/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class DefaultConfig(object): 4 | APP_NAME = 'aws-emr' 5 | LOG_PATH = 'logs/application.log' 6 | ENVIRONMENT = 'DEFAULT' 7 | DEBUG = False 8 | HTTP_HOST = '127.0.0.1' 9 | HTTP_PORT = 5000 10 | 11 | class Config(DefaultConfig): 12 | # docker doesn't forward 127.0.0.1 13 | HTTP_HOST = os.getenv('HTTP_HOST', '0.0.0.0') 14 | HTTP_PORT = int(os.getenv('HTTP_PORT', 5000)) 15 | -------------------------------------------------------------------------------- /aws/emr/application/api/status_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | from flask import jsonify 4 | 5 | @app.route('/status') 6 | def status(): 7 | app.logger.debug('status') 8 | return jsonify({ 9 | 'status': 'OK' 10 | }) 11 | 12 | @app.route('/info') 13 | def info(): 14 | app.logger.debug('info') 15 | return jsonify({ 16 | 'application': app.config['APP_NAME'], 17 | 'env': app.config['ENVIRONMENT'] 18 | }) 19 | -------------------------------------------------------------------------------- /hadoop/file/hosts: -------------------------------------------------------------------------------- 1 | # hadoop hosts 2 | 172.16.0.10 master master.local namenode.local secondary-namenode.local resource-manager.local web-proxy.local history.local 3 | 172.16.0.10 spark.local spark-history.local zeppelin.local postgres.local oozie.local 4 | 172.16.0.101 node-1 node-1.local datanode-1.local node-manager-1.local 5 | 172.16.0.102 node-2 node-2.local datanode-2.local node-manager-2.local 6 | 172.16.0.103 node-3 node-3.local datanode-3.local node-manager-3.local 7 | -------------------------------------------------------------------------------- /miscellaneous/setup_k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 \ 4 | && chmod +x minikube \ 5 | && sudo mv minikube /usr/local/bin/ 6 | 7 | curl -Lo kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \ 8 | && chmod +x kubectl \ 9 | && sudo mv kubectl /usr/local/bin/ 10 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/motd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # custom banner 4 | # https://ownyourbits.com/2017/04/05/customize-your-motd-login-message-in-debian-and-ubuntu/ 5 | 6 | - name: remove help banner from motd 7 | become: yes 8 | file: 9 | path: /etc/update-motd.d/10-help-text 10 | state: absent 11 | 12 | - name: add custom banner to motd 13 | become: yes 14 | template: 15 | src: motd 16 | dest: /etc/update-motd.d/10-custom-text 17 | mode: 0755 18 | -------------------------------------------------------------------------------- /ansible/destroy_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 
| 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] destroy ansible" 12 | 13 | read -p "Are you sure? [y/n]" -n 1 -r 14 | echo 15 | if [[ $REPLY =~ ^[Yy]$ ]] 16 | then 17 | vagrant destroy -f 18 | 19 | rm -frv \ 20 | .vagrant \ 21 | .share 22 | fi 23 | 24 | echo "[-] destroy ansible" 25 | -------------------------------------------------------------------------------- /ansible/data/site.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: test 4 | hosts: all 5 | tasks: 6 | - name: test uptime 7 | shell: uptime 8 | tags: 9 | - test 10 | 11 | - name: common setup 12 | hosts: cluster 13 | roles: 14 | - common 15 | tags: 16 | - common 17 | 18 | - name: docker setup 19 | hosts: docker 20 | roles: 21 | - docker 22 | tags: 23 | - docker 24 | 25 | - name: schema registry setup 26 | hosts: schema-registry 27 | roles: 28 | - schema-registry 29 | tags: 30 | - schema-registry 31 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://namenode.local:9000 7 | 8 | 9 | 10 | 11 | hadoop.proxyuser.hadoop.hosts 12 | * 13 | 14 | 15 | hadoop.proxyuser.hadoop.groups 16 | * 17 | 18 | 19 | -------------------------------------------------------------------------------- /hadoop/file/motd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "\nHostname: \033[1;31m$(hostname -s)\033[0m" 4 | echo "Uptime:$(uptime)\n" 5 | 6 | echo '* master: 172.16.0.10' 7 | echo '* node-1: 172.16.0.101\n' 8 | 9 | echo '* NameNode: http://namenode.local:50070' 10 | echo '* ResourceManager: http://resource-manager.local:8088' 11 | echo '* MapReduce Job History Server: http://history.local:19888' 12 | echo '* DataNode/NodeManager (1): http://node-1.local:8042/node\n' 13 | 14 | echo '* Spark: http://spark.local:4040' 15 | echo '* Zeppelin: http://zeppelin.local:8080' 16 | echo '* Oozie: http://oozie.local:11000' 17 | -------------------------------------------------------------------------------- /ansible/setup_share.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] setup share" 12 | 13 | SHARE_PATH="$CURRENT_PATH/.share" 14 | SSH_PATH="$SHARE_PATH/ssh" 15 | 16 | echo "share path: $SHARE_PATH" 17 | 18 | rm -fr ${SHARE_PATH} 19 | mkdir -p ${SHARE_PATH}/node-{1,2,3} ${SSH_PATH} 20 | 21 | ssh-keygen -t rsa -b 4096 -C "ansible" -N "" -f "$SSH_PATH/ansible_rsa" 22 | 23 | echo "[-] setup share" 24 | -------------------------------------------------------------------------------- /kafka/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | zookeeper: 5 | container_name: devops-zookeeper 6 | build: 7 | context: ../zookeeper 8 | args: 9 | - VERSION=3.4.12 10 | ports: 11 | - 12181:2181 12 | networks: 13 | - devops_network 14 | kafka: 15 | container_name: devops-kafka 16 | build: . 
17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - 19092:9092 21 | networks: 22 | - devops_network 23 | environment: 24 | - ZOOKEEPER_HOSTS="zookeeper:2181" 25 | 26 | networks: 27 | devops_network: 28 | -------------------------------------------------------------------------------- /zookeeper/zoo.cfg: -------------------------------------------------------------------------------- 1 | # http://zookeeper.apache.org/doc/current/zookeeperAdmin.html 2 | 3 | # The number of milliseconds of each tick 4 | tickTime=2000 5 | # The number of ticks that the initial synchronization phase can take 6 | initLimit=10 7 | # The number of ticks that can pass between sending a request and getting an acknowledgement 8 | syncLimit=5 9 | # The directory where the snapshot is stored 10 | dataDir=/var/lib/zookeeper/data 11 | # The port at which the clients will connect 12 | clientPort=2181 13 | # Write the transaction log to the dataLogDir rather than the dataDir 14 | dataLogDir=/var/log/zookeeper 15 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.namenode.name.dir 6 | file:///vol/hadoop/namenode 7 | 8 | 9 | dfs.namenode.checkpoint.dir 10 | file:///vol/hadoop/secondary 11 | 12 | 13 | dfs.datanode.data.dir 14 | file:///vol/hadoop/datanode 15 | 16 | 17 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/package.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: update & upgrade 4 | become: yes 5 | apt: 6 | update_cache: yes 7 | cache_valid_time: "{{ apt_cache }}" 8 | upgrade: dist 9 | 10 | - name: install common packages 11 | become: yes 12 | apt: 13 | name: 14 | - jq 15 | - tree 16 | - httpie 17 | state: present 18 | update_cache: yes 19 | cache_valid_time: "{{ apt_cache }}" 20 | 21 | - import_tasks: oracle-jdk.yml 22 | tags: 23 | - oracle-jdk 24 | 25 | - name: cleanup 26 | become: yes 27 | apt: 28 | autoclean: yes 29 | autoremove: yes 30 | -------------------------------------------------------------------------------- /aws/emr/application/main.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.logger import Logger 3 | 4 | Logger().init() 5 | 6 | # api 7 | import application.api.status_api 8 | import application.api.example_api 9 | import application.api.emr_api 10 | 11 | # if run with cli this is NOT executed 12 | if __name__ == '__main__': 13 | app.logger.info('start application: [{0}] @ {1}:{2} in DEBUG={3}'.format( 14 | app.config['APP_NAME'], app.config['HTTP_HOST'], app.config['HTTP_PORT'], app.config['DEBUG'])) 15 | app.run(host=app.config['HTTP_HOST'], port=app.config['HTTP_PORT'], debug=app.config['DEBUG']) 16 | -------------------------------------------------------------------------------- /hadoop/example/spark/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | 5 | lazy val N = new { 6 | val spark = "org.apache.spark" 7 | } 8 | 9 | lazy val V = new { 10 | val scala = "2.11.12" 11 | 12 | val spark = "2.2.1" 13 | 14 | val scalatest = "3.0.5" 15 | } 16 | 17 | lazy val libDependencies = Seq( 18 | N.spark %% "spark-core" % V.spark % Provided, 19 | N.spark %% "spark-sql" % V.spark % Provided 20 | ) 21 | 22 | lazy 
val testDependencies = Seq( 23 | "org.scalatest" %% "scalatest" % V.scalatest % Test 24 | ) 25 | 26 | lazy val allDependencies = libDependencies ++ testDependencies 27 | 28 | } 29 | -------------------------------------------------------------------------------- /aws/emr/application/service/emr_service.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | class EmrService(object): 4 | 5 | def create_cluster(self): 6 | app.logger.debug('TODO create_cluster') 7 | return { 8 | 'instance_id': 'TODO_INSTANCE_ID' 9 | } 10 | 11 | def destroy_cluster(self): 12 | app.logger.debug('TODO destroy_cluster') 13 | return { 14 | 'instance_id': 'TODO_INSTANCE_ID' 15 | } 16 | 17 | def info_cluster(self): 18 | app.logger.debug('TODO info_cluster') 19 | return { 20 | 'instance_id': 'TODO_INSTANCE_ID', 21 | 'name': 'TODO_NAME' 22 | } 23 | -------------------------------------------------------------------------------- /cassandra/cql/all_users.csv: -------------------------------------------------------------------------------- 1 | firstNameCsvAll1;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv1a@example.com'; 'csv1b@example.com'}";True;;lastNameCsv1;; 2 | firstNameCsvAll2;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv2a@example.com'; 'csv2b@example.com'}";True;;lastNameCsv2;; 3 | firstNameCsvAll3;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv3a@example.com'; 'csv3b@example.com'}";False;;lastNameCsv3;; 4 | firstNameCsvAll4;"{'home': {street: 'street1'; city: 'city1'; state: 'STATE'; zip_code: 12345}}";;"{'csv4a@example.com'; 'csv4b@example.com'}";False;;lastNameCsv4;; 5 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/docker-compose-local.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | zookeeper: 6 | container_name: my-local-zookeeper 7 | image: niqdev/zookeeper:3.4.13 8 | ports: 9 | - 2181:2181 10 | hostname: zookeeper 11 | networks: 12 | - my_local_network 13 | 14 | kafka: 15 | container_name: my-local-kafka 16 | image: niqdev/kafka:2.0.0 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - 9092:9092 21 | - 8083:8083 22 | hostname: kafka 23 | networks: 24 | - my_local_network 25 | environment: 26 | - ZOOKEEPER_HOSTS="zookeeper:2181" 27 | 28 | networks: 29 | my_local_network: 30 | -------------------------------------------------------------------------------- /ansible/setup_ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | echo "[+] setup ansible" 12 | 13 | sudo apt-add-repository ppa:ansible/ansible 14 | sudo apt-get update 15 | 16 | sudo apt-get install -y \ 17 | software-properties-common \ 18 | ansible 19 | 20 | # http://docs.ansible.com/ansible/latest/intro_getting_started.html#host-key-checking 21 | sudo sed -i -r "s/#host_key_checking = False/host_key_checking = False/" /etc/ansible/ansible.cfg 22 | 23 | echo "[-] setup ansible" 24 | -------------------------------------------------------------------------------- /docs-todo/_aws.md: 
-------------------------------------------------------------------------------- 1 | # AWS 2 | 3 | > TODO 4 | 5 | Documentation 6 | 7 | * [Boto 3](https://boto3.readthedocs.io/en/latest/reference/services/index.html) 8 | 9 | ## CLI 10 | 11 | TODO 12 | 13 | ## Setup 14 | 15 | Build `devops/aws-emr` image 16 | ```bash 17 | # change path 18 | cd devops/aws/emr 19 | 20 | # build image 21 | docker build -t devops/aws-emr . 22 | 23 | # start temporary container [port=HOST:CONTAINER] 24 | docker run \ 25 | --rm \ 26 | -e HTTP_PORT=8080 \ 27 | -p 5000:8080 \ 28 | --name aws-emr \ 29 | devops/aws-emr:latest 30 | 31 | # access container 32 | docker exec -it aws-emr bash 33 | ``` 34 | 35 | ### S3 36 | 37 | TODO 38 | 39 | ### EMR 40 | 41 | TODO 42 | -------------------------------------------------------------------------------- /ansible/data/roles/common/tasks/oracle-jdk.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: add java repository 4 | tags: 5 | - oracle-jdk 6 | become: yes 7 | apt_repository: 8 | repo: 'ppa:linuxuprising/java' 9 | state: present 10 | 11 | - name: accept oracle license 12 | tags: 13 | - oracle-jdk 14 | become: yes 15 | debconf: name='oracle-java11-installer' question='shared/accepted-oracle-license-v1-2' value='true' vtype='select' 16 | 17 | - name: install java 18 | tags: 19 | - oracle-jdk 20 | become: yes 21 | apt: 22 | name: "{{ packages }}" 23 | state: latest 24 | vars: 25 | packages: 26 | - oracle-java11-installer 27 | - oracle-java11-set-default 28 | -------------------------------------------------------------------------------- /cassandra/cql/column_users.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,addresses,emails,enable 2 | firstNameCsv1,lastNameCsv1,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv1a@example.com', 'csv1b@example.com'}",True 3 | firstNameCsv2,lastNameCsv2,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv2a@example.com', 'csv2b@example.com'}",True 4 | firstNameCsv3,lastNameCsv3,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv3a@example.com', 'csv3b@example.com'}",False 5 | firstNameCsv4,lastNameCsv4,"{'home': {street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345}}","{'csv4a@example.com', 'csv4b@example.com'}",False 6 | -------------------------------------------------------------------------------- /ansible/data/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: create docker group 4 | become: yes 5 | group: 6 | name: docker 7 | state: present 8 | 9 | - name: create docker user 10 | become: yes 11 | user: 12 | name: docker 13 | shell: /bin/bash 14 | groups: docker,sudo 15 | append: yes 16 | 17 | - name: install docker 18 | become: yes 19 | #become_user: docker 20 | command: 'bash -c "curl -fsSL https://get.docker.com/ | sh"' 21 | 22 | - name: install docker-compose 23 | become: yes 24 | #become_user: docker 25 | get_url: 26 | url: "https://github.com/docker/compose/releases/download/1.22.0/docker-compose-Linux-x86_64" 27 | dest: /usr/local/bin/docker-compose 28 | mode: +x 29 | -------------------------------------------------------------------------------- /zookeeper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM devops/base:latest 2 | #FROM niqdev/phusion-base:latest 
3 | 4 | ARG VERSION=3.5.5 5 | 6 | ENV ZOOKEEPER_HOME "/opt/zookeeper" 7 | ENV PATH "$ZOOKEEPER_HOME/bin:$PATH" 8 | 9 | RUN apt-get install -y \ 10 | telnet \ 11 | netcat && \ 12 | apt-get clean 13 | 14 | RUN curl https://www-eu.apache.org/dist/zookeeper/zookeeper-${VERSION}/apache-zookeeper-${VERSION}-bin.tar.gz | tar -xzf - -C /opt && \ 15 | mv /opt/apache-zookeeper-${VERSION}-bin /opt/zookeeper-${VERSION} && \ 16 | ln -s /opt/zookeeper-${VERSION} /opt/zookeeper && \ 17 | mkdir -p /var/log/zookeeper /var/lib/zookeeper/data 18 | 19 | ADD zoo.cfg /opt/zookeeper/conf/zoo.cfg 20 | ADD supervisor.ini /etc/supervisor/conf.d/zookeeper.conf 21 | -------------------------------------------------------------------------------- /base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM phusion/baseimage:latest-amd64 2 | 3 | RUN apt-get update && apt-get upgrade -y 4 | RUN add-apt-repository ppa:openjdk-r/ppa -y 5 | 6 | RUN apt-get update && apt-get install -y \ 7 | iputils-ping \ 8 | python2.7 \ 9 | python-pip \ 10 | httpie \ 11 | jq \ 12 | openjdk-8-jdk && \ 13 | apt-get clean 14 | 15 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 16 | 17 | RUN pip install --upgrade pip wheel setuptools supervisor 18 | 19 | ADD supervisor.sed /tmp/supervisor.sed 20 | 21 | RUN echo_supervisord_conf > /etc/supervisord.conf && \ 22 | sed -i -r -f /tmp/supervisor.sed /etc/supervisord.conf && \ 23 | mkdir -p /etc/supervisor/conf.d 24 | 25 | CMD ["supervisord", "-c", "/etc/supervisord.conf", "-n"] 26 | -------------------------------------------------------------------------------- /docs-todo/_neo4j.md: -------------------------------------------------------------------------------- 1 | # Neo4j 2 | 3 | > TODO 4 | 5 | * [Graph Databases](TODO) (2015) by Ian Robinson, Jim Webber, and Emil Eifrem (Book) 6 | 7 | Graph databases help leveraging complex and dynamic relationships in highly connected data to generate insight and competitive advantage. Connected data is data whose interpretation and value requires users first to understand the ways in which its constituent elements are related. 8 | 9 | > https://github.com/iansrobinson/graph-databases-use-cases 10 | 11 | **What Is a Graph?** 12 | 13 | A graph is just a collection of vertices and edges or, in different words, a set of nodes and the relationships that connect them. Graphs represent entities as nodes and the ways in which those entities relate to the world as relationships. 
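As a rough sketch of that model (plain Python data structures, not Neo4j or its driver API; the node names, labels and properties below are invented for illustration): nodes hold labels and properties, and each relationship is a typed, directed edge that can carry its own properties.

```python
# Property-graph sketch: entities as nodes, connections as typed relationships.
# Purely illustrative -- node names, labels and properties are made up.
nodes = {
    "alice":  {"labels": ["User"], "props": {"name": "Alice"}},
    "bob":    {"labels": ["User"], "props": {"name": "Bob"}},
    "devops": {"labels": ["Repo"], "props": {"name": "devops"}},
}

# (start, type, end, properties) -- relationships are first-class data, like edges in a graph database
relationships = [
    ("alice", "FOLLOWS", "bob", {}),
    ("alice", "STARRED", "devops", {"since": 2018}),
    ("bob", "CONTRIBUTES_TO", "devops", {"commits": 42}),
]

# answering "who is connected to the devops repo, and how?" is a traversal over edges
for start, rel_type, end, props in relationships:
    if end == "devops":
        print(f"{nodes[start]['props']['name']} -[:{rel_type} {props}]-> devops")
```

In a graph database the same question is expressed as a pattern match over relationships (e.g. Cypher's `MATCH`) rather than an explicit loop or a relational join.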
14 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/IntSumReducer.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | public class IntSumReducer extends Reducer { 10 | 11 | private IntWritable result = new IntWritable(); 12 | 13 | @Override 14 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 15 | int sum = 0; 16 | for (IntWritable value : values) { 17 | sum += value.get(); 18 | } 19 | result.set(sum); 20 | context.write(key, result); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: DevOps 2 | site_author: niqdev 3 | repo_url: https://github.com/niqdev/devops 4 | theme: readthedocs 5 | google_analytics: ['UA-68888222-4', 'niqdev.github.io'] 6 | 7 | nav: 8 | - Linux: linux.md 9 | - Docker: docker.md 10 | - Ansible: ansible.md 11 | - Cassandra: cassandra.md 12 | - ZooKeeper: zookeeper.md 13 | - Kafka: kafka.md 14 | - Hadoop: hadoop.md 15 | - Cloud: cloud.md 16 | - Kubernetes: kubernetes.md 17 | - System Design: system-design.md 18 | - Operating System: operating-system.md 19 | - Programming: programming.md 20 | - Other Resources: other-resources.md 21 | - Toolbox: toolbox.md 22 | - JVM (OLD): jvm.md 23 | - Scala (OLD): scala.md 24 | 25 | # disable search plugin 26 | #plugins: [] 27 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | 9 | 10 | 11 | mapreduce.jobhistory.address 12 | history.local:10020 13 | 14 | 15 | mapreduce.jobhistory.webapp.address 16 | history.local:19888 17 | 18 | 19 | 20 | yarn.app.mapreduce.am.staging-dir 21 | /mr-history 22 | 23 | 24 | -------------------------------------------------------------------------------- /aws/emr/application/logger.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | import os 4 | import logging 5 | from logging.handlers import TimedRotatingFileHandler 6 | 7 | class Logger(object): 8 | 9 | def __init__(self): 10 | self.log_path = app.config['LOG_PATH'] 11 | 12 | def init(self): 13 | # create directory if doesn't exist 14 | os.makedirs(os.path.dirname(self.log_path), exist_ok=True) 15 | 16 | formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(pathname)s:%(lineno)d] %(message)s") 17 | handler = TimedRotatingFileHandler(self.log_path, when='midnight', interval=1, backupCount=5) 18 | handler.setLevel(logging.DEBUG) 19 | handler.setFormatter(formatter) 20 | 21 | app.logger.addHandler(handler) 22 | app.logger.setLevel(logging.DEBUG) 23 | app.logger.debug('init logger') 24 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 60.0 11 | 0.8 12 | 120 13 | 14 | 15 | 40.0 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 
-------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/TokenizerMapper.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | import java.io.IOException; 8 | import java.util.StringTokenizer; 9 | 10 | public class TokenizerMapper extends Mapper { 11 | 12 | private final static IntWritable one = new IntWritable(1); 13 | private Text word = new Text(); 14 | 15 | @Override 16 | protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { 17 | StringTokenizer iterator = new StringTokenizer(value.toString()); 18 | 19 | while (iterator.hasMoreTokens()) { 20 | word.set(iterator.nextToken()); 21 | context.write(word, one); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /kafka/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM devops/base:latest 2 | #FROM niqdev/phusion-base:latest 3 | 4 | ARG SCALA_VERSION=2.12 5 | ARG KAFKA_VERSION=2.3.0 6 | 7 | ENV ZOOKEEPER_HOSTS="localhost:2181" 8 | ENV KAFKA_HOME "/opt/kafka" 9 | ENV PATH "$KAFKA_HOME/bin:$PATH" 10 | 11 | RUN apt-get install -y \ 12 | kafkacat && \ 13 | apt-get clean 14 | 15 | RUN curl https://www-eu.apache.org/dist/kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz | tar -xzf - -C /opt && \ 16 | ln -s /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} /opt/kafka && \ 17 | # bash expansion not working /var/log/{kafka,connect} 18 | mkdir -p /var/log/kafka /var/log/connect 19 | 20 | # update data directory 21 | RUN sed -i -r ' \ 22 | s/log.dirs=\/tmp\/kafka-logs/log.dirs=\/var\/lib\/kafka\/data/; \ 23 | ' /opt/kafka/config/server.properties 24 | 25 | ADD supervisor-kafka.ini /etc/supervisor/conf.d/kafka.conf 26 | ADD supervisor-connect.ini /etc/supervisor/conf.d/connect.conf 27 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/files/log4j.properties: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | log4j.rootLogger=INFO, stdout, file 4 | 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c:%L)%n 8 | 9 | log4j.logger.kafka=ERROR, stdout 10 | log4j.logger.org.apache.zookeeper=ERROR, stdout 11 | log4j.logger.org.apache.kafka=ERROR, stdout 12 | log4j.logger.org.I0Itec.zkclient=ERROR, stdout 13 | log4j.additivity.kafka.server=false 14 | log4j.additivity.kafka.consumer.ZookeeperConsumerConnector=false 15 | 16 | log4j.appender.file=org.apache.log4j.RollingFileAppender 17 | log4j.appender.file.maxBackupIndex=10 18 | log4j.appender.file.maxFileSize=100MB 19 | log4j.appender.file.File=${schema-registry.log.dir}/schema-registry.log 20 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 21 | log4j.appender.file.layout.ConversionPattern=[%d] %p %m (%c)%n 22 | -------------------------------------------------------------------------------- /hadoop/example/spark/src/main/scala/com/github/niqdev/App.scala: -------------------------------------------------------------------------------- 1 | package com.github.niqdev 2 | 3 | import 
org.apache.spark.sql.SparkSession 4 | 5 | object App { 6 | 7 | def main(args: Array[String]): Unit = { 8 | val spark = SparkSession.builder 9 | .appName("spark-github") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | val homeDir = System.getenv("HOME") 16 | val inputPath = s"file:$homeDir/github-archive/*.json" 17 | val outputDir = s"file:$homeDir/github-archive/output" 18 | val githubLog = spark.read.json(inputPath) 19 | val pushes = githubLog.filter("type = 'PushEvent'") 20 | 21 | pushes.printSchema 22 | println(s"all events: ${githubLog.count}") 23 | println(s"only pushes: ${pushes.count}") 24 | pushes.show(5) 25 | 26 | val grouped = pushes.groupBy("actor.login").count 27 | grouped.show(5) 28 | val ordered = grouped.orderBy(grouped("count").desc) 29 | ordered.show(5) 30 | 31 | ordered.write.format("json").save(outputDir) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /aws/emr/application/service/example_service.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | 3 | from datetime import datetime 4 | from flask import jsonify 5 | 6 | class ExampleService(object): 7 | 8 | def get_tasks(self, request): 9 | tasks = [ 10 | { 11 | 'id': 1, 12 | 'title': u'Buy groceries', 13 | 'description': u'Milk, Cheese, Pizza, Fruit, Tylenol', 14 | 'done': False 15 | }, 16 | { 17 | 'id': 2, 18 | 'title': u'Learn Python', 19 | 'description': u'Need to find a good Python tutorial on the web', 20 | 'done': False 21 | } 22 | ] 23 | app.logger.debug(request.method) 24 | app.logger.debug(request.url) 25 | #app.logger.debug('\n'.join('{}: {}'.format(k, v) for k, v in request.headers.items())) 26 | #app.logger.debug(request.body) 27 | return jsonify({ 28 | 'href': request.url, 29 | 'createdAt': datetime.utcnow().isoformat(), 30 | 'modifiedAt': datetime.utcnow().isoformat(), 31 | 'tasks': tasks 32 | }) 33 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/img/draw-io-ansible.xml: -------------------------------------------------------------------------------- 1 | 5ZhBk9ogFMc/TY7bkRCjHqu17aE9eej2yCaYUEmeg7jqfvq+JCSREDs7s1m1rQcH/vCA9wMeb+LRRXb8otg2/Q4xl54/io8e/eT5/nQ2wv9COFXCeDaphESJuJJIK6zECzeisUv2IuY7q6MGkFpsbTGCPOeRtjSmFBzsbmuQ9qxblnBHWEVMuuoPEevUuOVPWv0rF0laz0zCWdXyxKJNomCfm/k8n67LX9WcsXos4+guZTEcziS69OhCAeiqlB0XXBZoa2yV3ecLrc26Fc/1awxqi2cm97xecrkwfaphlO7wwmDk0fkhFZqvtiwqWg+4+6ilOpNYI1g0w3Gl+fHimkjjKR4gDhnX6oRdjMHYsDnZ1UO7EeOp0dKzTQjqjsxsftKM3ALAgmFwAeDk/niQkQ2E+C6RIOwhQskARKaO/zzGu2KqoHQKCeRMLlt1bhM6o8GPQj+elX8WXT6Mi1qO63o0FmWlbfvFtT6ZIMH2GlBq5/0GsLVYF8v7M2n0BvYqMr3MBmumEl6D698PxSXT4tke/S1sZ7dlO7kJ3PGV4JLwPzy5ZHQlur4TJ1m+E08I8ebhktjh0p+44bIJoefhMhwgWlIHS4650cPtH5GGwi2ojPup0LujQnuSjXejUp/Te8o1ukAIdYH4fdlXQ+lNRNxsdBelPGMPiidiV44WSpxx/qSwlOjG6V5k5DrIph1k17xZxI3DMUQbru4QVOB3QtDolaDoEKACB9SGrTfsL+BEQpdTb7o/BCY3VL8A3COk7q27JqSpA0lCxKQDBd3TtucYw2DDFyBBoZJDXiSdayFlR2JSJDlWI8SBl5nOC1gC5/hoGjIRx2XG2ofafjUGoN3NqYKeqxv00B7kUZg5tMvsgfy7uJ0nJXg33FhtvzaVbWdf9OjyNw== -------------------------------------------------------------------------------- /hadoop/file/spark/config/log4j.properties: -------------------------------------------------------------------------------- 1 | # set global logging severity to INFO (and upwards: WARN, ERROR, FATAL) 2 | log4j.rootCategory=INFO, console, file 3 | 4 | # console config (restrict only to ERROR and FATAL) 5 | 
log4j.appender.console=org.apache.log4j.ConsoleAppender 6 | log4j.appender.console.target=System.err 7 | log4j.appender.console.threshold=ERROR 8 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 10 | 11 | # file config 12 | log4j.appender.file=org.apache.log4j.RollingFileAppender 13 | log4j.appender.file.File=/vol/spark/log/info.log 14 | log4j.appender.file.MaxFileSize=5MB 15 | log4j.appender.file.MaxBackupIndex=10 16 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 17 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 18 | 19 | # settings to quiet third party logs that are too verbose 20 | log4j.logger.org.apache.spark.repl.Main=WARN 21 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 22 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 23 | log4j.logger.org.apache.spark=WARN 24 | log4j.logger.org.apache.hadoop=WARN 25 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/src/main/java/com/github/niqdev/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.github.niqdev; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * http://hadoop.apache.org/docs/r2.7.5/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html 15 | */ 16 | public class WordCount { 17 | 18 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 19 | Configuration conf = new Configuration(); 20 | Job job = Job.getInstance(conf, "word-count"); 21 | 22 | job.setJarByClass(WordCount.class); 23 | job.setMapperClass(TokenizerMapper.class); 24 | job.setCombinerClass(IntSumReducer.class); 25 | job.setReducerClass(IntSumReducer.class); 26 | 27 | job.setOutputKeyClass(Text.class); 28 | job.setOutputValueClass(IntWritable.class); 29 | 30 | FileInputFormat.addInputPath(job, new Path(args[0])); 31 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 32 | 33 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /cassandra/cql/example_create.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS example; 2 | 3 | CREATE KEYSPACE IF NOT EXISTS example WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}; 4 | 5 | USE example; 6 | 7 | CREATE TABLE IF NOT EXISTS messages ( 8 | id uuid PRIMARY KEY, 9 | body text, 10 | created_at timestamp, 11 | updated_at timestamp 12 | ); 13 | 14 | ALTER TABLE messages ADD title text; 15 | 16 | CREATE TABLE IF NOT EXISTS example.counters ( 17 | id timeuuid PRIMARY KEY, 18 | total counter 19 | ); 20 | 21 | CREATE TABLE IF NOT EXISTS example.users ( 22 | first_name text PRIMARY KEY, 23 | last_name text, 24 | last_ip inet, 25 | any_value blob, 26 | enable boolean 27 | ); 28 | 29 | ALTER TABLE example.users ADD emails set; 30 | 31 | ALTER TABLE example.users ADD phone_numbers list; 32 | 33 | ALTER TABLE example.users ADD login_sessions map; 34 | 35 | CREATE TYPE example.address ( 36 | street text, 37 | city text, 38 | state text, 39 | zip_code int 40 | ); 41 | 42 | -- frozen: user-defined data type is considered a collection 43 | ALTER TABLE example.users ADD addresses map>; 44 | 45 | -- avoid secondary indexes 46 | CREATE INDEX users_last_name_idx ON example.users ( last_name ); 47 | 48 | CREATE CUSTOM INDEX users_last_name_sasi_idx ON example.users ( last_name ) 49 | USING 'org.apache.cassandra.index.sasi.SASIIndex'; 50 | 51 | CREATE INDEX ON example.users ( emails ); 52 | -------------------------------------------------------------------------------- /kafka/docker-compose-hub.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | zookeeper: 6 | container_name: devops-zookeeper 7 | image: niqdev/zookeeper:latest 8 | ports: 9 | - 2181:2181 10 | hostname: zookeeper 11 | networks: 12 | - devops_network 13 | 14 | kafka: 15 | container_name: devops-kafka 16 | image: niqdev/kafka:latest 17 | ports: 18 | - 9092:9092 19 | - 8083:8083 20 | hostname: kafka 21 | networks: 22 | - devops_network 23 | environment: 24 | - ZOOKEEPER_HOSTS="zookeeper:2181" 25 | 26 | schema-registry: 27 | container_name: devops-schema-registry 28 | image: confluentinc/cp-schema-registry 29 | depends_on: 30 | - kafka 31 | ports: 32 | - 8081:8081 33 | hostname: schema-registry 34 | networks: 35 | - devops_network 36 | environment: 37 | - SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181 38 | - SCHEMA_REGISTRY_HOST_NAME=schema-registry 39 | - SCHEMA_REGISTRY_LISTENERS=http://schema-registry:8081 40 | 41 | schema-registry-ui: 42 | container_name: devops-schema-registry-ui 43 | image: landoop/schema-registry-ui 44 | depends_on: 45 | - schema-registry 46 | ports: 47 | - 8082:8000 48 | hostname: schema-registry-ui 49 | networks: 50 | - devops_network 51 | environment: 52 | - SCHEMAREGISTRY_URL=http://schema-registry:8081 53 | - PROXY=true 54 | 55 | networks: 56 | devops_network: 57 | -------------------------------------------------------------------------------- /cassandra/docker-compose-cluster.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | 3 | services: 4 | 5 | cassandra-seed: 6 | container_name: devops-cassandra-seed 7 | image: cassandra:3.11 8 | restart: unless-stopped 9 | ports: 10 | - 19042:9042 11 | networks: 12 | - network_cluster 13 | volumes: 14 | - ./cql:/cql 15 | #- 
.cassandra/cassandra-seed/data:/var/lib/cassandra 16 | #- .cassandra/cassandra-seed/log:/var/log/cassandra 17 | 18 | cassandra-node-1: 19 | container_name: devops-cassandra-node-1 20 | image: cassandra:3.11 21 | depends_on: 22 | - cassandra-seed 23 | restart: unless-stopped 24 | ports: 25 | - 19043:9042 26 | networks: 27 | - network_cluster 28 | environment: 29 | CASSANDRA_SEEDS: "devops-cassandra-seed" 30 | volumes: 31 | - ./cql:/cql 32 | #- .cassandra/cassandra-node-1/data:/var/lib/cassandra 33 | #- .cassandra/cassandra-node-1/log:/var/log/cassandra 34 | 35 | cassandra-node-2: 36 | container_name: devops-cassandra-node-2 37 | image: cassandra:3.11 38 | depends_on: 39 | - cassandra-seed 40 | restart: unless-stopped 41 | ports: 42 | - 19044:9042 43 | networks: 44 | - network_cluster 45 | environment: 46 | CASSANDRA_SEEDS: "devops-cassandra-seed" 47 | volumes: 48 | - ./cql:/cql 49 | #- .cassandra/cassandra-node-2/data:/var/lib/cassandra 50 | #- .cassandra/cassandra-node-2/log:/var/log/cassandra 51 | 52 | networks: 53 | network_cluster: 54 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/README.md: -------------------------------------------------------------------------------- 1 | # schema-registry 2 | 3 | ```bash 4 | cd ansible 5 | 6 | # setup 7 | ./setup_share.sh 8 | vagrant up 9 | 10 | # setup docker 11 | vagrant ssh ansible 12 | ansible-playbook /ansible/site.yml -t docker 13 | 14 | # (local) copy docker compose manually 15 | cp data/roles/schema-registry/docker-compose-local.yml .share/node-1/docker-compose-local.yml 16 | 17 | vagrant ssh node-1 18 | # update hosts 19 | echo -e "# docker images\n127.0.1.1 zookeeper\n127.0.1.1 kafka\n" | sudo tee -a /etc/hosts 20 | # start docker 21 | sudo -i -u docker 22 | docker-compose -f /data/docker-compose-local.yml up 23 | 24 | # setup schema registry 25 | vagrant ssh ansible 26 | ansible-playbook /ansible/site.yml -t schema-registry 27 | 28 | # verify schema registry 29 | vagrant ssh node-1 30 | sudo systemctl start confluent-schema-registry 31 | sudo systemctl status confluent-schema-registry 32 | sudo journalctl -u confluent-schema-registry -b 33 | sudo journalctl -ru confluent-schema-registry --no-pager 34 | ll /etc/schema-registry/ 35 | ll /var/log/confluent/schema-registry/ 36 | ll /home/cp-schema-registry/logs/ 37 | less +G /var/log/confluent/schema-registry/schema-registry.log 38 | tail -F /var/log/confluent/schema-registry/schema-registry.log 39 | 40 | # (local) examples 41 | http -v 192.168.100.11:8081/subjects 42 | 43 | # check running services 44 | sudo netstat -ltp 45 | 46 | # check user 47 | ps -ef | grep schema 48 | cat /etc/passwd 49 | 50 | # verify zookeeper 51 | docker exec -it my-local-zookeeper bash 52 | zkCli.sh 53 | get /brokers/ids/0 54 | ``` 55 | -------------------------------------------------------------------------------- /aws/emr/application/api/emr_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.service.emr_service import EmrService 3 | 4 | import json 5 | from datetime import datetime 6 | from flask import jsonify, request 7 | 8 | emr_service = EmrService() 9 | 10 | # TODO env|region|config-name (yaml) 11 | 12 | @app.route('/v1/emr/clusters/create', methods=['POST']) 13 | def route_clusters_create(): 14 | data = emr_service.create_cluster() 15 | return __build_response(request, data) 16 | 17 | @app.route('/v1/emr/clusters/destroy', methods=['POST']) 18 | 
def route_clusters_destroy(): 19 | data = emr_service.destroy_cluster() 20 | return __build_response(request, data) 21 | 22 | @app.route('/v1/emr/clusters/info') 23 | def route_clusters_info(): 24 | data = emr_service.info_cluster() 25 | return __build_response(request, data) 26 | 27 | def __build_response(request, data = {}, debug = True): 28 | """ 29 | Build Response 30 | """ 31 | 32 | if debug: 33 | data_request = { 34 | 'url': request.url, 35 | 'method': request.method, 36 | 'headers': dict(request.headers), 37 | 'params': request.args 38 | } 39 | data_response = { 40 | #'params': request.params, 41 | #'body': request.body, 42 | 'data': data 43 | } 44 | return jsonify({ 45 | 'timestamp': datetime.utcnow().isoformat(), 46 | 'request': data_request, 47 | 'response': data_response 48 | }) 49 | else: 50 | return jsonify(data) 51 | -------------------------------------------------------------------------------- /hadoop/file/oozie/config/oozie-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | oozie.service.HadoopAccessorService.hadoop.configurations 7 | *=/usr/local/hadoop/etc/hadoop 8 | 9 | 10 | 11 | 45 | 46 | -------------------------------------------------------------------------------- /aws/emr/application/api/example_api.py: -------------------------------------------------------------------------------- 1 | from application import app 2 | from application.service.example_service import ExampleService 3 | 4 | from flask import render_template, request, abort, redirect, url_for 5 | 6 | example_service = ExampleService() 7 | 8 | # http://127.0.0.1:5000/static/example.txt 9 | 10 | # http://127.0.0.1:5000 11 | @app.route('/') 12 | def index(): 13 | app.logger.debug('A value for debugging') 14 | app.logger.warning('A warning occurred (%d apples)', 42) 15 | app.logger.error('An error occurred') 16 | return 'Hello, World!' 
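# usage sketch (illustration only, not part of the original module): assuming the app
# is served locally on port 5000 (the exact entrypoint, e.g. main.py or dev.sh, is not
# shown here), the routes in this module behave roughly as follows with httpie:
#
#   http :5000/                  ->  Hello, World!
#   http ':5000/query?key=aaa'   ->  METHOD aaa
#   http :5000/hello/flask       ->  hello.html rendered with name=flask
#   http :5000/error             ->  401 raised via abort()
#   http :5000/v1/tasks          ->  JSON payload built by ExampleService.get_tasks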
17 | 18 | # default is GET only 19 | # http://127.0.0.1:5000/query?key=aaa 20 | @app.route('/query', methods=['GET', 'POST']) 21 | def query_param(): 22 | return 'METHOD %s' % request.args.get('key', '') 23 | 24 | # http://127.0.0.1:5000/path/TODO/hello 25 | @app.route('/path//hello') 26 | @app.route('/path//hello/') 27 | def path_param(param): 28 | return 'param %s' % param 29 | 30 | # http://127.0.0.1:5000/hello/name 31 | @app.route('/hello/') 32 | @app.route('/hello/') 33 | def hello(name=None): 34 | return render_template('hello.html', name=name) 35 | 36 | # http://127.0.0.1:5000/redirect 37 | @app.route('/redirect') 38 | def my_redirect(): 39 | return redirect(url_for('error')) 40 | 41 | # http://127.0.0.1:5000/error 42 | @app.route('/error') 43 | def error(): 44 | abort(401) 45 | 46 | # http://127.0.0.1:5000/xxx 47 | @app.errorhandler(404) 48 | def page_not_found(error): 49 | return render_template('page_not_found.html'), 404 50 | 51 | @app.teardown_appcontext 52 | def teardown(error): 53 | app.logger.debug('after each request') 54 | 55 | # http://127.0.0.1:5000/v1/tasks 56 | @app.route('/v1/tasks', methods=['GET']) 57 | def get_tasks(): 58 | return example_service.get_tasks(request) 59 | -------------------------------------------------------------------------------- /hadoop/file/hadoop/config/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | yarn.nodemanager.aux-services 6 | mapreduce_shuffle 7 | 8 | 9 | 10 | 11 | yarn.resourcemanager.scheduler.class 12 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler 13 | 14 | 15 | yarn.scheduler.fair.preemption 16 | true 17 | 18 | 19 | 20 | 21 | yarn.log-aggregation-enable 22 | true 23 | 24 | 25 | yarn.nodemanager.log-dirs 26 | /vol/hadoop/log/yarn 27 | 28 | 29 | 30 | yarn.nodemanager.remote-app-log-dir 31 | /yarn/app 32 | 33 | 34 | yarn.nodemanager.remote-app-log-dir-suffix 35 | logs 36 | 37 | 38 | 39 | 40 | yarn.log.server.url 41 | http://history.local:19888/jobhistory/logs 42 | 43 | 44 | 45 | yarn.resourcemanager.hostname 46 | resource-manager.local 47 | 48 | 49 | 50 | yarn.web-proxy.address 51 | web-proxy.local:8100 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /docs-todo/_spark.md: -------------------------------------------------------------------------------- 1 | # Spark 2 | 3 | Spark wasn't made with Online Transaction Processing (OLTP) applications in mind (fast, numerous, atomic transactions). 4 | It's better suited for Online Analytical Processing (OLAP): batch jobs and data mining. 5 | 6 | MapReduce job results need to be stored in HDFS before they can be used by another job. 7 | For this reason, MapReduce is inherently bad with iterative algorithms. 8 | Furthermore, many kinds of problems don’t easily fit MapReduce’s two-step paradigm. 9 | 10 | There are two types of RDD operations: transformations and actions. 11 | Transformations (for example, filter or map) are operations that produce a new RDD by performing some useful data manipulation on another RDD. 12 | Actions (for example, count or foreach) trigger a computation in order to return the result to the calling program or to perform some actions on an RDD's elements. 13 | 14 | It's important to understand that transformations are evaluated lazily, meaning computation doesn't take place until you invoke an action. 15 | 16 | Data partitioning is Spark’s mechanism for dividing data between multiple nodes in a cluster. 
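To make the distinction between lazy transformations, actions and partitions concrete, here is a minimal `spark-shell` sketch (illustration only; it assumes a local Spark installation and is independent of the cluster setup elsewhere in this repo):

```bash
# run everything on one machine and feed a short Scala snippet to the REPL
spark-shell --master 'local[*]' <<'EOF'
val nums    = sc.parallelize(1 to 1000, 4)   // RDD split into 4 partitions
val evens   = nums.filter(_ % 2 == 0)        // transformation: nothing runs yet
val squares = evens.map(n => n * n)          // transformation: still lazy
println(squares.count())                     // action: triggers the computation
println(squares.getNumPartitions)            // 4
val fewer   = squares.repartition(2)         // moving data between partitions -> shuffle
println(fewer.getNumPartitions)              // 2
EOF
```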
17 | 18 | Physical movement of data between partitions is called shuffling. 19 | It occurs when data from multiple partitions needs to be combined in order to build partitions for a new RDD. 20 | When grouping elements by key, for example, Spark needs to examine all of the RDD's partitions, find elements with the same key, and then physically group them, thus forming new partitions. 21 | 22 | The spark.shuffle.consolidateFiles parameter specifies whether to consolidate intermediate files created during a shuffle. 23 | For performance reasons, we recommend that you change this to true (the default value is false ) if you’re using an ext4 or XFS filesystem. 24 | 25 | Coalesce is used for either reducing or increasing the number of partitions and force shuffling 26 | -------------------------------------------------------------------------------- /ansible/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | BOX_IMAGE = "ubuntu/bionic64" 5 | NODE_COUNT = 3 6 | PATH_SHARE = ".share" 7 | 8 | Vagrant.configure("2") do |config| 9 | 10 | config.vm.box = BOX_IMAGE 11 | 12 | config.vm.provider "virtualbox" do |vb| 13 | vb.memory = 2048 14 | vb.cpus = 2 15 | end 16 | 17 | config.vm.define "ansible" do |ansible| 18 | ansible.vm.hostname = "ansible" 19 | ansible.vm.network "private_network", ip: "192.168.100.10" 20 | ansible.vm.synced_folder ENV['HOME'], "/local" 21 | ansible.vm.provision "file", source: PATH_SHARE + "/ssh/ansible_rsa", destination: "$HOME/.ssh/id_rsa" 22 | ansible.vm.provision "shell", path: "setup_ansible.sh" 23 | ansible.vm.synced_folder "data/", "/ansible" 24 | ansible.vm.provision "shell", inline: <<-SHELL 25 | # default syncronized inventory 26 | ln -sf /ansible/hosts /etc/ansible/hosts 27 | SHELL 28 | end 29 | 30 | (1..NODE_COUNT).each do |i| 31 | config.vm.define "node-#{i}" do |node| 32 | node.vm.hostname = "ip-192-168-100-#{i + 10}" 33 | node.vm.network :private_network, ip: "192.168.100.#{i + 10}" 34 | node.vm.synced_folder PATH_SHARE + "/node-#{i}", "/data" 35 | end 36 | end 37 | 38 | # give ssh access to each machine 39 | config.vm.provision "file", source: PATH_SHARE + "/ssh/ansible_rsa.pub", destination: "/tmp/ansible_rsa.pub" 40 | config.vm.provision "shell", inline: <<-SHELL 41 | # append ansible key to avoid lose vagrant key with copy 42 | cat /tmp/ansible_rsa.pub >> .ssh/authorized_keys 43 | rm /tmp/ansible_rsa.pub 44 | SHELL 45 | 46 | config.vm.provision "shell", inline: <<-SHELL 47 | # update 48 | apt-get update 49 | # required python2 missing on ubuntu-18 50 | apt-get install -y python2.7 python-pip 51 | # dns 52 | apt-get install -y avahi-daemon libnss-mdns 53 | SHELL 54 | end 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DevOps 2 | 3 | [![github-pages](https://github.com/niqdev/devops/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/niqdev/devops/actions/workflows/gh-pages.yml) 4 | 5 | A collection of notes, resources, documentation and POCs mainly related to distributed systems for local development, learning purposes and quick prototyping. 
6 | 7 | * [Linux](https://niqdev.github.io/devops/linux) 8 | * [Docker](https://niqdev.github.io/devops/docker) 9 | * [Ansible](https://niqdev.github.io/devops/ansible) 10 | * [Cassandra](https://niqdev.github.io/devops/cassandra) 11 | * [ZooKeeper](https://niqdev.github.io/devops/zookeeper) 12 | * [Kafka](https://niqdev.github.io/devops/kafka) 13 | * [Hadoop](https://niqdev.github.io/devops/hadoop) 14 | * [HDFS and MapReduce](https://niqdev.github.io/devops/hadoop/#hdfs-and-mapreduce) 15 | * [Spark](https://niqdev.github.io/devops/hadoop/#spark) 16 | * [Zeppelin](https://niqdev.github.io/devops/hadoop/#zeppelin) 17 | * [Oozie](https://niqdev.github.io/devops/hadoop/#oozie) 18 | * [Cloud](https://niqdev.github.io/devops/cloud) 19 | * [Kubernetes](https://niqdev.github.io/devops/kubernetes) 20 | * [System Design](https://niqdev.github.io/devops/system-design) 21 | * [Operating System](https://niqdev.github.io/devops/operating-system) 22 | * [Programming](https://niqdev.github.io/devops/programming) 23 | * [Other Resources](https://niqdev.github.io/devops/other-resources) 24 | * [Toolbox](https://niqdev.github.io/devops/toolbox) 25 | 26 | ## Development 27 | 28 | Ubuntu 29 | 30 | ```bash 31 | # install pip3 32 | sudo apt install -y python3-pip 33 | 34 | # install virtualenv globally 35 | sudo pip3 install virtualenv 36 | 37 | # create virtualenv 38 | virtualenv -p $(which python3) venv 39 | 40 | # how-to activate virtualenv 41 | source venv/bin/activate 42 | 43 | # verify virtualenv 44 | which python 45 | python --version 46 | 47 | # how-to deactivate virtualenv 48 | deactivate 49 | 50 | # install new package 51 | pip install mkdocs 52 | 53 | # update requirements 54 | pip freeze > requirements.txt 55 | 56 | # run locally 57 | # http://localhost:8000 58 | mkdocs serve 59 | ``` 60 | -------------------------------------------------------------------------------- /hadoop/script/setup_zeppelin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | ZEPPELIN_VERSION="0.7.3" 18 | ZEPPELIN_NAME="zeppelin-$ZEPPELIN_VERSION-bin-all" 19 | 20 | ############################## 21 | 22 | function download_dist { 23 | local ZEPPELIN_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/zeppelin/zeppelin-$ZEPPELIN_VERSION/$ZEPPELIN_NAME.tgz" 24 | echo "[*] download dist" 25 | wget -q -P $DATA_PATH $ZEPPELIN_MIRROR_DOWNLOAD 26 | } 27 | 28 | function setup_dist { 29 | local ZEPPELIN_DIST_PATH="$DATA_PATH/$ZEPPELIN_NAME*" 30 | echo "[*] setup dist" 31 | 32 | if [ ! 
-e $ZEPPELIN_DIST_PATH ]; then 33 | download_dist 34 | fi 35 | 36 | tar -xf $ZEPPELIN_DIST_PATH -C /opt 37 | ln -s /opt/$ZEPPELIN_NAME /usr/local/zeppelin 38 | chown -R $USER_NAME:$USER_NAME /opt/$ZEPPELIN_NAME 39 | } 40 | 41 | function setup_config { 42 | local DATA_PATH_GUEST="/vol/zeppelin" 43 | local ZEPPELIN_BASE_PATH="/usr/local/zeppelin" 44 | local CONFIG_PATH="$ZEPPELIN_BASE_PATH/conf" 45 | local FILES=( "zeppelin-env.sh" ) 46 | 47 | echo "[*] create directories" 48 | mkdir -pv \ 49 | $DATA_PATH_GUEST/{log,notebook} 50 | 51 | for FILE in "${FILES[@]}" 52 | do 53 | echo "[*] update config: $FILE" 54 | # backup only if exists 55 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 56 | cp $FILE_PATH/zeppelin/config/$FILE $CONFIG_PATH/$FILE 57 | done 58 | 59 | echo "[*] update permissions" 60 | chown -R $USER_NAME:$USER_NAME \ 61 | $ZEPPELIN_BASE_PATH/ 62 | 63 | echo "[*] update env" 64 | cp $FILE_PATH/zeppelin/profile-zeppelin.sh /etc/profile.d/profile-zeppelin.sh && \ 65 | source /etc/profile.d/profile-zeppelin.sh 66 | } 67 | 68 | function main { 69 | echo "[+] setup zeppelin" 70 | setup_dist 71 | setup_config 72 | echo "[-] setup zeppelin" 73 | } 74 | 75 | main 76 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/files/schema-registry.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Confluent Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # >>>>>>>>>> https://docs.confluent.io/current/schema-registry/docs/config.html 16 | 17 | # The address the socket server listens on. 18 | # FORMAT: 19 | # listeners = listener_name://host_name:port 20 | # EXAMPLE: 21 | # listeners = PLAINTEXT://your.host.name:9092 22 | listeners=http://0.0.0.0:8081 23 | 24 | # Zookeeper connection string for the Zookeeper cluster used by your Kafka cluster 25 | # (see zookeeper docs for details). 26 | # This is a comma separated host:port pairs, each corresponding to a zk 27 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 28 | #kafkastore.connection.url=localhost:2181 29 | 30 | # Alternatively, Schema Registry can now operate without Zookeeper, handling all coordination via 31 | # Kafka brokers. Use this setting to specify the bootstrap servers for your Kafka cluster and it 32 | # will be used both for selecting the master schema registry instance and for storing the data for 33 | # registered schemas. 34 | # (Note that you cannot mix the two modes; use this mode only on new deployments or by shutting down 35 | # all instances, switching to the new configuration, and then starting the schema registry 36 | # instances again.) 
37 | kafkastore.bootstrap.servers=PLAINTEXT://localhost:9092 38 | 39 | # The name of the topic to store schemas in 40 | kafkastore.topic=_schemas 41 | 42 | # If true, API requests that fail will include extra debugging information, including stack traces 43 | debug=true 44 | 45 | kafkastore.topic.replication.factor=1 46 | -------------------------------------------------------------------------------- /hadoop/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | BOX_IMAGE = "ubuntu/bionic64" 5 | NODE_COUNT = 3 6 | 7 | DATA_PATH_HOST = ".data" 8 | DATA_PATH_GUEST = "/vol" 9 | KEY_PATH = DATA_PATH_HOST + "/hadoop_rsa" 10 | HADOOP_ID = "1101" 11 | 12 | VAGRANT_COMMAND = ARGV[0] 13 | Vagrant.configure("2") do |config| 14 | 15 | config.vm.box = BOX_IMAGE 16 | 17 | if VAGRANT_COMMAND == "ssh" 18 | config.ssh.username = "hadoop" 19 | config.ssh.private_key_path = KEY_PATH 20 | end 21 | 22 | config.vm.define "master" do |node| 23 | node.vm.hostname = "master" 24 | 25 | node.vm.provider "virtualbox" do |vb| 26 | vb.memory = 4096 27 | vb.cpus = 4 28 | end 29 | 30 | node.vm.network "private_network", ip: "172.16.0.10" 31 | # NameNode 32 | node.vm.network "forwarded_port", guest: 50070, host: 50070 33 | # ResourceManager 34 | node.vm.network "forwarded_port", guest: 8088, host: 8088 35 | # MapReduce Job History server 36 | node.vm.network "forwarded_port", guest: 19888, host: 19888 37 | # Spark 38 | node.vm.network "forwarded_port", guest: 4040, host: 4040 39 | # Oozie 40 | node.vm.network "forwarded_port", guest: 11000, host: 11000 41 | 42 | node.vm.synced_folder ENV['HOME'], "/local" 43 | # synced_folder permission issue https://github.com/hashicorp/vagrant/issues/936 44 | node.vm.synced_folder DATA_PATH_HOST + "/master", DATA_PATH_GUEST, 45 | mount_options: ["uid=" + HADOOP_ID, "gid=" + HADOOP_ID] 46 | end 47 | 48 | (1..NODE_COUNT).each do |i| 49 | config.vm.define "node-#{i}" do |node| 50 | node.vm.hostname = "node-#{i}" 51 | 52 | node.vm.provider "virtualbox" do |vb| 53 | vb.memory = 1024 54 | vb.cpus = 1 55 | end 56 | 57 | node.vm.network :private_network, ip: "172.16.0.#{i + 100}" 58 | 59 | node.vm.synced_folder DATA_PATH_HOST + "/node-#{i}", DATA_PATH_GUEST, 60 | mount_options: ["uid=" + HADOOP_ID, "gid=" + HADOOP_ID] 61 | end 62 | end 63 | 64 | config.vm.provision "shell", path: "./script/setup_ubuntu.sh" 65 | config.vm.provision "shell", path: "./script/setup_hadoop.sh" 66 | config.vm.provision "shell", path: "./script/setup_spark.sh" 67 | config.vm.provision "shell", run: "always", inline: <<-SHELL 68 | su --login hadoop /vagrant/script/bootstrap.sh 69 | SHELL 70 | end 71 | -------------------------------------------------------------------------------- /ansible/data/roles/schema-registry/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # https://docs.confluent.io/current/installation/installing_cp/deb-ubuntu.html 4 | # https://docs.confluent.io/current/schema-registry/docs/index.html 5 | 6 | - name: create group 7 | become: yes 8 | group: 9 | name: "{{ schema.registry.group }}" 10 | 11 | - name: create user 12 | become: yes 13 | user: 14 | name: "{{ schema.registry.user }}" 15 | comment: "Schema Registry User" 16 | system: yes 17 | group: "{{ schema.registry.group }}" 18 | 19 | - name: create and update directories 20 | become: yes 21 | file: 22 | path: "{{ item }}" 23 | state: directory 24 | owner: "{{ schema.registry.user }}" 25 
| group: "{{ schema.registry.group }}" 26 | recurse: yes 27 | with_items: 28 | - "/home/{{ schema.registry.user }}" 29 | - "/home/{{ schema.registry.user }}/logs" 30 | - /var/log/confluent/schema-registry 31 | 32 | - name: install confluent public key 33 | become: yes 34 | apt_key: 35 | url: https://packages.confluent.io/deb/5.0/archive.key 36 | state: present 37 | 38 | - name: add confluent repository 39 | become: yes 40 | apt_repository: 41 | repo: 'deb [arch=amd64] https://packages.confluent.io/deb/5.0 stable main' 42 | state: present 43 | 44 | - name: install confluent schema registry 45 | become: yes 46 | apt: 47 | name: confluent-schema-registry 48 | update_cache: yes 49 | 50 | - name: symlink /var/log/confluent/schema-registry to /home/{{ schema.registry.user }}/logs 51 | become: yes 52 | #become_user: "{{ schema.registry.user }}" 53 | file: 54 | src: /var/log/confluent/schema-registry 55 | dest: /home/{{ schema.registry.user }}/logs 56 | state: link 57 | force: yes 58 | owner: "{{ schema.registry.user }}" 59 | group: "{{ schema.registry.group }}" 60 | 61 | - name: copy configs 62 | become: yes 63 | copy: 64 | src: "{{ item }}" 65 | dest: "/etc/schema-registry/{{ item }}" 66 | owner: "{{ schema.registry.user }}" 67 | group: "{{ schema.registry.group }}" 68 | mode: u=rwx,g=r,o=r 69 | with_items: 70 | - log4j.properties 71 | - schema-registry.properties 72 | # notify: 73 | # - restart schema-registry 74 | 75 | - name: start confluent schema registry 76 | become: yes 77 | #become_user: "{{ schema.registry.user }}" 78 | systemd: 79 | state: started 80 | name: confluent-schema-registry 81 | -------------------------------------------------------------------------------- /hadoop/script/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | USER_NAME="hadoop" 14 | 15 | PARAM_SERVICE_NAME=${1:-"all"} 16 | 17 | ############################## 18 | 19 | function start_hadoop { 20 | local HOSTNAME=$(hostname) 21 | echo "[*] start hadoop on [$HOSTNAME]" 22 | hadoop version 23 | 24 | case $HOSTNAME in 25 | "master") 26 | hadoop-daemon.sh --script hdfs start namenode 27 | hadoop-daemon.sh --script hdfs start secondarynamenode 28 | yarn-daemon.sh start resourcemanager 29 | yarn-daemon.sh start proxyserver 30 | mr-jobhistory-daemon.sh start historyserver 31 | ;; 32 | *) 33 | hadoop-daemons.sh --script hdfs start datanode 34 | yarn-daemons.sh start nodemanager 35 | ;; 36 | esac 37 | 38 | jps 39 | } 40 | 41 | function start_spark { 42 | echo "[*] start spark" 43 | spark-shell --version 44 | 45 | # set "spark.eventLog.enabled" to "true" 46 | # History Server expects to find a file named APPLICATION_COMPLETE 47 | # in the applicationSs directory (/tmp/spark-events/ by default) 48 | /usr/local/spark/sbin/start-history-server.sh 49 | } 50 | 51 | function start_zeppelin { 52 | # check if exists 53 | if [ -x "$(command -v zeppelin.sh)" ]; then 54 | echo "[*] start zeppelin" 55 | zeppelin-daemon.sh start 56 | fi 57 | } 58 | 59 | function start_oozie { 60 | # check if exists 61 | if [ -x "$(command -v oozie)" ]; then 62 | echo "[*] start oozie" 63 | oozied.sh start 64 | oozie admin -oozie http://oozie.local:11000/oozie -status 65 | fi 66 | } 67 | 68 | function start_all { 69 | start_hadoop 
70 | start_spark 71 | start_zeppelin 72 | start_oozie 73 | } 74 | 75 | function main { 76 | echo "[+] boostrap" 77 | local SERVICE_NAME=$(echo "${PARAM_SERVICE_NAME}" | awk '{print toupper($0)}') 78 | 79 | case $SERVICE_NAME in 80 | "ZEPPELIN") 81 | start_zeppelin 82 | ;; 83 | "OOZIE") 84 | start_oozie 85 | ;; 86 | "ALL") 87 | start_all 88 | ;; 89 | *) 90 | echo "[-] invalid parameters" 91 | ;; 92 | esac 93 | echo "[-] boostrap" 94 | } 95 | 96 | if [ $USER_NAME == "$(whoami)" ]; then 97 | main 98 | else 99 | echo "[-] execute as [$USER_NAME] user only" 100 | fi 101 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /hadoop/script/setup_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | HADOOP_VERSION="2.7.6" 18 | HADOOP_NAME="hadoop-$HADOOP_VERSION" 19 | 20 | ############################## 21 | 22 | function download_dist { 23 | local HADOOP_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/hadoop/common/$HADOOP_NAME/$HADOOP_NAME.tar.gz" 24 | echo "[*] download dist" 25 | wget -q -P $DATA_PATH $HADOOP_MIRROR_DOWNLOAD 26 | } 27 | 28 | function setup_dist { 29 | local HADOOP_DIST_PATH="$DATA_PATH/$HADOOP_NAME*" 30 | echo "[*] setup dist" 31 | 32 | if [ ! -e $HADOOP_DIST_PATH ]; then 33 | download_dist 34 | fi 35 | 36 | tar -xzf $HADOOP_DIST_PATH -C /opt 37 | ln -s /opt/$HADOOP_NAME /usr/local/hadoop 38 | chown -R $USER_NAME:$USER_NAME /opt/$HADOOP_NAME 39 | } 40 | 41 | function setup_config { 42 | local DATA_PATH_GUEST="/vol/hadoop" 43 | local HADOOP_BASE_PATH="/usr/local/hadoop" 44 | local CONFIG_PATH="$HADOOP_BASE_PATH/etc/hadoop" 45 | local FILES=( "core-site.xml" "hdfs-site.xml" "mapred-site.xml" "yarn-site.xml" "fair-scheduler.xml" "masters" "slaves" ) 46 | 47 | echo "[*] create directories" 48 | mkdir -pv \ 49 | $DATA_PATH_GUEST/{namenode,secondary,datanode} \ 50 | $DATA_PATH_GUEST/log/{hadoop,yarn,mapred} 51 | 52 | for FILE in "${FILES[@]}" 53 | do 54 | echo "[*] update config: $FILE" 55 | # backup only if exists 56 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 57 | cp $FILE_PATH/hadoop/config/$FILE $CONFIG_PATH/$FILE 58 | done 59 | 60 | echo "[*] update permissions" 61 | # important final slash to be recursive 62 | chown -R $USER_NAME:$USER_NAME \ 63 | $HADOOP_BASE_PATH/ \ 64 | $DATA_PATH_GUEST/ 65 | 66 | echo "[*] update env" 67 | cp $FILE_PATH/hadoop/profile-hadoop.sh /etc/profile.d/profile-hadoop.sh && \ 68 | source /etc/profile.d/profile-hadoop.sh 69 | } 70 | 71 | function init_hdfs { 72 | local HOSTNAME=$(hostname) 73 | echo "[*] init hdfs: $HOSTNAME" 74 | hadoop version 75 | 76 | case $HOSTNAME in 77 | "master") 78 | sudo -i -u $USER_NAME hdfs namenode -format 79 | ;; 80 | *) 81 | # nothing to do 82 | ;; 83 | esac 84 | } 85 | 86 | function main { 87 | echo "[+] setup hadoop" 88 | setup_dist 89 | setup_config 90 | init_hdfs 91 | echo "[-] setup hadoop" 92 | } 93 | 94 | main 95 | -------------------------------------------------------------------------------- /docs/other-resources.md: -------------------------------------------------------------------------------- 1 | # Other resources 2 | 3 | ## Computer Science 4 | 5 | * [CS 101: Introduction to Computing Principles](https://web.stanford.edu/class/cs101/) 6 | * [Stanford CS Education Library](http://cslibrary.stanford.edu) 7 | * [Foundations of Computer Science](http://infolab.stanford.edu/~ullman/focs.html) 8 | * [Computer Networks From Scratch](https://www.networksfromscratch.com) 9 | * [Code With Engineering Playbook](https://microsoft.github.io/code-with-engineering-playbook) 10 
| * [Which programs are faster?](https://benchmarksgame-team.pages.debian.net/benchmarksgame) 11 | * [Addison-Wesley Professional Computing Series](https://informit.com/series/professionalcomputing) 12 | 13 | ## Machine Learning 14 | 15 | * [Machine Learning](https://www.coursera.org/learn/machine-learning) (Course) 16 | * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course) (Course) 17 | * [Amazon's Machine Learning University](https://www.amazon.science/latest-news/machine-learning-course-free-online-from-amazon-machine-learning-university) (Course) 18 | * [Making Things Think](https://www.holloway.com/g/making-things-think) (Book) 19 | * [Machine Learning from Scratch](https://dafriedman97.github.io/mlbook/content/introduction.html) (Book) 20 | * [What is Natural Language Processing?](https://blog.algorithmia.com/introduction-natural-language-processing-nlp) 21 | * [Scipy Lecture Notes](http://scipy-lectures.org/index.html) 22 | * [Neural Networks](https://aegeorge42.github.io) 23 | * [An Introduction to Tensor Calculus](https://grinfeld.org/books/An-Introduction-To-Tensor-Calculus) 24 | * [Neural Network From Scratch](https://sirupsen.com/napkin/neural-net) 25 | * [The latest in Machine Learning](https://paperswithcode.com) (Papers) 26 | 27 | ## Book collections 28 | 29 | * [The Online Books Page](https://onlinebooks.library.upenn.edu) 30 | * [A collection of free books from Springer](https://hnarayanan.github.io/springer-books) 31 | * [E-Books Directory](http://www.e-booksdirectory.com) 32 | * [OpenStax](https://openstax.org/subjects) 33 | * [Mark Watson: author of 20+ books](https://markwatson.com/#books) 34 | * [LibriVox](https://librivox.org) (audiobook) 35 | * [Textbooks](https://textbooks.cs.ksu.edu/) 36 | * [Global Grey](https://www.globalgreyebooks.com/index.html) 37 | * [Pirate Library Mirror](http://pilimi.org) 38 | * [freeread.org: For the human right to read](https://freeread.org) 39 | 40 | ## Random 41 | 42 | * [suckless](http://suckless.org) 43 | * [Biohacking Lite](https://karpathy.github.io/2020/06/11/biohacking-lite) 44 | 45 | ## Hacker News 46 | 47 | * [Ask HN: Great Blogs by Programmers](https://news.ycombinator.com/item?id=30245247) 48 | * [Ask HN: Can I see your cheatsheet?](https://news.ycombinator.com/item?id=31928736) 49 | * [Ask HN: What are the major open source alternatives to Auth0?](https://news.ycombinator.com/item?id=29392517) 50 | 51 |
52 | -------------------------------------------------------------------------------- /hadoop/vagrant_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | DATA_PATH=".data" 14 | KEY_NAME="hadoop_rsa" 15 | BOX_NAME="master" 16 | 17 | ############################## 18 | 19 | # param #1: 20 | function verify_requirement { 21 | local BIN=$1 22 | echo "[*] verify requirement: $BIN" 23 | command -v $BIN >/dev/null 2>&1 || (echo "[-] error: $BIN not found" && exit 1) 24 | } 25 | 26 | # param #1: 27 | # param #2: 28 | function init_key_pair { 29 | local NAME=$1 30 | local BASE_PATH=$2 31 | local KEY_PATH="$BASE_PATH/$NAME" 32 | 33 | if [ ! -e $KEY_PATH ]; then 34 | mkdir -p $BASE_PATH 35 | ssh-keygen -t rsa -b 4096 -C $NAME -N "" -f $KEY_PATH 36 | echo "[*] new ssh key pair generated: $KEY_PATH" 37 | else 38 | echo "[*] ssh key pair found: $KEY_PATH" 39 | fi 40 | } 41 | 42 | # param #1: 43 | function start_vagrant { 44 | local NAME=$1 45 | local STATUS=$(vagrant status | grep -m 1 $NAME | awk '{ print toupper($2) }') 46 | echo -e "[*] start vagrant: name=$NAME | status=$STATUS" 47 | 48 | case $STATUS in 49 | # not created | poweroff | aborted 50 | "NOT"|"POWEROFF"|"ABORTED"|"SAVED") 51 | #vagrant up --debug &> .vagrant/debug.log 52 | vagrant up && vagrant ssh $NAME 53 | ;; 54 | # running 55 | "RUNNING") 56 | vagrant ssh $NAME 57 | ;; 58 | *) 59 | echo "[-] error: vagrant status unknown" 60 | ;; 61 | esac 62 | } 63 | 64 | explosion() { 65 | cat<<"EOT" 66 | ____ 67 | __,-~~/~ `---. 68 | _/_,---( , ) 69 | __ / < / ) \___ 70 | - ------===;;;'====------------------===;;;===----- - - 71 | \/ ~"~"~"~"~"~\~"~)~"/ 72 | (_ ( \ ( > \) 73 | \_( _ < >_>' 74 | ~ `-i' ::>|--" 75 | I;|.|.| 76 | <|i::|i|`. 77 | (` ^'"`-' ") 78 | ------------------------------------------------------------------ 79 | 80 | EOT 81 | } 82 | 83 | ############################## 84 | 85 | function init_folder { 86 | echo "[*] init folder" 87 | mkdir -pv \ 88 | ${DATA_PATH}/$BOX_NAME \ 89 | ${DATA_PATH}/node-{1,2,3} 90 | } 91 | 92 | function hadoop-start { 93 | verify_requirement vagrant 94 | verify_requirement ssh-keygen 95 | 96 | init_folder 97 | init_key_pair $KEY_NAME $DATA_PATH 98 | start_vagrant $BOX_NAME 99 | } 100 | 101 | function hadoop-destroy { 102 | read -p "Are you sure? 
[y/n]" -n 1 -r 103 | echo 104 | if [[ $REPLY =~ ^[Yy]$ ]] 105 | then 106 | vagrant destroy -f 107 | rm -frv \ 108 | .vagrant \ 109 | ${DATA_PATH}/$KEY_NAME* \ 110 | ${DATA_PATH}/$BOX_NAME \ 111 | ${DATA_PATH}/node-{1,2,3} 112 | explosion 113 | fi 114 | } 115 | -------------------------------------------------------------------------------- /docs/zookeeper.md: -------------------------------------------------------------------------------- 1 | # ZooKeeper 2 | 3 | > **ZooKeeper** is a centralized service for maintaining configuration information, naming, providing distributed synchronization, and providing group services 4 | 5 | Resources 6 | 7 | * [Documentation](https://zookeeper.apache.org) 8 | 9 | * [Curator](https://curator.apache.org) 10 | 11 | ## Setup 12 | 13 | Requirements 14 | 15 | * [Base](docker/#base-image) image 16 | 17 | Build `devops/zookeeper` image 18 | ```bash 19 | # change path 20 | cd devops/zookeeper 21 | 22 | # build image 23 | docker build -t devops/zookeeper:latest . 24 | # build image with specific version - see Dockerfile for version 3.5.x 25 | docker build -t devops/zookeeper:3.4.10 --build-arg VERSION=3.4.10 . 26 | 27 | # temporary container [host:container] 28 | docker run --rm --name zookeeper -p 12181:2181 devops/zookeeper 29 | # access container 30 | docker exec -it zookeeper bash 31 | 32 | # paths 33 | /opt/zookeeper 34 | /var/log/zookeeper 35 | /var/lib/zookeeper 36 | /var/log/supervisord.log 37 | 38 | # logs 39 | tail -F /var/log/supervisord.log 40 | # check service status 41 | supervisorctl status 42 | supervisorctl restart zookeeper 43 | ``` 44 | 45 | Example 46 | ```bash 47 | docker exec -it zookeeper bash 48 | 49 | # (option 1) check zookeeper status 50 | echo ruok | nc localhost 2181 51 | 52 | # (option 2) check zookeeper status 53 | telnet localhost 2181 54 | # expect answer imok 55 | > ruok 56 | 57 | zkCli.sh -server 127.0.0.1:2181 58 | help 59 | # list znodes 60 | ls / 61 | # create znode and associate value 62 | create /zk_test my_data 63 | # verify data 64 | get /zk_test 65 | # change value 66 | set /zk_test junk 67 | # delete znode 68 | delete /zk_test 69 | ``` 70 | 71 | ## The four-letter words 72 | 73 | | Category | Command | Description | 74 | | -------- |:-------:| ----------- | 75 | | Server status | **ruok** | Prints *imok* if the server is running and not in an error state | 76 | | | **conf** | Prints the server configuration (from zoo.cfg) | 77 | | | **envi** | Prints the server environment, including ZooKeeper version, Java version, and other system properties | 78 | | | **srvr** | Prints server statistics, including latency statistics, the number of znodes, and the server mode (standalone, leader, or follower) | 79 | | | **stat** | Prints server statistics and connected clients | 80 | | | **srst** | Resets server statistics | 81 | | | **isro** | Shows whether the server is in read-only ( ro ) mode (due to a network partition) or read/write mode (rw) | 82 | | Client connections | **dump** | Lists all the sessions and ephemeral znodes for the ensemble. 
You must connect to the leader (see srvr) for this command | 83 | | | **cons** | Lists connection statistics for all the server's clients | 84 | | | **crst** | Resets connection statistics | 85 | | Watches | **wchs** | Lists summary information for the server's watches | 86 | | | **wchc** | Lists all the server's watches by connection, may impact server performance for a large number of watches | 87 | | | **wchp** | Lists all the server’s watches by znode path, may impact server performance for a large number of watches | 88 | | Monitoring | **mntr** | Lists server statistics in Java properties format, suitable as a source for monitoring systems such as Ganglia and Nagios | 89 | 90 |
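All of these are plain strings sent over the client port, so they can be exercised the same way as the `ruok` check above (note: recent ZooKeeper releases restrict the four-letter words through `4lw.commands.whitelist` in `zoo.cfg`; if a command returns nothing, it most likely needs to be whitelisted first):

```bash
# query a running server over the client port
echo srvr | nc localhost 2181   # role (standalone/leader/follower), latency, znode count
echo stat | nc localhost 2181   # srvr output plus the list of connected clients
echo mntr | nc localhost 2181   # key/value metrics, easy to scrape for monitoring
echo cons | nc localhost 2181   # per-connection statistics

# example whitelist entry in zoo.cfg
# 4lw.commands.whitelist=ruok,srvr,stat,mntr,cons
```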
91 | -------------------------------------------------------------------------------- /hadoop/script/setup_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | SPARK_VERSION="2.2.1" 18 | HADOOP_VERSION="2.7" 19 | SPARK_NAME="spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" 20 | 21 | ############################## 22 | 23 | function download_dist { 24 | local SPARK_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK_NAME.tgz" 25 | echo "[*] download dist" 26 | wget -q -P $DATA_PATH $SPARK_MIRROR_DOWNLOAD 27 | } 28 | 29 | function setup_dist { 30 | local SPARK_DIST_PATH="$DATA_PATH/$SPARK_NAME*" 31 | echo "[*] setup dist" 32 | 33 | if [ ! -e $SPARK_DIST_PATH ]; then 34 | download_dist 35 | fi 36 | 37 | tar -xf $SPARK_DIST_PATH -C /opt 38 | ln -s /opt/$SPARK_NAME /usr/local/spark 39 | chown -R $USER_NAME:$USER_NAME /opt/$SPARK_NAME 40 | } 41 | 42 | function setup_config { 43 | local DATA_PATH_GUEST="/vol/spark" 44 | local SPARK_BASE_PATH="/usr/local/spark" 45 | local CONFIG_PATH="$SPARK_BASE_PATH/conf" 46 | local HISTORY_PATH="/tmp/spark-events" 47 | local FILES=( "spark-env.sh" "log4j.properties" ) 48 | 49 | echo "[*] create directories" 50 | mkdir -pv \ 51 | $DATA_PATH_GUEST/log \ 52 | $HISTORY_PATH 53 | 54 | for FILE in "${FILES[@]}" 55 | do 56 | echo "[*] update config: $FILE" 57 | # backup only if exists 58 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 59 | cp $FILE_PATH/spark/config/$FILE $CONFIG_PATH/$FILE 60 | done 61 | 62 | echo "[*] update permissions" 63 | chown -R $USER_NAME:$USER_NAME \ 64 | $SPARK_BASE_PATH/ \ 65 | $HISTORY_PATH 66 | 67 | echo "[*] update env" 68 | cp $FILE_PATH/spark/profile-spark.sh /etc/profile.d/profile-spark.sh && \ 69 | source /etc/profile.d/profile-spark.sh 70 | 71 | # TODO config spark on yarn as default 72 | # verify jars/archive path, ports between nodes and memory issues 73 | # https://www.linode.com/docs/databases/hadoop/install-configure-run-spark-on-top-of-hadoop-yarn-cluster/ 74 | 75 | # spark-shell --master yarn --deploy-mode client 76 | # ERROR SparkContext: Error initializing SparkContext 77 | # org.apache.spark.SparkException: Yarn application has already ended! It might have been killed or unable to launch application master. 
78 | 79 | # @see spark-defaults.conf 80 | 81 | # hadoop fs -ls -h -R / 82 | # hdfs dfs -mkdir -p /user/spark/{log,share/lib} 83 | # hadoop fs -put /usr/local/spark/jars/*.jar /user/spark/share/lib/ 84 | 85 | # yarn-site.xml 86 | # 87 | # 88 | # yarn.resourcemanager.address 89 | # resource-manager.local:8032 90 | # 91 | 92 | # zip -j /vol/spark/log/spark-archive.zip /usr/local/spark/jars/*.jar 93 | # hadoop fs -put /vol/spark/log/spark-archive.zip /user/spark/share/spark-archive.zip 94 | } 95 | 96 | function main { 97 | echo "[+] setup spark" 98 | setup_dist 99 | setup_config 100 | echo "[-] setup spark" 101 | } 102 | 103 | main 104 | -------------------------------------------------------------------------------- /hadoop/script/setup_ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | KEY_NAME="hadoop_rsa" 16 | USER_NAME="hadoop" 17 | USER_ID=1101 18 | HOME_PATH="/home/$USER_NAME" 19 | 20 | ############################## 21 | 22 | function apt_update { 23 | echo "[*] apt update" 24 | apt-get -qq update && apt-get -qq upgrade -y 25 | } 26 | 27 | function setup_packages { 28 | local LOG_PATH="/tmp/apt-packages.log" 29 | echo "[*] setup packages" 30 | 31 | apt-get -qq update && apt-get install -y \ 32 | tree \ 33 | zip \ 34 | unzip \ 35 | jq \ 36 | httpie \ 37 | &> $LOG_PATH && \ 38 | apt-get clean 39 | } 40 | 41 | function setup_java { 42 | local LOG_PATH="/tmp/apt-java.log" 43 | echo "[*] setup java" 44 | 45 | add-apt-repository ppa:openjdk-r/ppa -y &> $LOG_PATH 46 | 47 | apt-get -qq update && apt-get install -y \ 48 | openjdk-8-jdk \ 49 | &> $LOG_PATH && \ 50 | apt-get clean 51 | 52 | java -version 53 | 54 | # https://askubuntu.com/questions/866161/setting-path-variable-in-etc-environment-vs-profile 55 | # /usr/lib/jvm/java-8-openjdk-amd64 56 | echo "JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")" | sudo tee --append /etc/environment && \ 57 | source /etc/environment 58 | } 59 | 60 | # param #1: 61 | # param #2: 62 | function create_user { 63 | local NAME=$1 64 | local ID=$2 65 | echo "[*] create user: $NAME" 66 | 67 | groupadd --gid $ID $NAME 68 | useradd --uid $ID --gid $ID --create-home --shell /bin/bash $NAME 69 | usermod --append --groups sudo,$NAME $NAME 70 | echo "$NAME ALL=(ALL:ALL) NOPASSWD: ALL" > /etc/sudoers.d/$NAME 71 | id $NAME 72 | groups $NAME 73 | } 74 | 75 | ############################## 76 | 77 | function config_ssh { 78 | local SSH_PATH="$HOME_PATH/.ssh" 79 | echo "[*] config ssh" 80 | 81 | mkdir -p $SSH_PATH 82 | # default name to avoid -i parameter 83 | cp $DATA_PATH/$KEY_NAME $SSH_PATH/id_rsa 84 | # passphraseless 85 | cat $DATA_PATH/$KEY_NAME.pub >> $SSH_PATH/authorized_keys 86 | # avoid prompt first time 87 | cp $FILE_PATH/ssh/config $SSH_PATH/config 88 | # update permissions 89 | chmod 0600 $SSH_PATH/id_rsa $SSH_PATH/authorized_keys 90 | chown -R $USER_NAME:$USER_NAME $SSH_PATH 91 | } 92 | 93 | function config_profile { 94 | echo "[*] config profile" 95 | sed -i -r "s/alias ll='ls -alF'/alias ll='ls -alh'/" $HOME_PATH/.bashrc 96 | source $HOME_PATH/.bashrc 97 | } 98 | 99 | function config_host { 100 | echo "[*] config host" 101 | cat $FILE_PATH/hosts >> /etc/hosts 102 | 
} 103 | 104 | function setup_motd { 105 | local MOTD_PATH="/etc/update-motd.d" 106 | echo "[*] setup motd" 107 | rm -fr $MOTD_PATH/10-help-text 108 | cp $FILE_PATH/motd $MOTD_PATH/10-custom-text 109 | chmod 0755 $MOTD_PATH/10-custom-text 110 | } 111 | 112 | function main { 113 | echo "[+] setup ubuntu" 114 | #apt_update 115 | setup_packages 116 | setup_java 117 | create_user $USER_NAME $USER_ID 118 | config_ssh 119 | config_profile 120 | config_host 121 | setup_motd 122 | echo "[-] setup ubuntu" 123 | } 124 | 125 | main 126 | -------------------------------------------------------------------------------- /cassandra/cql/example_query.cql: -------------------------------------------------------------------------------- 1 | SELECT * FROM example.messages; 2 | SELECT * FROM example.counters; 3 | SELECT * FROM example.users; 4 | 5 | SELECT COUNT(*) FROM example.messages; 6 | 7 | INSERT INTO example.messages(id, body, created_at, updated_at) 8 | VALUES(uuid(), 'message1', toTimestamp(now()), toTimestamp(now())); 9 | 10 | INSERT INTO example.messages(id, body, title, created_at, updated_at) 11 | VALUES(uuid(), 'message2', 'title2', toTimestamp(now()), toTimestamp(now())); 12 | 13 | -- no timestamp allowed on primary key 14 | SELECT id, body, WRITETIME(body) FROM example.messages; 15 | 16 | INSERT INTO example.users(first_name, last_name, last_ip, any_value, enable) 17 | VALUES('firstName1', 'lastName1', '127.0.0.1', textAsBlob('{"key1": "value1", "key2": "value2"}'), true); 18 | 19 | INSERT INTO example.users(first_name, last_name, last_ip, any_value, enable) 20 | VALUES('firstName2', 'lastName2', '0.0.0.0', textAsBlob('{"key": "value"}'), false); 21 | 22 | -- value from column 23 | SELECT blobAsText(0x7b226b6579223a202276616c7565227d) FROM example.users; 24 | 25 | -- timestamp in microsecond - old timestamp are ignored 26 | UPDATE example.users USING TIMESTAMP 1434373756626000 27 | SET last_name = 'lastName2' WHERE first_name = 'firstName1'; 28 | 29 | UPDATE example.users USING TIMESTAMP 2034373756626000 30 | SET last_name = 'lastName3' WHERE first_name = 'firstName1'; 31 | 32 | -- time to live TTL - stored on a per-column level no row 33 | -- 60 seconds 34 | UPDATE example.users USING TTL 60 35 | SET last_name = 'lastName4' WHERE first_name = 'firstName2'; 36 | 37 | SELECT first_name, last_name, TTL(last_name) FROM example.users; 38 | 39 | -- insert not allowed 40 | UPDATE example.counters SET total = total + 2 41 | WHERE id = now(); 42 | 43 | SELECT * FROM example.counters; 44 | 45 | UPDATE example.users SET emails = { 'hello@example.com' } 46 | WHERE first_name = 'firstName1'; 47 | 48 | UPDATE example.users SET emails = emails + { 'world@example.com' } 49 | WHERE first_name = 'firstName1'; 50 | 51 | UPDATE example.users SET emails = emails - { 'world@example.com' } 52 | WHERE first_name = 'firstName1'; 53 | 54 | UPDATE example.users SET emails = {} 55 | WHERE first_name = 'firstName1'; 56 | 57 | UPDATE example.users SET emails = { 'hello@example.com', 'world@example.com' } 58 | WHERE first_name = 'firstName2'; 59 | 60 | UPDATE example.users SET phone_numbers = [ '1-800-999-9999' ] 61 | WHERE first_name = 'firstName1'; 62 | 63 | UPDATE example.users SET phone_numbers = phone_numbers + [ '480-111-1111' ] 64 | WHERE first_name = 'firstName1'; 65 | 66 | UPDATE example.users SET phone_numbers = [ '111-222-3333' ] + phone_numbers 67 | WHERE first_name = 'firstName1'; 68 | 69 | -- start from index 0 70 | UPDATE example.users SET phone_numbers[1] = '000-000-0000' 71 | WHERE first_name = 
'firstName1'; 72 | 73 | UPDATE example.users SET phone_numbers = phone_numbers - [ '000-000-0000' ] 74 | WHERE first_name = 'firstName1'; 75 | 76 | UPDATE example.users SET login_sessions = { now(): 13, now(): 18} 77 | WHERE first_name = 'firstName2'; 78 | 79 | -- use index 80 | SELECT * FROM example.users WHERE last_name = 'lastName3'; 81 | 82 | -- use sasi index 83 | SELECT * FROM example.users WHERE last_name LIKE 'last%'; 84 | 85 | UPDATE example.users SET addresses = addresses + 86 | {'home': { street: 'street1', city: 'city1', state: 'STATE', zip_code: 12345} } 87 | WHERE first_name = 'firstName2'; 88 | 89 | DROP INDEX example.users_last_name_idx; 90 | 91 | DELETE phone_numbers[0] FROM example.users 92 | WHERE first_name = 'firstName1'; 93 | 94 | DELETE last_ip from example.users WHERE first_name = 'firstName2'; 95 | 96 | TRUNCATE example.messages; 97 | 98 | DROP TABLE example.messages; 99 | -------------------------------------------------------------------------------- /docs/programming.md: -------------------------------------------------------------------------------- 1 | # Programming 2 | 3 | ## Courses 4 | 5 | * 6.001: Structure and Interpretation of Computer Programs MIT [ [course](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-001-structure-and-interpretation-of-computer-programs-spring-2005) | [book](https://mitpress.mit.edu/sites/default/files/sicp/index.html) ] 6 | * [CS 6120: Advanced Compilers](https://www.cs.cornell.edu/courses/cs6120/2020fa/self-guided) 7 | * [History of Programming Languages](https://felleisen.org/matthias/7480-s21/index.html) 8 | 9 | ## Books 10 | 11 | * [Compilers: Principles, Techniques, and Tools](https://suif.stanford.edu/dragonbook) 12 | * [Writing an Interpreter and Compiler in Go](https://gumroad.com/l/waiig_wacig_bundle) 13 | * [Crafting Interpreters](https://craftinginterpreters.com) (online) 14 | * [Programming Languages: Application and Interpretation](https://cs.brown.edu/courses/cs173/2012/book) (online) 15 | * [Build Your Own Lisp](https://buildyourownlisp.com) (online) 16 | * [LispE: Lisp Elémentaire](https://github.com/naver/lispe/wiki) (online) 17 | 18 | ## Interpreter / Compiler 19 | 20 | * [awesome-compilers](https://github.com/aalhour/awesome-compilers) 21 | * [Compiler Explorer](https://godbolt.org) 22 | * [Let's Build a Compiler](https://xmonader.github.io/letsbuildacompiler-pretty/about.html) 23 | * [A C version of the "Let's Build a Compiler"](https://github.com/lotabout/Let-s-build-a-compiler) 24 | * [Let's write a compiler](https://briancallahan.net/blog/20210814.html) 25 | * [An Intro to Compilers](https://web.archive.org/web/20210111064441/https://nicoleorchard.com/blog/compilers) (archive) 26 | * [Tiny C Compiler](https://bellard.org/tcc) 27 | * [rui314/chibicc: A small C compiler](https://github.com/rui314/chibicc) 28 | * [The Super Tiny Compiler!](https://github.com/jamiebuilds/the-super-tiny-compiler) 29 | * [Obfuscated Tiny C Compiler](https://bellard.org/otcc) 30 | * [Tinylisp: Lisp in 99 lines of C and how to write one yourself](https://github.com/Robert-van-Engelen/tinylisp) 31 | * [Lessons from Writing a Compiler](https://borretti.me/article/lessons-writing-compiler) 32 | 33 | ## Parser 34 | 35 | * [Parsing Text with Nom](https://blog.adamchalmers.com/nom-chars) 36 | * [How to write a tree-sitter grammar in an afternoon](https://siraben.dev/2022/03/01/tree-sitter.html) 37 | * [Writing a Simple Parser in Rust](https://adriann.github.io/rust_parser.html) 38 | 39 | ## LLVM 40 | 41 | * [How 
to learn compilers: LLVM edition](https://lowlevelbits.org/how-to-learn-compilers-llvm-edition) 42 | * [A Complete Guide to LLVM for Programming Language Creators](https://mukulrathi.com/create-your-own-programming-language/llvm-ir-cpp-api-tutorial) 43 | 44 | ## Random 45 | 46 | * [Esolang](https://esolangs.org) 47 | * [Compile code into silicon](https://www.siliconcompiler.com) 48 | * [Make A Language](https://arzg.github.io/lang) (rust) 49 | * [Designing a programming language](http://ducklang.org/designing-a-programming-language-i) 50 | * [mirdaki/theforce: The Force - A Star Wars themed programming language](https://github.com/mirdaki/theforce) 51 | * [riicchhaarrd/ocean: Programming language that compiles into a x86 ELF executable](https://github.com/riicchhaarrd/ocean) 52 | * [adam-mcdaniel/oakc: An infinitely more portable alternative to the C programming language](http://github.com/adam-mcdaniel/oakc) 53 | * [Creating the Golfcart Programming Language](https://healeycodes.com/creating-the-golfcart-programming-language) 54 | * [I wrote a linker everyone can understand!](https://briancallahan.net/blog/20210609.html) 55 | * [spencertipping/jit-tutorial: How to write a JIT compiler](https://github.com/spencertipping/jit-tutorial) 56 | * [What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html) 57 | 58 |
59 | -------------------------------------------------------------------------------- /hadoop/script/setup_oozie.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # unofficial bash strict mode 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | # run from any directory (no symlink allowed) 8 | CURRENT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P) 9 | cd ${CURRENT_PATH} 10 | 11 | ############################## 12 | 13 | FILE_PATH="/vagrant/file" 14 | DATA_PATH="/vagrant/.data" 15 | USER_NAME="hadoop" 16 | 17 | HADOOP_VERSION="2.7.5" 18 | EXTJS_NAME="ext-2.2" 19 | OOZIE_VERSION="5.0.0-beta1" 20 | OOZIE_NAME="oozie-$OOZIE_VERSION" 21 | OOZIE_BASE_PATH="/usr/local/oozie" 22 | 23 | ############################## 24 | 25 | function setup_maven { 26 | local LOG_PATH="/tmp/apt-maven.log" 27 | echo "[*] setup maven" 28 | 29 | apt-get -qq update && apt-get install -y \ 30 | maven \ 31 | &> $LOG_PATH && \ 32 | apt-get clean 33 | 34 | mvn -version 35 | 36 | # environment variables 37 | #export M2_HOME=/usr/share/maven 38 | #export PATH=${M2_HOME}/bin:${PATH} 39 | 40 | # configuration path 41 | #/etc/maven 42 | } 43 | 44 | function download_oozie_dist { 45 | local OOZIE_MIRROR_DOWNLOAD="http://www-eu.apache.org/dist/oozie/$OOZIE_VERSION/$OOZIE_NAME.tar.gz" 46 | echo "[*] download oozie dist" 47 | wget -q -P $DATA_PATH $OOZIE_MIRROR_DOWNLOAD 48 | } 49 | 50 | function download_extjs_dist { 51 | local EXTJS_MIRROR_DOWNLOAD="http://archive.cloudera.com/gplextras/misc/$EXTJS_NAME.zip" 52 | echo "[*] download extjs dist" 53 | wget -q -P $DATA_PATH $EXTJS_MIRROR_DOWNLOAD 54 | } 55 | 56 | function setup_dist { 57 | local DATA_PATH_GUEST="/vol/oozie" 58 | local OOZIE_DIST_PATH="$DATA_PATH/$OOZIE_NAME*" 59 | local EXTJS_DIST_PATH="$DATA_PATH/$EXTJS_NAME*" 60 | local CONFIG_PATH="$OOZIE_BASE_PATH/conf" 61 | local FILES=( "oozie-site.xml" "oozie-env.sh" ) 62 | echo "[*] setup dist" 63 | 64 | if [ ! -e $OOZIE_DIST_PATH ]; then 65 | download_oozie_dist 66 | fi 67 | if [ ! 
-e $EXTJS_DIST_PATH ]; then 68 | download_extjs_dist 69 | fi 70 | 71 | echo "[*] create directories" 72 | mkdir -pv \ 73 | $DATA_PATH_GUEST/{log,data} 74 | 75 | echo "[*] build sources" 76 | tar -xzf $OOZIE_DIST_PATH -C /tmp 77 | /tmp/$OOZIE_NAME/bin/mkdistro.sh \ 78 | -DskipTests \ 79 | -Puber \ 80 | -Dhadoop.version=$HADOOP_VERSION 81 | tar -xzf /tmp/$OOZIE_NAME/distro/target/$OOZIE_NAME-distro.tar.gz -C /opt 82 | ln -s /opt/$OOZIE_NAME $OOZIE_BASE_PATH 83 | 84 | echo "[*] add ExtJS external lib" 85 | mkdir -p $OOZIE_BASE_PATH/libext 86 | cp $EXTJS_DIST_PATH $OOZIE_BASE_PATH/libext 87 | 88 | echo "[*] setup examples" 89 | tar -xzf $OOZIE_BASE_PATH/oozie-examples.tar.gz -C $DATA_PATH_GUEST 90 | 91 | for FILE in "${FILES[@]}" 92 | do 93 | echo "[*] update config: $FILE" 94 | # backup only if exists 95 | [ -e $CONFIG_PATH/$FILE ] && mv $CONFIG_PATH/$FILE $CONFIG_PATH/$FILE.orig 96 | cp $FILE_PATH/oozie/config/$FILE $CONFIG_PATH/$FILE 97 | done 98 | 99 | echo "[*] update permissions" 100 | chown -R $USER_NAME:$USER_NAME \ 101 | $OOZIE_BASE_PATH/ \ 102 | $DATA_PATH_GUEST/ 103 | 104 | echo "[*] update env" 105 | cp $FILE_PATH/oozie/profile-oozie.sh /etc/profile.d/profile-oozie.sh && \ 106 | source /etc/profile.d/profile-oozie.sh 107 | } 108 | 109 | function init_oozie { 110 | echo "[*] init oozie" 111 | su --login $USER_NAME -c "$OOZIE_BASE_PATH/bin/oozie-setup.sh sharelib create -fs hdfs://namenode.local:9000" 112 | su --login $USER_NAME -c "$OOZIE_BASE_PATH/bin/ooziedb.sh create -sqlfile oozie.sql -run" 113 | } 114 | 115 | # only for development purpose 116 | function remove_oozie { 117 | echo "[*] remove oozie" 118 | hadoop fs -rm -f -R /user/$USER_NAME/examples 119 | 120 | rm -frv \ 121 | "/vol/oozie" \ 122 | "/usr/local/oozie" \ 123 | "/opt/oozie-*" \ 124 | "/tmp/oozie*" 125 | } 126 | 127 | function main { 128 | echo "[+] setup oozie" 129 | setup_maven 130 | setup_dist 131 | init_oozie 132 | echo "[-] setup oozie" 133 | } 134 | 135 | main 136 | #remove_oozie 137 | 138 | # http://www.thecloudavenue.com/2013/10/installation-and-configuration-of.html 139 | # https://www.edureka.co/blog/apache-oozie-tutorial/ 140 | -------------------------------------------------------------------------------- /docs/cloud.md: -------------------------------------------------------------------------------- 1 | # Cloud 2 | 3 | * [CNCF cloud native landscape](https://landscape.cncf.io) 4 | * [CloudSkew](https://www.cloudskew.com) - Draw cloud architecture diagrams 5 | * [Steampipe](https://steampipe.io) - `select * from cloud;` 6 | * [Infracost](https://www.infracost.io) - Cloud cost estimates for Terraform in pull requests 7 | * [Rover - Terraform Visualizer](https://github.com/im2nguyen/rover) 8 | 9 | ## AWS 10 | 11 | * [AWS diagrams & notes](https://www.awsgeek.com) 12 | * [cfn-diagram](https://github.com/mhlabs/cfn-diagram) - Visualise CloudFormation/SAM/CDK templates as diagrams 13 | * [CDK-Dia](https://github.com/pistazie/cdk-dia) - Automated diagrams for CDK infrastructure 14 | 15 | ## Kubernetes 16 | 17 | ### Resources 18 | 19 | * [Kube by Example](http://kubebyexample.com) 20 | * [Kubernetes The Hard Way](https://github.com/kelseyhightower/kubernetes-the-hard-way) 21 | * [Kubernetes Best Practices 101](https://github.com/diegolnasc/kubernetes-best-practices) 22 | * [Kubernetes Failure Stories](https://k8s.af) 23 | * [10 most common mistakes using kubernetes](https://blog.pipetail.io/posts/2020-05-04-most-common-mistakes-k8s) 24 | * [Container Training](https://container.training) 25 | * [15 Kubernetes 
Best Practices Every Developer Should Know](https://spacelift.io/blog/kubernetes-best-practices) 26 | 27 | ### Tools 28 | 29 | * [Kubetools](https://collabnix.github.io/kubetools) - A Curated List of Kubernetes Tools 30 | * [KEDA](https://keda.sh) - Kubernetes Event-driven Autoscaling 31 | * [Mizu](https://getmizu.io) - API Traffic Viewer for Kubernetes 32 | * [Sloop](https://github.com/salesforce/sloop) - Kubernetes History Visualization 33 | * [Atlas](https://greenops.io/atlas) - Effortless deployment pipelines for Kubernetes 34 | * [kube-chaos](https://github.com/Shogan/kube-chaos) 35 | * [minikube](https://minikube.sigs.k8s.io) 36 | * [K3s](https://k3s.io) - Lightweight Kubernetes 37 | * [devtron](https://github.com/devtron-labs/devtron) - Tool integration platform for Kubernetes 38 | * [arkade](https://github.com/alexellis/arkade) - Open Source Marketplace For Kubernetes 39 | * [Kubernetes YAML Generator](https://k8syaml.com) 40 | * [kind](https://kind.sigs.k8s.io) - Local Kubernetes clusters using Docker 41 | 42 | ### Cli 43 | 44 | * [Kustomize](https://kustomize.io) - Customization of Kubernetes YAML configurations 45 | * [Krew](https://github.com/kubernetes-sigs/krew) - Find and install kubectl plugins 46 | * [kubectx](https://ahmet.im/blog/kubectx) - A tool to switch between Kubernetes contexts 47 | * [Display the current kubectl context in the Bash prompt](https://pracucci.com/display-the-current-kubelet-context-in-the-bash-prompt.html) 48 | * [Kubie](https://github.com/sbstp/kubie) - A more powerful alternative to kubectx and kubens 49 | * [kube-ps1](https://github.com/jonmosco/kube-ps1) - Kubernetes prompt 50 | * [kube-prompt](https://github.com/c-bata/kube-prompt) - An interactive kubernetes client featuring auto-complete 51 | * [kubeprompt](https://github.com/jlesquembre/kubeprompt) - Isolated kubectl shells and prompt info 52 | * [kubefwd](https://kubefwd.com) - Kubernetes port forwarding for local development 53 | * [stern](https://github.com/wercker/stern) - Multi pod and container log tailing for Kubernetes (obsolete) 54 | * [kail](https://github.com/boz/kail) - kubernetes log viewer 55 | * [k9s](https://k9scli.io) - Kubernetes CLI To Manage Your Clusters In Style! 
56 | * [KDash](https://github.com/kdash-rs/kdash) - A fast and simple dashboard for Kubernetes 57 | * [Skaffold](https://skaffold.dev) - Local Kubernetes Development 58 | 59 | ### Homelab 60 | 61 | * [k8s@home](https://docs.k8s-at-home.com) 62 | * [k3sup](https://github.com/alexellis/k3sup) 63 | * [Dan Manners' Homelab](https://github.com/danmanners/homelab-kube-cluster) 64 | * [Khue's Homelab](https://github.com/khuedoan/homelab) 65 | * [Humble Project](https://github.com/locmai/humble) 66 | * [Truxnell's home k8s cluster](https://github.com/onedr0p/home-cluster) 67 | 68 | ### Raspberry Pi 69 | 70 | * [Setup Kubernetes on a Raspberry Pi Cluster easily the official way!](https://blog.hypriot.com/post/setup-kubernetes-raspberry-pi-cluster) 71 | * [Raspberry Pi Kubernetes Cluster](https://chrisshort.net/my-raspberry-pi-kubernetes-cluster) 72 | * [Building an ARM Kubernetes Cluster](https://itnext.io/building-an-arm-kubernetes-cluster-ef31032636f9) 73 | * [kube-arm](https://github.com/lahsivjar/kube-arm) 74 | 75 | ### Other 76 | 77 | * [Kubelist Podcast](https://kubelist.com/podcast) 78 | * [Kubernetes comic](https://cloud.google.com/kubernetes-engine/kubernetes-comic) 79 | -------------------------------------------------------------------------------- /dev.txt: -------------------------------------------------------------------------------- 1 | --- OLD 2 | 3 | # emr info on master node 4 | cat /mnt/var/lib/info/job-flow.json | jq 5 | 6 | * sla, percentile 7 | * add in linux: monit, nohup, screen, mc, vim keyboard shortcuts 8 | * command vs event api - use paste tense to name event 9 | 10 | * Avro **Avro** is a data serialization system 11 | * Parquet **Parquet** is a columnar storage format that can efficiently store nested data 12 | * Flume 13 | * Sqoop 14 | * Pig 15 | * Hive 16 | * Presto 17 | * Crunch 18 | * HBase 19 | * Flink 20 | * Ganglia **Ganglia** is a monitoring system for Hadoop 21 | * Zeppelin 22 | * Knox 23 | 24 | * elastic-search 25 | * kong 26 | * etcd 27 | * linux containers LXD 28 | 29 | --- 30 | 31 | # keytool 32 | https://www.digitalocean.com/community/tutorials/java-keytool-essentials-working-with-java-keystores#viewing-keystore-entries 33 | https://www.digitalocean.com/community/tutorials/openssl-essentials-working-with-ssl-certificates-private-keys-and-csrs 34 | 35 | # api 36 | https://hackernoon.com/restful-api-designing-guidelines-the-best-practices-60e1d954e7c9 37 | https://github.com/WhiteHouse/api-standards 38 | https://geemus.gitbooks.io/http-api-design/content/en/ 39 | 40 | # spark-in-action 41 | curl -O https://raw.githubusercontent.com/spark-in-action/first-edition/master/spark-in-action-box.json 42 | vagrant box add spark-in-action-box.json 43 | vagrant init manning/spark-in-action 44 | vagrant up 45 | vagrant halt 46 | vagrant destroy 47 | vagrant box remove manning/spark-in-action 48 | 49 | # hadoop 50 | https://dwbi.org/etl/bigdata/183-setup-hadoop-cluster 51 | https://www.linode.com/docs/databases/hadoop/how-to-install-and-set-up-hadoop-cluster 52 | https://www.linode.com/docs/databases/hadoop/install-configure-run-spark-on-top-of-hadoop-yarn-cluster 53 | 54 | # vagrant images 55 | https://github.com/martinprobson/vagrant-hadoop-hive-spark 56 | 57 | # back pressure 58 | https://mechanical-sympathy.blogspot.com/2012/05/apply-back-pressure-when-overloaded.html 59 | 60 | # IAC infrastructure-as-code 61 | https://blog.gruntwork.io/why-we-use-terraform-and-not-chef-puppet-ansible-saltstack-or-cloudformation-7989dad2865c 62 | 63 | # makefile 64 | 
https://diamantidis.github.io/tips/2020/07/01/list-makefile-targets 65 | `make -pRrq`, that is a very useful command to debug your makefiles, especially in a big project. The option `-p` prints the make data-base, -R and -r removes the implicit rules and variables, and -q indicates only asking make a question about current state, avoid executing anything 66 | 67 | # copy cd-dvd 68 | https://unix.stackexchange.com/questions/224277/is-it-better-to-use-cat-dd-pv-or-another-procedure-to-copy-a-cd-dvd/224314#224314 69 | 70 | # Serving up zero-knowledge proofs 71 | https://blog.trailofbits.com/2021/02/19/serving-up-zero-knowledge-proofs 72 | 73 | # windows 74 | https://arstechnica.com/gadgets/2021/01/dosbox-pure-for-retroarch-aims-to-simplify-classic-ms-dos-gaming 75 | 76 | # discord exporter 77 | https://github.com/Tyrrrz/DiscordChatExporter 78 | 79 | # build your own amazing illustrations 80 | https://iradesign.io 81 | 82 | # service mesh 83 | https://linkerd.io 84 | 85 | --- 86 | 87 | # torrent 88 | https://en.m.wikibooks.org/wiki/The_World_of_Peer-to-Peer_(P2P)/Networks_and_Protocols/BitTorrent 89 | https://github.com/arvidn/libtorrent 90 | https://github.com/johang/btfs 91 | https://blog.libtorrent.org/2020/09/bittorrent-v2 92 | # Kademlia: A Design Specification 93 | http://xlattice.sourceforge.net/components/protocol/kademlia/specs.html 94 | https://github.com/smmr-software/mabel 95 | 96 | # testing 97 | https://github.com/nakabonne/ali 98 | https://playwright.dev 99 | 100 | # tracing 101 | https://opentracing.io/specification 102 | https://blog.techlanika.com/distributed-tracing-the-why-what-and-how-ab9ca9e40081 103 | 104 | # aws dynamodb 105 | https://github.com/aws-samples/amazon-dynamodb-labs 106 | http://rh-web-bucket.s3.amazonaws.com/index.html 107 | https://aws.amazon.com/blogs/database/how-to-determine-if-amazon-dynamodb-is-appropriate-for-your-needs-and-then-plan-your-migration 108 | https://aws.amazon.com/blogs/database/amazon-dynamodb-auto-scaling-performance-and-cost-optimization-at-any-scale 109 | https://tech.nextroll.com/blog/dev/2019/02/05/dynamodb-managed-autoscaling.html 110 | 111 | # aws batch 112 | https://towardsdatascience.com/get-your-own-data-building-a-scalable-web-scraper-with-aws-654feb9fdad7 113 | 114 | # crypto/mininig 115 | https://monokh.com/posts/bitcoin-from-scratch-part-1 116 | https://github.com/smartcontracts/eth2-book 117 | https://cardano.org 118 | https://nano.org 119 | https://www.chia.net 120 | https://www.anchorage.com 121 | 122 | # lambda 123 | https://www.serverless.com 124 | https://www.openfaas.com 125 | https://dev.to/kumo/we-tested-the-best-serverless-monitoring-solutions-so-you-dont-have-to-121m 126 | https://jvns.ca/blog/2021/01/23/firecracker--start-a-vm-in-less-than-a-second 127 | https://www.talhoffman.com/2021/07/18/firecracker-internals 128 | -------------------------------------------------------------------------------- /docs/operating-system.md: -------------------------------------------------------------------------------- 1 | # Operating System 2 | 3 | ## Courses 4 | 5 | * [6.033: Computer System Engineering MIT](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-033-computer-system-engineering-spring-2018) 6 | * [6.S081: Operating System Engineering](https://pdos.csail.mit.edu/6.S081/2021/schedule.html) 7 | - [Xv6: A simple Unix-like teaching operating system](https://pdos.csail.mit.edu/6.828/2020/xv6.html) 8 | * [CS 377: Operating 
Systems](https://m.youtube.com/playlist?list=PLacuG5pysFbDQU8kKxbUh4K5c1iL5_k7k) (youtube) 9 | * [Operating System](https://m.youtube.com/playlist?list=PLBlnK6fEyqRiVhbXDGLXDk_OQAeuVcp2O) (youtube) 10 | * [CS 422/522: Design and Implementation of Operating Systems](https://flint.cs.yale.edu/cs422/index.html) 11 | * [CS 3210: Build an operating system in Rust programming language on Raspberry Pi 3](https://tc.gts3.org/cs3210/2020/spring/lab.html) 12 | * [ITSC 3181: Introduction to Computer Architecture](https://passlab.github.io/ITSC3181) 13 | * [LFD103: A Beginner's Guide to Linux Kernel Development](https://training.linuxfoundation.org/training/a-beginners-guide-to-linux-kernel-development-lfd103) 14 | 15 | ## Books 16 | 17 | * Operating System Concepts (10th) [ [book](https://www.os-book.com/OS10/index.html) | [slides](https://codex.cs.yale.edu/avi/courses/CS-423/slides/index.html) ] 18 | * [Lion's Commentary on UNIX with Source Code](https://www.bookdepository.com/Lion-s-Commentary-on-UNIX-with-Source-Code/9781573980135) 19 | * [UNIX Internals: The New Frontiers](https://www.amazon.co.uk/UNIX-Internals-Frontiers-Uresh-Vahalia/dp/013021034X) 20 | * [Linux From Scratch](https://www.linuxfromscratch.org) (online) 21 | * [The little book about OS development](https://littleosbook.github.io) 22 | * [OS01: Bootstrap yourself to write an OS from scratch](https://tuhdo.github.io/os01) (incomplete) 23 | * [Learning operating system development using Linux kernel and Raspberry Pi](https://s-matyukevich.github.io/raspberry-pi-os) (incomplete) 24 | * [Writing a "bare metal" operating system for Raspberry Pi 4](https://www.rpi4os.com) 25 | * [Writing a simple 16 bit VM in less than 125 lines of C](https://www.andreinc.net/2021/12/01/writing-a-simple-vm-in-less-than-125-lines-of-c) 26 | * [os-tutorial: How to create an OS from scratch](https://github.com/cfenollosa/os-tutorial) 27 | 28 | ## Kernel 29 | 30 | * [The Linux Kernel Archives](https://www.kernel.org/lore.html) 31 | * [OldLinux: Early Linux Kernel Analysis and Comments](http://www.oldlinux.org) 32 | * [Writing Your First Kernel Module](https://scottc130.medium.com/writing-your-first-kernel-module-98ae68edf0e) 33 | * [Biscuit: An OS kernel in a high-level language](https://pdos.csail.mit.edu/projects/biscuit.html) 34 | * [HermiTux: A binary-compatible unikernel](https://ssrg-vt.github.io/hermitux) 35 | * [The big idea around unikernels](https://changelog.com/posts/the-big-idea-around-unikernels) 36 | * [State of the art for Unikernels](https://github.com/seeker89/unikernels) 37 | * [Tiny Core Linux](http://www.tinycorelinux.net) 38 | 39 | ## Boot 40 | 41 | * [Bootloader basics](https://notes.eatonphil.com/bootloader-basics.html) 42 | * [Interactive x86 bootloader](https://blog.benjojo.co.uk/post/interactive-x86-bootloader-tutorial) 43 | * [A set of minimal dependency bootstrap binaries](https://github.com/oriansj/stage0) 44 | * [Writing an x86 bootloader in Rust that can launch vmlinux](https://vmm.dev/en/rust/krabs.md) 45 | 46 | ## Rust 47 | 48 | * [Writing an OS in Rust](https://os.phil-opp.com) 49 | * [Rust OS comparison](https://github.com/flosse/rust-os-comparison) 50 | * [Kerla: A new operating system kernel with Linux binary compatibility written in Rust](https://github.com/nuta/kerla) 51 | * [Redox: Redox is a Unix-like Operating System written in Rust](https://www.redox-os.org) 52 | * [Operating System development tutorials in Rust on the Raspberry Pi](https://github.com/rust-embedded/rust-raspberrypi-OS-tutorials) 53 | * [CrabOS: My 
hobby operating system written in Rust](https://github.com/haileys/crabos) (hobby) 54 | * [SnakeOS: Bootable x86 snake game in rust](https://github.com/trusch/snakeos) (hobby) 55 | 56 | ## Hobby 57 | 58 | * [SerenityOS](https://github.com/SerenityOS/serenity) 59 | * [Chicago95](https://github.com/grassmunk/Chicago95) 60 | * [ToaruOS](https://github.com/klange/toaruos) 61 | * [MenuetOS](https://www.menuetos.net) 62 | * [oasis](https://github.com/oasislinux/oasis) 63 | 64 | ## Alternative 65 | 66 | * [Nanos](https://nanos.org) 67 | * [Qubes OS](https://www.qubes-os.org) 68 | 69 | ## Random 70 | 71 | * [Awesome Operating System Stuff](https://github.com/jubalh/awesome-os) 72 | * [QEMU: A generic and open source machine emulator and virtualizer](https://www.qemu.org) 73 | * [Hypervisor From Scratch](https://rayanfam.com/topics/hypervisor-from-scratch-part-1) 74 | * [SCAMP CPU](https://github.com/jes/scamp-cpu) - A homebrew 16-bit CPU with a homebrew Unix-like-ish operating system 75 | * [Virtual Hackintosh](https://github.com/kholia/OSX-KVM) 76 | * [How To Write a Computer Emulator](https://fms.komkon.org/EMUL8/HOWTO.html) 77 | * [Linux x86 Program Start Up](http://dbp-consulting.com/tutorials/debugging/linuxProgramStartup.html) 78 | * [Floppinux: An Embedded Linux on a Single Floppy](https://bits.p1x.in/floppinux-an-embedded-linux-on-a-single-floppy) 79 | 80 |
81 | -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | > **Docker** is an open platform for developers and sysadmins to build, ship, and run distributed applications 4 | 5 | Resources 6 | 7 | * [Documentation](https://docs.docker.com) 8 | 9 | * [Docker in Action](https://amzn.to/2MxbJTt) (2016) by Jeff Nickoloff (Book) 10 | 11 | ## How-To 12 | 13 | Setup 14 | ```bash 15 | # install docker 16 | curl -fsSL get.docker.com -o get-docker.sh && \ 17 | chmod u+x $_ && \ 18 | ./$_ && \ 19 | sudo usermod -aG docker docker 20 | 21 | docker --version 22 | 23 | # install docker-compose 24 | sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` \ 25 | -o /usr/local/bin/docker-compose && \ 26 | sudo chmod +x /usr/local/bin/docker-compose 27 | 28 | docker-compose --version 29 | 30 | # install docker-machine (VirtualBox required) 31 | curl -L https://github.com/docker/machine/releases/download/v0.13.0/docker-machine-`uname -s`-`uname -m` >/tmp/docker-machine && \ 32 | sudo install /tmp/docker-machine /usr/local/bin/docker-machine 33 | 34 | docker-machine --version 35 | ``` 36 | 37 | Useful commands 38 | ```bash 39 | # list images 40 | docker images 41 | # list containers 42 | docker ps -a 43 | # list volumes 44 | docker volume ls 45 | 46 | # run temporary container 47 | docker run --rm --name phusion phusion/baseimage:latest 48 | # access container from another shell 49 | docker exec -it phusion bash 50 | 51 | # remove container by name 52 | docker ps -a -q -f name=CONTAINER_NAME | xargs --no-run-if-empty docker rm -f 53 | # delete dangling images 54 | docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi 55 | # delete dangling volumes 56 | docker volume ls -q -f dangling=true | xargs --no-run-if-empty docker volume rm 57 | ``` 58 | 59 | Docker Machine 60 | ```bash 61 | # create local machine 62 | docker-machine create --driver virtualbox default 63 | 64 | # list 65 | docker-machine ls 66 | docker-machine ls --filter name=default 67 | docker-machine ls --filter state=Running 68 | docker-machine ls --format "{{.Name}}: {{.DriverName}} - {{.State}}" 69 | 70 | # info 71 | docker-machine inspect default 72 | docker-machine inspect --format='{{.Driver.IPAddress}}' default 73 | docker-machine status default 74 | docker-machine ip default 75 | 76 | # management 77 | docker-machine start default 78 | docker-machine stop default 79 | docker-machine restart default 80 | docker-machine rm default 81 | 82 | # mount volume 83 | #https://docs.docker.com/machine/reference/mount 84 | 85 | # show command to connect to machine 86 | docker-machine env default 87 | # check if variables are set 88 | env | grep DOCKER 89 | 90 | # connect to machine 91 | eval "$(docker-machine env default)" 92 | docker ps -a 93 | 94 | # show command to disconnect from machine 95 | docker-machine env -u 96 | # unset all 97 | eval $(docker-machine env -u) 98 | 99 | # access 100 | docker-machine ssh default 101 | # execute command and exit 102 | docker-machine ssh default uptime 103 | # copy files from host to guest 104 | docker-machine scp -r /FROM default:/TO 105 | 106 | # start nginx on default machine 107 | docker run -d -p 8000:80 nginx 108 | # verify from host 109 | curl $(docker-machine ip default):8000 110 | # forward to port 8080 111 | docker-machine ssh default -L 8080:localhost:8000 112 | # verify tunnel from host 113 
| curl localhost:8080 114 | 115 | # disable error crash reporting 116 | mkdir -p ~/.docker/machine && touch ~/.docker/machine/no-error-report 117 | ``` 118 | 119 | ## Base image 120 | 121 | * [Supervisor](http://supervisord.org) 122 | 123 | Build `devops/base` image 124 | ```bash 125 | # change path 126 | cd devops/base 127 | 128 | # build image 129 | docker build -t devops/base . 130 | 131 | # temporary container 132 | docker run --rm --name devops-base devops/base 133 | # access container 134 | docker exec -it devops-base bash 135 | 136 | # configurations 137 | /etc/supervisor/conf.d 138 | 139 | # supervisor actions 140 | supervisorctl status 141 | supervisorctl start SERVICE_NAME 142 | supervisorctl stop SERVICE_NAME 143 | ``` 144 | 145 | ## Docker Hub 146 | 147 | * [niqdev/phusion-base](https://hub.docker.com/r/niqdev/phusion-base) 148 | * [niqdev/zookeeper](https://hub.docker.com/r/niqdev/zookeeper) 149 | * [niqdev/kafka](https://hub.docker.com/r/niqdev/kafka) 150 | 151 | ```bash 152 | docker login 153 | 154 | # phusion-base 155 | # https://github.com/phusion/baseimage-docker 156 | docker build -t devops/base:latest ./base 157 | docker tag devops/base niqdev/phusion-base:latest-amd64 158 | docker tag devops/base niqdev/phusion-base:latest 159 | docker push niqdev/phusion-base:latest-amd64 160 | docker push niqdev/phusion-base:latest 161 | 162 | # zookeeper 163 | docker build -t devops/zookeeper:latest ./zookeeper 164 | docker tag devops/zookeeper niqdev/zookeeper:3.5.5 165 | docker tag devops/zookeeper niqdev/zookeeper 166 | docker push niqdev/zookeeper:3.5.5 167 | docker push niqdev/zookeeper:latest 168 | 169 | # kafka 170 | docker build -t devops/kafka:latest ./kafka 171 | docker tag devops/kafka niqdev/kafka:2.3.0 172 | docker tag devops/kafka niqdev/kafka 173 | docker push niqdev/kafka:2.3.0 174 | docker push niqdev/kafka:latest 175 | 176 | docker-compose -f kafka/docker-compose-hub.yml up 177 | ``` 178 | 179 |
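A quick smoke test of a published image; a minimal sketch that assumes the `niqdev/zookeeper` image exposes the same client port used in the ZooKeeper guide, and the container name `zookeeper-hub` is only an example.
```bash
# pull and run the published image (same name and port mapping used elsewhere in these docs)
docker pull niqdev/zookeeper:latest
docker run --rm -d --name zookeeper-hub -p 12181:2181 niqdev/zookeeper:latest

# the service should answer on the mapped client port, if four-letter words are enabled
echo ruok | nc localhost 12181

# inspect what was published
docker image inspect --format '{{.Id}} {{.Created}}' niqdev/zookeeper:latest
docker history niqdev/zookeeper:latest

# cleanup
docker rm -f zookeeper-hub
```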
180 | -------------------------------------------------------------------------------- /docs/ansible.md: -------------------------------------------------------------------------------- 1 | # Ansible 2 | 3 | > **Ansible** is an open source automation platform that can help with config management, deployment and task automation 4 | 5 | Resources 6 | 7 | * [Documentation](http://docs.ansible.com/ansible/latest/index.html) 8 | 9 | * [Ansible - Up and Running](https://amzn.to/2IDtDSd) (2017) by Lorin Hochstein and Rene Moser (Book) 10 | 11 | * [Tutorial](https://serversforhackers.com/c/an-ansible2-tutorial) 12 | 13 | * [Playbook example](https://gist.github.com/marktheunissen/2979474) 14 | 15 | * [Ansible Tutorial for Beginners: Ultimate Playbook & Examples](https://spacelift.io/blog/ansible-tutorial) 16 | 17 | The following guide explains how to provision Ansible locally and play with it. Checkout the [Vagrantfile](https://github.com/niqdev/devops/blob/master/ansible/Vagrantfile) and the Vagrant [guide](toolbox/#vagrant) for more details. 18 | 19 | ### Setup 20 | 21 | Requirements 22 | 23 | * [Vagrant 2](https://www.vagrantup.com) 24 | * [VirtualBox 5](https://www.virtualbox.org) 25 | 26 | Directory structure 27 | ```bash 28 | tree -a ansible/ 29 | ansible/ 30 | ├── .share 31 | │   ├── node-1 32 | │   ├── node-2 33 | │   ├── node-3 34 | │   └── ssh 35 | │   ├── ansible_rsa 36 | │   └── ansible_rsa.pub 37 | ├── Vagrantfile 38 | ├── data 39 | │   ├── group_vars 40 | │   ├── host_vars 41 | │   ├── hosts 42 | │   ├── roles 43 | │   │   ├── common 44 | │   │   │   ├── defaults 45 | │   │   │   ├── files 46 | │   │   │   ├── handlers 47 | │   │   │   ├── meta 48 | │   │   │   ├── tasks 49 | │   │   │   │   ├── main.yml 50 | │   │   │   │   ├── motd.yml 51 | │   │   │   │   ├── oracle-jdk.yml 52 | │   │   │   │   └── package.yml 53 | │   │   │   ├── templates 54 | │   │   │   │   └── motd 55 | │   │   │   └── vars 56 | │   │   │   └── main.yml 57 | │   │   └── docker 58 | │   │   ├── meta 59 | │   │   │   └── main.yml 60 | │   │   └── tasks 61 | │   │   └── main.yml 62 | │   └── site.yml 63 | ├── destroy_ansible.sh 64 | ├── setup_ansible.sh 65 | └── setup_share.sh 66 | ``` 67 | 68 | The first time *only*, you have to setup the shared folders and generate the ssh key needed by ansible to access all nodes executing 69 | 70 | ```bash 71 | ./setup_share.sh 72 | ``` 73 | 74 | Start the boxes with 75 | ```bash 76 | vagrant up 77 | ``` 78 | *The first time it could take a while* 79 | 80 | Verify status of the boxes with 81 | ```bash 82 | vagrant status 83 | ``` 84 | 85 | Verify access to the boxes with 86 | ```bash 87 | vagrant ssh ansible 88 | vagrant ssh node-1 89 | ``` 90 | 91 | From inside the boxes you should be able to communicate with the others 92 | ```bash 93 | ping ansible.local 94 | ping ip-192-168-100-11.local 95 | ping 192.168.100.12 96 | ``` 97 | 98 | The following paths are shared with the boxes 99 | 100 | * `/vagrant` provision-tool 101 | * `/local` host $HOME 102 | * `/ansible` data *(ansible only)* 103 | * `/data` .share *(node only)* 104 | 105 | Cleanup 106 | ```bash 107 | ./destroy_ansible.sh 108 | ``` 109 | 110 | ## Ad-Hoc Commands 111 | 112 | Access the ansible box with 113 | ```bash 114 | vagrant ssh ansible 115 | ``` 116 | 117 | Below a list of examples 118 | ```bash 119 | 120 | # ping all nodes (default inventory /etc/ansible/hosts) 121 | ansible all -m ping 122 | ansible ansible -m ping 123 | ansible cluster -m ping 124 | 125 | # ping all nodes (specify inventory) 126 | ansible 
all -i "/vagrant/data/hosts" -m ping 127 | 128 | # gathering facts 129 | ansible all -m setup 130 | ansible ansible -m setup 131 | 132 | # specify host and user 133 | ansible ip-192-168-100-11.local -m ping -u vagrant 134 | 135 | # execute command 136 | ansible all -a "/bin/echo hello" 137 | ansible all -a "uptime" 138 | ansible all -a "/bin/date" 139 | # do NOT reboot vagrant through ansible (use vagrant reload) 140 | ansible cluster -a "/sbin/reboot" --become 141 | 142 | # shell module 143 | ansible all -m shell -a "pwd" 144 | # be carefull to quotes 145 | ansible all -m shell -a 'echo $HOME' 146 | 147 | # update && upgrade 148 | ansible all -m apt -a "update_cache=yes upgrade=dist" --become 149 | # restart after upgrade 150 | vagrant reload 151 | # install package 152 | ansible all -m apt -a "name=tree state=present" --become 153 | ``` 154 | 155 | ## Playbooks 156 | 157 | Access the ansible box with 158 | ```bash 159 | vagrant ssh ansible 160 | ``` 161 | 162 | Below a list of examples 163 | 164 | ```bash 165 | # test uptime on all node 166 | ansible-playbook /ansible/site.yml --tags=test --verbose 167 | 168 | # update & upgrade only on cluster nodes 169 | ansible-playbook /ansible/site.yml -t package --skip-tags=oracle-jdk --verbose 170 | 171 | # install oracle-jdk only on cluster nodes 172 | ansible-playbook /ansible/site.yml -t oracle-jdk 173 | 174 | # install all packages on cluster nodes 175 | ansible-playbook /ansible/site.yml -t package --verbose 176 | 177 | # run common task on cluster node 178 | ansible-playbook /ansible/site.yml -t common 179 | 180 | # setup docker 181 | ansible-playbook /ansible/site.yml -t docker 182 | # test docker 183 | vagrant ssh node-1 184 | sudo -i -u docker 185 | docker ps -a 186 | 187 | # custom banner 188 | ansible-playbook /ansible/site.yml -t motd 189 | 190 | # setup all infrastructure at once 191 | ansible-playbook /ansible/site.yml 192 | 193 | # dry run 194 | ansible-playbook -i /ansible/hosts /ansible/site.yml --check --diff 195 | ``` 196 | 197 |
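Before applying a playbook to every node, it can be validated and scoped first; a minimal sketch reusing `/ansible/site.yml` and the node name from the ad-hoc examples.
```bash
# validate the playbook without touching any host
ansible-playbook /ansible/site.yml --syntax-check

# preview what would run
ansible-playbook /ansible/site.yml --list-hosts
ansible-playbook /ansible/site.yml --list-tasks
ansible-playbook /ansible/site.yml --list-tags

# restrict a run to a single node
ansible-playbook /ansible/site.yml -t common --limit ip-192-168-100-11.local
```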
198 | -------------------------------------------------------------------------------- /hadoop/example/map-reduce/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /docs/toolbox.md: -------------------------------------------------------------------------------- 1 | # Toolbox 2 | 3 | ## Vagrant 4 | 5 | > **Vagrant** is a tool for building and managing virtual machine environments in a single workflow 6 | 7 | Resources 8 | 9 | * [Documentation](https://www.vagrantup.com/docs) 10 | * [VirtualBox](https://www.virtualbox.org/wiki/Downloads) 11 | 12 | Setup project creating a Vagrantfile 13 | ```bash 14 | vagrant init 15 | ``` 16 | 17 | Boot and connect to 
the default virtual machine 18 | ```bash 19 | vagrant up 20 | vagrant status 21 | vagrant ssh 22 | ``` 23 | 24 | Useful commands 25 | ```bash 26 | # shut down gracefully 27 | vagrant halt 28 | 29 | # reload (halt + up) + re-provision 30 | vagrant reload --provision 31 | 32 | # update box 33 | vagrant box update 34 | vagrant box list 35 | 36 | # delete virtual machine without prompt 37 | vagrant destroy -f 38 | ``` 39 | 40 |
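A few more commands that may be handy while iterating on the boxes in this repository; the snapshot name is only an example and snapshots depend on provider support (VirtualBox has it).
```bash
# list all Vagrant environments known on this host
vagrant global-status

# print the SSH parameters used by `vagrant ssh` (useful for scp/rsync)
vagrant ssh-config

# re-run only the provisioners on a running box
vagrant provision

# snapshot and restore a box while experimenting
vagrant snapshot save clean-state
vagrant snapshot restore clean-state
```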
41 | 42 | ## MkDocs 43 | 44 | > **MkDocs** is a static site generator 45 | 46 | Resources 47 | 48 | * [Documentation](http://www.mkdocs.org) 49 | 50 | Install 51 | ```bash 52 | pip install mkdocs 53 | sudo -H pip3 install mkdocs 54 | ``` 55 | 56 | Useful commands 57 | ```bash 58 | # setup in current directory 59 | mkdocs new . 60 | 61 | # start dev server with hot reload @ http://127.0.0.1:8000 62 | mkdocs serve 63 | 64 | # build static site 65 | mkdocs build --clean 66 | 67 | # deploy to github 68 | mkdocs gh-deploy 69 | ``` 70 | 71 |
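A couple of extra options that may be useful, assuming a recent MkDocs release.
```bash
# verify the installed version
mkdocs --version

# serve on a different address/port
mkdocs serve --dev-addr 0.0.0.0:8080

# treat warnings (e.g. broken links) as build failures
mkdocs build --clean --strict
```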
72 | 73 | ## Hugo 74 | 75 | > **Hugo** is a static site generator 76 | 77 | * [Documentation](https://gohugo.io/documentation) 78 | 79 | Useful commands 80 | ```bash 81 | # install 82 | snap install hugo 83 | # switch to extended Sass/SCSS version 84 | snap refresh hugo --channel=extended 85 | 86 | # create skeleton 87 | hugo new site docs 88 | # create skeleton in current non-empty folder 89 | hugo new site . --force 90 | 91 | # setup theme 92 | git submodule add https://github.com/alex-shpak/hugo-book themes/book 93 | echo 'theme = "book"' >> config.toml 94 | 95 | # start dev server 96 | hugo server -D 97 | ``` 98 | 99 |
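A possible next step after creating the skeleton, assuming a recent Hugo release; the post name is only an example.
```bash
# verify the installed version
hugo version

# add content using the default archetype
hugo new posts/my-first-post.md

# build the site into ./public (drafts excluded by default)
hugo
# build a minified production site
hugo --minify
```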
100 | 101 | ## SDKMAN! 102 | 103 | > **SDKMAN!** is a tool for managing parallel versions of multiple Software Development Kits on most Unix based systems 104 | 105 | Resources 106 | 107 | * [Documentation](http://sdkman.io) 108 | 109 | Setup 110 | ``` 111 | curl -s "https://get.sdkman.io" | bash 112 | source "$HOME/.sdkman/bin/sdkman-init.sh" 113 | sdk version 114 | ``` 115 | 116 | Gradle 117 | ```bash 118 | # setup 119 | sdk list gradle 120 | sdk install gradle 4.4.1 121 | gradle -version 122 | 123 | # create Gradle project 124 | mkdir -p PROJECT_NAME && cd $_ 125 | gradle init --type java-library 126 | 127 | ./gradlew clean build 128 | ``` 129 | 130 | Scala 131 | ```bash 132 | # setup sbt 133 | sdk list sbt 134 | sdk install sbt 135 | sbt sbtVersion 136 | sbt about 137 | 138 | # setup scala 139 | sdk list scala 140 | sdk install scala 2.11.8 141 | scala -version 142 | 143 | # sample project 144 | sbt new sbt/scala-seed.g8 145 | ``` 146 | 147 |
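Managing installed candidates, assuming the versions installed above.
```bash
# show the versions currently in use
sdk current

# switch version for the current shell only
sdk use gradle 4.4.1

# make a version the default
sdk default scala 2.11.8

# upgrade installed candidates and remove old versions
sdk upgrade
sdk uninstall gradle 4.4.1
```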
148 | 149 | ## Giter8 150 | 151 | > **Giter8** is a command line tool to generate files and directories from templates published on GitHub or any other git repository 152 | 153 | Resources 154 | 155 | * [Documentation](http://www.foundweekends.org/giter8) 156 | * [Templates](https://github.com/foundweekends/giter8/wiki/giter8-templates) 157 | 158 | Setup 159 | ```bash 160 | # install conscript 161 | curl https://raw.githubusercontent.com/foundweekends/conscript/master/setup.sh | sh 162 | source ~/.bashrc 163 | 164 | # install g8 165 | cs foundweekends/giter8 166 | ``` 167 | 168 | Example 169 | ```bash 170 | # interactive 171 | g8 sbt/scala-seed.g8 172 | # non-interactive 173 | g8 sbt/scala-seed.g8 --name=my-new-website 174 | ``` 175 | 176 |
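The seed template produces a plain sbt build, so it can presumably be compiled and tested right away; the directory name assumes the non-interactive example above.
```bash
# enter the generated project and run its tests
cd my-new-website
sbt test
```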
177 | 178 | ## Snap 179 | 180 | Resources 181 | 182 | * [Documentation](https://docs.snapcraft.io) 183 | 184 | Useful commands 185 | ```bash 186 | # search 187 | snap find gimp 188 | 189 | # info 190 | snap info gimp 191 | 192 | # install 193 | snap install gimp 194 | 195 | # list installed app 196 | snap list 197 | 198 | # update all packages 199 | snap refresh 200 | 201 | # remove 202 | snap remove gimp 203 | ``` 204 | 205 |
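A few more commands for day-to-day package management; the channel is only an example.
```bash
# update a single package
snap refresh gimp
# switch channel
snap refresh gimp --channel=edge

# roll back to the previously installed revision
snap revert gimp

# recent changes
snap changes
```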
206 | 207 | ## Python 208 | 209 | Resources 210 | 211 | * [pip](https://pip.pypa.io/en/stable/user_guide) 212 | * [virtualenv](https://virtualenv.pypa.io/en/stable/userguide) 213 | * [What is the difference between virtualenv | pyenv | virtualenvwrapper | venv ?](https://stackoverflow.com/questions/41573587/what-is-the-difference-between-venv-pyvenv-pyenv-virtualenv-virtualenvwrappe/41573588#41573588) 214 | 215 | Setup 216 | ```bash 217 | # search 218 | apt-get update && apt-cache search python | grep python2 219 | 220 | # setup python 221 | apt-get install -y python2.7 222 | apt-get install -y python3 223 | 224 | # install pip + setuptools 225 | curl https://bootstrap.pypa.io/get-pip.py | python2.7 - 226 | curl https://bootstrap.pypa.io/get-pip.py | python3 - 227 | apt install -y python-pip 228 | apt install -y python3-pip 229 | 230 | # upgrade pip 231 | pip install -U pip 232 | 233 | # install virtualenv globally 234 | pip install virtualenv 235 | ``` 236 | 237 | virtualenv 238 | ```bash 239 | # create virtualenv 240 | virtualenv venv 241 | virtualenv -p python3 venv 242 | virtualenv -p $(which python3) venv 243 | 244 | # activate virtualenv 245 | source venv/bin/activate 246 | 247 | # verify virtualenv 248 | which python 249 | python --version 250 | 251 | # deactivate virtualenv 252 | deactivate 253 | ``` 254 | 255 | pip 256 | ```bash 257 | # search package 258 | pip search 259 | 260 | # install new package 261 | pip install 262 | 263 | # update requirements with new packages 264 | pip freeze > requirements.txt 265 | 266 | # install all requirements 267 | pip install -r requirements.txt 268 | ``` 269 | 270 | Other 271 | ```bash 272 | # generate rc file 273 | pylint --generate-rcfile > .pylintrc 274 | 275 | # create module 276 | touch app/{__init__,main}.py 277 | ``` 278 | 279 |
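On Python 3 the standard library `venv` module can often replace the external virtualenv package; a minimal sketch.
```bash
# create and activate a virtualenv with the built-in venv module (Python 3 only)
python3 -m venv venv
source venv/bin/activate

# inspect installed packages
pip list
pip list --outdated

# remove the environment by deleting its folder
deactivate
rm -r venv
```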
280 | 281 | ## Git 282 | 283 | Resources 284 | 285 | * [git - the simple guide](https://rogerdudler.github.io/git-guide) 286 | * [git notes (1)](https://github.com/niqdev/git-notes/blob/master/git-real-1.md) 287 | * [git notes (2)](https://github.com/niqdev/git-notes/blob/master/git-real-2.md) 288 | 289 | Other 290 | 291 | * [Oh Shit, Git!?!](https://ohshitgit.com) 292 | * [Using Askgit](https://willschenk.com/articles/2020/using_askgit) 293 | * [`git filter-repo` is a versatile tool for rewriting history](https://github.com/newren/git-filter-repo) 294 | * [Merkle Tree](https://brilliant.org/wiki/merkle-tree) 295 | * [The Myers diff algorithm](https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1) 296 | 297 |
298 | 299 | ## Mercurial 300 | 301 | Resources 302 | 303 | * [A Guide to Branching in Mercurial](http://stevelosh.com/blog/2009/08/a-guide-to-branching-in-mercurial) 304 | 305 | ```bash 306 | # changes since last commit 307 | hg st 308 | 309 | # verify current branch 310 | hg branch 311 | 312 | # lists all branches 313 | hg branches 314 | 315 | # checkout default branch 316 | hg up default 317 | 318 | # pull latest changes 319 | hg pull -u 320 | 321 | # create new branch 322 | hg branch "branch-name" 323 | 324 | # track new file 325 | hg add . 326 | 327 | # track new files and untrack removed files 328 | hg addremove 329 | 330 | # commit all tracked files 331 | hg commit -m "my-comment" 332 | 333 | # commit specific files 334 | hg commit FILE_1 FILE_2 -m "my-comment" 335 | 336 | # commit and track/untrack files (i.e. addremove) 337 | hg commit -A -m "my-comment-with-addremove" 338 | 339 | # rename last unpushed commit message 340 | hg commit -m "bad-commit-message" 341 | hg commit --amend -m "good-commit-message" 342 | 343 | # discard untracked files 344 | hg purge 345 | 346 | # discard uncommitted local changes 347 | hg up -C 348 | 349 | # discard local uncommitted branch 350 | hg strip "branch-name" 351 | 352 | # push commits in all branches 353 | hg push 354 | 355 | # push commits in current branch 356 | hg push -b . 357 | 358 | # create a new branch and push commits in current branch (first time only) 359 | hg push -b . --new-branch 360 | 361 | # lists unpushed commit 362 | hg outgoing 363 | 364 | # change head to specific revision 365 | hg up -r 12345 366 | 367 | # merge default branch on current branch 368 | hg up default 369 | hg pull -u 370 | hg status 371 | hg up CURRENT-BRANCH 372 | hg merge default 373 | hg diff 374 | 375 | # remove all resolved conflicts 376 | rm **/*.orig 377 | 378 | # list stashes 379 | hg shelve --list 380 | 381 | # stash 382 | hg shelve -n "my-draft" 383 | 384 | # unstash 385 | hg unshelve "my-draft" 386 | 387 | # revert/undo last unpushed commit 388 | hg strip -r -1 --keep 389 | hg strip --keep --rev . 390 | 391 | # solve conflicts manually and then mark it as merged 392 | hg resolve -m FILE-NAME 393 | 394 | # lists commits 395 | hg log 396 | hg ls 397 | 398 | # pretty log 399 | hg history --graph --limit 10 400 | ``` 401 | 402 |
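The commands above combined into a rough feature-branch workflow; branch and commit names are only examples.
```bash
# start from an up-to-date default branch
hg up default
hg pull -u

# create a feature branch and commit
hg branch "feature-example"
hg addremove
hg commit -m "add feature"

# push the new branch (first time only)
hg push -b . --new-branch

# merge default back into the feature branch when needed
hg merge default
hg commit -m "merge default"
```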
403 | -------------------------------------------------------------------------------- /docs/cassandra.md: -------------------------------------------------------------------------------- 1 | # Cassandra 2 | 3 | > **Cassandra** is a distributed database for managing large amounts of structured data across many commodity servers, while providing highly available service and no single point of failure 4 | 5 | Resources 6 | 7 | * [Documentation](https://cassandra.apache.org) 8 | 9 | * [Cassandra: The Definitive Guide](https://amzn.to/2KvnEjY) (2016)(4th) by Eben Hewitt, Jeff Carpenter (Book) 10 | 11 | * [A Decentralized Structured Storage System](https://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf) (Paper) 12 | 13 | * [A Big Data Modeling Methodology for Apache Cassandra](http://www.cs.wayne.edu/andrey/papers/TR-BIGDATA-05-2015-CKL.pdf) (Paper) 14 | 15 | * [Facebook’s Cassandra paper](https://docs.datastax.com/en/articles/cassandra/cassandrathenandnow.html) 16 | 17 | * [Cassandra Data Modeling Best Practices](https://www.ebayinc.com/stories/blogs/tech/cassandra-data-modeling-best-practices-part-1) 18 | 19 | * [Difference between partition key, composite key and clustering key](https://stackoverflow.com/questions/24949676/difference-between-partition-key-composite-key-and-clustering-key-in-cassandra) 20 | 21 | * [Cassandra Cluster Manager](https://github.com/riptano/ccm) 22 | 23 | * [Netflix Priam](https://github.com/Netflix/Priam) 24 | 25 | * [cstar_perf](https://www.datastax.com/dev/blog/cassandra-performance-testing-with-cstar_perf) 26 | 27 | * [Amy's Cassandra 2.1 tuning guide](https://tobert.github.io/pages/als-cassandra-21-tuning-guide.html) 28 | 29 | * [Repair in Cassandra](https://www.datastax.com/dev/blog/repair-in-cassandra) 30 | 31 | 32 | 33 | Cassandra uses a tick-tock release model, even-numbered releases are feature releases, while odd-numbered releases are focused on bug fixes 34 | 35 | ## Architecture 36 | 37 | * A **rack** is a logical set of nodes in close proximity to each other 38 | 39 | * A **data center** is a logical set of racks 40 | 41 | * Cassandra uses a **gossip protocol** (called epidemic protocol) that allows each node to keep track of state information about the other nodes in the cluster implementing an algorithm called *Phi Accrual Failure Detection* instead of simple heartbeats 42 | 43 | * The job of a **snitch** is to determine relative host proximity for each node in a cluster, which is used to determine which nodes to read and write from 44 | 45 | * Cassandra represents the data managed by a cluster as a **ring**. Each node in the ring is assigned one or more ranges of data described by a **token**, which determines its position in the ring and is used to identify each partition 46 | 47 | ![cassandra-token-ring](img/cassandra-token-ring.png) 48 | 49 | * **virtual nodes** allow to break a token range and assign multiple tokens to a single physical node 50 | 51 | * A **partitioner** is a hash function for computing the token of a partition key and determines how a (wide) row or partition of data is distributed within the ring 52 | 53 | * The **replication factor** is the number of nodes in a cluster that will receive copies of the same row and the replication strategy is set independently for each keyspace 54 | 55 | * Cassandra provides tuneable **consistency** levels and must be specified on each read or write 56 | 57 | * A client may connect to any node in the cluster, named **coordinator node**, to initiate a read or write query. 
The coordinator identifies which nodes are replicas for the data and forwards the queries to them 58 | 59 | ![cassandra-query](img/cassandra-query.png) 60 | 61 | * When a write operation is performed, it's immediately written to a **commit log** to ensure that data is not lost. It is a crash-recovery mechanism only, clients never read from it 62 | 63 | * After it's written to the commit log, the value is written (already ordered) to a memory-resident data structure called the **memtable**, divided by Column Family (table) 64 | 65 | * When the number of objects stored in the memtable or in the commit log reaches a threshold, the contents of the memtable are flushed (non-blocking operation) to disk in a file called **SSTable** and a new memtable or commit log is then created/recycled 66 | 67 | * No reads or seeks of any kind are required for writing a value to Cassandra because all writes are append operations to immutable SSTables. However, periodic **compaction** operations in Cassandra are performed in order to support fast read performance: the keys are merged, columns are combined, tombstones are discarded, and a new index is created 68 | 69 | * The **key cache** stores a map of partition keys to row index entries, facilitating faster read access into SSTables stored on disk. The key cache is stored on the JVM heap 70 | 71 | * The **row cache** caches entire rows and can greatly speed up read access for frequently accessed rows, at the cost of more memory usage. The row cache is stored in off-heap memory 72 | 73 | * The **counter cache** is used to improve counter performance by reducing lock contention for the most frequently accessed counters 74 | 75 | * In a scenario in which a write request is sent to Cassandra, but a replica node where the write properly belongs is not available due to network partition, hardware failure, or some other reason, Cassandra implements a feature called **hinted handoff** to ensure general availability. The coordinator node will temporarily store the data until it detects that the node is available again 76 | 77 | *Write Path* 78 | ![cassandra-write-path](img/cassandra-write-path.png) 79 | 80 | *Read Path* 81 | ![cassandra-read-path](img/cassandra-read-path.png) 82 | 83 | * To provide *linearizable consistency* e.g. read-before-write, Cassandra supports a **lightweight transaction** or LWT. The implementation is based on *paxos* and is limited to a single partition (see the `cqlsh` sketch at the end of this page) 84 | 85 | * A **tombstone** is a deletion marker that is required to suppress older data in SSTables until compaction or garbage collection runs. Data is not immediately deleted but it's treated as an update operation 86 | 87 | * **Bloom filters** are very fast, non-deterministic algorithms for testing whether an element is a member of a set. They can return a false-positive read, but not a false-negative. When a read is performed, the filter is checked first before accessing disk: if it indicates that the element does not exist in the set, it certainly doesn't; if the filter thinks that the element is in the set, the disk is accessed to make sure 88 | 89 | * *Replica Synchronization (1)* Cassandra reads data from multiple replicas in order to achieve the requested consistency level and detects if any replicas have out of date values.
If an insufficient number of nodes have the latest value, a **read repair** is performed immediately to update the out of date replicas 90 | 91 | * *Replica Synchronization (2)* **Anti-entropy repair** is a manually initiated operation performed on nodes as part of a regular maintenance process executed with *nodetool* causing a *major compaction* during which a node exchange *Merkle trees* with neighboring nodes 92 | 93 | ## Setup 94 | 95 | Single Node Cluster 96 | ```bash 97 | # change path 98 | cd devops/cassandra 99 | 100 | # start single node 101 | docker-compose up 102 | 103 | # paths 104 | /etc/cassandra 105 | /var/lib/cassandra 106 | /var/log/cassandra 107 | 108 | # remove container and volume 109 | docker rm -fv devops-cassandra 110 | ``` 111 | 112 | Multi Node Cluster 113 | ```bash 114 | # change path 115 | cd devops/cassandra 116 | 117 | # start node 118 | docker-compose -f docker-compose-cluster.yml up 119 | 120 | # optional mounted volumes 121 | mkdir -p \ 122 | .cassandra/cassandra-seed/{data,log} \ 123 | .cassandra/cassandra-node-1/{data,log} \ 124 | .cassandra/cassandra-node-2/{data,log} 125 | tree .cassandra/ 126 | 127 | # ISSUES releated to host permissions 128 | # > Small commitlog volume detected at /var/lib/cassandra/commitlog 129 | # > There is insufficient memory for the Java Runtime Environment to continue 130 | (cassandra) /var/lib/cassandra 131 | (root) /var/log/cassandra 132 | ``` 133 | 134 | Access container 135 | ```bash 136 | # access container 137 | docker exec -it devops-cassandra bash 138 | docker exec -it devops-cassandra bash -c cqlsh 139 | docker exec -it devops-cassandra-seed bash 140 | docker exec -it devops-cassandra-node-1 bash 141 | 142 | # execute cql script from host 143 | (docker exec -i devops-cassandra bash \ 144 | -c "cat > example.cql; cqlsh -f example.cql") < cql/example_create.cql 145 | ``` 146 | 147 | ## CQL 148 | 149 | `cqlsh` script [examples](https://github.com/niqdev/devops/tree/master/cassandra/cql) 150 | 151 | ```bash 152 | # connect 153 | cqlsh localhost 9042 154 | cqlsh localhost 9042 -u cassandra -p cassandra 155 | 156 | # execute cql script 157 | cqlsh -f cql/example_create.cql 158 | 159 | # info 160 | SHOW VERSION; 161 | DESCRIBE CLUSTER; 162 | DESCRIBE KEYSPACES; 163 | DESCRIBE KEYSPACE example; 164 | DESCRIBE TABLE example.messages; 165 | 166 | # nice format 167 | EXPAND ON; 168 | # trace query 169 | TRACING ON; 170 | 171 | # bulk loading 172 | COPY example.users TO '/cql/users.csv' WITH HEADER=TRUE; 173 | COPY example.users FROM '/cql/all_users.csv' WITH DELIMITER = ';'; 174 | COPY example.users (first_name,last_name,addresses,emails,enable) FROM '/cql/column_users.csv' WITH HEADER=TRUE; 175 | 176 | # automatic paging 177 | PAGING; 178 | PAGING ON; 179 | PAGING 100; 180 | # limit 181 | SELECT * FROM example.users LIMIT 1; 182 | ``` 183 | 184 | * [Batch](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useBatch.html) 185 | * [User-Defined Type](https://docs.datastax.com/en/dse/6.0/cql/cql/cql_using/useCreateUDT.html) 186 | * [User-Defined Function](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useCreateUDF.html) 187 | * [User-Defined Aggregate Function](https://docs.datastax.com/en/cql/3.3/cql/cql_using/useCreateUDA.html) 188 | 189 | Old `cassandra-cli` deprecated and removed in Cassandra 3.0 190 | 191 | ``` 192 | USE keyspace_name; 193 | LIST table_name; 194 | GET table_name["primary_key"]; 195 | SET table_name["primary_key"]["column_name"]; 196 | ``` 197 | 198 | ## nodetool 199 | 200 | ```bash 201 | # help 202 | 
nodetool 203 | 204 | # cluster information 205 | nodetool describecluster 206 | nodetool status 207 | 208 | # node information 209 | nodetool -h xxx.xxx.xxx.xxx info 210 | nodetool -h xxx.xxx.xxx.xxx statusgossip|statusthrift|statusbinary|statushandoff 211 | nodetool gossipinfo 212 | 213 | # ring information 214 | nodetool ring 215 | nodetool describering KEYSPACE 216 | 217 | # monitor network 218 | nodetool netstats 219 | 220 | # threadpool statistics 221 | nodetool tpstats 222 | 223 | # keyspace statistics 224 | nodetool tablestats KEYSPACE 225 | 226 | # dynamic logging via JMX 227 | nodetool getlogginglevels 228 | 229 | # force a flush of data from memtables to SSTables 230 | nodetool flush 231 | 232 | # gracefully shut down 233 | nodetool drain 234 | 235 | # discards any data that is no longer owned by the node 236 | # e.g. after changing replication factor or token range 237 | nodetool cleanup 238 | 239 | # anti-entropy repair or manual repair: reconcile data exchanging Merkle trees among nodes 240 | # maintenance: incremental parallel repair on the primary token range (run on each node) 241 | nodetool repair -pr 242 | 243 | # create snapshot 244 | nodetool snapshot 245 | nodetool listsnapshots 246 | 247 | # restore snapshot (create schema or truncate table before) 248 | # 1) same cluster and configuration 249 | # copy SSTable ".db" files into the data directory and on the running node execute refresh 250 | nodetool refresh 251 | # 2) different configuration (e.g. topology, token ranges, or replication) 252 | sstableloader 253 | 254 | # stress tool 255 | cassandra-stress write n=1000000 256 | cassandra-stress read n=200000 257 | ``` 258 | 259 |
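To tie together the replication factor, tunable consistency and lightweight transactions described above, a minimal `cqlsh` sketch; the `accounts` table and its values are placeholders, only the `example` keyspace name matches the scripts above
```bash
# inside cqlsh: keyspace with an explicit replication factor
CREATE KEYSPACE IF NOT EXISTS example
  WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};

# tunable consistency: set the level used by subsequent queries in this session
CONSISTENCY QUORUM;
CONSISTENCY;

# lightweight transaction (paxos based, limited to a single partition)
CREATE TABLE IF NOT EXISTS example.accounts (username text PRIMARY KEY, email text);
INSERT INTO example.accounts (username, email) VALUES ('alice', 'alice@example.com') IF NOT EXISTS;
UPDATE example.accounts SET email = 'alice@other.com' WHERE username = 'alice' IF email = 'alice@example.com';
```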
260 | -------------------------------------------------------------------------------- /docs/system-design.md: -------------------------------------------------------------------------------- 1 | # System Design 2 | 3 | ## Books 4 | 5 | * [Designing Data-Intensive Applications](https://amzn.to/2lKJMvU) (2017) by Martin Kleppmann 6 | * [Domain-Driven Design: Tackling Complexity in the Heart of Software](https://amzn.to/2VTvGYS) (2003) by Eric Evans 7 | * [Functional and Reactive Domain Modeling](https://www.manning.com/books/functional-and-reactive-domain-modeling) (2016) by Debasish Ghosh 8 | * [Versioning in an Event Sourced System](https://leanpub.com/esversioning/read) 9 | * [Exploring CQRS and Event Sourcing](https://docs.microsoft.com/en-us/previous-versions/msp-n-p/jj554200(v%3dpandp.10)) 10 | * [Database Internals - A Deep Dive into How Distributed Data Systems Work](https://www.databass.dev) 11 | * [The Architecture of Open Source Applications](http://aosabook.org/en/index.html) (free) 12 | 13 | ## Resources 14 | 15 | * [6.824 Distributed Systems MIT](https://www.youtube.com/playlist?list=PLrw6a1wE39_tb2fErI4-WkMbsvGQk9_UB) (course) 16 | * [Distributed Systems lecture series](https://www.youtube.com/playlist?list=PLeKd45zvjcDFUEv_ohr_HdUFe97RItdiB) by Martin Kleppmann (course) 17 | * [Software Architecture Monday](https://www.youtube.com/playlist?list=PLdsOZAx8I5umhnn5LLTNJbFgwA3xbycar) (videos) 18 | * [CQRS](https://www.martinfowler.com/bliki/CQRS.html) by Martin Fowler 19 | * [Clarified CQRS](http://udidahan.com/2009/12/09/clarified-cqrs) 20 | * [1 Year of Event Sourcing and CQRS](https://hackernoon.com/1-year-of-event-sourcing-and-cqrs-fb9033ccd1c6) 21 | * [Eventually Consistent - Revisited](https://www.allthingsdistributed.com/2008/12/eventually_consistent.html) 22 | * [How do CRDTs solve distributed data consistency challenges?](https://ably.com/blog/crdts-distributed-data-consistency-challenges) 23 | * [Are CRDTs suitable for shared editing?](https://blog.kevinjahns.de/are-crdts-suitable-for-shared-editing) 24 | * [On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton_html) 25 | * [There is No Now](https://queue.acm.org/detail.cfm?id=2745385) 26 | * [Online Event Processing](https://queue.acm.org/detail.cfm?id=3321612) 27 | * [The world beyond batch: Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) 28 | * [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture) 29 | * [The Difference between SLI, SLO, and SLA](https://enqueuezero.com/the-difference-between-sli-slo-and-sla.html) 30 | * [A review of consensus protocols](https://thomasvilhena.com/2020/10/a-review-of-consensus-protocols) 31 | * [How you could have come up with Paxos yourself](https://explain.yshui.dev/distributed%20system/2020/09/20/paxos.html) 32 | * [Implementing Raft's Leader Election in Rust](https://blog.laurocaetano.com/programming/2021/01/23/raft-leader-election-rust) 33 | * [Consensus Protocol](https://www.consul.io/docs/architecture/consensus) 34 | * [Implementing Raft for Browsers with Rust and WebRTC](https://eevans.co/blog/wraft) 35 | * [HTTP Feeds](https://www.http-feeds.org) 36 | * [Autopilot Pattern Applications](http://autopilotpattern.io) 37 | * [REST Hooks](https://resthooks.org) 38 | 39 | ## Blogs 40 | 41 | * [Jepsen](https://aphyr.com/tags/Jepsen) 42 | * [The Paper Trail](https://www.the-paper-trail.org) 43 | * [High 
Scalability](http://highscalability.com) 44 | * [InfoQ: Architecture & Design Content](https://www.infoq.com/architecture-design/presentations) 45 | 46 | ## CAP 47 | 48 | * [Brewer's CAP Theorem](http://www.julianbrowne.com/article/brewers-cap-theorem) 49 | * [CAP Twelve Years Later: How the "Rules" Have Changed](https://www.infoq.com/articles/cap-twelve-years-later-how-the-rules-have-changed) 50 | * [Please stop calling databases CP or AP](https://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html) 51 | * [The CAP FAQ](https://www.the-paper-trail.org/page/cap-faq) 52 | * [You Can't Sacrifice Partition Tolerance](https://codahale.com/you-cant-sacrifice-partition-tolerance) 53 | 54 | ## Papers 55 | 56 | * [Foundational distributed systems papers](https://muratbuffalo.blogspot.com/2021/02/foundational-distributed-systems-papers.html) (collection) 57 | * [Distributed Systems Reading List](https://dancres.github.io/Pages) (collection) 58 | * [Best Paper Awards in Computer Science](https://jeffhuang.com/best_paper_awards) (collection) 59 | * [Ask HN: Recommended books and papers on distributed systems?](https://news.ycombinator.com/item?id=25987664) (collection) 60 | * [The Google File System](https://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf) 61 | * [MapReduce: Simplified Data Processing on Large Clusters](https://static.googleusercontent.com/media/research.google.com/en//archive/mapreduce-osdi04.pdf) 62 | * [Raft: In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) 63 | * [Paxos Made Simple](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) 64 | * [Zab: A simple totally ordered broadcast protocol](http://diyhpl.us/~bryan/papers2/distributed/distributed-systems/zab.totally-ordered-broadcast-protocol.2008.pdf) 65 | * [The Chubby lock service for loosely-coupled distributed systems](https://static.googleusercontent.com/media/research.google.com/en//archive/chubby-osdi06.pdf) 66 | * [Spanner: Google's Globally-Distributed Database](https://static.googleusercontent.com/media/research.google.com/en//archive/spanner-osdi2012.pdf) 67 | * [Dynamo: Amazon’s Highly Available Key-value Store](https://s3.amazonaws.com/AllThingsDistributed/sosp/amazon-dynamo-sosp2007.pdf) 68 | * [HyperLogLog in Practice](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf) 69 | * [Dapper, a Large-Scale Distributed Systems Tracing Infrastructure](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36356.pdf) 70 | * [Large-scale cluster management at Google with Borg](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43438.pdf) 71 | * [Linearizability: A Correctness Condition for Concurrent Objects](https://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf) 72 | * [Harvest, Yield, and Scalable Tolerant Systems](https://s3.amazonaws.com/systemsandpapers/papers/FOX_Brewer_99-Harvest_Yield_and_Scalable_Tolerant_Systems.pdf) 73 | * [Life beyond Distributed Transactions](https://web.archive.org/web/20210303104924/https://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf) (webarchive) 74 | * [The ϕ Accrual Failure Detector](https://web.archive.org/web/20170517022242/http://fubica.lsd.ufcg.edu.br/hp/cursos/cfsc/papers/hayashibara04theaccrual.pdf) (webarchive) 75 | * [Conflict-free Replicated Data Types](https://hal.inria.fr/inria-00609399v1/document) 76 | * [FLP - Impossibility of Distributed Consensus 
with One Faulty Process](https://web.archive.org/web/20210211213256/http://macs.citadel.edu/rudolphg/csci604/ImpossibilityofConsensus.pdf) (webarchive) 77 | * [SEDA: An Architecture for Well-Conditioned, Scalable Internet Services](http://nms.lcs.mit.edu/~kandula/projects/killbots/killbots_files/seda-sosp01.pdf) 78 | * [Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf) 79 | * [Hashed and Hierarchical Timing Wheels](http://www.cs.columbia.edu/~nahum/w6998/papers/sosp87-timing-wheels.pdf) 80 | * [Merkle Hash Tree based Techniques for Data Integrity of Outsourced Data](http://ceur-ws.org/Vol-1366/paper13.pdf) 81 | * [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf) 82 | * [Fallacies of Distributed Computing Explained](https://web.archive.org/web/20201108163119/http://www.rgoarchitects.com/Files/fallacies.pdf) (webarchive) 83 | * [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 84 | * [A Dataset of Dockerfiles](https://arxiv.org/pdf/2003.12912.pdf) 85 | 86 |
87 | 88 | 160 | -------------------------------------------------------------------------------- /docs/kafka.md: -------------------------------------------------------------------------------- 1 | # Kafka 2 | 3 | > **Kafka** is a distributed streaming platform 4 | 5 | Resources 6 | 7 | * [Documentation](https://kafka.apache.org) 8 | 9 | * [Kafka: The Definitive Guide](https://amzn.to/2tQCryv) (2017) by Gwen Shapira, Neha Narkhede, Todd Palino (Book) 10 | 11 | * [Kafka Streams in Action](https://www.manning.com/books/kafka-streams-in-action) (2018) by William P. Bejeck Jr. (Book) 12 | 13 | * [Kafka: a Distributed Messaging System for Log Processing](http://notes.stephenholiday.com/Kafka.pdf) (Paper) 14 | 15 | * [The Internals of Kafka Streams](https://jaceklaskowski.gitbooks.io/mastering-kafka-streams) (Book) 16 | 17 | * [Gently down the stream](https://www.gentlydownthe.stream) (Kid's Book) 18 | 19 | * [Schema Registry](https://docs.confluent.io/current/schema-registry/docs/index.html) 20 | 21 | * [KafkaProducer javadocs](https://kafka.apache.org/20/javadoc/index.html?org/apache/kafka/clients/producer/KafkaProducer.html) 22 | 23 | * [KafkaConsumer javadocs](https://kafka.apache.org/20/javadoc/index.html?org/apache/kafka/clients/consumer/KafkaConsumer.html) 24 | 25 | * [Reactive Kafka](https://doc.akka.io/docs/akka-stream-kafka/current/home.html) 26 | 27 | ## Architecture 28 | 29 | * Kafka is a publish/subscribe messaging system often described as a *distributed commit log* or *distributing streaming platform* 30 | 31 | * The unit of data is called a **message**, which is simply an array of bytes and it can have a **key** used to assign partitions. A **batch** is a collection of messages, all of which are being produced to the same topic and partition 32 | 33 | * Messages are categorized into **topics** which are additionally broken down into a number of **partitions**. Each partition is splitted into **segments** for storage purposes and each segment is stored in a single data file which contains messages and their offsets 34 | 35 | * Messages are written in an append-only fashion and are read in order from beginning to end. As a topic typically has multiple partitions, there is no guarantee of message time-ordering across the entire topic, just within a single partition 36 | 37 | * In order to help brokers quickly locate the message for a given offset, Kafka maintains an **index** for each partition. The index maps offsets to segment files and positions within the file 38 | 39 | * A **stream** is considered to be a single topic of data, regardless of the number of partitions 40 | 41 | * **Producers**, publishers or writers, create new messages to a specific topic. By default, the producer does not care what partition a specific message is written to and will balance messages over all partitions of a topic evenly 42 | 43 | ![kafka-producer](img/kafka-producer.png) 44 | 45 | * **Consumers**, subscribers or readers, read messages. The consumer subscribes to one or more topics and reads the messages in the order in which they were produced. The consumer keeps track of which messages it has already consumed by keeping track of the **offset** of messages i.e. an integer value that continually 46 | increases. Each message in a given partition has a unique offset stored either in Zookeeper or in Kafka itself 47 | 48 | ![kafka-consumer](img/kafka-consumer.png) 49 | 50 | * Consumers work as part of a **consumer group**, which is one or more consumers that work together to consume a topic. 
The group assures that each partition is only consumed by one member. The mapping of a consumer to a partition is often called **ownership** of the partition by the consumer 51 | 52 | * When a new consumer is added to a group, or when a consumer shuts down or crashes leaving the group, it causes a reassignment of partitions to other consumers. Moving partition ownership from one consumer to another is called a **rebalance**, which provides high availability and scalability 53 | 54 | * Consumers maintain membership in a consumer group and ownership of the partitions assigned to them by sending **heartbeats** to a Kafka broker designated as the **group coordinator** 55 | 56 | * You can't have multiple consumers that belong to the same group in one thread and you can't have multiple threads safely use the same consumer 57 | 58 | ![kafka-consumer-group](img/kafka-consumer-group.png) 59 | 60 | * Consumers must keep polling or they will be considered dead and the partitions they are consuming will be handed to another consumer in the group to continue consuming. Consumers **commit** (track) their offset (position) in each partition to a special `__consumer_offsets` topic. If a consumer crashes or a new consumer joins the consumer group, this will trigger a rebalance. After a rebalance, each consumer may be assigned a different set of partitions than the one it processed before. In order to know where to pick up the work, the consumer will read the latest committed offset of each partition and continue from there 61 | 62 | ![kafka-rebalance-duplicate](img/kafka-rebalance-duplicate.png) 63 | ![kafka-rebalance-lost](img/kafka-rebalance-lost.png) 64 | 65 | * A single Kafka server is called a **broker**. The broker receives messages from producers, assigns offsets to them, and commits the messages to storage on disk. It also services consumers, responding to fetch requests for partitions with the messages that have been committed to disk 66 | 67 | * Kafka brokers are designed to operate as part of a **cluster**. A partition is owned by a single broker in the cluster and that broker is called the **leader** of the partition. A partition may be assigned to multiple brokers, which will result in the partition being replicated. All events are produced to and consumed from the *leader* replica. Other *follower* replicas just need to stay **in-sync** with the leader and replicate all the recent events on time 68 | 69 | * Kafka uses **Zookeeper** to maintain the list of brokers that are currently members of a cluster. Every time a broker process starts, it registers itself with a unique identifier by creating an [ephemeral node](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#Ephemeral+Nodes). Kafka uses Zookeeper's ephemeral node feature to elect a **controller**. The controller is responsible for electing leaders among the partitions and replicas whenever it notices that nodes join or leave the cluster 70 | 71 | ![kafka-cluster](img/kafka-cluster.png) 72 | 73 | * Data in Kafka is organized by topics. Each topic is partitioned and each partition can have multiple **replicas**. Those replicas are stored on brokers and each broker stores replicas belonging to different topics and partitions 74 | 75 | * A key feature is that of **retention**. Brokers are configured with a default retention setting for topics, either retaining messages for some period of *time* or until the topic reaches a certain *size* in bytes.
Once these limits are reached, messages are expired and deleted 76 | 77 | * **MirrorMaker** is a tool used to coordinate multiple clusters or datacenters and replicate data between them 78 | 79 | ## Details 80 | 81 | * The underlying technology of a Kafka topic is a **log**, which is a file: an append-only, totally ordered sequence of records ordered by time. Topics in Kafka are logs that are segregated by topic name 82 | 83 | * The configuration setting `log.dir` specifies where Kafka stores log data and each topic maps to a subdirectory. There will be as many subdirectories as there are topic partitions, with a format of `partition-name_partition-number`. Once the log files reach a certain size (either a number of records or size on disk), or when a configured time difference between message timestamps is reached, the log file is **rolled** and Kafka appends incoming messages to a new log 84 | 85 | * To manage the increasing size of the logs, Kafka rolls them into **segments**. The timing of log rolling is based on *timestamps* embedded in the messages. Kafka rolls a log when a new message arrives and its timestamp is greater than the timestamp of the first message in the log plus the `log.roll.ms` setting. At that point, the log is rolled and a new segment is created as the new active log. The previous active segment is still used to retrieve messages for consumers. Over time, the number of segments will continue to grow, and older segments will need to be deleted to make room for incoming data. To handle the deletion, you can specify how long to retain the segments by `log.retention` configurations 86 | 87 | * **Log compaction** ensures that Kafka will always retain at least the last known value for each message key within the log of data for a single topic partition. Instead of taking a coarse-grained approach and deleting entire segments based on time or size, compaction is more fine-grained and deletes old records per key in a log. A log cleaner (a pool of threads) runs in the background, recopying log-segment files and removing records if there's an occurrence later in the log with the same key. To use compaction for a topic, set the `log.cleanup.policy=compact` property when creating the topic (see the console sketch at the end of this page). With a compacted topic, deletion provides a `null` value for the given key, setting a tombstone marker 88 | 89 | * **Partitions** guarantee that data with the same keys will be sent to the same consumer and in order. Partitioning a topic essentially splits the data forwarded to a topic across parallel streams, and it's key for performance and high throughput. Each message has an **offset** number assigned to it. The order of messages across partitions isn't guaranteed, but the order of messages within each partition is guaranteed 90 | 91 | * Kafka works with data in key/value pairs. If the keys are `null`, the Kafka producer will write records to partitions chosen in a round-robin fashion, otherwise Kafka uses the formula `partition = hashCode(key) % numberOfPartitions` to determine which partition to send the key/value pair to. By using a deterministic approach to select a partition, records with the same key will always be sent to the same partition and in order 92 | 93 | * To determine the correct number of partitions, one of the key considerations is the amount of data flowing into a given topic. More data implies more partitions for higher throughput. On the other hand, increasing the number of partitions increases the number of TCP connections and open file handles.
Additionally, how long it takes to process an incoming record in a consumer will also determine throughput. If there is heavyweight processing in a consumer, adding more partitions may help, but ultimately the slower processing will hinder performance 94 | 95 | * Kafka has the notion of **leader** and **follower** brokers. In Kafka, for each topic partition, one broker is chosen as the leader for the other brokers (the followers). One of the chief duties of the leader is to assign [replication](http://kafka.apache.org/documentation/#design_replicatedlog) of topic partitions to the follower brokers. When producing messages, Kafka sends the record to the broker that is the leader for the record's partition. Brokers that follow a topic partition consume messages from the topic-partition leader and append those records to their log 96 | 97 | * Kafka uses *ZooKeeper* to **elect** the **controller** broker of the cluster. If the controlling broker fails or becomes unavailable for any reason, ZooKeeper elects a new controller from a set of brokers that are considered to be caught up with the leader (an in-sync replica **ISR**). The brokers that make up this set are dynamic, and ZooKeeper recognizes only brokers in this set for election as leader. If a Kafka node dies or is unresponsive (to ZooKeeper heartbeats), all of its assigned partitions (both leader and follower) are reassigned by the controller broker 98 | 99 | * Kafka Streams is a library that allows to perform per-event processing of records, without grouping data in microbatches 100 | 101 | * Kafka Streams is a graph (or **topology** or Directed Acyclic Graph) of processing nodes or **processors** that combine to provide powerful and complex stream processing. Each processing node performs its assigned **task** and then forwards the record to each of its child node. Records (a key/value pair) flow through the graph in a depth-first manner, which implies that there is no need to have backpressure 102 | 103 | * Treating an event stream as inserts, and events with keys as updates, is how to defined the relationship between **streams** and **tables**. If a stream of events is as a log, a stream of updates is as a changelog. Both a log and a changelog represent incoming records appended to the end of a file. In a log there are all the records; but in a changelog, there are only the latest record for any given key 104 | 105 | * A `KTable` is often described as being a materialized view of a `KStream`, a view of a stream is nothing but a **per-key aggregation** 106 | 107 | ## Setup 108 | 109 | Requirements 110 | 111 | * [Base](docker/#base-image) docker image 112 | * [ZooKeeper](zookeeper) docker image 113 | 114 | Build `devops/kafka` image 115 | ```bash 116 | # change path 117 | cd devops/kafka 118 | 119 | # build image 120 | docker build -t devops/kafka . 
121 | 122 | # create network 123 | docker network create --driver bridge my_network 124 | docker network ls 125 | docker network inspect my_network 126 | 127 | # start temporary zookeeper container [host:container] 128 | docker run --rm \ 129 | --name zookeeper \ 130 | -p 12181:2181 \ 131 | --network=my_network \ 132 | devops/zookeeper 133 | # access container 134 | docker exec -it zookeeper bash 135 | 136 | # start temporary kafka container [host:container] 137 | docker run --rm \ 138 | --name kafka \ 139 | -p 19092:9092 \ 140 | --network=my_network \ 141 | -e ZOOKEEPER_HOSTS="zookeeper:2181" \ 142 | devops/kafka 143 | # access container 144 | docker exec -it kafka bash 145 | 146 | # paths 147 | /opt/kafka 148 | /opt/kafka/logs 149 | /var/lib/kafka/data 150 | 151 | # supervisor logs 152 | /var/log/kafka 153 | /var/log/connect 154 | tail -F /var/log/kafka/stdout 155 | less +G /var/log/connect/stdout 156 | ``` 157 | 158 | Alternatively use `docker-compose` 159 | ```bash 160 | # change path 161 | cd devops/kafka 162 | 163 | # build base image 164 | docker build -t devops/base ../base 165 | # build + start zookeeper and kafka 166 | docker-compose up 167 | 168 | # access container 169 | docker exec -it devops-zookeeper bash 170 | docker exec -it devops-kafka bash 171 | ``` 172 | 173 | ## How-To 174 | 175 | Kafka 176 | ```bash 177 | docker exec -it devops-kafka bash 178 | 179 | # create topic 180 | kafka-topics.sh --zookeeper zookeeper:2181 \ 181 | --create --if-not-exists --replication-factor 1 --partitions 1 --topic test 182 | 183 | # view topic 184 | kafka-topics.sh --zookeeper zookeeper:2181 --list 185 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --topic test 186 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --under-replicated-partitions 187 | kafka-topics.sh --zookeeper zookeeper:2181 --describe --unavailable-partitions 188 | 189 | # produce 190 | kafka-console-producer.sh --broker-list kafka:9092 --topic test 191 | # util 192 | kafkacat -P -b 0 -t test 193 | 194 | # consume 195 | kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic test --from-beginning 196 | # util 197 | kafkacat -C -b 0 -t test 198 | 199 | # list consumers 200 | kafka-consumer-groups.sh --bootstrap-server kafka:9092 --list 201 | # view lag (GROUP_NAME from previous command) 202 | kafka-consumer-groups.sh --bootstrap-server kafka:9092 --describe --group GROUP_NAME 203 | 204 | # delete 205 | kafka-topics.sh --zookeeper zookeeper:2181 --delete --topic test 206 | 207 | # verify log segment and index 208 | kafka-run-class.sh kafka.tools.DumpLogSegments \ 209 | --files /var/lib/kafka/data/test-0/00000000000000000000.log 210 | kafka-run-class.sh kafka.tools.DumpLogSegments \ 211 | --index-sanity-check \ 212 | --files /var/lib/kafka/data/test-0/00000000000000000000.index 213 | 214 | # inspect __consumer_offsets 215 | kafka-console-consumer.sh --bootstrap-server kafka:9092 \ 216 | --topic __consumer_offsets \ 217 | --formatter "kafka.coordinator.group.GroupMetadataManager\$OffsetsMessageFormatter" \ 218 | --max-messages 1 219 | ``` 220 | 221 | Connect 222 | ```bash 223 | docker exec -it devops-kafka bash 224 | 225 | # verify connect 226 | http :8083 227 | http :8083/connector-plugins 228 | 229 | # write file to topic 230 | http POST :8083/connectors \ 231 | name=load-kafka-config \ 232 | config:='{"connector.class":"FileStreamSource","file":"/opt/kafka/config/server.properties","topic":"kafka-config-topic"}' 233 | 234 | # verify topic 235 | kafka-console-consumer.sh --bootstrap-server=kafka:9092 \ 
236 | --topic kafka-config-topic --from-beginning 237 | 238 | # write topic to file 239 | http POST :8083/connectors \ 240 | name=dump-kafka-config \ 241 | config:='{"connector.class":"FileStreamSink","file":"/tmp/copy-of-server-properties","topics":"kafka-config-topic"}' 242 | 243 | # verify file 244 | vim /tmp/copy-of-server-properties 245 | 246 | # manage connectors 247 | http :8083/connectors 248 | http DELETE :8083/connectors/dump-kafka-config 249 | ``` 250 | 251 | ZooKeeper 252 | ```bash 253 | docker exec -it devops-zookeeper bash 254 | 255 | # start cli 256 | zkCli.sh 257 | 258 | # view ephemeral nodes 259 | ls /brokers/ids 260 | get /brokers/ids/0 261 | 262 | # view topics 263 | ls /brokers/topics 264 | get /brokers/topics/test 265 | ``` 266 | 267 | Schema Registry 268 | ```bash 269 | # docker-hub images 270 | docker-compose -f kafka/docker-compose-hub.yml up 271 | docker exec -it devops-schema-registry bash 272 | 273 | # register new schema 274 | http -v POST :8081/subjects/ExampleSchema/versions \ 275 | Accept:application/vnd.schemaregistry.v1+json \ 276 | schema='{"type":"string"}' 277 | 278 | # list subjects and schema 279 | http -v :8081/subjects \ 280 | Accept:application/vnd.schemaregistry.v1+json 281 | http -v :8081/subjects/ExampleSchema/versions \ 282 | Accept:application/vnd.schemaregistry.v1+json 283 | http -v :8081/subjects/ExampleSchema/versions/1 \ 284 | Accept:application/vnd.schemaregistry.v1+json 285 | 286 | # ui [mac|linux] 287 | [open|xdg-open] http://localhost:8082 288 | ``` 289 | 290 |
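To see the key based partitioning and log compaction described above from the console tools, a hedged sketch that follows the same setup; the `keyed-test` and `compacted-test` topic names are just examples
```bash
docker exec -it devops-kafka bash

# keyed messages: records with the same key always land in the same partition
kafka-topics.sh --zookeeper zookeeper:2181 \
  --create --if-not-exists --replication-factor 1 --partitions 3 --topic keyed-test
kafka-console-producer.sh --broker-list kafka:9092 --topic keyed-test \
  --property parse.key=true --property key.separator=:
kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic keyed-test \
  --from-beginning --property print.key=true

# compacted topic: after cleaning, only the latest record per key is retained
kafka-topics.sh --zookeeper zookeeper:2181 \
  --create --if-not-exists --replication-factor 1 --partitions 1 \
  --topic compacted-test --config cleanup.policy=compact
kafka-topics.sh --zookeeper zookeeper:2181 --describe --topic compacted-test
```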
291 | -------------------------------------------------------------------------------- /docs/hadoop.md: -------------------------------------------------------------------------------- 1 | # Hadoop 2 | 3 | The following guide explains how to provision a Multi Node Hadoop Cluster locally and play with it. Checkout the [Vagrantfile](https://github.com/niqdev/devops/blob/master/hadoop/Vagrantfile) and the Vagrant [guide](other/#vagrant) for more details. 4 | 5 | Resources 6 | 7 | * [Documentation](https://hadoop.apache.org) 8 | 9 | * [Hadoop: The Definitive Guide](https://amzn.to/2Kxc8bg) (2015)(4th) by Tom White (Book) 10 | 11 | * [The Hadoop Ecosystem Table](https://hadoopecosystemtable.github.io) 12 | 13 | * [Hadoop Internals](https://ercoppa.github.io/HadoopInternals) 14 | 15 | ### Setup 16 | 17 | Requirements 18 | 19 | * [Vagrant](https://www.vagrantup.com) 20 | * [VirtualBox](https://www.virtualbox.org) 21 | 22 | Directory structure 23 | ```bash 24 | tree -a hadoop/ 25 | hadoop/ 26 | ├── .data # mounted volume 27 | │   ├── hadoop_rsa 28 | │   ├── hadoop_rsa.pub 29 | │   ├── master 30 | │   │   ├── hadoop 31 | │   │   │   ├── log 32 | │   │   │   │   ├── hadoop 33 | │   │   │   │   ├── mapred 34 | │   │   │   │   └── yarn 35 | │   │   │   ├── namenode 36 | │   │   │   └── secondary 37 | │   │   ├── oozie 38 | │   │   │   ├── data 39 | │   │   │   └── log 40 | │   │   ├── spark 41 | │   │   │   └── log 42 | │   │   └── zeppelin 43 | │   │   ├── log 44 | │   │   └── notebook 45 | │   ├── node-1 46 | │   │   └── hadoop 47 | │   │   ├── datanode 48 | │   │   └── log 49 | │   │      ├── hadoop 50 | │   │      ├── mapred 51 | │   │      └── yarn 52 | │   ├── node-2 53 | │   ├── node-3 54 | ├── example 55 | │   ├── map-reduce 56 | │   └── spark 57 | ├── file 58 | │   ├── hadoop 59 | │   │   ├── config 60 | │   │   │   ├── core-site.xml 61 | │   │   │   ├── fair-scheduler.xml 62 | │   │   │   ├── hdfs-site.xml 63 | │   │   │   ├── mapred-site.xml 64 | │   │   │   ├── masters 65 | │   │   │   ├── slaves 66 | │   │   │   └── yarn-site.xml 67 | │   │   └── profile-hadoop.sh 68 | │   ├── hosts 69 | │   ├── motd 70 | │   ├── oozie 71 | │   │   ├── config 72 | │   │   │   ├── oozie-env.sh 73 | │   │   │   └── oozie-site.xml 74 | │   │   └── profile-oozie.sh 75 | │   ├── spark 76 | │   │   ├── config 77 | │   │   │   ├── log4j.properties 78 | │   │   │   └── spark-env.sh 79 | │   │   └── profile-spark.sh 80 | │   ├── ssh 81 | │   │   └── config 82 | │   └── zeppelin 83 | │   ├── config 84 | │   │   └── zeppelin-env.sh 85 | │   └── profile-zeppelin.sh 86 | ├── script 87 | │   ├── bootstrap.sh 88 | │   ├── setup_hadoop.sh 89 | │   ├── setup_oozie.sh 90 | │   ├── setup_spark.sh 91 | │   ├── setup_ubuntu.sh 92 | │   └── setup_zeppelin.sh 93 | ├── Vagrantfile 94 | └── vagrant_hadoop.sh 95 | ``` 96 | 97 | Import the script 98 | ```bash 99 | source vagrant_hadoop.sh 100 | ``` 101 | 102 | Create and start a Multi Node Hadoop Cluster 103 | ```bash 104 | hadoop-start 105 | ``` 106 | *The first time it might take a while* 107 | 108 | Access the cluster via ssh, check also the [/etc/hosts](https://github.com/niqdev/devops/blob/master/hadoop/file/hosts) file 109 | ```bash 110 | vagrant ssh master 111 | ssh hadoop@172.16.0.10 -i .data/hadoop_rsa 112 | 113 | # 3 nodes 114 | vagrant ssh node-1 115 | ssh hadoop@172.16.0.101 -i .data/hadoop_rsa 116 | ``` 117 | 118 | Destroy the cluster 119 | ```bash 120 | hadoop-destroy 121 | ``` 122 | 123 | For convenience add to the host machine 124 | ```bash 125 | cat 
hadoop/file/hosts | sudo tee --append /etc/hosts 126 | ``` 127 | 128 | Web UI links 129 | 130 | * NameNode: [http://namenode.local:50070](http://172.16.0.10:50070) 131 | * NameNode metrics: [http://namenode.local:50070/jmx](http://172.16.0.10:50070/jmx) 132 | * ResourceManager: [http://resource-manager.local:8088](http://172.16.0.10:8088) 133 | * Log Level: [http://resource-manager.local:8088/logLevel](http://172.16.0.10:8088/logLevel) 134 | * Web Application Proxy Server: [http://web-proxy.local:8100/proxy/application_XXX_0000](http://172.16.0.10:8100/proxy/application_XXX_0000) 135 | * MapReduce Job History Server: [http://history.local:19888](http://172.16.0.10:19888) 136 | * DataNode/NodeManager (1): [http://node-1.local:8042/node](http://172.16.0.101:8042/node) 137 | * DataNode/NodeManager (2): [http://node-2.local:8042/node](http://172.16.0.102:8042/node) 138 | * DataNode/NodeManager (3): [http://node-3.local:8042/node](http://172.16.0.103:8042/node) 139 | * Spark: [http://spark.local:4040](http://172.16.0.10:4040) 140 | * Spark History Server: [http://spark-history.local:18080](http://172.16.0.10:18080) 141 | * Zeppelin (*): [http://zeppelin.local:8080](http://172.16.0.10:8080) 142 | * Oozie (*): [http://oozie.local:11000](http://172.16.0.10:11000) 143 | 144 | *(\*) Not installed by default* 145 | 146 | ## HDFS and MapReduce 147 | 148 | > **HDFS** is a distributed file system that provides high-throughput access to application data 149 | 150 | > **YARN** is a framework for job scheduling and cluster resource management 151 | 152 | > **MapReduce** is a YARN-based system for parallel processing of large data sets 153 | 154 | Documentation 155 | 156 | * [Hadoop v2.7.6](http://hadoop.apache.org/docs/r2.7.6) 157 | * [Untangling Apache Hadoop YARN](http://blog.cloudera.com/blog/2015/09/untangling-apache-hadoop-yarn-part-1/) series 158 | 159 | ### Admin 160 | 161 | HDFS cli 162 | ```bash 163 | # help 164 | hdfs 165 | 166 | # filesystem statistics 167 | hdfs dfsadmin -report 168 | 169 | # filesystem check 170 | hdfs fsck / 171 | ``` 172 | 173 | YARN cli 174 | ```bash 175 | # help 176 | yarn 177 | 178 | # list yarn applications 179 | yarn application -list 180 | 181 | # list nodes 182 | yarn node -list 183 | 184 | # view application logs 185 | yarn logs -applicationId APPLICATION_ID 186 | 187 | # kill yarn application 188 | yarn application -kill APPLICATION_ID 189 | ``` 190 | 191 | Useful paths 192 | ```bash 193 | # data and logs 194 | devops/hadoop/.data/master/hadoop # host 195 | /vol/hadoop # guest 196 | 197 | # (guest) config 198 | /usr/local/hadoop/etc/hadoop 199 | 200 | # (hdfs) map-reduce history 201 | /mr-history/history/done_intermediate/hadoop 202 | 203 | # (hdfs) aggregated app logs 204 | /yarn/app/hadoop/logs/application_XXX 205 | ``` 206 | 207 | ### MapReduce WordCount Job 208 | 209 | ```bash 210 | # build jar on the host machine 211 | cd devops/hadoop/example/map-reduce 212 | ./gradlew clean build 213 | 214 | cd devops/hadoop 215 | vagrant ssh master 216 | 217 | # create base directory using hdfs 218 | hdfs dfs -mkdir -p /user/ubuntu 219 | 220 | # create example directory 221 | hadoop fs -mkdir -p /user/ubuntu/word-count/input 222 | 223 | # list directory 224 | hadoop fs -ls -h -R / 225 | hadoop fs -ls -h -R /user/ubuntu 226 | 227 | # create sample files 228 | echo "Hello World Bye World" > file01 229 | echo "Hello Hadoop Goodbye Hadoop" > file02 230 | 231 | # copy from local to hdfs 232 | hadoop fs -copyFromLocal file01 /user/ubuntu/word-count/input 233 | hadoop fs -put 
file02 /user/ubuntu/word-count/input 234 | 235 | # verify copied files 236 | hadoop fs -ls -h -R /user/ubuntu 237 | hadoop fs -cat /user/ubuntu/word-count/input/file01 238 | hadoop fs -cat /user/ubuntu/word-count/input/file02 239 | hadoop fs -cat /user/ubuntu/word-count/input/* 240 | 241 | # run application 242 | hadoop jar /vagrant/example/map-reduce/build/libs/map-reduce.jar \ 243 | /user/ubuntu/word-count/input \ 244 | /user/ubuntu/word-count/output 245 | 246 | # check output 247 | hadoop fs -cat /user/ubuntu/word-count/output/part-r-00000 248 | 249 | # delete directory to run it again 250 | hadoop fs -rm -R /user/ubuntu/word-count/output 251 | 252 | # run sample job in a different queue 253 | hadoop jar \ 254 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 255 | wordcount \ 256 | -Dmapreduce.job.queuename=root.priority_queue \ 257 | /user/ubuntu/word-count/input \ 258 | /user/ubuntu/word-count/output 259 | 260 | # well known WARN issue 261 | # https://issues.apache.org/jira/browse/HDFS-10429 262 | ``` 263 | 264 | ### Benchmarking MapReduce with TeraSort 265 | 266 | ```bash 267 | # generate random data 268 | hadoop jar \ 269 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 270 | teragen 1000 random-data 271 | 272 | # run terasort benchmark 273 | hadoop jar \ 274 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 275 | terasort random-data sorted-data 276 | 277 | # validate data 278 | hadoop jar \ 279 | $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ 280 | teravalidate sorted-data report 281 | 282 | # useful commands 283 | hadoop fs -ls -h -R . 284 | hadoop fs -rm -r random-data 285 | hadoop fs -cat random-data/part-m-00000 286 | hadoop fs -cat sorted-data/part-r-00000 287 | ``` 288 | 289 |
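### Benchmarking HDFS with TestDFSIO

HDFS read/write throughput can be measured in a similar way with `TestDFSIO`, bundled in the jobclient tests jar; a hedged sketch, where the jar path and file sizes are assumptions to adapt to this cluster
```bash
# locate the tests jar (the version in the file name may differ)
ls $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar

# write benchmark: 4 files of 64MB each
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -write -nrFiles 4 -fileSize 64MB

# read benchmark on the same files
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -read -nrFiles 4 -fileSize 64MB

# throughput and IO rate are appended to a local TestDFSIO_results.log
cat TestDFSIO_results.log

# cleanup benchmark data on hdfs
hadoop jar \
  $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar \
  TestDFSIO -clean
```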
290 | 291 | ## Spark 292 | 293 | > **Spark** is an open-source cluster-computing framework 294 | 295 | Resources 296 | 297 | * [Documentation](https://spark.apache.org/docs/latest) 298 | 299 | * [Spark in Action](https://amzn.to/2MzgHio) (2016) by Petar Zečević and Marko Bonaći (Book) 300 | 301 | * [Big Data Analysis with Scala and Spark](https://www.coursera.org/learn/scala-spark-big-data) (Course) 302 | 303 | * [How-to: Tune Your Apache Spark Jobs](http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-1) series 304 | 305 | * [Understanding Resource Allocation configurations for a Spark application](http://site.clairvoyantsoft.com/understanding-resource-allocation-configurations-spark-application) 306 | 307 | * [Apache Spark: Config Cheatsheet](http://c2fo.io/c2fo/spark/aws/emr/2016/07/06/apache-spark-config-cheatsheet) 308 | 309 | * [Mastering Apache Spark](https://legacy.gitbook.com/book/jaceklaskowski/mastering-apache-spark) 310 | 311 | * [Managing Spark Partitions with Coalesce and Repartition](https://hackernoon.com/managing-spark-partitions-with-coalesce-and-repartition-4050c57ad5c4) 312 | 313 | * [Understanding Apache Spark on YARN](http://sujithjay.com/2018/07/24/Understanding-Apache-Spark-on-YARN/) 314 | 315 | ![spark-architecture](img/spark-architecture.png) 316 | 317 | Spark application on YARN 318 | 319 | ![spark-job](img/spark-job.png) 320 | 321 | ```bash 322 | # start REPL 323 | spark-shell 324 | pyspark 325 | ``` 326 | 327 | ### Interactive Analysis example 328 | 329 | ```bash 330 | spark-shell 331 | # spark shell with yarn 332 | spark-shell --master yarn --deploy-mode client 333 | 334 | # view all configured parameters 335 | sc.getConf.getAll.foreach(x => println(s"${x._1}: ${x._2}")) 336 | 337 | val licenceLines = sc.textFile("file:/usr/local/spark/LICENSE") 338 | val lineCount = licenceLines.count 339 | val isBsd = (line: String) => line.contains("BSD") 340 | val bsdLines = licenceLines.filter(isBsd) 341 | bsdLines.count 342 | bsdLines.foreach(println) 343 | ``` 344 | 345 | ### Spark Job examples 346 | 347 | Example local 348 | ```bash 349 | # run SparkPi example 350 | spark-submit \ 351 | --class org.apache.spark.examples.SparkPi \ 352 | --master local[*] \ 353 | $SPARK_HOME/examples/jars/spark-examples_*.jar 10 354 | 355 | # GitHub event documentation 356 | # https://developer.github.com/v3/activity/events/types 357 | 358 | # build jar on the host machine 359 | cd devops/hadoop/example/spark 360 | sbt clean package 361 | 362 | cd devops/hadoop 363 | vagrant ssh master 364 | 365 | # sample dataset 366 | mkdir -p github-archive && \ 367 | cd $_ && \ 368 | wget http://data.githubarchive.org/2018-01-01-{0..10}.json.gz && \ 369 | gunzip -k * 370 | # sample line 371 | head -n 1 2018-01-01-0.json | jq '.' 372 | 373 | # run local job 374 | spark-submit \ 375 | --class "com.github.niqdev.App" \ 376 | --master local[*] \ 377 | /vagrant/example/spark/target/scala-2.11/spark-github_2.11-0.1.0-SNAPSHOT.jar 378 | ``` 379 | 380 | Example cluster 381 | ```bash 382 | # run job in YARN cluster-deploy mode 383 | spark-submit \ 384 | --class org.apache.spark.examples.SparkPi \ 385 | --master yarn \ 386 | --deploy-mode cluster \ 387 | --driver-memory 2g \ 388 | --executor-memory 1g \ 389 | --executor-cores 3 \ 390 | --queue default \ 391 | $SPARK_HOME/examples/jars/spark-examples*.jar \ 392 | 10 393 | 394 | # --conf "spark.yarn.jars=hdfs://namenode.local:9000/user/spark/share/lib/*.jar" 395 | ``` 396 | 397 |
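The same word-count input uploaded to HDFS in the MapReduce example can be processed interactively from the Spark shell; a minimal sketch, assuming the `fs.defaultFS` address configured for this cluster, with `output-spark` used only as an example path
```bash
# reuse the files created in the MapReduce WordCount example
spark-shell --master yarn --deploy-mode client

val lines = sc.textFile("hdfs://namenode.local:9000/user/ubuntu/word-count/input")
val counts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
counts.collect.foreach(println)
counts.saveAsTextFile("hdfs://namenode.local:9000/user/ubuntu/word-count/output-spark")
```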
398 | 399 | ## Zeppelin 400 | 401 | > **Zeppelin** is a web-based notebook that enables data-driven, interactive data analytics and collaborative documents with SQL, Scala and more 402 | 403 | Resources 404 | 405 | * [Documentation](https://zeppelin.apache.org) 406 | 407 | ### Setup 408 | 409 | Install and start Zeppelin 410 | ```bash 411 | # access master node 412 | vagrant ssh master 413 | 414 | # login as root 415 | sudo su - 416 | 417 | # install and init 418 | /vagrant/script/setup_zeppelin.sh 419 | 420 | # start manually (first time only) 421 | su --login hadoop /vagrant/script/bootstrap.sh zeppelin 422 | ``` 423 | 424 | ### Examples 425 | 426 | * [Learning Spark SQL with Zeppelin](https://hortonworks.com/tutorial/learning-spark-sql-with-zeppelin) 427 | 428 | ``` 429 | # markdown interpreter 430 | %md 431 | hello 432 | 433 | # shell interpreter 434 | %sh 435 | hadoop fs -ls -h -R / 436 | ``` 437 | 438 | Cluster issue: verify to have enough memory with `free -m` e.g. *Error: Cannot allocate memory* 439 | 440 |
441 | 442 | ## Oozie 443 | 444 | > **Oozie** is a workflow scheduler system to manage Hadoop jobs 445 | 446 | Resources 447 | 448 | * [Documentation](https://oozie.apache.org) 449 | 450 | ### Setup 451 | 452 | **Optional PostgreSQL configuration** - By default Oozie is configured to use Embedded Derby 453 | ```bash 454 | # access master node 455 | vagrant ssh master 456 | 457 | # install docker 458 | curl -fsSL get.docker.com -o get-docker.sh && \ 459 | chmod u+x $_ && \ 460 | ./$_ && \ 461 | sudo usermod -aG docker hadoop 462 | 463 | # logout and login again to verify docker installation 464 | exit 465 | vagrant ssh master 466 | whoami # hadoop 467 | docker ps -a 468 | 469 | # uncomment PostgreSQL configurations 470 | vim devops/hadoop/file/oozie/config/oozie-site.xml # from host 471 | vim /vagrant/file/oozie/config/oozie-site.xml # from guest 472 | 473 | # start postgres on guest machine 474 | docker run \ 475 | --detach \ 476 | --name oozie-postgres \ 477 | -p 5432:5432 \ 478 | -e POSTGRES_DB="oozie-db" \ 479 | -e POSTGRES_USER="postgres" \ 480 | -e POSTGRES_PASSWORD="password" \ 481 | postgres 482 | 483 | # permission issue 484 | # https://github.com/docker-library/postgres/issues/116 485 | # --volume /vol/postgres:/var/lib/postgresql/data 486 | 487 | # access container 488 | docker exec -it oozie-postgres bash 489 | psql --username=postgres 490 | # list all databases 491 | \list 492 | \connect oozie-db 493 | # list all tables 494 | \dt 495 | # describe table 496 | \d+ wf_jobs 497 | # list workflow 498 | select * from wf_jobs; 499 | ``` 500 | 501 | Install and start Oozie 502 | ```bash 503 | # access master node 504 | vagrant ssh master 505 | 506 | # login as root 507 | sudo su - 508 | 509 | # build, install and init 510 | /vagrant/script/setup_oozie.sh 511 | 512 | # start oozie manually (first time only) 513 | su --login hadoop /vagrant/script/bootstrap.sh oozie 514 | ``` 515 | *It might take a while to build the sources* 516 | 517 | Useful paths 518 | ```bash 519 | # data and logs 520 | devops/hadoop/.data/master/oozie # host 521 | /vol/oozie # guest 522 | 523 | # (guest) config 524 | /usr/local/oozie/conf 525 | 526 | # (hdfs) examples 527 | /user/hadoop/examples 528 | ``` 529 | 530 | ### Examples 531 | 532 | Run bundled examples within distribution 533 | ```bash 534 | # examples path 535 | .data/master/oozie/examples # host 536 | /vol/oozie/examples # guest 537 | 538 | # access master node as hadoop user 539 | vagrant ssh master 540 | 541 | export OOZIE_EXAMPLE_PATH=/vol/oozie/examples 542 | export OOZIE_HDFS_PATH=/user/$(whoami)/examples 543 | 544 | # open map-reduce job.properties 545 | vim $OOZIE_EXAMPLE_PATH/apps/map-reduce/job.properties 546 | 547 | # edit the following properties 548 | nameNode=hdfs://namenode.local:9000 # fs.defaultFS @ core-site.xml 549 | jobTracker=resource-manager.local:8032 # yarn.resourcemanager.address @ yarn-site.xml 550 | queueName=priority_queue # or default @ fair-scheduler.xml 551 | 552 | # upload all the examples 553 | hadoop fs -put $OOZIE_EXAMPLE_PATH $OOZIE_HDFS_PATH 554 | 555 | # verify uploaded files 556 | hadoop fs -ls -h -R /user/$(whoami) 557 | 558 | # run the map-reduce workflow example 559 | oozie job \ 560 | -oozie http://oozie.local:11000/oozie \ 561 | -config $OOZIE_EXAMPLE_PATH/apps/map-reduce/job.properties \ 562 | -run 563 | 564 | # verify status 565 | oozie job -oozie http://oozie.local:11000/oozie -info WORKFLOW_ID 566 | 567 | # verify result 568 | hadoop fs -cat $OOZIE_HDFS_PATH/output-data/map-reduce/part-00000 569 | 570 | # 
remove all the examples 571 | hadoop fs -rm -R $OOZIE_HDFS_PATH 572 | ``` 573 | 574 | ### Useful commands 575 | 576 | * Workflow requires `oozie.wf.application.path` property 577 | * Coordinator requires `oozie.coord.application.path` property 578 | 579 | ```bash 580 | # verify oozie status 581 | oozie admin \ 582 | -oozie http://oozie.local:11000/oozie \ 583 | -status 584 | 585 | # verify workflow or coordinator status 586 | oozie job \ 587 | -oozie http://oozie.local:11000/oozie \ 588 | -info JOB_ID \ 589 | -verbose 590 | 591 | # poll workflow or coordinator status 592 | oozie job \ 593 | -oozie http://oozie.local:11000/oozie \ 594 | -poll JOB_ID \ 595 | -interval 10 \ 596 | -timeout 60 \ 597 | -verbose 598 | 599 | # find running coordinator 600 | oozie jobs \ 601 | -oozie http://oozie.local:11000/oozie/ \ 602 | -filter status=RUNNING \ 603 | -jobtype coordinator 604 | 605 | # suspend|resume|kill coordinator 606 | oozie job \ 607 | -oozie http://oozie.local:11000/oozie/ \ 608 | [-suspend|-resume|-kill] \ 609 | XXX-C 610 | 611 | # re-run coordinator's workflow (action) 612 | oozie job \ 613 | -oozie http://oozie.local:11000/oozie/ \ 614 | -rerun XXX-C \ 615 | -action 1,2,3,N 616 | 617 | # kill workflow 618 | oozie job \ 619 | -oozie http://oozie.local:11000/oozie/ \ 620 | -kill \ 621 | XXX-W 622 | 623 | # re-run all workflow's actions 624 | oozie job \ 625 | -oozie http://oozie.local:11000/oozie/ \ 626 | -rerun \ 627 | XXX-W \ 628 | -Doozie.wf.rerun.failnodes=false 629 | ``` 630 | 631 |
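Beyond the bundled examples, a custom workflow only needs a `job.properties` pointing at `oozie.wf.application.path`; a hedged sketch reusing the addresses above, where `my-app` and its `workflow.xml` are placeholders
```bash
# minimal job.properties for a custom workflow
cat > job.properties <<'EOF'
nameNode=hdfs://namenode.local:9000
jobTracker=resource-manager.local:8032
queueName=default
oozie.wf.application.path=${nameNode}/user/hadoop/my-app
EOF

# upload the workflow definition to the application path on hdfs
hadoop fs -mkdir -p /user/hadoop/my-app
hadoop fs -put workflow.xml /user/hadoop/my-app/

# submit and run the workflow
oozie job \
  -oozie http://oozie.local:11000/oozie \
  -config job.properties \
  -run
```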
632 | --------------------------------------------------------------------------------