├── .gitignore ├── CHANGELOG.md ├── Jenkinsfile ├── LICENSE ├── README.md ├── SUMMARY.md ├── applications ├── README.md ├── examples.md ├── ksh.md └── kso.md ├── bulkingest └── README.md ├── community ├── README.md ├── contributions_guidelines.md └── governance.md ├── console ├── README.md ├── applications.md ├── datasets.md ├── images │ ├── app_new_1_select-package.png │ ├── app_new_2_overrides.png │ ├── app_new_3_name.png │ ├── app_running_metrics.png │ ├── app_status.png │ ├── compaction_timeline.png │ ├── component_green.png │ ├── component_red.png │ ├── datasets_none.png │ ├── home_errors_warnings4.png │ ├── home_green.png │ ├── info_hdfs_errors.png │ ├── metrics_all_green.png │ ├── metrics_errors_overview.png │ ├── metrics_filtered_errors_warnings.png │ ├── more_info_flink_streaming.png │ ├── more_info_locaton.png │ ├── more_info_on_click.png │ ├── more_info_oozie_spark_batch_c.png │ ├── more_info_spark_stream.png │ ├── more_info_spark_stream_accepted.png │ ├── packages_details.png │ └── packages_search.png ├── metrics.md ├── packages.md └── uicredentials.md ├── consumer ├── README.md └── images │ └── opentsdb.png ├── cover.jpg ├── downloads └── README.md ├── em_sd ├── README.md ├── consul.md ├── consul_dns.jpg ├── consul_nodes.jpeg ├── consul_services.jpeg ├── dns.md └── sd.md ├── exploration ├── README.md ├── dep-usage.md ├── dependencies.md ├── images │ ├── lab-challenge-2-output.png │ ├── lab-challenge-3-output.png │ ├── lab-example-1-output.png │ ├── lab-example-1.png │ ├── lab-example-3-output.png │ ├── lab-example-3.png │ ├── lab-example-4-output.png │ ├── lab-example-4.png │ ├── lab-example-5-output.png │ ├── lab-example-5-ui.png │ ├── lab-example-5.png │ ├── lab-example-6-output.png │ ├── lab-example-6-ui.png │ ├── lab-example-6.png │ └── lab-images │ │ ├── jupyter-example-cell1-output.png │ │ ├── jupyter-example-cell1.png │ │ ├── jupyter-example-cell2.png │ │ ├── jupyter-example-cell3-output.png │ │ ├── jupyter-example-cell3.png │ │ ├── 
jupyter-example-cell4-output.png │ │ ├── jupyter-example-cell4.png │ │ ├── jupyter-example-cell5-output.png │ │ ├── jupyter-example-cell5-ui.png │ │ ├── jupyter-example-cell5.png │ │ ├── jupyter-example-cell6-output.png │ │ ├── jupyter-example-cell6-ui.png │ │ └── jupyter-example-cell6.png ├── jupyter.md └── lab.md ├── gettingstarted ├── README.md └── hadoop_distro.md ├── images ├── Detailed Sequence for Kerberos.png ├── Mainconsole with LDAP.png ├── Mainconsole SPNEGO.png ├── Pnda-security.png ├── Webconsole.png ├── dm-overview.png ├── kso-summary.png ├── kso-tsd.png ├── openbmp_flow.png ├── package-server.png ├── pnda-architecture2.png ├── provisioning-heat.png └── provisioning-salt-cloud.png ├── instructions.md ├── log-aggregation └── README.md ├── others └── README.md ├── overview └── README.md ├── provisioning ├── OVERVIEW.md ├── PNDA-overview.jpg ├── README.md ├── aws.md ├── aws │ ├── BUILD.md │ ├── CONFIGURE.md │ ├── CREATE.md │ ├── EXAMPLES.md │ ├── MIRROR.md │ ├── PREPARE.md │ ├── STAGE.md │ └── images │ │ ├── attach-policy.png │ │ ├── create-policy1.png │ │ ├── create-policy2.png │ │ ├── create-policy3.png │ │ ├── create-user1.png │ │ └── create-user2.png ├── baremetal │ ├── BUILD.md │ ├── CONFIGURE.md │ ├── CREATE.md │ ├── MIRROR.md │ ├── PREPARE.md │ ├── REGISTER.md │ ├── SALTMASTER.md │ ├── STAGE.md │ ├── UNDERCLOUD.md │ ├── bm-deployment.png │ ├── bm-inspecting.png │ ├── bm-registration.png │ └── bm-workflow.png ├── building.md ├── heat.md ├── images │ ├── breadcrumbs-build.jpg │ ├── breadcrumbs-cfg.jpg │ ├── breadcrumbs-create.jpg │ ├── breadcrumbs-mirror.jpg │ ├── breadcrumbs-stage.jpg │ └── breadcrumbs.jpg ├── openstack │ ├── BUILD.md │ ├── CONFIGURE.md │ ├── CREATE.md │ ├── EXAMPLES.md │ ├── IMAGE.md │ ├── MIRROR.md │ ├── PREPARE.md │ ├── STAGE.md │ ├── adam.txt │ └── images │ │ ├── attach-policy.png │ │ ├── create-policy1.png │ │ ├── create-policy2.png │ │ ├── create-policy3.png │ │ ├── create-user1.png │ │ └── create-user2.png ├── saltstack.md 
├── server-cluster │ ├── BUILD.md │ ├── CONFIGURE.md │ ├── CREATE.md │ ├── EXAMPLES.md │ ├── MIRROR.md │ ├── PREPARE.md │ └── STAGE.md ├── versions.md ├── vmware.md └── vmware │ ├── CONFIGURE.md │ ├── CREATE.md │ ├── IMAGE.md │ └── PREPARE.md ├── query ├── README.md └── impala.md ├── repos └── README.md ├── resourcemanagement └── README.md ├── security ├── Architecture.jpg ├── Basic-Authentication.png ├── E-HDFS-Access-from-Shell.jpg ├── Impersonation.png ├── README.md ├── impala-ldap-f1.png └── settings.json ├── streamingest ├── README.md ├── data-preparation.md ├── images │ └── cluster_config.png ├── logstash.md ├── openbmp.md ├── opendl.md ├── pmacct.md ├── producer.md └── topic-preparation.md └── timeseries ├── README.md ├── grafana.md └── opentsdb.md /.gitignore: -------------------------------------------------------------------------------- 1 | _book 2 | pnda-guide.pdf 3 | pnda-guide.zip 4 | pnda-guide.epub 5 | pnda-guide.mobi 6 | *.graffle 7 | node_modules 8 | archives 9 | scripts 10 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | node { 2 | try { 3 | stage 'Build' 4 | 5 | deleteDir() 6 | 7 | checkout scm 8 | 9 | def workspace = pwd() 10 | def version = env.BRANCH_NAME 11 | 12 | if(env.BRANCH_NAME=="master") { 13 | version = sh(returnStdout: true, script: 'git describe --abbrev=0 --tags').trim() 14 | checkout([$class: 'GitSCM', branches: [[name: "tags/${version}"]], extensions: [[$class: 'CleanCheckout']]]) 15 | } 16 | 17 | sh """ 18 | npm install gitbook-cli 19 | node_modules/gitbook-cli/bin/gitbook.js init 20 | node_modules/gitbook-cli/bin/gitbook.js build 21 | rm -f _book/Jenkinsfile 22 | rm -f _book/instructions.md 23 | rm -f _book/pnda-guide.* 24 | rm -rf _book/archives 25 | mv _book pnda-guide-${version} 26 | tar zcf pnda-guide-${version}.tar.gz pnda-guide-${version} 27 | """ 28 | 29 | stage 'Test' 30 | sh ''' 31 | ''' 
32 | 33 | stage 'Deploy' 34 | build job: 'deploy-component', parameters: [[$class: 'StringParameterValue', name: 'branch', value: env.BRANCH_NAME],[$class: 'StringParameterValue', name: 'component', value: "public-documentation"],[$class: 'StringParameterValue', name: 'release_path', value: "resources/releases"],[$class: 'StringParameterValue', name: 'release', value: "${workspace}/pnda-guide-${version}.tar.gz"]] 35 | 36 | if(env.BRANCH_NAME=="develop") { 37 | stage 'Publish' 38 | build job: 'pnda-guide', parameters: [[$class: 'StringParameterValue', name: 'deployment', value: 'production'],[$class: 'StringParameterValue', name: 'release', value: env.BRANCH_NAME],[$class: 'StringParameterValue', name: 'release_path', value: "resources/releases"]] 39 | } 40 | 41 | stage 'Notifier' 42 | build job: 'notifier', parameters: [[$class: 'StringParameterValue', name: 'message', value: "${env.JOB_NAME} succeeded: see [Jenkins job ${env.BUILD_ID}](${env.BUILD_URL})"]] 43 | } 44 | catch(error) { 45 | build job: 'notifier', parameters: [[$class: 'StringParameterValue', name: 'message', value: "${env.JOB_NAME} failed: see [Jenkins job ${env.BUILD_ID}](${env.BUILD_URL})"]] 46 | currentBuild.result = "FAILED" 47 | throw error 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Cisco and/or its affiliates. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [PNDA Guide](README.md) 4 | * [Overview](overview/README.md) 5 | * [Getting Started](gettingstarted/README.md) 6 | * [Provisioning](provisioning/README.md) 7 | * [Creating PNDA](provisioning/OVERVIEW.md) 8 | * [Getting started with Heat](provisioning/heat.md) 9 | * [Getting started with AWS](provisioning/aws.md) 10 | * [Getting started with SaltStack](provisioning/saltstack.md) 11 | * [Creating PNDA on OpenStack](provisioning/openstack/PREPARE.md) 12 | * [Creating PNDA image](provisioning/openstack/IMAGE.md) 13 | * [Create PNDA mirror components](provisioning/openstack/MIRROR.md) 14 | * [Create PNDA build components](provisioning/openstack/BUILD.md) 15 | * [Stage PNDA mirror & components](provisioning/openstack/STAGE.md) 16 | * [Tips for HTTP server](provisioning/openstack/EXAMPLES.md) 17 | * [Configure PNDA creation process](provisioning/openstack/CONFIGURE.md) 18 | * [Create PNDA cluster](provisioning/openstack/CREATE.md) 19 | * [Creating PNDA on AWS](provisioning/aws/PREPARE.md) 20 | * [Create PNDA mirror components](provisioning/aws/MIRROR.md) 21 | * [Create PNDA build components](provisioning/aws/BUILD.md) 22 | * [Stage PNDA mirror & components](provisioning/aws/STAGE.md) 23 | * [Tips for HTTP server](provisioning/aws/EXAMPLES.md) 24 | * [Configure PNDA creation process](provisioning/aws/CONFIGURE.md) 25 | * [Create PNDA cluster](provisioning/aws/CREATE.md) 26 | * [Creating PNDA on VMWare](provisioning/vmware/PREPARE.md) 27 | * [Creating PNDA image](provisioning/vmware/IMAGE.md) 28 | * [Configure PNDA creation process](provisioning/vmware/CONFIGURE.md) 29 | * [Create PNDA cluster](provisioning/vmware/CREATE.md) 30 | * [Creating PNDA on server 
clusters](provisioning/server-cluster/PREPARE.md) 31 | * [Create PNDA mirror components](provisioning/server-cluster/MIRROR.md) 32 | * [Create PNDA build components](provisioning/server-cluster/BUILD.md) 33 | * [Stage PNDA mirror & components](provisioning/server-cluster/STAGE.md) 34 | * [Tips for HTTP server](provisioning/server-cluster/EXAMPLES.md) 35 | * [Configure PNDA creation process](provisioning/server-cluster/CONFIGURE.md) 36 | * [Create PNDA cluster](provisioning/server-cluster/CREATE.md) 37 | * [Creating PNDA on Bare metal](provisioning/baremetal/PREPARE.md) 38 | * [Setting up Undercloud](provisioning/baremetal/UNDERCLOUD.md) 39 | * [Registering nodes with Undercloud](provisioning/baremetal/REGISTER.md) 40 | * [Create PNDA mirror components](provisioning/baremetal/MIRROR.md) 41 | * [Create PNDA build components](provisioning/baremetal/BUILD.md) 42 | * [Stage PNDA mirror & components](provisioning/baremetal/STAGE.md) 43 | * [Configure PNDA creation process](provisioning/baremetal/CONFIGURE.md) 44 | * [Create PNDA cluster](provisioning/baremetal/CREATE.md) 45 | * [Console](console/README.md) 46 | * [Metrics](console/metrics.md) 47 | * [Packages](console/packages.md) 48 | * [Applications](console/applications.md) 49 | * [Datasets](console/datasets.md) 50 | * [Endpoint Management and Service discovery](em_sd/README.md) 51 | * [Consul Overview](em_sd/consul.md) 52 | * [DNS in PNDA](em_sd/dns.md) 53 | * [Service discovery](em_sd/sd.md) 54 | * [Streaming Ingest](streamingest/README.md) 55 | * [Preparing topics](streamingest/topic-preparation.md) 56 | * [Preparing data](streamingest/data-preparation.md) 57 | * [Integrating Logstash](streamingest/logstash.md) 58 | * [Integrating OpenDaylight](streamingest/opendl.md) 59 | * [Integrating OpenBMP](streamingest/openbmp.md) 60 | * [Integrating Pmacct](streamingest/pmacct.md) 61 | * [Developing a producer](streamingest/producer.md) 62 | * [Bulk Ingest](bulkingest/README.md) 63 | * [Consumers](consumer/README.md) 64 | * 
[Packages & Applications](applications/README.md) 65 | * [Example Applications](applications/examples.md) 66 | * [Spark Streaming and HBase tutorial](applications/ksh.md) 67 | * [Spark Streaming and OpenTSDB tutorial](applications/kso.md) 68 | * [Log Aggregation](log-aggregation/README.md) 69 | * [Structured Query](query/README.md) 70 | * [Impala](query/impala.md) 71 | * [Data Exploration](exploration/README.md) 72 | * [Exploratory data analytics tutorial](exploration/lab.md) 73 | * [How to manage application dependencies](exploration/dependencies.md) 74 | * [Time Series](timeseries/README.md) 75 | * [OpenTSDB](timeseries/opentsdb.md) 76 | * [Grafana](timeseries/grafana.md) 77 | * [Security](security/README.md) 78 | * [Resource Management](resourcemanagement/README.md) 79 | * [Community](community/README.md) 80 | * [Contributions Guidelines](community/contributions_guidelines.md) 81 | * [Project Governance](community/governance.md) 82 | * [Repositories](repos/README.md) 83 | * [References](others/README.md) 84 | * [Changelog](CHANGELOG.md) 85 | -------------------------------------------------------------------------------- /applications/examples.md: -------------------------------------------------------------------------------- 1 | # Example Applications 2 | 3 | Some example applications have been written to help with getting started using PNDA and knowing what to put in an application package. 4 | 5 | ## Spark Batch 6 | 7 | The [spark-batch](https://github.com/pndaproject/example-applications/tree/master/spark-batch) example converts data from the master data set into parquet format. 8 | 9 | ## Spark Streaming - Kafka to HBase 10 | 11 | The [spark-streaming](https://github.com/pndaproject/example-applications/tree/master/spark-streaming) example writes data from Kafka into an HBase table. 12 | 13 | The [Spark Streaming and HBase tutorial](ksh.md) provides an in-depth look at the app. 
14 | 15 | ## Spark Streaming - Kafka to OpenTSDB 16 | 17 | The [kafka-spark-opentsdb](https://github.com/pndaproject/example-applications/tree/master/kafka-spark-opentsdb) example writes data from Kafka into [OpenTSDB](../timeseries/opentsdb.md) metric time series. 18 | 19 | The [Spark Streaming and OpenTSDB tutorial](kso.md) provides an in-depth look at the app. 20 | 21 | ## Flink Streaming 22 | 23 | The [flink-streaming](https://github.com/pndaproject/example-applications/tree/develop/flink-streaming-word-count) example reads events from a network socket and performs stream processing. 24 | -------------------------------------------------------------------------------- /bulkingest/README.md: -------------------------------------------------------------------------------- 1 | # Bulk Ingest 2 | 3 | In addition to streaming ingest via Kafka producers, PNDA also provides an offline bulk ingest tool for those who would like to migrate pre-collected data to PNDA. 4 | 5 | Unlike streaming ingest, bulk ingest does not require data to be formatted using the PNDA schema, and places no restrictions on the destination folder (as long as the user has write permission). 6 | 7 | ## [Bulk Ingest Tool](https://github.com/pndaproject/platform-tools/blob/master/bulkingest/README.md) 8 | 9 | The bulk-ingest tool can be used to upload a file or folder to HDFS. Once the dataset has been uploaded, it can be found in `/user/pnda/PNDA_datasets/bulk/`. 10 | -------------------------------------------------------------------------------- /community/README.md: -------------------------------------------------------------------------------- 1 | # Welcome to the PNDA community 2 | 3 | You will find below all the resources available with regards to the PNDA community: 4 | 5 | ## GitHub 6 | The [PNDA distribution](https://github.com/pndaproject) is available on GitHub. 7 | To get started setting up your own PNDA cluster, see the [getting started](../gettingstarted/README.md).
8 | If you want to contribute to PNDA, check the [contributions guidelines](./contributions_guidelines.md) 9 | 10 | ## JIRA 11 | If you want to have more information about the PNDA roadmap and plan, see the [PNDA project](https://issues.pnda.io/browse/PNDA) 12 | 13 | ## Mailing Lists 14 | * [pnda-users group](https://groups.google.com/forum/#!forum/pnda-users) – for discussions around PNDA usage and community support 15 | * [pnda-developers group](https://groups.google.com/forum/#!forum/pnda-developers) – for discussions around PNDA development 16 | If you need support, please send your questions to the pnda-users group rather than filing a GitHub issue. 17 | 18 | ## Project Governance 19 | As a Linux Foundation Collaboration Project PNDA is not controlled by any single company. A copy of the project charter is published [here](./governance.md) 20 | -------------------------------------------------------------------------------- /community/contributions_guidelines.md: -------------------------------------------------------------------------------- 1 | # Contributions guidelines 2 | 3 | ## Before you start 4 | 5 | 1. Install Git client 6 | 2. Make sure your git client reflects your name and email address as per your GitHub account. 7 | 8 | ``` 9 | git config --global user.name "John Doe" 10 | git config --global user.email johndoe@example.com 11 | ``` 12 | 13 | ## Creating a feature or bugfix 14 | 15 | 1. Fork the repository 16 | 2. Make sure your develop branch is fully up to date with the upstream 17 | 3. Branch from the head of the develop branch 18 | 4. If at all possible, name your branch according to the issue that caused you to create it 19 | 5. Make your changes, commit/push 20 | 6. If develop moves on, rebase your branch on develop. Do not merge develop to the branch. 21 | 22 | ## Submitting a feature or bugfix 23 | 24 | 1. When ready, rebase your branch in interactive mode and squash all the commits to one commit. 25 | 2. 
Make sure the commit message in the one commit - 26 | * Is written in the present tense and imperative mood e.g. "Force widget factor to 7" 27 | * Don't exceed 50 characters for the first line 28 | * Leave a blank line 29 | * Optionally include any other descriptive text you feel qualifies the reason for the change 30 | * Preferably include reference (not URL) to the issue that caused you to make this change e.g. PNDA-8 31 | 3. Push this to your branch and open a PR 32 | 4. GitHub will take the title of the commit as the PR name, do not modify this 33 | 5. In the description, include any descriptive text you feel qualifies the reason for the change 34 | 6. Note validation that has been carried out 35 | 7. Do not fill the description with design discussion and so on - describe the code and what it does 36 | 37 | **Commit example** 38 | 39 | _Force widget factor to 7_ 40 | 41 | _PNDA-1234: Widget factors other than 7 cause all kinds of problems. Introduced new WidgetFactorFactory to generate sevens._ 42 | 43 | ## Responding to feedback on the PR 44 | 45 | 1. Keep discussion on the PR if possible and record outcome of side discussions if not 46 | 2. If the PR has been closed, do not delete or rebase your branch before re-opening the PR 47 | 3. Do not close PRs and open new PRs for the same work 48 | 4. Ensure the PR commit history and relationship with develop remains sane 49 | * If develop moves on, rebase branch on develop 50 | * If rework of existing changes is required it usually makes sense to rebase and squash to one commit 51 | * If the additional work is purely additive, an additional commit is often all that's necessary 52 | 53 | ## Changelogs 54 | 55 | Given the difficulty of N-way merging on a single location in a single file on GitHub, we've decided to make modifications to the CHANGELOG a Maintainer activity. Therefore, omit these from the PR and we'll make the change after merging. 
56 | 57 | 58 | -------------------------------------------------------------------------------- /console/applications.md: -------------------------------------------------------------------------------- 1 | # Applications 2 | 3 | The apps page lets you manage the lifecycle of [applications](../applications/README.md), which are instances of packages. You can create a new application from a deployed package, see the status of each application, and start, pause or delete them. 4 | 5 | ![Applications](images/app_running_metrics.png) 6 | Click an application for more detailed information. There are tabs that show the overview, deployment properties, logs, statistics and metrics (application key performance indicators, or KPIs). 7 | 8 | ## Creating applications 9 | 10 | Click the "Create New Application" button to create a new application from a package that has been deployed. If your package is not listed, you should deploy it first on the [Packages](packages.md) page. 11 | 12 | ![Select Package](images/app_new_1_select-package.png) 13 | 14 | First, choose a deployed package and version to use from the list. 15 | 16 | ![Overrides](images/app_new_2_overrides.png) 17 | 18 | Next, the default application properties for the package will be displayed. Click on any default value to modify it. When you are done, click "Next". 19 | 20 | ![New Name](images/app_new_3_name.png) 21 | 22 | Finally, enter a name for your new application, and click "Confirm". The new app will then be displayed in the list. 23 | 24 | 25 | ## Working with applications 26 | 27 | By default, a new app is not running. Click the "Start" button next to an app to start it, or click the "Pause" button next to a running app to pause it. You can also click the "Delete" button next to an app to delete it. 28 | 29 | 30 | 31 | 32 | ## Application Detailed Summary 33 | 34 | Once the application starts, status is shown as Running. 
To know the actual status of jobs which are spawned by that application, click the "More info" link. 35 | 36 | ![More Info location](images/more_info_locaton.png) 37 | 38 | On clicking that, the details are shown as below. 39 | 40 | ![More Info data](images/more_info_on_click.png) 41 | 42 | ### Detailed summary - SparkStreaming 43 | 44 | For Spark Streaming applications, job status will be queried from YARN and Spark server. 45 | The different statuses of a Spark Streaming application are: 46 | 47 | - CREATED - SparkStreaming Application is created 48 | - ACCEPTED - Application is with YARN, in accepted state 49 | - RUNNING - Application running and spark server returned all jobs as succeeded (last 1000 jobs) 50 | - RUNNING_WITH_ERRORS - Application is running and spark server returned with one or more jobs failed (last 1000 jobs) 51 | - FINISHED_FAILED - Application execution completed with yarn state as FINISHED and finalStatus as FAILED 52 | - FINISHED_KILLED - Application execution completed with yarn state as FINISHED and finalStatus as KILLED 53 | - FAILED - Application execution completed with yarn state and finalStatus as FAILED 54 | - KILLED - Application execution completed with yarn state and finalStatus as KILLED 55 | 56 | ![More Info location](images/more_info_spark_stream.png) 57 | 58 | Summary data provides a link to the application which leads to Application overview and Metrics page of YARN. 59 | In cases of ACCEPTED, FAILED and KILLED, summary data will provide diagnostic information from YARN (in information tool tip). 60 | 61 | ![More Info location](images/more_info_spark_stream_accepted.png) 62 | 63 | ### Detailed summary - OOZIE 64 | 65 | For OOZIE applications, job status will be queried from OOZIE and YARN. Depending on the end job type, the respective component will be queried (like Spark Server for Spark job).
66 | Different status in OOZIE application are: 67 | 68 | - CREATED - Oozie application is created and is not yet started 69 | - STARTING - Oozie application is getting started 70 | - RUNNING - Running properly with all jobs in running or succeeded state 71 | - RUNNING_WITH_ERRORS - Running, but one or more end jobs in failed state 72 | - SUSPENDED - User stops the oozie application when all workflows/jobs are in succeeded state 73 | - SUSPENDED_WITH_FAILURES - User stops the oozie application when one or more workflows/jobs are in failed state 74 | - KILLED - User kills the oozie application when all workflows/jobs are in succeeded state 75 | - KILLED_WITH_FAILURES - User kills the oozie application when one or more workflows/jobs are in failed state 76 | - COMPLETED - OOZIE application finished all its workflows/jobs and all are in succeeded state 77 | - COMPLETED_WITH_FAILURES - OOZIE application finished all its workflows/jobs and one or more in failed state 78 | 79 | ![More Info Spark Batch C](images/more_info_oozie_spark_batch_c.png) 80 | 81 | ### Detailed summary - Flink Streaming 82 | 83 | For Flink applications, job status will be queried from YARN and Flink server. 
Different status in Flink applications are: 84 | 85 | - CREATED - Flink Streaming Application is created 86 | - ACCEPTED - Application is with YARN, in accepted state 87 | - RUNNING - Application running properly with all vertices in running or succeeded state 88 | - RUNNING_WITH_ERRORS - Running but one or more vertices in failed state 89 | - FINISHED_SUCCEDED - Application execution completed with yarn state as FINISHED and final status as SUCCEEDED 90 | - FAILED - Application execution completed with yarn state and final status as FAILED 91 | - KILLED - Application execution completed with yarn state and final status as KILLED 92 | 93 | ![More Info Flink Streaming](images/more_info_flink_streaming.png) 94 | -------------------------------------------------------------------------------- /console/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | The datasets page lets you manage the data retention policy of each dataset in the cluster. As a big data system, PNDA is capable of storing a large amount of data. However, to manage your data storage costs you can choose to put an upper limit on the amount of data that is retained. 4 | 5 | All data stored in PNDA is divided into datasets. Each dataset has a unique identifier, such as `netflow`, `telemetry`, etc. You can customize the data retention policy for each dataset individually, as your needs may be different for each kind of data. 6 | 7 | If you need to retain data indefinitely, you can choose to keep it. Otherwise you can choose what happens to data when it has reached the limit. If you need to retain the data after it is removed from PNDA, you can choose to have it archived. If you no longer need the data, you can have it deleted. 8 | 9 | You can limit the amount of data by age or by size. If you choose to limit the data by age, you can specify the maximum age in days. 
Alternatively if you choose to limit the data by size, you can specify the maximum size in gigabytes. 10 | 11 | ![Datasets](images/datasets_none.png) 12 | 13 | ## Setting policies 14 | 15 | The datasets page contains a table that lists all datasets in the current cluster, showing the current data retention policy. You cannot add datasets on this page, only modify the policy for existing datasets. 16 | 17 | - From the mode popup menu, choose whether you want to keep data indefinitely, or archive or delete data that has exceeded the limit. 18 | - From the policy popup menu, choose whether you want to limit data by age or by size. 19 | - In the limit column, you can enter the maximum age in days, or size in gigabytes, depending upon the policy. 20 | 21 | After you have made changes, click the Save button to review and confirm the changes that will be made. For example, if you are changing the limit from 30 to 20 gigabytes, the message will confirm that up to 10 gigabytes of data could be archived or deleted. 22 | 23 | ## Creating datasets 24 | 25 | Datasets are automatically created based on the name of the `source` field in Kafka messages by [gobblin](https://github.com/pndaproject/gobblin). See the [getting started](../gettingstarted/README.md#producer-integration) for more information on how datasets are created. 26 | 27 | ## Dataset Compaction 28 | 29 | To consolidate files in a dataset, [Gobblin Compaction](https://gobblin.readthedocs.io/en/latest/user-guide/Compaction/) can be used. Compaction schedule is defined at the time of PNDA cluster creation. Compaction can be enabled and set to run on hourly, daily, monthly or yearly schedule. 30 | 31 | Consider below compaction job config snippet for daily run of compaction job, scheduled to run at 01:00:00 hours as a cron. 
32 | ``` 33 | Input directory: /user/pnda/PNDA_datasets/datasets 34 | compaction directory: /user/PNDA_datasets/compacted 35 | compaction schedule: daily 36 | folder pattern: “year=YYYY/month=MM/day=dd” 37 | min.time.ago = 1h 38 | max.time.ago = 1d2h 39 | ``` 40 | All datasets in the input directory are considered for compaction 41 | For example if /user/pnda/PNDA_datasets/datasets/ has dataset directories "source 42 | =dir1" and "source=dir2". Contents of both directories are considered for compaction. 43 | 44 | ![Compaction Timeline](images/compaction_timeline.png) 45 | 46 | List of input files for compaction from a dataset is selected based on **min.time.ago** and **max.time.ago**, as illustrated above. Only late arrivals are picked up for compaction, from those folders which were part of the previous compaction cycle. 47 | 48 | For example, daily compaction for day 2018-03-12 will start at 2018-03-12 01:00:00 hours. 49 | 50 | It will consider all files in the dataset directories, starting from: 51 | ``` 52 | /user/pnda/PNDA_datasets/datasets/source=dir1/year=2018/month=03/day=10/hour=23 53 | /user/pnda/PNDA_datasets/datasets/source=dir1/year=2018/month=03/day=11/hour=00 54 | ... 55 | ... 56 | /user/pnda/PNDA_datasets/datasets/source=dir1/year=2018/month=03/day=11/hour=23 57 | /user/pnda/PNDA_datasets/datasets/source=dir1/year=2018/month=03/day=12/hour=00 58 | ``` 59 | Only those files with a modification time > 2018-03-11 01:00 hrs, are selected from directory “day=10/hour=23” and “day=11/hour=00”. Other files in these directories were part of the previous compaction cycle and hence are not selected. 60 | 61 | After compaction, the compacted files are kept in the output directory (directory structure will follow the pattern defined in “folder pattern”. In addition to the compacted file, this directory will also have \_COMPACTION\_COMPLETE and _SUCCESS. \_COMPACTION\_COMPLETE contains the timestamp of when the compaction job started. 
All files in the input folders with earlier modification timestamps have been compacted. Next run of compaction will only consider the files in the input folders with the later timestamps. _SUCCESS is created only when the compaction is successfully completed. Compacted filename has the following pattern: 62 | 63 | **part-m{RecordCount}.{SystemCurrentTimeInMills}.{RandomInteger}.avro** 64 | 65 | After compaction on 2018-03-12 01:00 hrs, the compacted folder will have the following files: 66 | ``` 67 | /user/pnda/PNDA_datasets/compacted/source=dir1/year=2018/month=03/day=11 68 | /user/pnda/PNDA\_datasets/compacted/source=dir1/year=2018/month=03/day=11/\_COMPACTION_COMPLETE 69 | /user/pnda/PNDA\_datasets/compacted/source=dir1/year=2018/month=03/day=11/\_SUCCESS 70 | /user/pnda/PNDA_datasets/compacted/source=dir1/year=2018/month=03/day=11/part-m-89561. 1520869721000.794208420.avro 71 | ``` 72 | 73 | ## Kafka 74 | 75 | Kafka has its own data retention policy that affects how long data from producers stays in the queue for consumers to consume it. By default, Kafka keeps data for 24 hours for each topic. You can use the Kafka Manager to override the default data retention policy on a time or size basis. 
76 | -------------------------------------------------------------------------------- /console/images/app_new_1_select-package.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/app_new_1_select-package.png -------------------------------------------------------------------------------- /console/images/app_new_2_overrides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/app_new_2_overrides.png -------------------------------------------------------------------------------- /console/images/app_new_3_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/app_new_3_name.png -------------------------------------------------------------------------------- /console/images/app_running_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/app_running_metrics.png -------------------------------------------------------------------------------- /console/images/app_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/app_status.png -------------------------------------------------------------------------------- /console/images/compaction_timeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/compaction_timeline.png -------------------------------------------------------------------------------- /console/images/component_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/component_green.png -------------------------------------------------------------------------------- /console/images/component_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/component_red.png -------------------------------------------------------------------------------- /console/images/datasets_none.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/datasets_none.png -------------------------------------------------------------------------------- /console/images/home_errors_warnings4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/home_errors_warnings4.png -------------------------------------------------------------------------------- /console/images/home_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/home_green.png -------------------------------------------------------------------------------- /console/images/info_hdfs_errors.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/info_hdfs_errors.png -------------------------------------------------------------------------------- /console/images/metrics_all_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/metrics_all_green.png -------------------------------------------------------------------------------- /console/images/metrics_errors_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/metrics_errors_overview.png -------------------------------------------------------------------------------- /console/images/metrics_filtered_errors_warnings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/metrics_filtered_errors_warnings.png -------------------------------------------------------------------------------- /console/images/more_info_flink_streaming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_flink_streaming.png -------------------------------------------------------------------------------- /console/images/more_info_locaton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_locaton.png -------------------------------------------------------------------------------- /console/images/more_info_on_click.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_on_click.png -------------------------------------------------------------------------------- /console/images/more_info_oozie_spark_batch_c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_oozie_spark_batch_c.png -------------------------------------------------------------------------------- /console/images/more_info_spark_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_spark_stream.png -------------------------------------------------------------------------------- /console/images/more_info_spark_stream_accepted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/more_info_spark_stream_accepted.png -------------------------------------------------------------------------------- /console/images/packages_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/packages_details.png -------------------------------------------------------------------------------- /console/images/packages_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/console/images/packages_search.png 
-------------------------------------------------------------------------------- /console/metrics.md: -------------------------------------------------------------------------------- 1 | # Metrics 2 | 3 | The metrics page lists the metrics for all components in a single table. In addition, it includes key performance indicators (KPIs) for installed applications. 4 | 5 | ![Metrics green](images/metrics_all_green.png) 6 | 7 | Metrics have a hierarchical namespace, such as `hadoop.HDFS.health` or `kafka.brokers.1.system.ProcessCpuLoad`. 8 | 9 | You can filter the list by typing part of a metric name into the search field. For example, type `HDFS` to match `hadoop.HDFS.health`, `hadoop.HDFS.files_total`, etc. 10 | 11 | Most components have a `health` metric that can be `OK`, `WARN` or `ERROR`. You can, for example, filter the list by `health` to see the health status for all components. 12 | 13 | Any warnings will be displayed in yellow, and any errors will be displayed in red. The overall platform health section at the top of the page shows an overview of all warnings and errors. 14 | 15 | ![Metrics errors](images/metrics_filtered_errors_warnings.png) 16 | Click the More Info link for more information about a warning or error. 17 | 18 | ![Metrics errors](images/metrics_errors_overview.png) 19 | 20 | ## See also 21 | 22 | - See the [home page](README.md) for a graphical view of these metrics, organized by component. 23 | - See the metrics section in the [Apps page](applications.md) for application-specific key performance indicators (KPIs). 
24 | 25 | 26 | # Quick Links 27 | 28 | The page also has links to the following other components: 29 | 30 | - [Flink](http://flink.apache.org/) 31 | - [Cloudera Manager](https://www.cloudera.com/products/cloudera-manager.html) (CDH only) 32 | - [Ambari](https://ambari.apache.org/) (HDP only) 33 | - [Grafana](http://grafana.org) 34 | - [Hue](http://gethue.com/) 35 | - [Jupyter](http://jupyter.org) 36 | - [Kafka Manager](http://kafka.apache.org/) 37 | - [OpenTSDB](http://opentsdb.net/) 38 | - [PNDA logserver](../log-aggregation/README.md) 39 | - [YARN Resource Manager](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) 40 | 41 | -------------------------------------------------------------------------------- /console/packages.md: -------------------------------------------------------------------------------- 1 | # Packages 2 | 3 | The packages page lets you browse available packages, and packages that have been deployed. You can deploy or undeploy an individual package. Packages are independently deployable units of application layer functionality. Each package consists of one or more components, each of which has a defined type. 4 | 5 | ![Packages](images/packages_search.png) 6 | 7 | ## Deployed packages 8 | 9 | This table lists all packages that have been deployed on the cluster, with their name and deployed version. Applications can be created from deployed packages on the [Apps](applications.md) page. 10 | 11 | ### Undeploying a package 12 | 13 | Click the Undeploy button to undeploy the package. You'll see that it is removed from the list of deployed packages. 14 | 15 | ## Available packages 16 | 17 | This table lists all packages that are available in the package repository for deployment to the cluster, with their name and latest version. If there is more than one version available, then the number of versions is listed. 
18 | 19 | For more info on uploading packages to the package repository, see the [getting started](../gettingstarted/README.md#packages-and-applications) page. 20 | 21 | ### Deploying a package 22 | 23 | Click a package to deploy it. Click the "Deploy latest version" button to deploy the latest version, or click the "Deploy" button next to another version. Click "OK" to confirm, and then a message will tell you when the package has been deployed. You'll see that it is added to the list of deployed packages. 24 | 25 | ![Packages](images/packages_details.png) -------------------------------------------------------------------------------- /console/uicredentials.md: -------------------------------------------------------------------------------- 1 | # UIs In PNDA 2 | 3 | PNDA pulls together many different open source technologies, several of which provide a UI. To help with finding all these UIs the PNDA console links to the most useful ones, for example: 4 | 5 | - YARN resource manager 6 | - Hue 7 | - Jupyter 8 | - Grafana 9 | - PNDA Logserver 10 | - Flink 11 | 12 | The cog icons on the PNDA console home page link to the UI for that component. 13 | 14 | The PNDA console "metrics" page also contains a list of links to various UIs. 15 | 16 | ## Default Credentials 17 | Most UI's use PAM authentication (which includes pam_unix for local user authentication and pam_ldap for LDAP user authentication). 18 | 19 | ### Default users 20 | 21 | All the users that can be authenticated by the configured LDAP server will get access to the *user facing UI's*. 
22 | 23 | When LDAP configuration is missing, two default local users are present on the cluster: 24 | 25 | | user | password | group | 26 | | --- | --- | --- 27 | | dev1 | dev1 | dev | 28 | | prod1 | prod1 | prod | 29 | 30 | ### Admin users 31 | 32 | The default local admin user is: 33 | 34 | | user | password | group | 35 | | --- | --- | --- 36 | | pnda | pnda | pnda | 37 | 38 | Due to limitations in some of the used open source technologies, some UI's can not authenticate through PAM and are connected directly with the LDAP server. 39 | 40 | - Hadoop Cluster Manager: admin/admin 41 | - Grafana: pnda/pndapnda 42 | 43 | ## Setting Credentials 44 | If different passwords are required they can be configured in the platform-salt configuration before provisioning PNDA: 45 | - The Hadoop Cluster Manager admin user credentials are located in [platform-salt/pillar/services.sls:admin_login](https://github.com/pndaproject/platform-salt/blob/develop/pillar/services.sls) 46 | - The Grafana pnda user credentials are located in [platform-salt/salt/grafana/init.sls:grafana_pass](https://github.com/pndaproject/platform-salt/blob/develop/salt/grafana/init.sls) 47 | - The pnda user credentials are located in [platform-salt/pillar/pnda.sls:pnda](https://github.com/pndaproject/platform-salt/blob/develop/pillar/pnda.sls) 48 | - The default users (dev1/prod1) credentials (when no LDAP properties are configured) are located in [platform-salt/pillar/identity.sls](https://github.com/pndaproject/platform-salt/blob/develop/pillar/identity.sls) 49 | 50 | Note that for the PNDA user `password_hash` should be set along with the `user` and `password`. The easiest and most reliable way to do this is to set the password on a RHEL 7 machine, then look in /etc/shadow for the password hash. 51 | 52 | The credentials can also be changed manually in each UI as required after PNDA has been provisioned. 
53 | 54 | Note that if the Cloudera Manager or Ambari password is changed, then the following salt states should be rerun, as these components require access to the Cloudera Manager API: 55 | - [platform-testing](https://github.com/pndaproject/platform-salt/tree/develop/salt/platform-testing) 56 | - [deployment-manager](https://github.com/pndaproject/platform-salt/tree/develop/salt/deployment-manager) 57 | - [data-service](https://github.com/pndaproject/platform-salt/tree/develop/salt/data-service) 58 | - [hdfs-cleaner](https://github.com/pndaproject/platform-salt/tree/develop/salt/hdfs-cleaner) 59 | - [jupyter](https://github.com/pndaproject/platform-salt/tree/develop/salt/jupyter) 60 | - [opentsdb](https://github.com/pndaproject/platform-salt/tree/develop/salt/opentsdb) 61 | -------------------------------------------------------------------------------- /consumer/README.md: -------------------------------------------------------------------------------- 1 | # Consumers 2 | 3 | Kafka has a simple, clean design that moves complexity traditionally found inside message brokers into its producers and consumers. 4 | 5 | A Kafka consumer pulls messages from one or more topics using Zookeeper for discovery, issuing fetch requests to the brokers leading the partitions it wants to consume. Rather than the broker maintaining state and controlling the flow of data, each consumer controls the rate at which it consumes messages by maintaining an index. 6 | 7 | Multiple consumers can be organized into groups consuming from a topic in parallel up to the number of partitions for that topic, so increasing throughput, or they can be individually placed into separate groups, each consuming a given partition, so giving predictable ordering. 8 | 9 | Consumers take on the name of the group to which they belong and by convention this name is organized such that conflict with other consumers is avoided. 
10 | 11 | ## Data Processing consumers 12 | 13 | For examples of how to craft data processing applications that consume from Kafka and horizontally scale using parallel processing techniques, see the next chapter on [Packages & Applications](../applications/README.md). 14 | 15 | ## Client consumers 16 | 17 | Clients of PNDA may wish to integrate directly with Kafka instead of consuming data processed output via Impala or OpenTSDB, particularly if those clients are pre-existing applications that would otherwise integrate directly with data sources and are designed to consume a continuous stream of data. 18 | 19 | In this case, you can use the [Kafka Consumer API](http://kafka.apache.org/documentation.html#newconsumerapi). Example code showing how to build a straightforward consumer using this API can be found in the [Kafka Quickstart](http://kafka.apache.org/07/quickstart.html). 20 | 21 | We have several examples of building Kafka clients in our [example code repository](https://github.com/pndaproject/example-kafka-clients). 
22 | -------------------------------------------------------------------------------- /consumer/images/opentsdb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/consumer/images/opentsdb.png -------------------------------------------------------------------------------- /cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/cover.jpg -------------------------------------------------------------------------------- /downloads/README.md: -------------------------------------------------------------------------------- 1 | # Download Book 2 | 3 | You can read the latest version of this guide on the [PNDA website](http://pnda.io/guide), or download the book in a number of formats. 4 | 5 | ## PDF 6 | 7 | Read the book on your computer or mobile device: 8 | 9 | * [Download PDF](http://pnda.io/pnda-guide/downloads/pnda-guide.pdf) 10 | 11 | ## ePub 12 | 13 | Read the book on your favorite eBook reader, including iOS and Android devices: 14 | 15 | * [Download ePub](http://pnda.io/pnda-guide/downloads/pnda-guide.epub) 16 | 17 | ## Mobi 18 | 19 | Read the book on your Amazon Kindle: 20 | 21 | * [Download Mobi](http://pnda.io/pnda-guide/downloads/pnda-guide.mobi) 22 | -------------------------------------------------------------------------------- /em_sd/README.md: -------------------------------------------------------------------------------- 1 | # Endpoint Management and Service discovery 2 | 3 | The Endpoint Management and Service discovery aims to solve these problems: 4 | 5 | * /etc/hosts inflexible to changes and integration with external discovery 6 | * Endpoints change over time but nothing updates them 7 | * Hand rolled implementation for endpoint registrar (on DM) 8 | * Configuration inflexibly handled 
as multiple config files on various nodes 9 | 10 | See the following pages for a description of features available on other tabs: 11 | * [Consul Overview](consul.md) 12 | * [DNS in PNDA](dns.md) 13 | * [Service discovery](sd.md) 14 | -------------------------------------------------------------------------------- /em_sd/consul.md: -------------------------------------------------------------------------------- 1 | # Consul Overview 2 | 3 | Each node in a PNDA Cluster includes a Consul agent. These agents register with a Consul server - a single one running on the Kafka instance for the pico flavor and a quorum running on the Zookeeper instances for standard. The agents automatically keep the DNS entries in the server up to date with any changes. 4 | 5 | Consul can be queried on port 8500 using the IP address of any node. To see a quick overview of the current cluster topology use the following command: 6 | 7 | ``` 8 | [ec2-user@pnda-guide-14371-bastion ~]$ consul members -http-addr 10.0.0.205:8500 9 | Node Address Status Type Build Protocol DC Segment 10 | pnda-guide-14371-kafka-0 10.0.1.98:8301 alive server 1.0.3 2 dc1 11 | pnda-guide-14371-bastion 10.0.0.205:8301 alive client 1.0.3 2 dc1 12 | pnda-guide-14371-hadoop-dn-0 10.0.1.110:8301 alive client 1.0.3 2 dc1 13 | pnda-guide-14371-hadoop-edge 10.0.1.187:8301 alive client 1.0.3 2 dc1 14 | ``` 15 | 16 | The base configuration file for Consul is located under /etc/consul.d/config.json and is automatically generated by the SaltStack Consul formula in the platform-salt repository. The key elements of the configuration are the datacenter & domain, which will be used for DNS. It also defines the IP address to listen on with the parameter bind_addr and client_addr. 
17 | 18 | Here is a sample configuration file: 19 | 20 | ``` 21 | [root@pnda-guide-14371-bastion ec2-user]# cat /etc/consul.d/config.json 22 | { 23 | "bind_addr": "10.0.0.205", 24 | "client_addr": "10.0.0.205", 25 | "data_dir": "/var/consul", 26 | "datacenter": "dc1", 27 | "domain": "pnda.local", 28 | "enable_debug": false, 29 | "enable_script_checks": true, 30 | "encrypt": "", 31 | "log_level": "info", 32 | "ports": [ 33 | { 34 | "dns": 53 35 | } 36 | ], 37 | "retry_interval": "30s", 38 | "retry_join": [ 39 | "10.0.1.98" 40 | ], 41 | "server": false, 42 | "ui": true 43 | } 44 | ``` 45 | 46 | Consul hosts a Web UI that can be accessed on port 8500 on any node. 47 | Here is a screenshot of the Consul UI: 48 | 49 | ![Consul UI](consul_services.jpeg) 50 | 51 | Services are defined using JSON files under /etc/consul.d. For example, here is the Zookeeper service definition: 52 | 53 | ``` 54 | [root@pnda-guide-14371-kafka-0 ec2-user]# cat /etc/consul.d/zookeeper.json 55 | { 56 | "service": { 57 | "id": "zookeeper0", 58 | "name": "zookeeper", 59 | "tags": ["0"], 60 | "address": "10.0.1.98", 61 | "port": 2181 62 | }, 63 | "check": { 64 | "id": "service:zookeeper0", 65 | "name": "Zookeeper health check", 66 | "ServiceID": "zookeeper0", 67 | "args": ["/opt/pnda/zookeeper-3.4.11/consul_check.sh"], 68 | "interval": "60s", 69 | "timeout": "3s" 70 | } 71 | } 72 | ``` 73 | -------------------------------------------------------------------------------- /em_sd/consul_dns.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/em_sd/consul_dns.jpg -------------------------------------------------------------------------------- /em_sd/consul_nodes.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/em_sd/consul_nodes.jpeg 
-------------------------------------------------------------------------------- /em_sd/consul_services.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/em_sd/consul_services.jpeg -------------------------------------------------------------------------------- /em_sd/dns.md: -------------------------------------------------------------------------------- 1 | # DNS in PNDA 2 | 3 | Consul is used to manage endpoints and deliver a service discovery feature. The DNS interface is used in order to resolve hosts and service names. It also provides a REST interface that could be used by users or operators of the platform. 4 | 5 | More information on the DNS interface for Consul can be found in the [Consul DNS documentation](https://www.consul.io/docs/agent/dns.html) 6 | 7 | Each node in a PNDA Cluster includes a Consul agent. These agents register with a Consul server - a single one running on the Kafka instance for the pico flavor and a quorum running on the Zookeeper instances for standard. The agents automatically keep the DNS entries in the server up to date with any changes. 8 | 9 | ## Overview 10 | ![DNS Overview](consul_dns.jpg) 11 | 12 | 13 | ## Usage 14 | 15 | Hosts can be referenced with their hostname or fully qualified domain name: 16 | 17 | ``` 18 | <hostname>.node.<datacenter>.<domain> 19 | e.g. 20 | pnda-kafka-0.node.dc1.pnda.local 21 | or just 22 | pnda-kafka-0 23 | ``` 24 | 25 | PNDA uses default values for datacenter and domain of `dc1` and `pnda.local`. 26 | 27 | Services can be referenced with: 28 | 29 | ``` 30 | <service-name>.service.<datacenter>.<domain> 31 | ``` 32 | 33 | The Consul DNS server is configured on each node in a PNDA cluster in resolv.conf. This defines the nameserver as the IP address where the server instance of Consul is running and the search domain as node.dc1.pnda.local. 
Here is a sample configuration: 34 | 35 | ``` 36 | [root@pnda-guide-14371-kafka-0 ec2-user]# cat /etc/resolv.conf 37 | nameserver 10.0.1.98 38 | # Generated by NetworkManager 39 | search eu-west-1.compute.internal node.dc1.pnda.local 40 | nameserver 10.0.0.2 41 | ``` 42 | 43 | Hostnames, fully qualified domain names, and service names can all be resolved from any node in the cluster, for example: 44 | 45 | ``` 46 | [root@pnda-guide-14371-bastion ec2-user]# ping pnda-guide-14371-kafka-0 47 | PING pnda-guide-14371-kafka-0.node.dc1.pnda.local (10.0.1.98) 56(84) bytes of data. 48 | ``` 49 | 50 | ``` 51 | [root@pnda-guide-14371-bastion ec2-user]# ping zookeeper.service.dc1.pnda.local 52 | PING zookeeper.service.dc1.pnda.local (10.0.1.98) 56(84) bytes of data. 53 | ``` 54 | 55 | The Consul UI can be used to view the cluster topology: 56 | ![Consul UI](consul_nodes.jpeg) 57 | -------------------------------------------------------------------------------- /em_sd/sd.md: -------------------------------------------------------------------------------- 1 | # Service discovery 2 | 3 | The Consul infrastructure can be used to register/deregister services automatically based on host health. More services will be defined in due course and it is also possible to configure user defined services by editing the Consul configuration files. Here is an example of the configuration needed to configure a Zookeeper service: 4 | 5 | ``` 6 | { 7 | "service": { 8 | "id": "zookeeper1", 9 | "name": "zookeeper", 10 | "tags": ["1"], 11 | "address": "10.0.1.123", 12 | "port": 2181 13 | }, 14 | "check": { 15 | "id": "service:zookeeper1", 16 | "name": "Zookeeper health check", 17 | "ServiceID": "zookeeper1", 18 | "args": ["/opt/pnda/zookeeper-3.4.11/consul_check.sh"], 19 | "interval": "60s", 20 | "timeout": "3s" 21 | } 22 | } 23 | ``` 24 | 25 | A service instance is declared and then linked to a check to ensure that the service is healthy. 
Review the Consul documentation for more details [Consul Catalog HTTP API](https://www.consul.io/api/catalog.html) 26 | 27 | Also, you can see the service topology through the Consul UI on the service part: 28 | ![Consul Service](consul_services.jpeg) 29 | -------------------------------------------------------------------------------- /exploration/README.md: -------------------------------------------------------------------------------- 1 | # Data Exploration 2 | 3 | ## [Jupyter](https://github.com/pndaproject/example-applications/tree/master/jupyter-notebooks) 4 | 5 | The Jupyter Notebook is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text. In PNDA, it supports exploration and presentation of data from HDFS and HBase. 6 | 7 | ## [Exploratory data analytics tutorial](lab.md) 8 | 9 | This tutorial demonstrates how to use Jupyter to access data and make sense of data interactively using the Jupyter pyspark interpreter. The data samples used in this lab are networking telemetry data and netflow data, and are persisted in the PNDA platform. 10 | 11 | ## [How to manage application dependencies](dependencies.md) 12 | 13 | This section explains how to manage PySpark application dependencies. 14 | -------------------------------------------------------------------------------- /exploration/dep-usage.md: -------------------------------------------------------------------------------- 1 | # Using Python application dependencies 2 | 3 | ## Jupyter, Deployment Manager and shells 4 | 5 | ### Anaconda and app-packages dependencies 6 | 7 | For dependencies managed via Anaconda or the app-packages mechanism, simply import the dependencies required in the normal way. PNDA has already set up the necessary paths and dependency caches so nothing further is required. 8 | 9 | ### Runtime dependencies 10 | 11 | For dependencies to be delivered at runtime, use addPyFile() in your code. 
For example - 12 | 13 | addPyFile('hdfs:///pnda/deployment/app_packages/sharedroutines-0.1.egg') 14 | 15 | ## Batch 16 | 17 | At present, PNDA does not set these up automatically in the same way as above. 18 | 19 | For jobs to be scheduled via coordinators and workflows, do the following. 20 | 21 | ### Anaconda dependencies 22 | 23 | To your `<spark-opts>` section in the workflow action, add 24 | 25 | --conf spark.executorEnv.PYSPARK_PYTHON=/opt/pnda/anaconda/bin/python 26 | --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=/opt/pnda/anaconda/bin/python 27 | 28 | ### Dependencies managed via the app-packages mechanism 29 | 30 | Due to [this unresolved issue in Spark](https://issues.apache.org/jira/browse/SPARK-22151), spark.yarn.appMasterEnv.PYTHONPATH isn't handled properly. Until this is resolved: 31 | 32 | Add this to your code, before importing dependencies - 33 | 34 | sys.path.insert(0, '/opt/pnda/app-packages/lib/python2.7/site-packages') 35 | 36 | ### Runtime dependencies 37 | 38 | For dependencies to be delivered at runtime, use addPyFile() in the normal way as described above. 39 | -------------------------------------------------------------------------------- /exploration/dependencies.md: -------------------------------------------------------------------------------- 1 | # Adding application dependencies 2 | 3 | Applications have two distinct sets of dependency resolution considerations. Firstly, there is a driver process which runs on one of the edge nodes or on a data node, depending on how it was launched. Secondly, there are executor processes which run on data nodes. 4 | 5 | ## PNDA 'app-packages' ## 6 | 7 | This is a mechanism for permanently installing Python dependencies into safe virtualenv sandboxes in designated locations around the PNDA cluster. Once installed, the dependencies can be referenced by applications from driver or executor processes. 
8 | 9 | The dependencies are [python pip libraries](https://pypi.python.org/pypi) and are specified in the format required by [pip](https://pypi.python.org/pypi/pip). Refer to https://packaging.python.org/guides/tool-recommendations/ for more information. 10 | 11 | Maintain this file: 12 | 13 | ```platform-salt/salt/app-packages/files/app-packages-requirements.txt``` 14 | 15 | Also ensure the PNDA mirror contains the corresponding packages. 16 | 17 | On creation of PNDA a default set of libraries will be installed ready for use. 18 | 19 | ### Updates ### 20 | 21 | If it becomes necessary to update the installed dependencies on PNDA, run this Salt State: 22 | 23 | ```sudo salt -C 'G@hadoop:role:EDGE or G@roles:jupyter or G@hadoop:role:DATANODE' state.sls app-packages``` 24 | 25 | ## Usage ## 26 | 27 | Use dependencies in the normal fashion, for example: 28 | 29 | ```import sharedroutines``` 30 | 31 | ## Distributing at runtime ## 32 | 33 | Sometimes it can be useful to distribute a dependency across the cluster only when an application is launched. For example, you may not want to permanently install a library on nearly every node on the cluster but only have it present where and when it's needed. 34 | 35 | These dependencies usually take the form of Python [eggs](https://setuptools.readthedocs.io/en/latest/formats.html) but could also be JARs or zip archives. 36 | 37 | In order for this approach to work the dependencies need to be accessible from anywhere on the cluster. Therefore, they are staged on the distributed file system. 38 | 39 | Maintain this file: 40 | 41 | ```platform-salt/salt/app-packages/files/app-packages-hdfs.txt``` 42 | 43 | ## Updates ## 44 | 45 | Run this Salt State to stage the listed files at the HDFS path configured in the Pillar (see platform-salt/pillar/pnda.sls): 46 | 47 | ```sudo salt-call state.sls app-packages.hdfs-sync``` 48 | 49 | Also ensure the PNDA mirror contains the corresponding packages. 
50 | 51 | ### Usage ### 52 | 53 | In your PySpark environment, use the SparkContext.addPyFile method to add any required dependencies, referencing the full HDFS path configured as mentioned above. For example: 54 | 55 | ```sc.addPyFile('hdfs:///pnda/deployment/app_packages/sharedroutines-0.1.egg')``` 56 | 57 | ## Mirror ## 58 | 59 | All application dependencies must be made available on the PNDA Mirror in the usual way. For more details regarding working with the PNDA Mirror, see the PNDA Guide. 60 | 61 | Add dependencies to be staged on HDFS (eggs, etc) to the ```/mirror_apps``` directory on the PNDA Mirror. 62 | 63 | Add dependencies to be managed by app-packages using the tools provided for managing Python libraries on the PNDA Mirror. See [mirror tools](https://github.com/pndaproject/pnda/tree/develop/mirror/tools) for more details. 64 | 65 | -------------------------------------------------------------------------------- /exploration/images/lab-challenge-2-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-challenge-2-output.png -------------------------------------------------------------------------------- /exploration/images/lab-challenge-3-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-challenge-3-output.png -------------------------------------------------------------------------------- /exploration/images/lab-example-1-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-1-output.png -------------------------------------------------------------------------------- 
/exploration/images/lab-example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-1.png -------------------------------------------------------------------------------- /exploration/images/lab-example-3-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-3-output.png -------------------------------------------------------------------------------- /exploration/images/lab-example-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-3.png -------------------------------------------------------------------------------- /exploration/images/lab-example-4-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-4-output.png -------------------------------------------------------------------------------- /exploration/images/lab-example-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-4.png -------------------------------------------------------------------------------- /exploration/images/lab-example-5-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-5-output.png 
-------------------------------------------------------------------------------- /exploration/images/lab-example-5-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-5-ui.png -------------------------------------------------------------------------------- /exploration/images/lab-example-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-5.png -------------------------------------------------------------------------------- /exploration/images/lab-example-6-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-6-output.png -------------------------------------------------------------------------------- /exploration/images/lab-example-6-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-6-ui.png -------------------------------------------------------------------------------- /exploration/images/lab-example-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-example-6.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell1-output.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell1-output.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell1.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell2.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell3-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell3-output.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell3.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell4-output.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell4-output.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell4.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell5-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell5-output.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell5-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell5-ui.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell5.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell6-output.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell6-output.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell6-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell6-ui.png -------------------------------------------------------------------------------- /exploration/images/lab-images/jupyter-example-cell6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/exploration/images/lab-images/jupyter-example-cell6.png -------------------------------------------------------------------------------- /exploration/jupyter.md: -------------------------------------------------------------------------------- 1 | # Using Jupyter 2 | 3 | The [Jupyter Notebook](http://jupyter.org) is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text. A notebook document is associated with a "computation engine", called notebook kernel. Kernels for many other languages exist (see [official kernels](http://jupyter.readthedocs.io/en/latest/#kernels)). 4 | 5 | Jupyter is integrated with PNDA platform as a standard service with three pre-configured kernels by default, primarily python kernels: 6 | 7 | * Python2 kernel 8 | * Python3 kernel 9 | * PySpark (Python2) kernel 10 | 11 | These kernels are installed along with 3rd-party libraries such as [numpy](http://www.numpy.org/) and [matplotlib](http://matplotlib.org/), which allow you to run scientific computing and visualization against your data. 
In addition to those fundamental scientific computing facilities, the PySpark (Python2) kernel also integrates PNDA [platform libraries](https://github.com/pndaproject/platform-libraries) that provide easy-to-use utilities for large-scale big data analytics using Spark. Such utilities include loading and decoding [PNDA Avro datasets](../producer/data-preparation.md) as an RDD, and submitting MapReduce jobs to the Hadoop cluster via YARN for data pre-processing, data transformation, and data querying. 12 | 13 | For those who would like to try Jupyter without pre-existing data, a data-generation tool is also provided that allows you to generate test datasets. Follow the instructions from [the example Jupyter notebook](https://github.com/pndaproject/example-applications/tree/master/jupyter-notebooks) to connect to the PNDA Jupyter node and start using Jupyter. Detailed explanations of the example Jupyter notebook cells can be found in the [Lab](lab.md). 14 | -------------------------------------------------------------------------------- /gettingstarted/hadoop_distro.md: -------------------------------------------------------------------------------- 1 | # Hadoop Distributions 2 | 3 | ## Choice of Hadoop Distribution 4 | PNDA can be provisioned with either [Hortonworks HDP](https://hortonworks.com/products/data-center/hdp/) or [Cloudera CDH](https://www.cloudera.com/products/open-source/apache-hadoop/key-cdh-components.html) as the Hadoop distribution. 5 | 6 | The Hadoop distribution provides the main data storage and data processing capabilities. The distribution brings together all the upstream component projects that make up 'Hadoop' in a tested package, with pre-built binaries and APIs for automated setup. 7 | 8 | ## How to select the distribution 9 | First you need to decide, based on feature set, licensing or pricing, which distribution to use. See below for a basic overview of the differences. 
10 | 11 | In terms of physically selecting the distribution, the Hadoop distribution to use is configured at PNDA creation time as a single setting in the pnda_env.yaml file. The PNDA setup instructions cover this at the appropriate point. 12 | 13 | The PNDA mirror (which provides all the resources required during PNDA creation) contains both CDH and HDP components, so there is no need to select one or the other when creating the PNDA mirror. 14 | 15 | ## Hortonworks HDP 16 | - [Hortonworks HDP](https://hortonworks.com/products/data-center/hdp/) is 100% open source 17 | - Uses Ambari for cluster monitoring, management, [additional UIs](https://docs.hortonworks.com/HDPDocuments/Ambari-2.5.1.0/bk_ambari-views/content/ch_understanding_ambari_views.html) and [setup](https://cwiki.apache.org/confluence/display/AMBARI/Blueprints) 18 | - Provides Hive for MPP SQL queries instead of Impala (a performance evaluation would be advisable for the specific workloads you will run, if this type of SQL query is important to your use cases). 19 | - A [commercial subscription](https://hortonworks.com/services/support/enterprise/) is required for support. 
20 | 21 | ## Cloudera CDH & Cloudera Manager 22 | - [Cloudera CDH](https://www.cloudera.com/products/open-source/apache-hadoop/key-cdh-components.html) is 100% open source 23 | - [Cloudera Manager](https://www.cloudera.com/products/product-components/cloudera-manager.html) and some other components are [proprietary](https://www.cloudera.com/content/dam/www/static/documents/datasheets/cloudera-enterprise-datasheet.pdf) (certain core features may be used for free but advanced cluster management features require a commercial licence) 24 | - Uses Cloudera Manager for cluster monitoring, management and [setup](https://cloudera.github.io/cm_api/) 25 | - Uses Hue for [additional UIs](http://gethue.com/) 26 | - Provides Impala for MPP SQL queries 27 | - A [commercial subscription](https://www.cloudera.com/more/services-and-support.html) is required for support. 28 | 29 | ## NOTE 30 | 31 | All information provided about Hortonworks and Cloudera products and services is for convenience purposes only and may not reflect current licencing or pricing. Please visit or contact Hortonworks and Cloudera directly to determine your rights and obligations. 
32 | -------------------------------------------------------------------------------- /images/Detailed Sequence for Kerberos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/Detailed Sequence for Kerberos.png -------------------------------------------------------------------------------- /images/Mainconsole with LDAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/Mainconsole with LDAP.png -------------------------------------------------------------------------------- /images/Mainconsole SPNEGO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/Mainconsole SPNEGO.png -------------------------------------------------------------------------------- /images/Pnda-security.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/Pnda-security.png -------------------------------------------------------------------------------- /images/Webconsole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/Webconsole.png -------------------------------------------------------------------------------- /images/dm-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/dm-overview.png 
-------------------------------------------------------------------------------- /images/kso-summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/kso-summary.png -------------------------------------------------------------------------------- /images/kso-tsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/kso-tsd.png -------------------------------------------------------------------------------- /images/openbmp_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/openbmp_flow.png -------------------------------------------------------------------------------- /images/package-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/package-server.png -------------------------------------------------------------------------------- /images/pnda-architecture2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/pnda-architecture2.png -------------------------------------------------------------------------------- /images/provisioning-heat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/provisioning-heat.png -------------------------------------------------------------------------------- /images/provisioning-salt-cloud.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/images/provisioning-salt-cloud.png -------------------------------------------------------------------------------- /instructions.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | Here are some instructions for using gitbook to build the PNDA guide. 4 | 5 | ## Website 6 | 7 | Check out the `pnda-guide` repo and change to that directory. 8 | 9 | Follow the gitbook [Installation and Setup](http://toolchain.gitbook.com/setup.html) guide. Basically, to install gitbook: 10 | 11 | ``` 12 | npm install gitbook-cli -g 13 | 14 | gitbook init 15 | ``` 16 | 17 | To build the HTML website: 18 | 19 | ``` 20 | gitbook build 21 | ``` 22 | 23 | ## PDF 24 | 25 | To build a PDF, follow the [eBook and PDF](http://toolchain.gitbook.com/ebook.html) instructions. This seems to require a Mac. 26 | 27 | Download the [Calibre](https://calibre-ebook.com/download) application and copy it to `/Applications`. Then make a symbolic link so gitbook can find it: 28 | 29 | ``` 30 | sudo ln -s /Applications/calibre.app/Contents/MacOS/ebook-convert /usr/local/bin 31 | ``` 32 | 33 | To build a PDF: 34 | 35 | ``` 36 | gitbook pdf ./ ./pnda-guide.pdf 37 | ``` 38 | 39 | ## Build script 40 | 41 | To build everything, just run: 42 | 43 | ``` 44 | scripts/build.sh 45 | ``` 46 | 47 | This will build the website, compress it to a `pnda-guide.zip` archive, and build the `pnda-guide.pdf` file. 48 | 49 | ## Repositories 50 | 51 | The `repos` directory contains a copy of the README file from every repository in the distribution. 
When the README files have changed, this command will copy them into the repos directory, along with any linked images: 52 | 53 | ``` 54 | scripts/copy-repos.sh 55 | ``` 56 | 57 | ## Uploading 58 | 59 | To upload the built guide to the website: 60 | 61 | ``` 62 | scripts/upload.sh pnda-website 0.0.6 staging 63 | scripts/upload.sh pnda-website 0.0.6 production 64 | ``` 65 | 66 | In this case, `pnda-website` should be a host defined in `~/.ssh/config`. 67 | -------------------------------------------------------------------------------- /log-aggregation/README.md: -------------------------------------------------------------------------------- 1 | # Log Aggregation 2 | 3 | Logs from the various component services that make up PNDA, and the applications that run on PNDA, are collected and stored on the logserver node. 4 | 5 | Logstash clients on each node monitor the various log files and push data in near real-time to a [logstash](https://www.elastic.co/products/logstash) server. The logstash server writes the logs into raw text files under `/var/log/pnda`, and also adds them into [elasticsearch](https://www.elastic.co). Elasticsearch indexes the logs and makes them available to [Kibana](https://www.elastic.co/products/kibana). 6 | 7 | ## ELK 8 | 9 | Kibana is linked from the main console under Metrics ► Quick Links ► PNDA Logserver. You can search for specific logs, or create graphs on various aspects of the log data. A basic dashboard for PNDA logs is provided out-of-the-box. 10 | 11 | ## Raw logs 12 | 13 | Plain text log files are written under `/var/log/pnda` on the logserver node. 14 | 15 | They are rotated with [logrotate](http://www.linuxcommand.org/man_pages/logrotate8.html), limiting each file to 10 MB with five prior versions retained. The logrotate config is in `/etc/logrotate.d/pnda`. It can be useful to tweak the settings for this to retain more logs for specific components when debugging a particular problem. 
16 | 17 | Something to beware of is that logrotate only runs every 15 minutes. So in the case that a rogue application writes a lot of log data in under 15 minutes (this isn't as hard as it sounds with big data—consider per-message debug output), the logserver could run out of disk space. 18 | 19 | ## Application Logs 20 | 21 | Application logs are aggregated from log files named `stdout`, `stderr` and `spark.log` for [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) applications. The log files are named `yarn_applicationId.log`. 22 | 23 | The YARN application ID (or IDs, in the case of a PNDA application that makes use of multiple YARN applications) can be obtained programmatically by querying the Deployment Manager (`/applications/application_id/detail`), or manually by looking in the YARN Resource Manager UI. This will be integrated into the PNDA Console in a future release. 24 | -------------------------------------------------------------------------------- /others/README.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | * [Avro](https://avro.apache.org/) 4 | * [Calibre](https://calibre-ebook.com/download) 5 | * [Cloudera Manager](https://www.cloudera.com/products/cloudera-manager.html) 6 | * [ElasticSearch](https://www.elastic.co) 7 | * [Express with Passport](http://mherman.org/blog/2015/01/31/local-authentication-with-passport-and-express-4/#.VqCrvfl95p8) 8 | * [Flink](http://flink.apache.org/) 9 | * [GitBook](http://www.gitbook.com/) 10 | * [Gobblin](https://github.com/linkedin/gobblin/) 11 | * [Grafana](http://grafana.org) 12 | * [HBase](https://hbase.apache.org/book.html) 13 | * [HDFS](http://hadoop.apache.org/) 14 | * [Heat](https://wiki.openstack.org/wiki/Heat) 15 | * [Hive](http://www.cloudera.com/documentation/archive/cdh/4-x/4-2-0/CDH4-Installation-Guide/cdh4ig_topic_18_4.html) 16 | * [Hue](http://gethue.com/) 17 | * 
[HWEventSource](https://github.com/opendaylight/coretutorials/tree/master/hweventsource) 18 | * [Impala Security](http://blog.cloudera.com/blog/2014/10/new-in-cdh-5-2-impala-authentication-with-ldap-and-kerberos/) 19 | * [Impala](http://impala.io) 20 | * [Java JDK](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) 21 | * [JMX Proxy](https://github.com/mk23/jmxproxy) 22 | * [Jupyter Notebook](http://jupyter.org) 23 | * [Kafka](http://kafka.apache.org/) 24 | * [Kerberos](http://web.mit.edu/kerberos/) 25 | * [Kibana](https://www.elastic.co/products/kibana) 26 | * [Kite](http://kitesdk.org/) 27 | * [Logrotate](http://www.linuxcommand.org/man_pages/logrotate8.html) 28 | * [Logstash guide](https://www.elastic.co/guide/en/logstash/5.3/index.html) 29 | * [Logstash](https://www.elastic.co/products/logstash) 30 | * [Maven](https://maven.apache.org/install.html) 31 | * [Node Security](https://nodesecurity.io/) 32 | * [Node.js security threats](https://speakerdeck.com/ckarande/top-overlooked-security-threats-to-node-dot-js-web-applications) 33 | * [Oozie](https://oozie.apache.org) 34 | * [OpenBMP](http://www.openbmp.org/) 35 | * [OpenDaylight](https://wiki.opendaylight.org/) 36 | * [OpenStack](https://www.openstack.org/) 37 | * [OpenTSDB](http://opentsdb.net/) 38 | * [PAM](http://linux.die.net/man/5/pam_ldap) 39 | * [Rackspace](http://www.rackspace.com) 40 | * [Salt Cloud installation](https://docs.saltstack.com/en/latest/topics/cloud/install/index.html) 41 | * [SaltStack](https://docs.saltstack.com/en/getstarted/overview.html) 42 | * [Scala](http://www.scala-sbt.org/release/docs/Setup.html) 43 | * [Schema-on-read](https://www.techopedia.com/definition/30153/schema-on-read) 44 | * [Secure Impersonation](https://hadoop.apache.org/docs/r1.2.1/Secure_Impersonation.html) 45 | * [Semantic versioning](http://semver.org/) 46 | * [Spark](http://spark.apache.org) 47 | * 
[SPNEGO](https://github.com/adaltas/node-krb5/blob/master/samples/spnego_async.js) 48 | * [SSH Public Key](https://git-scm.com/book/be/v2/Git-on-the-Server-Generating-Your-SSH-Public-Key) 49 | * [Strict versioning](http://legacy.python.org/dev/peps/pep-0386) 50 | * [TripleO](http://docs.openstack.org/developer/tripleo-docs/) 51 | * [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) 52 | * [Zookeeper](http://zookeeper.apache.org/) 53 | 54 | -------------------------------------------------------------------------------- /provisioning/OVERVIEW.md: -------------------------------------------------------------------------------- 1 | # Creating PNDA - Overview 2 | 3 | PNDA can be deployed on OpenStack, Amazon AWS, VMWare or bare metal servers. The process is similar for each platform. 4 | 5 | ![](PNDA-overview.jpg) 6 | 7 | To create PNDA, select a deployment target, then follow one of these tracks - 8 | 9 | |[Creating PNDA on OpenStack](openstack/PREPARE.md)|[Creating PNDA on Amazon AWS](aws/PREPARE.md)|[Creating PNDA on VMWare](vmware/PREPARE.md)|[Creating PNDA on server clusters](server-cluster/PREPARE.md)| 10 | | --- | --- | --- | --- | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /provisioning/PNDA-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/PNDA-overview.jpg -------------------------------------------------------------------------------- /provisioning/README.md: -------------------------------------------------------------------------------- 1 | # Creating PNDA 2 | 3 | This chapter describes how to create a PNDA cluster, and includes some background information on SaltStack, OpenStack Heat and AWS CloudFormation. 
4 | 5 | * [Creating PNDA](OVERVIEW.md) 6 | 7 | ## Background information 8 | 9 | * [Getting started with Heat](heat.md) 10 | * [Getting started with AWS](aws.md) 11 | * [Getting started with SaltStack](saltstack.md) 12 | * [Getting started with VMWare](vmware.md) 13 | 14 | 15 | -------------------------------------------------------------------------------- /provisioning/aws.md: -------------------------------------------------------------------------------- 1 | # Getting Started with CloudFormation on AWS 2 | 3 | [CloudFormation](https://aws.amazon.com/cloudformation) is an easy way to create and manage AWS resources. 4 | 5 | For a simple guide to getting started with developing templates on AWS you can check out the [tutorials](http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/GettingStarted.html) on the AWS documentation site. 6 | 7 | Finally, follow the instructions for [creating PNDA on AWS](OVERVIEW.md). 8 | -------------------------------------------------------------------------------- /provisioning/aws/BUILD.md: -------------------------------------------------------------------------------- 1 | # Create PNDA build components 2 | 3 | ![](../images/breadcrumbs-build.jpg) 4 | 5 | ## Introduction 6 | 7 | In addition to projects like Hadoop and Kafka, PNDA also includes a variety of components that provide an operations console, application deployment and more. Build these components before provisioning PNDA. 8 | 9 | ## Create build node 10 | 11 | #### Select build node 12 | 13 | Designate or create the PNDA build node. This could be the same machine that was used to build the mirror file sets. 14 | 15 | Two types of build node are supported - 16 | 17 | - Red Hat Enterprise Linux 7 18 | - CentOS 7 19 | 20 | 21 | #### Obtain build tools 22 | 23 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to build PNDA. 24 | 25 | Decide which version of PNDA you want to create. 
All PNDA releases are designated with a tag similar to ```release/4.0``` across all repositories. 26 | 27 | Clone this repository at the right version to the build node. 28 | 29 | #### Configure the proxy. (Optional) 30 | 31 | The entire PNDA build process can be performed from behind a non-transparent proxy. 32 | 33 | To proceed in this mode, first set the system configuration and then run the ```set-proxy-env.sh``` script that will set up the various proxy configurations needed by the multiple build tools. 34 | 35 | ```sh 36 | sudo su 37 | export http_proxy=http://<proxy_host>:<proxy_port> 38 | export https_proxy=http://<proxy_host>:<proxy_port> 39 | . set-proxy-env.sh 40 | ``` 41 | 42 | #### Preparing the build environment 43 | 44 | The tools are found in the [build folder](https://github.com/pndaproject/pnda/tree/master/build). 45 | 46 | The script ```install-build-tools.sh``` installs all the necessary build prerequisites. 47 | 48 | Run it with superuser privileges in the location that you wish to install your build tools. 49 | 50 | For example 51 | 52 | ```sh 53 | sudo su 54 | cd /home/builder 55 | ./install-build-tools.sh 56 | ``` 57 | 58 | As well as installing all the required software, it may pause and ask the operator to carry out some configuration on the build environment, for example adjusting the contents of /etc/hosts. 59 | 60 | The script generates a file called ```set-pnda-env.sh``` containing the necessary environment settings needed to carry out builds. Ensure this is invoked before each build. 61 | 62 | For example 63 | 64 | ```sh 65 | . /home/builder/set-pnda-env.sh 66 | ``` 67 | 68 | Your environment is now ready to build PNDA. 69 | 70 | ## Building PNDA 71 | 72 | The script ```build-pnda.sh``` is invoked as a non-privileged user. 73 | 74 | If you are running behind a non-transparent proxy, go through the [proxy configuration](#configure-the-proxy-optional) steps again for the non-privileged user (don't substitute user). 
75 | 76 | For example 77 | 78 | ```sh 79 | cd pnda 80 | ./build-pnda.sh RELEASE release/4.0 81 | ``` 82 | 83 | It is also possible to perform more complex builds including building to a specific bill-of-materials. Please refer to the [repository notes](https://github.com/pndaproject/pnda). 84 | 85 | ## Build Products 86 | 87 | All build products are assembled in the directory ```pnda-dist```. 88 | 89 | # [Next](STAGE.md) 90 | 91 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 92 | | --- | --- | --- | --- | --- | --- | --- | 93 | -------------------------------------------------------------------------------- /provisioning/aws/CONFIGURE.md: -------------------------------------------------------------------------------- 1 | # Configure PNDA creation process 2 | 3 | ![](../images/breadcrumbs-cfg.jpg) 4 | 5 | ## Introduction 6 | 7 | The PNDA creation process is controlled primarily via a YAML configuration file. 8 | 9 | A template YAML configuration can be found in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli). 10 | 11 | ## Configure pnda_env.yaml 12 | 13 | #### Designate client machine 14 | 15 | Create or designate a suitable machine for running the PNDA CLI. We recommend CentOS 7. 16 | 17 | #### Obtain code 18 | 19 | Clone the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) repository from the master branch at a specific release tag (e.g. ```release/3.5```) to the client machine. 20 | 21 | Copy ```pnda_env_example.yaml``` to create ```pnda_env.yaml``` 22 | 23 | #### Set image 24 | 25 | Set the following image related fields as below. 
26 | 27 | | Field | RHEL | CentOS | 28 | | --- | --- | --- | 29 | | `cloud_formation_parameters.imageId` | Select 64 bit RHEL 7 image for region | Select 64 bit CentOS 7 image for region | 30 | | `ec2_access.OS_USER` | ec2-user | centos | 31 | 32 | #### Set ec2 access keys 33 | 34 | Set `ec2_access.AWS_ACCESS_KEY_ID` and `ec2_access.AWS_SECRET_ACCESS_KEY` to the credentials created for accessing ec2 and CloudFormation. 35 | 36 | These credentials are only stored on the client machine. 37 | 38 | #### Hadoop distribution 39 | 40 | Decide whether you want to run the Cloudera CDH or the Hortonworks HDP Hadoop distribution. 41 | 42 | Set `hadoop.HADOOP_DISTRO` to either `CDH` or `HDP`. 43 | 44 | #### Set source of SaltStack provisioning scripts 45 | 46 | The PNDA software is installed and configured using the SaltStack code found in the [platform-salt](https://github.com/pndaproject/platform-salt) repository. This can be supplied in two main ways. 47 | 48 | ##### Local copy 49 | 50 | A local copy of platform-salt can be used by setting `platform_salt.PLATFORM_SALT_LOCAL` to the path to the platform-salt folder on the client machine. 51 | 52 | ##### Git repository 53 | 54 | Set `platform_salt.PLATFORM_GIT_REPO_URI` and `platform_salt.PLATFORM_GIT_BRANCH` to clone a remote git URI at the specified branch during provisioning. 55 | 56 | If authenticated access to `platform_salt.PLATFORM_GIT_REPO_URI` is required then place the ssh key to use, named git.pem, in the top level directory of this repository and set `platform_salt.PLATFORM_GIT_REPO_HOST` to the hostname of the server. 57 | 58 | **Note** Please ensure that the local clone of platform-salt or `platform_salt.PLATFORM_GIT_BRANCH` correspond to the same release tag as the pnda-cli repository cloned above. 59 | 60 | #### Object storage 61 | 62 | Set `pnda_application_repo.PNDA_APPS_CONTAINER` to the Application container configured during the preparation phase. 
63 | 64 | Set `pnda_application_repo.PNDA_APPS_FOLDER` to the Application folder configured during the preparation phase. 65 | 66 | Set `pnda_data_archive.PNDA_ARCHIVE_CONTAINER` to the Dataset archive container configured during the preparation phase. 67 | 68 | Set `pnda_application_repo.PNDA_APPS_ACCESS_KEY_ID` and `pnda_application_repo.PNDA_APPS_SECRET_ACCESS_KEY` to the credentials created for object storage during the preparation phase. 69 | 70 | #### PNDA mirror 71 | 72 | Set `mirrors.PNDA_MIRROR` to the URI determined by the placement of the mirror and build components in the staging phase. 73 | 74 | #### Other fields 75 | 76 | There are a wide range of parameters that can be set, please refer to ```pnda_env_example.yaml``` in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) for more details. 77 | 78 | ## Security Material 79 | 80 | #### Perimeter security (FQDN's and associated certificates/private keys) 81 | Access to the PNDA cluster requires user authentication over a secure connection. In order to secure this user authentication, the perimeter servers require certification material which allows validating the FQDN used to access those servers to further authenticate and secure the connection to those servers. 82 | 83 | For PRODUCTION ENVIRONMENTS, this security material MUST be generated outside the PNDA realm and dropped under the [platform-certificates](https://github.com/pndaproject/pnda-cli/tree/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/platform-certificates) directory tree. Consult the README files under that same directory and sub-directories for further details on the required material. 
84 | 85 | For NON-PRODUCTION ENVIRONMENTS, a helper tool ([tools/gen-certs.py](https://github.com/pndaproject/pnda-cli/blob/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/tools/gen-certs.py)) is provided that can auto-generate the required server certificates based on an existing CA (private key) or based on a newly generated CA (when no private key is detected in the ./platform-certificates directory by the helper tool). 86 | 87 | #### SSH key pair 88 | 89 | Create [an ssh keypair](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to use when creating the ec2 instances for PNDA as ```key_name```. 90 | 91 | Place the private key ```key_name.pem``` in the root of the pnda-cli directory. 92 | 93 | Ensure that key_name.pem has 0600 permissions. 94 | 95 | For example, if the keypair in AWS is 'pnda' then the local private key file should be named 'pnda.pem'. 96 | 97 | # [Next](CREATE.md) 98 | 99 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 100 | | --- | --- | --- | --- | --- | --- | --- | 101 | -------------------------------------------------------------------------------- /provisioning/aws/CREATE.md: -------------------------------------------------------------------------------- 1 | # Create PNDA cluster 2 | 3 | ![](../images/breadcrumbs-create.jpg) 4 | 5 | ## Introduction 6 | 7 | With the target platform fully prepared, the PNDA software staged and the YAML configuration completed, the final step is to invoke the PNDA CLI to create the cluster. 8 | 9 | ## Create PNDA 10 | 11 | #### Install CLI dependencies 12 | 13 | On the client machine, install the pip packages required by the CLI. 
Navigate to the folder containing the pnda-cli repository, then run the following commands: 14 | 15 | ``` 16 | cd cli 17 | sudo pip install -r requirements.txt 18 | ``` 19 | 20 | #### CLI invocation 21 | 22 | **Important:** ensure you are certain what version of PNDA you want to deploy, and specify the correct branch or tag when invoking the CLI using the -b option. In most circumstances you'll want to make sure the branch or tag you specify is identical to the branch or tag you used to build the PNDA mirror, and identical to the version you checked out from the pnda-cli repository. All PNDA releases are designated with a tag such as ```release/4.0``` across all repositories. 23 | 24 | An example CLI invocation - 25 | 26 | ``` 27 | cd cli 28 | pnda-cli.py create -e <cluster_name> -s <key_name> -f standard -o 2 -n 3 -k 2 -z 3 -b release/4.0 29 | ``` 30 | 31 | The options shown select the standard flavor, 2 openTSDB instances, 3 hadoop datanodes, 2 kafka brokers, and 3 zookeeper nodes. If you need to operate within the default EC2 instance quota of 20 instances then you can reduce this to 1 kafka and 1 zookeeper instance or use the pico flavor. 32 | 33 | ``` 34 | pnda-cli.py create -e <cluster_name> -s <key_name> -f standard -o 1 -n 1 -k 1 -z 1 -b release/4.0 35 | pnda-cli.py create -e <cluster_name> -s <key_name> -f pico -n 1 -k 1 -b release/4.0 36 | ``` 37 | 38 | Note that CLI parameter -s refers to the key configured in the previous phase. For example, if the keypair in AWS is 'pnda' then the local private key file should be named 'pnda.pem' and you should pass '-s pnda' to the CLI. 39 | 40 | 41 | A small cluster typically takes around 30 minutes to fully provision while a larger cluster can take 50 minutes. 
42 | 43 | # [Home](../OVERVIEW.md) 44 | 45 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 46 | | --- | --- | --- | --- | --- | --- | --- | 47 | -------------------------------------------------------------------------------- /provisioning/aws/EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # HTTP Server examples 2 | 3 | These are some examples of how an HTTP server can be set up to serve files for PNDA provisioning. 4 | 5 | These notes are intended for guidance only and will not be maintained or supported. You are strongly advised to refer to the official documentation for each technology. 6 | 7 | ## Apache HTTP Server on CentOS 8 | 9 | Assuming a working CentOS 7 server and that built components are in directory pnda-dist below the current working directory - 10 | 11 | Install Apache HTTP Server - 12 | 13 | sudo yum install httpd 14 | 15 | Copy components to the HTTP server - 16 | 17 | cp pnda-dist/* /var/www/html/ 18 | 19 | The components are now available via HTTP from the server - 20 | 21 | http://server/some-component-0.1.0.tar.gz 22 | 23 | ## Python 24 | 25 | Assuming that the built components are in the current working directory - 26 | 27 | sudo python -m SimpleHTTPServer 80 28 | 29 | The components are now available via HTTP from the server - 30 | 31 | http://server/some-component-0.1.0.tar.gz 32 | 33 | ## Vagrant 34 | 35 | The starting assumption is that you have a working Vagrant installation including a suitable provider plugin for the target environment and that built components are in directory pnda-dist below the current working directory. 36 | 37 | **AWS** 38 | 39 | For this example we'll use a popular [AWS provider](https://github.com/mitchellh/vagrant-aws). 
40 | 41 | Create a Vagrantfile, substituting in your AWS configuration - 42 | 43 | VAGRANTFILE_API_VERSION = "2" 44 | 45 | Vagrant.configure("2") do |config| 46 | config.vm.box = "dummy" 47 | 48 | config.vm.provider :aws do |aws, override| 49 | aws.access_key_id = "" 50 | aws.secret_access_key = "" 51 | aws.keypair_name = "" 52 | aws.ami = "" 53 | aws.region = "" 54 | aws.security_groups = [ '' ] 55 | override.ssh.username = "centos" 56 | override.ssh.private_key_path = "" 57 | end 58 | config.vm.provision :shell, path: "bootstrap.sh" 59 | end 60 | 61 | Create a bootstrap.sh - 62 | 63 | yum install -y httpd 64 | rm -rf /var/www/html 65 | ln -fs /vagrant/pnda-dist /var/www/html 66 | 67 | Start the instance - 68 | 69 | vagrant up --provider=aws 70 | 71 | The components are now available via HTTP from the server in AWS - 72 | 73 | http://server/some-component-0.1.0.tar.gz 74 | 75 | If you need to update the components later - 76 | 77 | vagrant rsync 78 | 79 | **OpenStack** 80 | 81 | For this example we'll use a popular [OpenStack provider](https://github.com/cloudbau/vagrant-openstack-plugin). 
82 | 83 | Create a Vagrantfile, substituting in your OpenStack configuration - 84 | 85 | Vagrant.configure("2") do |config| 86 | config.vm.box = "dummy" 87 | config.ssh.username = "" 88 | config.ssh.private_key_path = "" 89 | 90 | config.vm.provider :openstack do |os| 91 | os.username = "" 92 | os.api_key = "" 93 | os.flavor = "" 94 | os.region = "" 95 | os.image = "" 96 | os.endpoint = "" 97 | os.tenant = "" 98 | os.keypair_name = "" 99 | os.floating_ip = :auto 100 | os.floating_ip_pool = "" 101 | os.networks = [''] 102 | end 103 | config.vm.provision :shell, path: "bootstrap.sh" 104 | end 105 | 106 | Create a bootstrap.sh - 107 | 108 | yum install -y httpd 109 | rm -rf /var/www/html 110 | ln -fs /vagrant/pnda-dist /var/www/html 111 | 112 | Start the instance - 113 | 114 | vagrant up --provider=openstack 115 | 116 | The components are now available via HTTP from the server in OpenStack - 117 | 118 | http://server/some-component-0.1.0.tar.gz 119 | 120 | If you need to update the components later - 121 | 122 | vagrant provision 123 | 124 | ## Docker 125 | 126 | Assuming a working Docker installation and that built components are in directory pnda-dist below the current working directory - 127 | 128 | Create a Dockerfile - 129 | 130 | FROM httpd:2.4 131 | COPY ./pnda-dist/ /usr/local/apache2/htdocs/ 132 | 133 | Build the container - 134 | 135 | sudo docker build -t package-server . 
136 | sudo docker run -dit -p 80:80 --name package-server package-server 137 | 138 | The components are now available via HTTP from the server - 139 | 140 | http://server/some-component-0.1.0.tar.gz 141 | 142 | If you need to update the components later - 143 | 144 | sudo docker cp pnda-dist/some-component-0.2.0.tar.gz package-server:/usr/local/apache2/htdocs/ 145 | 146 | 147 | -------------------------------------------------------------------------------- /provisioning/aws/MIRROR.md: -------------------------------------------------------------------------------- 1 | # Create PNDA mirror components 2 | 3 | ![](../images/breadcrumbs-mirror.jpg) 4 | 5 | ## Introduction 6 | 7 | As many real-world deployment environments don’t have Internet connectivity and online sources are not always dependable, PNDA is created from a cache of all the required software known as the "PNDA mirror". 8 | 9 | Before PNDA can be created, first we must create the directory structure and file sets to be placed on the PNDA mirror. 10 | 11 | ### RHEL or CentOS 12 | 13 | PNDA can be created on RHEL or CentOS instances. Before building the PNDA mirror components, decide which instance type your deployment will use. 14 | 15 | ## Create mirror 16 | 17 | #### Select build node 18 | 19 | Designate or create the mirror build node. 20 | 21 | This can be a physical machine or a VM but it needs to reflect the type of mirror you wish to build and must be clean. 22 | 23 | Two types of mirror are supported - 24 | 25 | - Red Hat Enterprise Linux 7 26 | - CentOS 7 27 | 28 | If using Red Hat, ensure the mirror is built on a clean instance that has had **absolutely no additional** packages installed via yum (i.e. git, unzip, etc) as this will interfere with the dependency calculations of which RPM packages are required on the mirror. 
29 | 30 | #### Obtain mirror build tools 31 | 32 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to create and maintain the mirror file sets. 33 | 34 | Decide which version of PNDA you want to create. All PNDA releases are designated with a tag similar to ```release/4.0``` across all repositories. 35 | 36 | Clone this repository at the right version to the mirror creation node. 37 | 38 | #### Configure the proxy. (Optional) 39 | 40 | The entire mirror build process can be performed from behind a non-transparent proxy. 41 | 42 | To proceed in this mode, first set the system configuration and then run the ```set-proxy-env.sh``` script that will set up the various proxy configurations needed by the multiple build tools. 43 | 44 | ```sh 45 | sudo su 46 | export http_proxy=http://<proxy_host>:<proxy_port> 47 | export https_proxy=http://<proxy_host>:<proxy_port> 48 | . set-proxy-env.sh 49 | ``` 50 | 51 | #### Build mirror file sets 52 | 53 | The build tools are found in the [mirror folder](https://github.com/pndaproject/pnda/tree/master/mirror). 54 | 55 | To run the entire mirror creation process - 56 | 57 | ```sh 58 | sudo su 59 | ./create_mirror.sh 60 | ``` 61 | 62 | The script automatically detects the host Linux distribution and builds the appropriate file sets. 63 | 64 | This takes about 20 minutes to run and the output will be available in a directory named ```mirror-dist```. 65 | 66 | ##### Building parts of the mirror 67 | 68 | The different parts of the mirror can be created separately if required. 
The scripts to do this are - 69 | 70 | ``` 71 | create_mirror_rpm.sh 72 | create_mirror_misc.sh 73 | create_mirror_python.sh 74 | create_mirror_anaconda.sh 75 | create_mirror_cdh.sh 76 | create_mirror_hdp.sh 77 | create_mirror_apps.sh 78 | ``` 79 | 80 | Each script creates its output in a directory named for the respective mirror type - 81 | 82 | ``` 83 | mirror_rpm 84 | mirror_misc 85 | mirror_python 86 | mirror_anaconda 87 | mirror_cloudera 88 | mirror_hdp 89 | mirror_apps 90 | ``` 91 | 92 | For more about creating and maintaining mirrors, please refer to the [repository notes](https://github.com/pndaproject/pnda/tree/master/mirror). 93 | 94 | # [Next](BUILD.md) 95 | 96 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 97 | | --- | --- | --- | --- | --- | --- | --- | 98 | -------------------------------------------------------------------------------- /provisioning/aws/STAGE.md: -------------------------------------------------------------------------------- 1 | # Stage PNDA mirror & components 2 | 3 | ![](../images/breadcrumbs-stage.jpg) 4 | 5 | ## Introduction 6 | 7 | In order to make built components available during the PNDA provisioning process, they need to be staged in a location that is accessible from the target environment via HTTP. 8 | 9 | ## Create server 10 | 11 | Create an ordinary HTTP server in the target environment or identify an existing server. The server must have connectivity with the PNDA cluster being provisioned. 12 | 13 | See [these tips](EXAMPLES.md) for rapidly creating an HTTP server using a number of different approaches. 14 | 15 | Your existing CICD system may already include the capability to host and serve build artifacts over HTTP. In this case, simply make use of an appropriate location on the existing resource. 
16 | 17 | ## Stage files 18 | 19 | Copy the *contents* of ```mirror-dist``` and ```pnda-dist``` from the mirror creation and build steps respectively to the HTTP server. 20 | 21 | The final directory layout should resemble the following - 22 | 23 | ``` 24 | pnda-root 25 | │ 26 | ├── console-backend-data-logger-develop.tar.gz 27 | ├── console-backend-data-logger-develop.tar.gz.sha512.txt 28 | ├── etc 29 | │ 30 | ├── mirror_anaconda 31 | │   ├── Anaconda-4.0.0-el7.parcel 32 | │ ├── etc 33 | │ 34 | ├── mirror_rpm 35 | │   ├── a-rpm.rpm 36 | │ ├── etc 37 | │ 38 | ├── etc 39 | ``` 40 | 41 | Note that ```pnda-root``` can be any location, all that is required is that the hierarchy under this is available via a known URI. For example, using a standard Apache 2 installation on RHEL 7, if the hierarchy above is placed in ```/var/www/html``` this will be available via the URI ```http:///``` since ```/var/www/html``` is the default *document root*. Please refer to your HTTP server documentation for more details. 42 | 43 | Make a note of the URI to ```pnda-root``` as this will be used in configuring the PNDA creation process. 
44 | 45 | # [Next](CONFIGURE.md) 46 | 47 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 48 | | --- | --- | --- | --- | --- | --- | --- | 49 | -------------------------------------------------------------------------------- /provisioning/aws/images/attach-policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/attach-policy.png -------------------------------------------------------------------------------- /provisioning/aws/images/create-policy1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/create-policy1.png -------------------------------------------------------------------------------- /provisioning/aws/images/create-policy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/create-policy2.png -------------------------------------------------------------------------------- /provisioning/aws/images/create-policy3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/create-policy3.png -------------------------------------------------------------------------------- /provisioning/aws/images/create-user1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/create-user1.png 
-------------------------------------------------------------------------------- /provisioning/aws/images/create-user2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/aws/images/create-user2.png -------------------------------------------------------------------------------- /provisioning/baremetal/BUILD.md: -------------------------------------------------------------------------------- 1 | # Create PNDA build components 2 | 3 | ![](../images/breadcrumbs-build.jpg) 4 | 5 | ## Introduction 6 | 7 | In addition to projects like Hadoop and Kafka, PNDA also includes a variety of components that provide an operations console, application deployment and more. Build these components before provisioning PNDA. 8 | 9 | ## Create build machine 10 | 11 | #### Select build machine 12 | 13 | Designate or create the PNDA build machine. This could be the same machine that was used to build the mirror file sets. 14 | 15 | Two types of build node are supported - 16 | 17 | - Red Hat Enterprise Linux 7 18 | - CentOS 7 19 | 20 | 21 | #### Obtain build tools 22 | 23 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to build PNDA. 24 | 25 | Clone this repository to the build machine. The tools are found in the [build folder](https://github.com/pndaproject/pnda). 26 | 27 | #### Preparing the build environment 28 | 29 | The script ```install-build-tools.sh``` installs all the necessarily build prerequisites. 30 | 31 | Run it with superuser privileges in the location that you wish to install your build tools. 
32 | 33 | For example 34 | 35 | ```sh 36 | sudo su 37 | cd /home/builder 38 | ./install-build-tools.sh 39 | ``` 40 | 41 | As well as installing all the required software, it may pause and ask the operator to carry out some configuration on the build environment, for example adjusting the contents of /etc/hosts. 42 | 43 | Once it has finished, a file called ```set-pnda-env.sh``` will be found in the working directory. This script contains the necessary environment variables and other settings needed to carry out builds. It should either be added to the end of an initialization script such as ```/etc/bash.bashrc``` so that these settings are available for new shells, or it can be invoked with each build. 44 | 45 | For example 46 | 47 | ```sh 48 | sudo su 49 | cat >> /etc/bash.bashrc 50 | . /home/builder/set-pnda-env.sh 51 | ``` 52 | 53 | Your environment is now ready to build PNDA. 54 | 55 | ## Building PNDA 56 | 57 | The script ```build-pnda.sh``` is invoked as a non-privileged user. 58 | 59 | For example 60 | 61 | ```sh 62 | cd pnda 63 | ./build-pnda.sh RELEASE release/3.5 64 | ``` 65 | 66 | It is also possible to perform more complex builds including building to a specific bill-of-materials. Please refer to the [repository notes](https://github.com/pndaproject/pnda). 67 | 68 | ## Build Products 69 | 70 | All build products are assembled in the directory ```pnda-dist```. 
71 | 72 | # [Next](STAGE.md) 73 | 74 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 75 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 76 | -------------------------------------------------------------------------------- /provisioning/baremetal/CONFIGURE.md: -------------------------------------------------------------------------------- 1 | # Configure PNDA creation process 2 | 3 | ![](../images/breadcrumbs-cfg.jpg) 4 | 5 | ## Introduction 6 | 7 | The PNDA creation process is controlled primarily via a YAML configuration file. 8 | 9 | A template YAML configuration can be found in the [Heat templates repository](https://github.com/pndaproject/pnda-heat-templates). 10 | 11 | ## Configure pnda_env.yaml 12 | 13 | #### Designate client machine 14 | 15 | Create or designate a suitable machine for running the PNDA CLI. We recommend CentOS 7. 16 | 17 | #### Obtain code 18 | 19 | Clone the [Heat templates repository](https://github.com/pndaproject/pnda-heat-templates) repository from the master branch at a specific release tag (e.g. ```release/3.5```) to the client machine. 20 | 21 | Copy ```pnda_env_example.yaml``` to create ```pnda_env.yaml``` 22 | 23 | #### Set access credentials 24 | 25 | Set the following fields. The values can be obtained by referring to the Keystone authentication details obtained in the preparation phase. 
26 | 27 | | Field | Value | 28 | | --- | --- | 29 | |keystone_user| User for creating PNDA | 30 | |keystone_password| Password for user | 31 | |keystone_tenant| Tenant for creating PNDA | 32 | |keystone_auth_url| Authorization URL | 33 | |keystone_auth_version| 2 or 3 | 34 | |keystone_region_name| Region name | 35 | 36 | #### Object storage 37 | 38 | Set `pnda_apps_container` to the Application container configured during the preparation phase. 39 | 40 | Set `pnda_apps_folder` to the Application folder configured during the preparation phase. 41 | 42 | Set `pnda_archive_container` to the Dataset archive container configured during the preparation phase. 43 | 44 | #### Hadoop distribution 45 | Decide whether you want to run the Cloudera CDH or the Hortonworks HDP Hadoop distribution. 46 | 47 | Set `hadoop_distro` to either `CDH` or `HDP`. 48 | 49 | #### Set source of SaltStack provisioning scripts 50 | 51 | The PNDA software is installed and configured using the SaltStack code found in the [platform-salt](https://github.com/pndaproject/platform-salt) repository. This must be supplied via a URI to a git repository. 52 | 53 | Set `platform_git_repo_uri` to the required git URI at the specified branch during provisioning. 54 | 55 | If authenticated access to `platform_git_repo_uri` is required then place the private SSH key to use, named ```deploy```, in the top level of the pnda-heat-templates repository. 56 | 57 | **Note** that by default the master branch of the specified git repository is used in provisioning. See below for other fields that can be used to control this behaviour. 58 | 59 | #### PNDA mirror 60 | 61 | Set `PndaMirror` to the URI determined by the placement of the mirror and build components in the staging phase. 62 | 63 | #### Other fields 64 | 65 | There are a wide range of parameters that can be set, please refer to ```pnda_env_example.yaml``` in the [Heat templates repository](https://github.com/pndaproject/pnda-heat-templates) for more details. 
66 | ## Security Material 67 | 68 | #### Perimeter security (FQDN's and associated certificates/private keys) 69 | Access to the PNDA cluster requires user authentication over a secure connection. In order to secure this user authentication, the perimeter servers require certification material which allows validating the FQDN used to access those servers to further authenticate and secure the connection to those servers. 70 | 71 | For PRODUCTION ENVIRONMENTS, this security material MUST be generated outside the PNDA realm and dropped under the [platform-certificates](https://github.com/pndaproject/pnda-cli/tree/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/platform-certificates) directory tree. Consult the README files under that same directory and sub-directories for further details on the required material. 72 | 73 | For NON-PRODUCTION ENVIRONMENTS, a helper tool ([tools/gen-certs.py](https://github.com/pndaproject/pnda-cli/blob/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/tools/gen-certs.py)) is provided that can auto-generate the required server certificates based on an existing CA (private key) or based on a newly generated CA (when no private key is detected in the ./platform-certificates directory by the helper tool). 
74 | 75 | # [Next](CREATE.md) 76 | 77 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 78 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 79 | -------------------------------------------------------------------------------- /provisioning/baremetal/CREATE.md: -------------------------------------------------------------------------------- 1 | # Create PNDA cluster 2 | 3 | ![](../images/breadcrumbs-create.jpg) 4 | 5 | ## Introduction 6 | 7 | With the target platform fully prepared, the PNDA software staged and the YAML configuration completed, the final step is to invoke the PNDA CLI to create the cluster. 8 | 9 | Underneath, Nova will select suitable machines via Ironic based on the earlier flavor tagging and the machines will be PXE booted and images installed on the physical disks. 10 | 11 | ## Create PNDA 12 | 13 | #### Install CLI dependencies 14 | 15 | To use the PNDA command line interface, you will need to install the python, heat and nova clients. To install them on CentOS, run: 16 | 17 | ``` 18 | sudo yum install -y epel-release 19 | sudo yum install -y python python-pip python-devel 20 | 21 | cd cli 22 | sudo pip install -r requirements.txt 23 | ``` 24 | 25 | #### Keystone authentication 26 | 27 | You must authenticate with Keystone before using the OpenStack clients. 28 | 29 | The easiest way to accomplish this is to download the credentials file usually named -openrc.sh from Horizon for the given user and tenant, then source it in the shell you will use to create PNDA. 30 | 31 | ``` 32 | . -openrc.sh 33 | ``` 34 | 35 | #### CLI invocation 36 | 37 | The heat_cli.py scripts allows to launch a PNDA deployment. It sits in the cli subdirectory. 
38 | 39 | 40 | ``` 41 | cd cli 42 | ./heat_cli.py 43 | usage: heat_cli.py [-h] [-y] [-e PNDA_CLUSTER] [-n DATANODES] 44 | [-o OPENTSDB_NODES] [-k KAFKA_NODES] [-z ZK_NODES] 45 | [-f {standard}] [-b BRANCH] [-s KEYPAIR] 46 | {create,destroy} 47 | ``` 48 | 49 | In particular note that the -s option refers to the key pair created in the OpenStack tenant during the preparation phase. 50 | 51 | Make sure you have access to the private key of this key pair otherwise you will not be able to connect to the bastion node and access the cluster. 52 | 53 | ``` 54 | ./heat_cli.py -e pnda-cluster -f bmstandard -b master -s -bare true -fstype local create 55 | ``` 56 | 57 | #### Updating the undercloud ```/etc/hosts``` file 58 | ``` 59 | openstack server list -c Networks -c Name | grep -v Networks|awk {'print $4,$2'}|cut -d\= -f2 - |sudo tee -a /etc/hosts 60 | ``` 61 | 62 | #### Connecting to PNDA 63 | 64 | Forward host port 2222 to the undercloud ssh port 65 | ``` 66 | iptables -t nat -I PREROUTING -p tcp -d 10.60.19.29 --dport 2222 -j DNAT --to-destination 192.168.122.73:22 67 | iptables -I FORWARD -m state -d 192.168.122.0/24 --state NEW,RELATED,ESTABLISHED -j ACCEPT 68 | ``` 69 | 70 | # [Home](../OVERVIEW.md) 71 | 72 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 73 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 74 | -------------------------------------------------------------------------------- /provisioning/baremetal/MIRROR.md: -------------------------------------------------------------------------------- 1 | # Create PNDA mirror components 2 | 3 | ![](../images/breadcrumbs-mirror.jpg) 4 | 5 | ## Introduction 6 | 7 | As many real-world deployment environments don’t have Internet connectivity and online sources are not always dependable, PNDA is created 
from a cache of all the required software known as the "PNDA mirror". 8 | 9 | At provisioning time, when Minions execute SaltStack states, software from this server is downloaded, installed and configured. 10 | 11 | Before PNDA can be created, first we must create the directory structure and file sets to be placed on the PNDA mirror. 12 | 13 | ### CentOS or RHEL 14 | 15 | PNDA can be created on CentOS or RHEL instances. Before building the PNDA mirror components, decide which instance type your deployment will use. 16 | 17 | ## Create mirror 18 | 19 | #### Select build machine 20 | 21 | Designate or create the mirror build machine. 22 | 23 | This can be a physical machine or a VM but it needs to reflect the type of mirror you wish to build and must be clean. 24 | 25 | Two types of mirror are supported - 26 | 27 | - Red Hat Enterprise Linux 7 28 | - CentOS 7 29 | 30 | #### Obtain mirror build tools 31 | 32 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to create and maintain the mirror file sets. 33 | 34 | Clone this repository to the mirror creation machine. The tools are found in the [mirror folder](https://github.com/pndaproject/pnda/tree/master/mirror). 35 | 36 | #### Build mirror file sets 37 | 38 | To run the entire mirror creation process - 39 | 40 | ```sh 41 | sudo su 42 | ./create_mirror.sh 43 | ``` 44 | 45 | The script automatically detects the host Linux distribution and builds the appropriate file sets. 46 | 47 | This takes about 20 minutes to run and the output will be available in a directory named ```mirror-dist```. 48 | 49 | ##### Building parts of the mirror 50 | 51 | The different parts of the mirror can be created separately if required. 
The scripts to do this are - 52 | 53 | ``` 54 | create_mirror_rpm.sh 55 | create_mirror_misc.sh 56 | create_mirror_python.sh 57 | create_mirror_anaconda.sh 58 | create_mirror_cdh.sh 59 | create_mirror_hdp.sh 60 | ``` 61 | 62 | Each script creates its output in a directory named for the respective mirror type - 63 | 64 | ``` 65 | mirror_rpm 66 | mirror_misc 67 | mirror_python 68 | mirror_anaconda 69 | mirror_cloudera 70 | mirror_hdp 71 | ``` 72 | 73 | For more about creating and maintaining mirrors, please refer to the [repository notes](https://github.com/pndaproject/pnda/tree/master/mirror). 74 | 75 | # [Next](BUILD.md) 76 | 77 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 78 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 79 | -------------------------------------------------------------------------------- /provisioning/baremetal/PREPARE.md: -------------------------------------------------------------------------------- 1 | # Select & Prepare Platform 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | # Introduction 6 | 7 | The PNDA bare metal deployment process is similar to the process to deploy OpenStack itself. To deploy PNDA on top of bare metal nodes, OpenStack platform services are used. The two main services involved in this process are Ironic and Heat. 8 | 9 | The subject and content of this guide was originally developed and authored by [Fabien Andrieux](https://github.com/krickwix). 10 | 11 | # Preparing the bare metal cluster 12 | 13 | The preparation phase for bare metal is more involved than the other target platforms to which PNDA can be deployed, as before creating PNDA we first bring the physical cluster under the control of the OpenStack orchestration components. 
Once this is done, creation proceeds as per the OpenStack deployment process. 14 | 15 | Bare metal nodes on top of which PNDA is to be deployed will need to implement an IPMI interface for power management and be able to use PXE to boot and deploy an operating system. We make use of the pxe_ipmitool ironic driver which is generic enough to manage power management and PXE boot on a vast majority of servers. 16 | 17 | ## Overview 18 | 19 | The preparation steps are - 20 | 21 | - Identify and configure the hardware resources - servers and networks - that will be used 22 | - Create the Build Node 23 | - Gather bare-metal nodes specifications (IPMI ip address, IPMI credentials, MAC address) 24 | - Populate the Ironic database with node specifications 25 | - Introspect the nodes 26 | - Tag the nodes with profiles 27 | 28 | ### Identify and configure the hardware resources 29 | 30 | Firstly, a set of suitable hardware must be commissioned and configured. This includes, for example, making sure that all disks are set up with the desired RAID configuration, network switches and routers are properly configured and so on. 31 | 32 | To facilitate the remainder of the process it's useful at this stage to spend some time to compile an accurate inventory of all machine capabilities including CPUs, memory and storage as well as all interfaces, MAC addresses and so on. 33 | 34 | ### Designate the Build Node 35 | 36 | One machine must be designated as the build node. 37 | 38 | The purpose of this node is to host the functions required to conduct both initial orchestration and subsequent updates and maintenance of the PNDA cluster. 39 | 40 | It will have two network interfaces. One will be dedicated to provisioning and administration of the bare-metal nodes, the other one will provide direct external connectivity, as suggested below. 41 | 42 | ![](bm-deployment.png) 43 | 44 | The build node operating system must be either CentOS 7 or Red Hat Enterprise Linux 7. 
45 | 46 | On the build node, a number of additional nodes will be created, as KVMs. 47 | 48 | The process of creating these nodes is described in the remainder of this guide. 49 | 50 | # References 51 | 52 | [PNDA Website](http://pnda.io/) 53 | 54 | [OpenStack project documentation](http://docs.openstack.org/) 55 | 56 | [TripleO project documentation](http://tripleo.org/) 57 | 58 | # [Next](UNDERCLOUD.md) 59 | 60 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 61 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 62 | -------------------------------------------------------------------------------- /provisioning/baremetal/SALTMASTER.md: -------------------------------------------------------------------------------- 1 | # Setting up Saltmaster VM 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | ## Introduction 6 | 7 | PNDA uses SaltStack to take care of provisioning, managing configuration and upgrading at the software and services layer above the infrastructure. Please see [this quick overview of SaltStack](https://docs.saltstack.com/en/latest/topics/tutorials/walkthrough.html). 8 | 9 | SaltStack servers are designated either Master or Minion. The Master is the server that hosts all policies and configuration and pushes those to the minions. The Minions are the infrastructure hosts that are to be managed. All communication is encrypted and Minions are securely authenticated with the Master. 10 | 11 | This VM will be the SaltStack Master node and host all policies and configuration and coordinate the way these are communicated to the SaltStack Minions. 12 | 13 | ## Walkthrough 14 | 15 | These instructions are carried out on the Build Node. 
16 | 17 | #### Creating the VM 18 | 19 | ``` 20 | sudo qemu-img create -f qcow2 -o preallocation=metadata /var/lib/libvirt/images/pnda-master.qcow2 40G 21 | sudo virt-install --name=pnda-master --file=/var/lib/libvirt/images/pnda-master.qcow2 --graphics vnc,listen=0.0.0.0 --vcpus=1 --ram=4096 --network network=provisioning,model=virtio --os-type=linux --boot hd --dry-run --print-xml > pnda-master.xml 22 | sudo virsh define pnda-master.xml 23 | sudo virsh domiflist pnda-master 24 | ``` 25 | Remember the instance's mac address 26 | 27 | # [Next](REGISTER.md) 28 | 29 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Undercloud](UNDERCLOUD.md) | [Saltmaster](SALTMASTER.md) | [Register](REGISTER.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 30 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 31 | -------------------------------------------------------------------------------- /provisioning/baremetal/bm-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/baremetal/bm-deployment.png -------------------------------------------------------------------------------- /provisioning/baremetal/bm-inspecting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/baremetal/bm-inspecting.png -------------------------------------------------------------------------------- /provisioning/baremetal/bm-registration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/baremetal/bm-registration.png 
-------------------------------------------------------------------------------- /provisioning/baremetal/bm-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/baremetal/bm-workflow.png -------------------------------------------------------------------------------- /provisioning/building.md: -------------------------------------------------------------------------------- 1 | # PNDA Components 2 | 3 | PNDA has a number of components, some of which are built from PNDA source repositories and others which are obtained from online repositories. All of these components must be staged on a suitable HTTP server prior to provisioning a cluster in your target environment. 4 | 5 | For full instructions on creating PNDA and advice on staging the PNDA components, please refer the [repository notes](https://github.com/pndaproject/pnda/blob/master/build/README.md). 6 | -------------------------------------------------------------------------------- /provisioning/heat.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Heat 2 | 3 | [Heat](https://wiki.openstack.org/wiki/Heat) is the main project in the OpenStack Orchestration program. It implements an orchestration engine to launch multiple composite cloud applications based on templates in the form of text files that can be treated like code. 4 | 5 | Start understanding how Heat fits into OpenStack by reading the [architecture guide](http://docs.openstack.org/kilo/install-guide/install/apt/content/ch_overview.html#). Next, download and install the [Python command line clients](https://wiki.openstack.org/wiki/OpenStackClients). 
6 | 7 | Then, for a simple guide to getting started with developing templates for Heat you can check out the [tutorials](http://docs.openstack.org/developer/heat/template_guide/hello_world.html) on the OpenStack documentation site. 8 | 9 | Finally, follow the instructions for [creating PNDA on OpenStack](OVERVIEW.md). 10 | -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs-build.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs-build.jpg -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs-cfg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs-cfg.jpg -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs-create.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs-create.jpg -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs-mirror.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs-mirror.jpg -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs-stage.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs-stage.jpg -------------------------------------------------------------------------------- /provisioning/images/breadcrumbs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/images/breadcrumbs.jpg -------------------------------------------------------------------------------- /provisioning/openstack/BUILD.md: -------------------------------------------------------------------------------- 1 | # Create PNDA build components 2 | 3 | ![](../images/breadcrumbs-build.jpg) 4 | 5 | ## Introduction 6 | 7 | In addition to projects like Hadoop and Kafka, PNDA also includes a variety of components that provide an operations console, application deployment and more. Build these components before provisioning PNDA. 8 | 9 | ## Create build node 10 | 11 | #### Select build node 12 | 13 | Designate or create the PNDA build node. This could be the same machine that was used to build the mirror file sets. 14 | 15 | Two types of build node are supported - 16 | 17 | - Red Hat Enterprise Linux 7 18 | - CentOS 7 19 | 20 | 21 | #### Obtain build tools 22 | 23 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to build PNDA. 24 | 25 | Decide which version of PNDA you want to create. All PNDA releases are designated with a tag similar to ```release/4.0``` across all repositories. 26 | 27 | Clone this repository at the right version to the build node. 28 | 29 | #### Configure the proxy. (Optional) 30 | 31 | The entire PNDA build process can be performed from behind a non-transparent proxy. 
32 | 33 | To proceed in this mode, first set the system configuration and then run the ```set-proxy-env.sh``` script that will set up the various proxy configurations needed by the multiple build tools. 34 | 35 | ```sh 36 | sudo su 37 | export http_proxy=http://: 38 | export https_proxy=http://: 39 | . set-proxy-env.sh 40 | ``` 41 | 42 | #### Preparing the build environment 43 | 44 | The tools are found in the [build folder](https://github.com/pndaproject/pnda/tree/master/build). 45 | 46 | The script ```install-build-tools.sh``` installs all the necessary build prerequisites. 47 | 48 | Run it with superuser privileges in the location that you wish to install your build tools. 49 | 50 | In case you are using Redhat, you will need to override default RedHat repos by defining the following environment variables. The names should be substituted with the appropriate names for your environment. 51 | ```sh 52 | sudo su 53 | export RPM_OPTIONAL=rhel-7-server-optional-rpms 54 | export RPM_EXTRAS=rhel-7-server-extras-rpms 55 | ``` 56 | 57 | For example 58 | 59 | ```sh 60 | sudo su 61 | cd /home/builder 62 | ./install-build-tools.sh 63 | ``` 64 | 65 | As well as installing all the required software, it may pause and ask the operator to carry out some configuration on the build environment, for example adjusting the contents of /etc/hosts. 66 | 67 | Once it has finished, a file called ```set-pnda-env.sh``` will be found in the working directory. This script contains the necessary environment variables and other settings needed to carry out builds. It should either be added to the end of an initialization script such as ```/etc/bash.bashrc``` so that these settings are available for new shells, or it can be invoked with each build. 68 | 69 | For example 70 | 71 | ```sh 72 | sudo su 73 | cat >> /etc/bash.bashrc 74 | . /home/builder/set-pnda-env.sh 75 | ``` 76 | 77 | Your environment is now ready to build PNDA. 
78 | 79 | ## Building PNDA 80 | 81 | The script ```build-pnda.sh``` is invoked as a non-privileged user. 82 | 83 | If you are running behind a non-transparent proxy, go through the [proxy configuration](#configure-the-proxy-optional) steps again for the non-privileged user (don't substitute user). 84 | 85 | For example 86 | 87 | ```sh 88 | cd pnda/build/ 89 | ./build-pnda.sh RELEASE release/4.0 90 | ``` 91 | 92 | It is also possible to perform more complex builds including building to a specific bill-of-materials. Please refer to the [repository notes](https://github.com/pndaproject/pnda/tree/master/build). 93 | 94 | ## Build Products 95 | 96 | All build products are assembled in the directory ```pnda-dist```. 97 | 98 | # [Next](STAGE.md) 99 | 100 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 101 | | --- | --- | --- | --- | --- | --- | --- | 102 | -------------------------------------------------------------------------------- /provisioning/openstack/CONFIGURE.md: -------------------------------------------------------------------------------- 1 | # Configure PNDA creation process 2 | 3 | ![](../images/breadcrumbs-cfg.jpg) 4 | 5 | ## Introduction 6 | 7 | The PNDA creation process is controlled primarily via a YAML configuration file. 8 | 9 | A template YAML configuration can be found in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli). 10 | 11 | ## Configure pnda_env.yaml 12 | 13 | #### Designate client machine 14 | 15 | Create or designate a suitable machine for running the PNDA CLI. We recommend CentOS 7. 16 | 17 | #### Obtain code 18 | 19 | Clone the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) repository from the master branch at a specific release tag (e.g. ```release/4.0```) to the client machine. 
20 | 21 | Copy ```pnda_env_example.yaml``` to create ```pnda_env.yaml``` 22 | 23 | #### Set access credentials 24 | 25 | Set the following fields under `openstack_parameters` section in `pnda_env.yaml` . The values can be obtained by referring to the Keystone authentication details obtained in the preparation phase. 26 | 27 | | Field | Value | 28 | | --- | --- | 29 | |KEYSTONE_USER| User for creating PNDA | 30 | |KEYSTONE_PASSWORD| Password for user | 31 | |KEYSTONE_TENANT| Tenant for creating PNDA | 32 | |KEYSTONE_AUTH_URL| Authorization URL | 33 | |KEYSTONE_AUTH_VERSION| 2 | 34 | |KEYSTONE_REGION_NAME| Region name | 35 | 36 | #### Object storage 37 | 38 | Set `pnda_application_repo.PNDA_APPS_CONTAINER` to the Application container configured during the preparation phase. 39 | 40 | Set `pnda_application_repo.PNDA_APPS_FOLDER` to the Application folder configured during the preparation phase. 41 | 42 | Set `pnda_data_archive.PNDA_ARCHIVE_CONTAINER` to the Dataset archive container configured during the preparation phase. 43 | 44 | #### Hadoop distribution 45 | 46 | Decide whether you want to run the Cloudera CDH or the Hortonworks HDP Hadoop distribution. 47 | 48 | Set `hadoop.HADOOP_DISTRO` to either `CDH` or `HDP`. 49 | 50 | #### Set source of SaltStack provisioning scripts 51 | 52 | The PNDA software is installed and configured using the SaltStack code found in the [platform-salt](https://github.com/pndaproject/platform-salt) repository. There are two main options to provide source for platform-salt: 53 | 54 | 1. Set `platform_salt.PLATFORM_GIT_REPO_URI` to the remote git URI and `platform_salt.PLATFORM_GIT_BRANCH` at the specified branch to be cloned during provisioning. 55 | If authenticated access to `platform_salt.PLATFORM_GIT_REPO_URI` is required, then place the ssh key to use, named git.pem, in the top level directory of "pnda-cli" repository and also set `platform_salt.PLATFORM_GIT_REPO_HOST` to the hostname of the server. 56 | 57 | 2. 
A local copy of platform-salt can be used by setting (`platform_salt.PLATFORM_SALT_LOCAL`) to the path to the platform-salt folder on the local machine running pnda-cli.py. 58 | 59 | #### PNDA mirror 60 | 61 | Set `mirrors.PNDA_MIRROR` to the URI determined by the placement of the mirror and build components in the staging phase. 62 | 63 | #### Other fields 64 | 65 | There are a wide range of parameters that can be set, please refer to ```pnda_env_example.yaml``` in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) for more details. 66 | 67 | ## Security Material 68 | 69 | #### Perimeter security (FQDN's and associated certificates/private keys) 70 | Access to the PNDA cluster requires user authentication over a secure connection. In order to secure this user authentication, the perimeter servers require certification material which allows validating the FQDN used to access those servers to further authenticate and secure the connection to those servers. 71 | 72 | For PRODUCTION ENVIRONMENTS, this security material MUST be generated outside the PNDA realm and dropped under the [platform-certificates](https://github.com/pndaproject/pnda-cli/tree/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/platform-certificates) directory tree. Consult the README files under that same directory and sub-directories for further details on the required material. 73 | 74 | For NON-PRODUCTION ENVIRONMENTS, a helper tool ([tools/gen-certs.py](https://github.com/pndaproject/pnda-cli/blob/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/tools/gen-certs.py)) is provided that can auto-generate the required server certificates based on an existing CA (private key) or based on a newly generated CA (when no private key is detected in the ./platform-certificates directory by the helper tool). 
75 | 76 | # [Next](CREATE.md) 77 | 78 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 79 | | --- | --- | --- | --- | --- | --- | --- | 80 | -------------------------------------------------------------------------------- /provisioning/openstack/CREATE.md: -------------------------------------------------------------------------------- 1 | # Create PNDA cluster 2 | 3 | ![](../images/breadcrumbs-create.jpg) 4 | 5 | ## Introduction 6 | 7 | With the target platform fully prepared, the PNDA software staged and the YAML configuration completed, the final step is to invoke the PNDA CLI to create the cluster. 8 | 9 | ## Create PNDA 10 | 11 | #### Install CLI dependencies 12 | 13 | To use the PNDA command line interface, you will need to install the python, heat and nova clients. To install them on CentOS, run: 14 | 15 | ``` 16 | sudo yum install -y epel-release 17 | sudo yum install -y python python-pip python-devel gcc 18 | 19 | cd cli 20 | sudo pip install -r requirements.txt 21 | ``` 22 | 23 | #### CLI invocation 24 | 25 | The pnda-cli.py script allows launching a PNDA deployment. It sits in the cli subdirectory. 26 | 27 | ``` 28 | cd cli 29 | ./pnda-cli.py 30 | usage: pnda-cli.py [-h] [-e PNDA_CLUSTER] [-n DATANODES] [-o OPENTSDB_NODES] 31 | [-k KAFKA_NODES] [-z ZK_NODES] [-f FLAVOR] [-s KEYNAME] 32 | [-x] [-b BRANCH] [-d] [-m X_MACHINES_DEFINITION] [-v] 33 | {create,expand,destroy} 34 | ``` 35 | 36 | | Options | Description | 37 | | --- | --- | 38 | | -e | Namespaced environment for machines in this cluster. 
39 | | -n | How many datanodes needed for the hadoop cluster 40 | | -o | How many Open TSDB nodes for the hadoop cluster 41 | | -k | How many kafka nodes for the databus cluster 42 | | -z | How many zookeeper nodes for the databus cluster 43 | | -f | PNDA flavours: ['pico', 'standard'] 44 | | -s | Name of key-pair name created in the OpenStack Tenant during the preparation phase. 45 | | -b | Branch of [platform-salt](https://github.com/pndaproject/platform-salt) to use. Overrides value in pnda_env.yaml 46 | | -v | Verbose logging 47 | 48 | Note: Make sure you have access to the private key of this key pair otherwise you will not be able to connect to the gateway node and access the cluster. 49 | 50 | Examples on invocation of CLI: 51 | 52 | ##### Create new cluster: 53 | ``` 54 | cd cli 55 | pnda-cli.py create -e -s -f standard -o 2 -n 3 -k 2 -z 3 -b release/4.0 56 | ``` 57 | The options shown select the standard flavor, 2 openTSDB instances, 3 hadoop datanodes, 2 kafka brokers, and 3 zookeeper nodes. If you need to operate within the Openstack tenant instance quota of 20 instances then you can reduce this to 1 kafka and 1 zookeeper instance or use the pico flavor. 58 | 59 | ``` 60 | pnda-cli.py create -e -s -f standard -o 1 -n 1 -k 1 -z 1 -b release/4.0 61 | pnda-cli.py create -e -s -f pico -n 1 -k 1 -b release/4.0 62 | ``` 63 | 64 | ##### Destroy existing cluster: 65 | ``` 66 | pnda-cli.py destroy -e 67 | ``` 68 | ##### Expand existing cluster: 69 | ``` 70 | pnda-cli.py expand -e -f standard -s -n 10 -k 5 71 | ``` 72 | Either, or both, kafka (k) and datanodes (n) can be changed. 73 | The value specifies the new total number of nodes. 74 | Shrinking is not supported - this must be done very carefully to avoid data loss. 75 | 76 | To orchestrate PNDA on Openstack clone [pnda-cli repository](https://github.com/pndaproject/pnda-cli). 
77 | 78 | **Important:** ensure you are certain what version of PNDA you want to deploy, and specify the correct branch or tag when invoking the CLI using the -b option. In most circumstances you'll want to make sure the branch or tag you specify is identical to the branch or tag you used to build the PNDA mirror, and identical to the version you checked out from the pnda-cli repository. All PNDA releases are designated with a tag such as ```release/4.0``` across all repositories. 79 | 80 | # [Home](../OVERVIEW.md) 81 | 82 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 83 | | --- | --- | --- | --- | --- | --- | --- | 84 | -------------------------------------------------------------------------------- /provisioning/openstack/EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # HTTP Server examples 2 | 3 | These are some examples of how an HTTP server can be set up to serve files for PNDA provisioning. 4 | 5 | These notes are intended for guidance only and will not be maintained or supported. You are strongly advised to refer to the official documentation for each technology. 
6 | 7 | ## Apache HTTP Server on CentOS 8 | 9 | Assuming a working CentOS 7 server and that built components are in directory pnda-dist below the current working directory - 10 | 11 | Install Apache HTTP Server - 12 | 13 | sudo yum install httpd 14 | 15 | Copy components to the HTTP server - 16 | 17 | cp pnda-dist/* /var/www/html/ 18 | 19 | The components are now available via HTTP from the server - 20 | 21 | http://server/some-component-0.1.0.tar.gz 22 | 23 | ## Python 24 | 25 | Assuming that the built components are in the current working directory - 26 | 27 | sudo python -m SimpleHTTPServer 80 28 | 29 | The components are now available via HTTP from the server - 30 | 31 | http://server/some-component-0.1.0.tar.gz 32 | 33 | ## Vagrant 34 | 35 | The starting assumption is that you have a working Vagrant installation including a suitable provider plugin for the target environment and that built components are in directory pnda-dist below the current working directory. 36 | 37 | **AWS** 38 | 39 | For this example we'll use a popular [AWS provider](https://github.com/mitchellh/vagrant-aws). 
40 | 41 | Create a Vagrantfile, substituting in your AWS configuration - 42 | 43 | VAGRANTFILE_API_VERSION = "2" 44 | 45 | Vagrant.configure("2") do |config| 46 | config.vm.box = "dummy" 47 | 48 | config.vm.provider :aws do |aws, override| 49 | aws.access_key_id = "" 50 | aws.secret_access_key = "" 51 | aws.keypair_name = "" 52 | aws.ami = "" 53 | aws.region = "" 54 | aws.security_groups = [ '' ] 55 | override.ssh.username = "centos" 56 | override.ssh.private_key_path = "" 57 | end 58 | config.vm.provision :shell, path: "bootstrap.sh" 59 | end 60 | 61 | Create a bootstrap.sh - 62 | 63 | yum install -y httpd 64 | rm -rf /var/www/html 65 | ln -fs /vagrant/pnda-dist /var/www/html 66 | 67 | Start the instance - 68 | 69 | vagrant up --provider=aws 70 | 71 | The components are now available via HTTP from the server in AWS - 72 | 73 | http://server/some-component-0.1.0.tar.gz 74 | 75 | If you need to update the components later - 76 | 77 | vagrant rsync 78 | 79 | **OpenStack** 80 | 81 | For this example we'll use a popular [OpenStack provider](https://github.com/cloudbau/vagrant-openstack-plugin). 
82 | 83 | Create a Vagrantfile, substituting in your OpenStack configuration - 84 | 85 | Vagrant.configure("2") do |config| 86 | config.vm.box = "dummy" 87 | config.ssh.username = "" 88 | config.ssh.private_key_path = "" 89 | 90 | config.vm.provider :openstack do |os| 91 | os.username = "" 92 | os.api_key = "" 93 | os.flavor = "" 94 | os.region = "" 95 | os.image = "" 96 | os.endpoint = "" 97 | os.tenant = "" 98 | os.keypair_name = "" 99 | os.floating_ip = :auto 100 | os.floating_ip_pool = "" 101 | os.networks = [''] 102 | end 103 | config.vm.provision :shell, path: "bootstrap.sh" 104 | end 105 | 106 | Create a bootstrap.sh - 107 | 108 | yum install -y httpd 109 | rm -rf /var/www/html 110 | ln -fs /vagrant/pnda-dist /var/www/html 111 | 112 | Start the instance - 113 | 114 | vagrant up --provider=openstack 115 | 116 | The components are now available via HTTP from the server in OpenStack - 117 | 118 | http://server/some-component-0.1.0.tar.gz 119 | 120 | If you need to update the components later - 121 | 122 | vagrant provision 123 | 124 | ## Docker 125 | 126 | Assuming a working Docker installation and that built components are in directory pnda-dist below the current working directory - 127 | 128 | Create a Dockerfile - 129 | 130 | FROM httpd:2.4 131 | COPY ./pnda-dist/ /usr/local/apache2/htdocs/ 132 | 133 | Build the container - 134 | 135 | sudo docker build -t package-server . 
136 | sudo docker run -dit -p 80:80 --name package-server package-server 137 | 138 | The components are now available via HTTP from the server - 139 | 140 | http://server/some-component-0.1.0.tar.gz 141 | 142 | If you need to update the components later - 143 | 144 | sudo docker cp pnda-dist/some-component-0.2.0.tar.gz package-server:/usr/local/apache2/htdocs/ 145 | 146 | 147 | -------------------------------------------------------------------------------- /provisioning/openstack/IMAGE.md: -------------------------------------------------------------------------------- 1 | # Select & Prepare Platform 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | ## Creating PNDA image 6 | 7 | Deploying PNDA using Heat templates requires an image with some pre-installed elements, such as `os-collect-config`. 8 | 9 | PNDA currently runs on CentOS/RHEL, but you can use Ubuntu or CentOS/RHEL OSes to create the PNDA image. 10 | 11 | ### Pre-requisites 12 | 13 | **Important:** these dependencies install correctly on an Ubuntu 14.04 *server* image but fail on a *desktop* images. 14 | 15 | If you are on Ubuntu: 16 | ``` 17 | sudo apt-get -y install python-pip python-dev qemu-utils libguestfs-tools 18 | ``` 19 | 20 | If you are on CentOS/RHEL: 21 | ``` 22 | sudo yum install epel-release 23 | sudo yum install python-pip python-devel libguestfs-tools 24 | ``` 25 | 26 | ### Setting up a virtualenv 27 | 28 | Install virtualenv: 29 | 30 | ``` 31 | pip install --user virtualenv 32 | ``` 33 | 34 | Create the virtual environment: 35 | 36 | ``` 37 | virtualenv /path/to/project/pnda-dib 38 | . /path/to/project/pnda-dib/bin/activate 39 | ``` 40 | 41 | ### Getting the required elements 42 | 43 | Update submodules: 44 | ``` 45 | git submodule init 46 | git submodule update 47 | ``` 48 | 49 | Install `openstack/diskimage-builder`: 50 | 51 | ``` 52 | cd dib-utils 53 | python setup.py install 54 | cd .. 55 | cd diskimage-builder 56 | python setup.py install 57 | cd .. 
58 | pip install six 59 | pip install PyYAML 60 | ``` 61 | 62 | Set up environment variables, assuming you currently are in this repository's project directory (there is at least an elements directory present): 63 | 64 | 65 | ``` 66 | cat > dib_env.sh <: 48 | export https_proxy=http://: 49 | . set-proxy-env.sh 50 | ``` 51 | 52 | #### Build mirror file sets 53 | 54 | The build tools are found in the [mirror folder](https://github.com/pndaproject/pnda/tree/master/mirror). 55 | 56 | To run the entire mirror creation process - 57 | 58 | ```sh 59 | sudo su 60 | ./create_mirror.sh 61 | ``` 62 | 63 | The script automatically detects the host Linux distribution and builds the appropriate file sets. 64 | 65 | This takes about 20 minutes to run and the output will be available in a directory named ```mirror-dist```. 66 | 67 | ##### Building parts of the mirror 68 | 69 | The different parts of the mirror can be created separately if required. The scripts to do this are - 70 | 71 | ``` 72 | create_mirror_rpm.sh 73 | create_mirror_misc.sh 74 | create_mirror_python.sh 75 | create_mirror_cdh.sh 76 | create_mirror_hdp.sh 77 | create_mirror_apps.sh 78 | ``` 79 | 80 | Each script creates its output in a directory named for the respective mirror type - 81 | 82 | ``` 83 | mirror_rpm 84 | mirror_misc 85 | mirror_python 86 | mirror_cloudera 87 | mirror_hdp 88 | mirror_apps 89 | ``` 90 | 91 | For more about creating and maintaining mirrors, please refer to the [repository notes](https://github.com/pndaproject/pnda/tree/master/mirror). 
92 | 93 | # [Next](BUILD.md) 94 | 95 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 96 | | --- | --- | --- | --- | --- | --- | --- | 97 | -------------------------------------------------------------------------------- /provisioning/openstack/STAGE.md: -------------------------------------------------------------------------------- 1 | # Stage PNDA mirror & components 2 | 3 | ![](../images/breadcrumbs-stage.jpg) 4 | 5 | ## Introduction 6 | 7 | In order to make built components available during the PNDA provisioning process, they need to be staged in a location that is accessible from the target environment via HTTP. 8 | 9 | ## Create server 10 | 11 | Create an ordinary HTTP server in the target environment or identify an existing server. The server must have connectivity with the PNDA cluster being provisioned. 12 | 13 | See [these tips](EXAMPLES.md) for rapidly creating an HTTP server using a number of different approaches. 14 | 15 | Your existing CICD system may already include the capability to host and serve build artifacts over HTTP. In this case, simply make use of an appropriate location on the existing resource. 16 | 17 | ## Stage files 18 | 19 | Copy the *contents* of ```mirror-dist``` and ```pnda-dist``` from the mirror creation and build steps respectively to the HTTP server. 20 | 21 | The final directory layout should resemble the following - 22 | 23 | ``` 24 | pnda-root 25 | │ 26 | ├── console-backend-data-logger-develop.tar.gz 27 | ├── console-backend-data-logger-develop.tar.gz.sha512.txt 28 | ├── etc 29 | │ 30 | ├── mirror_python 31 | │   ├── packages/ 32 | │ ├── simple/ 33 | │ 34 | ├── mirror_rpm 35 | │   ├── a-rpm.rpm 36 | │ ├── etc 37 | │ 38 | ├── etc 39 | ``` 40 | 41 | Note that ```pnda-root``` can be any location, all that is required is that the hierarchy under this is available via a known URI. 
For example, using a standard Apache 2 installation on RHEL 7, if the hierarchy above is placed in ```/var/www/html``` this will be available via the URI ```http:///``` since ```/var/www/html``` is the default *document root*. Please refer to your HTTP server documentation for more details. 42 | 43 | Make a note of the URI to ```pnda-root``` as this will be used in configuring the PNDA creation process. 44 | 45 | # [Next](CONFIGURE.md) 46 | 47 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 48 | | --- | --- | --- | --- | --- | --- | --- | 49 | -------------------------------------------------------------------------------- /provisioning/openstack/adam.txt: -------------------------------------------------------------------------------- 1 | Adam -------------------------------------------------------------------------------- /provisioning/openstack/images/attach-policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/attach-policy.png -------------------------------------------------------------------------------- /provisioning/openstack/images/create-policy1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/create-policy1.png -------------------------------------------------------------------------------- /provisioning/openstack/images/create-policy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/create-policy2.png 
-------------------------------------------------------------------------------- /provisioning/openstack/images/create-policy3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/create-policy3.png -------------------------------------------------------------------------------- /provisioning/openstack/images/create-user1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/create-user1.png -------------------------------------------------------------------------------- /provisioning/openstack/images/create-user2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/provisioning/openstack/images/create-user2.png -------------------------------------------------------------------------------- /provisioning/saltstack.md: -------------------------------------------------------------------------------- 1 | # Getting started with SaltStack 2 | 3 | PNDA uses SaltStack for deployment automation. A good place to start is the [SaltStack Get Started Guide](https://docs.saltstack.com/en/getstarted/). Here you will learn the fundamentals of SaltStack. 4 | 5 | Next, read about the [SaltStack Components](https://docs.saltstack.com/en/getstarted/overview.html) that form the basis of Salt: the Salt Master, the [minions](http://img0.ndsstatic.com/wallpapers/c42ac6a0c2aaee69c79955d1d32c54b4_large.jpeg) that receive commands from the master, modules, formulas, grains and more. 
6 | 7 | Once you've understood the principles, you could then go to the [SaltStack walk-through](https://docs.saltstack.com/en/latest/topics/tutorials/walkthrough.html) where you will install Salt and can begin experimenting with it. 8 | 9 | As an alternative to installing software locally, it is possible to use the [SaltStack Demo Environment](https://docs.saltstack.com/en/getstarted/fundamentals/index.html), which leverages VirtualBox and Vagrant. In less than an hour you can be up and ready to go! 10 | -------------------------------------------------------------------------------- /provisioning/server-cluster/BUILD.md: -------------------------------------------------------------------------------- 1 | # Create PNDA build components 2 | 3 | ![](../images/breadcrumbs-build.jpg) 4 | 5 | ## Introduction 6 | 7 | In addition to projects like Hadoop and Kafka, PNDA also includes a variety of components that provide an operations console, application deployment and more. Build these components before provisioning PNDA. 8 | 9 | ## Create build node 10 | 11 | #### Select build node 12 | 13 | Designate or create the PNDA build node. This could be the same machine that was used to build the mirror file sets. 14 | 15 | Two types of build node are supported - 16 | 17 | - Red Hat Enterprise Linux 7 18 | - CentOS 7 19 | 20 | 21 | #### Obtain build tools 22 | 23 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to build PNDA. 24 | 25 | Decide which version of PNDA you want to create. All PNDA releases are designated with a tag similar to ```release/4.0``` across all repositories. 26 | 27 | Clone this repository at the right version to the build node. 28 | 29 | #### Configure the proxy. (Optional) 30 | 31 | The entire PNDA build process can be performed from behind a non-transparent proxy. 
32 | 33 | To proceed in this mode, first set the system configuration and then run the ```set-proxy-env.sh``` script that will set up the various proxy configurations needed by the multiple build tools. 34 | 35 | ```sh 36 | sudo su 37 | export http_proxy=http://: 38 | export https_proxy=http://: 39 | . set-proxy-env.sh 40 | ``` 41 | 42 | #### Preparing the build environment 43 | 44 | The tools are found in the [build folder](https://github.com/pndaproject/pnda/tree/master/build). 45 | 46 | The script ```install-build-tools.sh``` installs all the necessary build prerequisites. 47 | 48 | Run it with superuser privileges in the location that you wish to install your build tools. 49 | 50 | For example 51 | 52 | ```sh 53 | sudo su 54 | cd /home/builder 55 | ./install-build-tools.sh 56 | ``` 57 | 58 | As well as installing all the required software, it may pause and ask the operator to carry out some configuration on the build environment, for example adjusting the contents of /etc/hosts. 59 | 60 | The script generates a file called ```set-pnda-env.sh``` containing the necessary environment settings needed to carry out builds. Ensure this is invoked before each build. 61 | 62 | For example 63 | 64 | ```sh 65 | . /home/builder/set-pnda-env.sh 66 | ``` 67 | 68 | Your environment is now ready to build PNDA. 69 | 70 | ## Building PNDA 71 | 72 | The script ```build-pnda.sh``` is invoked as a non-privileged user. 73 | 74 | If you are running behind a non-transparent proxy, go through the [proxy configuration](#configure-the-proxy-optional) steps again for the non-privileged user (don't substitute user). 75 | 76 | For example 77 | 78 | ```sh 79 | cd pnda 80 | ./build-pnda.sh RELEASE release/4.0 81 | ``` 82 | 83 | It is also possible to perform more complex builds including building to a specific bill-of-materials. Please refer to the [repository notes](https://github.com/pndaproject/pnda). 
84 | 85 | ## Build Products 86 | 87 | All build products are assembled in the directory ```pnda-dist```. 88 | 89 | # [Next](STAGE.md) 90 | 91 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 92 | | --- | --- | --- | --- | --- | --- | --- | 93 | -------------------------------------------------------------------------------- /provisioning/server-cluster/CONFIGURE.md: -------------------------------------------------------------------------------- 1 | # Configure PNDA creation process 2 | 3 | ![](../images/breadcrumbs-cfg.jpg) 4 | 5 | ## Introduction 6 | 7 | The PNDA creation process is executed over SSH using key based authentication, and controlled via a YAML configuration file. 8 | 9 | A template YAML configuration can be found in the [PNDA CLI repository](https://github.com/pndaproject/pnda-cli). 10 | 11 | ## Designate client machine 12 | 13 | Create or designate a suitable machine for running the PNDA CLI. We recommend CentOS 7. 14 | 15 | ## Obtain code 16 | 17 | Clone the [PNDA CLI repository](https://github.com/pndaproject/pnda-cli) repository from the master branch at a specific release tag (e.g. ```release/3.5```) to the client machine. 18 | 19 | Copy ```pnda_env_example.yaml``` to create ```pnda_env.yaml``` 20 | 21 | ## SSH key pair 22 | 23 | Create an SSH key pair for use when configuring the PNDA nodes as ```key_name```. 24 | 25 | ```sh 26 | ssh-keygen -b 2048 -t rsa -f key_name.pem -q -N "" 27 | ``` 28 | 29 | Place the private key ```key_name.pem``` in the root of the pnda-cli directory. 30 | 31 | Ensure that key_name.pem has 0600 permissions. 
32 | 33 | ## Configure sudo user 34 | 35 | Create a sudo user `` on each machine: 36 | 37 | ```sh 38 | sudo su 39 | adduser --disabled-password --gecos "" 40 | echo ALL=NOPASSWD: ALL >> /etc/sudoers 41 | ``` 42 | Allow login for that user with the created key on each machine: 43 | 44 | ```sh 45 | mkdir /home//.ssh 46 | cat key_name.pem.pub >> /home//.ssh/authorized_keys 47 | ``` 48 | 49 | ## Configure pnda_env.yaml 50 | 51 | ##### Set the OS user to be used in provisioning 52 | 53 | Set `OS_USER` to the appropriate value. 54 | 55 | ##### Configure local storage of applications 56 | 57 | Set `pnda_application_repo: PR_FS_TYPE` to `local`. 58 | 59 | ##### Configure NTP server 60 | 61 | Set `ntp: NTP_SERVERS` to an appropriate value for your cluster. 62 | 63 | ##### Hadoop distribution 64 | 65 | Decide whether you want to run the Cloudera CDH or the Hortonworks HDP Hadoop distribution. 66 | 67 | Set `hadoop.HADOOP_DISTRO` to either `CDH` or `HDP`. 68 | 69 | ##### Set source of SaltStack provisioning scripts 70 | 71 | The PNDA software is installed and configured using the SaltStack code found in the [platform-salt](https://github.com/pndaproject/platform-salt) repository. This can be supplied in two main ways. 72 | 73 | ##### Use local copy of platform-salt 74 | 75 | A local copy of platform-salt can be used by setting `platform_salt.PLATFORM_SALT_LOCAL` to the path to the platform-salt folder on the client machine. 76 | 77 | ##### Git repository 78 | 79 | Set `platform_salt.PLATFORM_GIT_REPO_URI` and `platform_salt.PLATFORM_GIT_BRANCH` to clone a remote git URI at the specified branch during provisioning. 80 | 81 | If authenticated access to `platform_salt.PLATFORM_GIT_REPO_URI` is required then place the ssh key to use, named git.pem, in the top level directory of this repository and set `platform_salt.PLATFORM_GIT_REPO_HOST` to the hostname of the server. 
82 | 83 | **Note** Please ensure that the local clone of platform-salt or `platform_salt.PLATFORM_GIT_BRANCH` correspond to the same release tag as the pnda-cli repository cloned above. 84 | 85 | ##### PNDA mirror 86 | 87 | Set `mirrors.PNDA_MIRROR` to the URI determined by the placement of the mirror and build components in the staging phase. 88 | 89 | ##### Other fields 90 | 91 | There are a wide range of parameters that can be set, please refer to ```pnda_env_example.yaml``` in the [PNDA CLI repository](https://github.com/pndaproject/pnda-cli) for more details. 92 | 93 | ## Security Material 94 | 95 | #### Perimeter security (FQDN's and associated certificates/private keys) 96 | Access to the PNDA cluster requires user authentication over a secure connection. In order to secure this user authentication, the perimeter servers require certification material which allows validating the FQDN used to access those servers to further authenticate and secure the connection to those servers. 97 | 98 | For PRODUCTION ENVIRONMENTS, this security material MUST be generated outside the PNDA realm and dropped under the [platform-certificates](https://github.com/pndaproject/pnda-cli/tree/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/platform-certificates) directory tree. Consult the README files under that same directory and sub-directories for further details on the required material. 99 | 100 | For NON-PRODUCTION ENVIRONMENTS, a helper tool ([tools/gen-certs.py](https://github.com/pndaproject/pnda-cli/blob/cfa40dbd94afaa5e3f3080106c852fb6c1e2d516/tools/gen-certs.py)) is provided that can auto-generate the required server certificates based on an existing CA (private key) or based on a newly generated CA (when no private key is detected in the ./platform-certificates directory by the helper tool). 101 | 102 | ## Cluster descriptor 103 | 104 | Edit existing-machines/production.json (or a copy of this file) with the IP addresses for the machines in the cluster. 
ip_address must be reachable from each machine but does not have to be reachable from where the cli is run. Add or remove datanodes, kafka nodes, zookeeper nodes and opentsdb nodes to match the number of machines that you have. 105 | 106 | ## Volume descriptor 107 | 108 | Review the [volume config mapping](https://github.com/pndaproject/pnda-cli/blob/develop/bootstrap-scripts/production/volume-config.yaml) for your flavor so the number of disk volumes you have matches the ones being requested. By default the volumes are assigned out in descending size order, but it is possible to specify the device names and hard code the mappings in the volume config file. The disks can also be partitioned automatically. For an example of specifying device names and partitioning see [this volume config file](https://github.com/pndaproject/pnda-cli/blob/develop/bootstrap-scripts/production/volume-config.yaml). 109 | 110 | # [Next](CREATE.md) 111 | 112 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 113 | | --- | --- | --- | --- | --- | --- | --- | 114 | -------------------------------------------------------------------------------- /provisioning/server-cluster/CREATE.md: -------------------------------------------------------------------------------- 1 | # Create PNDA cluster 2 | 3 | ![](../images/breadcrumbs-create.jpg) 4 | 5 | ## Introduction 6 | 7 | With the target platform fully prepared, the PNDA software staged and the YAML configuration completed, the final step is to invoke the PNDA CLI to create the cluster. 8 | 9 | ## Create PNDA 10 | 11 | #### Install CLI dependencies 12 | 13 | On the client machine, install the pip packages required by the CLI. 
Navigate to the folder containing the pnda-cli repository, then run the following commands: 14 | 15 | ``` 16 | cd cli 17 | sudo pip install -r requirements.txt 18 | ``` 19 | 20 | #### CLI invocation 21 | 22 | **Important:** ensure you are certain what version of PNDA you want to deploy, and specify the correct branch or tag when invoking the CLI using the -b option. In most circumstances you'll want to make sure the branch or tag you specify is identical to the branch or tag you used to build the PNDA mirror, and identical to the version you checked out from the pnda-cli repository. All PNDA releases are designated with a tag such as ```release/4.0``` across all repositories. 23 | 24 | An example CLI invocation - 25 | 26 | ``` 27 | pnda-cli.py -e pnda -s mykey -f production -m 'existing-machines/production.json' create -b release/4.0 28 | ``` 29 | 30 | Note the -m flag for existing-machines installation. 31 | 32 | Note also that CLI parameter -s refers to the key configured in the previous phase. For example, if the keypair in AWS is 'pnda' then the local private key file should be named 'pnda.pem' and you should pass '-s pnda' to the CLI. 33 | 34 | This phase of the installation typically takes around 45 minutes to an hour to complete. 35 | 36 | # [Home](../OVERVIEW.md) 37 | 38 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 39 | | --- | --- | --- | --- | --- | --- | --- | 40 | -------------------------------------------------------------------------------- /provisioning/server-cluster/EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # HTTP Server examples 2 | 3 | These are some examples of how an HTTP server can be set up to serve files for PNDA provisioning. 4 | 5 | These notes are intended for guidance only and will not be maintained or supported. 
You are strongly advised to refer to the official documentation for each technology. 6 | 7 | ## Apache HTTP Server on CentOS 8 | 9 | Assuming a working CentOS 7 server and that built components are in directory pnda-dist below the current working directory - 10 | 11 | Install Apache HTTP Server - 12 | 13 | sudo yum install httpd 14 | 15 | Copy components to the HTTP server - 16 | 17 | cp pnda-dist/* /var/www/html/ 18 | 19 | The components are now available via HTTP from the server - 20 | 21 | http://server/some-component-0.1.0.tar.gz 22 | 23 | ## Python 24 | 25 | Assuming that the built components are in the current working directory - 26 | 27 | sudo python -m SimpleHTTPServer 80 28 | 29 | The components are now available via HTTP from the server - 30 | 31 | http://server/some-component-0.1.0.tar.gz 32 | 33 | ## Vagrant 34 | 35 | The starting assumption is that you have a working Vagrant installation including a suitable provider plugin for the target environment and that built components are in directory pnda-dist below the current working directory. 36 | 37 | **AWS** 38 | 39 | For this example we'll use a popular [AWS provider](https://github.com/mitchellh/vagrant-aws). 
40 | 41 | Create a Vagrantfile, substituting in your AWS configuration - 42 | 43 | VAGRANTFILE_API_VERSION = "2" 44 | 45 | Vagrant.configure("2") do |config| 46 | config.vm.box = "dummy" 47 | 48 | config.vm.provider :aws do |aws, override| 49 | aws.access_key_id = "" 50 | aws.secret_access_key = "" 51 | aws.keypair_name = "" 52 | aws.ami = "" 53 | aws.region = "" 54 | aws.security_groups = [ '' ] 55 | override.ssh.username = "centos" 56 | override.ssh.private_key_path = "" 57 | end 58 | config.vm.provision :shell, path: "bootstrap.sh" 59 | end 60 | 61 | Create a bootstrap.sh - 62 | 63 | yum install -y httpd 64 | rm -rf /var/www/html 65 | ln -fs /vagrant/pnda-dist /var/www/html 66 | 67 | Start the instance - 68 | 69 | vagrant up --provider=aws 70 | 71 | The components are now available via HTTP from the server in AWS - 72 | 73 | http://server/some-component-0.1.0.tar.gz 74 | 75 | If you need to update the components later - 76 | 77 | vagrant rsync 78 | 79 | **OpenStack** 80 | 81 | For this example we'll use a popular [OpenStack provider](https://github.com/cloudbau/vagrant-openstack-plugin). 
82 | 83 | Create a Vagrantfile, substituting in your OpenStack configuration - 84 | 85 | Vagrant.configure("2") do |config| 86 | config.vm.box = "dummy" 87 | config.ssh.username = "" 88 | config.ssh.private_key_path = "" 89 | 90 | config.vm.provider :openstack do |os| 91 | os.username = "" 92 | os.api_key = "" 93 | os.flavor = "" 94 | os.region = "" 95 | os.image = "" 96 | os.endpoint = "" 97 | os.tenant = "" 98 | os.keypair_name = "" 99 | os.floating_ip = :auto 100 | os.floating_ip_pool = "" 101 | os.networks = [''] 102 | end 103 | config.vm.provision :shell, path: "bootstrap.sh" 104 | end 105 | 106 | Create a bootstrap.sh - 107 | 108 | yum install -y httpd 109 | rm -rf /var/www/html 110 | ln -fs /vagrant/pnda-dist /var/www/html 111 | 112 | Start the instance - 113 | 114 | vagrant up --provider=openstack 115 | 116 | The components are now available via HTTP from the server in OpenStack - 117 | 118 | http://server/some-component-0.1.0.tar.gz 119 | 120 | If you need to update the components later - 121 | 122 | vagrant provision 123 | 124 | ## Docker 125 | 126 | Assuming a working Docker installation and that built components are in directory pnda-dist below the current working directory - 127 | 128 | Create a Dockerfile - 129 | 130 | FROM httpd:2.4 131 | COPY ./pnda-dist/ /usr/local/apache2/htdocs/ 132 | 133 | Build the container - 134 | 135 | sudo docker build -t package-server . 
136 | sudo docker run -dit -p 80:80 --name package-server package-server 137 | 138 | The components are now available via HTTP from the server - 139 | 140 | http://server/some-component-0.1.0.tar.gz 141 | 142 | If you need to update the components later - 143 | 144 | sudo docker cp pnda-dist/some-component-0.2.0.tar.gz package-server:/usr/local/apache2/htdocs/ 145 | 146 | 147 | -------------------------------------------------------------------------------- /provisioning/server-cluster/MIRROR.md: -------------------------------------------------------------------------------- 1 | # Create PNDA mirror components 2 | 3 | ![](../images/breadcrumbs-mirror.jpg) 4 | 5 | ## Introduction 6 | 7 | As many real-world deployment environments don’t have Internet connectivity and online sources are not always dependable, PNDA is created from a cache of all the required software known as the "PNDA mirror". 8 | 9 | Before PNDA can be created, first we must create the directory structure and file sets to be placed on the PNDA mirror. 10 | 11 | ### RHEL or CentOS 12 | 13 | PNDA can be created on RHEL or CentOS instances. Before building the PNDA mirror components, decide which instance type your deployment will use. 14 | 15 | ## Create mirror 16 | 17 | #### Select build node 18 | 19 | Designate or create the mirror build node. 20 | 21 | This can be a physical machine or a VM but it needs to reflect the type of mirror you wish to build and must be clean. 22 | 23 | Two types of mirror are supported - 24 | 25 | - Red Hat Enterprise Linux 7 26 | - CentOS 7 27 | 28 | If using Red Hat, ensure the mirror is built on a clean instance that has had **absolutely no additional** packages installed via yum (i.e. git, unzip, etc) as this will interfere with the dependency calculations of which RPM packages are required on the mirror. 
29 | 30 | #### Obtain mirror build tools 31 | 32 | The repository [pnda](https://github.com/pndaproject/pnda) contains all the tools needed to create and maintain the mirror file sets. 33 | 34 | Decide which version of PNDA you want to create. All PNDA releases are designated with a tag similar to ```release/4.0``` across all repositories. 35 | 36 | Clone this repository at the right version to the mirror creation node. 37 | 38 | #### Configure the proxy. (Optional) 39 | 40 | The entire mirror build process can be performed from behind a non-transparent proxy. 41 | 42 | To proceed in this mode, first set the system configuration and then run the ```set-proxy-env.sh``` script that will set up the various proxy configurations needed by the multiple build tools. 43 | 44 | ```sh 45 | sudo su 46 | export http_proxy=http://: 47 | export https_proxy=http://: 48 | . set-proxy-env.sh 49 | ``` 50 | 51 | #### Build mirror file sets 52 | 53 | The build tools are found in the [mirror folder](https://github.com/pndaproject/pnda/tree/master/mirror). 54 | 55 | To run the entire mirror creation process - 56 | 57 | ```sh 58 | sudo su 59 | ./create_mirror.sh 60 | ``` 61 | 62 | The script automatically detects the host Linux distribution and builds the appropriate file sets. 63 | 64 | This takes about 20 minutes to run and the output will be available in a directory named ```mirror-dist```. 65 | 66 | ##### Building parts of the mirror 67 | 68 | The different parts of the mirror can be created separately if required. 
The scripts to do this are - 69 | 70 | ``` 71 | create_mirror_rpm.sh 72 | create_mirror_misc.sh 73 | create_mirror_python.sh 74 | create_mirror_anaconda.sh 75 | create_mirror_cdh.sh 76 | create_mirror_hdp.sh 77 | create_mirror_apps.sh 78 | ``` 79 | 80 | Each script creates its output in a directory named for the respective mirror type - 81 | 82 | ``` 83 | mirror_rpm 84 | mirror_misc 85 | mirror_python 86 | mirror_anaconda 87 | mirror_cloudera 88 | mirror_hdp 89 | mirror_apps 90 | ``` 91 | 92 | For more about creating and maintaining mirrors, please refer to the [repository notes](https://github.com/pndaproject/pnda/tree/master/mirror). 93 | 94 | # [Next](BUILD.md) 95 | 96 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 97 | | --- | --- | --- | --- | --- | --- | --- | 98 | -------------------------------------------------------------------------------- /provisioning/server-cluster/PREPARE.md: -------------------------------------------------------------------------------- 1 | # Select & Prepare Platform 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | ## Commission server cluster 6 | 7 | The target server cluster must have adequate resources for deploying the chosen PNDA flavor in terms of numbers of nodes, CPU, memory and storage capacity. 8 | 9 | Building and configuring a suitable server cluster is outside the scope of this guide. Quite frequently technologies such as [Cobbler](http://cobbler.github.io/) are used to automate this process. 10 | 11 | ## Required resources 12 | 13 | In addition to the existing PNDA flavors, [described elsewhere in this guide](https://github.com/pndaproject/pnda-guide/blob/develop/provisioning/aws/PREPARE.md), we have defined a flavor specifically intended for creating PNDA on physical server clusters. 14 | 15 | #### Production 16 | 17 | Production flavor is intended for a reasonably sized physical cluster. 
It runs the core services in high-availability mode and provides reasonable storage space and compute resource. 18 | 19 | | Role | Number required | Cores | Memory | Storage 20 | | --- | --- | --- | --- | --- | 21 | | `tools` | 1 | 16 | 64 GB | 1TB 22 | | `edge` | 1 | 20 | 256 GB | 1TB 23 | | `mgr1` | 1 | 16 | 128 GB | 1TB 24 | | `mgr2` | 1 | 16 | 128 GB | 1TB 25 | | `mgr3` | 1 | 16 | 128 GB | 1TB 26 | | `mgr4` | 1 | 16 | 128 GB | 1TB 27 | | `datanode` | 3 | 24 | 128 GB | As required 28 | | `opentsdb` | 2 | 16 | 64 GB | 1TB 29 | | `hadoop-manager` | 1 | 16 | 64 GB | 1TB 30 | | `kafka` | 3 | 20 | 128 GB | As required 31 | | `zookeeper` | 3 | 16 | 64 GB | 1TB 32 | 33 | We recommend dividing the storage on every node into at least a root volume (100GB is sufficient) and a data volume. We also recommend JBOD for Hadoop datanodes and RAID10 for Kafka. However, all these aspects are configurable. 34 | 35 | ## Firewall setup 36 | The firewall needs to allow TCP and UDP (DNS service) ports within the PNDA Cluster, or the firewall needs to be disabled. 37 | 38 | 39 | # [Next](MIRROR.md) 40 | 41 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 42 | | --- | --- | --- | --- | --- | --- | --- | 43 | -------------------------------------------------------------------------------- /provisioning/server-cluster/STAGE.md: -------------------------------------------------------------------------------- 1 | # Stage PNDA mirror & components 2 | 3 | ![](../images/breadcrumbs-stage.jpg) 4 | 5 | ## Introduction 6 | 7 | In order to make built components available during the PNDA provisioning process, they need to be staged in a location that is accessible from the target environment via HTTP. 8 | 9 | ## Create server 10 | 11 | Create an ordinary HTTP server in the target environment or identify an existing server. 
The server must have connectivity with the PNDA cluster being provisioned. 12 | 13 | See [these tips](EXAMPLES.md) for rapidly creating an HTTP server using a number of different approaches. 14 | 15 | Your existing CICD system may already include the capability to host and serve build artifacts over HTTP. In this case, simply make use of an appropriate location on the existing resource. 16 | 17 | ## Stage files 18 | 19 | Copy the *contents* of ```mirror-dist``` and ```pnda-dist``` from the mirror creation and build steps respectively to the HTTP server. 20 | 21 | The final directory layout should resemble the following - 22 | 23 | ``` 24 | pnda-root 25 | │ 26 | ├── console-backend-data-logger-develop.tar.gz 27 | ├── console-backend-data-logger-develop.tar.gz.sha512.txt 28 | ├── etc 29 | │ 30 | ├── mirror_anaconda 31 | │   ├── Anaconda-4.0.0-el7.parcel 32 | │ ├── etc 33 | │ 34 | ├── mirror_rpm 35 | │   ├── a-rpm.rpm 36 | │ ├── etc 37 | │ 38 | ├── etc 39 | ``` 40 | 41 | Note that ```pnda-root``` can be any location, all that is required is that the hierarchy under this is available via a known URI. For example, using a standard Apache 2 installation on RHEL 7, if the hierarchy above is placed in ```/var/www/html``` this will be available via the URI ```http:///``` since ```/var/www/html``` is the default *document root*. Please refer to your HTTP server documentation for more details. 42 | 43 | Make a note of the URI to ```pnda-root``` as this will be used in configuring the PNDA creation process. 
44 | 45 | # [Next](CONFIGURE.md) 46 | 47 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Mirror](MIRROR.md) | [Build](BUILD.md) | [Stage](STAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 48 | | --- | --- | --- | --- | --- | --- | --- | 49 | -------------------------------------------------------------------------------- /provisioning/versions.md: -------------------------------------------------------------------------------- 1 | # Versions 2 | The following table lists the technologies used in this PNDA release. 3 | 4 | Some technologies depend on which distribution of Hadoop is chosen, as PNDA can be deployed using either Cloudera CDH or Hortonworks HDP. 5 | 6 | ## Common to all variants of PNDA 7 | 8 | | Technology | Version | 9 | |---|---| 10 | |SaltStack|2015.8.11| 11 | |OpenStack Heat templates|2015-04-30| 12 | |AWS CFN templates|2010-09-09| 13 | |Kafka|1.1.0| 14 | |Zookeeper for Kafka|3.4.11| 15 | |JMX Proxy|3.2.0| 16 | |Kafka Manager|1.3.3.15| 17 | |ELK (Logserver)|Logstash 6.2.1, Elasticsearch 6.2.1, Kibana 6.2.1| 18 | |Jupyter Hub|0.7.0| 19 | |Jupyter|4.2.1| 20 | |OpenTSDB|2.3.0| 21 | |Grafana|5.1.3| 22 | |Anaconda|5.1.0| 23 | |Consul|1.0.3| 24 | |Apache Flink|1.4.2| 25 | |Apache Knox|1.1.0| 26 | 27 | ## For Cloudera CDH PNDA 28 | 29 | | Technology | Version | 30 | |---|---| 31 | |Hadoop (see below for components)|CM 5.9.0, CDH 5.9.0-1.cdh5.9.0.p0.23 | 32 | |Apache Avro|avro-1.7.6+cdh5.12.1+133| 33 | |Apache Crunch|crunch-0.11.0+cdh5.12.1+101| 34 | |Apache DataFu|pig-udf-datafu-1.1.0+cdh5.12.1+24| 35 | |Apache Hadoop|hadoop-2.6.0+cdh5.12.1+2540| 36 | |Apache HBase|hbase-1.2.0+cdh5.12.1+365| 37 | |Apache Hive|hive-1.1.0+cdh5.12.1+1197| 38 | |Hue|hue-3.9.0+cdh5.12.1+6507| 39 | |Apache Impala|impala-2.9.0+cdh5.12.1+0| 40 | |Kite SDK|kite-1.0.0+cdh5.12.1+144| 41 | |Apache Mahout|mahout-0.9+cdh5.12.1+34| 42 | |Apache Oozie|oozie-4.1.0+cdh5.12.1+446| 43 | |Apache Parquet|parquet-1.5.0+cdh5.12.1+187| 44 | 
|Parquet-format|parquet-format-2.1.0+cdh5.12.1+18| 45 | |Apache Pig|pig-0.12.0+cdh5.12.1+110| 46 | |Apache Spark|spark-1.6.0+cdh5.12.1+530| 47 | |Apache Sqoop|sqoop-1.4.6+cdh5.12.1+113| 48 | |Apache Sqoop2|sqoop2-1.99.5+cdh5.12.1+46| 49 | |Apache ZooKeeper|zookeeper-3.4.5+cdh5.12.1+117| 50 | 51 | If you want to get the full list of [CDH 5.12.1 Packaging and Tarballs](https://www.cloudera.com/documentation/enterprise/release-notes/topics/cm_vd_cdh_package_tarball_512.html#concept_4g0_dmn_yk) 52 | 53 | ## For Hortonworks HDP PNDA 54 | 55 | | Technology | Version | 56 | |---|---| 57 | |Apache Ambari |2.7.0.0| 58 | |Hortonworks HDP |2.6.5.0| 59 | |Apache Hadoop|2.7.3| 60 | |Apache HBase|1.1.2| 61 | |Apache Hive|2.1.0| 62 | |Apache Mahout|0.9.0+| 63 | |Apache Oozie |4.2.0| 64 | |Apache Pig |0.16.0| 65 | |Apache DataFu |1.3.0| 66 | |Apache Spark |1.6.3| 67 | |Apache Spark |2.3.0| 68 | |Apache Sqoop |1.4.6| 69 | |Apache ZooKeeper|3.4.6| 70 | |Apache Phoenix|4.7.0| 71 | |Apache Slider|0.92.0| 72 | |Apache TEZ|0.7.0| 73 | -------------------------------------------------------------------------------- /provisioning/vmware.md: -------------------------------------------------------------------------------- 1 | # Getting started with VMWare 2 | 3 | [VMWare vSphere](https://www.vmware.com/nl/products/vsphere.html) is a server virtualisation platform that aggregates compute resources and allows virtual machines to be created from them. 4 | 5 | Setting up VMWare vSphere is outside the scope of this guide. Once a target environment has been set up, follow the instructions for [creating PNDA on VMWare](OVERVIEW.md). 
6 | -------------------------------------------------------------------------------- /provisioning/vmware/CONFIGURE.md: -------------------------------------------------------------------------------- 1 | # Configure PNDA creation process 2 | 3 | ![](../images/breadcrumbs-cfg.jpg) 4 | 5 | ## Introduction 6 | 7 | The PNDA creation process is controlled primarily via a YAML configuration file. 8 | 9 | A template YAML configuration can be found in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli). 10 | 11 | ## Configure pnda_env.yaml 12 | 13 | #### Designate client machine 14 | 15 | Create or designate a suitable machine for running the PNDA CLI. We recommend CentOS 7. 16 | 17 | #### Obtain code 18 | 19 | Clone the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) from the master branch at a specific release tag (e.g. ```release/4.0```) to the client machine. 20 | 21 | Copy ```pnda_env_example.yaml``` to create ```pnda_env.yaml``` 22 | 23 | #### Set access credentials 24 | 25 | Set the following fields under the `terraform_parameters` section in `pnda_env.yaml`. 26 | 27 | | Field | Value | 28 | | --- | --- | 29 | |VS_USER| vSphere user for creating PNDA | 30 | |VS_PASSWORD| vSphere password for VS_USER | 31 | |VS_DS| vSphere data store to use for this deployment | 32 | |VS_PUBLIC_NETWORK| ID of the network to place PNDA VMs on | 33 | |VS_SERVER| vSphere API endpoint | 34 | |VS_TEMPLATE_xxx| VM Images to use for various node types | 35 | |TF_ROOT_USER| Username with root ssh login to the VS_TEMPLATE_xxx images | 36 | |TF_ROOT_PASSWORD| Password for TF_ROOT_USER | 37 | 38 | As part of the provisioning process ssh login for TF_ROOT_USER is disabled and replaced with key-based login for a user named 'cloud-user'. 39 | 40 | #### Hadoop distribution 41 | 42 | Decide whether you want to run the Cloudera CDH or the Hortonworks HDP Hadoop distribution. 43 | 44 | Set `hadoop.HADOOP_DISTRO` to either `CDH` or `HDP`.
45 | 46 | #### Set source of SaltStack provisioning scripts 47 | 48 | The PNDA software is installed and configured using the SaltStack code found in the [platform-salt](https://github.com/pndaproject/platform-salt) repository. There are two main options to provide source for platform-salt: 49 | 50 | 1. Set `platform_salt.PLATFORM_GIT_REPO_URI` to the remote git URI and `platform_salt.PLATFORM_GIT_BRANCH` at the specified branch to be cloned during provisioning. 51 | If authenticated access to `platform_salt.PLATFORM_GIT_REPO_URI` is required, then place the ssh key to use, named git.pem, in the top level directory of the "pnda-cli" repository and also set `platform_salt.PLATFORM_GIT_REPO_HOST` to the hostname of the server. 52 | 53 | 2. A local copy of platform-salt can be used by setting (`platform_salt.PLATFORM_SALT_LOCAL`) to the path to the platform-salt folder on the local machine running pnda-cli.py. 54 | 55 | #### PNDA mirror 56 | 57 | Set `mirrors.PNDA_MIRROR` to the URI determined by the placement of the mirror and build components in the staging phase. 58 | 59 | #### Other fields 60 | 61 | There is a wide range of parameters that can be set; please refer to ```pnda_env_example.yaml``` in the [pnda-cli repository](https://github.com/pndaproject/pnda-cli) for more details. 62 | 63 | #### SSH key pair 64 | 65 | Create an ssh keypair with `ssh-keygen -t rsa -N '' -f key_name` to use when creating the virtual machine instances for PNDA as ```key_name```. 66 | 67 | Place both parts of the key in the root of the pnda-cli directory named as follows: the private key ```key_name.pem``` and public key ```key_name.pub```. 68 | 69 | Ensure that key_name.pem has 0600 permissions.
70 | 71 | # [Next](CREATE.md) 72 | 73 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Image](IMAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 74 | | --- | --- | --- | --- | --- | 75 | -------------------------------------------------------------------------------- /provisioning/vmware/CREATE.md: -------------------------------------------------------------------------------- 1 | # Create PNDA cluster 2 | 3 | ![](../images/breadcrumbs-create.jpg) 4 | 5 | ## Introduction 6 | 7 | With the target platform fully prepared, the PNDA software staged and the YAML configuration completed, the final step is to invoke the PNDA CLI to create the cluster. 8 | 9 | ## Create PNDA 10 | 11 | #### Install CLI dependencies 12 | 13 | To use the PNDA command line interface, you will need to install the python and terraform clients. To install them on CentOS, run: 14 | 15 | ``` 16 | # Download terraform and place on $PATH 17 | cd /tmp 18 | curl -LOJ https://releases.hashicorp.com/terraform/0.11.7/terraform_0.11.7_linux_amd64.zip 19 | unzip terraform_0.11.7_linux_amd64.zip 20 | sudo cp terraform /usr/local/bin/ 21 | 22 | sudo yum install -y epel-release 23 | sudo yum install -y python python-pip python-devel gcc 24 | 25 | cd cli 26 | sudo pip install -r requirements.txt 27 | ``` 28 | 29 | #### CLI invocation 30 | 31 | The pnda-cli.py script allows launching a PNDA deployment. It sits in the cli subdirectory. 32 | 33 | ``` 34 | cd cli 35 | ./pnda-cli.py 36 | usage: pnda-cli.py [-h] [-e PNDA_CLUSTER] [-n DATANODES] [-o OPENTSDB_NODES] 37 | [-k KAFKA_NODES] [-z ZK_NODES] [-f FLAVOR] [-s KEYNAME] 38 | [-x] [-b BRANCH] [-d] [-m X_MACHINES_DEFINITION] [-v] 39 | {create,expand,destroy} 40 | ``` 41 | 42 | | Options | Description | 43 | | --- | --- | 44 | | -e | Namespaced environment for machines in this cluster. 
45 | | -n | How many datanodes needed for the hadoop cluster 46 | | -o | How many Open TSDB nodes for the hadoop cluster 47 | | -k | How many kafka nodes for the databus cluster 48 | | -z | How many zookeeper nodes for the databus cluster 49 | | -f | PNDA flavours: ['pico', 'standard'] 50 | | -s | Name of key-pair name created in the OpenStack Tenant during the preparation phase. 51 | | -b | Branch of [platform-salt](https://github.com/pndaproject/platform-salt) to use. Overrides value in pnda_env.yaml 52 | | -v | Verbose logging 53 | 54 | Note: Make sure you have access to the private key of this key pair otherwise you will not be able to connect to the gateway node and access the cluster. 55 | 56 | Examples on invocation of CLI: 57 | 58 | ##### Create new cluster: 59 | ``` 60 | cd cli 61 | pnda-cli.py create -e <cluster_name> -s <key_name> -f standard -o 2 -n 3 -k 2 -z 3 -b release/4.0 62 | ``` 63 | The options shown select the standard flavor, 2 openTSDB instances, 3 hadoop datanodes, 2 kafka brokers, and 3 zookeeper nodes. If you need to operate within the Openstack tenant instance quota of 20 instances then you can reduce this to 1 kafka and 1 zookeeper instance or use the pico flavor. 64 | 65 | ``` 66 | pnda-cli.py create -e <cluster_name> -s <key_name> -f standard -o 1 -n 1 -k 1 -z 1 -b release/4.0 67 | pnda-cli.py create -e <cluster_name> -s <key_name> -f pico -n 1 -k 1 -b release/4.0 68 | ``` 69 | 70 | ##### Destroy existing cluster: 71 | ``` 72 | pnda-cli.py destroy -e <cluster_name> 73 | ``` 74 | ##### Expand existing cluster: 75 | ``` 76 | pnda-cli.py expand -e <cluster_name> -f standard -s <key_name> -n 10 -k 5 77 | ``` 78 | Either, or both, kafka (k) and datanodes (n) can be changed. 79 | The value specifies the new total number of nodes. 80 | Shrinking is not supported - this must be done very carefully to avoid data loss. 81 | 82 | To orchestrate PNDA on Openstack clone [pnda-cli repository](https://github.com/pndaproject/pnda-cli).
83 | 84 | **Important:** ensure you are certain what version of PNDA you want to deploy, and specify the correct branch or tag when invoking the CLI using the -b option. In most circumstances you'll want to make sure the branch or tag you specify is identical to the branch or tag you used to build the PNDA mirror, and identical to the version you checked out from the pnda-cli repository. All PNDA releases are designated with a tag such as ```release/4.0``` across all repositories. 85 | 86 | # [Home](../OVERVIEW.md) 87 | 88 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Image](IMAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 89 | | --- | --- | --- | --- | --- | 90 | -------------------------------------------------------------------------------- /provisioning/vmware/IMAGE.md: -------------------------------------------------------------------------------- 1 | # Select & Prepare Platform 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | ## Creating PNDA images 6 | 7 | Deploying PNDA using Terraform requires images with some pre-installed elements, such as `VMWare tools`. 8 | 9 | PNDA currently runs on CentOS/RHEL, but you can use Ubuntu/CentOS/RHEL/Mac/Windows OSes to create the PNDA images. 
10 | 11 | ### Pre-requisites 12 | 13 | For building base images on VMWare, we are using [Packer](https://www.packer.io/) so make sure you've got the right package for your OS downloaded and installed as described [here](https://www.packer.io/downloads.html) 14 | 15 | 16 | ### Create PNDA images 17 | 18 | The whole process for building base images is described in the [PNDA repo](https://github.com/pndaproject/pnda/blob/develop/packer/README.md) 19 | 20 | ``` 21 | git clone https://github.com/pndaproject/pnda.git 22 | cd packer 23 | ``` 24 | 25 | Then follow the instructions described in the README to build the different images 26 | 27 | 28 | # [Next](CONFIGURE.md) 29 | 30 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Image](IMAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 31 | | --- | --- | --- | --- | --- | 32 | -------------------------------------------------------------------------------- /provisioning/vmware/PREPARE.md: -------------------------------------------------------------------------------- 1 | # Select & Prepare Platform 2 | 3 | ![](../images/breadcrumbs.jpg) 4 | 5 | ## VMWare prerequisites 6 | 7 | Building and configuring VMWare is outside the scope of this guide. Please note the following - 8 | 9 | - PNDA is regularly tested on [VMWare vSphere 6.5](https://www.vmware.com/products/vsphere.html) but is expected to work on versions of vSphere from 5.5 onwards. 10 | - There is no orchestration available as part of VMWare, but this will be covered later using [Terraform](https://www.terraform.io/). 11 | - Before continuing, please ensure - 12 | - At least one suitable network / Datastore exists 13 | - It provides sufficient resources and quota to provision one of the flavors described below 14 | 15 | ## Required resources 16 | 17 | The resource requirements for the default pico and standard flavor PNDA clusters are detailed below. However, you are strongly encouraged to create a PNDA flavor specifically designed for your infrastructure.
18 | 19 | #### Pico 20 | 21 | Pico flavor is intended for development / learning purposes. It is fully functional, but does not run the core services in high-availability mode and does not provide much storage space or compute resource. 22 | 23 | | Role | Instance type | Number required | CPUs | Memory | Storage 24 | | --- | --- | --- | --- | --- | --- | 25 | | `gateway` | t2.medium | 1 | 2 | 4 GB | 40 GB 26 | | `edge` | m4.2xlarge | 1 | 8 | 32 GB | 50 GB 27 | | `mgr1` | m4.xlarge | 1 | 4 | 16 GB | 50 GB 28 | | `datanode` | c4.xlarge | 1 | 4 | 7.5 GB | 65 GB 29 | | `kafka` | m4.large | 1 | 2 | 8 GB | 50 GB 30 | | - | - | - | - | - | - | 31 | | `total` | | 5 | 20 | 67,5 GB | 255 GB 32 | 33 | The storage per node is allocated as: 34 | - 10 GB log volume (not present on gateway or saltmaster). This is provision-time configurable. 35 | - 20 GB operating system partition. This is configured in the templates per-node. 36 | - 35 GB HDFS (only on datanode). This is configured in the templates for the datanode. 37 | 38 | #### Standard 39 | 40 | Standard flavor is intended for meaningful PoC and investigations at scale. It runs the core services in high-availability mode and provides reasonable storage space and compute resource. 
41 | 42 | | Role | Instance type | Number required | CPUs | Memory | Storage 43 | | --- | --- | --- | --- | --- | --- | 44 | | `gateway` | t2.medium | 1 | 2 | 4 GB | 170 GB 45 | | `saltmaster` | m4.large | 1 | 2 | 8 GB | 50 GB 46 | | `edge` | t2.medium | 1 | 2 | 4 GB | 370 GB 47 | | `mgr1` | m4.2xlarge | 1 | 8 | 32 GB | 370 GB 48 | | `mgr2` | m4.2xlarge | 1 | 8 | 32 GB | 370 GB 49 | | `mgr3` | m4.2xlarge | 1 | 8 | 32 GB | 370 GB 50 | | `mgr4` | m4.2xlarge | 1 | 8 | 32 GB | 370 GB 51 | | `datanode` | m4.2xlarge | 3 | 8 | 32 GB | 1394 GB 52 | | `opentsdb` | m4.xlarge | 2 | 4 | 16 GB | 170 GB 53 | | `hadoop-manager` | m4.xlarge | 1 | 4 | 16 GB | 170 GB 54 | | `jupyter` | m4.large | 1 | 2 | 8 GB | 170 GB 55 | | `logserver` | m4.large | 1 | 2 | 8 GB | 500 GB 56 | | `kafka` | m4.xlarge | 2 | 4 | 16 GB | 270 GB 57 | | `zookeeper` | m4.large | 3 | 2 | 8 GB | 170 GB 58 | | `tools` | m4.large | 1 | 2 | 8 GB | 50 GB 59 | | - | - | - | - | - | - | 60 | | `total` | | 21 | 94 | 368 GB | 7.7TB 61 | 62 | The storage per node is allocated as: 63 | - 120 GB log volume (not present on gateway, saltmaster or tools). This is provision-time configurable. 64 | - 1024 GB HDFS (only on datanode). This is configured in the templates for the datanode. 65 | - 50-250 GB operating system partition. This is configured in the templates per-node. 66 | 67 | # [Next](IMAGE.md) 68 | 69 | | [Home](../OVERVIEW.md) | [Prepare](PREPARE.md) | [Image](IMAGE.md) | [Configure](CONFIGURE.md) | [Create](CREATE.md) | 70 | | --- | --- | --- | --- | --- | 71 | -------------------------------------------------------------------------------- /query/README.md: -------------------------------------------------------------------------------- 1 | # Structured Query 2 | 3 | ## [Impala](impala.md) 4 | 5 | Apache Impala is a parallel execution engine for SQL queries. It supports low-latency access and interactive exploration of data in HDFS and HBase. 
Impala allows data to be stored in a raw form, with aggregation performed at query time without requiring upfront aggregation of data. 6 | 7 | Apache Impala is only available when using Cloudera CDH as the Hadoop distribution for PNDA. 8 | -------------------------------------------------------------------------------- /query/impala.md: -------------------------------------------------------------------------------- 1 | # Impala 2 | 3 | [Apache Impala](http://impala.io) is the general purpose interface that clients use to retrieve the results of data processing carried out on the PNDA platform. 4 | 5 | Impala is a distributed, massively parallel processing (MPP) database engine providing high-performance, low-latency SQL queries on data stored in Hadoop. It's designed for interactive use and typically returns results within seconds or a few minutes, rather than the many minutes or hours that are often required for standard Hadoop batch operations. It can access data directly from the HDFS file system or from HBase and also perform joins between these heterogeneous data stores. 6 | 7 | In addition to straightforward data retrieval, Impala supports an extended subset of ANSI-92 SQL and so is powerful enough to provide analytical capabilities to clients over large datasets in Hadoop without requiring the coding required for Spark jobs. 8 | 9 | ## Metadata 10 | 11 | In order to make use of Impala over data in Hadoop it is necessary to create metadata describing the desired mapping in the Hive Metastore. 
12 | 13 | For example, to make the data in `/user/pnda/important_data/set` available through Impala as a table `pndaA`: 14 | 15 | CREATE EXTERNAL TABLE pndaA 16 | ( 17 | id INT, 18 | col_1 BOOLEAN, 19 | col_2 DOUBLE, 20 | col_3 TIMESTAMP 21 | ) 22 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 23 | LOCATION '/user/pnda/important_data/set'; 24 | 25 | If the data is structured, JSON for example, it could make sense to store it as parquet which allows for schema inference and efficient columnar aggregations: 26 | 27 | CREATE EXTERNAL TABLE pndaB 28 | LIKE PARQUET '/user/pnda/parquet/set/part-r-00001.parquet' 29 | STORED AS PARQUET LOCATION '/user/pnda/parquet/set/'; 30 | 31 | As another example, to make data in a HBase table `dataset` available through Impala as a table `pndaC`: 32 | 33 | CREATE EXTERNAL TABLE pndaC ( 34 | id string, 35 | bool_col boolean, 36 | tinyint_col tinyint, 37 | timestamp_col timestamp) 38 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 39 | WITH SERDEPROPERTIES ( 40 | "hbase.columns.mapping" = 41 | ":key,bools:bool_col,ints:tinyint_col,strings:timestamp_col" 42 | ) 43 | TBLPROPERTIES("hbase.table.name" = "dataset"); 44 | 45 | ## Query 46 | 47 | Impala supports a variety of mechanisms for executing queries following SQL-92 syntax. 48 | 49 | Taking the above tables as an example: 50 | 51 | SELECT pndaA.col_1, pndaC.tinyint_col FROM pndaA, pndaC WHERE pndaA.id = pndaC.id 52 | 53 | ## Impala shell 54 | 55 | The user can access the PNDA edge node and open the [impala shell](http://www.cloudera.com/documentation/cdh/5-1-x/Impala/Installing-and-Using-Impala/ciiu_impala_shell.html#impala_shell) which supports a variety of DML and DDL operations and allows ad hoc queries and data exploration. 56 | 57 | ## JDBC Connector 58 | 59 | For Java developers, Cloudera have made available a [Impala JDBC Connector](http://www.cloudera.com/documentation/enterprise/latest/topics/impala_jdbc.html) which can be integrated into client applications. 
See this [example](https://github.com/onefoursix/Cloudera-Impala-JDBC-Example). 60 | 61 | ## ODBC Connector 62 | 63 | Many client applications are able to make use of ODBC, for example the popular BI tool Tableau. Refer to the Cloudera documentation on the [Impala ODBC Connector](http://www.cloudera.com/documentation/cdh/5-1-x/Impala/Installing-and-Using-Impala/ciiu_impala_odbc.html?scroll=impala_odbc) for more details. 64 | 65 | ## Learn More about Impala 66 | 67 | For a worked example of integrating a data processing application with HBase on PNDA, using Impala to access the result data refer to the DevNet page in the Developer menu. 68 | 69 | Further reference material is available at the [Impala documentation](http://www.cloudera.com/documentation/archive/impala/2-x/2-1-x/topics/impala_tutorial.html) site. 70 | -------------------------------------------------------------------------------- /repos/README.md: -------------------------------------------------------------------------------- 1 | # Repositories 2 | 3 | The PNDA distribution is available on GitHub at: 4 | 5 | * [https://github.com/pndaproject](https://github.com/pndaproject) 6 | 7 | It consists of the following source code repositories and sub-projects: 8 | 9 | ### Provisioning 10 | 11 | * [platform-salt](http://github.com/pndaproject/platform-salt): provisioning logic for creating PNDA 12 | * [pnda-cli](http://github.com/pndaproject/pnda-cli): orchestration application for creating PNDA on AWS, OpenStack or an existing pre-prepared cluster 13 | * [pnda-dib-elements](http://github.com/pndaproject/pnda-dib-elements): tools for building disk image templates 14 | * [pnda](https://github.com/pndaproject/pnda): pnda release notes and build system 15 | 16 | ### Platform 17 | 18 | * [platform-libraries](http://github.com/pndaproject/platform-libraries): libraries for working with interactive notebooks 19 | * [platform-tools](http://github.com/pndaproject/platform-tools): tools for operating a cluster 20 | * 
[bulkingest](http://github.com/pndaproject/platform-tools/tree/master/bulkingest): tools for performing a bulk ingest of data 21 | * [platform-console-frontend](http://github.com/pndaproject/platform-console-frontend): “single pane of glass” giving operational overview and access to application and data management functions 22 | * [platform-console-backend](http://github.com/pndaproject/platform-console-backend): APIs that provide data to the console frontend 23 | * [console-backend-data-logger](http://github.com/pndaproject/platform-console-backend/tree/master/console-backend-data-logger): APIs to ingest data 24 | * [console-backend-data-manager](http://github.com/pndaproject/platform-console-backend/tree/master/console-backend-data-manager): APIs to provide data 25 | * [platform-testing](http://github.com/pndaproject/platform-testing): modules that test both the end to end platform and individual components and collect metrics 26 | * [platform-deployment-manager](http://github.com/pndaproject/platform-deployment-manager): API to manage packages and application deployment and lifecycle 27 | * [platform-data-mgmnt](http://github.com/pndaproject/platform-data-mgmnt): tools to manage data retention 28 | * [data-service](http://github.com/pndaproject/platform-data-mgmnt/tree/master/data-service): API to set data retention policies 29 | * [hdfs-cleaner](http://github.com/pndaproject/platform-data-mgmnt/tree/master/hdfs-cleaner): cron job to clean up HDFS data 30 | * [oozie-templates](http://github.com/pndaproject/platform-data-mgmnt/tree/master/oozie-templates): templates that archive or delete data 31 | * [platform-package-repository](http://github.com/pndaproject/platform-package-repository): manages a simple package repository backed by OpenStack Swift 32 | * [gobblin](http://github.com/pndaproject/gobblin): customized fork of the Gobblin data ingest framework 33 | 34 | ### Producers 35 | 36 | * [prod-odl-kafka](https://github.com/pndaproject/prod-odl-kafka): plugin 
to ingest data from OpenDaylight 37 | * [logstash-codec-pnda-avro](https://github.com/pndaproject/logstash-codec-pnda-avro): patched AVRO codec ingest data from Logstash 38 | 39 | ### Examples 40 | 41 | * [example-applications](https://github.com/pndaproject/example-applications): example applications that can be built and run on PNDA 42 | * [spark-batch](https://github.com/pndaproject/example-applications/tree/master/spark-batch): example batch data processing application 43 | * [spark-streaming](https://github.com/pndaproject/example-applications/tree/master/spark-streaming): example streaming data processing application 44 | * [jupyter-notebooks](https://github.com/pndaproject/example-applications/tree/master/jupyter-notebooks): examples for working with Jupyter notebooks 45 | * [kafka-spark-opentsdb](https://github.com/pndaproject/example-applications/tree/master/kafka-spark-opentsdb): example consumer that feeds data to OpenTSDB 46 | * [example-kafka-clients](https://github.com/pndaproject/example-kafka-clients): examples for working with kafka clients 47 | * [java](https://github.com/pndaproject/example-kafka-clients/tree/master/java) 48 | * [php](https://github.com/pndaproject/example-kafka-clients/tree/master/php) 49 | * [python](https://github.com/pndaproject/example-kafka-clients/tree/master/python) 50 | 51 | ### Documentation 52 | 53 | * [pnda-guide](README.md): this guide 54 | 55 | -------------------------------------------------------------------------------- /security/Architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/security/Architecture.jpg -------------------------------------------------------------------------------- /security/Basic-Authentication.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/security/Basic-Authentication.png -------------------------------------------------------------------------------- /security/E-HDFS-Access-from-Shell.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/security/E-HDFS-Access-from-Shell.jpg -------------------------------------------------------------------------------- /security/Impersonation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/security/Impersonation.png -------------------------------------------------------------------------------- /security/impala-ldap-f1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/security/impala-ldap-f1.png -------------------------------------------------------------------------------- /security/settings.json: -------------------------------------------------------------------------------- 1 | {"id":"3e494f97-2ab9-4a37-84d1-a19d37eac1f2","account":{"username":"venks61176","token":"5bf9ce40-d7f9-4451-b79f-4487dbac6a1b","avatar":"https://avatars0.githubusercontent.com/venks61176","email":"mevenkat@gmail.com","name":"venks61176","host":"https://www.gitbook.com"}} -------------------------------------------------------------------------------- /streamingest/README.md: -------------------------------------------------------------------------------- 1 | # Producer Integration 2 | 3 | Kafka is the "front door" of PNDA, allowing the ingest of high-velocity data streams, distributing data to all interested consumers and decoupling data sources from data processing applications and platform 
clients. 4 | 5 | It is normally not necessary to create a new producer to start acquiring network data as any data is supported out of the box. Additionally, some encodings do benefit from special accommodations. But even for these, no new producer should be required as there are a growing number of data plugins that have already been integrated with PNDA. It’s not always clear which plugins to use for which data types, hence we’ve summarized some common combinations in the table at the bottom of this page. 6 | 7 | If you do have other data sources you want to integrate with PNDA it’s easy enough to write a PNDA producer – see [producer.md](producer.md) 8 | 9 | PNDA adopts a [schema-on-read](https://www.techopedia.com/definition/30153/schema-on-read) approach to data processing, so all data directed towards the platform is stored in as close to its raw form as possible. When data is persisted, each datum is ensured compliance to a consistent Avro wrapper that contains both the logical source of the data and a timestamp besides the data payload. 10 | 11 | Kafka data is stored in topics, each topic being divided into partitions and each partition being replicated to avoid data loss. Ingest is achieved by delivering data through a "producer" which is implemented to send data to one or more well defined topics by direct connection to the broker cluster. Load balancing is carried out by the broker cluster itself via negotiation with topic partition leaders. 12 | 13 | PNDA is typically deployed with a set of well defined topics in accordance with the deployment context, each topic being carefully configured with a set of replicated partitions in line with the expected ingest and consumption rates. Please refer to our [Topic Preparation Guide](topic-preparation.md) to understand how to create and setup up topics. 
By convention topics are named according to a hierarchical scheme such that consumers are able to "whitelist" data of interest and subscribe to multiple topics at once (e.g. `mytelco.service6.netflow.*` or `mytelco.*`). 14 | 15 | PNDA includes tools for managing topics, partitions and brokers and for monitoring the data flow across them. 16 | 17 | Integrators can make use of the high and low level [Kafka APIs](http://kafka.apache.org/documentation.html#api). Please refer to our [Topic Preparation Guide](topic-preparation.md) to discover how to leverage advanced feature that come with some dedicated encodings and our [Data Preparation Guide](data-preparation.md) to understand how to encapsulate data for those encoding options. 18 | 19 | # Data types mapped to existing PNDA producers 20 | 21 | Data Type | Data Aggregator | Data Aggregator Reference | PNDA Producer Reference 22 | --------- | --------------- | ------------------------- | ----------------------- 23 | BGP (inc. BGP LS) | OpenBMP | http://www.openbmp.org/#!index.md#Using_Kafka_for_Collector_Integration | [openbmp](openbmp.md) 24 | BGP | PMACCT (BGP listener) | http://www.pmacct.net/ | [pmacct](pmacct.md) 25 | Bulk Ingest | PNDA Bulk Ingest Tool | | http://pnda.io/pnda-guide/bulkingest/ 26 | ISIS | PMACCT (ISIS listener) | http://www.pmacct.net/ | [pmacct](pmacct.md) 27 | Cisco XR streaming telemetry | Pipeline | https://github.com/cisco/bigmuddy-network-telemetry-collector | 28 | CollectD (CollectD supports multiple plugins as listed here https://collectd.org/wiki/index.php/Table_of_Plugins) | Logstash | https://www.elastic.co/guide/en/logstash/current/plugins-codecs-collectd.html | [logstash](logstash.md) 29 | IoT sensor via HTTP | Node-RED | https://nodered.org | 30 | Logstash (Logstash supports multiple plugins as listed here https://www.elastic.co/guide/en/logstash/current/input-plugins.html) | Logstash | | [logstash](logstash.md) 31 | NETCONF Notifications | ODL | http://www.opendaylight.org/ | 
[opendl](opendl.md) 32 | Netflow / IPFIX | Logstash | https://www.elastic.co/guide/en/logstash/current/plugins-codecs-netflow.html | [logstash](logstash.md) 33 | Netflow / IPFIX / sFlow | pmacct | http://www.pmacct.net/ | [pmacct](pmacct.md) 34 | Openstack | Work in progress | | 35 | sFlow | Logstash | https://github.com/ashangit/logstash-codec-sflow | [logstash](logstash.md) 36 | SNMP Metrics and Traps | ODL | https://wiki.opendaylight.org/view/SNMP_Plugin:Getting_Started | [opendl](opendl.md) 37 | SNMP Traps | Logstash | https://www.elastic.co/guide/en/logstash/current/plugins-inputs-snmptrap.html | [logstash](logstash.md) 38 | Syslog | Logstash | https://www.elastic.co/guide/en/logstash/current/plugins-inputs-syslog.html | [logstash](logstash.md) 39 | Syslog (RFC3164 or RFC5424 - needed for newer IOS/IOS XR/ NX OS etc.) | Logstash | https://gist.github.com/donaldh/89b7304981f96497c94fe4d98bb03d71 | [logstash](logstash.md) 40 | 41 | -------------------------------------------------------------------------------- /streamingest/images/cluster_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pndaproject/pnda-guide/aff0ba9da92bdb0cc37473860fd6b55a86da1a6a/streamingest/images/cluster_config.png -------------------------------------------------------------------------------- /streamingest/opendl.md: -------------------------------------------------------------------------------- 1 | # Integrating OpenDaylight 2 | 3 | Integrating OpenDaylight (ODL) with PNDA can be done via [prod-odl-kafka](https://github.com/pndaproject/odl-kafka-plugin), an ODL northbound plugin that streams Event-Topic Broker (ETB) events to a PNDA cluster. 4 | 5 | To install, configure and use OpenDaylight with PNDA, follow the [prod-odl-kafka](https://github.com/pndaproject/odl-kafka-plugin) instructions. 
After you have set up the plugin, see the section on [sending data to PNDA](https://github.com/pndaproject/odl-kafka-plugin#send-data-to-pnda). 6 | -------------------------------------------------------------------------------- /streamingest/pmacct.md: -------------------------------------------------------------------------------- 1 | # Integrating Pmacct 2 | 3 | Pmacct is a collection of passive network monitoring tools to measure, account, classify, aggregate and export IPv4 and IPv6 traffic. We will publish details of how to integrate pmacct with PNDA soon. 4 | -------------------------------------------------------------------------------- /streamingest/producer.md: -------------------------------------------------------------------------------- 1 | # Developing a Producer 2 | 3 | In many circumstances, it may be advantageous to create your own producer: for example, if you have a data source not already handled by Logstash or Open Daylight, or if you wish to exercise more control over the way data is produced into the platform. 4 | 5 | In this case, you can use the [Kafka Producer API](http://kafka.apache.org/documentation.html#producerapi). Example code showing how to build a straightforward producer using this API can be found in the [Kafka Quickstart](http://kafka.apache.org/07/quickstart.html). 6 | 7 | We have several examples of building Kafka clients in our [example code repository](https://github.com/pndaproject/example-kafka-clients). 8 | 9 | 10 | 11 | 
In PNDA, a custom application (reading data from Kafka or HDFS for example) could write time series and store them in OpenTSDB. 6 | 7 | ## [Grafana](grafana.md) 8 | 9 | Grafana is a graph and dashboard builder for visualizing time series metrics. It is pre-configured to connect to OpenTSDB as its data source. It is much easier to create dashboards in Grafana than using the OpenTSDB user interface. -------------------------------------------------------------------------------- /timeseries/grafana.md: -------------------------------------------------------------------------------- 1 | # Grafana 2 | 3 | [Grafana](http://grafana.org) is a graph and dashboard builder for visualizing time series metrics. 4 | 5 | It is pre-configured to connect to [OpenTSDB](opentsdb.md) as its data source. It is much easier to create dashboards in Grafana than using the OpenTSDB user interface. -------------------------------------------------------------------------------- /timeseries/opentsdb.md: -------------------------------------------------------------------------------- 1 | # OpenTSDB # 2 | 3 | [OpenTSDB](http://opentsdb.net) is a scalable time series database that can store and serve massive amounts of time series data without losing granularity. 4 | 5 | OpenTSDB consists of a Time Series Daemon (TSD) as well as a set of command line utilities. Interaction with OpenTSDB is primarily achieved by running one or more of the TSDs. Each TSD is independent. There is no master and no shared state, so you can run as many TSDs as required to handle any load you throw at it. Each TSD uses the open source database HBase to store and retrieve time-series data. The HBase schema is highly optimized for fast aggregations of similar time series to minimize storage space. 6 | 7 | ## Installation ## 8 | 9 | When deploying PNDA, there is an option to set up OpenTSDB. Once the cluster is deployed, OpenTSDB is ready to be used. The only thing left to do is to create the time series. 
10 | 11 | ## Time series creation ## 12 | 13 | On the server running OpenTSDB, create the metric `sys.cpu.user` with this command: 14 | 15 | sudo /usr/share/opentsdb/bin/tsdb mkmetric sys.cpu.user --config /etc/opentsdb/opentsdb.conf 16 | 17 | ## Writing ## 18 | 19 | A time series is a collection of data points for an identity over time. 20 | 21 | In OpenTSDB, a time series data point consists of: 22 | 23 | - A metric name. 24 | - A UNIX timestamp (seconds or milliseconds since Epoch). 25 | - A value (64 bit integer or single-precision floating point value). 26 | - A set of tags (key-value pairs) that describe the time series the point belongs to. 27 | 28 | Take the following data point as an example: 29 | 30 | sys.cpu.user 1234567890 42 host=123 type=router cpu=0 31 | 32 | In this data point, 33 | 34 | - the metric name is `sys.cpu.user` 35 | - the timestamp is 1234567890 36 | - the value is 42 37 | - tags are `host`, `type` and `cpu` 38 | 39 | It is important to think about time series naming optimization in order to determine when aggregation is useful. 40 | Think about cardinality: let's say you have another `type` tag value such as firewall. If you are interested in 41 | aggregated value by type, it will be more efficient to move the type from the tag section to the metric name (`router.sys.cpu.user` and `firewall.sys.cpu.user`). 42 | 43 | The most basic operations for writing a data point are: 44 | 45 | - telnet and put `router.sys.cpu.user 1234567890 42 host=123 type=router cpu=0` 46 | - HTTP POST with JSON to `http://{host}:{port}/api/put` 47 | 48 | **Ingesting PNDA input data** 49 | 50 | One technique is to develop a Spark Streaming application that reads from Kafka and writes the processing results to OpenTSDB in the appropriate time series based on the app logic. We provide sample code for a Spark streaming skeleton. 51 | 52 | ## Reading ## 53 | 54 | OpenTSDB offers a number of means to extract data, such as CLI tools, an HTTP API and GnuPlot graphs. 
Querying with OpenTSDB's tag based system can be a bit tricky, so read through this page, and see the [OpenTSDB HTTP API format](http://opentsdb.net/docs/build/html/user_guide/query/index.html) document for further information. 55 | 56 | Here is a snippet of the key elements for performing a query: 57 | 58 | | endpoint | name | required | type | description | default | example | 59 | |:-----------------|:--------:|:---------:|:-------|:---------------------------|:-----------------|----------------| 60 | |/api/query (GET)|start|yes|string,integer|The start time for the query.
This can be a relative or absolute timestamp. | n/a |1h-ago
2015/10/15-11:21:02
1444859743020 | 61 | |-|end|no|string,integer|An end time for the query. If not supplied, the TSD will assume the local system time on the server.
This may be a relative or absolute timestamp.| n/a |10s-ago
2015/10/15-11:21:02
1444859743020 | 62 | |-|ms|no|boolean|Output data timestamp in ms or s| false |false
true| 63 | |-|m|yes|array of sub queries|Requires at least one sub query, a means of selecting which time series should be included in the result set| n/a |See sub query section| 64 | |/api/query/last (GET)|timeseries|yes|string|[{=[,...=]}]| n/a |test{host=router01} | 65 | |-|back_scan|no|integer|Number of hours to search in the past| n/a |24 | 66 | |/api/suggest (GET)|type|yes|string|The type of data to auto complete on| n/a |metric
tagk
tagv | 67 | |-|q|no|string|A string to match on for the given type| n/a |sys.cpu | 68 | |-|max|no|integer|Max. number of suggested results to return| 25 |10| 69 | 70 | Here is a snippet of the key elements for the sub metric query syntax (for /api/query m param): 71 | 72 | | | | | | | | 73 | |:--------|:---------:|:-------|:---------------------------|:-----------------:|:-------| 74 | |\|yes|string|The name of an aggregation function to use.| n/a |sum
avg
dev
min
max| 75 | |\|no|string|An optional downsampling function to reduce the amount of data returned.| n/a |5m-avg| 76 | |\|yes|string|The name of a metric stored in the system.| n/a |sys.cpu.user| 77 | |\|no|Map|To drill down to specific timeseries or group results by tag, supply one or more map values in the same format as the query string. Note that if no tags are specified, all metrics in the system will be aggregated into the results.| n/a |{host=router01}| 78 | |\|no|List|Filters the time series emitted in the results. Note that if no filters are specified, all time series for the given metric will be aggregated into the results. The type precedes the filter expression in parentheses.
Whether or not results are grouped depends on which curly bracket the filter is in. Two curly braces are now supported per metric query. The first set is the group by filter and the second is a non group by filter, e.g. `{host=web01}{colo=regexp(sjc.\*)}`. If you only want to filter without grouping then the first curly set must be empty, e.g. `{}{host=router\*}`| n/a |m=avg:sys.cpu.percent.idle{host=router01,plugin_instance=0|1}| 79 | --------------------------------------------------------------------------------