├── etc
│   ├── myHadoop.png
│   ├── hdfs-site.xml
│   ├── core-site.xml
│   └── mapred-site.xml
├── docs
│   ├── myHadoop_0.2a.pdf
│   └── myHadoop_white_paper.pdf
├── bin
│   ├── setenv.sh
│   ├── sge-cleanup.sh
│   ├── pbs-cleanup.sh
│   ├── sge-configure.sh
│   └── pbs-configure.sh
├── RELEASE_NOTES
├── CHANGELOG
├── README.md
├── LICENSE
├── sge-example.sh
└── pbs-example.sh

/etc/myHadoop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/etc/myHadoop.png
--------------------------------------------------------------------------------
/docs/myHadoop_0.2a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/docs/myHadoop_0.2a.pdf
--------------------------------------------------------------------------------
/docs/myHadoop_white_paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/docs/myHadoop_white_paper.pdf
--------------------------------------------------------------------------------
/bin/setenv.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set this to the location of myHadoop
export MY_HADOOP_HOME="/home/srkrishnan/Software/myHadoop"

# Set this to the location of the Hadoop installation
export HADOOP_HOME="/home/srkrishnan/Software/hadoop-0.20.2"

# Set this to the location you want to use for HDFS
# Note that this path should point to a LOCAL directory, and
# that the path should exist on all slave nodes
export HADOOP_DATA_DIR="/state/partition1/hadoop-$USER/data"

# Set this to the location where you want the Hadoop log files
export HADOOP_LOG_DIR="/state/partition1/hadoop-$USER/log"

--------------------------------------------------------------------------------
/RELEASE_NOTES:
--------------------------------------------------------------------------------
Version 0.2a
------------

We are pleased to announce the release of version 0.2a of myHadoop, which
enables the use of Apache Hadoop in a non-dedicated cluster environment
administered by a typical batch scheduler.

This release adds to the 0.1 release as follows:
* Support for the Sun Grid Engine (SGE)
* Updated documentation for SGE and PBS

Please feel free to send questions/comments/concerns to the myHadoop
mailing list at myhadoop-users@lists.sourceforge.net. You may also send
individual private comments to Sriram Krishnan at sriram@sdsc.edu.

Sincerely,
Sriram
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
Version 0.2a
------------
Version 0.2a adds to the 0.1a release as follows:
* Support for the Sun Grid Engine (SGE)
* Updated documentation for SGE and PBS

Version 0.1a
------------

We are pleased to announce the release of version 0.1a of myHadoop, which
enables the use of Apache Hadoop in a non-dedicated cluster environment
administered by a typical batch scheduler.

This release supports the following features:
* Provisioning of on-demand Hadoop clusters using PBS (Moab)
* Ability to configure Hadoop in "persistent" and "non-persistent" modes
* Ability to run Hadoop in regular user mode, without needing any root privileges
* Ability to tune the various configuration parameters for Hadoop

--------------------------------------------------------------------------------
/etc/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>dfs.replication</name>
    <value>2</value>
    <description>Default block replication.
    The actual number of replications can be specified when the file is created.
    The default is used if replication is not specified at create time.
    </description>
  </property>

  <property>
    <name>dfs.block.size</name>
    <value>134217728</value>
    <description>Set to the number suggested as best practice</description>
  </property>

  <property>
    <name>dfs.datanode.handler.count</name>
    <value>64</value>
    <description>Number of handlers recommended to serve block requests -
    recommended by Cloudera for big hardware</description>
  </property>

</configuration>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![NSF-0844530](https://img.shields.io/badge/NSF-0844530-blue.svg)](https://nsf.gov/awardsearch/showAward?AWD_ID=0844530)


*UPDATE*
myHadoop is no longer being updated here. Originally written by Sriram Krishnan
at SDSC, it is currently being maintained at https://github.com/glennklockwood/myhadoop/

myHadoop
--------
myHadoop enables the use of Hadoop in a non-dedicated cluster environment
administered by a typical batch scheduler. We currently support the
PBS (Moab) and Sun Grid Engine (SGE) schedulers, although a port to
a scheduler such as Condor would be trivial.

The pbs-example.sh and sge-example.sh scripts provide examples of how to use
myHadoop with PBS and SGE respectively; a minimal submission sketch follows
this file. For more details, please read the documentation in the "docs"
directory.

Pre-requisites
--------------
myHadoop needs Apache Hadoop 0.20.2 and a batch scheduler such as PBS.

Other
-----
Work was funded by a grant from the NSF Cluster Exploratory (CluE) program
(Award# IIS-0844530, PI: Baru, Co-PI: Krishnan).
More info:
http://nsf.gov/awardsearch/showAward?AWD_ID=0844530
--------------------------------------------------------------------------------
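
A minimal sketch of submitting the example job scripts referenced in the README
above, assuming the scheduler client tools are on the PATH and that
bin/setenv.sh and the example scripts have been edited for the local
installation:

    # PBS (Moab)
    qsub pbs-example.sh

    # Sun Grid Engine
    qsub sge-example.sh

    # check job status on either scheduler
    qstat
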
/bin/sge-cleanup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

function print_usage {
    echo "Usage: -n NODES -h"
    echo "  -n: Number of nodes requested for the Hadoop installation"
    echo "  -h: Print help"
}

# initialize arguments
NODES=""

# parse arguments
args=`getopt n:h $*`
if test $? != 0
then
    print_usage
    exit 1
fi
set -- $args
for i
do
    case "$i" in
        -n) shift;
            NODES=$1
            shift;;

        -h) shift;
            print_usage
            exit 0
    esac
done

if [ "$NODES" != "" ]; then
    echo "Number of Hadoop nodes specified by user: $NODES"
else
    echo "Required parameter not set - number of nodes (-n)"
    print_usage
    exit 1
fi

# get the number of nodes from SGE
if [ "$NODES" != "$NSLOTS" ]; then
    echo "Number of nodes received from SGE not the same as number of nodes requested by user"
    exit 1
fi

# clean up working directories for N-node Hadoop cluster
for ((i=1; i<=$NODES; i++))
do
    node=`awk 'NR=='"$i"'{print $1;exit}' $PE_HOSTFILE`
    echo "Clean up node: $node"
    cmd="rm -rf $HADOOP_DATA_DIR $HADOOP_LOG_DIR"
    echo $cmd
    ssh $node $cmd
done
--------------------------------------------------------------------------------
/etc/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>hadoop.tmp.dir</name>
    <value>HADOOP_DATA_DIR</value>
    <description>A base for other temporary directories.</description>
  </property>

  <property>
    <name>fs.default.name</name>
    <value>hdfs://MASTER:54310</value>
    <description>The name of the default file system. A URI whose
    scheme and authority determine the FileSystem implementation. The
    uri's scheme determines the config property (fs.SCHEME.impl) naming
    the FileSystem implementation class. The uri's authority is used to
    determine the host, port, etc. for a filesystem.</description>
  </property>

  <property>
    <name>io.file.buffer.size</name>
    <value>131072</value>
    <description>Size of read/write buffer</description>
  </property>

  <property>
    <name>fs.inmemory.size.mb</name>
    <value>650</value>
    <description>Larger amount of memory allocated for the in-memory
    file-system used to merge map-outputs at the reduces.</description>
  </property>

  <property>
    <name>io.sort.mb</name>
    <value>650</value>
    <description>Higher memory-limit while sorting data.</description>
  </property>

</configuration>
--------------------------------------------------------------------------------
/bin/pbs-cleanup.sh:
--------------------------------------------------------------------------------
#!/bin/bash

function print_usage {
    echo "Usage: -n NODES -h"
    echo "  -n: Number of nodes requested for the Hadoop installation"
    echo "  -h: Print help"
}

# initialize arguments
NODES=""

# parse arguments
args=`getopt n:h $*`
if test $? != 0
then
    print_usage
    exit 1
fi
set -- $args
for i
do
    case "$i" in
        -n) shift;
            NODES=$1
            shift;;

        -h) shift;
            print_usage
            exit 0
    esac
done

if [ "$NODES" != "" ]; then
    echo "Number of Hadoop nodes specified by user: $NODES"
else
    echo "Required parameter not set - number of nodes (-n)"
    print_usage
    exit 1
fi

# get the number of nodes from PBS
if [ -e $PBS_NODEFILE ]; then
    pbsNodes=`awk 'END { print NR }' $PBS_NODEFILE`
    echo "Received $pbsNodes nodes from PBS"

    if [ "$NODES" != "$pbsNodes" ]; then
        echo "Number of nodes received from PBS not the same as number of nodes requested by user"
        exit 1
    fi
else
    echo "PBS_NODEFILE is unavailable"
    exit 1
fi

# clean up working directories for N-node Hadoop cluster
for ((i=1; i<=$NODES; i++))
do
    node=`awk 'NR=='"$i"'{print;exit}' $PBS_NODEFILE`
    echo "Clean up node: $node"
    cmd="rm -rf $HADOOP_DATA_DIR $HADOOP_LOG_DIR"
    echo $cmd
    ssh $node $cmd
done
--------------------------------------------------------------------------------
/etc/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>mapred.job.tracker</name>
    <value>MASTER:54311</value>
    <description>The host and port that the MapReduce job tracker runs
    at. If "local", then jobs are run in-process as a single map
    and reduce task.</description>
  </property>

  <property>
    <name>mapred.reduce.parallel.copies</name>
    <value>4</value>
    <description>Number of parallel copies run by reduces - set to total
    number of nodes in the system</description>
  </property>

  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>4</value>
    <description>Maximum map tasks to be run simultaneously.</description>
  </property>

  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>2</value>
    <description>Maximum reduce tasks to be run simultaneously.</description>
  </property>

  <property>
    <name>mapred.job.reuse.jvm.num.tasks</name>
    <value>-1</value>
    <description>Reuse the JVM between tasks</description>
  </property>

  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx1024m</value>
    <description>Large heap-size for child JVMs of maps/reduces</description>
  </property>

</configuration>
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
COPYRIGHT AND LICENSE

Copyright (c) 2005-2012 The Regents of the University of California.
All rights reserved.

Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the following
conditions are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.

3. All advertising materials mentioning features or use of this
software must display the following acknowledgement: This product
includes software developed by the San Diego Supercomputer Center.

4. Neither the names of the Centers nor the names of the contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/sge-example.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#$ -V -cwd
#$ -N test
#$ -pe hadoop 4
#$ -o hadoop_test.out
#$ -e hadoop_test.err
#$ -S /bin/bash

### Run the myHadoop environment script to set the appropriate variables
#
# Note: ensure that the variables are set correctly in bin/setenv.sh
source /home/sriram/Software/myHadoop-core/bin/setenv.sh

#### Set this to the directory where Hadoop configs should be generated
# Don't change the name of this variable (HADOOP_CONF_DIR) as it is
# required by Hadoop - all config files will be picked up from here
#
# Make sure that this is accessible to all nodes
export HADOOP_CONF_DIR="/home/sriram/Software/myHadoop-core/config"

#### Set up the configuration
# Make sure the number of nodes is the same as what you have requested from SGE
# usage: $MY_HADOOP_HOME/bin/sge-configure.sh -h
echo "Set up the configurations for myHadoop"
# this is the non-persistent mode
$MY_HADOOP_HOME/bin/sge-configure.sh -n 4 -c $HADOOP_CONF_DIR
# this is the persistent mode
# $MY_HADOOP_HOME/bin/sge-configure.sh -n 4 -c $HADOOP_CONF_DIR -p -d /oasis/cloudstor-group/HDFS
echo

#### Format HDFS, if this is the first time or not a persistent instance
echo "Format HDFS"
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR namenode -format
echo

#### Start the Hadoop cluster
echo "Start all Hadoop daemons"
$HADOOP_HOME/bin/start-all.sh
# $HADOOP_HOME/bin/hadoop dfsadmin -safemode leave
echo

#### Run your jobs here
echo "Run some test Hadoop jobs"
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -mkdir Data
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyFromLocal /home/sriram/Data/gutenberg Data
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Data/gutenberg
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR jar $HADOOP_HOME/hadoop-0.20.2-examples.jar wordcount Data/gutenberg Outputs
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Outputs
echo

#### Stop the Hadoop cluster
echo "Stop all Hadoop daemons"
$HADOOP_HOME/bin/stop-all.sh
echo

#### Clean up the working directories after job completion
echo "Clean up"
$MY_HADOOP_HOME/bin/sge-cleanup.sh -n 4
echo
--------------------------------------------------------------------------------
/pbs-example.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#PBS -q batch
#PBS -N hadoop_job
#PBS -l nodes=4:ppn=1
#PBS -o hadoop_run.out
#PBS -e hadoop_run.err
#PBS -A baru-tro
#PBS -V

### Run the myHadoop environment script to set the appropriate variables
#
# Note: ensure that the variables are set correctly in bin/setenv.sh
. /home/srkrishnan/Software/myHadoop/bin/setenv.sh

#### Set this to the directory where Hadoop configs should be generated
# Don't change the name of this variable (HADOOP_CONF_DIR) as it is
# required by Hadoop - all config files will be picked up from here
#
# Make sure that this is accessible to all nodes
export HADOOP_CONF_DIR="/home/srkrishnan/Software/myHadoop/config"

#### Set up the configuration
# Make sure the number of nodes is the same as what you have requested from PBS
# usage: $MY_HADOOP_HOME/bin/pbs-configure.sh -h
echo "Set up the configurations for myHadoop"
# this is the non-persistent mode
$MY_HADOOP_HOME/bin/pbs-configure.sh -n 4 -c $HADOOP_CONF_DIR
# this is the persistent mode
# $MY_HADOOP_HOME/bin/pbs-configure.sh -n 4 -c $HADOOP_CONF_DIR -p -d /oasis/cloudstor-group/HDFS
echo

#### Format HDFS, if this is the first time or not a persistent instance
echo "Format HDFS"
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR namenode -format
echo

#### Start the Hadoop cluster
echo "Start all Hadoop daemons"
$HADOOP_HOME/bin/start-all.sh
# $HADOOP_HOME/bin/hadoop dfsadmin -safemode leave
echo

#### Run your jobs here
echo "Run some test Hadoop jobs"
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -mkdir Data
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyFromLocal /home/srkrishnan/Data/gutenberg Data
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Data/gutenberg
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR jar $HADOOP_HOME/hadoop-0.20.2-examples.jar wordcount Data/gutenberg Outputs
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Outputs
echo

#### Stop the Hadoop cluster
echo "Stop all Hadoop daemons"
$HADOOP_HOME/bin/stop-all.sh
echo

#### Clean up the working directories after job completion
echo "Clean up"
$MY_HADOOP_HOME/bin/pbs-cleanup.sh -n 4
echo
--------------------------------------------------------------------------------
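
Note: both example scripts run in non-persistent mode, and the cleanup scripts
remove HADOOP_DATA_DIR on every node, so any output left in HDFS is gone once
the job finishes. A minimal sketch of copying the wordcount results to a local
path before the "Stop all Hadoop daemons" step (the destination directory is
illustrative):

    #### Copy results out of HDFS before stopping the daemons
    $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyToLocal Outputs /home/$USER/wordcount-results

Alternatively, the commented-out "persistent mode" configure line keeps HDFS
state under the -d base directory across jobs.
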
/bin/sge-configure.sh:
--------------------------------------------------------------------------------
#!/bin/bash

function print_usage {
    echo "Usage: -n NODES -p -d BASE_DIR -c CONFIG_DIR -h"
    echo "  -n: Number of nodes requested for the Hadoop installation"
    echo "  -p: Whether the Hadoop installation should be persistent"
    echo "      If so, data directories will have to be linked to a"
    echo "      directory that is not local to enable persistence"
    echo "  -d: Base directory to persist HDFS state, to be used if"
    echo "      -p is set"
    echo "  -c: The directory to generate Hadoop configs in"
    echo "  -h: Print help"
}

# initialize arguments
NODES=""
PERSIST="false"
BASE_DIR=""
CONFIG_DIR=""

# parse arguments
args=`getopt n:pd:c:h $*`
if test $? != 0
then
    print_usage
    exit 1
fi
set -- $args
for i
do
    case "$i" in
        -n) shift;
            NODES=$1
            shift;;

        -d) shift;
            BASE_DIR=$1
            shift;;

        -c) shift;
            CONFIG_DIR=$1
            shift;;

        -p) shift;
            PERSIST="true"
            ;;

        -h) shift;
            print_usage
            exit 0
    esac
done

if [ "$NODES" != "" ]; then
    echo "Number of Hadoop nodes requested: $NODES"
else
    echo "Required parameter not set - number of nodes (-n)"
    print_usage
    exit 1
fi

if [ "$CONFIG_DIR" != "" ]; then
    echo "Generating Hadoop configuration in directory: $CONFIG_DIR"
else
    echo "Location of configuration directory not specified"
    print_usage
    exit 1
fi

if [ "$PERSIST" = "true" ]; then
    echo "Persisting HDFS state (-p)"
    if [ "$BASE_DIR" = "" ]; then
        echo "Base directory (-d) not set for persisting HDFS state"
        print_usage
        exit 1
    else
        echo "Using directory $BASE_DIR for persisting HDFS state"
    fi
else
    echo "Not persisting HDFS state"
fi

# get the number of nodes from SGE
if [ "$NODES" != "$NSLOTS" ]; then
    echo "Number of nodes received from SGE not the same as number of nodes requested by user"
    exit 1
fi

# create the config, data, and log directories
rm -rf $CONFIG_DIR
mkdir -p $CONFIG_DIR

# first copy over all default Hadoop configs
cp $HADOOP_HOME/conf/* $CONFIG_DIR

# pick the master node as the first node in the PE_HOSTFILE
MASTER_NODE=`awk 'NR==1{print $1;exit}' $PE_HOSTFILE`
echo "Master is: $MASTER_NODE"
echo $MASTER_NODE > $CONFIG_DIR/masters

# every node in the PE_HOSTFILE is a slave
awk < $PE_HOSTFILE '{print $1}' > $CONFIG_DIR/slaves

# update the hdfs and mapred configs
sed 's/.*:/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/mapred-site.xml > $CONFIG_DIR/mapred-site.xml
sed 's/hdfs:\/\/.*:/hdfs:\/\/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/core-site.xml > $CONFIG_DIR/core-site.xml
sed -i 's:HADOOP_DATA_DIR:'"$HADOOP_DATA_DIR"':g' $CONFIG_DIR/core-site.xml
cp $MY_HADOOP_HOME/etc/hdfs-site.xml $CONFIG_DIR/

# update the HADOOP log directory
echo "" >> $CONFIG_DIR/hadoop-env.sh
echo "# Overwrite location of the log directory" >> $CONFIG_DIR/hadoop-env.sh
echo "export HADOOP_LOG_DIR=$HADOOP_LOG_DIR" >> $CONFIG_DIR/hadoop-env.sh

# set the HADOOP_HEAPSIZE to 4GB
# echo "" >> $CONFIG_DIR/hadoop-env.sh
# echo "# Set the HADOOP_HEAPSIZE to 4GB" >> $CONFIG_DIR/hadoop-env.sh
# echo "export HADOOP_HEAPSIZE=4096" >> $CONFIG_DIR/hadoop-env.sh

# JVM settings
# echo "" >> $CONFIG_DIR/hadoop-env.sh
# echo "# JVM settings for Hadoop" >> $CONFIG_DIR/hadoop-env.sh
# echo "export HADOOP_OPTS=\"-server -XX:+UseParallelGC -XX:ParallelGCThreads=4 -XX:+AggressiveHeap -XX:+HeapDumpOnOutOfMemoryError\"" >> $CONFIG_DIR/hadoop-env.sh

# create or link HADOOP_{DATA,LOG}_DIR on all slaves
for ((i=1; i<=$NODES; i++))
do
    node=`awk 'NR=='"$i"'{print $1;exit}' $PE_HOSTFILE`
    echo "Configuring node: $node"
    cmd="rm -rf $HADOOP_LOG_DIR; mkdir -p $HADOOP_LOG_DIR"
    echo $cmd
    ssh $node $cmd
    if [ "$PERSIST" = "true" ]; then
        cmd="rm -rf $HADOOP_DATA_DIR; ln -s $BASE_DIR/$i $HADOOP_DATA_DIR"
        echo $cmd
        ssh $node $cmd
    else
        cmd="rm -rf $HADOOP_DATA_DIR; mkdir -p $HADOOP_DATA_DIR"
        echo $cmd
        ssh $node $cmd
    fi
done
--------------------------------------------------------------------------------
/bin/pbs-configure.sh:
--------------------------------------------------------------------------------
#!/bin/bash

function print_usage {
    echo "Usage: -n NODES -p -d BASE_DIR -c CONFIG_DIR -h"
    echo "  -n: Number of nodes requested for the Hadoop installation"
    echo "  -p: Whether the Hadoop installation should be persistent"
    echo "      If so, data directories will have to be linked to a"
    echo "      directory that is not local to enable persistence"
    echo "  -d: Base directory to persist HDFS state, to be used if"
    echo "      -p is set"
    echo "  -c: The directory to generate Hadoop configs in"
    echo "  -h: Print help"
}

# initialize arguments
NODES=""
PERSIST="false"
BASE_DIR=""
CONFIG_DIR=""

# parse arguments
args=`getopt n:pd:c:h $*`
if test $? != 0
then
    print_usage
    exit 1
fi
set -- $args
for i
do
    case "$i" in
        -n) shift;
            NODES=$1
            shift;;

        -d) shift;
            BASE_DIR=$1
            shift;;

        -c) shift;
            CONFIG_DIR=$1
            shift;;

        -p) shift;
            PERSIST="true"
            ;;

        -h) shift;
            print_usage
            exit 0
    esac
done

if [ "$NODES" != "" ]; then
    echo "Number of Hadoop nodes requested: $NODES"
else
    echo "Required parameter not set - number of nodes (-n)"
    print_usage
    exit 1
fi

if [ "$CONFIG_DIR" != "" ]; then
    echo "Generating Hadoop configuration in directory: $CONFIG_DIR"
else
    echo "Location of configuration directory not specified"
    print_usage
    exit 1
fi

if [ "$PERSIST" = "true" ]; then
    echo "Persisting HDFS state (-p)"
    if [ "$BASE_DIR" = "" ]; then
        echo "Base directory (-d) not set for persisting HDFS state"
        print_usage
        exit 1
    else
        echo "Using directory $BASE_DIR for persisting HDFS state"
    fi
else
    echo "Not persisting HDFS state"
fi

# get the number of nodes from PBS
if [ -e $PBS_NODEFILE ]; then
    PBS_NODES=`awk 'END { print NR }' $PBS_NODEFILE`
    echo "Received $PBS_NODES nodes from PBS"

    if [ "$NODES" != "$PBS_NODES" ]; then
        echo "Number of nodes received from PBS not the same as number of nodes requested by user"
        exit 1
    fi
else
    echo "PBS_NODEFILE is unavailable"
    exit 1
fi

# create the config, data, and log directories
rm -rf $CONFIG_DIR
mkdir -p $CONFIG_DIR

# first copy over all default Hadoop configs
cp $HADOOP_HOME/conf/* $CONFIG_DIR

# pick the master node as the first node in the PBS_NODEFILE
MASTER_NODE=`awk 'NR==1{print;exit}' $PBS_NODEFILE`
echo "Master is: $MASTER_NODE"
echo $MASTER_NODE > $CONFIG_DIR/masters

# every node in the PBS_NODEFILE is a slave
cat $PBS_NODEFILE > $CONFIG_DIR/slaves

# update the hdfs and mapred configs
sed 's/.*:/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/mapred-site.xml > $CONFIG_DIR/mapred-site.xml
sed 's/hdfs:\/\/.*:/hdfs:\/\/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/core-site.xml > $CONFIG_DIR/core-site.xml
sed -i 's:HADOOP_DATA_DIR:'"$HADOOP_DATA_DIR"':g' $CONFIG_DIR/core-site.xml
cp $MY_HADOOP_HOME/etc/hdfs-site.xml $CONFIG_DIR/

# update the HADOOP log directory
echo "" >> $CONFIG_DIR/hadoop-env.sh
echo "# Overwrite location of the log directory" >> $CONFIG_DIR/hadoop-env.sh
echo "export HADOOP_LOG_DIR=$HADOOP_LOG_DIR" >> $CONFIG_DIR/hadoop-env.sh

# set the HADOOP_HEAPSIZE to 4GB
# echo "" >> $CONFIG_DIR/hadoop-env.sh
# echo "# Set the HADOOP_HEAPSIZE to 4GB" >> $CONFIG_DIR/hadoop-env.sh
# echo "export HADOOP_HEAPSIZE=4096" >> $CONFIG_DIR/hadoop-env.sh

# JVM settings
# echo "" >> $CONFIG_DIR/hadoop-env.sh
# echo "# JVM settings for Hadoop" >> $CONFIG_DIR/hadoop-env.sh
# echo "export HADOOP_OPTS=\"-server -XX:+UseParallelGC -XX:ParallelGCThreads=4 -XX:+AggressiveHeap -XX:+HeapDumpOnOutOfMemoryError\"" >> $CONFIG_DIR/hadoop-env.sh

# create or link HADOOP_{DATA,LOG}_DIR on all slaves
for ((i=1; i<=$NODES; i++))
do
    node=`awk 'NR=='"$i"'{print;exit}' $PBS_NODEFILE`
    echo "Configuring node: $node"
    cmd="rm -rf $HADOOP_LOG_DIR; mkdir -p $HADOOP_LOG_DIR"
    echo $cmd
    ssh $node $cmd
    if [ "$PERSIST" = "true" ]; then
        cmd="rm -rf $HADOOP_DATA_DIR; ln -s $BASE_DIR/$i $HADOOP_DATA_DIR"
        echo $cmd
        ssh $node $cmd
    else
        cmd="rm -rf $HADOOP_DATA_DIR; mkdir -p $HADOOP_DATA_DIR"
        echo $cmd
        ssh $node $cmd
    fi
done
--------------------------------------------------------------------------------
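
After either configure script has run, the generated configuration can be
sanity-checked before the Hadoop daemons are started. A minimal sketch,
assuming GNU grep and the HADOOP_CONF_DIR used in the example scripts:

    # the first node in the hostfile should be in masters; every node in slaves
    cat $HADOOP_CONF_DIR/masters $HADOOP_CONF_DIR/slaves

    # fs.default.name should now point at the master node, and hadoop.tmp.dir
    # at the substituted HADOOP_DATA_DIR
    grep -A 1 'fs.default.name\|hadoop.tmp.dir' $HADOOP_CONF_DIR/core-site.xml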