├── etc
│   ├── myHadoop.png
│   ├── hdfs-site.xml
│   ├── core-site.xml
│   └── mapred-site.xml
├── docs
│   ├── myHadoop_0.2a.pdf
│   └── myHadoop_white_paper.pdf
├── bin
│   ├── setenv.sh
│   ├── sge-cleanup.sh
│   ├── pbs-cleanup.sh
│   ├── sge-configure.sh
│   └── pbs-configure.sh
├── RELEASE_NOTES
├── CHANGELOG
├── README.md
├── LICENSE
├── sge-example.sh
└── pbs-example.sh
/etc/myHadoop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/etc/myHadoop.png
--------------------------------------------------------------------------------
/docs/myHadoop_0.2a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/docs/myHadoop_0.2a.pdf
--------------------------------------------------------------------------------
/docs/myHadoop_white_paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenTopography/myHadoop/main/docs/myHadoop_white_paper.pdf
--------------------------------------------------------------------------------
/bin/setenv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Set this to location of myHadoop
4 | export MY_HADOOP_HOME="/home/srkrishnan/Software/myHadoop"
5 |
6 | # Set this to the location of the Hadoop installation
7 | export HADOOP_HOME="/home/srkrishnan/Software/hadoop-0.20.2"
8 |
9 | # Set this to the location you want to use for HDFS
10 | # Note that this path should point to a LOCAL directory, and
11 | # that the path should exist on all slave nodes
12 | export HADOOP_DATA_DIR="/state/partition1/hadoop-$USER/data"
13 |
14 | # Set this to the location where you want the Hadoop logfiles
15 | export HADOOP_LOG_DIR="/state/partition1/hadoop-$USER/log"
16 |
17 |
--------------------------------------------------------------------------------
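The comments in setenv.sh stress that HADOOP_DATA_DIR must be a local path that is available on every slave node. Below is a minimal pre-flight sketch, not part of myHadoop, that checks this from inside a PBS allocation; it assumes passwordless ssh between nodes (which the bundled scripts already rely on) and uses the /state/partition1 scratch partition from the defaults above.

#!/bin/bash
# Hypothetical pre-flight check: confirm the local scratch partition backing
# HADOOP_DATA_DIR is visible on every node in the PBS allocation.
source $MY_HADOOP_HOME/bin/setenv.sh

for node in $(sort -u "$PBS_NODEFILE"); do
    ssh "$node" "df -h /state/partition1" > /dev/null \
        || echo "WARNING: /state/partition1 not available on $node"
done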
/RELEASE_NOTES:
--------------------------------------------------------------------------------
1 | Version 0.2a
2 | ------------
3 |
4 | We are pleased to announce the release of version 0.2a of myHadoop, which
5 | enables the use of Apache Hadoop in a non-dedicated cluster environment
6 | administered by a typical batch scheduler.
7 |
8 | This release adds to the 0.1 release as follows:
9 | * Support for the Sun Grid Engine (SGE)
10 | * Updated documentation for SGE and PBS
11 |
12 | Please feel free to send questions/comments/concerns to the myHadoop
13 | mailing list at myhadoop-users@lists.sourceforge.net. You may also send
14 | individual private comments to Sriram Krishnan at sriram@sdsc.edu.
15 |
16 | Sincerely,
17 | Sriram
18 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | Version 0.2a
2 | ------------
3 | Version 0.2a adds to the 0.1a release as follows:
4 | * Support for the Sun Grid Engine (SGE)
5 | * Updated documentation for SGE and PBS
6 |
7 | Version 0.1a
8 | ------------
9 |
10 | We are pleased to announce the release of version 0.1a of myHadoop, which
11 | enables the use of Apache Hadoop in a non-dedicated cluster environment
12 | administered by a typical batch scheduler.
13 |
14 | This release supports the following features:
15 | * Provisioning of on-demand Hadoop clusters using PBS (Moab)
16 | * Ability to configure Hadoop in "persistent" and "non-persistent" modes
17 | * Ability to run Hadoop in regular user mode, without needing any root privileges
18 | * Ability to tune the various configuration parameters for Hadoop.
19 |
20 |
--------------------------------------------------------------------------------
/etc/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 |   <property>
9 |     <name>dfs.replication</name>
10 |     <value>2</value>
11 |     <description>Default block replication.
12 |     The actual number of replications can be specified when the file is created.
13 |     The default is used if replication is not specified at create time.
14 |     </description>
15 |   </property>
16 |
17 |   <property>
18 |     <name>dfs.block.size</name>
19 |     <value>134217728</value>
20 |     <description>Set to the number suggested as best practice</description>
21 |   </property>
22 |
23 |   <property>
24 |     <name>dfs.datanode.handler.count</name>
25 |     <value>64</value>
26 |     <description>Number of handlers to serve block requests - recommended by Cloudera for large hardware</description>
27 |   </property>
28 |
29 | </configuration>
--------------------------------------------------------------------------------
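These values are written verbatim into the generated configuration; once the daemons are up they can be sanity-checked with the standard Hadoop 0.20 tools. A hedged sketch, assuming HADOOP_CONF_DIR points at the configuration generated by pbs-configure.sh or sge-configure.sh and that some data has already been loaded:

# Report registered datanodes and their capacity
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfsadmin -report

# Show per-file block counts and replication for data already in HDFS
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR fsck /user/$USER -files -blocks

# dfs.block.size above is 128 MB expressed in bytes
echo $((128 * 1024 * 1024))   # 134217728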
/README.md:
--------------------------------------------------------------------------------
1 | [NSF Award #0844530](https://nsf.gov/awardsearch/showAward?AWD_ID=0844530)
2 |
3 |
4 | *UPDATE*
5 | myHadoop is no longer being updated here. Originally written by Sriram Krishnan at SDSC, it is currently being maintained at https://github.com/glennklockwood/myhadoop/
6 |
7 | myHadoop
8 | --------
9 | myHadoop enables the use of Hadoop in a non-dedicated cluster environment
10 | administered by a typical batch scheduler. We currently support the
11 | PBS (Moab) and Sun Grid Engine (SGE) schedulers, although a port to
12 | another scheduler such as Condor would be trivial.
13 |
14 | The pbs-example.sh and sge-example.sh scripts provide examples of how to use
15 | myHadoop with PBS and SGE, respectively. For more details, please read the
16 | documentation in the "docs" directory.
17 |
18 | Pre-requisites
19 | --------------
20 | myHadoop needs Apache Hadoop 0.20.2 and a batch scheduler such as PBS or SGE.
21 |
22 | Other
23 | -----
24 | This work was funded by a grant from the NSF Cluster Exploratory (CluE) program (Award #IIS-0844530; PI: Baru, Co-PI: Krishnan).
25 | More info:
26 | http://nsf.gov/awardsearch/showAward?AWD_ID=0844530
27 |
--------------------------------------------------------------------------------
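A hedged sketch of how the two example scripts are typically submitted; queue names, accounts, and the "hadoop" parallel environment requested by sge-example.sh are site-specific assumptions.

# PBS / Torque: resources come from the #PBS directives in the script
qsub pbs-example.sh

# Sun Grid Engine: the script requests 4 slots via "#$ -pe hadoop 4"
qsub sge-example.sh

# Monitor the job, then read its stdout once it completes
qstat -u $USER
cat hadoop_run.out     # PBS example output file
cat hadoop_test.out    # SGE example output file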
/bin/sge-cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function print_usage {
4 | echo "Usage: -n NODES -h"
5 | echo " -n: Number of nodes requested for the Hadoop installation"
6 | echo " -h: Print help"
7 | }
8 |
9 | # initialize arguments
10 | NODES=""
11 |
12 | # parse arguments
13 | args=`getopt n:h $*`
14 | if test $? != 0
15 | then
16 | print_usage
17 | exit 1
18 | fi
19 | set -- $args
20 | for i
21 | do
22 | case "$i" in
23 | -n) shift;
24 | NODES=$1
25 | shift;;
26 |
27 | -h) shift;
28 | print_usage
29 | exit 0
30 | esac
31 | done
32 |
33 | if [ "$NODES" != "" ]; then
34 | echo "Number of Hadoop nodes specified by user: $NODES"
35 | else
36 | echo "Required parameter not set - number of nodes (-n)"
37 | print_usage
38 | exit 1
39 | fi
40 |
41 | # get the number of nodes from SGE
42 | if [ "$NODES" != "$NSLOTS" ]; then
43 | echo "Number of nodes received from SGE not the same as number of nodes requested by user"
44 | exit 1
45 | fi
46 |
47 | # clean up working directories for N-node Hadoop cluster
48 | for ((i=1; i<=$NODES; i++))
49 | do
50 | node=`awk 'NR=='"$i"'{print $1;exit}' $PE_HOSTFILE`
51 | echo "Clean up node: $node"
52 | cmd="rm -rf $HADOOP_DATA_DIR $HADOOP_LOG_DIR"
53 | echo $cmd
54 | ssh $node $cmd
55 | done
56 |
--------------------------------------------------------------------------------
/etc/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 |   <property>
9 |     <name>hadoop.tmp.dir</name>
10 |     <value>HADOOP_DATA_DIR</value>
11 |     <description>A base for other temporary directories.</description>
12 |   </property>
13 |
14 |   <property>
15 |     <name>fs.default.name</name>
16 |     <value>hdfs://MASTER:54310</value>
17 |     <description>The name of the default file system. A URI whose
18 |     scheme and authority determine the FileSystem implementation. The
19 |     uri's scheme determines the config property (fs.SCHEME.impl) naming
20 |     the FileSystem implementation class. The uri's authority is used to
21 |     determine the host, port, etc. for a filesystem.
22 |     </description>
23 |   </property>
24 |
25 |   <property>
26 |     <name>io.file.buffer.size</name>
27 |     <value>131072</value>
28 |     <description>Size of read/write buffer</description>
29 |   </property>
30 |
31 |   <property>
32 |     <name>fs.inmemory.size.mb</name>
33 |     <value>650</value>
34 |     <description>Larger amount of memory allocated for the in-memory file-system used to merge map-outputs at the reduces.</description>
35 |   </property>
36 |
37 |   <property>
38 |     <name>io.sort.mb</name>
39 |     <value>650</value>
40 |     <description>Higher memory-limit while sorting data.</description>
41 |   </property>
42 |
43 | </configuration>
--------------------------------------------------------------------------------
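MASTER and HADOOP_DATA_DIR above are placeholders; pbs-configure.sh and sge-configure.sh rewrite them with sed when the job starts. An illustration of that substitution, using a made-up host name ("compute-0-1") and the data path from setenv.sh:

MASTER_NODE=compute-0-1
HADOOP_DATA_DIR=/state/partition1/hadoop-$USER/data

sed 's/hdfs:\/\/.*:/hdfs:\/\/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/core-site.xml \
    | sed 's:HADOOP_DATA_DIR:'"$HADOOP_DATA_DIR"':g'
# produces, among other lines:
#   <value>/state/partition1/hadoop-<user>/data</value>
#   <value>hdfs://compute-0-1:54310</value>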
/bin/pbs-cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function print_usage {
4 | echo "Usage: -n NODES -h"
5 | echo " -n: Number of nodes requested for the Hadoop installation"
6 | echo " -h: Print help"
7 | }
8 |
9 | # initialize arguments
10 | NODES=""
11 |
12 | # parse arguments
13 | args=`getopt n:h $*`
14 | if test $? != 0
15 | then
16 | print_usage
17 | exit 1
18 | fi
19 | set -- $args
20 | for i
21 | do
22 | case "$i" in
23 | -n) shift;
24 | NODES=$1
25 | shift;;
26 |
27 | -h) shift;
28 | print_usage
29 | exit 0
30 | esac
31 | done
32 |
33 | if [ "$NODES" != "" ]; then
34 | echo "Number of Hadoop nodes specified by user: $NODES"
35 | else
36 | echo "Required parameter not set - number of nodes (-n)"
37 | print_usage
38 | exit 1
39 | fi
40 |
41 | # get the number of nodes from PBS
42 | if [ -e "$PBS_NODEFILE" ]; then
43 | pbsNodes=`awk 'END { print NR }' $PBS_NODEFILE`
44 | echo "Received $pbsNodes nodes from PBS"
45 |
46 | if [ "$NODES" != "$pbsNodes" ]; then
47 | echo "Number of nodes received from PBS not the same as number of nodes requested by user"
48 | exit 1
49 | fi
50 | else
51 | echo "PBS_NODEFILE is unavailable"
52 | exit 1
53 | fi
54 |
55 | # clean up working directories for N-node Hadoop cluster
56 | for ((i=1; i<=$NODES; i++))
57 | do
58 | node=`awk 'NR=='"$i"'{print;exit}' $PBS_NODEFILE`
59 | echo "Clean up node: $node"
60 | cmd="rm -rf $HADOOP_DATA_DIR $HADOOP_LOG_DIR"
61 | echo $cmd
62 | ssh $node $cmd
63 | done
64 |
--------------------------------------------------------------------------------
/etc/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 |   <property>
9 |     <name>mapred.job.tracker</name>
10 |     <value>MASTER:54311</value>
11 |     <description>The host and port that the MapReduce job tracker runs
12 |     at. If "local", then jobs are run in-process as a single map
13 |     and reduce task.
14 |     </description>
15 |   </property>
16 |
17 |   <property>
18 |     <name>mapred.reduce.parallel.copies</name>
19 |     <value>4</value>
20 |     <description>Number of parallel copies run by reduces - set to total number of nodes in the system</description>
21 |   </property>
22 |
23 |   <property>
24 |     <name>mapred.tasktracker.map.tasks.maximum</name>
25 |     <value>4</value>
26 |     <description>Maximum map tasks to be run simultaneously.</description>
27 |   </property>
28 |
29 |   <property>
30 |     <name>mapred.tasktracker.reduce.tasks.maximum</name>
31 |     <value>2</value>
32 |     <description>Maximum reduce tasks to be run simultaneously.</description>
33 |   </property>
34 |
35 |   <property>
36 |     <name>mapred.job.reuse.jvm.num.tasks</name>
37 |     <value>-1</value>
38 |     <description>Reuse the JVM between tasks</description>
39 |   </property>
40 |
41 |   <property>
42 |     <name>mapred.child.java.opts</name>
43 |     <value>-Xmx1024m</value>
44 |     <description>Large heap-size for child JVMs of maps/reduces</description>
45 |   </property>
46 |
47 | </configuration>
--------------------------------------------------------------------------------
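These are per-cluster defaults written once per job; individual jobs can still override them on the command line, since the 0.20.2 example driver accepts generic options. A hedged sketch with an illustrative reduce count:

# Override a mapred default for a single run instead of editing this file
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR \
    jar $HADOOP_HOME/hadoop-0.20.2-examples.jar wordcount \
    -D mapred.reduce.tasks=8 \
    Data/gutenberg Outputs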
/LICENSE:
--------------------------------------------------------------------------------
1 | COPYRIGHT AND LICENSE
2 |
3 | Copyright (c) 2005-2012 The Regents of the University of California.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or
7 | without modification, are permitted provided that the following
8 | conditions are met:
9 |
10 | 1. Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 |
13 | 2. Redistributions in binary form must reproduce the above
14 | copyright notice, this list of conditions and the following
15 | disclaimer in the documentation and/or other materials provided
16 | with the distribution.
17 |
18 | 3. All advertising materials mentioning features or use of this
19 | software must display the following acknowledgement: This product
20 | includes software developed by the San Diego Supercomputer Center.
21 |
22 | 4. Neither the names of the Centers nor the names of the contributors
23 | may be used to endorse or promote products derived from this
24 | software without specific prior written permission.
25 |
26 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
29 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS
30 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
33 | USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
34 | AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
36 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 | POSSIBILITY OF SUCH DAMAGE.
38 |
--------------------------------------------------------------------------------
/sge-example.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #$ -V -cwd
4 | #$ -N test
5 | #$ -pe hadoop 4
6 | #$ -o hadoop_test.out
7 | #$ -e hadoop_test.err
8 | #$ -S /bin/bash
9 |
10 | ### Run the myHadoop environment script to set the appropriate variables
11 | #
12 | # Note: ensure that the variables are set correctly in bin/setenv.sh
13 | source /home/sriram/Software/myHadoop-core/bin/setenv.sh
14 |
15 | #### Set this to the directory where Hadoop configs should be generated
16 | # Don't change the name of this variable (HADOOP_CONF_DIR) as it is
17 | # required by Hadoop - all config files will be picked up from here
18 | #
19 | # Make sure that this is accessible to all nodes
20 | export HADOOP_CONF_DIR="/home/sriram/Software/myHadoop-core/config"
21 |
22 | #### Set up the configuration
23 | # Make sure number of nodes is the same as what you have requested from SGE
24 | # usage: $MY_HADOOP_HOME/bin/sge-configure.sh -h
25 | echo "Set up the configurations for myHadoop"
26 | # this is the non-persistent mode
27 | $MY_HADOOP_HOME/bin/sge-configure.sh -n 4 -c $HADOOP_CONF_DIR
28 | # this is the persistent mode
29 | # $MY_HADOOP_HOME/bin/sge-configure.sh -n 4 -c $HADOOP_CONF_DIR -p -d /oasis/cloudstor-group/HDFS
30 | echo
31 |
32 | #### Format HDFS, if this is the first time or not a persistent instance
33 | echo "Format HDFS"
34 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR namenode -format
35 | echo
36 |
37 | #### Start the Hadoop cluster
38 | echo "Start all Hadoop daemons"
39 | $HADOOP_HOME/bin/start-all.sh
40 | # $HADOOP_HOME/bin/hadoop dfsadmin -safemode leave
41 | echo
42 |
43 | #### Run your jobs here
44 | echo "Run some test Hadoop jobs"
45 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -mkdir Data
46 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyFromLocal /home/sriram/Data/gutenberg Data
47 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Data/gutenberg
48 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR jar $HADOOP_HOME/hadoop-0.20.2-examples.jar wordcount Data/gutenberg Outputs
49 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Outputs
50 | echo
51 |
52 | #### Stop the Hadoop cluster
53 | echo "Stop all Hadoop daemons"
54 | $HADOOP_HOME/bin/stop-all.sh
55 | echo
56 |
57 | #### Clean up the working directories after job completion
58 | echo "Clean up"
59 | $MY_HADOOP_HOME/bin/sge-cleanup.sh -n 4
60 | echo
61 |
--------------------------------------------------------------------------------
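One step worth adding before stop-all.sh when running in non-persistent mode: sge-cleanup.sh removes the HDFS data directories, so results should be copied out of HDFS first. A minimal sketch; the destination directory is only an example.

# Copy job output from HDFS to the shared filesystem before the cluster
# is torn down (destination path is illustrative)
$HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyToLocal Outputs /home/sriram/Results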
/pbs-example.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #PBS -q batch
4 | #PBS -N hadoop_job
5 | #PBS -l nodes=4:ppn=1
6 | #PBS -o hadoop_run.out
7 | #PBS -e hadoop_run.err
8 | #PBS -A baru-tro
9 | #PBS -V
10 |
11 | ### Run the myHadoop environment script to set the appropriate variables
12 | #
13 | # Note: ensure that the variables are set correctly in bin/setenv.sh
14 | . /home/srkrishnan/Software/myHadoop/bin/setenv.sh
15 |
16 | #### Set this to the directory where Hadoop configs should be generated
17 | # Don't change the name of this variable (HADOOP_CONF_DIR) as it is
18 | # required by Hadoop - all config files will be picked up from here
19 | #
20 | # Make sure that this is accessible to all nodes
21 | export HADOOP_CONF_DIR="/home/srkrishnan/Software/myHadoop/config"
22 |
23 | #### Set up the configuration
24 | # Make sure number of nodes is the same as what you have requested from PBS
25 | # usage: $MY_HADOOP_HOME/bin/pbs-configure.sh -h
26 | echo "Set up the configurations for myHadoop"
27 | # this is the non-persistent mode
28 | $MY_HADOOP_HOME/bin/pbs-configure.sh -n 4 -c $HADOOP_CONF_DIR
29 | # this is the persistent mode
30 | # $MY_HADOOP_HOME/bin/pbs-configure.sh -n 4 -c $HADOOP_CONF_DIR -p -d /oasis/cloudstor-group/HDFS
31 | echo
32 |
33 | #### Format HDFS, if this is the first time or not a persistent instance
34 | echo "Format HDFS"
35 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR namenode -format
36 | echo
37 |
38 | #### Start the Hadoop cluster
39 | echo "Start all Hadoop daemons"
40 | $HADOOP_HOME/bin/start-all.sh
41 | #$HADOOP_HOME/bin/hadoop dfsadmin -safemode leave
42 | echo
43 |
44 | #### Run your jobs here
45 | echo "Run some test Hadoop jobs"
46 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -mkdir Data
47 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -copyFromLocal /home/srkrishnan/Data/gutenberg Data
48 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Data/gutenberg
49 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR jar $HADOOP_HOME/hadoop-0.20.2-examples.jar wordcount Data/gutenberg Outputs
50 | $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR dfs -ls Outputs
51 | echo
52 |
53 | #### Stop the Hadoop cluster
54 | echo "Stop all Hadoop daemons"
55 | $HADOOP_HOME/bin/stop-all.sh
56 | echo
57 |
58 | #### Clean up the working directories after job completion
59 | echo "Clean up"
60 | $MY_HADOOP_HOME/bin/pbs-cleanup.sh -n 4
61 | echo
62 |
--------------------------------------------------------------------------------
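For the commented-out persistent mode, HDFS state lives under the -d base directory on shared storage, so the namenode should only be formatted the first time that directory is used; reformatting would discard the persisted filesystem. A hedged sketch of that variant, where the base path is the example value from the script and the emptiness test is only a simple first-use heuristic:

BASE_DIR=/oasis/cloudstor-group/HDFS
$MY_HADOOP_HOME/bin/pbs-configure.sh -n 4 -c $HADOOP_CONF_DIR -p -d $BASE_DIR

# Format only if the persistent base directory has never been populated
if [ -z "$(ls -A $BASE_DIR 2>/dev/null)" ]; then
    $HADOOP_HOME/bin/hadoop --config $HADOOP_CONF_DIR namenode -format
fi

$HADOOP_HOME/bin/start-all.sh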
/bin/sge-configure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function print_usage {
4 | echo "Usage: -n NODES -p -d BASE_DIR -c CONFIG_DIR -h"
5 | echo " -n: Number of nodes requested for the Hadoop installation"
6 | echo " -p: Whether the Hadoop installation should be persistent"
7 | echo " If so, data directories will have to be linked to a"
8 | echo " directory that is not local to enable persistence"
9 | echo " -d: Base directory to persist HDFS state, to be used if"
10 | echo " -p is set"
11 | echo " -c: The directory to generate Hadoop configs in"
12 | echo " -h: Print help"
13 | }
14 |
15 | # initialize arguments
16 | NODES=""
17 | PERSIST="false"
18 | BASE_DIR=""
19 | CONFIG_DIR=""
20 |
21 | # parse arguments
22 | args=`getopt n:pd:c:h $*`
23 | if test $? != 0
24 | then
25 | print_usage
26 | exit 1
27 | fi
28 | set -- $args
29 | for i
30 | do
31 | case "$i" in
32 | -n) shift;
33 | NODES=$1
34 | shift;;
35 |
36 | -d) shift;
37 | BASE_DIR=$1
38 | shift;;
39 |
40 | -c) shift;
41 | CONFIG_DIR=$1
42 | shift;;
43 |
44 | -p) shift;
45 | PERSIST="true"
46 | ;;
47 |
48 | -h) shift;
49 | print_usage
50 | exit 0
51 | esac
52 | done
53 |
54 | if [ "$NODES" != "" ]; then
55 | echo "Number of Hadoop nodes requested: $NODES"
56 | else
57 | echo "Required parameter not set - number of nodes (-n)"
58 | print_usage
59 | exit 1
60 | fi
61 |
62 | if [ "$CONFIG_DIR" != "" ]; then
63 | echo "Generation Hadoop configuration in directory: $CONFIG_DIR"
64 | else
65 | echo "Location of configuration directory not specified"
66 | print_usage
67 | exit 1
68 | fi
69 |
70 | if [ "$PERSIST" = "true" ]; then
71 | echo "Persisting HDFS state (-p)"
72 | if [ "$BASE_DIR" = "" ]; then
73 | echo "Base directory (-d) not set for persisting HDFS state"
74 | print_usage
75 | exit 1
76 | else
77 | echo "Using directory $BASE_DIR for persisting HDFS state"
78 | fi
79 | else
80 | echo "Not persisting HDFS state"
81 | fi
82 |
83 | # get the number of nodes from SGE
84 | if [ "$NODES" != "$NSLOTS" ]; then
85 | echo "Number of nodes received from SGE not the same as number of nodes requested by user"
86 | exit 1
87 | fi
88 |
89 | # create the config, data, and log directories
90 | rm -rf $CONFIG_DIR
91 | mkdir -p $CONFIG_DIR
92 |
93 | # first copy over all default Hadoop configs
94 | cp $HADOOP_HOME/conf/* $CONFIG_DIR
95 |
96 | # pick the master node as the first node in the PE_HOSTFILE
97 | MASTER_NODE=`awk 'NR==1{print $1;exit}' $PE_HOSTFILE`
98 | echo "Master is: $MASTER_NODE"
99 | echo $MASTER_NODE > $CONFIG_DIR/masters
100 |
101 | # every node in the PE_HOSTFILE is a slave
102 | awk < $PE_HOSTFILE '{print $1}' > $CONFIG_DIR/slaves
103 |
104 | # update the hdfs and mapred configs
105 | sed 's/MASTER:/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/mapred-site.xml > $CONFIG_DIR/mapred-site.xml
106 | sed 's/hdfs:\/\/.*:/hdfs:\/\/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/core-site.xml > $CONFIG_DIR/core-site.xml
107 | sed -i 's:HADOOP_DATA_DIR:'"$HADOOP_DATA_DIR"':g' $CONFIG_DIR/core-site.xml
108 | cp $MY_HADOOP_HOME/etc/hdfs-site.xml $CONFIG_DIR/
109 |
110 | # update the HADOOP log directory
111 | echo "" >> $CONFIG_DIR/hadoop-env.sh
112 | echo "# Overwrite location of the log directory" >> $CONFIG_DIR/hadoop-env.sh
113 | echo "export HADOOP_LOG_DIR=$HADOOP_LOG_DIR" >> $CONFIG_DIR/hadoop-env.sh
114 |
115 | # set the HADOOP_HEAPSIZE to 4GB
116 | # echo "" >> $CONFIG_DIR/hadoop-env.sh
117 | # echo "# Set the HADOOP_HEAPSIZE to 4GB" >> $CONFIG_DIR/hadoop-env.sh
118 | # echo "export HADOOP_HEAPSIZE=4096" >> $CONFIG_DIR/hadoop-env.sh
119 |
120 | # JVM settings
121 | # echo "" >> $CONFIG_DIR/hadoop-env.sh
122 | # echo "# JVM settings for Hadoop" >> $CONFIG_DIR/hadoop-env.sh
123 | # echo "export HADOOP_OPTS=\"-server -XX:+UseParallelGC -XX:ParallelGCThreads=4 -XX:+AggressiveHeap -XX:+HeapDumpOnOutOfMemoryError\"" >> $CONFIG_DIR/hadoop-env.sh
124 |
125 | # create or link HADOOP_{DATA,LOG}_DIR on all slaves
126 | for ((i=1; i<=$NODES; i++))
127 | do
128 | node=`awk 'NR=='"$i"'{print $1;exit}' $PE_HOSTFILE`
129 | echo "Configuring node: $node"
130 | cmd="rm -rf $HADOOP_LOG_DIR; mkdir -p $HADOOP_LOG_DIR"
131 | echo $cmd
132 | ssh $node $cmd
133 | if [ "$PERSIST" = "true" ]; then
134 | cmd="rm -rf $HADOOP_DATA_DIR; ln -s $BASE_DIR/$i $HADOOP_DATA_DIR"
135 | echo $cmd
136 | ssh $node $cmd
137 | else
138 | cmd="rm -rf $HADOOP_DATA_DIR; mkdir -p $HADOOP_DATA_DIR"
139 | echo $cmd
140 | ssh $node $cmd
141 | fi
142 | done
143 |
--------------------------------------------------------------------------------
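After sge-configure.sh finishes, the generated configuration directory can be inspected before any daemons are started. A purely illustrative check of what it should contain, based on the steps above:

ls $HADOOP_CONF_DIR
# copied Hadoop defaults plus the generated files:
#   core-site.xml  hdfs-site.xml  mapred-site.xml  hadoop-env.sh  masters  slaves  ...

cat $HADOOP_CONF_DIR/masters   # first host from $PE_HOSTFILE
cat $HADOOP_CONF_DIR/slaves    # every host from $PE_HOSTFILE
grep -A 1 fs.default.name $HADOOP_CONF_DIR/core-site.xml   # hdfs://<master>:54310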
/bin/pbs-configure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function print_usage {
4 | echo "Usage: -n NODES -p -d BASE_DIR -c CONFIG_DIR -h"
5 | echo " -n: Number of nodes requested for the Hadoop installation"
6 | echo " -p: Whether the Hadoop installation should be persistent"
7 | echo " If so, data directories will have to be linked to a"
8 | echo " directory that is not local to enable persistence"
9 | echo " -d: Base directory to persist HDFS state, to be used if"
10 | echo " -p is set"
11 | echo " -c: The directory to generate Hadoop configs in"
12 | echo " -h: Print help"
13 | }
14 |
15 | # initialize arguments
16 | NODES=""
17 | PERSIST="false"
18 | BASE_DIR=""
19 | CONFIG_DIR=""
20 |
21 | # parse arguments
22 | args=`getopt n:pd:c:h $*`
23 | if test $? != 0
24 | then
25 | print_usage
26 | exit 1
27 | fi
28 | set -- $args
29 | for i
30 | do
31 | case "$i" in
32 | -n) shift;
33 | NODES=$1
34 | shift;;
35 |
36 | -d) shift;
37 | BASE_DIR=$1
38 | shift;;
39 |
40 | -c) shift;
41 | CONFIG_DIR=$1
42 | shift;;
43 |
44 | -p) shift;
45 | PERSIST="true"
46 | ;;
47 |
48 | -h) shift;
49 | print_usage
50 | exit 0
51 | esac
52 | done
53 |
54 | if [ "$NODES" != "" ]; then
55 | echo "Number of Hadoop nodes requested: $NODES"
56 | else
57 | echo "Required parameter not set - number of nodes (-n)"
58 | print_usage
59 | exit 1
60 | fi
61 |
62 | if [ "$CONFIG_DIR" != "" ]; then
63 | echo "Generation Hadoop configuration in directory: $CONFIG_DIR"
64 | else
65 | echo "Location of configuration directory not specified"
66 | print_usage
67 | exit 1
68 | fi
69 |
70 | if [ "$PERSIST" = "true" ]; then
71 | echo "Persisting HDFS state (-p)"
72 | if [ "$BASE_DIR" = "" ]; then
73 | echo "Base directory (-d) not set for persisting HDFS state"
74 | print_usage
75 | exit 1
76 | else
77 | echo "Using directory $BASE_DIR for persisting HDFS state"
78 | fi
79 | else
80 | echo "Not persisting HDFS state"
81 | fi
82 |
83 | # get the number of nodes from PBS
84 | if [ -e "$PBS_NODEFILE" ]; then
85 | PBS_NODES=`awk 'END { print NR }' $PBS_NODEFILE`
86 | echo "Received $PBS_NODES nodes from PBS"
87 |
88 | if [ "$NODES" != "$PBS_NODES" ]; then
89 | echo "Number of nodes received from PBS not the same as number of nodes requested by user"
90 | exit 1
91 | fi
92 | else
93 | echo "PBS_NODEFILE is unavailable"
94 | exit 1
95 | fi
96 |
97 | # create the config, data, and log directories
98 | rm -rf $CONFIG_DIR
99 | mkdir -p $CONFIG_DIR
100 |
101 | # first copy over all default Hadoop configs
102 | cp $HADOOP_HOME/conf/* $CONFIG_DIR
103 |
104 | # pick the master node as the first node in the PBS_NODEFILE
105 | MASTER_NODE=`awk 'NR==1{print;exit}' $PBS_NODEFILE`
106 | echo "Master is: $MASTER_NODE"
107 | echo $MASTER_NODE > $CONFIG_DIR/masters
108 |
109 | # every node in the PBS_NODEFILE is a slave
110 | cat $PBS_NODEFILE > $CONFIG_DIR/slaves
111 |
112 | # update the hdfs and mapred configs
113 | sed 's/MASTER:/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/mapred-site.xml > $CONFIG_DIR/mapred-site.xml
114 | sed 's/hdfs:\/\/.*:/hdfs:\/\/'"$MASTER_NODE"':/g' $MY_HADOOP_HOME/etc/core-site.xml > $CONFIG_DIR/core-site.xml
115 | sed -i 's:HADOOP_DATA_DIR:'"$HADOOP_DATA_DIR"':g' $CONFIG_DIR/core-site.xml
116 | cp $MY_HADOOP_HOME/etc/hdfs-site.xml $CONFIG_DIR/
117 |
118 | # update the HADOOP log directory
119 | echo "" >> $CONFIG_DIR/hadoop-env.sh
120 | echo "# Overwrite location of the log directory" >> $CONFIG_DIR/hadoop-env.sh
121 | echo "export HADOOP_LOG_DIR=$HADOOP_LOG_DIR" >> $CONFIG_DIR/hadoop-env.sh
122 |
123 | # set the HADOOP_HEAPSIZE to 4GB
124 | # echo "" >> $CONFIG_DIR/hadoop-env.sh
125 | # echo "# Set the HADOOP_HEAPSIZE to 4GB" >> $CONFIG_DIR/hadoop-env.sh
126 | # echo "export HADOOP_HEAPSIZE=4096" >> $CONFIG_DIR/hadoop-env.sh
127 |
128 | # JVM settings
129 | # echo "" >> $CONFIG_DIR/hadoop-env.sh
130 | # echo "# JVM settings for Hadoop" >> $CONFIG_DIR/hadoop-env.sh
131 | # echo "export HADOOP_OPTS=\"-server -XX:+UseParallelGC -XX:ParallelGCThreads=4 -XX:+AggressiveHeap -XX:+HeapDumpOnOutOfMemoryError\"" >> $CONFIG_DIR/hadoop-env.sh
132 |
133 | # create or link HADOOP_{DATA,LOG}_DIR on all slaves
134 | for ((i=1; i<=$NODES; i++))
135 | do
136 | node=`awk 'NR=='"$i"'{print;exit}' $PBS_NODEFILE`
137 | echo "Configuring node: $node"
138 | cmd="rm -rf $HADOOP_LOG_DIR; mkdir -p $HADOOP_LOG_DIR"
139 | echo $cmd
140 | ssh $node $cmd
141 | if [ "$PERSIST" = "true" ]; then
142 | cmd="rm -rf $HADOOP_DATA_DIR; ln -s $BASE_DIR/$i $HADOOP_DATA_DIR"
143 | echo $cmd
144 | ssh $node $cmd
145 | else
146 | cmd="rm -rf $HADOOP_DATA_DIR; mkdir -p $HADOOP_DATA_DIR"
147 | echo $cmd
148 | ssh $node $cmd
149 | fi
150 | done
151 |
--------------------------------------------------------------------------------