├── README.md
└── scripts
    ├── hive
    │   └── validationHiveTablesLinuxMode.sh
    └── sqoop
        └── sqoopTables.sh

/README.md:
--------------------------------------------------------------------------------
# hadoopUtils

Repository for some small tools that might prove useful with Hadoop.

--------------------------------------------------------------------------------
/scripts/hive/validationHiveTablesLinuxMode.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to do a quick validation of Hive tables: it compares, row by row, the
# tables of a database with their counterparts in another database

# Documentation:
# https://community.hortonworks.com/articles/1283/hive-script-to-validate-tables-compare-one-with-an.html
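
# Example of usage (hypothetical table names; the second table is compared with
# the columns col24 and col12 excluded):
#   ./validationHiveTablesLinuxMode.sh myTable1 myTable2:col24,col12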

listOfTables=$*

origDatabase=hiveSourceOfTruthDatabase
resultDatabase=hiveDatabasetoValidate

baseDir=/tmp/validation/${origDatabase}_${resultDatabase}
[ -d $baseDir ] || mkdir -p $baseDir
globalReport=$baseDir/globalReport
echo -e "\n\n######################################################### New analysis (`date`) #########################################################\n" >> $globalReport
echo -e "Arguments passed:\n$listOfTables\n" >> $globalReport

# If a file is bigger than splitNumLines, we split it into chunks (each chunk
# having at most splitNumLines lines) to make the comparisons with vimdiff easier
splitNumLines=301024

for bucket in $listOfTables ; do

    splitMode=0

    # if there are exclusions, handle them here
    table=`echo $bucket | cut -d: -f1`
    # columnsToExclude should have the format: col24,col12,col3
    # if a column name contains a space, use '.' instead (the exclusion list is
    # used as a regular expression, so '.' matches any character). For instance,
    # for the column "coa id" you would have to write "coa.id"
    columnsToExclude=`echo $bucket | cut -s -d: -f2 | sed 's/,/|/g' `

    echo -n "#### Comparing $origDatabase.$table with $resultDatabase.$table" | tee -a $globalReport
    if [ "x$columnsToExclude" = "x" ]; then
        echo | tee -a $globalReport
    else
        echo " with the following columns excluded: $columnsToExclude" | tee -a $globalReport
    fi

    communDir=$baseDir/$table
    origDir=$communDir/orig
    resultDir=$communDir/result
    [ -d $communDir/tmp ] || mkdir -p $communDir/tmp
    hiveExecuteFile_orig=$communDir/tmp/hiveExecute_orig
    hiveExecuteFile_result=$communDir/tmp/hiveExecute_result

    #####
    # Download the whole tables to the Linux file system
    #####

    # move the directories if they already exist; we only keep one previous version
    for mydir in $origDir $resultDir; do
        [ -d $mydir.old ] && rm -rf $mydir.old
        [ -d $mydir ] && mv $mydir $mydir.old
    done

    insert1stPart="INSERT OVERWRITE local directory"
    insert2ndPart="ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' select"

    if [ "x$columnsToExclude" = "x" ]; then
        echo "$insert1stPart '$origDir' $insert2ndPart * from $origDatabase.$table;" > $hiveExecuteFile_orig
        echo "$insert1stPart '$resultDir' $insert2ndPart * from $resultDatabase.$table;" > $hiveExecuteFile_result
    else
        echo "set hive.support.quoted.identifiers=none;" | tee $hiveExecuteFile_orig $hiveExecuteFile_result &> /dev/null
        echo "$insert1stPart '$origDir' $insert2ndPart " '`('$columnsToExclude')?+.+` ' "from $origDatabase.$table;" >> $hiveExecuteFile_orig
        echo "$insert1stPart '$resultDir' $insert2ndPart " '`('$columnsToExclude')?+.+` ' "from $resultDatabase.$table;" >> $hiveExecuteFile_result
    fi
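
    # For illustration (hypothetical names): with table=myTable and the exclusions
    # col24,col12 above, $hiveExecuteFile_orig would contain:
    #   set hive.support.quoted.identifiers=none;
    #   INSERT OVERWRITE local directory '/tmp/validation/hiveSourceOfTruthDatabase_hiveDatabasetoValidate/myTable/orig' ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' select `(col24|col12)?+.+` from hiveSourceOfTruthDatabase.myTable;
    # The regex `(col24|col12)?+.+` selects every column except the excluded ones.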

    queueConf='--hiveconf tez.queue.name=default'
    ( hive $queueConf -f $hiveExecuteFile_orig &> ${hiveExecuteFile_orig}.log ) &
    ( hive $queueConf -f $hiveExecuteFile_result &> ${hiveExecuteFile_result}.log ) &
    wait

    for tableDir in $origDir $resultDir; do
        if [ ! -d $tableDir ]; then
            echo "Was not able to find the path $tableDir. Are you sure that the table $table is in the Hive database?" | tee -a $globalReport
            continue 2
        fi
    done

    #####
    # Sort the downloaded tables
    #####
    for i in $origDir $resultDir; do
        (
        cd $i
        echo $columnsToExclude > columnsToExclude

        sort -S 30% --temporary-directory=$communDir/tmp --numeric-sort 0* -o sorted
        ) &
    done
    wait

    #####
    # Count the number of rows
    #####
    for i in $origDir $resultDir; do
        cd $i
        if [ $i = $origDir ]; then
            origTableNumLines=`wc -l $origDir/sorted | cut -d' ' -f1`
            [ $origTableNumLines -gt $splitNumLines ] && splitMode=1 # we only check the size of the orig table to decide whether to split
        else
            resultTableNumLines=`wc -l $resultDir/sorted | cut -d' ' -f1`
        fi
    done

    for i in $origDir $resultDir; do
        (
        cd $i
        if [ "x$splitMode" = "x1" ]; then
            split --suffix-length=3 --lines=$splitNumLines sorted sorted-
            rm -f sorted
        fi

        rm -f 0* .0*crc # remove the downloaded files, in order not to fill the file system
        ) &
    done
    wait

    #####
    # Compare the results
    #####
    [ "x$origTableNumLines" != "x$resultTableNumLines" ] && echo "ERROR: the number of rows differs between the orig table ($origTableNumLines rows) and the result table ($resultTableNumLines rows)" | tee -a $globalReport
    if [ "x$splitMode" = "x1" ]; then
        numDiff=0
        chunksWithErrors=""
        for fileChunk in $origDir/sorted-* ; do
            chunk=`basename $fileChunk`
            [ ! -e $resultDir/$chunk ] && continue # the chunk might be missing if there is less data in the result table than in the orig one
            numDiffTmp=`diff {$origDir,$resultDir}/$chunk | grep '^<' | wc -l`
            if [ $numDiffTmp -gt 0 ]; then
                ((numDiff=$numDiff + $numDiffTmp))
                chunksWithErrors="${chunksWithErrors}$numDiffTmp\t\t\t$chunk\n"
            else
                rm -f $resultDir/$chunk # we don't need to keep 2 copies of the same data
            fi
        done
    else
        numDiff=0
        if [ -e $resultDir/sorted ]; then
            grepDiffPattern='^<'
            [ $origTableNumLines -lt $resultTableNumLines ] && grepDiffPattern='^>'
            numDiff=`diff {$origDir,$resultDir}/sorted | grep $grepDiffPattern | wc -l`
        fi
        [ "x$numDiff" = "x0" ] && rm -f $resultDir/sorted # we don't need to keep 2 copies of the same data
    fi

    #####
    # Show the results
    #####
    echo 'Number of differences: ' $numDiff "(out of $origTableNumLines rows)" | tee -a $globalReport

    if [ "x$numDiff" != "x0" ]; then
        if [ "x$splitMode" = "x1" ]; then
            echo "To see the differences visually: vimdiff -o -c \"windo set wrap foldcolumn=0\" $communDir/{orig,result}/<chunk>.gz"
            echo -e "NumberOfDifferences\t\t\tChunk"
            echo -e $chunksWithErrors
        else
            echo "To see the differences visually: vimdiff -o -c \"windo set wrap foldcolumn=0\" $communDir/{orig,result}/sorted.gz"
            echo
        fi
    else
        echo
    fi

    gzip $origDir/sorted* &
    [ "x$numDiff" != "x0" ] && gzip $resultDir/sorted* &
done

--------------------------------------------------------------------------------
/scripts/sqoop/sqoopTables.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to download a lot of tables in parallel with Sqoop and write them to Hive

# Example of usage:
#   ./sqoopTables.sh -d myDatabase2 -H myHiveDatabase3 -p 6 -q etl /tmp/foo
# The file (in that example: /tmp/foo) must contain the name of one table to sqoop
# per line. If you want to sqoop 5 tables, you need 5 lines in your file.
# Your TODO: this script is focused on SQL Server. Search for the "sqoop import"
# line (in the middle of the script) and change the beginning of the JDBC URL
# appropriately. Also change the user and the password on this line.

# Documentation:
# https://community.hortonworks.com/articles/23602/sqoop-fetching-lot-of-tables-in-parallel.html
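
# For illustration, the tables file (/tmp/foo in the example above) could look
# like this (hypothetical table names, one per line):
#   employees
#   departments
#   salaries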

#### Default values:
# (configure your default values here; they can be overridden on the command line)
origServer=myRelationalDatabase.example.com # The FQDN of the relational database you want to fetch from (option: -o)
origDatabase=myDatabase # The name of the database that contains the tables to fetch (option: -d)
hiveDatabase=myHiveDatabase # The name of the Hive database that will receive the fetched tables (option: -H)
parallelism=4 # The number of tables (sqoop processes) you want to download at the same time (option: -p)
queue=default # The queue used in Yarn (option: -q)
baseDir=/tmp/sqoopTables # Base directory where the log files will be stored (option: -b)
dirJavaGeneratedCode=/tmp/doSqoopTable-`id -u`-tmp # Directory for the Java code generated by Sqoop (option: -c)
targetTmpHdfsDir=/tmp/doSqoopTable-`id -u`-tmp/$$ # Temporary directory in HDFS to store the downloaded data before moving it to Hive

# TODO: delete the sqoop tmp directory after the jobs end

# argument: name of the variable to populate with the value popped from the stack
function popStack(){
    : ${1?'Missing stack name'}

    # check that the stack is not empty yet
    local pointerStack=$(<$pointerStackFile)
    [ $pointerStack -ge $initialSizeStack ] && return 1

    eval "$1"=${listOfTables[$pointerStack]}

    (( pointerStack=$pointerStack+1 ))
    echo $pointerStack > $pointerStackFile
    return 0
}
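
# NB: popStack does a non-atomic read-modify-write of $pointerStackFile; the
# workers are only staggered by the "sleep 0.1" at launch time (see the bottom of
# this script). Since every Sqoop import runs for a while, collisions are unlikely
# in practice. If you want strict safety, the pops could be serialized with
# flock(1), for instance (untested sketch):
#   { flock 200; popStack "$1"; } 200>>$pointerStackFile.lock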

function doSqoop(){
    local id=$1 # this id should be unique between all the instances of doSqoop
    local myVariable=tableFor_$id
    local logFileSummary=$databaseBaseDir/process_$id-summary.log
    local logFileRaw=$databaseBaseDir/process_$id-raw.log

    echo -e "\n\n############################# `date` Starting new execution [ $origDatabase -> $hiveDatabase ] #############################" >> $logFileSummary

    while true; do
        # get the name of the next table to sqoop
        popStack $myVariable
        [ $? -ne 0 ] && break

        local myTable=${!myVariable}
        # in my current project, some special mapping was needed to derive the names of the Hive tables from the names of the SQL Server tables:
        #local origTable=`echo $myTable | sed -e 's/^su_/dbosu./' -e 's/^sc_/dbosc./' -e 's/^sm_/dbosm./' -e 's/^cm_/dbo./'`
        local origTable=$myTable

        echo "[`date`] Creating the table $hiveDatabase.$myTable from the SQLserver table $origDatabase.$origTable" | tee -a $logFileSummary $logFileRaw

        # The --query form fixes a bug for tables that have columns with spaces,
        # see: http://stackoverflow.com/questions/27572527/how-to-support-column-names-with-spaces-using-sqoop-import
        sqoop import -D mapreduce.job.queuename=$queue -D mapreduce.job.ubertask.enable=true --connect "jdbc:sqlserver://$origServer:1433; database=$origDatabase; username=myUser; password=myPassword" --hive-import --hive-database $hiveDatabase --fields-terminated-by '\t' --null-string '' --null-non-string '' -m 1 --outdir $dirJavaGeneratedCode --query "select a.* from $origTable a where \$CONDITIONS" --target-dir $targetTmpHdfsDir/$myTable --hive-table $myTable >> $logFileRaw 2>> $logFileRaw

        echo "Tail of $logFileRaw:" >> $logFileSummary
        tail -6 $logFileRaw >> $logFileSummary
    done

    echo -e "\n############################# `date` Ending execution [ $origDatabase -> $hiveDatabase ] #############################" >> $logFileSummary
}

function usage() {
    echo -e "usage:\t`basename $0` [-b <baseDir>] [-c <dirJavaGeneratedCode>] [-d <origDatabase>] [-H <hiveDatabase>] [-o <origServer>] [-p <parallelism>] [-q <queue>] <fileWithListOfTables>"
    echo -e "usage:\t`basename $0` -h"
    echo -e "\t\t\tthe file must contain the name of one table to sqoop per line. If you want to sqoop 5 tables, you need 5 lines in your file"
    exit 0
}

while getopts "b:c:d:hH:o:p:q:" FLAG; do
    case $FLAG in
        b) baseDir=$OPTARG
           ;;
        c) dirJavaGeneratedCode=$OPTARG
           ;;
        d) origDatabase=$OPTARG
           ;;
        h) usage
           ;;
        H) hiveDatabase=$OPTARG
           ;;
        o) origServer=$OPTARG
           ;;
        p) parallelism=$OPTARG
           ;;
        q) queue=$OPTARG
           ;;
        \?) # unrecognized option - show the help
            echo -e \\n"Option -${BOLD}$OPTARG${NORM} not allowed."
            usage
            ;;
    esac
done

shift $((OPTIND-1)) # discard the options already parsed, so that $1 is the file argument

# TODO: check if the argument is a file. If not, consider it is a list of tables

myFile=$1
[ "x$myFile" = "x" ] && usage
[ -f $myFile ] || usage

listOfTables=( `cat $myFile` )
initialSizeStack=${#listOfTables[@]}
pointerStack=0 # to know which element must be fetched next
pointerStackFile=/dev/shm/.sqoopTables_pointerStack_$$
echo 0 > $pointerStackFile

databaseBaseDir=$baseDir/$origDatabase-$hiveDatabase
[ -d $databaseBaseDir ] || mkdir -p $databaseBaseDir
for i in `seq $parallelism`; do
    sleep 0.1 # stagger the subprocesses so that they do not pop the stack at the exact same time
    (doSqoop $i) &
done

wait
rm -f $pointerStackFile

--------------------------------------------------------------------------------