├── README.md
└── scripts
    ├── hive
    │   └── validationHiveTablesLinuxMode.sh
    └── sqoop
        └── sqoopTables.sh

/README.md:
--------------------------------------------------------------------------------
# hadoopUtils

Repository for some small tools that might prove useful with Hadoop.

--------------------------------------------------------------------------------
/scripts/hive/validationHiveTablesLinuxMode.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to do a quick validation of Hive tables: it compares, row by row, the
# tables of a database with their counterparts in another database

# Documentation:
# https://community.hortonworks.com/articles/1283/hive-script-to-validate-tables-compare-one-with-an.html
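
# Example of usage (hypothetical table names; the second table is compared with
# the columns col24 and col12 excluded):
#   ./validationHiveTablesLinuxMode.sh myTable1 myTable2:col24,col12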

listOfTables=$*

origDatabase=hiveSourceOfTruthDatabase
resultDatabase=hiveDatabasetoValidate

baseDir=/tmp/validation/${origDatabase}_${resultDatabase}
[ -d $baseDir ] || mkdir -p $baseDir
globalReport=$baseDir/globalReport
echo -e "\n\n######################################################### New analysis (`date`) #########################################################\n" >> $globalReport
echo -e "Arguments passed:\n$listOfTables\n" >> $globalReport

# If a file is bigger than splitNumLines, we split it into chunks (each chunk
# having at most splitNumLines lines) to make the comparisons with vimdiff easier
splitNumLines=301024

for bucket in $listOfTables ; do

    splitMode=0

    # if there are exclusions, handle them here
    table=`echo $bucket | cut -d: -f1`
    # columnsToExclude should have the format: col24,col12,col3
    # if a column name contains a space, use '.' instead (the exclusion list is
    # used as a regular expression, so '.' matches any character). For instance,
    # for the column "coa id" you would have to write "coa.id"
    columnsToExclude=`echo $bucket | cut -s -d: -f2 | sed 's/,/|/g' `

    echo -n "#### Comparing $origDatabase.$table with $resultDatabase.$table" | tee -a $globalReport
    if [ "x$columnsToExclude" = "x" ]; then
        echo | tee -a $globalReport
    else
        echo " with the following columns excluded: $columnsToExclude" | tee -a $globalReport
    fi

    communDir=$baseDir/$table
    origDir=$communDir/orig
    resultDir=$communDir/result
    [ -d $communDir/tmp ] || mkdir -p $communDir/tmp
    hiveExecuteFile_orig=$communDir/tmp/hiveExecute_orig
    hiveExecuteFile_result=$communDir/tmp/hiveExecute_result

    #####
    # Download the whole tables to the Linux file system
    #####

    # move the directories if they already exist; we only keep one previous version
    for mydir in $origDir $resultDir; do
        [ -d $mydir.old ] && rm -rf $mydir.old
        [ -d $mydir ] && mv $mydir $mydir.old
    done

    insert1stPart="INSERT OVERWRITE local directory"
    insert2ndPart="ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' select"

    if [ "x$columnsToExclude" = "x" ]; then
        echo "$insert1stPart '$origDir' $insert2ndPart * from $origDatabase.$table;" > $hiveExecuteFile_orig
        echo "$insert1stPart '$resultDir' $insert2ndPart * from $resultDatabase.$table;" > $hiveExecuteFile_result
    else
        echo "set hive.support.quoted.identifiers=none;" | tee $hiveExecuteFile_orig $hiveExecuteFile_result &> /dev/null
        echo "$insert1stPart '$origDir' $insert2ndPart " '`('$columnsToExclude')?+.+` ' "from $origDatabase.$table;" >> $hiveExecuteFile_orig
        echo "$insert1stPart '$resultDir' $insert2ndPart " '`('$columnsToExclude')?+.+` ' "from $resultDatabase.$table;" >> $hiveExecuteFile_result
    fi
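
    # For illustration (hypothetical names): with table=myTable and the exclusions
    # col24,col12 above, $hiveExecuteFile_orig would contain:
    #   set hive.support.quoted.identifiers=none;
    #   INSERT OVERWRITE local directory '/tmp/validation/hiveSourceOfTruthDatabase_hiveDatabasetoValidate/myTable/orig' ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' select `(col24|col12)?+.+` from hiveSourceOfTruthDatabase.myTable;
    # The regex `(col24|col12)?+.+` selects every column except the excluded ones.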

    queueConf='--hiveconf tez.queue.name=default'
    ( hive $queueConf -f $hiveExecuteFile_orig &> ${hiveExecuteFile_orig}.log ) &
    ( hive $queueConf -f $hiveExecuteFile_result &> ${hiveExecuteFile_result}.log ) &
    wait

    for tableDir in $origDir $resultDir; do
        if [ ! -d $tableDir ]; then
            echo "Was not able to find the path $tableDir. Are you sure that the table $table is in the Hive database?" | tee -a $globalReport
            continue 2
        fi
    done

    #####
    # Sort the downloaded tables
    #####
    for i in $origDir $resultDir; do
        (
        cd $i
        echo $columnsToExclude > columnsToExclude

        sort -S 30% --temporary-directory=$communDir/tmp --numeric-sort 0* -o sorted
        ) &
    done
    wait

    #####
    # Count the number of rows
    #####
    for i in $origDir $resultDir; do
        cd $i
        if [ $i = $origDir ]; then
            origTableNumLines=`wc -l $origDir/sorted | cut -d' ' -f1`
            [ $origTableNumLines -gt $splitNumLines ] && splitMode=1 # we only check the size of the orig table to decide whether to split
        else
            resultTableNumLines=`wc -l $resultDir/sorted | cut -d' ' -f1`
        fi
    done

    for i in $origDir $resultDir; do
        (
        cd $i
        if [ "x$splitMode" = "x1" ]; then
            split --suffix-length=3 --lines=$splitNumLines sorted sorted-
            rm -f sorted
        fi

        rm -f 0* .0*crc # remove the downloaded files, in order not to fill the file system
        ) &
    done
    wait

    #####
    # Compare the results
    #####
    [ "x$origTableNumLines" != "x$resultTableNumLines" ] && echo "ERROR: the number of rows differs between the orig table ($origTableNumLines rows) and the result table ($resultTableNumLines rows)" | tee -a $globalReport
    if [ "x$splitMode" = "x1" ]; then
        numDiff=0
        chunksWithErrors=""
        for fileChunk in $origDir/sorted-* ; do
            chunk=`basename $fileChunk`
            [ ! -e $resultDir/$chunk ] && continue # the chunk might be missing if there is less data in the result table than in the orig one
            numDiffTmp=`diff {$origDir,$resultDir}/$chunk | grep '^<' | wc -l`
            if [ $numDiffTmp -gt 0 ]; then
                ((numDiff=$numDiff + $numDiffTmp))
                chunksWithErrors="${chunksWithErrors}$numDiffTmp\t\t\t$chunk\n"
            else
                rm -f $resultDir/$chunk # we don't need to keep 2 copies of the same data
            fi
        done
    else
        numDiff=0
        if [ -e $resultDir/sorted ]; then
            grepDiffPattern='^<'
            [ $origTableNumLines -lt $resultTableNumLines ] && grepDiffPattern='^>'
            numDiff=`diff {$origDir,$resultDir}/sorted | grep $grepDiffPattern | wc -l`
        fi
        [ "x$numDiff" = "x0" ] && rm -f $resultDir/sorted # we don't need to keep 2 copies of the same data
    fi

    #####
    # Show the results
    #####
    echo 'Number of differences: ' $numDiff "(out of $origTableNumLines rows)" | tee -a $globalReport

    if [ "x$numDiff" != "x0" ]; then
        if [ "x$splitMode" = "x1" ]; then
            echo "To see the differences visually: vimdiff -o -c \"windo set wrap foldcolumn=0\" $communDir/{orig,result}/<chunk>.gz"
            echo -e "NumberOfDifferences\t\t\tChunk"
            echo -e $chunksWithErrors
        else
            echo "To see the differences visually: vimdiff -o -c \"windo set wrap foldcolumn=0\" $communDir/{orig,result}/sorted.gz"
            echo
        fi
    else
        echo
    fi

    gzip $origDir/sorted* &
    [ "x$numDiff" != "x0" ] && gzip $resultDir/sorted* &
done

--------------------------------------------------------------------------------
/scripts/sqoop/sqoopTables.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to download a lot of tables in parallel with Sqoop and write them to Hive

# Example of usage:
#   ./sqoopTables.sh -d myDatabase2 -H myHiveDatabase3 -p 6 -q etl /tmp/foo
# The file (in that example: /tmp/foo) must contain the name of one table to sqoop
# per line. If you want to sqoop 5 tables, you need 5 lines in your file.
# Your TODO: this script is focused on SQL Server. Search for the "sqoop import"
# line (in the middle of the script) and change the beginning of the JDBC URL
# appropriately. Also change the user and the password on this line.

# Documentation:
# https://community.hortonworks.com/articles/23602/sqoop-fetching-lot-of-tables-in-parallel.html
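
# For illustration, the tables file (/tmp/foo in the example above) could look
# like this (hypothetical table names, one per line):
#   employees
#   departments
#   salaries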

#### Default values:
# (configure your default values here; they can be overridden on the command line)
origServer=myRelationalDatabase.example.com # The FQDN of the relational database you want to fetch from (option: -o)
origDatabase=myDatabase # The name of the database that contains the tables to fetch (option: -d)
hiveDatabase=myHiveDatabase # The name of the Hive database that will receive the fetched tables (option: -H)
parallelism=4 # The number of tables (sqoop processes) you want to download at the same time (option: -p)
queue=default # The queue used in Yarn (option: -q)
baseDir=/tmp/sqoopTables # Base directory where the log files will be stored (option: -b)
dirJavaGeneratedCode=/tmp/doSqoopTable-`id -u`-tmp # Directory for the Java code generated by Sqoop (option: -c)
targetTmpHdfsDir=/tmp/doSqoopTable-`id -u`-tmp/$$ # Temporary directory in HDFS to store the downloaded data before moving it to Hive

# TODO: delete the sqoop tmp directory after the jobs end

# argument: name of the variable to populate with the value popped from the stack
function popStack(){
    : ${1?'Missing stack name'}

    # check that the stack is not empty yet
    local pointerStack=$(<$pointerStackFile)
    [ $pointerStack -ge $initialSizeStack ] && return 1

    eval "$1"=${listOfTables[$pointerStack]}

    (( pointerStack=$pointerStack+1 ))
    echo $pointerStack > $pointerStackFile
    return 0
}
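
# NB: popStack does a non-atomic read-modify-write of $pointerStackFile; the
# workers are only staggered by the "sleep 0.1" at launch time (see the bottom of
# this script). Since every Sqoop import runs for a while, collisions are unlikely
# in practice. If you want strict safety, the pops could be serialized with
# flock(1), for instance (untested sketch):
#   { flock 200; popStack "$1"; } 200>>$pointerStackFile.lock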

function doSqoop(){
    local id=$1 # this id should be unique between all the instances of doSqoop
    local myVariable=tableFor_$id
    local logFileSummary=$databaseBaseDir/process_$id-summary.log
    local logFileRaw=$databaseBaseDir/process_$id-raw.log

    echo -e "\n\n############################# `date` Starting new execution [ $origDatabase -> $hiveDatabase ] #############################" >> $logFileSummary

    while true; do
        # get the name of the next table to sqoop
        popStack $myVariable
        [ $? -ne 0 ] && break

        local myTable=${!myVariable}
        # in my current project, some special mapping was needed to derive the names of the Hive tables from the names of the SQL Server tables:
        #local origTable=`echo $myTable | sed -e 's/^su_/dbosu./' -e 's/^sc_/dbosc./' -e 's/^sm_/dbosm./' -e 's/^cm_/dbo./'`
        local origTable=$myTable

        echo "[`date`] Creating the table $hiveDatabase.$myTable from the SQLserver table $origDatabase.$origTable" | tee -a $logFileSummary $logFileRaw

        # The --query form fixes a bug for tables that have columns with spaces,
        # see: http://stackoverflow.com/questions/27572527/how-to-support-column-names-with-spaces-using-sqoop-import
        sqoop import -D mapreduce.job.queuename=$queue -D mapreduce.job.ubertask.enable=true --connect "jdbc:sqlserver://$origServer:1433; database=$origDatabase; username=myUser; password=myPassword" --hive-import --hive-database $hiveDatabase --fields-terminated-by '\t' --null-string '' --null-non-string '' -m 1 --outdir $dirJavaGeneratedCode --query "select a.* from $origTable a where \$CONDITIONS" --target-dir $targetTmpHdfsDir/$myTable --hive-table $myTable >> $logFileRaw 2>> $logFileRaw

        echo "Tail of $logFileRaw:" >> $logFileSummary
        tail -6 $logFileRaw >> $logFileSummary
    done

    echo -e "\n############################# `date` Ending execution [ $origDatabase -> $hiveDatabase ] #############################" >> $logFileSummary
}

function usage() {
    echo -e "usage:\t`basename $0` [-b <baseDir>] [-c <dirJavaGeneratedCode>] [-d <origDatabase>] [-H <hiveDatabase>] [-o <origServer>] [-p <parallelism>] [-q <queue>] <fileWithListOfTables>"
    echo -e "usage:\t`basename $0` -h"
    echo -e "\t\t\tthe file must contain the name of one table to sqoop per line. If you want to sqoop 5 tables, you need 5 lines in your file"
    exit 0
}

while getopts "b:c:d:hH:o:p:q:" FLAG; do
    case $FLAG in
        b) baseDir=$OPTARG
           ;;
        c) dirJavaGeneratedCode=$OPTARG
           ;;
        d) origDatabase=$OPTARG
           ;;
        h) usage
           ;;
        H) hiveDatabase=$OPTARG
           ;;
        o) origServer=$OPTARG
           ;;
        p) parallelism=$OPTARG
           ;;
        q) queue=$OPTARG
           ;;
        \?) # unrecognized option - show the help
            echo -e \\n"Option -${BOLD}$OPTARG${NORM} not allowed."
            usage
            ;;
    esac
done

shift $((OPTIND-1)) # discard the options already parsed, so that $1 is the file argument

# TODO: check if the argument is a file. If not, consider it is a list of tables

myFile=$1
[ "x$myFile" = "x" ] && usage
[ -f $myFile ] || usage

listOfTables=( `cat $myFile` )
initialSizeStack=${#listOfTables[@]}
pointerStack=0 # to know which element must be fetched next
pointerStackFile=/dev/shm/.sqoopTables_pointerStack_$$
echo 0 > $pointerStackFile

databaseBaseDir=$baseDir/$origDatabase-$hiveDatabase
[ -d $databaseBaseDir ] || mkdir -p $databaseBaseDir
for i in `seq $parallelism`; do
    sleep 0.1 # stagger the subprocesses so that they do not pop the stack at the exact same time
    (doSqoop $i) &
done

wait
rm -f $pointerStackFile

--------------------------------------------------------------------------------