├── .gitignore ├── README.md ├── appc └── src │ └── main │ └── sh │ ├── create_ncdc_files.sh │ ├── load_ncdc.sh │ ├── load_ncdc_map.sh │ └── ncdc_files.txt ├── book ├── pom.xml └── src │ └── main │ └── assembly │ ├── jar.xml │ └── oozie-workflow-application.xml ├── ch02-mr-intro ├── pom.xml └── src │ └── main │ ├── awk │ └── max_temperature.sh │ ├── cpp │ ├── Makefile │ └── max_temperature.cpp │ ├── examples │ ├── MaxTemperature │ │ ├── input.txt │ │ └── output │ │ │ └── part-r-00000 │ ├── MaxTemperatureWithCombiner │ │ ├── input.txt │ │ └── output │ │ │ └── part-r-00000 │ ├── OldMaxTemperature │ │ ├── input.txt │ │ └── output │ │ │ └── part-00000 │ ├── max_temperature.cpp.input.txt │ ├── max_temperature_hadoop.input.txt │ ├── max_temperature_hadoop_cluster.input.txt │ └── max_temperature_py │ │ ├── 2 │ │ ├── input.txt │ │ └── output │ │ │ └── part-00000 │ │ ├── input.txt │ │ ├── output │ │ └── part-r-00000 │ │ └── pseudo │ │ ├── input.txt │ │ └── output │ │ └── part-00000 │ ├── java │ ├── MaxTemperature.java │ ├── MaxTemperatureMapper.java │ ├── MaxTemperatureReducer.java │ ├── MaxTemperatureWithCombiner.java │ ├── OldMaxTemperature.java │ └── oldapi │ │ ├── MaxTemperature.java │ │ ├── MaxTemperatureMapper.java │ │ ├── MaxTemperatureReducer.java │ │ └── MaxTemperatureWithCombiner.java │ ├── python │ ├── max_temperature_map.py │ └── max_temperature_reduce.py │ ├── ruby │ ├── max_temperature_map.rb │ └── max_temperature_reduce.rb │ └── sh │ └── max_temp.sh ├── ch03-hdfs ├── pom.xml └── src │ ├── main │ ├── conf │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ ├── java │ │ ├── DateRangePathFilter.java │ │ ├── FileCopyWithProgress.java │ │ ├── FileSystemCat.java │ │ ├── FileSystemDoubleCat.java │ │ ├── ListStatus.java │ │ ├── RegexExcludePathFilter.java │ │ ├── RegexPathFilter.java │ │ └── URLCat.java │ └── sh │ │ ├── file.sh │ │ └── hars.sh │ └── test │ └── java │ ├── CoherencyModelTest.java │ ├── FileSystemDeleteTest.java │ ├── FileSystemGlobTest.java │ └── ShowFileStatusTest.java ├── ch04-yarn ├── capacity-scheduler.xml └── fair-scheduler.xml ├── ch05-io ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── FileDecompressor.java.input.txt │ │ ├── MapFile-data-head.input.txt │ │ ├── MapFile-data-head.output.txt │ │ ├── MapFile-index.input.txt │ │ ├── MapFile-index.output.txt │ │ ├── MapFile-ls.input.txt │ │ ├── MapFile-ls.output.txt │ │ ├── MapFileWriteDemo.java.input.txt │ │ ├── MaxTemperatureWithCompression │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000.gz │ │ ├── MaxTemperatureWithMapOutputCompression.ignore │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── SequenceFileMapReduceSort.java.input.txt │ │ ├── SequenceFileMapReduceSortResults.java.input.txt │ │ ├── SequenceFileMapReduceSortResults.java.output.txt │ │ ├── SequenceFileMapReduceSortResults.java.pre.sh │ │ ├── SequenceFileReadDemo.java.input.txt │ │ ├── SequenceFileReadDemo.java.output.txt │ │ ├── SequenceFileReadDemo.java.pre.sh │ │ ├── SequenceFileToMapFileConverter-fix.java.input.txt │ │ ├── SequenceFileToMapFileConverter-mv.java.input.txt │ │ ├── SequenceFileToMapFileConverter-sort.java.input.txt │ │ ├── SequenceFileWriteDemo.java.input.txt │ │ ├── SequenceFileWriteDemo.java.output.txt │ │ ├── StreamCompressor.java.input.txt │ │ ├── StreamCompressor.java.output.txt │ │ ├── TextIterator.java.input.txt │ │ ├── TextIterator.java.output.txt │ │ ├── hadoop-fs-text.input.txt │ │ └── hadoop-fs-text.output.txt │ └── java │ │ ├── FileDecompressor.java │ │ ├── IntPair.java │ │ ├── MapFileFixer.java │ │ 
├── MapFileWriteDemo.java │ │ ├── MaxTemperatureWithCompression.java │ │ ├── MaxTemperatureWithMapOutputCompression.java │ │ ├── PooledStreamCompressor.java │ │ ├── SequenceFileReadDemo.java │ │ ├── SequenceFileWriteDemo.java │ │ ├── StreamCompressor.java │ │ ├── TextArrayWritable.java │ │ ├── TextIterator.java │ │ ├── TextPair.java │ │ └── oldapi │ │ ├── IntPair.java │ │ ├── MaxTemperatureWithCompression.java │ │ ├── MaxTemperatureWithMapOutputCompression.java │ │ └── TextPair.java │ └── test │ ├── java │ ├── ArrayWritableTest.java │ ├── BinaryOrTextWritable.java │ ├── BooleanWritableTest.java │ ├── BytesWritableTest.java │ ├── FileDecompressorTest.java │ ├── GenericWritableTest.java │ ├── IntPairTest.java │ ├── IntWritableTest.java │ ├── MapFileSeekTest.java │ ├── MapWritableTest.java │ ├── NullWritableTest.java │ ├── ObjectWritableTest.java │ ├── SequenceFileSeekAndSyncTest.java │ ├── StringTextComparisonTest.java │ ├── TextPairTest.java │ ├── TextTest.java │ ├── VIntWritableTest.java │ ├── VLongWritableTest.java │ └── WritableTestBase.java │ └── resources │ └── file.gz ├── ch06-mr-dev ├── input │ └── ncdc │ │ └── micro │ │ └── sample.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── ConfigurationPrinterSystem.java.input.txt │ │ ├── ConfigurationPrinterWithConf.java.input.txt │ │ ├── ConfigurationPrinterWithConf.java.output.txt │ │ ├── ConfigurationPrinterWithConfAndD.java.input.txt │ │ ├── ConfigurationPrinterWithD.java.input.txt │ │ ├── ConfigurationPrinterWithD.java.output.txt │ │ ├── MaxTemperatureDriver.java.input.txt │ │ ├── MaxTemperatureDriverV2.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureDriverV2GOP.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureDriverV3 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ └── MaxTemperatureDriverV3GOP │ │ │ ├── input.txt │ │ │ └── output │ │ │ └── part-r-00000 │ ├── java │ │ ├── ConfigurationPrinter.java │ │ ├── LoggingDriver.java │ │ ├── LoggingIdentityMapper.java │ │ ├── v1 │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── MaxTemperatureReducer.java │ │ ├── v2 │ │ │ ├── MaxTemperatureDriver.java │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── NcdcRecordParser.java │ │ ├── v3 │ │ │ ├── MaxTemperatureDriver.java │ │ │ └── MaxTemperatureMapper.java │ │ └── v4 │ │ │ ├── MaxTemperatureDriver.java │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── NcdcRecordParser.java │ └── resources │ │ ├── configuration-1.xml │ │ ├── configuration-2.xml │ │ ├── max-temp-workflow.properties │ │ └── max-temp-workflow │ │ └── workflow.xml │ └── test │ ├── java │ ├── MultipleResourceConfigurationTest.java │ ├── SingleResourceConfigurationTest.java │ ├── v1 │ │ ├── MaxTemperatureMapperTest.java │ │ └── MaxTemperatureReducerTest.java │ ├── v2 │ │ ├── MaxTemperatureDriverMiniTest.java │ │ ├── MaxTemperatureDriverTest.java │ │ └── MaxTemperatureMapperTest.java │ └── v4 │ │ └── MaxTemperatureMapperTest.java │ └── resources │ └── expected.txt ├── ch08-mr-types ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── MaxTemperatureWithMultipleInputs │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MinimalMapReduce.java.input.txt │ │ ├── MinimalMapReduce │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-00000 │ │ ├── MinimalMapReduceWithDefaults │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-00000 │ │ ├── PartitionByStationUsingMultipleOutputFormat.java.input.txt │ │ ├── PartitionByStationUsingMultipleOutputs │ │ │ ├── 2 │ │ │ │ ├── 
input.txt │ │ │ │ └── output │ │ │ │ │ ├── 029070-99999-r-00000 │ │ │ │ │ ├── 029500-99999-r-00000 │ │ │ │ │ ├── 029600-99999-r-00000 │ │ │ │ │ ├── 029720-99999-r-00000 │ │ │ │ │ ├── 029810-99999-r-00000 │ │ │ │ │ ├── 227070-99999-r-00000 │ │ │ │ │ └── part-r-00000 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── 029070-99999-r-00000 │ │ │ │ ├── 029500-99999-r-00000 │ │ │ │ ├── 029600-99999-r-00000 │ │ │ │ ├── 029720-99999-r-00000 │ │ │ │ ├── 029810-99999-r-00000 │ │ │ │ ├── 227070-99999-r-00000 │ │ │ │ └── part-r-00000 │ │ ├── PartitionByStationYearUsingMultipleOutputs │ │ │ ├── 2 │ │ │ │ ├── input.txt │ │ │ │ └── output │ │ │ │ │ ├── 029070-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029500-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029600-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029720-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029810-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 227070-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── part-r-00000 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── 029070-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029500-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029600-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029720-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029810-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 227070-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── part-r-00000 │ │ ├── SmallFilesToSequenceFileConverter.ignore │ │ │ └── input.txt │ │ ├── SmallFilesToSequenceFileConverter.java.input.txt │ │ ├── default_streaming.input.txt │ │ └── minimal_streaming.input.txt │ ├── java │ │ ├── MaxTemperatureWithMultipleInputs.java │ │ ├── MinimalMapReduce.java │ │ ├── MinimalMapReduceWithDefaults.java │ │ ├── NonSplittableTextInputFormat.java │ │ ├── PartitionByStationUsingMultipleOutputs.java │ │ ├── PartitionByStationYearUsingMultipleOutputs.java │ │ ├── SmallFilesToSequenceFileConverter.java │ │ ├── StationPartitioner.java │ │ ├── WholeFileInputFormat.java │ │ ├── WholeFileRecordReader.java │ │ └── oldapi │ │ │ ├── MaxTemperatureWithMultipleInputs.java │ │ │ ├── MinimalMapReduce.java │ │ │ ├── MinimalMapReduceWithDefaults.java │ │ │ ├── NonSplittableTextInputFormat.java │ │ │ ├── PartitionByStationUsingMultipleOutputFormat.java │ │ │ ├── PartitionByStationUsingMultipleOutputs.java │ │ │ ├── PartitionByStationYearUsingMultipleOutputFormat.java │ │ │ ├── SmallFilesToSequenceFileConverter.java │ │ │ ├── StationPartitioner.java │ │ │ ├── WholeFileInputFormat.java │ │ │ └── WholeFileRecordReader.java │ └── sh │ │ └── streaming.sh │ └── test │ └── java │ └── TextInputFormatsTest.java ├── ch09-mr-features ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── JoinRecordWithStationName │ │ │ ├── 2 │ │ │ │ ├── input.txt │ │ │ │ └── output │ │ │ │ │ └── part-r-00000 │ │ │ ├── 
input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── LookupRecordByTemperature.java.input.txt │ │ ├── LookupRecordByTemperature.java.output.txt │ │ ├── LookupRecordsByTemperature.java.input.txt │ │ ├── LookupRecordsByTemperature.java.output.txt │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java.input.txt │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureUsingSecondarySort │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MaxTemperatureWithCounters.java.input.txt │ │ ├── MaxTemperatureWithCounters │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MissingTemperatureFields.java.input.txt │ │ ├── SortByTemperatureToMapFile.ignore │ │ │ └── input.txt │ │ ├── SortByTemperatureUsingHashPartitioner.ignore │ │ │ └── input.txt │ │ ├── SortByTemperatureUsingHashPartitioner.java.input.txt │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java.input.txt │ │ ├── SortDataPreprocessor.ignore │ │ │ └── input.txt │ │ └── SortDataPreprocessor.java.input.txt │ ├── java │ │ ├── JoinRecordMapper.java │ │ ├── JoinRecordWithStationName.java │ │ ├── JoinReducer.java │ │ ├── JoinStationMapper.java │ │ ├── LookupRecordByTemperature.java │ │ ├── LookupRecordsByTemperature.java │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java │ │ ├── MaxTemperatureUsingSecondarySort.java │ │ ├── MaxTemperatureWithCounters.java │ │ ├── MissingTemperatureFields.java │ │ ├── SortByTemperatureToMapFile.java │ │ ├── SortByTemperatureUsingHashPartitioner.java │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java │ │ ├── SortDataPreprocessor.java │ │ ├── TemperatureDistribution.java │ │ └── oldapi │ │ │ ├── JoinRecordMapper.java │ │ │ ├── JoinRecordWithStationName.java │ │ │ ├── JoinReducer.java │ │ │ ├── JoinStationMapper.java │ │ │ ├── LookupRecordByTemperature.java │ │ │ ├── LookupRecordsByTemperature.java │ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java │ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.java │ │ │ ├── MaxTemperatureUsingSecondarySort.java │ │ │ ├── MaxTemperatureWithCounters.java │ │ │ ├── MissingTemperatureFields.java │ │ │ ├── SortByTemperatureToMapFile.java │ │ │ ├── SortByTemperatureUsingHashPartitioner.java │ │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java │ │ │ ├── SortDataPreprocessor.java │ │ │ └── TemperatureDistribution.java │ ├── python │ │ ├── max_daily_temp_map.py │ │ ├── max_daily_temp_reduce.py │ │ ├── mean_max_daily_temp.sh │ │ ├── mean_max_daily_temp_map.py │ │ ├── mean_max_daily_temp_reduce.py │ │ ├── secondary_sort.sh │ │ ├── secondary_sort_map.py │ │ └── secondary_sort_reduce.py │ ├── r │ │ ├── fixed-partitions │ │ ├── output │ │ ├── output_sorted │ │ ├── sampled-partitions │ │ ├── temperature_distribution.png │ │ └── temperature_distribution.r │ └── resources │ │ ├── MaxTemperatureWithCounters_Temperature.properties │ │ └── oldapi │ │ └── MaxTemperatureWithCounters_Temperature.properties │ └── test │ └── java │ └── KeyFieldBasedComparatorTest.java ├── ch10-setup └── src │ └── main │ ├── conf │ ├── core-site.xml │ ├── hdfs-site.xml │ └── yarn-site.xml │ └── sh │ └── trash.sh ├── ch12-avro ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── job.xml │ ├── c │ │ └── dump_pairs.c │ ├── examples │ │ ├── AvroGenericMaxTemperature │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-r-00000.avro.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-r-00000.avro │ │ └── AvroSort │ │ │ ├── input.txt │ │ │ 
└── output │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.avro.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000.avro │ ├── java │ │ ├── AvroGenericMaxTemperature.java │ │ ├── AvroSort.java │ │ ├── NcdcRecordParser.java │ │ └── oldapi │ │ │ ├── AvroGenericMaxTemperature.java │ │ │ ├── AvroProjection.java │ │ │ ├── AvroSort.java │ │ │ ├── AvroSpecificMaxTemperature.java │ │ │ └── NcdcRecordParser.java │ ├── py │ │ ├── test_avro.py │ │ └── write_pairs.py │ └── resources │ │ ├── AliasedStringPair.avsc │ │ ├── Array.avsc │ │ ├── Enum.avsc │ │ ├── Fixed.avsc │ │ ├── Map.avsc │ │ ├── NewStringPair.avsc │ │ ├── NewStringPairWithNull.avsc │ │ ├── ProjectedStringPair.avsc │ │ ├── SortedStringPair.avsc │ │ ├── StringPair.avsc │ │ ├── SwitchedStringPair.avsc │ │ ├── Union.avsc │ │ └── WeatherRecord.avsc │ └── test │ └── java │ └── AvroTest.java ├── ch13-parquet ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── job.xml │ ├── examples │ │ └── TextToParquetWithAvro │ │ │ ├── input.txt │ │ │ └── output │ │ │ ├── _SUCCESS │ │ │ ├── _metadata │ │ │ └── part-m-00000.parquet │ └── java │ │ ├── ParquetToTextWithAvro.java │ │ ├── ParquetToTextWithExample.java │ │ ├── TextToParquetWithAvro.java │ │ └── TextToParquetWithExample.java │ └── test │ ├── java │ ├── ParquetMRWithAvroTest.java │ ├── ParquetMRWithExampleTest.java │ └── ParquetTest.java │ └── resources │ ├── NewStringPair.avsc │ ├── ProjectedStringPair.avsc │ ├── StringPair.avsc │ └── fruit.txt ├── ch14-flume ├── spool-to-hdfs-and-logger.properties ├── spool-to-hdfs-avro.properties ├── spool-to-hdfs-partitioned.properties ├── spool-to-hdfs-tiered-load-balance.properties ├── spool-to-hdfs-tiered.properties ├── spool-to-hdfs.properties └── spool-to-logger.properties ├── ch15-sqoop ├── pom.xml ├── src │ └── main │ │ └── java │ │ ├── MaxWidgetId.java │ │ ├── MaxWidgetIdGenericAvro.java │ │ └── Widget.java └── widgets │ └── part-m-00000.avro ├── ch16-pig ├── pom.xml └── src │ ├── main │ ├── grunt │ │ ├── combine.grunt │ │ ├── disambiguate.grunt │ │ ├── flatten.grunt │ │ ├── foreach.grunt │ │ ├── group.grunt │ │ ├── join.grunt │ │ ├── max_temp.grunt │ │ ├── missing.grunt │ │ ├── multiquery.grunt │ │ ├── null.grunt │ │ ├── schema.grunt │ │ ├── set.grunt │ │ ├── sort.grunt │ │ ├── store.grunt │ │ ├── stream.grunt │ │ ├── tuples.grunt │ │ ├── types.grunt │ │ └── udfs.grunt │ ├── java │ │ └── com │ │ │ └── hadoopbook │ │ │ └── pig │ │ │ ├── CutLoadFunc.java │ │ │ ├── IsGoodQuality.java │ │ │ ├── Range.java │ │ │ └── Trim.java │ ├── pig │ │ ├── comment_c-style.pig │ │ ├── comment_single_line.pig │ │ ├── max_temp.macro │ │ ├── max_temp.pig │ │ ├── max_temp_filter_stream.pig │ │ ├── max_temp_filter_udf.pig │ │ ├── max_temp_macro.pig │ │ ├── max_temp_macro_import.pig │ │ ├── max_temp_param.param │ │ ├── max_temp_param.pig │ │ ├── max_temp_station_name.pig │ │ └── year_stats.pig │ └── python │ │ └── is_good_quality.py │ └── test │ └── java │ └── com │ └── hadoopbook │ └── pig │ ├── IsGoodQualityTest.java │ └── RangeTest.java ├── ch17-hive ├── pom.xml └── src │ └── main │ ├── hive │ ├── buckets.hive │ ├── conversions.hive │ ├── indexes.hive │ ├── joins.hive │ ├── mapreduce.hive │ ├── max_temp.hive │ ├── multitable_insert.hive │ ├── partitions.hive │ ├── regex_serde.hive │ ├── set.hive │ ├── sort.hive │ ├── storage.hive │ ├── types.hive │ └── udfs.hive │ ├── java │ └── com │ │ └── hadoopbook │ │ └── hive │ │ ├── Maximum.java │ │ ├── Mean.java │ │ └── Strip.java │ └── python │ ├── is_good_quality.py │ └── max_temperature_reduce.py ├── ch18-crunch ├── pom.xml └── src │ 
├── main │ ├── assembly │ │ └── hadoop-job.xml │ └── java │ │ └── crunch │ │ ├── AvroGenericMaxTemperatureCrunch.java │ │ ├── JoinRecordWithStationNameCrunch.java │ │ ├── MaxTemperatureByStationNameCrunch.java │ │ ├── MaxTemperatureCrunch.java │ │ ├── MaxTemperatureCrunchWithShutdownHook.java │ │ ├── MaxTemperatureUsingSecondarySortCrunch.java │ │ ├── MaxTemperatureWithCompressionCrunch.java │ │ ├── MaxTemperatureWithCountersCrunch.java │ │ ├── MaxTemperatureWithMultipleInputsCrunch.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadataParser.java │ │ ├── SortByTemperatureCrunch.java │ │ └── SplitCrunch.java │ └── test │ ├── java │ └── crunch │ │ ├── CheckpointTest.java │ │ ├── CountValuesFn.java │ │ ├── CustomDoFn.java │ │ ├── InversePairFn.java │ │ ├── JoinTest.java │ │ ├── MaterializeTest.java │ │ ├── NonSerializableOuterClass.java │ │ ├── ObjectReuseTest.java │ │ ├── PCollections.java │ │ ├── PageRankTest.java │ │ ├── PipelineDebugTest.java │ │ ├── PipelineExecutionTest.java │ │ ├── PrimitiveOperationsTest.java │ │ ├── SerializableFunctionsTest.java │ │ ├── SortTest.java │ │ ├── SourcesAndTargetsTest.java │ │ ├── ToLowerFn.java │ │ ├── TypesTest.java │ │ └── WeatherRecord.java │ └── resources │ ├── A │ ├── B │ ├── fruit.txt │ ├── ints.txt │ ├── log4j.properties │ ├── numbers.seq │ ├── sample.txt │ ├── set1.txt │ ├── set2.txt │ └── urls.txt ├── ch19-spark ├── pom.xml └── src │ ├── main │ ├── java │ │ └── MaxTemperatureSpark.java │ ├── python │ │ └── MaxTemperature.py │ └── scala │ │ ├── MaxTemperature.scala │ │ └── MaxTemperatureWithPlacement.scala │ └── test │ ├── avro │ ├── IntWrapper.avsc │ └── WeatherRecord.avsc │ ├── java │ └── SimpleTest.java │ ├── resources │ ├── fruit.txt │ ├── log4j.properties │ ├── numbers.seq │ ├── quangle.txt │ └── set2.txt │ └── scala │ ├── CustomKryoRegistrator.scala │ ├── DataSerializationTest.scala │ ├── FunctionSerializationTest.scala │ ├── RDDCreationTest.scala │ ├── ReflectWeatherRecord.scala │ ├── SharedDataTest.scala │ ├── TransformationsAndActionsTest.scala │ └── WordCountHistogramTest.scala ├── ch20-hbase ├── pom.xml └── src │ └── main │ └── java │ ├── ExampleClient.java │ ├── HBaseStationImporter.java │ ├── HBaseStationQuery.java │ ├── HBaseTemperatureBulkImporter.java │ ├── HBaseTemperatureDirectImporter.java │ ├── HBaseTemperatureImporter.java │ ├── HBaseTemperatureQuery.java │ ├── NewExampleClient.java │ ├── NewHBaseStationImporter.java │ ├── NewHBaseStationQuery.java │ ├── NewHBaseTemperatureQuery.java │ ├── RowKeyConverter.java │ └── SimpleRowCounter.java ├── ch21-zk ├── pom.xml └── src │ └── main │ ├── java │ ├── ActiveKeyValueStore.java │ ├── ConfigUpdater.java │ ├── ConfigWatcher.java │ ├── ConnectionWatcher.java │ ├── CreateGroup.java │ ├── DeleteGroup.java │ ├── JoinGroup.java │ ├── ListGroup.java │ ├── ResilientActiveKeyValueStore.java │ └── ResilientConfigUpdater.java │ └── sh │ └── group.sh ├── ch22-case-studies ├── pom.xml └── src │ └── main │ └── java │ ├── TrackStats.jr │ └── fm │ └── last │ └── hadoop │ ├── io │ └── records │ │ └── TrackStats.java │ └── programs │ └── labs │ └── trackstats │ └── TrackStatisticsProgram.java ├── common ├── pom.xml └── src │ ├── main │ └── java │ │ ├── JobBuilder.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadata.java │ │ ├── NcdcStationMetadataParser.java │ │ └── oldapi │ │ ├── JobBuilder.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadata.java │ │ └── 
NcdcStationMetadataParser.java │ └── test │ └── java │ ├── MetOfficeRecordParserTest.java │ ├── NcdcRecordParserTest.java │ └── NcdcStationMetadataParserTest.java ├── conf ├── hadoop-cluster.template.xml ├── hadoop-local.xml ├── hadoop-localhost.xml ├── hadoop │ └── pseudo-distributed │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── yarn-site.xml ├── pig │ └── localhost │ │ └── pig.properties └── zookeeper │ ├── cluster │ └── zoo.cfg │ ├── localhost │ └── zoo.cfg │ └── log4j.properties ├── hadoop-examples └── pom.xml ├── hadoop-meta └── pom.xml ├── input ├── avro │ └── pairs.avro ├── badrecords │ ├── a │ ├── b │ └── c ├── docs │ ├── 1400-8.txt │ └── quangle.txt ├── fileglob │ ├── 2007 │ │ └── 12 │ │ │ ├── 30 │ │ │ ├── data-2007-12-30 │ │ │ └── data[2007-12-30] │ │ │ └── 31 │ │ │ └── data-2007-12-31 │ └── 2008 │ │ └── 01 │ │ └── 01 │ │ └── data-2008-01-01 ├── fileinput │ ├── a │ └── dir │ │ └── b ├── hive │ ├── README │ ├── dummy.txt │ ├── joins │ │ ├── sales.txt │ │ └── things.txt │ ├── partitions │ │ ├── file1 │ │ ├── file2 │ │ ├── file3 │ │ ├── file4 │ │ ├── file5 │ │ └── file6 │ ├── tables │ │ ├── users.txt │ │ └── users_extended.txt │ ├── tmp.txt │ ├── types │ │ ├── complex.txt │ │ └── nested.txt │ └── udfs │ │ ├── arrays.txt │ │ ├── fruit.txt │ │ ├── max1.txt │ │ └── max2.txt ├── metoffice │ ├── aberporthdata.txt │ ├── armaghdata.txt │ ├── bradforddata.txt │ ├── braemardata.txt │ ├── cambridgedata.txt │ ├── cardiffdata.txt │ ├── durhamdata.txt │ ├── eastbournedata.txt │ ├── greenwichdata.txt │ ├── hurndata.txt │ ├── lerwickdata.txt │ ├── leucharsdata.txt │ ├── newtonriggdata.txt │ ├── oxforddata.txt │ ├── paisleydata.txt │ ├── ringwaydata.txt │ ├── rossonwyedata.txt │ ├── shawburydata.txt │ ├── sheffielddata.txt │ ├── southamptondata.txt │ ├── stmawgandata.txt │ ├── stornowaydata.txt │ ├── suttonbonningtondata.txt │ ├── tireedata.txt │ ├── valleydata.txt │ └── yeoviltondata.txt ├── ncdc │ ├── all │ │ ├── 1901.gz │ │ └── 1902.gz │ ├── metadata │ │ ├── ish-history.txt │ │ └── stations-fixed-width.txt │ ├── micro-tab │ │ ├── sample.txt │ │ ├── sample2.txt │ │ └── sample_corrupt.txt │ ├── micro │ │ └── sample.txt │ ├── sample.txt │ └── sample.txt.gz ├── pig │ ├── combine │ │ ├── A │ │ └── B │ ├── corrupt │ │ └── missing_fields │ ├── foreach │ │ └── A │ ├── group │ │ └── A │ ├── join │ │ ├── A │ │ └── B │ ├── multiquery │ │ └── A │ ├── nested │ │ ├── A │ │ └── B │ ├── pairwise │ │ └── postings │ ├── schema │ │ └── A │ ├── sort │ │ └── A │ ├── tuples │ │ └── A │ ├── types │ │ ├── A │ │ ├── B │ │ ├── C │ │ └── one │ └── udfs │ │ └── A ├── smallfiles │ ├── a │ ├── b │ ├── c │ ├── d │ ├── e │ └── f └── wikipedia │ ├── example.xml │ └── sample.xml ├── pom.xml └── snippet ├── README ├── bin ├── check_expected.sh ├── check_manuscript.py ├── check_manuscript.sh ├── generate_listings.sh ├── grunter.sh ├── hiver.sh ├── phragmite_db.pl ├── phragmite_hive.py └── phragmite_pig.py ├── conf ├── local │ ├── capacity-scheduler.xml │ ├── configuration.xsl │ ├── container-executor.cfg │ ├── core-site.xml │ ├── hadoop-env.cmd │ ├── hadoop-env.sh │ ├── hadoop-metrics.properties │ ├── hadoop-metrics2.properties │ ├── hadoop-policy.xml │ ├── hdfs-site.xml │ ├── httpfs-env.sh │ ├── httpfs-log4j.properties │ ├── httpfs-signature.secret │ ├── httpfs-site.xml │ ├── log4j.properties │ ├── mapred-env.cmd │ ├── mapred-env.sh │ ├── mapred-queues.xml.template │ ├── mapred-site.xml │ ├── mapred-site.xml.template │ ├── slaves │ ├── ssl-client.xml.example │ ├── ssl-server.xml.example │ ├── 
yarn-env.cmd │ ├── yarn-env.sh │ └── yarn-site.xml └── pseudo │ ├── capacity-scheduler.xml │ ├── capacity-scheduler.xml.old │ ├── configuration.xsl │ ├── container-executor.cfg │ ├── core-site.xml │ ├── fair-scheduler.xml │ ├── hadoop-env.cmd │ ├── hadoop-env.sh │ ├── hadoop-metrics.properties │ ├── hadoop-metrics2.properties │ ├── hadoop-policy.xml │ ├── hdfs-site.xml │ ├── httpfs-env.sh │ ├── httpfs-log4j.properties │ ├── httpfs-signature.secret │ ├── httpfs-site.xml │ ├── log4j.properties │ ├── mapred-env.cmd │ ├── mapred-env.sh │ ├── mapred-queues.xml.template │ ├── mapred-site.xml │ ├── mapred-site.xml.template │ ├── slaves │ ├── ssl-client.xml.example │ ├── ssl-server.xml.example │ ├── yarn-env.cmd │ ├── yarn-env.sh │ └── yarn-site.xml ├── expected └── ch11 │ └── grunt │ ├── combine_schema.xml │ ├── combine_union.xml │ ├── foreach_generate.xml │ ├── group_all.xml │ ├── group_dump.xml │ ├── group_expression.xml │ ├── join_cogroup.xml │ ├── join_cogroup_flatten.xml │ ├── join_cogroup_inner.xml │ ├── join_cogroup_join.xml │ ├── join_dump.xml │ ├── join_frj.xml │ ├── join_join.xml │ ├── max_temp_describe_records.xml │ ├── max_temp_dump_grouped_records.xml │ ├── max_temp_dump_records.xml │ ├── max_temp_filter_records.xml │ ├── max_temp_load.xml │ ├── max_temp_max_temp.xml │ ├── max_temp_result.xml │ ├── missing_fields.xml │ ├── null_corrupt.xml │ ├── null_count.xml │ ├── null_dump.xml │ ├── null_split.xml │ ├── null_undetected.xml │ ├── schema_absent.xml │ ├── schema_absent_projected.xml │ ├── schema_names_only.xml │ ├── schema_one_type_only.xml │ ├── schema_types.xml │ ├── set_debug_on.xml │ ├── sort_dump.xml │ ├── sort_limit.xml │ ├── sort_no_order.xml │ ├── sort_order.xml │ ├── store_colon_delimited.xml │ ├── stream_cut.xml │ ├── udfs_invoke_long.xml │ ├── udfs_invoke_short.xml │ ├── udfs_load.xml │ ├── udfs_register.xml │ └── udfs_schema.xml ├── pom.xml └── src └── test ├── java └── ExamplesIT.java └── resources ├── copyoutput.sh └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | /*.jar 2 | *.log 3 | /build 4 | /lib 5 | /out 6 | /output* 7 | ch*/maxwidget 8 | snippet/actual 9 | target 10 | /target 11 | .classpath 12 | .project 13 | .pydevproject 14 | .settings 15 | metastore_db 16 | -------------------------------------------------------------------------------- /appc/src/main/sh/create_ncdc_files.sh: -------------------------------------------------------------------------------- 1 | for ((i=1901;i<=2000;i+=1)) 2 | do 3 | echo s3n://hadoopbook/ncdc/raw/isd-$i.tar.bz2 4 | done 5 | -------------------------------------------------------------------------------- /appc/src/main/sh/load_ncdc.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -D mapred.reduce.tasks=0 \ 3 | -D mapred.map.tasks.speculative.execution=false \ 4 | -D mapred.task.timeout=12000000 \ 5 | -input ncdc_files.txt \ 6 | -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat \ 7 | -output output \ 8 | -mapper load_ncdc_map.sh \ 9 | -file load_ncdc_map.sh 10 | 11 | -------------------------------------------------------------------------------- /appc/src/main/sh/load_ncdc_map.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # NLineInputFormat gives a single line: key is offset, value is S3 URI 4 | read offset s3file 5 | 6 | # Retrieve file from S3 to local disk 7 | echo 
"reporter:status:Retrieving $s3file" >&2 8 | $HADOOP_INSTALL/bin/hadoop fs -get $s3file . 9 | 10 | # Un-bzip and un-tar the local file 11 | target=`basename $s3file .tar.bz2` 12 | mkdir -p $target 13 | echo "reporter:status:Un-tarring $s3file to $target" >&2 14 | tar jxf `basename $s3file` -C $target 15 | 16 | # Un-gzip each station file and concat into one file 17 | echo "reporter:status:Un-gzipping $target" >&2 18 | for file in $target/*/* 19 | do 20 | gunzip -c $file >> $target.all 21 | echo "reporter:status:Processed $file" >&2 22 | done 23 | 24 | # Put gzipped version into HDFS 25 | echo "reporter:status:Gzipping $target and putting in HDFS" >&2 26 | gzip -c $target.all | $HADOOP_INSTALL/bin/hadoop fs -put - gz/$target.gz -------------------------------------------------------------------------------- /book/src/main/assembly/jar.xml: -------------------------------------------------------------------------------- 1 | 4 | jar 5 | 6 | jar 7 | 8 | false 9 | 10 | 11 | / 12 | true 13 | true 14 | runtime 15 | false 16 | 17 | com.hadoopbook:* 18 | 19 | 20 | 21 | 22 | 23 | target/classes 24 | / 25 | 26 | 27 | -------------------------------------------------------------------------------- /book/src/main/assembly/oozie-workflow-application.xml: -------------------------------------------------------------------------------- 1 | 4 | oozie-workflow-application 5 | 6 | dir 7 | 8 | false 9 | 10 | 11 | ../ch06-mr-dev/src/main/resources/max-temp-workflow 12 | max-temp-workflow 13 | 14 | 15 | 16 | 17 | ../hadoop-examples.jar 18 | max-temp-workflow/lib 19 | 20 | 21 | -------------------------------------------------------------------------------- /ch02-mr-intro/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch02-mr-intro 12 | jar 13 | 4.0 14 | Chapter 2: MapReduce 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/awk/max_temperature.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | for year in all/* 3 | do 4 | echo -ne `basename $year .gz`"\t" 5 | gunzip -c $year | \ 6 | awk '{ temp = substr($0, 88, 5) + 0; 7 | q = substr($0, 93, 1); 8 | if (temp !=9999 && q ~ /[01459]/ && temp > max) max = temp } 9 | END { print max }' 10 | done -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/cpp/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CPPFLAGS = -m32 -I$(HADOOP_INSTALL)/c++/$(PLATFORM)/include 3 | 4 | max_temperature: max_temperature.cpp 5 | $(CC) $(CPPFLAGS) $< -Wall -L$(HADOOP_INSTALL)/c++/$(PLATFORM)/lib -lhadooppipes \ 6 | -lhadooputils -lpthread -g -O2 -o $@ 7 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/MaxTemperature/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperature input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/MaxTemperature/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | 
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithCombiner input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/output/part-r-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/input.txt:
--------------------------------------------------------------------------------
hadoop OldMaxTemperature input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/output/part-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature.cpp.input.txt:
--------------------------------------------------------------------------------
hadoop pipes \
  -D hadoop.pipes.java.recordreader=true \
  -D hadoop.pipes.java.recordwriter=true \
  -input sample.txt \
  -output output \
  -program bin/max_temperature
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input/ncdc/sample.txt \
  -output output \
  -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
  -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop_cluster.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -files ch02-mr-intro/src/main/ruby/max_temperature_map.rb,\
ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
  -input input/ncdc/all \
  -output output \
  -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
  -combiner ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
  -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input/ncdc/sample.txt \
  -output output \
  -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \
  -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/output/part-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
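Editor's note: the streaming invocations above drive the Ruby and Python mappers; their Java counterpart, ch02-mr-intro/src/main/java/MaxTemperatureMapper.java, appears in the tree but is not reproduced in this excerpt. The sketch below is an illustration, not the repository's exact source: it assumes the same fixed-width NCDC record layout the scripted mappers use (year at offsets 15-19, temperature at 87-92, quality flag at 92, missing value 9999).

// vv MaxTemperatureMapperSketch (illustrative, assumed layout)
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MaxTemperatureMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final int MISSING = 9999;

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    String line = value.toString();
    String year = line.substring(15, 19);
    int airTemperature;
    if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
      airTemperature = Integer.parseInt(line.substring(88, 92));
    } else {
      airTemperature = Integer.parseInt(line.substring(87, 92));
    }
    String quality = line.substring(92, 93);
    if (airTemperature != MISSING && quality.matches("[01459]")) {
      context.write(new Text(year), new IntWritable(airTemperature));
    }
  }
}
// ^^ MaxTemperatureMapperSketch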
/ch02-mr-intro/src/main/examples/max_temperature_py/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \ 5 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -files ch02-mr-intro/src/main/python/max_temperature_map.py,\ 3 | ch02-mr-intro/src/main/python/max_temperature_reduce.py \ 4 | -input input/ncdc/sample.txt \ 5 | -output output \ 6 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \ 7 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py 8 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/output/part-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/java/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | // cc MaxTemperatureReducer Reducer for maximum temperature example 2 | // vv MaxTemperatureReducer 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class MaxTemperatureReducer 10 | extends Reducer { 11 | 12 | @Override 13 | public void reduce(Text key, Iterable values, 14 | Context context) 15 | throws IOException, InterruptedException { 16 | 17 | int maxValue = Integer.MIN_VALUE; 18 | for (IntWritable value : values) { 19 | maxValue = Math.max(maxValue, value.get()); 20 | } 21 | context.write(key, new IntWritable(maxValue)); 22 | } 23 | } 24 | // ^^ MaxTemperatureReducer 25 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/java/oldapi/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapred.MapReduceBase; 9 | import org.apache.hadoop.mapred.OutputCollector; 10 | import org.apache.hadoop.mapred.Reducer; 11 | import org.apache.hadoop.mapred.Reporter; 12 | 13 | public class MaxTemperatureReducer extends MapReduceBase 14 | implements Reducer { 15 | 16 | public void reduce(Text key, Iterator values, 17 | OutputCollector output, Reporter reporter) 18 | throws IOException { 19 | 20 | int maxValue = Integer.MIN_VALUE; 21 | while (values.hasNext()) { 22 | maxValue = Math.max(maxValue, values.next().get()); 23 | } 24 | output.collect(key, new IntWritable(maxValue)); 25 | } 26 | } 27 | 
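Editor's note: ch02-mr-intro/src/main/java/MaxTemperature.java, the driver run by `hadoop MaxTemperature input/ncdc/sample.txt output` earlier in this listing, is also only present in the tree. The sketch below shows how a driver could wire a mapper together with the new-API MaxTemperatureReducer shown just above; it assumes the standard Job.getInstance() setup and is not the repository's exact source.

// vv MaxTemperatureDriverSketch (illustrative)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxTemperature {

  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("Usage: MaxTemperature <input path> <output path>");
      System.exit(-1);
    }

    Job job = Job.getInstance();
    job.setJarByClass(MaxTemperature.class);
    job.setJobName("Max temperature");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
// ^^ MaxTemperatureDriverSketch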
-------------------------------------------------------------------------------- /ch02-mr-intro/src/main/python/max_temperature_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp, q) = (val[15:19], val[87:92], val[92:93]) 9 | if (temp != "+9999" and re.match("[01459]", q)): 10 | print "%s\t%s" % (year, temp) 11 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/python/max_temperature_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, -sys.maxint) 6 | for line in sys.stdin: 7 | (key, val) = line.strip().split("\t") 8 | if last_key and last_key != key: 9 | print "%s\t%s" % (last_key, max_val) 10 | (last_key, max_val) = (key, int(val)) 11 | else: 12 | (last_key, max_val) = (key, max(max_val, int(val))) 13 | 14 | if last_key: 15 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/ruby/max_temperature_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | STDIN.each_line do |line| 4 | val = line 5 | year, temp, q = val[15,4], val[87,5], val[92,1] 6 | puts "#{year}\t#{temp}" if (temp != "+9999" && q =~ /[01459]/) 7 | end -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | last_key, max_val = nil, -1000000 4 | STDIN.each_line do |line| 5 | key, val = line.split("\t") 6 | if last_key && last_key != key 7 | puts "#{last_key}\t#{max_val}" 8 | last_key, max_val = key, val.to_i 9 | else 10 | last_key, max_val = key, [max_val, val.to_i].max 11 | end 12 | end 13 | puts "#{last_key}\t#{max_val}" if last_key -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/sh/max_temp.sh: -------------------------------------------------------------------------------- 1 | : == max_temp_java 2 | : == max_temp_java_output 3 | : == max_temp_ruby_map 4 | : == max_temp_ruby_pipeline 5 | : == max_temp_python_pipeline 6 | rm -r /Users/tom/workspace/htdg/output 7 | : vv max_temp_java 8 | export HADOOP_CLASSPATH=build/classes 9 | hadoop MaxTemperature input/ncdc/sample.txt output 10 | : ^^ max_temp_java 11 | : vv max_temp_java_output 12 | cat output/part-00000 13 | : ^^ max_temp_java_output 14 | : vv max_temp_ruby_map 15 | cat input/ncdc/sample.txt | ch02-mr-intro/src/main/ruby/max_temperature_map.rb 16 | : ^^ max_temp_ruby_map 17 | : vv max_temp_ruby_pipeline 18 | cat input/ncdc/sample.txt | \ 19 | ch02-mr-intro/src/main/ruby/max_temperature_map.rb | \ 20 | sort | ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb 21 | : ^^ max_temp_ruby_pipeline 22 | : vv max_temp_python_pipeline 23 | cat input/ncdc/sample.txt | \ 24 | ch02-mr-intro/src/main/python/max_temperature_map.py | \ 25 | sort | ch02-mr-intro/src/main/python/max_temperature_reduce.py 26 | : ^^ max_temp_python_pipeline 27 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | 
2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/FileSystemCat.java: -------------------------------------------------------------------------------- 1 | // cc FileSystemCat Displays files from a Hadoop filesystem on standard output by using the FileSystem directly 2 | import java.io.InputStream; 3 | import java.net.URI; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | 10 | // vv FileSystemCat 11 | public class FileSystemCat { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | InputStream in = null; 18 | try { 19 | in = fs.open(new Path(uri)); 20 | IOUtils.copyBytes(in, System.out, 4096, false); 21 | } finally { 22 | IOUtils.closeStream(in); 23 | } 24 | } 25 | } 26 | // ^^ FileSystemCat 27 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/FileSystemDoubleCat.java: -------------------------------------------------------------------------------- 1 | // cc FileSystemDoubleCat Displays files from a Hadoop filesystem on standard output twice, by using seek 2 | import java.net.URI; 3 | 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FSDataInputStream; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | 10 | // vv FileSystemDoubleCat 11 | public class FileSystemDoubleCat { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | FSDataInputStream in = null; 18 | try { 19 | in = fs.open(new Path(uri)); 20 | IOUtils.copyBytes(in, System.out, 4096, false); 21 | in.seek(0); // go back to the start of the file 22 | IOUtils.copyBytes(in, System.out, 4096, false); 23 | } finally { 24 | IOUtils.closeStream(in); 25 | } 26 | } 27 | } 28 | // ^^ FileSystemDoubleCat 29 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/ListStatus.java: -------------------------------------------------------------------------------- 1 | // cc ListStatus Shows the file statuses for a collection of paths in a Hadoop filesystem 2 | import java.net.URI; 3 | 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileStatus; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.FileUtil; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | // vv ListStatus 11 | public class ListStatus { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | 18 | Path[] paths = new Path[args.length]; 19 | for (int i = 0; i < paths.length; i++) { 20 | paths[i] = new Path(args[i]); 21 | } 22 | 23 | FileStatus[] status = fs.listStatus(paths); 24 | Path[] 
listedPaths = FileUtil.stat2Paths(status); 25 | for (Path p : listedPaths) { 26 | System.out.println(p); 27 | } 28 | } 29 | } 30 | // ^^ ListStatus 31 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/RegexExcludePathFilter.java: -------------------------------------------------------------------------------- 1 | // cc RegexExcludePathFilter A PathFilter for excluding paths that match a regular expression 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.fs.PathFilter; 4 | 5 | // vv RegexExcludePathFilter 6 | public class RegexExcludePathFilter implements PathFilter { 7 | 8 | private final String regex; 9 | 10 | public RegexExcludePathFilter(String regex) { 11 | this.regex = regex; 12 | } 13 | 14 | public boolean accept(Path path) { 15 | return !path.toString().matches(regex); 16 | } 17 | } 18 | // ^^ RegexExcludePathFilter 19 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/RegexPathFilter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.fs.Path; 2 | import org.apache.hadoop.fs.PathFilter; 3 | 4 | public class RegexPathFilter implements PathFilter { 5 | 6 | private final String regex; 7 | private final boolean include; 8 | 9 | public RegexPathFilter(String regex) { 10 | this(regex, true); 11 | } 12 | 13 | public RegexPathFilter(String regex, boolean include) { 14 | this.regex = regex; 15 | this.include = include; 16 | } 17 | 18 | public boolean accept(Path path) { 19 | return (path.toString().matches(regex)) ? include : !include; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/URLCat.java: -------------------------------------------------------------------------------- 1 | // cc URLCat Displays files from a Hadoop filesystem on standard output using a URLStreamHandler 2 | import java.io.InputStream; 3 | import java.net.URL; 4 | 5 | import org.apache.hadoop.fs.FsUrlStreamHandlerFactory; 6 | import org.apache.hadoop.io.IOUtils; 7 | 8 | // vv URLCat 9 | public class URLCat { 10 | 11 | static { 12 | URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); 13 | } 14 | 15 | public static void main(String[] args) throws Exception { 16 | InputStream in = null; 17 | try { 18 | in = new URL(args[0]).openStream(); 19 | IOUtils.copyBytes(in, System.out, 4096, false); 20 | } finally { 21 | IOUtils.closeStream(in); 22 | } 23 | } 24 | } 25 | // ^^ URLCat 26 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/sh/file.sh: -------------------------------------------------------------------------------- 1 | : == url_cat 2 | : == filesystem_cat 3 | : == filesystem_double_cat 4 | : == list_status 5 | : == file_copy_with_progress 6 | rm -r /Users/tom/workspace/htdg/output 7 | export HADOOP_CLASSPATH=build/classes 8 | : vv url_cat 9 | hadoop URLCat hdfs://localhost/user/tom/quangle.txt 10 | : ^^ url_cat 11 | : vv filesystem_cat 12 | hadoop FileSystemCat hdfs://localhost/user/tom/quangle.txt 13 | : ^^ filesystem_cat 14 | : vv filesystem_double_cat 15 | hadoop FileSystemDoubleCat hdfs://localhost/user/tom/quangle.txt 16 | : ^^ filesystem_double_cat 17 | : vv list_status 18 | hadoop ListStatus hdfs://localhost/ hdfs://localhost/user/tom 19 | : ^^ list_status 20 | : vv file_copy_with_progress 21 | hadoop FileCopyWithProgress input/docs/1400-8.txt 
hdfs://localhost/user/tom/1400-8.txt 22 | : ^^ file_copy_with_progress 23 | 24 | 25 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/sh/hars.sh: -------------------------------------------------------------------------------- 1 | : == har_ls_files 2 | : == har_create 3 | : == har_inspect 4 | : == har_ls 5 | : == har_ls_long 6 | : == har_rmr 7 | rsync -avz --exclude '.svn' /Users/tom/workspace/htdg/input/fileinput/ /tmp/fileinput 8 | hadoop fs -copyFromLocal /tmp/fileinput /my/files 9 | rm -rf /tmp/fileinput 10 | : vv har_ls_files 11 | hadoop fs -lsr /my/files 12 | : ^^ har_ls_files 13 | : vv har_create 14 | hadoop archive -archiveName files.har /my/files /my 15 | : ^^ har_create 16 | : vv har_inspect 17 | hadoop fs -ls /my 18 | hadoop fs -ls /my/files.har 19 | : ^^ har_inspect 20 | : vv har_ls 21 | hadoop fs -lsr har:///my/files.har 22 | : ^^ har_ls 23 | : vv har_ls_long 24 | hadoop fs -lsr har:///my/files.har/my/files/dir 25 | hadoop fs -lsr har://hdfs-localhost:8020/my/files.har/my/files/dir 26 | : ^^ har_ls_long 27 | : vv har_rmr 28 | hadoop fs -rmr /my/files.har 29 | : ^^ har_rmr 30 | hadoop fs -rmr /my/files -------------------------------------------------------------------------------- /ch04-yarn/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.scheduler.capacity.root.queues 5 | prod,dev 6 | 7 | 8 | yarn.scheduler.capacity.root.dev.queues 9 | eng,science 10 | 11 | 12 | yarn.scheduler.capacity.root.prod.capacity 13 | 40 14 | 15 | 16 | yarn.scheduler.capacity.root.dev.capacity 17 | 60 18 | 19 | 20 | yarn.scheduler.capacity.root.dev.maximum-capacity 21 | 75 22 | 23 | 24 | yarn.scheduler.capacity.root.dev.eng.capacity 25 | 50 26 | 27 | 28 | yarn.scheduler.capacity.root.dev.science.capacity 29 | 50 30 | 31 | -------------------------------------------------------------------------------- /ch04-yarn/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fair 4 | 5 | 6 | 40 7 | fifo 8 | 9 | 10 | 11 | 60 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /ch05-io/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch05-io 12 | jar 13 | 4.0 14 | Chapter 5: Hadoop I/O 15 | 16 | 17 | com.hadoopbook 18 | ch02-mr-intro 19 | 4.0 20 | 21 | 22 | junit 23 | junit 24 | 25 | 26 | org.hamcrest 27 | hamcrest-all 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /ch05-io/src/main/examples/FileDecompressor.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop FileDecompressor file.gz -------------------------------------------------------------------------------- /ch05-io/src/main/examples/MapFile-data-head.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text numbers.map/data | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/MapFile-data-head.output.txt: -------------------------------------------------------------------------------- 1 | 1 One, two, buckle my shoe 2 | 2 Three, four, shut the door 3 | 3 Five, six, pick up sticks 4 | 4 Seven, eight, lay them straight 
5   Nine, ten, a big fat hen
6   One, two, buckle my shoe
7   Three, four, shut the door
8   Five, six, pick up sticks
9   Seven, eight, lay them straight
10  Nine, ten, a big fat hen
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.input.txt:
--------------------------------------------------------------------------------
hadoop fs -text numbers.map/index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.output.txt:
--------------------------------------------------------------------------------
1    128
129  6079
257  12054
385  18030
513  24002
641  29976
769  35947
897  41922
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.input.txt:
--------------------------------------------------------------------------------
ls -l numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.output.txt:
--------------------------------------------------------------------------------
total 104
-rw-r--r--  1 tom  tom  47898 Jul 29 22:06 data
-rw-r--r--  1 tom  tom    251 Jul 29 22:06 index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFileWriteDemo.java.input.txt:
--------------------------------------------------------------------------------
hadoop MapFileWriteDemo numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithMapOutputCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/output/part-r-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSort.java.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
  -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  -outKey org.apache.hadoop.io.IntWritable \
  -outValue org.apache.hadoop.io.Text \
  numbers.seq sorted
--------------------------------------------------------------------------------
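Editor's note: the sort command above consumes the numbers.seq file written by ch05-io/src/main/java/SequenceFileWriteDemo.java (invoked as `hadoop SequenceFileWriteDemo numbers.seq` later in this listing), whose source is not reproduced in this excerpt. The sketch below shows one way to produce the pattern visible in the outputs — IntWritable keys counting down from 100 paired with Text nursery-rhyme lines. The class name SequenceFileWriteSketch is hypothetical, and the FileSystem-based createWriter signature is an assumption about the demo's approach, not the repository's exact code.

// vv SequenceFileWriteSketch (illustrative)
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileWriteSketch {

  private static final String[] DATA = {
    "One, two, buckle my shoe",
    "Three, four, shut the door",
    "Five, six, pick up sticks",
    "Seven, eight, lay them straight",
    "Nine, ten, a big fat hen"
  };

  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path,
          key.getClass(), value.getClass());
      for (int i = 0; i < 100; i++) {
        key.set(100 - i);                  // keys count down from 100
        value.set(DATA[i % DATA.length]);  // values cycle through the rhyme
        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
}
// ^^ SequenceFileWriteSketch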
/ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text sorted/part-00000 | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.output.txt: -------------------------------------------------------------------------------- 1 | 1 Nine, ten, a big fat hen 2 | 2 Seven, eight, lay them straight 3 | 3 Five, six, pick up sticks 4 | 4 Three, four, shut the door 5 | 5 One, two, buckle my shoe 6 | 6 Nine, ten, a big fat hen 7 | 7 Seven, eight, lay them straight 8 | 8 Five, six, pick up sticks 9 | 9 Three, four, shut the door 10 | 10 One, two, buckle my shoe -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.pre.sh: -------------------------------------------------------------------------------- 1 | # Produce sorted seq file 2 | hadoop SequenceFileWriteDemo numbers.seq 3 | 4 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \ 5 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \ 6 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \ 7 | -outKey org.apache.hadoop.io.IntWritable \ 8 | -outValue org.apache.hadoop.io.Text \ 9 | numbers.seq sorted -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop SequenceFileReadDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.output.txt: -------------------------------------------------------------------------------- 1 | [128] 100 One, two, buckle my shoe 2 | [173] 99 Three, four, shut the door 3 | [220] 98 Five, six, pick up sticks 4 | [264] 97 Seven, eight, lay them straight 5 | [314] 96 Nine, ten, a big fat hen 6 | [359] 95 One, two, buckle my shoe 7 | [404] 94 Three, four, shut the door 8 | [451] 93 Five, six, pick up sticks 9 | [495] 92 Seven, eight, lay them straight 10 | [545] 91 Nine, ten, a big fat hen 11 | [590] 90 One, two, buckle my shoe 12 | ... 13 | [1976] 60 One, two, buckle my shoe 14 | [2021*] 59 Three, four, shut the door 15 | [2088] 58 Five, six, pick up sticks 16 | [2132] 57 Seven, eight, lay them straight 17 | [2182] 56 Nine, ten, a big fat hen 18 | ... 
19 | [4557] 5 One, two, buckle my shoe 20 | [4602] 4 Three, four, shut the door 21 | [4649] 3 Five, six, pick up sticks 22 | [4693] 2 Seven, eight, lay them straight 23 | [4743] 1 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.pre.sh: -------------------------------------------------------------------------------- 1 | # Make sure file is there to be read 2 | hadoop SequenceFileWriteDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-fix.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop MapFileFixer numbers.map -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-mv.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -mv numbers.map/part-00000 numbers.map/data -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-sort.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \ 2 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \ 3 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \ 4 | -outKey org.apache.hadoop.io.IntWritable \ 5 | -outValue org.apache.hadoop.io.Text \ 6 | numbers.seq numbers.map -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileWriteDemo.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop SequenceFileWriteDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileWriteDemo.java.output.txt: -------------------------------------------------------------------------------- 1 | [128] 100 One, two, buckle my shoe 2 | [173] 99 Three, four, shut the door 3 | [220] 98 Five, six, pick up sticks 4 | [264] 97 Seven, eight, lay them straight 5 | [314] 96 Nine, ten, a big fat hen 6 | [359] 95 One, two, buckle my shoe 7 | [404] 94 Three, four, shut the door 8 | [451] 93 Five, six, pick up sticks 9 | [495] 92 Seven, eight, lay them straight 10 | [545] 91 Nine, ten, a big fat hen 11 | ... 12 | [1976] 60 One, two, buckle my shoe 13 | [2021] 59 Three, four, shut the door 14 | [2088] 58 Five, six, pick up sticks 15 | [2132] 57 Seven, eight, lay them straight 16 | [2182] 56 Nine, ten, a big fat hen 17 | ... 
18 | [4557] 5 One, two, buckle my shoe 19 | [4602] 4 Three, four, shut the door 20 | [4649] 3 Five, six, pick up sticks 21 | [4693] 2 Seven, eight, lay them straight 22 | [4743] 1 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/examples/StreamCompressor.java.input.txt: -------------------------------------------------------------------------------- 1 | echo "Text" | hadoop StreamCompressor org.apache.hadoop.io.compress.GzipCodec \ 2 | | gunzip - -------------------------------------------------------------------------------- /ch05-io/src/main/examples/StreamCompressor.java.output.txt: -------------------------------------------------------------------------------- 1 | Text -------------------------------------------------------------------------------- /ch05-io/src/main/examples/TextIterator.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop TextIterator -------------------------------------------------------------------------------- /ch05-io/src/main/examples/TextIterator.java.output.txt: -------------------------------------------------------------------------------- 1 | 41 2 | df 3 | 6771 4 | 10400 -------------------------------------------------------------------------------- /ch05-io/src/main/examples/hadoop-fs-text.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text numbers.seq | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/hadoop-fs-text.output.txt: -------------------------------------------------------------------------------- 1 | 100 One, two, buckle my shoe 2 | 99 Three, four, shut the door 3 | 98 Five, six, pick up sticks 4 | 97 Seven, eight, lay them straight 5 | 96 Nine, ten, a big fat hen 6 | 95 One, two, buckle my shoe 7 | 94 Three, four, shut the door 8 | 93 Five, six, pick up sticks 9 | 92 Seven, eight, lay them straight 10 | 91 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/java/StreamCompressor.java: -------------------------------------------------------------------------------- 1 | // cc StreamCompressor A program to compress data read from standard input and write it to standard output 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.io.IOUtils; 4 | import org.apache.hadoop.io.compress.CompressionCodec; 5 | import org.apache.hadoop.io.compress.CompressionOutputStream; 6 | import org.apache.hadoop.util.ReflectionUtils; 7 | 8 | // vv StreamCompressor 9 | public class StreamCompressor { 10 | 11 | public static void main(String[] args) throws Exception { 12 | String codecClassname = args[0]; 13 | Class codecClass = Class.forName(codecClassname); 14 | Configuration conf = new Configuration(); 15 | CompressionCodec codec = (CompressionCodec) 16 | ReflectionUtils.newInstance(codecClass, conf); 17 | 18 | CompressionOutputStream out = codec.createOutputStream(System.out); 19 | IOUtils.copyBytes(System.in, out, 4096, false); 20 | out.finish(); 21 | } 22 | } 23 | // ^^ StreamCompressor 24 | -------------------------------------------------------------------------------- /ch05-io/src/main/java/TextArrayWritable.java: -------------------------------------------------------------------------------- 1 | // == TextArrayWritable 2 | import org.apache.hadoop.io.ArrayWritable; 3 | import org.apache.hadoop.io.Text; 4 | 5 | // 
vv TextArrayWritable 6 | public class TextArrayWritable extends ArrayWritable { 7 | public TextArrayWritable() { 8 | super(Text.class); 9 | } 10 | } 11 | // ^^ TextArrayWritable 12 | -------------------------------------------------------------------------------- /ch05-io/src/main/java/TextIterator.java: -------------------------------------------------------------------------------- 1 | // cc TextIterator Iterating over the characters in a Text object 2 | import java.nio.ByteBuffer; 3 | 4 | import org.apache.hadoop.io.Text; 5 | 6 | // vv TextIterator 7 | public class TextIterator { 8 | 9 | public static void main(String[] args) { 10 | Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00"); 11 | 12 | ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength()); 13 | int cp; 14 | while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) { 15 | System.out.println(Integer.toHexString(cp)); 16 | } 17 | } 18 | } 19 | // ^^ TextIterator 20 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/ArrayWritableTest.java: -------------------------------------------------------------------------------- 1 | // == ArrayWritableTest 2 | import static org.hamcrest.CoreMatchers.is; 3 | import static org.junit.Assert.assertThat; 4 | 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.*; 7 | import org.junit.Test; 8 | 9 | public class ArrayWritableTest extends WritableTestBase { 10 | 11 | @Test 12 | public void test() throws IOException { 13 | // vv ArrayWritableTest 14 | ArrayWritable writable = new ArrayWritable(Text.class); 15 | // ^^ ArrayWritableTest 16 | writable.set(new Text[] { new Text("cat"), new Text("dog") }); 17 | 18 | TextArrayWritable dest = new TextArrayWritable(); 19 | WritableUtils.cloneInto(dest, writable); 20 | assertThat(dest.get().length, is(2)); 21 | // TODO: fix cast, also use single assert 22 | assertThat((Text) dest.get()[0], is(new Text("cat"))); 23 | assertThat((Text) dest.get()[1], is(new Text("dog"))); 24 | 25 | Text[] copy = (Text[]) dest.toArray(); 26 | assertThat(copy[0], is(new Text("cat"))); 27 | assertThat(copy[1], is(new Text("dog"))); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BinaryOrTextWritable.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.io.BytesWritable; 2 | import org.apache.hadoop.io.GenericWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.Writable; 5 | 6 | public class BinaryOrTextWritable extends GenericWritable { 7 | private static Class[] TYPES = { BytesWritable.class, Text.class }; 8 | 9 | @Override 10 | @SuppressWarnings("unchecked") 11 | protected Class[] getTypes() { 12 | return TYPES; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BooleanWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.BooleanWritable; 6 | import org.junit.Test; 7 | 8 | public class BooleanWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | BooleanWritable src = new BooleanWritable(true); 13 | BooleanWritable dest = new BooleanWritable(); 14 | assertThat(writeTo(src, dest), is("01")); 15 | 
assertThat(dest.get(), is(src.get())); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BytesWritableTest.java: -------------------------------------------------------------------------------- 1 | // == BytesWritableTest 2 | // == BytesWritableTest-Capacity 3 | import static org.hamcrest.CoreMatchers.is; 4 | import static org.junit.Assert.assertThat; 5 | 6 | import java.io.IOException; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.util.StringUtils; 9 | import org.junit.Test; 10 | 11 | public class BytesWritableTest extends WritableTestBase { 12 | 13 | @Test 14 | public void test() throws IOException { 15 | // vv BytesWritableTest 16 | BytesWritable b = new BytesWritable(new byte[] { 3, 5 }); 17 | byte[] bytes = serialize(b); 18 | assertThat(StringUtils.byteToHexString(bytes), is("000000020305")); 19 | // ^^ BytesWritableTest 20 | 21 | // vv BytesWritableTest-Capacity 22 | b.setCapacity(11); 23 | assertThat(b.getLength(), is(2)); 24 | assertThat(b.getBytes().length, is(11)); 25 | // ^^ BytesWritableTest-Capacity 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/FileDecompressorTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.*; 5 | import java.util.Scanner; 6 | import org.apache.hadoop.fs.FileUtil; 7 | import org.apache.hadoop.io.IOUtils; 8 | import org.junit.Test; 9 | 10 | public class FileDecompressorTest { 11 | 12 | @Test 13 | public void decompressesGzippedFile() throws Exception { 14 | File file = File.createTempFile("file", ".gz"); 15 | file.deleteOnExit(); 16 | InputStream in = this.getClass().getResourceAsStream("/file.gz"); 17 | IOUtils.copyBytes(in, new FileOutputStream(file), 4096, true); 18 | 19 | String path = file.getAbsolutePath(); 20 | FileDecompressor.main(new String[] { path }); 21 | 22 | String decompressedPath = path.substring(0, path.length() - 3); 23 | assertThat(readFile(new File(decompressedPath)), is("Text\n")); 24 | } 25 | 26 | private String readFile(File file) throws IOException { 27 | return new Scanner(file).useDelimiter("\\A").next(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/GenericWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.*; 6 | import org.junit.Test; 7 | 8 | public class GenericWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | BinaryOrTextWritable src = new BinaryOrTextWritable(); 13 | src.set(new Text("text")); 14 | BinaryOrTextWritable dest = new BinaryOrTextWritable(); 15 | WritableUtils.cloneInto(dest, src); 16 | assertThat((Text) dest.get(), is(new Text("text"))); 17 | 18 | src.set(new BytesWritable(new byte[] {3, 5})); 19 | WritableUtils.cloneInto(dest, src); 20 | assertThat(((BytesWritable) dest.get()).getLength(), is(2)); // TODO proper assert 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/NullWritableTest.java: -------------------------------------------------------------------------------- 1 
| import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.junit.Test; 7 | 8 | public class NullWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | NullWritable writable = NullWritable.get(); 13 | assertThat(serialize(writable).length, is(0)); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/ObjectWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.*; 6 | import org.junit.Test; 7 | 8 | public class ObjectWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | ObjectWritable src = new ObjectWritable(Integer.TYPE, 163); 13 | ObjectWritable dest = new ObjectWritable(); 14 | WritableUtils.cloneInto(dest, src); 15 | assertThat((Integer) dest.get(), is(163)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/VLongWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.VLongWritable; 6 | import org.junit.Test; 7 | 8 | public class VLongWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | assertThat(serializeToString(new VLongWritable(1)), is("01")); // 1 byte 13 | assertThat(serializeToString(new VLongWritable(127)), is("7f")); // 1 byte 14 | assertThat(serializeToString(new VLongWritable(128)), is("8f80")); // 2 byte 15 | assertThat(serializeToString(new VLongWritable(163)), is("8fa3")); // 2 byte 16 | assertThat(serializeToString(new VLongWritable(Long.MAX_VALUE)), is("887fffffffffffffff")); // 9 byte 17 | assertThat(serializeToString(new VLongWritable(Long.MIN_VALUE)), is("807fffffffffffffff")); // 9 byte 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /ch05-io/src/test/resources/file.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/test/resources/file.gz -------------------------------------------------------------------------------- /ch06-mr-dev/input/ncdc/micro/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /ch06-mr-dev/output/._SUCCESS.crc: 
-------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /ch06-mr-dev/output/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc=)|$ -------------------------------------------------------------------------------- /ch06-mr-dev/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch06-mr-dev/output/_SUCCESS -------------------------------------------------------------------------------- /ch06-mr-dev/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch06-mr-dev 12 | jar 13 | 4.0 14 | Chapter 6: Developing a MapReduce Application 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterSystem.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop -Dcolor=yellow ConfigurationPrinter | grep color -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \ 2 | | grep mapred.job.tracker= -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.output.txt: -------------------------------------------------------------------------------- 1 | mapred.job.tracker=localhost:8021 2 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConfAndD.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \ 2 | -D mapred.job.tracker=example.com:8021 \ 3 | | grep mapred.job.tracker -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -D color=yellow | grep color -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.output.txt: -------------------------------------------------------------------------------- 1 | color=yellow 2 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriver.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar v3.MaxTemperatureDriver -conf conf/hadoop-cluster.xml \ 2 | input/ncdc/all max-temp -------------------------------------------------------------------------------- 
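The ConfigurationPrinter transcripts above rely on the driver being run through ToolRunner, which applies GenericOptionsParser so that -conf files and -D key=value overrides are merged into the Configuration before run() is called. A minimal sketch of such a Tool follows; the class name ConfigurationPrinterSketch is assumed here, and this is not the repo's ConfigurationPrinter verbatim.

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ConfigurationPrinterSketch extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    // getConf() already reflects any -conf and -D options from the command line
    Configuration conf = getConf();
    for (Map.Entry<String, String> entry : conf) {   // Configuration is Iterable
      System.out.printf("%s=%s%n", entry.getKey(), entry.getValue());
    }
    return 0;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new ConfigurationPrinterSketch(), args));
  }
}

Run it exactly as in the transcripts above, e.g. with -D color=yellow, and grep the output for the property of interest.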
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v2.MaxTemperatureDriver -conf conf/hadoop-local.xml \ 2 | input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2GOP.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v2.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v3.MaxTemperatureDriver -conf conf/hadoop-local.xml \ 2 | input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v3.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/LoggingIdentityMapper.java: -------------------------------------------------------------------------------- 1 | //cc LoggingIdentityMapper An identity mapper that writes to standard output and also uses the Apache Commons Logging API 2 | import java.io.IOException; 3 | 4 | //vv LoggingIdentityMapper 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class LoggingIdentityMapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> 10 | extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> { 11 | 12 | private static final Log LOG = LogFactory.getLog(LoggingIdentityMapper.class); 13 | 14 | @Override 15 | @SuppressWarnings("unchecked") 16 | public void map(KEYIN key, VALUEIN value, Context context) 17 | throws IOException, InterruptedException { 18 | // Log to stdout file 19 | System.out.println("Map key: " + key); 20 | 21 | // Log to syslog file 22 | LOG.info("Map key: " + key); 23 | if (LOG.isDebugEnabled()) { 24 | LOG.debug("Map value: " + value); 25 | } 26 | context.write((KEYOUT) key, (VALUEOUT) value); 27 | } 28 | } 29 | //^^ LoggingIdentityMapper -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v1/MaxTemperatureMapper.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | // cc MaxTemperatureMapperV1 First version of a Mapper that passes MaxTemperatureMapperTest 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.*; 5 | import org.apache.hadoop.mapreduce.*; 6 | //vv MaxTemperatureMapperV1 7 | public class MaxTemperatureMapper 8 | extends Mapper<LongWritable, Text, Text, IntWritable> { 9 | 10 | @Override 11 | public void map(LongWritable key, Text value, Context context) 12 | throws
IOException, InterruptedException { 13 | 14 | String line = value.toString(); 15 | String year = line.substring(15, 19); 16 | int airTemperature = Integer.parseInt(line.substring(87, 92)); 17 | context.write(new Text(year), new IntWritable(airTemperature)); 18 | } 19 | } 20 | //^^ MaxTemperatureMapperV1 21 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v1/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | //cc MaxTemperatureReducerV1 Reducer for maximum temperature example 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | // vv MaxTemperatureReducerV1 10 | public class MaxTemperatureReducer 11 | extends Reducer<Text, IntWritable, Text, IntWritable> { 12 | 13 | @Override 14 | public void reduce(Text key, Iterable<IntWritable> values, 15 | Context context) 16 | throws IOException, InterruptedException { 17 | 18 | int maxValue = Integer.MIN_VALUE; 19 | for (IntWritable value : values) { 20 | maxValue = Math.max(maxValue, value.get()); 21 | } 22 | context.write(key, new IntWritable(maxValue)); 23 | } 24 | } 25 | // ^^ MaxTemperatureReducerV1 26 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v2/MaxTemperatureMapper.java: -------------------------------------------------------------------------------- 1 | package v2; 2 | // cc MaxTemperatureMapperV2 A Mapper that uses a utility class to parse records 3 | 4 | import java.io.IOException; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import v2.NcdcRecordParser; 11 | 12 | // vv MaxTemperatureMapperV2 13 | public class MaxTemperatureMapper 14 | extends Mapper<LongWritable, Text, Text, IntWritable> { 15 | 16 | /*[*/private NcdcRecordParser parser = new NcdcRecordParser();/*]*/ 17 | 18 | @Override 19 | public void map(LongWritable key, Text value, Context context) 20 | throws IOException, InterruptedException { 21 | 22 | /*[*/parser.parse(value);/*]*/ 23 | if (/*[*/parser.isValidTemperature()/*]*/) { 24 | context.write(new Text(/*[*/parser.getYear()/*]*/), 25 | new IntWritable(/*[*/parser.getAirTemperature()/*]*/)); 26 | } 27 | } 28 | } 29 | // ^^ MaxTemperatureMapperV2 30 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/configuration-1.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <configuration> 3 | <property> 4 | <name>color</name> 5 | <value>yellow</value> 6 | <description>Color</description> 7 | </property> 8 | 9 | <property> 10 | <name>size</name> 11 | <value>10</value> 12 | <description>Size</description> 13 | </property> 14 | 15 | <property> 16 | <name>weight</name> 17 | <value>heavy</value> 18 | <final>true</final> 19 | <description>Weight</description> 20 | </property> 21 | 22 | <property> 23 | <name>size-weight</name> 24 | <value>${size},${weight}</value> 25 | <description>Size and weight</description> 26 | </property> 27 | </configuration> -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/configuration-2.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <configuration> 3 | <property> 4 | <name>size</name> 5 | <value>12</value> 6 | </property> 7 | 8 | <property> 9 | <name>weight</name> 10 | <value>light</value> 11 | </property> 12 | </configuration> -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/max-temp-workflow.properties: -------------------------------------------------------------------------------- 1 | # A properties file used to submit an Oozie workflow job. 2 | # This file is not bundled as a part of the workflow application.
3 | nameNode=hdfs://localhost:8020 4 | resourceManager=localhost:8032 5 | oozie.wf.application.path=${nameNode}/user/${user.name}/max-temp-workflow -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/java/SingleResourceConfigurationTest.java: -------------------------------------------------------------------------------- 1 | // == SingleResourceConfigurationTest 2 | import static org.hamcrest.CoreMatchers.is; 3 | import static org.junit.Assert.assertThat; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.junit.Test; 9 | 10 | public class SingleResourceConfigurationTest { 11 | 12 | @Test 13 | public void get() throws IOException { 14 | // vv SingleResourceConfigurationTest 15 | Configuration conf = new Configuration(); 16 | conf.addResource("configuration-1.xml"); 17 | assertThat(conf.get("color"), is("yellow")); 18 | assertThat(conf.getInt("size", 0), is(10)); 19 | assertThat(conf.get("breadth", "wide"), is("wide")); 20 | // ^^ SingleResourceConfigurationTest 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/java/v1/MaxTemperatureReducerTest.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | // == MaxTemperatureReducerTestV1 3 | import java.io.IOException; 4 | import java.util.*; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mrunit.mapreduce.ReduceDriver; 7 | import org.junit.*; 8 | 9 | public class MaxTemperatureReducerTest { 10 | 11 | //vv MaxTemperatureReducerTestV1 12 | @Test 13 | public void returnsMaximumIntegerInValues() throws IOException, 14 | InterruptedException { 15 | new ReduceDriver() 16 | .withReducer(new MaxTemperatureReducer()) 17 | .withInput(new Text("1950"), 18 | Arrays.asList(new IntWritable(10), new IntWritable(5))) 19 | .withOutput(new Text("1950"), new IntWritable(10)) 20 | .runTest(); 21 | } 22 | //^^ MaxTemperatureReducerTestV1 23 | } 24 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/resources/expected.txt: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MaxTemperatureWithMultipleInputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureWithMultipleInputs input/ncdc/micro/sample.txt input/metoffice output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduce.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduce/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduceWithDefaults/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduceWithDefaults "input/ncdc/all/190{1,2}.gz" output 
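The MinimalMapReduce and MinimalMapReduceWithDefaults runs above differ only in whether the job's defaults are left implicit or written out. As a reminder of what those defaults are, here is a hedged sketch (class name MinimalDefaultsSketch is assumed; this is not the repo's class verbatim) spelling out the settings a newer-API job falls back to when only input and output paths are supplied.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MinimalDefaultsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance();
    job.setJarByClass(MinimalDefaultsSketch.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Everything below is what the minimal job gets by default anyway
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(Mapper.class);                 // identity mapper
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(HashPartitioner.class);
    job.setNumReduceTasks(1);
    job.setReducerClass(Reducer.class);               // identity reducer

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

This is why the MinimalMapReduce output contains offset/line pairs: the identity mapper and reducer pass the TextInputFormat keys and values straight through.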
-------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputFormat.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar PartitionByStationUsingMultipleOutputFormat 'input/ncdc/all/190?.gz' output-part-by-station -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000 
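The PartitionByStationUsingMultipleOutputs runs above produce one output file per weather station. The reducer side of that pattern is sketched below under assumptions (the class name StationMultipleOutputsReducer and the Text/Text input types are placeholders, not the repo's exact signatures): a MultipleOutputs instance is created in setup(), every record is written with the station ID as the base output path, and the instance is closed in cleanup() so the per-station files are flushed.

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class StationMultipleOutputsReducer
    extends Reducer<Text, Text, NullWritable, Text> {

  private MultipleOutputs<NullWritable, Text> multipleOutputs;

  @Override
  protected void setup(Context context) {
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      // The third argument is a base output path, resolved under the job output
      // directory, so each station's records land in their own part file
      multipleOutputs.write(NullWritable.get(), value, key.toString());
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    multipleOutputs.close();
  }
}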
-------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SmallFilesToSequenceFileConverter input/smallfiles outputhadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \ 2 | -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \ 2 | -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/default_streaming.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -inputformat org.apache.hadoop.mapred.TextInputFormat \ 5 | -mapper /bin/cat \ 6 | -partitioner org.apache.hadoop.mapred.lib.HashPartitioner \ 7 | -numReduceTasks 1 \ 8 | -reducer org.apache.hadoop.mapred.lib.IdentityReducer \ 9 | -outputformat org.apache.hadoop.mapred.TextOutputFormat -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/minimal_streaming.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper /bin/cat -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/NonSplittableTextInputFormat.java: -------------------------------------------------------------------------------- 1 | // == NonSplittableTextInputFormat 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.mapreduce.JobContext; 4 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 5 | 6 | public class NonSplittableTextInputFormat extends TextInputFormat { 7 | @Override 8 | protected boolean isSplitable(JobContext context, Path file) { 9 | return false; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/StationPartitioner.java: -------------------------------------------------------------------------------- 1 | // == StationPartitioner 2 | import org.apache.hadoop.io.LongWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | //vv StationPartitioner 7 | public class StationPartitioner extends Partitioner { 8 | 9 | private NcdcRecordParser parser = new NcdcRecordParser(); 10 | 11 | @Override 12 | public int getPartition(LongWritable key, Text value, int numPartitions) { 13 | parser.parse(value); 14 | return getPartition(parser.getStationId()); 15 | } 16 | 17 | private int getPartition(String stationId) { 18 | /*...*/ 19 | // ^^ StationPartitioner 20 | return 0; 21 | // vv StationPartitioner 22 | } 23 | 24 | } 25 | //^^ StationPartitioner -------------------------------------------------------------------------------- 
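StationPartitioner above deliberately leaves the body of its private getPartition(String) elided. Purely as an illustration of one way such a partitioner could be completed (this is not the book's implementation, and the class name is hypothetical), a hash-based version and its job wiring might look like this:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class HashStationPartitioner extends Partitioner<LongWritable, Text> {

  private NcdcRecordParser parser = new NcdcRecordParser();  // same parser as above

  @Override
  public int getPartition(LongWritable key, Text value, int numPartitions) {
    parser.parse(value);
    // Mask the sign bit so the result is always in [0, numPartitions)
    return (parser.getStationId().hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}

// Hypothetical driver wiring:
//   job.setPartitionerClass(HashStationPartitioner.class);
//   job.setNumReduceTasks(numberOfOutputPartitionsYouWant);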
/ch08-mr-types/src/main/java/oldapi/MinimalMapReduce.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapred.*; 6 | import org.apache.hadoop.util.*; 7 | 8 | public class MinimalMapReduce extends Configured implements Tool { 9 | 10 | @Override 11 | public int run(String[] args) throws Exception { 12 | if (args.length != 2) { 13 | System.err.printf("Usage: %s [generic options] \n", 14 | getClass().getSimpleName()); 15 | ToolRunner.printGenericCommandUsage(System.err); 16 | return -1; 17 | } 18 | 19 | JobConf conf = new JobConf(getConf(), getClass()); 20 | FileInputFormat.addInputPath(conf, new Path(args[0])); 21 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 22 | JobClient.runJob(conf); 23 | return 0; 24 | } 25 | 26 | public static void main(String[] args) throws Exception { 27 | int exitCode = ToolRunner.run(new MinimalMapReduce(), args); 28 | System.exit(exitCode); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/NonSplittableTextInputFormat.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.fs.*; 4 | import org.apache.hadoop.mapred.TextInputFormat; 5 | 6 | public class NonSplittableTextInputFormat extends TextInputFormat { 7 | @Override 8 | protected boolean isSplitable(FileSystem fs, Path file) { 9 | return false; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/StationPartitioner.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.io.*; 4 | import org.apache.hadoop.mapred.*; 5 | 6 | public class StationPartitioner implements Partitioner { 7 | 8 | private NcdcRecordParser parser = new NcdcRecordParser(); 9 | 10 | @Override 11 | public int getPartition(LongWritable key, Text value, int numPartitions) { 12 | parser.parse(value); 13 | return getPartition(parser.getStationId()); 14 | } 15 | 16 | private int getPartition(String stationId) { 17 | return 0; 18 | } 19 | 20 | @Override 21 | public void configure(JobConf conf) { } 22 | } 23 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/WholeFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.fs.*; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class WholeFileInputFormat 9 | extends FileInputFormat { 10 | 11 | @Override 12 | protected boolean isSplitable(FileSystem fs, Path filename) { 13 | return false; 14 | } 15 | 16 | @Override 17 | public RecordReader getRecordReader( 18 | InputSplit split, JobConf job, Reporter reporter) throws IOException { 19 | 20 | return new WholeFileRecordReader((FileSplit) split, job); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/sh/streaming.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper 
/bin/cat 5 | 6 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 7 | -input input/ncdc/sample.txt \ 8 | -output output \ 9 | -inputformat org.apache.hadoop.mapred.TextInputFormat \ 10 | -mapper /bin/cat \ 11 | -partitioner org.apache.hadoop.mapred.lib.HashPartitioner \ 12 | -numReduceTasks 1 \ 13 | -reducer org.apache.hadoop.mapred.lib.IdentityReducer \ 14 | -outputformat org.apache.hadoop.mapred.TextOutputFormat -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop JoinRecordWithStationName input/ncdc/sample.txt input/ncdc/metadata/stations-fixed-width.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/2/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 011990-99999 SIHCCAJAVRI 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 2 | 011990-99999 SIHCCAJAVRI 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 011990-99999 SIHCCAJAVRI 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 4 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 5 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/input.txt: -------------------------------------------------------------------------------- 1 | hadoop JoinRecordWithStationName input/ncdc/sample.txt input/ncdc/metadata/stations-fixed-width.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 011990-99999 SIHCCAJAVRI 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 011990-99999 SIHCCAJAVRI 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 011990-99999 SIHCCAJAVRI 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordByTemperature.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar LookupRecordByTemperature output-hashmapsort -100 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordByTemperature.java.output.txt: 
-------------------------------------------------------------------------------- 1 | 357460-99999 1956 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordsByTemperature.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar LookupRecordsByTemperature output-hashmapsort -100 \ 2 | 2> /dev/null | wc -l -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordsByTemperature.java.output.txt: -------------------------------------------------------------------------------- 1 | 1489272 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureByStationNameUsingDistributedCacheFile.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MaxTemperatureByStationNameUsingDistributedCacheFile \ 2 | -files input/ncdc/metadata/stations-fixed-width.txt input/ncdc/all output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureByStationNameUsingDistributedCacheFileApi.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureByStationNameUsingDistributedCacheFileApi \ 2 | -files input/ncdc/metadata/stations-fixed-width.txt input/ncdc/micro output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureUsingSecondarySort/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureUsingSecondarySort input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureUsingSecondarySort/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MaxTemperatureWithCounters input/ncdc/all output-counters 2 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureWithCounters input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MissingTemperatureFields.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MissingTemperatureFields job_200904200610_0003 -------------------------------------------------------------------------------- 
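The MaxTemperatureWithCounters and MissingTemperatureFields commands above revolve around user-defined counters. The mapper-side pattern is sketched here with assumptions (the class name and the parser's isMalformedTemperature() method are assumed, not taken from this listing): an enum defines the counter names, and context.getCounter(...).increment(1) records each bad record. The enum constants mirror the MISSING and MALFORMED names in MaxTemperatureWithCounters_Temperature.properties further down this listing.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TemperatureCountersMapperSketch
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  enum Temperature {
    MISSING,
    MALFORMED
  }

  private NcdcRecordParser parser = new NcdcRecordParser();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    parser.parse(value);
    if (parser.isValidTemperature()) {
      context.write(new Text(parser.getYear()),
          new IntWritable(parser.getAirTemperature()));
    } else if (parser.isMalformedTemperature()) {      // assumed parser method
      context.getCounter(Temperature.MALFORMED).increment(1);
    } else {
      context.getCounter(Temperature.MISSING).increment(1);
    }
  }
}

Counter totals are printed when the job completes and can also be fetched afterwards by job ID, which is what the MissingTemperatureFields command above does.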
/ch09-mr-features/src/main/examples/SortByTemperatureToMapFile.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output-seq 2 | hadoop SortByTemperatureToMapFile output-seq output 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingHashPartitioner.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output-seq 2 | hadoop SortByTemperatureUsingHashPartitioner output-seq output 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingHashPartitioner.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortByTemperatureUsingHashPartitioner \ 2 | -D mapred.reduce.tasks=30 input/ncdc/all-seq output-hashsort 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingTotalOrderPartitioner.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortByTemperatureUsingTotalOrderPartitioner \ 2 | -D mapred.reduce.tasks=30 input/ncdc/all-seq output-totalsort -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortDataPreprocessor.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortDataPreprocessor.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortDataPreprocessor input/ncdc/all \ 2 | input/ncdc/all-seq -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinRecordMapper.java: -------------------------------------------------------------------------------- 1 | // cc JoinRecordMapper Mapper for tagging weather records for a reduce-side join 2 | import java.io.IOException; 3 | 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | //vv JoinRecordMapper 9 | public class JoinRecordMapper 10 | extends Mapper { 11 | private NcdcRecordParser parser = new NcdcRecordParser(); 12 | 13 | @Override 14 | protected void map(LongWritable key, Text value, Context context) 15 | throws IOException, InterruptedException { 16 | parser.parse(value); 17 | context.write(new TextPair(parser.getStationId(), "1"), value); 18 | } 19 | 20 | } 21 | //^^ JoinRecordMapper -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinReducer.java: -------------------------------------------------------------------------------- 1 | // cc JoinReducer Reducer for joining tagged station records with tagged weather records 2 | import java.io.IOException; 3 | import java.util.Iterator; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | // vv JoinReducer 9 | public class JoinReducer extends Reducer { 10 | 11 | @Override 12 | 
protected void reduce(TextPair key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | Iterator iter = values.iterator(); 15 | Text stationName = new Text(iter.next()); 16 | while (iter.hasNext()) { 17 | Text record = iter.next(); 18 | Text outValue = new Text(stationName.toString() + "\t" + record.toString()); 19 | context.write(key.getFirst(), outValue); 20 | } 21 | } 22 | } 23 | // ^^ JoinReducer -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinStationMapper.java: -------------------------------------------------------------------------------- 1 | // cc JoinStationMapper Mapper for tagging station records for a reduce-side join 2 | import java.io.IOException; 3 | 4 | import org.apache.hadoop.io.*; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | // vv JoinStationMapper 8 | public class JoinStationMapper 9 | extends Mapper { 10 | private NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); 11 | 12 | @Override 13 | protected void map(LongWritable key, Text value, Context context) 14 | throws IOException, InterruptedException { 15 | if (parser.parse(value)) { 16 | context.write(new TextPair(parser.getStationId(), "0"), 17 | new Text(parser.getStationName())); 18 | } 19 | } 20 | } 21 | // ^^ JoinStationMapper -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinRecordMapper.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class JoinRecordMapper extends MapReduceBase 9 | implements Mapper { 10 | private NcdcRecordParser parser = new NcdcRecordParser(); 11 | 12 | public void map(LongWritable key, Text value, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | parser.parse(value); 17 | output.collect(new TextPair(parser.getStationId(), "1"), value); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinReducer.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.*; 8 | 9 | public class JoinReducer extends MapReduceBase implements 10 | Reducer { 11 | 12 | public void reduce(TextPair key, Iterator values, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | Text stationName = new Text(values.next()); 17 | while (values.hasNext()) { 18 | Text record = values.next(); 19 | Text outValue = new Text(stationName.toString() + "\t" + record.toString()); 20 | output.collect(key.getFirst(), outValue); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinStationMapper.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class JoinStationMapper extends MapReduceBase 9 | implements Mapper { 10 | private NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); 11 | 12 | public void map(LongWritable key, 
Text value, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | if (parser.parse(value)) { 17 | output.collect(new TextPair(parser.getStationId(), "0"), 18 | new Text(parser.getStationName())); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/max_daily_temp_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (usaf, wban, date, temp, q) = (val[4:10], val[10:15], val[15:23], 9 | int(val[87:92]), val[92:93]) 10 | if (temp != 9999 and re.match("[01459]", q)): 11 | print "%s-%s\t%s\t%s" % (usaf, wban, date, temp) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/max_daily_temp_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, 0) 6 | for line in sys.stdin: 7 | (station, date, temp) = line.strip().split("\t") 8 | key = "%s\t%s" % (station, date) 9 | if last_key and last_key != key: 10 | print "%s\t%s" % (last_key, max_val) 11 | (last_key, max_val) = (key, int(temp)) 12 | else: 13 | (last_key, max_val) = (key, max(max_val, int(temp))) 14 | 15 | if last_key: 16 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp.sh: -------------------------------------------------------------------------------- 1 | STREAM="hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar -conf conf/hadoop-localhost.xml" 2 | 3 | $STREAM \ 4 | -D stream.num.map.output.key.fields=2 \ 5 | -files ch09-mr-features/src/main/python/max_daily_temp_map.py,\ 6 | ch09-mr-features/src/main/python/max_daily_temp_reduce.py \ 7 | -input input/ncdc/all \ 8 | -output out_max_daily \ 9 | -mapper ch09-mr-features/src/main/python/max_daily_temp_map.py \ 10 | -reducer ch09-mr-features/src/main/python/max_daily_temp_reduce.py 11 | 12 | $STREAM \ 13 | -D stream.num.map.output.key.fields=2 \ 14 | -files ch09-mr-features/src/main/python/mean_max_daily_temp_map.py,\ 15 | ch09-mr-features/src/main/python/mean_max_daily_temp_map.py \ 16 | -input out_max_daily \ 17 | -output out_mean_max_daily \ 18 | -mapper ch09-mr-features/src/main/python/mean_max_daily_temp_map.py \ 19 | -reducer ch09-mr-features/src/main/python/mean_max_daily_temp_reduce.py 20 | 21 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # Change date to month and day 6 | for line in sys.stdin: 7 | (station, date, temp) = line.strip().split("\t") 8 | print "%s\t%s\t%s" % (station, date[4:8], temp) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, count, sum) = (None, 0, 0) 6 | for line in sys.stdin: 7 | (station, month_day, temp) = line.strip().split("\t") 8 | key = "%s\t%s" % (station, month_day) 9 | if last_key and last_key != key: 10 
| print "%s\t%s" % (last_key, sum / count) 11 | (last_key, count, sum) = (key, 1, int(temp)) 12 | else: 13 | (last_key, count, sum) = (key, count + 1, sum + int(temp)) 14 | 15 | if last_key: 16 | print "%s\t%s" % (last_key, sum / count) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -D stream.num.map.output.key.fields=2 \ 3 | -D mapreduce.partition.keypartitioner.options=-k1,1 \ 4 | -D mapreduce.job.output.key.comparator.class=\ 5 | org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \ 6 | -D mapreduce.partition.keycomparator.options="-k1n -k2nr" \ 7 | -files secondary_sort_map.py,secondary_sort_reduce.py \ 8 | -input input/ncdc/all \ 9 | -output output-secondarysort-streaming \ 10 | -mapper ch09-mr-features/src/main/python/secondary_sort_map.py \ 11 | -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \ 12 | -reducer ch09-mr-features/src/main/python/secondary_sort_reduce.py 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp, q) = (val[15:19], int(val[87:92]), val[92:93]) 9 | if temp == 9999: 10 | sys.stderr.write("reporter:counter:Temperature,Missing,1\n") 11 | elif re.match("[01459]", q): 12 | print "%s\t%s" % (year, temp) 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | last_group = None 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp) = val.split("\t") 9 | group = year 10 | if last_group != group: 11 | print val 12 | last_group = group 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/fixed-partitions: -------------------------------------------------------------------------------- 1 | 0 124013605 2 | 1 151590303 3 | 2 191822960 4 | 3 675051684 5 | 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/sampled-partitions: -------------------------------------------------------------------------------- 1 | 0 331955753 2 | 1 276755563 3 | 2 263474844 4 | 3 270292395 5 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/temperature_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch09-mr-features/src/main/r/temperature_distribution.png -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/temperature_distribution.r: -------------------------------------------------------------------------------- 1 | png("temperature_distribution.png") 2 | data <- read.table("output_sorted") 3 | plot(data, xlab="Temperature", ylab="Number of readings") 4 | dev.off() 
-------------------------------------------------------------------------------- /ch09-mr-features/src/main/resources/MaxTemperatureWithCounters_Temperature.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Air Temperature Records 2 | MISSING.name=Missing 3 | MALFORMED.name=Malformed -------------------------------------------------------------------------------- /ch09-mr-features/src/main/resources/oldapi/MaxTemperatureWithCounters_Temperature.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Air Temperature Records 2 | MISSING.name=Missing 3 | MALFORMED.name=Malformed -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- core-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>fs.defaultFS</name> 6 | <value>hdfs://namenode/</value> 7 | </property> 8 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- hdfs-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>dfs.namenode.name.dir</name> 6 | <value>/disk1/hdfs/name,/remote/hdfs/name</value> 7 | </property> 8 | 9 | <property> 10 | <name>dfs.datanode.data.dir</name> 11 | <value>/disk1/hdfs/data,/disk2/hdfs/data</value> 12 | </property> 13 | 14 | <property> 15 | <name>dfs.namenode.checkpoint.dir</name> 16 | <value>/disk1/hdfs/namesecondary,/disk2/hdfs/namesecondary</value> 17 | </property> 18 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/yarn-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- yarn-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>yarn.resourcemanager.hostname</name> 6 | <value>resourcemanager</value> 7 | </property> 8 | 9 | <property> 10 | <name>yarn.nodemanager.local-dirs</name> 11 | <value>/disk1/nm-local-dir,/disk2/nm-local-dir</value> 12 | </property> 13 | 14 | <property> 15 | <name>yarn.nodemanager.aux-services</name> 16 | <value>mapreduce_shuffle</value> 17 | </property> 18 | 19 | <property> 20 | <name>yarn.nodemanager.resource.memory-mb</name> 21 | <value>16384</value> 22 | </property> 23 | 24 | <property> 25 | <name>yarn.nodemanager.resource.cpu-vcores</name> 26 | <value>16</value> 27 | </property> 28 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/sh/trash.sh: -------------------------------------------------------------------------------- 1 | hadoop fs -touchz quangle 2 | hadoop fs -rm quangle 3 | hadoop fs -lsr .Trash 4 | hadoop fs -mv .Trash/Current/quangle . 5 | hadoop fs -ls .
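trash.sh above only exercises the trash if it is enabled on the client, which happens when fs.trash.interval (in minutes) is set in core-site.xml. A short, hedged sketch of related commands (the interval value is an illustration, not taken from the repo):
# enable trash by adding fs.trash.interval to core-site.xml, e.g. 1440 for one day
hadoop fs -rm -skipTrash quangle   # delete immediately, bypassing the trash
hadoop fs -expunge                 # remove trash checkpoints older than the interval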
-------------------------------------------------------------------------------- /ch12-avro/src/main/c/dump_pairs.c: -------------------------------------------------------------------------------- 1 | #include <avro.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | 5 | int main(int argc, char *argv[]) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: dump_pairs <avrofile>\n"); 8 | exit(EXIT_FAILURE); 9 | } 10 | 11 | const char *avrofile = argv[1]; 12 | avro_schema_error_t error; 13 | avro_file_reader_t filereader; 14 | avro_datum_t pair; 15 | avro_datum_t left; 16 | avro_datum_t right; 17 | int rval; 18 | char *p; 19 | 20 | avro_file_reader(avrofile, &filereader); 21 | while (1) { 22 | rval = avro_file_reader_read(filereader, NULL, &pair); 23 | if (rval) break; 24 | if (avro_record_get(pair, "left", &left) == 0) { 25 | avro_string_get(left, &p); 26 | fprintf(stdout, "%s,", p); 27 | } 28 | if (avro_record_get(pair, "right", &right) == 0) { 29 | avro_string_get(right, &p); 30 | fprintf(stdout, "%s\n", p); 31 | } 32 | } 33 | avro_file_reader_close(filereader); 34 | return 0; 35 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar avro-examples.jar AvroGenericMaxTemperature \ 2 | input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/.part-r-00000.avro.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/.part-r-00000.avro.crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/_SUCCESS -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/part-r-00000.avro -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar avro-examples.jar AvroSort input/avro/pairs.avro output \ 2 | ch12-avro/src/main/resources/SortedStringPair.avsc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc --------------------------------------------------------------------------------
/ch12-avro/src/main/examples/AvroSort/output/.part-r-00000.avro.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/.part-r-00000.avro.crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/_SUCCESS -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/part-r-00000.avro -------------------------------------------------------------------------------- /ch12-avro/src/main/py/write_pairs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import sys 4 | 5 | from avro import schema 6 | from avro import io 7 | from avro import datafile 8 | 9 | if __name__ == '__main__': 10 | if len(sys.argv) != 2: 11 | sys.exit('Usage: %s ' % sys.argv[0]) 12 | avro_file = sys.argv[1] 13 | writer = open(avro_file, 'wb') 14 | datum_writer = io.DatumWriter() 15 | schema_object = schema.parse("""\ 16 | { "type": "record", 17 | "name": "StringPair", 18 | "doc": "A pair of strings.", 19 | "fields": [ 20 | {"name": "left", "type": "string"}, 21 | {"name": "right", "type": "string"} 22 | ] 23 | }""") 24 | dfw = datafile.DataFileWriter(writer, datum_writer, schema_object) 25 | for line in sys.stdin.readlines(): 26 | (left, right) = string.split(line.strip(), ',') 27 | dfw.append({'left':left, 'right':right}); 28 | dfw.close() -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/AliasedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with aliased field names.", 5 | "fields": [ 6 | {"name": "first", "type": "string", "aliases": ["left"]}, 7 | {"name": "second", "type": "string", "aliases": ["right"]} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Array.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "array", 3 | "items": "long" 4 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Enum.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "enum", 3 | "name": "Cutlery", 4 | "doc": "An eating utensil.", 5 | "symbols": ["KNIFE", "FORK", "SPOON"] 6 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Fixed.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "fixed", 3 | "name": "Md5Hash", 4 | "size": 16 5 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Map.avsc: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "map", 3 | "values": "string" 4 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/NewStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": "string", "default": ""} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/NewStringPairWithNull.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added (nullable) field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": ["null", "string"], "default": null} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/ProjectedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "The right field of a pair of strings.", 5 | "fields": [ 6 | {"name": "right", "type": "string"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/SortedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings, sorted by right field descending.", 5 | "fields": [ 6 | {"name": "left", "type": "string", "order": "ignore"}, 7 | {"name": "right", "type": "string", "order": "descending"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/StringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/SwitchedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings, sorted by right then left.", 5 | "fields": [ 6 | {"name": "right", "type": "string"}, 7 | {"name": "left", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Union.avsc: -------------------------------------------------------------------------------- 1 | [ 2 | "null", 3 | "string", 4 | {"type": "map", "values": "string"} 5 | ] -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/WeatherRecord.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "WeatherRecord", 4 | "namespace": "specific", 5 | "doc": "A weather reading.", 6 | "fields": [ 7 | {"name": "year", "type": "int"}, 8 | {"name": 
"temperature", "type": "int"}, 9 | {"name": "stationId", "type": "string"} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar parquet-examples.jar TextToParquetWithAvro \ 2 | input/docs/quangle.txt output -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_SUCCESS -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_metadata -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/part-m-00000.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/part-m-00000.parquet -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/NewStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": "string", "default": ""} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/ProjectedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "The right field of a pair of strings.", 5 | "fields": [ 6 | {"name": "right", "type": "string"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/StringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-and-logger.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1a sink1b 3 | agent1.channels = channel1a channel1b 4 | 5 | agent1.sources.source1.channels = channel1a channel1b 6 | 
agent1.sources.source1.selector.type = replicating 7 | agent1.sources.source1.selector.optional = channel1b 8 | agent1.sinks.sink1a.channel = channel1a 9 | agent1.sinks.sink1b.channel = channel1b 10 | 11 | agent1.sources.source1.type = spooldir 12 | agent1.sources.source1.spoolDir = /tmp/spooldir 13 | 14 | agent1.sinks.sink1a.type = hdfs 15 | agent1.sinks.sink1a.hdfs.path = /tmp/flume 16 | agent1.sinks.sink1a.hdfs.filePrefix = events 17 | agent1.sinks.sink1a.hdfs.fileSuffix = .log 18 | agent1.sinks.sink1a.hdfs.fileType = DataStream 19 | 20 | agent1.sinks.sink1b.type = logger 21 | 22 | agent1.channels.channel1a.type = file 23 | agent1.channels.channel1b.type = memory 24 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-avro.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = hdfs 12 | agent1.sinks.sink1.hdfs.path = /tmp/flume 13 | agent1.sinks.sink1.hdfs.filePrefix = events 14 | agent1.sinks.sink1.hdfs.fileSuffix = .avro 15 | agent1.sinks.sink1.hdfs.fileType = DataStream 16 | agent1.sinks.sink1.serializer = avro_event 17 | agent1.sinks.sink1.serializer.compressionCodec = snappy 18 | 19 | agent1.channels.channel1.type = file 20 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-partitioned.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | agent1.sources.source1.interceptors = interceptor1 11 | agent1.sources.source1.interceptors.interceptor1.type = timestamp 12 | 13 | agent1.sinks.sink1.type = hdfs 14 | agent1.sinks.sink1.hdfs.path = /tmp/flume/year=%Y/month=%m/day=%d 15 | agent1.sinks.sink1.hdfs.filePrefix = events 16 | agent1.sinks.sink1.hdfs.fileSuffix = .log 17 | agent1.sinks.sink1.hdfs.fileType = DataStream 18 | 19 | agent1.channels.channel1.type = file 20 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = hdfs 12 | agent1.sinks.sink1.hdfs.path = /tmp/flume 13 | agent1.sinks.sink1.hdfs.filePrefix = events 14 | agent1.sinks.sink1.hdfs.fileSuffix = .log 15 | agent1.sinks.sink1.hdfs.inUsePrefix = _ 16 | agent1.sinks.sink1.hdfs.fileType = DataStream 17 | 18 | agent1.channels.channel1.type = file 19 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-logger.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | 
agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = logger 12 | 13 | agent1.channels.channel1.type = file 14 | -------------------------------------------------------------------------------- /ch15-sqoop/widgets/part-m-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch15-sqoop/widgets/part-m-00000.avro -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/combine.grunt: -------------------------------------------------------------------------------- 1 | -- == combine_union 2 | -- == combine_schema 3 | A = LOAD 'input/pig/combine/A' AS (f0:int, f1:int); 4 | B = LOAD 'input/pig/combine/B' AS (f0:chararray, f1:chararray, f2:int); 5 | -- vv combine_union 6 | DUMP A; 7 | DUMP B; 8 | C = UNION A, B; 9 | DUMP C; 10 | -- ^^ combine_union 11 | -- vv combine_schema 12 | DESCRIBE A; 13 | DESCRIBE B; 14 | DESCRIBE C; 15 | -- ^^ combine_schema 16 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/disambiguate.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/join/A' AS (id:int, name:chararray); 2 | B = LOAD 'input/pig/join/B' AS (name:chararray, id:int); 3 | C = JOIN A by id, B by id; 4 | D = FOREACH C GENERATE A::name; 5 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/flatten.grunt: -------------------------------------------------------------------------------- 1 | -- a demonstration of the different effects of FLATTEN 2 | 3 | C = LOAD 'input/pig/types/C' 4 | AS (f0:chararray, f1:chararray); 5 | 6 | D = FOREACH C GENERATE TOTUPLE(TOTUPLE(f0), TOTUPLE(f1)); 7 | -- D is 8 | -- (((a),(pomegranate))) 9 | -- (((b),(apple))) 10 | DUMP D 11 | 12 | F = FOREACH D GENERATE FLATTEN($0); 13 | -- F is 14 | -- ((a),(pomegranate)) 15 | -- ((b),(apple)) 16 | -- One level of nesting removed 17 | DUMP F 18 | 19 | 20 | B = FOREACH C GENERATE TOBAG(f0, f1); 21 | -- B is 22 | -- ({(a),(pomegranate)}) 23 | -- ({(b),(apple)}) 24 | DUMP B 25 | 26 | F = FOREACH B GENERATE FLATTEN($0); 27 | -- F is 28 | -- (a) 29 | -- (pomegranate) 30 | -- (b) 31 | -- (apple) 32 | -- Tuples in bags are turned into tuples 33 | DUMP F 34 | 35 | B = FOREACH C GENERATE f0, TOBAG(f1, f1); 36 | -- B is 37 | -- (a,{(pomegranate),(pomegranate)}) 38 | -- (b,{(apple),(apple)}) 39 | DUMP B 40 | 41 | F = FOREACH B GENERATE $0, FLATTEN($1); 42 | -- F is 43 | -- (a,pomegranate) 44 | -- (a,pomegranate) 45 | -- (b,apple) 46 | -- (b,apple) 47 | -- Tuples in bags can be added to elements at the top level 48 | DUMP F -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/foreach.grunt: -------------------------------------------------------------------------------- 1 | -- == foreach_generate 2 | A = LOAD 'input/pig/foreach/A' 3 | AS (f0:chararray, f1:chararray, f2:int); 4 | -- vv foreach_generate 5 | DUMP A; 6 | B = FOREACH A GENERATE $0, $2+1, 'Constant'; 7 | DUMP B; 8 | -- ^^ foreach_generate 9 | DESCRIBE B; 10 | C = FOREACH A GENERATE $0, (int) $2 AS f1, 'Constant' AS f2; 11 | DUMP C; 12 | DESCRIBE C; 13 | 14 | -- C = FOREACH A GENERATE $0, 
(int) $2 AS f1, 'Constant' AS f2, ($2 > 3 ? 1 : 0); 15 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/group.grunt: -------------------------------------------------------------------------------- 1 | -- == group_dump 2 | -- == group_expression 3 | -- == group_all 4 | A = LOAD 'input/pig/group/A'; 5 | -- vv group_dump 6 | DUMP A; 7 | -- ^^ group_dump 8 | -- vv group_expression 9 | B = GROUP A BY SIZE($1); 10 | DUMP B; 11 | -- ^^ group_expression 12 | -- vv group_all 13 | C = GROUP A ALL; 14 | DUMP C; 15 | -- ^^ group_all 16 | D = FOREACH C GENERATE COUNT(A); 17 | DUMP D; 18 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/missing.grunt: -------------------------------------------------------------------------------- 1 | -- == missing_fields 2 | 3 | -- vv missing_fields 4 | A = LOAD 'input/pig/corrupt/missing_fields'; 5 | DUMP A; 6 | B = FILTER A BY SIZE(TOTUPLE(*)) > 1; 7 | DUMP B; 8 | -- ^^ missing_fields -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/multiquery.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/multiquery/A'; 2 | B = FILTER A BY $1 == 'banana'; 3 | C = FILTER A BY $1 != 'banana'; 4 | STORE B INTO 'output/b'; 5 | STORE C INTO 'output/c'; -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/set.grunt: -------------------------------------------------------------------------------- 1 | -- == set_debug_on 2 | -- vv set_debug_on 3 | set debug on 4 | -- ^^ set_debug_on 5 | -- == set_default_parallel 6 | -- vv set_default_parallel 7 | set default_parallel 30 8 | -- ^^ set_default_parallel 9 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/sort.grunt: -------------------------------------------------------------------------------- 1 | -- == sort_dump 2 | -- == sort_order 3 | -- == sort_no_order 4 | -- == sort_limit 5 | A = LOAD 'input/pig/sort/A'; 6 | -- vv sort_dump 7 | DUMP A; 8 | -- ^^ sort_dump 9 | -- vv sort_order 10 | B = ORDER A BY $0, $1 DESC; 11 | DUMP B; 12 | -- ^^ sort_order 13 | -- vv sort_no_order 14 | C = FOREACH B GENERATE *; 15 | -- ^^ sort_no_order 16 | DUMP C; 17 | -- vv sort_limit 18 | D = LIMIT B 2; 19 | DUMP D; 20 | -- ^^ sort_limit -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/store.grunt: -------------------------------------------------------------------------------- 1 | -- == store_colon_delimited 2 | A = LOAD 'input/pig/foreach/A'; 3 | -- vv store_colon_delimited 4 | STORE A INTO 'out' USING PigStorage(':'); 5 | cat out 6 | -- ^^ store_colon_delimited 7 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/stream.grunt: -------------------------------------------------------------------------------- 1 | -- == stream_cut 2 | A = LOAD 'input/pig/foreach/A' 3 | AS (f0:chararray, f1:chararray, f2:int); 4 | -- vv stream_cut 5 | C = STREAM A THROUGH `cut -f 2`; 6 | DUMP C; 7 | -- ^^ stream_cut 8 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/tuples.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/types/A' 2 | AS (f0, t0:tuple(f1:int, f2:chararray, t1:tuple(f3:int, f4:chararray))); 
3 | DUMP A; 4 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/types.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/tuples/A' 2 | AS (t0:tuple(f0:int, f2:chararray)); 3 | DUMP A; 4 | DESCRIBE A; 5 | one = LOAD 'input/pig/types/one'; 6 | B = FOREACH one GENERATE (1,'pomegranate') 7 | AS t0:tuple(f0:int, f2:chararray); 8 | DUMP B; 9 | DESCRIBE B; 10 | C = FOREACH one GENERATE ['a'#'pomegranate'] 11 | AS t0:map[]; 12 | DUMP C; 13 | DESCRIBE C; 14 | 15 | C = LOAD 'input/pig/types/C' 16 | AS (f0:chararray, f1:chararray); 17 | D = FOREACH C GENERATE TOTUPLE(f0, f1); 18 | DUMP D; 19 | D = FOREACH C GENERATE (f0, f1); 20 | DUMP D; 21 | E = FOREACH C GENERATE TOBAG(f0, f1); 22 | DUMP E; 23 | E = FOREACH C GENERATE {f0, f1}; 24 | DUMP E; 25 | F = FOREACH C GENERATE TOMAP(f0, f1); 26 | DUMP F; 27 | F = FOREACH C GENERATE [f0, f1]; 28 | DUMP F; 29 | 30 | G = FOREACH one GENERATE true AS f0:boolean, 1 as f1:int, 1L as f2:long, 31 | 1.0F as f3:float, 1.0 as f4:double, '10000000000' as f5:biginteger, 32 | '0.110001000000000000000001' as f6:bigdecimal, 'a' as f7:chararray, 33 | ToDate('2012-01-02T03:04:05.678Z') as f8:datetime; 34 | DUMP G; 35 | DESCRIBE G; 36 | -------------------------------------------------------------------------------- /ch16-pig/src/main/java/com/hadoopbook/pig/Trim.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.pig; 2 | 3 | import org.apache.pig.PrimitiveEvalFunc; 4 | 5 | //cc Trim An EvalFunc UDF to trim leading and trailing whitespace from chararray values 6 | //vv Trim 7 | public class Trim extends PrimitiveEvalFunc { 8 | @Override 9 | public String exec(String input) { 10 | return input.trim(); 11 | } 12 | } 13 | // ^^ Trim -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/comment_c-style.pig: -------------------------------------------------------------------------------- 1 | /* 2 | * Description of my program spanning 3 | * multiple lines. 4 | */ 5 | A = LOAD 'input/pig/join/A'; 6 | B = LOAD 'input/pig/join/B'; 7 | C = JOIN A BY $0, /* ignored */ B BY $1; 8 | DUMP C; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/comment_single_line.pig: -------------------------------------------------------------------------------- 1 | -- My program 2 | DUMP A; -- What's in A? 
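The Trim UDF above is easiest to try from a local Grunt session. A minimal sketch, assuming the UDF has been packaged into pig-examples.jar in the working directory (the same jar name the later scripts REGISTER) and reusing the input/pig/types/C data referenced by the grunt scripts above:
pig -x local <<'EOF'
REGISTER pig-examples.jar;
C = LOAD 'input/pig/types/C' AS (f0:chararray, f1:chararray);
T = FOREACH C GENERATE com.hadoopbook.pig.Trim(f1);
DUMP T;
EOF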
-------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp.macro: -------------------------------------------------------------------------------- 1 | DEFINE max_by_group(X, group_key, max_field) RETURNS Y { 2 | A = GROUP $X by $group_key; 3 | $Y = FOREACH A GENERATE group, MAX($X.$max_field); 4 | }; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp.pig: -------------------------------------------------------------------------------- 1 | -- max_temp.pig: Finds the maximum temperature by year 2 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 3 | AS (year:chararray, temperature:int, quality:int); 4 | filtered_records = FILTER records BY temperature != 9999 AND 5 | quality IN (0, 1, 4, 5, 9); 6 | grouped_records = GROUP filtered_records BY year; 7 | max_temp = FOREACH grouped_records GENERATE group, 8 | MAX(filtered_records.temperature); 9 | DUMP max_temp; 10 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_filter_stream.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_filter_stream.pig 2 | DEFINE is_good_quality `is_good_quality.py` 3 | SHIP ('ch16-pig/src/main/python/is_good_quality.py'); 4 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 5 | AS (year:chararray, temperature:int, quality:int); 6 | filtered_records = STREAM records THROUGH is_good_quality 7 | AS (year:chararray, temperature:int); 8 | grouped_records = GROUP filtered_records BY year; 9 | max_temp = FOREACH grouped_records GENERATE group, 10 | MAX(filtered_records.temperature); 11 | DUMP max_temp; 12 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_filter_udf.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_filter_udf.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 5 | AS (year:chararray, temperature:int, quality:int); 6 | filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); 7 | grouped_records = GROUP filtered_records BY year; 8 | max_temp = FOREACH grouped_records GENERATE group, 9 | MAX(filtered_records.temperature); 10 | DUMP max_temp; 11 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_macro.pig: -------------------------------------------------------------------------------- 1 | DEFINE max_by_group(X, group_key, max_field) RETURNS Y { 2 | A = GROUP $X by $group_key; 3 | $Y = FOREACH A GENERATE group, MAX($X.$max_field); 4 | }; 5 | 6 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 7 | AS (year:chararray, temperature:int, quality:int); 8 | filtered_records = FILTER records BY temperature != 9999 AND 9 | quality IN (0, 1, 4, 5, 9); 10 | max_temp = max_by_group(filtered_records, year, temperature); 11 | DUMP max_temp -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_macro_import.pig: -------------------------------------------------------------------------------- 1 | IMPORT './ch16-pig/src/main/pig/max_temp.macro'; 2 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 3 | AS (year:chararray, temperature:int, quality:int); 4 | filtered_records = FILTER records BY temperature != 9999 AND 5 | quality IN (0, 1, 4, 5, 9); 6 | 
max_temp = max_by_group(filtered_records, year, temperature); 7 | DUMP max_temp -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_param.param: -------------------------------------------------------------------------------- 1 | # Input file 2 | input=/user/tom/input/ncdc/micro-tab/sample.txt 3 | # Output file 4 | output=/tmp/out -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_param.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_param.pig 2 | records = LOAD '$input' AS (year:chararray, temperature:int, quality:int); 3 | filtered_records = FILTER records BY temperature != 9999 AND 4 | quality IN (0, 1, 4, 5, 9); 5 | grouped_records = GROUP filtered_records BY year; 6 | max_temp = FOREACH grouped_records GENERATE group, 7 | MAX(filtered_records.temperature); 8 | STORE max_temp into '$output'; 9 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_station_name.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_station_name.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | 5 | stations = LOAD 'input/ncdc/metadata/stations-fixed-width.txt' 6 | USING com.hadoopbook.pig.CutLoadFunc('1-6,8-12,14-42') 7 | AS (usaf:chararray, wban:chararray, name:chararray); 8 | 9 | trimmed_stations = FOREACH stations GENERATE usaf, wban, TRIM(name); 10 | 11 | records = LOAD 'input/ncdc/all/191*' 12 | USING com.hadoopbook.pig.CutLoadFunc('5-10,11-15,88-92,93-93') 13 | AS (usaf:chararray, wban:chararray, temperature:int, quality:int); 14 | 15 | filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); 16 | grouped_records = GROUP filtered_records BY (usaf, wban) PARALLEL 30; 17 | max_temp = FOREACH grouped_records GENERATE FLATTEN(group), 18 | MAX(filtered_records.temperature); 19 | max_temp_named = JOIN max_temp BY (usaf, wban), trimmed_stations BY (usaf, wban) 20 | PARALLEL 30; 21 | max_temp_result = FOREACH max_temp_named GENERATE $0, $1, $5, $2; 22 | 23 | STORE max_temp_result INTO 'max_temp_by_station'; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/year_stats.pig: -------------------------------------------------------------------------------- 1 | -- year_stats.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | records = LOAD 'input/ncdc/all/19{1,2,3,4,5}0*' 5 | USING com.hadoopbook.pig.CutLoadFunc('5-10,11-15,16-19,88-92,93-93') 6 | AS (usaf:chararray, wban:chararray, year:int, temperature:int, quality:int); 7 | 8 | grouped_records = GROUP records BY year PARALLEL 30; 9 | 10 | year_stats = FOREACH grouped_records { 11 | uniq_stations = DISTINCT records.usaf; 12 | good_records = FILTER records BY isGood(quality); 13 | GENERATE FLATTEN(group), COUNT(uniq_stations) AS station_count, 14 | COUNT(good_records) AS good_record_count, COUNT(records) AS record_count; 15 | } 16 | 17 | DUMP year_stats; 18 | 19 | -------------------------------------------------------------------------------- /ch16-pig/src/main/python/is_good_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | (year, temp, q) = line.strip().split() 8 | if (temp != "9999" and 
re.match("[01459]", q)): 9 | print "%s\t%s" % (year, temp) 10 | -------------------------------------------------------------------------------- /ch16-pig/src/test/java/com/hadoopbook/pig/RangeTest.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.pig; 2 | 3 | import static org.hamcrest.CoreMatchers.is; 4 | import static org.junit.Assert.assertThat; 5 | 6 | import java.util.List; 7 | 8 | import org.junit.*; 9 | 10 | public class RangeTest { 11 | 12 | @Test 13 | public void parsesEmptyRangeSpec() { 14 | assertThat(Range.parse("").size(), is(0)); 15 | } 16 | 17 | @Test 18 | public void parsesSingleRangeSpec() { 19 | List ranges = Range.parse("1-3"); 20 | assertThat(ranges.size(), is(1)); 21 | assertThat(ranges.get(0), is(new Range(1, 3))); 22 | } 23 | 24 | @Test 25 | public void parsesMultipleRangeSpec() { 26 | List ranges = Range.parse("1-3,5-10"); 27 | assertThat(ranges.size(), is(2)); 28 | assertThat(ranges.get(0), is(new Range(1, 3))); 29 | assertThat(ranges.get(1), is(new Range(5, 10))); 30 | } 31 | 32 | @Test(expected = IllegalArgumentException.class) 33 | public void failsOnInvalidSpec() { 34 | Range.parse("1-n"); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/conversions.hive: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS dummy; 2 | CREATE TABLE dummy (value STRING); 3 | LOAD DATA LOCAL INPATH 'input/hive/dummy.txt' 4 | OVERWRITE INTO TABLE dummy; 5 | 6 | SELECT CAST('X' AS INT) from dummy; 7 | 8 | SELECT 2 + TRUE FROM dummy; 9 | 10 | SELECT 2 + CAST(TRUE AS INT) FROM dummy; 11 | 12 | SELECT 2 + '2' FROM dummy; 13 | 14 | SELECT concat('Truth: ', TRUE) FROM simple; 15 | 16 | DROP TABLE IF EXISTS simple; 17 | CREATE TABLE simple ( 18 | col1 TIMESTAMP 19 | ); 20 | 21 | INSERT OVERWRITE TABLE simple 22 | SELECT '2012-01-02 03:04:05.123456789' FROM dummy; 23 | 24 | SELECT 2 + col1 FROM simple; 25 | 26 | SELECT 2L + col1 FROM simple; 27 | 28 | SELECT 2.0 + col1 FROM simple; 29 | 30 | SELECT 2 + CAST(col1 AS BIGINT) FROM simple; 31 | 32 | SELECT concat('Date: ', col1) FROM simple; 33 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/indexes.hive: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS users_extended; 2 | 3 | CREATE TABLE users_extended (id INT, name STRING, gender STRING); 4 | 5 | LOAD DATA LOCAL INPATH 'input/hive/tables/users_extended.txt' 6 | OVERWRITE INTO TABLE users_extended; 7 | 8 | DROP INDEX IF EXISTS users_index; 9 | 10 | CREATE INDEX users_index 11 | ON TABLE users_extended (gender) 12 | AS 'BITMAP' WITH DEFERRED REBUILD; 13 | ALTER INDEX users_index ON users_extended REBUILD; 14 | 15 | SELECT * FROM users_extended WHERE gender = 'F'; 16 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/max_temp.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == max_temp_select; 2 | 3 | DROP TABLE IF EXISTS records; 4 | 5 | CREATE TABLE records (year STRING, temperature INT, quality INT) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t'; 8 | 9 | LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample.txt' 10 | OVERWRITE INTO TABLE records; 11 | 12 | ! 
echo; # vv max_temp_select; 13 | SELECT year, MAX(temperature) 14 | FROM records 15 | WHERE temperature != 9999 AND quality IN (0, 1, 4, 5, 9) 16 | GROUP BY year; 17 | ! echo; # ^^ max_temp_select; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/regex_serde.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == select_stations; 2 | 3 | DROP TABLE IF EXISTS stations; 4 | 5 | CREATE TABLE stations (usaf STRING, wban STRING, name STRING) 6 | ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' 7 | WITH SERDEPROPERTIES ( 8 | "input.regex" = "(\\d{6}) (\\d{5}) (.{29}) .*" 9 | ); 10 | 11 | LOAD DATA LOCAL INPATH "input/ncdc/metadata/stations-fixed-width.txt" 12 | INTO TABLE stations; 13 | 14 | ! echo; # vv select_stations; 15 | SELECT * FROM stations LIMIT 4; 16 | ! echo; # ^^ select_stations; 17 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/set.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == define_function; 2 | ! echo; # == set_value; 3 | ! echo; # == set_show_value; 4 | 5 | ! echo; # vv define_function; 6 | DESCRIBE FUNCTION length; 7 | ! echo; # ^^ define_function; 8 | 9 | ! echo; # vv set_value; 10 | SET hive.enforce.bucketing=true; 11 | ! echo; # ^^ set_value; 12 | 13 | ! echo; # vv set_show_value; 14 | SET hive.enforce.bucketing; 15 | ! echo; # ^^ set_show_value; 16 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/sort.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == sort_by_year; 2 | 3 | DROP TABLE IF EXISTS records2; 4 | 5 | CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t'; 8 | 9 | LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt' 10 | OVERWRITE INTO TABLE records2; 11 | 12 | ! echo; # vv sort_by_year; 13 | FROM records2 14 | SELECT year, temperature 15 | DISTRIBUTE BY year 16 | SORT BY year ASC, temperature DESC; 17 | ! echo; # ^^ sort_by_year; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/types.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == complex_types; 2 | 3 | DROP TABLE IF EXISTS complex; 4 | 5 | CREATE TABLE complex ( 6 | c1 ARRAY<INT>, 7 | c2 MAP<STRING, INT>, 8 | c3 STRUCT<a:STRING, b:INT, c:DOUBLE>, 9 | c4 UNIONTYPE<STRING, INT> 10 | ); 11 | 12 | LOAD DATA LOCAL INPATH 'input/hive/types/complex.txt' 13 | OVERWRITE INTO TABLE complex; 14 | 15 | ! echo; # vv complex_types; 16 | SELECT c1[0], c2['b'], c3.c, c4 FROM complex; 17 | !
echo; # ^^ complex_types; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/java/com/hadoopbook/hive/Strip.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.hive; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.Text; 6 | 7 | public class Strip extends UDF { 8 | private Text result = new Text(); 9 | 10 | public Text evaluate(Text str) { 11 | if (str == null) { 12 | return null; 13 | } 14 | result.set(StringUtils.strip(str.toString())); 15 | return result; 16 | } 17 | 18 | public Text evaluate(Text str, String stripChars) { 19 | if (str == null) { 20 | return null; 21 | } 22 | result.set(StringUtils.strip(str.toString(), stripChars)); 23 | return result; 24 | } 25 | } -------------------------------------------------------------------------------- /ch17-hive/src/main/python/is_good_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | (year, temp, q) = line.strip().split() 8 | if (temp != "9999" and re.match("[01459]", q)): 9 | print "%s\t%s" % (year, temp) 10 | -------------------------------------------------------------------------------- /ch17-hive/src/main/python/max_temperature_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, 0) 6 | for line in sys.stdin: 7 | (key, val) = line.strip().split("\t") 8 | if last_key and last_key != key: 9 | print "%s\t%s" % (last_key, max_val) 10 | (last_key, max_val) = (key, int(val)) 11 | else: 12 | (last_key, max_val) = (key, max(max_val, int(val))) 13 | 14 | if last_key: 15 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch18-crunch/src/main/assembly/hadoop-job.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | job 8 | 9 | jar 10 | 11 | false 12 | 13 | 14 | false 15 | runtime 16 | lib 17 | 18 | ${groupId}:${artifactId} 19 | 20 | 21 | 22 | true 23 | 24 | ${groupId}:${artifactId} 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /ch18-crunch/src/main/java/crunch/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.io.Serializable; 4 | import org.apache.hadoop.io.Text; 5 | 6 | // Serializable copy of NcdcStationMetadataParser 7 | public class NcdcStationMetadataParser implements Serializable { 8 | 9 | private String stationId; 10 | private String stationName; 11 | 12 | public boolean parse(String record) { 13 | if (record.length() < 42) { // header 14 | return false; 15 | } 16 | String usaf = record.substring(0, 6); 17 | String wban = record.substring(7, 12); 18 | stationId = usaf + "-" + wban; 19 | stationName = record.substring(13, 42); 20 | try { 21 | Integer.parseInt(usaf); // USAF identifiers are numeric 22 | return true; 23 | } catch (NumberFormatException e) { 24 | return false; 25 | } 26 | } 27 | 28 | public boolean parse(Text record) { 29 | return parse(record.toString()); 30 | } 31 | 32 | public String getStationId() { 33 | return stationId; 34 | } 35 | 36 | public String getStationName() { 37 | return stationName; 
38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/CountValuesFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.util.Iterator; 4 | import org.apache.crunch.MapFn; 5 | 6 | public class CountValuesFn extends MapFn, Integer> { 7 | @Override 8 | public Integer map(Iterable input) { 9 | int count = 0; 10 | for (Iterator i = input.iterator(); i.hasNext(); ) { 11 | i.next(); 12 | count++; 13 | } 14 | return count; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/CustomDoFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | 6 | public class CustomDoFn extends DoFn { 7 | 8 | static class NonSerializableHelper { } 9 | 10 | transient NonSerializableHelper helper; 11 | 12 | @Override 13 | public void initialize() { 14 | helper = new NonSerializableHelper(); 15 | } 16 | 17 | @SuppressWarnings("unchecked") 18 | @Override 19 | public void process(S input, Emitter emitter) { 20 | // use helper here 21 | emitter.emit((T) input); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/InversePairFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | import org.apache.crunch.Pair; 6 | 7 | public class InversePairFn extends DoFn, Pair> { 8 | @Override 9 | public void process(Pair input, Emitter> emitter) { 10 | emitter.emit(Pair.of(input.second(), input.first())); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/PipelineDebugTest.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.PCollection; 4 | import org.apache.crunch.Pipeline; 5 | import org.apache.crunch.impl.mr.MRPipeline; 6 | import org.apache.crunch.test.TemporaryPath; 7 | import org.junit.Rule; 8 | import org.junit.Test; 9 | 10 | public class PipelineDebugTest { 11 | @Rule 12 | public transient TemporaryPath tmpDir = new TemporaryPath(); 13 | 14 | @Test 15 | public void testDebug() throws Exception { 16 | String inputPath = tmpDir.copyResourceFileName("set1.txt"); 17 | Pipeline pipeline = new MRPipeline(getClass()); 18 | pipeline.enableDebug(); 19 | pipeline.getConfiguration().setBoolean("crunch.log.job.progress", true); 20 | PCollection lines = pipeline.readTextFile(inputPath); 21 | pipeline.writeTextFile(lines, tmpDir.getFileName("out")); 22 | pipeline.done(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/SerializableFunctionsTest.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.io.IOException; 4 | import org.apache.crunch.PCollection; 5 | import org.apache.crunch.Pipeline; 6 | import org.apache.crunch.impl.mr.MRPipeline; 7 | import org.apache.crunch.test.TemporaryPath; 8 | import org.junit.Rule; 9 | import org.junit.Test; 10 | 11 | import static org.apache.crunch.types.avro.Avros.strings; 12 | import static 
org.junit.Assert.assertEquals; 13 | 14 | public class SerializableFunctionsTest { 15 | 16 | @Rule 17 | public transient TemporaryPath tmpDir = new TemporaryPath(); 18 | 19 | @Test 20 | public void testInitialize() throws IOException { 21 | String inputPath = tmpDir.copyResourceFileName("set1.txt"); 22 | Pipeline pipeline = new MRPipeline(getClass()); 23 | PCollection lines = pipeline.readTextFile(inputPath); 24 | long len = lines.parallelDo(new CustomDoFn(), strings()) 25 | .length().getValue(); 26 | assertEquals(4, len); 27 | pipeline.done(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/ToLowerFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | 6 | public class ToLowerFn extends DoFn { 7 | @Override 8 | public void process(String input, Emitter emitter) { 9 | emitter.emit(input.toLowerCase()); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/B: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 6 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/ints.txt: -------------------------------------------------------------------------------- 1 | 2 2 | 3 3 | 1 4 | 3 -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/numbers.seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch18-crunch/src/test/resources/numbers.seq -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/set1.txt: -------------------------------------------------------------------------------- 1 | b 2 | c 3 | a 4 | e -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/set2.txt: -------------------------------------------------------------------------------- 1 
| b 2 | c 3 | a 4 | e 5 | b 6 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/urls.txt: -------------------------------------------------------------------------------- 1 | www.A.com www.B.com 2 | www.A.com www.C.com 3 | www.A.com www.D.com 4 | www.A.com www.E.com 5 | www.B.com www.D.com 6 | www.B.com www.E.com 7 | www.C.com www.D.com 8 | www.D.com www.B.com 9 | www.E.com www.A.com 10 | www.F.com www.B.com 11 | www.F.com www.C.com 12 | -------------------------------------------------------------------------------- /ch19-spark/src/main/python/MaxTemperature.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import re, sys 3 | 4 | sc = SparkContext("local", "Max Temperature") 5 | sc.textFile(sys.argv[1]) \ 6 | .map(lambda s: s.split("\t")) \ 7 | .filter(lambda rec: (rec[1] != "9999" and re.match("[01459]", rec[2]))) \ 8 | .map(lambda rec: (int(rec[0]), int(rec[1]))) \ 9 | .reduceByKey(max) \ 10 | .saveAsTextFile(sys.argv[2]) 11 | -------------------------------------------------------------------------------- /ch19-spark/src/main/scala/MaxTemperature.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext._ 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | 4 | object MaxTemperature { 5 | def main(args: Array[String]) { 6 | val conf = new SparkConf().setAppName("Max Temperature") 7 | val sc = new SparkContext(conf) 8 | 9 | sc.textFile(args(0)) 10 | .map(_.split("\t")) 11 | .filter(rec => (rec(1) != "9999" && rec(2).matches("[01459]"))) 12 | .map(rec => (rec(0).toInt, rec(1).toInt)) 13 | .reduceByKey((a, b) => Math.max(a, b)) 14 | .saveAsTextFile(args(1)) 15 | } 16 | } -------------------------------------------------------------------------------- /ch19-spark/src/main/scala/MaxTemperatureWithPlacement.scala: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration 2 | import org.apache.hadoop.mapred.TextInputFormat 3 | import org.apache.spark.SparkContext._ 4 | import org.apache.spark.scheduler.InputFormatInfo 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object MaxTemperatureWithPlacement { 8 | def main(args: Array[String]) { 9 | val inputPath = args(0) 10 | val conf = new SparkConf().setAppName("Max Temperature") 11 | val preferredLocations = InputFormatInfo.computePreferredLocations( 12 | Seq(new InputFormatInfo(new Configuration(), classOf[TextInputFormat], 13 | inputPath))) 14 | val sc = new SparkContext(conf, preferredLocations) 15 | 16 | sc.textFile(args(0)) 17 | .map(_.split("\t")) 18 | .filter(rec => (rec(1) != "9999" && rec(2).matches("[01459]"))) 19 | .map(rec => (rec(0).toInt, rec(1).toInt)) 20 | .reduceByKey((a, b) => Math.max(a, b)) 21 | .saveAsTextFile(args(1)) 22 | } 23 | } -------------------------------------------------------------------------------- /ch19-spark/src/test/avro/IntWrapper.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "example.IntWrapper", 4 | "doc": "A record with a single int value field.", 5 | "fields": [ 6 | {"name": "value", "type": "int"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch19-spark/src/test/avro/WeatherRecord.avsc: -------------------------------------------------------------------------------- 1 | { 2 | 
"type": "record", 3 | "name": "WeatherRecord", 4 | "namespace": "specific", 5 | "doc": "A weather reading.", 6 | "fields": [ 7 | {"name": "year", "type": "int"}, 8 | {"name": "temperature", "type": "int"}, 9 | {"name": "stationId", "type": "string"} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/numbers.seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch19-spark/src/test/resources/numbers.seq -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/quangle.txt: -------------------------------------------------------------------------------- 1 | On the top of the Crumpetty Tree 2 | The Quangle Wangle sat, 3 | But his face you could not see, 4 | On account of his Beaver Hat. 5 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/set2.txt: -------------------------------------------------------------------------------- 1 | b 2 | c 3 | a 4 | e 5 | b 6 | -------------------------------------------------------------------------------- /ch19-spark/src/test/scala/CustomKryoRegistrator.scala: -------------------------------------------------------------------------------- 1 | import com.esotericsoftware.kryo.Kryo 2 | import org.apache.spark.serializer.KryoRegistrator 3 | import specific.WeatherRecord 4 | 5 | class CustomKryoRegistrator extends KryoRegistrator { 6 | override def registerClasses(kryo: Kryo) { 7 | kryo.register(classOf[WeatherRecord]) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ch19-spark/src/test/scala/ReflectWeatherRecord.scala: -------------------------------------------------------------------------------- 1 | case class ReflectWeatherRecord(year: Int, temperature: Int, stationId: String) { 2 | def this() = this(0, 0, null) 3 | } -------------------------------------------------------------------------------- /ch20-hbase/src/main/java/RowKeyConverter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.util.Bytes; 2 | 3 | public class RowKeyConverter { 4 | 5 | private static final int STATION_ID_LENGTH = 12; 6 | 7 | /** 8 | * @return A row key whose format is: 9 | */ 10 | public static byte[] makeObservationRowKey(String stationId, 11 | long observationTime) { 12 | byte[] row = new byte[STATION_ID_LENGTH + Bytes.SIZEOF_LONG]; 13 | Bytes.putBytes(row, 0, Bytes.toBytes(stationId), 0, STATION_ID_LENGTH); 14 | long reverseOrderTimestamp = Long.MAX_VALUE - observationTime; 15 | Bytes.putLong(row, STATION_ID_LENGTH, reverseOrderTimestamp); 16 | return row; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /ch21-zk/src/main/java/DeleteGroup.java: -------------------------------------------------------------------------------- 1 | //cc DeleteGroup A program to delete a group and its members 2 | import java.util.List; 3 | 4 | import org.apache.zookeeper.KeeperException; 5 | 6 | // vv DeleteGroup 7 | public class DeleteGroup extends ConnectionWatcher { 8 | 9 
| public void delete(String groupName) throws KeeperException, 10 | InterruptedException { 11 | String path = "/" + groupName; 12 | 13 | try { 14 | List children = zk.getChildren(path, false); 15 | for (String child : children) { 16 | zk.delete(path + "/" + child, -1); 17 | } 18 | zk.delete(path, -1); 19 | } catch (KeeperException.NoNodeException e) { 20 | System.out.printf("Group %s does not exist\n", groupName); 21 | System.exit(1); 22 | } 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | DeleteGroup deleteGroup = new DeleteGroup(); 27 | deleteGroup.connect(args[0]); 28 | deleteGroup.delete(args[1]); 29 | deleteGroup.close(); 30 | } 31 | } 32 | // ^^ DeleteGroup 33 | -------------------------------------------------------------------------------- /ch21-zk/src/main/java/JoinGroup.java: -------------------------------------------------------------------------------- 1 | //cc JoinGroup A program that joins a group 2 | 3 | import org.apache.zookeeper.CreateMode; 4 | import org.apache.zookeeper.KeeperException; 5 | import org.apache.zookeeper.ZooDefs.Ids; 6 | 7 | // vv JoinGroup 8 | public class JoinGroup extends ConnectionWatcher { 9 | 10 | public void join(String groupName, String memberName) throws KeeperException, 11 | InterruptedException { 12 | String path = "/" + groupName + "/" + memberName; 13 | String createdPath = zk.create(path, null/*data*/, Ids.OPEN_ACL_UNSAFE, 14 | CreateMode.EPHEMERAL); 15 | System.out.println("Created " + createdPath); 16 | } 17 | 18 | public static void main(String[] args) throws Exception { 19 | JoinGroup joinGroup = new JoinGroup(); 20 | joinGroup.connect(args[0]); 21 | joinGroup.join(args[1], args[2]); 22 | 23 | // stay alive until process is killed or thread is interrupted 24 | Thread.sleep(Long.MAX_VALUE); 25 | } 26 | } 27 | // ^^ JoinGroup 28 | -------------------------------------------------------------------------------- /ch21-zk/src/main/sh/group.sh: -------------------------------------------------------------------------------- 1 | : == group_create 2 | : == group_list_empty 3 | : == group_join 4 | : == group_list_after_join 5 | : == group_kill_goat 6 | : == group_list_after_kill 7 | : == group_delete 8 | : vv group_create 9 | java CreateGroup localhost zoo 10 | : ^^ group_create 11 | : vv group_list_empty 12 | java ListGroup localhost zoo 13 | : ^^ group_list_empty 14 | : vv group_join 15 | java JoinGroup localhost zoo duck & 16 | duck_pid=$! 17 | java JoinGroup localhost zoo cow & 18 | cow_pid=$! 19 | java JoinGroup localhost zoo goat & 20 | goat_pid=$! 
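# Each JoinGroup process registers an ephemeral znode under /zoo, so killing a
# member's process (the goat, below) removes it from subsequent group listings.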
21 | : ^^ group_join 22 | sleep 5 23 | : vv group_list_after_join 24 | java ListGroup localhost zoo 25 | : ^^ group_list_after_join 26 | : vv group_kill_goat 27 | kill $goat_pid 28 | : ^^ group_kill_goat 29 | sleep 5 30 | sleep 5 # be sure goat process has died 31 | : vv group_list_after_kill 32 | java ListGroup localhost zoo 33 | : ^^ group_list_after_kill 34 | kill $duck_pid 35 | kill $cow_pid 36 | : vv group_delete 37 | java DeleteGroup localhost zoo 38 | java ListGroup localhost zoo 39 | : ^^ group_delete -------------------------------------------------------------------------------- /ch22-case-studies/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch22-case-studies 12 | jar 13 | 4.0 14 | Chapter 22: Case Studies 15 | 16 | 17 | -------------------------------------------------------------------------------- /ch22-case-studies/src/main/java/TrackStats.jr: -------------------------------------------------------------------------------- 1 | module fm.last.hadoop.io.records { 2 | 3 | class TrackStats { 4 | int listeners; 5 | int plays; 6 | int scrobbles; 7 | int radioPlays; 8 | int skips; 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | common 12 | jar 13 | 4.0 14 | Common Code 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /common/src/main/java/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.io.Text; 2 | 3 | public class NcdcStationMetadataParser { 4 | 5 | private String stationId; 6 | private String stationName; 7 | 8 | public boolean parse(String record) { 9 | if (record.length() < 42) { // header 10 | return false; 11 | } 12 | String usaf = record.substring(0, 6); 13 | String wban = record.substring(7, 12); 14 | stationId = usaf + "-" + wban; 15 | stationName = record.substring(13, 42); 16 | try { 17 | Integer.parseInt(usaf); // USAF identifiers are numeric 18 | return true; 19 | } catch (NumberFormatException e) { 20 | return false; 21 | } 22 | } 23 | 24 | public boolean parse(Text record) { 25 | return parse(record.toString()); 26 | } 27 | 28 | public String getStationId() { 29 | return stationId; 30 | } 31 | 32 | public String getStationName() { 33 | return stationName; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /common/src/main/java/oldapi/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.io.Text; 4 | 5 | public class NcdcStationMetadataParser { 6 | 7 | private String stationId; 8 | private String stationName; 9 | 10 | public boolean parse(String record) { 11 | if (record.length() < 42) { // header 12 | return false; 13 | } 14 | String usaf = record.substring(0, 6); 15 | String wban = record.substring(7, 12); 16 | stationId = usaf + "-" + wban; 17 | stationName = record.substring(13, 42); 18 | try { 19 | Integer.parseInt(usaf); // USAF identifiers are numeric 20 | return true; 21 | } catch 
(NumberFormatException e) { 22 | return false; 23 | } 24 | } 25 | 26 | public boolean parse(Text record) { 27 | return parse(record.toString()); 28 | } 29 | 30 | public String getStationId() { 31 | return stationId; 32 | } 33 | 34 | public String getStationName() { 35 | return stationName; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /common/src/test/java/NcdcStationMetadataParserTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | import org.junit.*; 4 | 5 | public class NcdcStationMetadataParserTest { 6 | 7 | private NcdcStationMetadataParser parser; 8 | 9 | @Before 10 | public void setUp() { 11 | parser = new NcdcStationMetadataParser(); 12 | } 13 | 14 | @Test 15 | public void parsesValidRecord() { 16 | assertThat(parser.parse("715390 99999 MOOSE JAW CS CN CA SA CZMJ +50317 -105550 +05770"), is(true)); 17 | assertThat(parser.getStationId(), is("715390-99999")); 18 | assertThat(parser.getStationName().trim(), is("MOOSE JAW CS")); 19 | } 20 | 21 | @Test 22 | public void parsesHeader() { 23 | assertThat(parser.parse("Integrated Surface Database Station History, November 2007"), is(false)); 24 | } 25 | 26 | public void parsesBlankLine() { 27 | assertThat(parser.parse(""), is(false)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /conf/hadoop-cluster.template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://namenode/ 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | yarn 12 | 13 | 14 | 15 | yarn.resourcemanager.address 16 | resourcemanager:8032 17 | 18 | 19 | -------------------------------------------------------------------------------- /conf/hadoop-local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | file:/// 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | local 12 | 13 | 14 | -------------------------------------------------------------------------------- /conf/hadoop-localhost.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | yarn 12 | 13 | 14 | 15 | yarn.resourcemanager.address 16 | localhost:8032 17 | 18 | 19 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 
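<!-- Pseudo-distributed YARN: run the ResourceManager on localhost and enable the
     MapReduce shuffle auxiliary service on the NodeManager. -->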
yarn.resourcemanager.hostname 6 | localhost 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | -------------------------------------------------------------------------------- /conf/pig/localhost/pig.properties: -------------------------------------------------------------------------------- 1 | fs.defaultFS=hdfs://localhost/ 2 | mapred.job.tracker=localhost:8021 -------------------------------------------------------------------------------- /conf/zookeeper/cluster/zoo.cfg: -------------------------------------------------------------------------------- 1 | tickTime=2000 2 | dataDir=/disk1/zookeeper 3 | dataLogDir=/disk2/zookeeper 4 | clientPort=2181 5 | initLimit=5 6 | syncLimit=2 7 | server.1=zookeeper1:2888:3888 8 | server.2=zookeeper2:2888:3888 9 | server.3=zookeeper3:2888:3888 10 | -------------------------------------------------------------------------------- /conf/zookeeper/localhost/zoo.cfg: -------------------------------------------------------------------------------- 1 | tickTime=2000 2 | dataDir=/Users/tom/zookeeper 3 | clientPort=2181 4 | -------------------------------------------------------------------------------- /input/avro/pairs.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/avro/pairs.avro -------------------------------------------------------------------------------- /input/badrecords/a: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | S 7 | G 8 | I 9 | 8 10 | 9 -------------------------------------------------------------------------------- /input/badrecords/b: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 -------------------------------------------------------------------------------- /input/badrecords/c: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /input/docs/1400-8.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/docs/1400-8.txt -------------------------------------------------------------------------------- /input/docs/quangle.txt: -------------------------------------------------------------------------------- 1 | On the top of the Crumpetty Tree 2 | The Quangle Wangle sat, 3 | But his face you could not see, 4 | On account of his Beaver Hat. 
5 | -------------------------------------------------------------------------------- /input/fileglob/2007/12/30/data-2007-12-30: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/30/data-2007-12-30 -------------------------------------------------------------------------------- /input/fileglob/2007/12/30/data[2007-12-30]: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/30/data[2007-12-30] -------------------------------------------------------------------------------- /input/fileglob/2007/12/31/data-2007-12-31: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/31/data-2007-12-31 -------------------------------------------------------------------------------- /input/fileglob/2008/01/01/data-2008-01-01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2008/01/01/data-2008-01-01 -------------------------------------------------------------------------------- /input/fileinput/a: -------------------------------------------------------------------------------- 1 | a -------------------------------------------------------------------------------- /input/fileinput/dir/b: -------------------------------------------------------------------------------- 1 | b -------------------------------------------------------------------------------- /input/hive/README: -------------------------------------------------------------------------------- 1 | Commands used to create some of the binary files: 2 | 3 | echo -e '0\x01Nat' > tables/users.txt 4 | echo -e '2\x01Joe' >> tables/users.txt 5 | echo -e '3\x01Kay' >> tables/users.txt 6 | echo -e '4\x01Ann' >> tables/users.txt 7 | 8 | echo -e '1\x022\x01a\x031\x02b\x032\x01a\x021\x021.0' > types/complex.txt 9 | 10 | echo -e '0\x01Nat\x01M' > tables/users_extended.txt 11 | echo -e '2\x01Joe\x01M' >> tables/users_extended.txt 12 | echo -e '3\x01Kay\x01F' >> tables/users_extended.txt 13 | echo -e '4\x01Ann\x01F' >> tables/users_extended.txt 14 | -------------------------------------------------------------------------------- /input/hive/dummy.txt: -------------------------------------------------------------------------------- 1 | X 2 | -------------------------------------------------------------------------------- /input/hive/joins/sales.txt: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 -------------------------------------------------------------------------------- /input/hive/joins/things.txt: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf -------------------------------------------------------------------------------- /input/hive/partitions/file1: -------------------------------------------------------------------------------- 1 | 1Log line 1 -------------------------------------------------------------------------------- /input/hive/partitions/file2: 
-------------------------------------------------------------------------------- 1 | 2Log line 2 -------------------------------------------------------------------------------- /input/hive/partitions/file3: -------------------------------------------------------------------------------- 1 | 3Log line 3 -------------------------------------------------------------------------------- /input/hive/partitions/file4: -------------------------------------------------------------------------------- 1 | 4Log line 4 -------------------------------------------------------------------------------- /input/hive/partitions/file5: -------------------------------------------------------------------------------- 1 | 5Log line 5 -------------------------------------------------------------------------------- /input/hive/partitions/file6: -------------------------------------------------------------------------------- 1 | 6Log line 6 -------------------------------------------------------------------------------- /input/hive/tables/users.txt: -------------------------------------------------------------------------------- 1 | 0Nat 2 | 2Joe 3 | 3Kay 4 | 4Ann 5 | -------------------------------------------------------------------------------- /input/hive/tables/users_extended.txt: -------------------------------------------------------------------------------- 1 | 0NatM 2 | 2JoeM 3 | 3KayF 4 | 4AnnF 5 | -------------------------------------------------------------------------------- /input/hive/tmp.txt: -------------------------------------------------------------------------------- 1 | 1 a 2 | 2 b 3 | 3 c -------------------------------------------------------------------------------- /input/hive/types/complex.txt: -------------------------------------------------------------------------------- 1 | 12b2a1a11.0163 2 | -------------------------------------------------------------------------------- /input/hive/types/nested.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/hive/types/nested.txt -------------------------------------------------------------------------------- /input/hive/udfs/arrays.txt: -------------------------------------------------------------------------------- 1 | ab 2 | cde 3 | -------------------------------------------------------------------------------- /input/hive/udfs/fruit.txt: -------------------------------------------------------------------------------- 1 | pomegranate 2 | banana 3 | apple 4 | lychee 5 | -------------------------------------------------------------------------------- /input/hive/udfs/max1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 -------------------------------------------------------------------------------- /input/hive/udfs/max2.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 3 -------------------------------------------------------------------------------- /input/ncdc/all/1901.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/all/1901.gz -------------------------------------------------------------------------------- /input/ncdc/all/1902.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/all/1902.gz -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample.txt: -------------------------------------------------------------------------------- 1 | 1950 0 1 2 | 1950 22 1 3 | 1950 -11 1 4 | 1949 111 1 5 | 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample2.txt: -------------------------------------------------------------------------------- 1 | A 1950 0 1 2 | B 1950 22 1 3 | A 1950 -11 1 4 | B 1949 111 1 5 | A 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample_corrupt.txt: -------------------------------------------------------------------------------- 1 | 1950 0 1 2 | 1950 22 1 3 | 1950 e 1 4 | 1949 111 1 5 | 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /input/ncdc/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /input/ncdc/sample.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/sample.txt.gz -------------------------------------------------------------------------------- /input/pig/combine/A: -------------------------------------------------------------------------------- 1 | 2 3 2 | 1 2 3 | 2 4 4 | -------------------------------------------------------------------------------- /input/pig/combine/B: -------------------------------------------------------------------------------- 1 | z x 8 2 | w y 1 3 | -------------------------------------------------------------------------------- /input/pig/corrupt/missing_fields: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/foreach/A: 
-------------------------------------------------------------------------------- 1 | Joe cherry 2 2 | Ali apple 3 3 | Joe banana 2 4 | Eve apple 7 5 | -------------------------------------------------------------------------------- /input/pig/group/A: -------------------------------------------------------------------------------- 1 | Joe cherry 2 | Ali apple 3 | Joe banana 4 | Eve apple 5 | -------------------------------------------------------------------------------- /input/pig/join/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/join/B: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 6 | -------------------------------------------------------------------------------- /input/pig/multiquery/A: -------------------------------------------------------------------------------- 1 | Joe cherry 2 | Ali apple 3 | Joe banana 4 | Eve apple 5 | -------------------------------------------------------------------------------- /input/pig/nested/A: -------------------------------------------------------------------------------- 1 | popcorn {(cherry, 1), (cranberry, 3), (pomegranate, 2)} [a:1, b:2] 2 | burger {(apple, 1), (banana, 3), (tangerine, 2)} [a:2, b:1] 3 | -------------------------------------------------------------------------------- /input/pig/nested/B: -------------------------------------------------------------------------------- 1 | popcorn [a:1, b:2] 2 | burger [a:2, b:1] 3 | -------------------------------------------------------------------------------- /input/pig/pairwise/postings: -------------------------------------------------------------------------------- 1 | A {(d1,2),(d3,1)} 2 | B {(d1,1),(d2,1),(d3,2)} 3 | -------------------------------------------------------------------------------- /input/pig/schema/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/sort/A: -------------------------------------------------------------------------------- 1 | 2 3 2 | 1 2 3 | 2 4 4 | -------------------------------------------------------------------------------- /input/pig/tuples/A: -------------------------------------------------------------------------------- 1 | (1,pomegranate) 2 | -------------------------------------------------------------------------------- /input/pig/types/A: -------------------------------------------------------------------------------- 1 | 1 (1,'pomegranate',(2,'apple')) 2 | 2 (3,'banana',(4,lychee)) 3 | -------------------------------------------------------------------------------- /input/pig/types/B: -------------------------------------------------------------------------------- 1 | [a#pomegranate] 2 | -------------------------------------------------------------------------------- /input/pig/types/C: -------------------------------------------------------------------------------- 1 | a pomegranate 2 | b apple 3 | -------------------------------------------------------------------------------- /input/pig/types/one: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /input/pig/udfs/A: 
-------------------------------------------------------------------------------- 1 | pomegranate 2 | banana 3 | apple 4 | lychee 5 | -------------------------------------------------------------------------------- /input/smallfiles/a: -------------------------------------------------------------------------------- 1 | aaaaaaaaaa -------------------------------------------------------------------------------- /input/smallfiles/b: -------------------------------------------------------------------------------- 1 | bbbbbbbbbb -------------------------------------------------------------------------------- /input/smallfiles/c: -------------------------------------------------------------------------------- 1 | cccccccccc -------------------------------------------------------------------------------- /input/smallfiles/d: -------------------------------------------------------------------------------- 1 | dddddddddd -------------------------------------------------------------------------------- /input/smallfiles/e: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/smallfiles/e -------------------------------------------------------------------------------- /input/smallfiles/f: -------------------------------------------------------------------------------- 1 | ffffffffff -------------------------------------------------------------------------------- /input/wikipedia/example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page title 4 | edit=sysop:move=sysop 5 | 6 | 2001-01-15T13:15:00Z 7 | Foobar 8 | I have just one thing to say! 9 | A bunch of [[text]] here. 10 | 11 | 12 | 13 | 2001-01-15T13:10:27Z 14 | 10.0.0.2 15 | new! 16 | An earlier [[revision]]. 17 | 18 | 19 | 20 | 21 | Talk:Page title 22 | 23 | 2001-01-15T14:03:00Z 24 | 10.0.0.2 25 | hey 26 | WHYD YOU LOCK PAGE??!!! i was editing that jerk 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /snippet/README: -------------------------------------------------------------------------------- 1 | This directory contains tools for generating code snippets for the book and 2 | testing that they are as expected. 3 | 4 | Example invocations: 5 | 6 | # First set the location of the Hadoop installation you are testing 7 | # You need to have an unpacked copy of Hadoop in this directory 8 | export HADOOP_HOME=~/dev/hadoop-1.0.0/ 9 | 10 | # From the top level 11 | mvn verify -Phadoop.version=1.0.0 12 | 13 | # From the snippet directory 14 | mvn verify 15 | 16 | # Run against a pseudo cluster (you need to start it first) 17 | mvn verify -DargLine="-Dexample.mode=pseudo" 18 | 19 | # Run the examples from chapter 2 only 20 | mvn verify -DargLine="-Dexample.chapters=ch02-mr-intro" -------------------------------------------------------------------------------- /snippet/bin/check_expected.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=`dirname "$0"` 4 | bin=`cd "$bin"; pwd` 5 | 6 | actual="$bin"/../actual 7 | expected="$bin"/../expected 8 | 9 | for f in $expected/ch16-pig/grunt/*.xml; do 10 | echo $f 11 | f_actual=$actual/ch16-pig/grunt/`basename $f` 12 | diff $f $f_actual > /dev/null 13 | if [ $? 
!= 0 ]; then 14 | echo "Expected file $f different to actual $f_actual:" 15 | diff $f $f_actual 16 | #exit 1 17 | fi 18 | done -------------------------------------------------------------------------------- /snippet/bin/check_manuscript.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Check that the expected (or actual) snippets are in the manuscript. E.g. 4 | # bin/check_manuscript.py ~/book-workspace/htdg-git/ch16-pig.xml expected/ch16-pig/grunt/* 5 | 6 | import sys 7 | 8 | manuscript = open(sys.argv[1], 'r').read() 9 | 10 | for snippet_file in sys.argv[2:]: 11 | lines = open(snippet_file, 'r').readlines() 12 | if lines[0].startswith(" 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/httpfs-signature.secret: -------------------------------------------------------------------------------- 1 | hadoop httpfs secret 2 | -------------------------------------------------------------------------------- /snippet/conf/local/httpfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-env.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | @rem Licensed to the Apache Software Foundation (ASF) under one or more 3 | @rem contributor license agreements. See the NOTICE file distributed with 4 | @rem this work for additional information regarding copyright ownership. 5 | @rem The ASF licenses this file to You under the Apache License, Version 2.0 6 | @rem (the "License"); you may not use this file except in compliance with 7 | @rem the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, software 12 | @rem distributed under the License is distributed on an "AS IS" BASIS, 13 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | @rem See the License for the specific language governing permissions and 15 | @rem limitations under the License. 
16 | 17 | set HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000 18 | 19 | set HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 20 | 21 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-site.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /snippet/conf/local/slaves: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /snippet/conf/local/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.scheduler.capacity.root.queues 5 | prod,dev 6 | 7 | 8 | yarn.scheduler.capacity.root.dev.queues 9 | eng,science 10 | 11 | 12 | yarn.scheduler.capacity.root.prod.capacity 13 | 40 14 | 15 | 16 | yarn.scheduler.capacity.root.dev.capacity 17 | 60 18 | 19 | 20 | yarn.scheduler.capacity.root.dev.maximum-capacity 21 | 75 22 | 23 | 24 | yarn.scheduler.capacity.root.dev.eng.capacity 25 | 50 26 | 27 | 28 | yarn.scheduler.capacity.root.dev.science.capacity 29 | 50 30 | 31 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/container-executor.cfg: -------------------------------------------------------------------------------- 1 | yarn.nodemanager.linux-container-executor.group=#configured value of yarn.nodemanager.linux-container-executor.group 2 | banned.users=#comma separated list of users who can not run applications 3 | min.user.id=1000#Prevent other super-users 4 | allowed.system.users=##comma separated list of system users who CAN run applications 5 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fair 4 | 5 | 6 | 40 7 | fifo 8 | 9 | 10 | 11 | 60 12 | 13 | 14 | 15 | 16 | 5 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/httpfs-signature.secret: -------------------------------------------------------------------------------- 1 | hadoop httpfs secret 2 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/httpfs-site.xml: -------------------------------------------------------------------------------- 1 
| 2 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-env.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | @rem Licensed to the Apache Software Foundation (ASF) under one or more 3 | @rem contributor license agreements. See the NOTICE file distributed with 4 | @rem this work for additional information regarding copyright ownership. 5 | @rem The ASF licenses this file to You under the Apache License, Version 2.0 6 | @rem (the "License"); you may not use this file except in compliance with 7 | @rem the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, software 12 | @rem distributed under the License is distributed on an "AS IS" BASIS, 13 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | @rem See the License for the specific language governing permissions and 15 | @rem limitations under the License. 16 | 17 | set HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000 18 | 19 | set HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 20 | 21 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-site.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/slaves: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | yarn.resourcemanager.hostname 6 | localhost 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/combine_schema.xml: -------------------------------------------------------------------------------- 1 | grunt> DESCRIBE A; 2 | A: {f0: int,f1: int} 3 | grunt> DESCRIBE B; 4 | B: {f0: chararray,f1: chararray,f2: int} 5 | grunt> DESCRIBE C; 6 | Schema for C unknown. 
-------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/combine_union.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,3) 3 | (1,2) 4 | (2,4) 5 | grunt> DUMP B; 6 | (z,x,8) 7 | (w,y,1) 8 | grunt> C = UNION A, B; 9 | grunt> DUMP C; 10 | (2,3) 11 | (1,2) 12 | (2,4) 13 | (z,x,8) 14 | (w,y,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/foreach_generate.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (Joe,cherry,2) 3 | (Ali,apple,3) 4 | (Joe,banana,2) 5 | (Eve,apple,7) 6 | grunt> B = FOREACH A GENERATE $0, $2+1, 'Constant'; 7 | grunt> DUMP B; 8 | (Joe,3,Constant) 9 | (Ali,4,Constant) 10 | (Joe,3,Constant) 11 | (Eve,8,Constant) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_all.xml: -------------------------------------------------------------------------------- 1 | grunt> C = GROUP A ALL; 2 | grunt> DUMP C; 3 | (all,{(Joe,cherry),(Ali,apple),(Joe,banana),(Eve,apple)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (Joe,cherry) 3 | (Ali,apple) 4 | (Joe,banana) 5 | (Eve,apple) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_expression.xml: -------------------------------------------------------------------------------- 1 | grunt> B = GROUP A BY SIZE($1); 2 | grunt> DUMP B; 3 | (5,{(Ali,apple),(Eve,apple)}) 4 | (6,{(Joe,cherry),(Joe,banana)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup.xml: -------------------------------------------------------------------------------- 1 | grunt> D = COGROUP A BY $0, B BY $1; 2 | grunt> DUMP D; 3 | (0,{},{(Ali,0)}) 4 | (1,{(1,Scarf)},{}) 5 | (2,{(2,Tie)},{(Joe,2),(Hank,2)}) 6 | (3,{(3,Hat)},{(Eve,3)}) 7 | (4,{(4,Coat)},{(Hank,4)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_flatten.xml: -------------------------------------------------------------------------------- 1 | grunt> F = FOREACH E GENERATE FLATTEN(A), B.$0; 2 | grunt> DUMP F; 3 | (1,Scarf,{}) 4 | (2,Tie,{(Joe),(Hank)}) 5 | (3,Hat,{(Eve)}) 6 | (4,Coat,{(Hank)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_inner.xml: -------------------------------------------------------------------------------- 1 | grunt> E = COGROUP A BY $0 INNER, B BY $1; 2 | grunt> DUMP E; 3 | (1,{(1,Scarf)},{}) 4 | (2,{(2,Tie)},{(Joe,2),(Hank,2)}) 5 | (3,{(3,Hat)},{(Eve,3)}) 6 | (4,{(4,Coat)},{(Hank,4)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_join.xml: -------------------------------------------------------------------------------- 1 | grunt> G = COGROUP A BY $0 INNER, B BY $1 INNER; 2 | grunt> H = FOREACH G GENERATE FLATTEN($1), FLATTEN($2); 3 | grunt> DUMP H; 4 | (2,Tie,Joe,2) 5 | (2,Tie,Hank,2) 6 | (3,Hat,Eve,3) 7 | (4,Coat,Hank,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_dump.xml: 
-------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,Tie) 3 | (4,Coat) 4 | (3,Hat) 5 | (1,Scarf) 6 | grunt> DUMP B; 7 | (Joe,2) 8 | (Hank,4) 9 | (Ali,0) 10 | (Eve,3) 11 | (Hank,2) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_frj.xml: -------------------------------------------------------------------------------- 1 | grunt> C = JOIN A BY $0, B BY $1 USING "replicated"; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_join.xml: -------------------------------------------------------------------------------- 1 | grunt> C = JOIN A BY $0, B BY $1; 2 | grunt> DUMP C; 3 | (2,Tie,Joe,2) 4 | (2,Tie,Hank,2) 5 | (3,Hat,Eve,3) 6 | (4,Coat,Hank,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_describe_records.xml: -------------------------------------------------------------------------------- 1 | grunt> DESCRIBE records; 2 | records: {year: chararray,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_dump_grouped_records.xml: -------------------------------------------------------------------------------- 1 | grunt> grouped_records = GROUP filtered_records BY year; 2 | grunt> DUMP grouped_records; 3 | (1949,{(1949,111,1),(1949,78,1)}) 4 | (1950,{(1950,0,1),(1950,22,1),(1950,-11,1)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_dump_records.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP records; 2 | (1950,0,1) 3 | (1950,22,1) 4 | (1950,-11,1) 5 | (1949,111,1) 6 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_filter_records.xml: -------------------------------------------------------------------------------- 1 | grunt> filtered_records = FILTER records BY temperature != 9999 AND 2 | >> (quality == 0 OR quality == 1 OR quality == 4 OR quality == 5 OR quality == 9); 3 | grunt> DUMP filtered_records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1950,-11,1) 7 | (1949,111,1) 8 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_load.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year:chararray, temperature:int, quality:int); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_max_temp.xml: -------------------------------------------------------------------------------- 1 | grunt> max_temp = FOREACH grouped_records GENERATE group, 2 | >> MAX(filtered_records.temperature); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_result.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP max_temp; 2 | (1949,111) 3 | (1950,22) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/missing_fields.xml: -------------------------------------------------------------------------------- 1 | grunt> A = LOAD 'input/pig/corrupt/missing_fields'; 2 | 
grunt> DUMP A; 3 | (2,Tie) 4 | (4,Coat) 5 | (3) 6 | (1,Scarf) 7 | grunt> B = FILTER A BY SIZE(TOTUPLE(*)) > 1; 8 | grunt> DUMP B; 9 | (2,Tie) 10 | (4,Coat) 11 | (1,Scarf) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_corrupt.xml: -------------------------------------------------------------------------------- 1 | grunt> corrupt_records = FILTER records BY temperature is null; 2 | grunt> DUMP corrupt_records; 3 | (1950,,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_count.xml: -------------------------------------------------------------------------------- 1 | grunt> grouped = GROUP corrupt_records ALL; 2 | grunt> all_grouped = FOREACH grouped GENERATE group, COUNT(corrupt_records); 3 | grunt> DUMP all_grouped; 4 | (all,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample_corrupt.txt' 2 | >> AS (year:chararray, temperature:int, quality:int); 3 | grunt> DUMP records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1950,,1) 7 | (1949,111,1) 8 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_split.xml: -------------------------------------------------------------------------------- 1 | grunt> SPLIT records INTO good_records IF temperature is not null, 2 | >> bad_records IF temperature is null; 3 | grunt> DUMP good_records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1949,111,1) 7 | (1949,78,1) 8 | grunt> DUMP bad_records; 9 | (1950,,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_absent.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt'; 2 | grunt> DESCRIBE records; 3 | Schema for records unknown. 
-------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_absent_projected.xml: -------------------------------------------------------------------------------- 1 | grunt> projected_records = FOREACH records GENERATE $0, $1, $2; 2 | grunt> DUMP projected_records; 3 | (1950,0,1) 4 | (1950,22,1) 5 | (1950,-11,1) 6 | (1949,111,1) 7 | (1949,78,1) 8 | grunt> DESCRIBE projected_records; 9 | projected_records: {bytearray,bytearray,bytearray} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_names_only.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year, temperature, quality); 3 | grunt> DESCRIBE records; 4 | records: {year: bytearray,temperature: bytearray,quality: bytearray} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_one_type_only.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year, temperature:int, quality:int); 3 | grunt> DESCRIBE records; 4 | records: {year: bytearray,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_types.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year:int, temperature:int, quality:int); 3 | grunt> DESCRIBE records; 4 | records: {year: int,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/set_debug_on.xml: -------------------------------------------------------------------------------- 1 | grunt> set debug on -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,3) 3 | (1,2) 4 | (2,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_limit.xml: -------------------------------------------------------------------------------- 1 | grunt> D = LIMIT B 2; 2 | grunt> DUMP D; 3 | (1,2) 4 | (2,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_no_order.xml: -------------------------------------------------------------------------------- 1 | grunt> C = FOREACH B GENERATE *; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_order.xml: -------------------------------------------------------------------------------- 1 | grunt> B = ORDER A BY $0, $1 DESC; 2 | grunt> DUMP B; 3 | (1,2) 4 | (2,4) 5 | (2,3) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/store_colon_delimited.xml: -------------------------------------------------------------------------------- 1 | grunt> STORE A INTO 'out' USING PigStorage(':'); 2 | grunt> cat out 3 | Joe:cherry:2 4 | Ali:apple:3 5 | Joe:banana:2 6 | Eve:apple:7 -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/stream_cut.xml: 
-------------------------------------------------------------------------------- 1 | grunt> C = STREAM A THROUGH `cut -f 2`; 2 | grunt> DUMP C; 3 | (cherry) 4 | (apple) 5 | (banana) 6 | (apple) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_invoke_long.xml: -------------------------------------------------------------------------------- 1 | grunt> filtered_records = FILTER records BY temperature != 9999 AND 2 | >> com.hadoopbook.pig.IsGoodQuality(quality); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_invoke_short.xml: -------------------------------------------------------------------------------- 1 | grunt> DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 2 | grunt> filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_load.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro/sample.txt' 2 | >> USING com.hadoopbook.pig.CutLoadFunc('16-19,88-92,93-93') 3 | >> AS (year:int, temperature:int, quality:int); 4 | grunt> DUMP records; 5 | (1950,0,1) 6 | (1950,22,1) 7 | (1950,-11,1) 8 | (1949,111,1) 9 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_register.xml: -------------------------------------------------------------------------------- 1 | grunt> REGISTER pig-examples.jar; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_schema.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | ( pomegranate) 3 | (banana ) 4 | (apple) 5 | ( lychee ) 6 | grunt> DESCRIBE A; 7 | A: {fruit: chararray} 8 | grunt> B = FOREACH A GENERATE com.hadoopbook.pig.Trim(fruit); 9 | grunt> DUMP B; 10 | (pomegranate) 11 | (banana) 12 | (apple) 13 | (lychee) 14 | grunt> DESCRIBE B; 15 | B: {chararray} -------------------------------------------------------------------------------- /snippet/src/test/resources/copyoutput.sh: -------------------------------------------------------------------------------- 1 | if [ ! -e output ]; then 2 | hadoop fs -get output . 3 | fi 4 | -------------------------------------------------------------------------------- /snippet/src/test/resources/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if ! hadoop fs -test -e input; then 3 | hadoop fs -put input . 4 | fi 5 | if hadoop fs -test -e output; then 6 | hadoop fs -rmr output 7 | fi 8 | if [ -e output ]; then 9 | rm -r output 10 | fi 11 | --------------------------------------------------------------------------------
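
The two helper scripts above are the glue the snippet tests place around each example run: setup.sh stages the local input/ directory in HDFS and clears any stale output directory (both in HDFS and locally), and copyoutput.sh pulls the HDFS output directory back to the local filesystem so it can be compared against the expected snippets. A minimal sketch of wrapping them around a single run by hand, assuming a pseudo-distributed cluster is up and an example JAR has been built (the JAR and class names below are illustrative assumptions, not taken from the scripts themselves):

    #!/bin/sh
    # Stage input in HDFS and remove any stale output directory (setup.sh above)
    snippet/src/test/resources/setup.sh

    # Run one example against the staged input; JAR and class names are assumptions
    hadoop jar hadoop-examples.jar MaxTemperature input/ncdc/sample.txt output

    # Copy the HDFS output directory back to the local filesystem (copyoutput.sh above)
    snippet/src/test/resources/copyoutput.sh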