├── .gitignore
├── README.md
├── appc
└── src
│ └── main
│ └── sh
│ ├── create_ncdc_files.sh
│ ├── load_ncdc.sh
│ ├── load_ncdc_map.sh
│ └── ncdc_files.txt
├── book
├── pom.xml
└── src
│ └── main
│ └── assembly
│ ├── jar.xml
│ └── oozie-workflow-application.xml
├── ch02-mr-intro
├── pom.xml
└── src
│ └── main
│ ├── awk
│ └── max_temperature.sh
│ ├── cpp
│ ├── Makefile
│ └── max_temperature.cpp
│ ├── examples
│ ├── MaxTemperature
│ │ ├── input.txt
│ │ └── output
│ │ │ └── part-r-00000
│ ├── MaxTemperatureWithCombiner
│ │ ├── input.txt
│ │ └── output
│ │ │ └── part-r-00000
│ ├── OldMaxTemperature
│ │ ├── input.txt
│ │ └── output
│ │ │ └── part-00000
│ ├── max_temperature.cpp.input.txt
│ ├── max_temperature_hadoop.input.txt
│ ├── max_temperature_hadoop_cluster.input.txt
│ └── max_temperature_py
│ │ ├── 2
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-00000
│ │ ├── input.txt
│ │ ├── output
│ │ │ └── part-r-00000
│ │ └── pseudo
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-00000
│ ├── java
│ ├── MaxTemperature.java
│ ├── MaxTemperatureMapper.java
│ ├── MaxTemperatureReducer.java
│ ├── MaxTemperatureWithCombiner.java
│ ├── OldMaxTemperature.java
│ └── oldapi
│ │ ├── MaxTemperature.java
│ │ ├── MaxTemperatureMapper.java
│ │ ├── MaxTemperatureReducer.java
│ │ └── MaxTemperatureWithCombiner.java
│ ├── python
│ ├── max_temperature_map.py
│ └── max_temperature_reduce.py
│ ├── ruby
│ ├── max_temperature_map.rb
│ └── max_temperature_reduce.rb
│ └── sh
│ └── max_temp.sh
├── ch03-hdfs
├── pom.xml
└── src
│ ├── main
│ ├── conf
│ │ ├── core-site.xml
│ │ └── hdfs-site.xml
│ ├── java
│ │ ├── DateRangePathFilter.java
│ │ ├── FileCopyWithProgress.java
│ │ ├── FileSystemCat.java
│ │ ├── FileSystemDoubleCat.java
│ │ ├── ListStatus.java
│ │ ├── RegexExcludePathFilter.java
│ │ ├── RegexPathFilter.java
│ │ └── URLCat.java
│ └── sh
│ │ ├── file.sh
│ │ └── hars.sh
│ └── test
│ └── java
│ ├── CoherencyModelTest.java
│ ├── FileSystemDeleteTest.java
│ ├── FileSystemGlobTest.java
│ └── ShowFileStatusTest.java
├── ch04-yarn
├── capacity-scheduler.xml
└── fair-scheduler.xml
├── ch05-io
├── pom.xml
└── src
│ ├── main
│ ├── examples
│ │ ├── FileDecompressor.java.input.txt
│ │ ├── MapFile-data-head.input.txt
│ │ ├── MapFile-data-head.output.txt
│ │ ├── MapFile-index.input.txt
│ │ ├── MapFile-index.output.txt
│ │ ├── MapFile-ls.input.txt
│ │ ├── MapFile-ls.output.txt
│ │ ├── MapFileWriteDemo.java.input.txt
│ │ ├── MaxTemperatureWithCompression
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000.gz
│ │ ├── MaxTemperatureWithMapOutputCompression.ignore
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ ├── SequenceFileMapReduceSort.java.input.txt
│ │ ├── SequenceFileMapReduceSortResults.java.input.txt
│ │ ├── SequenceFileMapReduceSortResults.java.output.txt
│ │ ├── SequenceFileMapReduceSortResults.java.pre.sh
│ │ ├── SequenceFileReadDemo.java.input.txt
│ │ ├── SequenceFileReadDemo.java.output.txt
│ │ ├── SequenceFileReadDemo.java.pre.sh
│ │ ├── SequenceFileToMapFileConverter-fix.java.input.txt
│ │ ├── SequenceFileToMapFileConverter-mv.java.input.txt
│ │ ├── SequenceFileToMapFileConverter-sort.java.input.txt
│ │ ├── SequenceFileWriteDemo.java.input.txt
│ │ ├── SequenceFileWriteDemo.java.output.txt
│ │ ├── StreamCompressor.java.input.txt
│ │ ├── StreamCompressor.java.output.txt
│ │ ├── TextIterator.java.input.txt
│ │ ├── TextIterator.java.output.txt
│ │ ├── hadoop-fs-text.input.txt
│ │ └── hadoop-fs-text.output.txt
│ └── java
│ │ ├── FileDecompressor.java
│ │ ├── IntPair.java
│ │ ├── MapFileFixer.java
│ │ ├── MapFileWriteDemo.java
│ │ ├── MaxTemperatureWithCompression.java
│ │ ├── MaxTemperatureWithMapOutputCompression.java
│ │ ├── PooledStreamCompressor.java
│ │ ├── SequenceFileReadDemo.java
│ │ ├── SequenceFileWriteDemo.java
│ │ ├── StreamCompressor.java
│ │ ├── TextArrayWritable.java
│ │ ├── TextIterator.java
│ │ ├── TextPair.java
│ │ └── oldapi
│ │ ├── IntPair.java
│ │ ├── MaxTemperatureWithCompression.java
│ │ ├── MaxTemperatureWithMapOutputCompression.java
│ │ └── TextPair.java
│ └── test
│ ├── java
│ ├── ArrayWritableTest.java
│ ├── BinaryOrTextWritable.java
│ ├── BooleanWritableTest.java
│ ├── BytesWritableTest.java
│ ├── FileDecompressorTest.java
│ ├── GenericWritableTest.java
│ ├── IntPairTest.java
│ ├── IntWritableTest.java
│ ├── MapFileSeekTest.java
│ ├── MapWritableTest.java
│ ├── NullWritableTest.java
│ ├── ObjectWritableTest.java
│ ├── SequenceFileSeekAndSyncTest.java
│ ├── StringTextComparisonTest.java
│ ├── TextPairTest.java
│ ├── TextTest.java
│ ├── VIntWritableTest.java
│ ├── VLongWritableTest.java
│ └── WritableTestBase.java
│ └── resources
│ └── file.gz
├── ch06-mr-dev
├── input
│ └── ncdc
│ │ └── micro
│ │ └── sample.txt
├── output
│ ├── ._SUCCESS.crc
│ ├── .part-r-00000.crc
│ ├── _SUCCESS
│ └── part-r-00000
├── pom.xml
└── src
│ ├── main
│ ├── examples
│ │ ├── ConfigurationPrinterSystem.java.input.txt
│ │ ├── ConfigurationPrinterWithConf.java.input.txt
│ │ ├── ConfigurationPrinterWithConf.java.output.txt
│ │ ├── ConfigurationPrinterWithConfAndD.java.input.txt
│ │ ├── ConfigurationPrinterWithD.java.input.txt
│ │ ├── ConfigurationPrinterWithD.java.output.txt
│ │ ├── MaxTemperatureDriver.java.input.txt
│ │ ├── MaxTemperatureDriverV2.ignore
│ │ │ └── input.txt
│ │ ├── MaxTemperatureDriverV2GOP.ignore
│ │ │ └── input.txt
│ │ ├── MaxTemperatureDriverV3
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ └── MaxTemperatureDriverV3GOP
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ └── part-r-00000
│ ├── java
│ │ ├── ConfigurationPrinter.java
│ │ ├── LoggingDriver.java
│ │ ├── LoggingIdentityMapper.java
│ │ ├── v1
│ │ │ ├── MaxTemperatureMapper.java
│ │ │ └── MaxTemperatureReducer.java
│ │ ├── v2
│ │ │ ├── MaxTemperatureDriver.java
│ │ │ ├── MaxTemperatureMapper.java
│ │ │ └── NcdcRecordParser.java
│ │ ├── v3
│ │ │ ├── MaxTemperatureDriver.java
│ │ │ └── MaxTemperatureMapper.java
│ │ └── v4
│ │ │ ├── MaxTemperatureDriver.java
│ │ │ ├── MaxTemperatureMapper.java
│ │ │ └── NcdcRecordParser.java
│ └── resources
│ │ ├── configuration-1.xml
│ │ ├── configuration-2.xml
│ │ ├── max-temp-workflow.properties
│ │ └── max-temp-workflow
│ │ └── workflow.xml
│ └── test
│ ├── java
│ ├── MultipleResourceConfigurationTest.java
│ ├── SingleResourceConfigurationTest.java
│ ├── v1
│ │ ├── MaxTemperatureMapperTest.java
│ │ └── MaxTemperatureReducerTest.java
│ ├── v2
│ │ ├── MaxTemperatureDriverMiniTest.java
│ │ ├── MaxTemperatureDriverTest.java
│ │ └── MaxTemperatureMapperTest.java
│ └── v4
│ │ └── MaxTemperatureMapperTest.java
│ └── resources
│ └── expected.txt
├── ch08-mr-types
├── pom.xml
└── src
│ ├── main
│ ├── examples
│ │ ├── MaxTemperatureWithMultipleInputs
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ ├── MinimalMapReduce.java.input.txt
│ │ ├── MinimalMapReduce
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-00000
│ │ ├── MinimalMapReduceWithDefaults
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-00000
│ │ ├── PartitionByStationUsingMultipleOutputFormat.java.input.txt
│ │ ├── PartitionByStationUsingMultipleOutputs
│ │ │ ├── 2
│ │ │ │ ├── input.txt
│ │ │ │ └── output
│ │ │ │ │ ├── 029070-99999-r-00000
│ │ │ │ │ ├── 029500-99999-r-00000
│ │ │ │ │ ├── 029600-99999-r-00000
│ │ │ │ │ ├── 029720-99999-r-00000
│ │ │ │ │ ├── 029810-99999-r-00000
│ │ │ │ │ ├── 227070-99999-r-00000
│ │ │ │ │ └── part-r-00000
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ ├── 029070-99999-r-00000
│ │ │ │ ├── 029500-99999-r-00000
│ │ │ │ ├── 029600-99999-r-00000
│ │ │ │ ├── 029720-99999-r-00000
│ │ │ │ ├── 029810-99999-r-00000
│ │ │ │ ├── 227070-99999-r-00000
│ │ │ │ └── part-r-00000
│ │ ├── PartitionByStationYearUsingMultipleOutputs
│ │ │ ├── 2
│ │ │ │ ├── input.txt
│ │ │ │ └── output
│ │ │ │ │ ├── 029070-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ ├── 029500-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ ├── 029600-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ ├── 029720-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ ├── 029810-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ ├── 227070-99999
│ │ │ │ │ │ ├── 1901
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ │ └── 1902
│ │ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── part-r-00000
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ ├── 029070-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ ├── 029500-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ ├── 029600-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ ├── 029720-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ ├── 029810-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ ├── 227070-99999
│ │ │ │ │ ├── 1901
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ │ └── 1902
│ │ │ │ │ │ └── part-r-00000
│ │ │ │ └── part-r-00000
│ │ ├── SmallFilesToSequenceFileConverter.ignore
│ │ │ └── input.txt
│ │ ├── SmallFilesToSequenceFileConverter.java.input.txt
│ │ ├── default_streaming.input.txt
│ │ └── minimal_streaming.input.txt
│ ├── java
│ │ ├── MaxTemperatureWithMultipleInputs.java
│ │ ├── MinimalMapReduce.java
│ │ ├── MinimalMapReduceWithDefaults.java
│ │ ├── NonSplittableTextInputFormat.java
│ │ ├── PartitionByStationUsingMultipleOutputs.java
│ │ ├── PartitionByStationYearUsingMultipleOutputs.java
│ │ ├── SmallFilesToSequenceFileConverter.java
│ │ ├── StationPartitioner.java
│ │ ├── WholeFileInputFormat.java
│ │ ├── WholeFileRecordReader.java
│ │ └── oldapi
│ │ │ ├── MaxTemperatureWithMultipleInputs.java
│ │ │ ├── MinimalMapReduce.java
│ │ │ ├── MinimalMapReduceWithDefaults.java
│ │ │ ├── NonSplittableTextInputFormat.java
│ │ │ ├── PartitionByStationUsingMultipleOutputFormat.java
│ │ │ ├── PartitionByStationUsingMultipleOutputs.java
│ │ │ ├── PartitionByStationYearUsingMultipleOutputFormat.java
│ │ │ ├── SmallFilesToSequenceFileConverter.java
│ │ │ ├── StationPartitioner.java
│ │ │ ├── WholeFileInputFormat.java
│ │ │ └── WholeFileRecordReader.java
│ └── sh
│ │ └── streaming.sh
│ └── test
│ └── java
│ └── TextInputFormatsTest.java
├── ch09-mr-features
├── pom.xml
└── src
│ ├── main
│ ├── examples
│ │ ├── JoinRecordWithStationName
│ │ │ ├── 2
│ │ │ │ ├── input.txt
│ │ │ │ └── output
│ │ │ │ │ └── part-r-00000
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ ├── LookupRecordByTemperature.java.input.txt
│ │ ├── LookupRecordByTemperature.java.output.txt
│ │ ├── LookupRecordsByTemperature.java.input.txt
│ │ ├── LookupRecordsByTemperature.java.output.txt
│ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java.input.txt
│ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.ignore
│ │ │ └── input.txt
│ │ ├── MaxTemperatureUsingSecondarySort
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ ├── MaxTemperatureWithCounters.java.input.txt
│ │ ├── MaxTemperatureWithCounters
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ └── part-r-00000
│ │ ├── MissingTemperatureFields.java.input.txt
│ │ ├── SortByTemperatureToMapFile.ignore
│ │ │ └── input.txt
│ │ ├── SortByTemperatureUsingHashPartitioner.ignore
│ │ │ └── input.txt
│ │ ├── SortByTemperatureUsingHashPartitioner.java.input.txt
│ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java.input.txt
│ │ ├── SortDataPreprocessor.ignore
│ │ │ └── input.txt
│ │ └── SortDataPreprocessor.java.input.txt
│ ├── java
│ │ ├── JoinRecordMapper.java
│ │ ├── JoinRecordWithStationName.java
│ │ ├── JoinReducer.java
│ │ ├── JoinStationMapper.java
│ │ ├── LookupRecordByTemperature.java
│ │ ├── LookupRecordsByTemperature.java
│ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java
│ │ ├── MaxTemperatureUsingSecondarySort.java
│ │ ├── MaxTemperatureWithCounters.java
│ │ ├── MissingTemperatureFields.java
│ │ ├── SortByTemperatureToMapFile.java
│ │ ├── SortByTemperatureUsingHashPartitioner.java
│ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java
│ │ ├── SortDataPreprocessor.java
│ │ ├── TemperatureDistribution.java
│ │ └── oldapi
│ │ │ ├── JoinRecordMapper.java
│ │ │ ├── JoinRecordWithStationName.java
│ │ │ ├── JoinReducer.java
│ │ │ ├── JoinStationMapper.java
│ │ │ ├── LookupRecordByTemperature.java
│ │ │ ├── LookupRecordsByTemperature.java
│ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java
│ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.java
│ │ │ ├── MaxTemperatureUsingSecondarySort.java
│ │ │ ├── MaxTemperatureWithCounters.java
│ │ │ ├── MissingTemperatureFields.java
│ │ │ ├── SortByTemperatureToMapFile.java
│ │ │ ├── SortByTemperatureUsingHashPartitioner.java
│ │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java
│ │ │ ├── SortDataPreprocessor.java
│ │ │ └── TemperatureDistribution.java
│ ├── python
│ │ ├── max_daily_temp_map.py
│ │ ├── max_daily_temp_reduce.py
│ │ ├── mean_max_daily_temp.sh
│ │ ├── mean_max_daily_temp_map.py
│ │ ├── mean_max_daily_temp_reduce.py
│ │ ├── secondary_sort.sh
│ │ ├── secondary_sort_map.py
│ │ └── secondary_sort_reduce.py
│ ├── r
│ │ ├── fixed-partitions
│ │ ├── output
│ │ ├── output_sorted
│ │ ├── sampled-partitions
│ │ ├── temperature_distribution.png
│ │ └── temperature_distribution.r
│ └── resources
│ │ ├── MaxTemperatureWithCounters_Temperature.properties
│ │ └── oldapi
│ │ └── MaxTemperatureWithCounters_Temperature.properties
│ └── test
│ └── java
│ └── KeyFieldBasedComparatorTest.java
├── ch10-setup
└── src
│ └── main
│ ├── conf
│ ├── core-site.xml
│ ├── hdfs-site.xml
│ └── yarn-site.xml
│ └── sh
│ └── trash.sh
├── ch12-avro
├── pom.xml
└── src
│ ├── main
│ ├── assembly
│ │ └── job.xml
│ ├── c
│ │ └── dump_pairs.c
│ ├── examples
│ │ ├── AvroGenericMaxTemperature
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-r-00000.avro.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-r-00000.avro
│ │ └── AvroSort
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-r-00000.avro.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-r-00000.avro
│ ├── java
│ │ ├── AvroGenericMaxTemperature.java
│ │ ├── AvroSort.java
│ │ ├── NcdcRecordParser.java
│ │ └── oldapi
│ │ │ ├── AvroGenericMaxTemperature.java
│ │ │ ├── AvroProjection.java
│ │ │ ├── AvroSort.java
│ │ │ ├── AvroSpecificMaxTemperature.java
│ │ │ └── NcdcRecordParser.java
│ ├── py
│ │ ├── test_avro.py
│ │ └── write_pairs.py
│ └── resources
│ │ ├── AliasedStringPair.avsc
│ │ ├── Array.avsc
│ │ ├── Enum.avsc
│ │ ├── Fixed.avsc
│ │ ├── Map.avsc
│ │ ├── NewStringPair.avsc
│ │ ├── NewStringPairWithNull.avsc
│ │ ├── ProjectedStringPair.avsc
│ │ ├── SortedStringPair.avsc
│ │ ├── StringPair.avsc
│ │ ├── SwitchedStringPair.avsc
│ │ ├── Union.avsc
│ │ └── WeatherRecord.avsc
│ └── test
│ └── java
│ └── AvroTest.java
├── ch13-parquet
├── pom.xml
└── src
│ ├── main
│ ├── assembly
│ │ └── job.xml
│ ├── examples
│ │ └── TextToParquetWithAvro
│ │ │ ├── input.txt
│ │ │ └── output
│ │ │ ├── _SUCCESS
│ │ │ ├── _metadata
│ │ │ └── part-m-00000.parquet
│ └── java
│ │ ├── ParquetToTextWithAvro.java
│ │ ├── ParquetToTextWithExample.java
│ │ ├── TextToParquetWithAvro.java
│ │ └── TextToParquetWithExample.java
│ └── test
│ ├── java
│ ├── ParquetMRWithAvroTest.java
│ ├── ParquetMRWithExampleTest.java
│ └── ParquetTest.java
│ └── resources
│ ├── NewStringPair.avsc
│ ├── ProjectedStringPair.avsc
│ ├── StringPair.avsc
│ └── fruit.txt
├── ch14-flume
├── spool-to-hdfs-and-logger.properties
├── spool-to-hdfs-avro.properties
├── spool-to-hdfs-partitioned.properties
├── spool-to-hdfs-tiered-load-balance.properties
├── spool-to-hdfs-tiered.properties
├── spool-to-hdfs.properties
└── spool-to-logger.properties
├── ch15-sqoop
├── pom.xml
├── src
│ └── main
│ │ └── java
│ │ ├── MaxWidgetId.java
│ │ ├── MaxWidgetIdGenericAvro.java
│ │ └── Widget.java
└── widgets
│ └── part-m-00000.avro
├── ch16-pig
├── pom.xml
└── src
│ ├── main
│ ├── grunt
│ │ ├── combine.grunt
│ │ ├── disambiguate.grunt
│ │ ├── flatten.grunt
│ │ ├── foreach.grunt
│ │ ├── group.grunt
│ │ ├── join.grunt
│ │ ├── max_temp.grunt
│ │ ├── missing.grunt
│ │ ├── multiquery.grunt
│ │ ├── null.grunt
│ │ ├── schema.grunt
│ │ ├── set.grunt
│ │ ├── sort.grunt
│ │ ├── store.grunt
│ │ ├── stream.grunt
│ │ ├── tuples.grunt
│ │ ├── types.grunt
│ │ └── udfs.grunt
│ ├── java
│ │ └── com
│ │ │ └── hadoopbook
│ │ │ └── pig
│ │ │ ├── CutLoadFunc.java
│ │ │ ├── IsGoodQuality.java
│ │ │ ├── Range.java
│ │ │ └── Trim.java
│ ├── pig
│ │ ├── comment_c-style.pig
│ │ ├── comment_single_line.pig
│ │ ├── max_temp.macro
│ │ ├── max_temp.pig
│ │ ├── max_temp_filter_stream.pig
│ │ ├── max_temp_filter_udf.pig
│ │ ├── max_temp_macro.pig
│ │ ├── max_temp_macro_import.pig
│ │ ├── max_temp_param.param
│ │ ├── max_temp_param.pig
│ │ ├── max_temp_station_name.pig
│ │ └── year_stats.pig
│ └── python
│ │ └── is_good_quality.py
│ └── test
│ └── java
│ └── com
│ └── hadoopbook
│ └── pig
│ ├── IsGoodQualityTest.java
│ └── RangeTest.java
├── ch17-hive
├── pom.xml
└── src
│ └── main
│ ├── hive
│ ├── buckets.hive
│ ├── conversions.hive
│ ├── indexes.hive
│ ├── joins.hive
│ ├── mapreduce.hive
│ ├── max_temp.hive
│ ├── multitable_insert.hive
│ ├── partitions.hive
│ ├── regex_serde.hive
│ ├── set.hive
│ ├── sort.hive
│ ├── storage.hive
│ ├── types.hive
│ └── udfs.hive
│ ├── java
│ └── com
│ │ └── hadoopbook
│ │ └── hive
│ │ ├── Maximum.java
│ │ ├── Mean.java
│ │ └── Strip.java
│ └── python
│ ├── is_good_quality.py
│ └── max_temperature_reduce.py
├── ch18-crunch
├── pom.xml
└── src
│ ├── main
│ ├── assembly
│ │ └── hadoop-job.xml
│ └── java
│ │ └── crunch
│ │ ├── AvroGenericMaxTemperatureCrunch.java
│ │ ├── JoinRecordWithStationNameCrunch.java
│ │ ├── MaxTemperatureByStationNameCrunch.java
│ │ ├── MaxTemperatureCrunch.java
│ │ ├── MaxTemperatureCrunchWithShutdownHook.java
│ │ ├── MaxTemperatureUsingSecondarySortCrunch.java
│ │ ├── MaxTemperatureWithCompressionCrunch.java
│ │ ├── MaxTemperatureWithCountersCrunch.java
│ │ ├── MaxTemperatureWithMultipleInputsCrunch.java
│ │ ├── MetOfficeRecordParser.java
│ │ ├── NcdcRecordParser.java
│ │ ├── NcdcStationMetadataParser.java
│ │ ├── SortByTemperatureCrunch.java
│ │ └── SplitCrunch.java
│ └── test
│ ├── java
│ └── crunch
│ │ ├── CheckpointTest.java
│ │ ├── CountValuesFn.java
│ │ ├── CustomDoFn.java
│ │ ├── InversePairFn.java
│ │ ├── JoinTest.java
│ │ ├── MaterializeTest.java
│ │ ├── NonSerializableOuterClass.java
│ │ ├── ObjectReuseTest.java
│ │ ├── PCollections.java
│ │ ├── PageRankTest.java
│ │ ├── PipelineDebugTest.java
│ │ ├── PipelineExecutionTest.java
│ │ ├── PrimitiveOperationsTest.java
│ │ ├── SerializableFunctionsTest.java
│ │ ├── SortTest.java
│ │ ├── SourcesAndTargetsTest.java
│ │ ├── ToLowerFn.java
│ │ ├── TypesTest.java
│ │ └── WeatherRecord.java
│ └── resources
│ ├── A
│ ├── B
│ ├── fruit.txt
│ ├── ints.txt
│ ├── log4j.properties
│ ├── numbers.seq
│ ├── sample.txt
│ ├── set1.txt
│ ├── set2.txt
│ └── urls.txt
├── ch19-spark
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── MaxTemperatureSpark.java
│ ├── python
│ │ └── MaxTemperature.py
│ └── scala
│ │ ├── MaxTemperature.scala
│ │ └── MaxTemperatureWithPlacement.scala
│ └── test
│ ├── avro
│ ├── IntWrapper.avsc
│ └── WeatherRecord.avsc
│ ├── java
│ └── SimpleTest.java
│ ├── resources
│ ├── fruit.txt
│ ├── log4j.properties
│ ├── numbers.seq
│ ├── quangle.txt
│ └── set2.txt
│ └── scala
│ ├── CustomKryoRegistrator.scala
│ ├── DataSerializationTest.scala
│ ├── FunctionSerializationTest.scala
│ ├── RDDCreationTest.scala
│ ├── ReflectWeatherRecord.scala
│ ├── SharedDataTest.scala
│ ├── TransformationsAndActionsTest.scala
│ └── WordCountHistogramTest.scala
├── ch20-hbase
├── pom.xml
└── src
│ └── main
│ └── java
│ ├── ExampleClient.java
│ ├── HBaseStationImporter.java
│ ├── HBaseStationQuery.java
│ ├── HBaseTemperatureBulkImporter.java
│ ├── HBaseTemperatureDirectImporter.java
│ ├── HBaseTemperatureImporter.java
│ ├── HBaseTemperatureQuery.java
│ ├── NewExampleClient.java
│ ├── NewHBaseStationImporter.java
│ ├── NewHBaseStationQuery.java
│ ├── NewHBaseTemperatureQuery.java
│ ├── RowKeyConverter.java
│ └── SimpleRowCounter.java
├── ch21-zk
├── pom.xml
└── src
│ └── main
│ ├── java
│ ├── ActiveKeyValueStore.java
│ ├── ConfigUpdater.java
│ ├── ConfigWatcher.java
│ ├── ConnectionWatcher.java
│ ├── CreateGroup.java
│ ├── DeleteGroup.java
│ ├── JoinGroup.java
│ ├── ListGroup.java
│ ├── ResilientActiveKeyValueStore.java
│ └── ResilientConfigUpdater.java
│ └── sh
│ └── group.sh
├── ch22-case-studies
├── pom.xml
└── src
│ └── main
│ └── java
│ ├── TrackStats.jr
│ └── fm
│ └── last
│ └── hadoop
│ ├── io
│ └── records
│ │ └── TrackStats.java
│ └── programs
│ └── labs
│ └── trackstats
│ └── TrackStatisticsProgram.java
├── common
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ ├── JobBuilder.java
│ │ ├── MetOfficeRecordParser.java
│ │ ├── NcdcRecordParser.java
│ │ ├── NcdcStationMetadata.java
│ │ ├── NcdcStationMetadataParser.java
│ │ └── oldapi
│ │ ├── JobBuilder.java
│ │ ├── MetOfficeRecordParser.java
│ │ ├── NcdcRecordParser.java
│ │ ├── NcdcStationMetadata.java
│ │ └── NcdcStationMetadataParser.java
│ └── test
│ └── java
│ ├── MetOfficeRecordParserTest.java
│ ├── NcdcRecordParserTest.java
│ └── NcdcStationMetadataParserTest.java
├── conf
├── hadoop-cluster.template.xml
├── hadoop-local.xml
├── hadoop-localhost.xml
├── hadoop
│ └── pseudo-distributed
│ │ ├── core-site.xml
│ │ ├── hdfs-site.xml
│ │ ├── mapred-site.xml
│ │ └── yarn-site.xml
├── pig
│ └── localhost
│ │ └── pig.properties
└── zookeeper
│ ├── cluster
│ └── zoo.cfg
│ ├── localhost
│ └── zoo.cfg
│ └── log4j.properties
├── hadoop-examples
└── pom.xml
├── hadoop-meta
└── pom.xml
├── input
├── avro
│ └── pairs.avro
├── badrecords
│ ├── a
│ ├── b
│ └── c
├── docs
│ ├── 1400-8.txt
│ └── quangle.txt
├── fileglob
│ ├── 2007
│ │ └── 12
│ │ │ ├── 30
│ │ │ ├── data-2007-12-30
│ │ │ └── data[2007-12-30]
│ │ │ └── 31
│ │ │ └── data-2007-12-31
│ └── 2008
│ │ └── 01
│ │ └── 01
│ │ └── data-2008-01-01
├── fileinput
│ ├── a
│ └── dir
│ │ └── b
├── hive
│ ├── README
│ ├── dummy.txt
│ ├── joins
│ │ ├── sales.txt
│ │ └── things.txt
│ ├── partitions
│ │ ├── file1
│ │ ├── file2
│ │ ├── file3
│ │ ├── file4
│ │ ├── file5
│ │ └── file6
│ ├── tables
│ │ ├── users.txt
│ │ └── users_extended.txt
│ ├── tmp.txt
│ ├── types
│ │ ├── complex.txt
│ │ └── nested.txt
│ └── udfs
│ │ ├── arrays.txt
│ │ ├── fruit.txt
│ │ ├── max1.txt
│ │ └── max2.txt
├── metoffice
│ ├── aberporthdata.txt
│ ├── armaghdata.txt
│ ├── bradforddata.txt
│ ├── braemardata.txt
│ ├── cambridgedata.txt
│ ├── cardiffdata.txt
│ ├── durhamdata.txt
│ ├── eastbournedata.txt
│ ├── greenwichdata.txt
│ ├── hurndata.txt
│ ├── lerwickdata.txt
│ ├── leucharsdata.txt
│ ├── newtonriggdata.txt
│ ├── oxforddata.txt
│ ├── paisleydata.txt
│ ├── ringwaydata.txt
│ ├── rossonwyedata.txt
│ ├── shawburydata.txt
│ ├── sheffielddata.txt
│ ├── southamptondata.txt
│ ├── stmawgandata.txt
│ ├── stornowaydata.txt
│ ├── suttonbonningtondata.txt
│ ├── tireedata.txt
│ ├── valleydata.txt
│ └── yeoviltondata.txt
├── ncdc
│ ├── all
│ │ ├── 1901.gz
│ │ └── 1902.gz
│ ├── metadata
│ │ ├── ish-history.txt
│ │ └── stations-fixed-width.txt
│ ├── micro-tab
│ │ ├── sample.txt
│ │ ├── sample2.txt
│ │ └── sample_corrupt.txt
│ ├── micro
│ │ └── sample.txt
│ ├── sample.txt
│ └── sample.txt.gz
├── pig
│ ├── combine
│ │ ├── A
│ │ └── B
│ ├── corrupt
│ │ └── missing_fields
│ ├── foreach
│ │ └── A
│ ├── group
│ │ └── A
│ ├── join
│ │ ├── A
│ │ └── B
│ ├── multiquery
│ │ └── A
│ ├── nested
│ │ ├── A
│ │ └── B
│ ├── pairwise
│ │ └── postings
│ ├── schema
│ │ └── A
│ ├── sort
│ │ └── A
│ ├── tuples
│ │ └── A
│ ├── types
│ │ ├── A
│ │ ├── B
│ │ ├── C
│ │ └── one
│ └── udfs
│ │ └── A
├── smallfiles
│ ├── a
│ ├── b
│ ├── c
│ ├── d
│ ├── e
│ └── f
└── wikipedia
│ ├── example.xml
│ └── sample.xml
├── pom.xml
└── snippet
├── README
├── bin
├── check_expected.sh
├── check_manuscript.py
├── check_manuscript.sh
├── generate_listings.sh
├── grunter.sh
├── hiver.sh
├── phragmite_db.pl
├── phragmite_hive.py
└── phragmite_pig.py
├── conf
├── local
│ ├── capacity-scheduler.xml
│ ├── configuration.xsl
│ ├── container-executor.cfg
│ ├── core-site.xml
│ ├── hadoop-env.cmd
│ ├── hadoop-env.sh
│ ├── hadoop-metrics.properties
│ ├── hadoop-metrics2.properties
│ ├── hadoop-policy.xml
│ ├── hdfs-site.xml
│ ├── httpfs-env.sh
│ ├── httpfs-log4j.properties
│ ├── httpfs-signature.secret
│ ├── httpfs-site.xml
│ ├── log4j.properties
│ ├── mapred-env.cmd
│ ├── mapred-env.sh
│ ├── mapred-queues.xml.template
│ ├── mapred-site.xml
│ ├── mapred-site.xml.template
│ ├── slaves
│ ├── ssl-client.xml.example
│ ├── ssl-server.xml.example
│ ├── yarn-env.cmd
│ ├── yarn-env.sh
│ └── yarn-site.xml
└── pseudo
│ ├── capacity-scheduler.xml
│ ├── capacity-scheduler.xml.old
│ ├── configuration.xsl
│ ├── container-executor.cfg
│ ├── core-site.xml
│ ├── fair-scheduler.xml
│ ├── hadoop-env.cmd
│ ├── hadoop-env.sh
│ ├── hadoop-metrics.properties
│ ├── hadoop-metrics2.properties
│ ├── hadoop-policy.xml
│ ├── hdfs-site.xml
│ ├── httpfs-env.sh
│ ├── httpfs-log4j.properties
│ ├── httpfs-signature.secret
│ ├── httpfs-site.xml
│ ├── log4j.properties
│ ├── mapred-env.cmd
│ ├── mapred-env.sh
│ ├── mapred-queues.xml.template
│ ├── mapred-site.xml
│ ├── mapred-site.xml.template
│ ├── slaves
│ ├── ssl-client.xml.example
│ ├── ssl-server.xml.example
│ ├── yarn-env.cmd
│ ├── yarn-env.sh
│ └── yarn-site.xml
├── expected
└── ch11
│ └── grunt
│ ├── combine_schema.xml
│ ├── combine_union.xml
│ ├── foreach_generate.xml
│ ├── group_all.xml
│ ├── group_dump.xml
│ ├── group_expression.xml
│ ├── join_cogroup.xml
│ ├── join_cogroup_flatten.xml
│ ├── join_cogroup_inner.xml
│ ├── join_cogroup_join.xml
│ ├── join_dump.xml
│ ├── join_frj.xml
│ ├── join_join.xml
│ ├── max_temp_describe_records.xml
│ ├── max_temp_dump_grouped_records.xml
│ ├── max_temp_dump_records.xml
│ ├── max_temp_filter_records.xml
│ ├── max_temp_load.xml
│ ├── max_temp_max_temp.xml
│ ├── max_temp_result.xml
│ ├── missing_fields.xml
│ ├── null_corrupt.xml
│ ├── null_count.xml
│ ├── null_dump.xml
│ ├── null_split.xml
│ ├── null_undetected.xml
│ ├── schema_absent.xml
│ ├── schema_absent_projected.xml
│ ├── schema_names_only.xml
│ ├── schema_one_type_only.xml
│ ├── schema_types.xml
│ ├── set_debug_on.xml
│ ├── sort_dump.xml
│ ├── sort_limit.xml
│ ├── sort_no_order.xml
│ ├── sort_order.xml
│ ├── store_colon_delimited.xml
│ ├── stream_cut.xml
│ ├── udfs_invoke_long.xml
│ ├── udfs_invoke_short.xml
│ ├── udfs_load.xml
│ ├── udfs_register.xml
│ └── udfs_schema.xml
├── pom.xml
└── src
└── test
├── java
└── ExamplesIT.java
└── resources
├── copyoutput.sh
└── setup.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | /*.jar
2 | *.log
3 | /build
4 | /lib
5 | /out
6 | /output*
7 | ch*/maxwidget
8 | snippet/actual
9 | target
10 | /target
11 | .classpath
12 | .project
13 | .pydevproject
14 | .settings
15 | metastore_db
16 |
--------------------------------------------------------------------------------
/appc/src/main/sh/create_ncdc_files.sh:
--------------------------------------------------------------------------------
1 | for ((i=1901;i<=2000;i+=1))
2 | do
3 | echo s3n://hadoopbook/ncdc/raw/isd-$i.tar.bz2
4 | done
5 |
--------------------------------------------------------------------------------
/appc/src/main/sh/load_ncdc.sh:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -D mapred.reduce.tasks=0 \
3 | -D mapred.map.tasks.speculative.execution=false \
4 | -D mapred.task.timeout=12000000 \
5 | -input ncdc_files.txt \
6 | -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat \
7 | -output output \
8 | -mapper load_ncdc_map.sh \
9 | -file load_ncdc_map.sh
10 |
11 |
--------------------------------------------------------------------------------
/appc/src/main/sh/load_ncdc_map.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # NLineInputFormat gives a single line: key is offset, value is S3 URI
4 | read offset s3file
5 |
6 | # Retrieve file from S3 to local disk
7 | echo "reporter:status:Retrieving $s3file" >&2
8 | $HADOOP_INSTALL/bin/hadoop fs -get $s3file .
9 |
10 | # Un-bzip and un-tar the local file
11 | target=`basename $s3file .tar.bz2`
12 | mkdir -p $target
13 | echo "reporter:status:Un-tarring $s3file to $target" >&2
14 | tar jxf `basename $s3file` -C $target
15 |
16 | # Un-gzip each station file and concat into one file
17 | echo "reporter:status:Un-gzipping $target" >&2
18 | for file in $target/*/*
19 | do
20 | gunzip -c $file >> $target.all
21 | echo "reporter:status:Processed $file" >&2
22 | done
23 |
24 | # Put gzipped version into HDFS
25 | echo "reporter:status:Gzipping $target and putting in HDFS" >&2
26 | gzip -c $target.all | $HADOOP_INSTALL/bin/hadoop fs -put - gz/$target.gz
--------------------------------------------------------------------------------
/book/src/main/assembly/jar.xml:
--------------------------------------------------------------------------------
1 | <assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
2 |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |   xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
4 |   <id>jar</id>
5 |   <formats>
6 |     <format>jar</format>
7 |   </formats>
8 |   <includeBaseDirectory>false</includeBaseDirectory>
9 |   <dependencySets>
10 |     <dependencySet>
11 |       <outputDirectory>/</outputDirectory>
12 |       <unpack>true</unpack>
13 |       <useTransitiveDependencies>true</useTransitiveDependencies>
14 |       <scope>runtime</scope>
15 |       <useProjectArtifact>false</useProjectArtifact>
16 |       <includes>
17 |         <include>com.hadoopbook:*</include>
18 |       </includes>
19 |     </dependencySet>
20 |   </dependencySets>
21 |   <fileSets>
22 |     <fileSet>
23 |       <directory>target/classes</directory>
24 |       <outputDirectory>/</outputDirectory>
25 |     </fileSet>
26 |   </fileSets>
27 | </assembly>
--------------------------------------------------------------------------------
/book/src/main/assembly/oozie-workflow-application.xml:
--------------------------------------------------------------------------------
1 | <assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
2 |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |   xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
4 |   <id>oozie-workflow-application</id>
5 |   <formats>
6 |     <format>dir</format>
7 |   </formats>
8 |   <includeBaseDirectory>false</includeBaseDirectory>
9 |   <fileSets>
10 |     <fileSet>
11 |       <directory>../ch06-mr-dev/src/main/resources/max-temp-workflow</directory>
12 |       <outputDirectory>max-temp-workflow</outputDirectory>
13 |     </fileSet>
14 |   </fileSets>
15 |   <files>
16 |     <file>
17 |       <source>../hadoop-examples.jar</source>
18 |       <outputDirectory>max-temp-workflow/lib</outputDirectory>
19 |     </file>
20 |   </files>
21 | </assembly>
--------------------------------------------------------------------------------
/ch02-mr-intro/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <parent>
5 |     <groupId>com.hadoopbook</groupId>
6 |     <artifactId>hadoop-meta</artifactId>
7 |     <version>4.0</version>
8 |     <relativePath>../hadoop-meta/pom.xml</relativePath>
9 |   </parent>
10 |   <groupId>com.hadoopbook</groupId>
11 |   <artifactId>ch02-mr-intro</artifactId>
12 |   <packaging>jar</packaging>
13 |   <version>4.0</version>
14 |   <name>Chapter 2: MapReduce</name>
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>junit</groupId>
18 |       <artifactId>junit</artifactId>
19 |     </dependency>
20 |   </dependencies>
21 | </project>
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/awk/max_temperature.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | for year in all/*
3 | do
4 | echo -ne `basename $year .gz`"\t"
5 | gunzip -c $year | \
6 | awk '{ temp = substr($0, 88, 5) + 0;
7 | q = substr($0, 93, 1);
8 | if (temp !=9999 && q ~ /[01459]/ && temp > max) max = temp }
9 | END { print max }'
10 | done
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/cpp/Makefile:
--------------------------------------------------------------------------------
1 | CC = g++
2 | CPPFLAGS = -m32 -I$(HADOOP_INSTALL)/c++/$(PLATFORM)/include
3 |
4 | max_temperature: max_temperature.cpp
5 | $(CC) $(CPPFLAGS) $< -Wall -L$(HADOOP_INSTALL)/c++/$(PLATFORM)/lib -lhadooppipes \
6 | -lhadooputils -lpthread -g -O2 -o $@
7 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperature/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MaxTemperature input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperature/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MaxTemperatureWithCombiner input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/input.txt:
--------------------------------------------------------------------------------
1 | hadoop OldMaxTemperature input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/output/part-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature.cpp.input.txt:
--------------------------------------------------------------------------------
1 | hadoop pipes \
2 | -D hadoop.pipes.java.recordreader=true \
3 | -D hadoop.pipes.java.recordwriter=true \
4 | -input sample.txt \
5 | -output output \
6 | -program bin/max_temperature
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -input input/ncdc/sample.txt \
3 | -output output \
4 | -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
5 | -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop_cluster.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -files ch02-mr-intro/src/main/ruby/max_temperature_map.rb,\
3 | ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
4 | -input input/ncdc/all \
5 | -output output \
6 | -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
7 | -combiner ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
8 | -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
9 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -input input/ncdc/sample.txt \
3 | -output output \
4 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \
5 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/output/part-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -input input/ncdc/sample.txt \
3 | -output output \
4 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \
5 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -files ch02-mr-intro/src/main/python/max_temperature_map.py,\
3 | ch02-mr-intro/src/main/python/max_temperature_reduce.py \
4 | -input input/ncdc/sample.txt \
5 | -output output \
6 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \
7 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py
8 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/output/part-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/java/MaxTemperatureReducer.java:
--------------------------------------------------------------------------------
1 | // cc MaxTemperatureReducer Reducer for maximum temperature example
2 | // vv MaxTemperatureReducer
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Reducer;
8 |
9 | public class MaxTemperatureReducer
10 |     extends Reducer<Text, IntWritable, Text, IntWritable> {
11 |
12 | @Override
13 |   public void reduce(Text key, Iterable<IntWritable> values,
14 | Context context)
15 | throws IOException, InterruptedException {
16 |
17 | int maxValue = Integer.MIN_VALUE;
18 | for (IntWritable value : values) {
19 | maxValue = Math.max(maxValue, value.get());
20 | }
21 | context.write(key, new IntWritable(maxValue));
22 | }
23 | }
24 | // ^^ MaxTemperatureReducer
25 |
--------------------------------------------------------------------------------
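
Note: the reducer above consumes the (Text, IntWritable) pairs emitted by its companion mapper (MaxTemperatureMapper.java in the tree above, not reproduced in this section). For reference, a minimal sketch of a matching new-API mapper; the class name is illustrative, and the NCDC field offsets follow the Ruby and Python mappers later in this chapter:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch: output types (Text, IntWritable) must match the reducer's input types.
public class MaxTemperatureMapperSketch
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final int MISSING = 9999;

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String line = value.toString();
    String year = line.substring(15, 19);
    int airTemperature;
    if (line.charAt(87) == '+') { // parseInt doesn't accept a leading plus sign
      airTemperature = Integer.parseInt(line.substring(88, 92));
    } else {
      airTemperature = Integer.parseInt(line.substring(87, 92));
    }
    String quality = line.substring(92, 93);
    if (airTemperature != MISSING && quality.matches("[01459]")) {
      context.write(new Text(year), new IntWritable(airTemperature));
    }
  }
}

The mapper's last two type parameters are the map output key/value types, so they must agree with the reducer's first two; MapReduce enforces this pairing at job setup.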
/ch02-mr-intro/src/main/java/oldapi/MaxTemperatureReducer.java:
--------------------------------------------------------------------------------
1 | package oldapi;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 |
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapred.MapReduceBase;
9 | import org.apache.hadoop.mapred.OutputCollector;
10 | import org.apache.hadoop.mapred.Reducer;
11 | import org.apache.hadoop.mapred.Reporter;
12 |
13 | public class MaxTemperatureReducer extends MapReduceBase
14 |     implements Reducer<Text, IntWritable, Text, IntWritable> {
15 |
16 |   public void reduce(Text key, Iterator<IntWritable> values,
17 |       OutputCollector<Text, IntWritable> output, Reporter reporter)
18 | throws IOException {
19 |
20 | int maxValue = Integer.MIN_VALUE;
21 | while (values.hasNext()) {
22 | maxValue = Math.max(maxValue, values.next().get());
23 | }
24 | output.collect(key, new IntWritable(maxValue));
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/python/max_temperature_map.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import sys
5 |
6 | for line in sys.stdin:
7 | val = line.strip()
8 | (year, temp, q) = (val[15:19], val[87:92], val[92:93])
9 | if (temp != "+9999" and re.match("[01459]", q)):
10 | print "%s\t%s" % (year, temp)
11 |
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/python/max_temperature_reduce.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | (last_key, max_val) = (None, -sys.maxint)
6 | for line in sys.stdin:
7 | (key, val) = line.strip().split("\t")
8 | if last_key and last_key != key:
9 | print "%s\t%s" % (last_key, max_val)
10 | (last_key, max_val) = (key, int(val))
11 | else:
12 | (last_key, max_val) = (key, max(max_val, int(val)))
13 |
14 | if last_key:
15 | print "%s\t%s" % (last_key, max_val)
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/ruby/max_temperature_map.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | STDIN.each_line do |line|
4 | val = line
5 | year, temp, q = val[15,4], val[87,5], val[92,1]
6 | puts "#{year}\t#{temp}" if (temp != "+9999" && q =~ /[01459]/)
7 | end
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | last_key, max_val = nil, -1000000
4 | STDIN.each_line do |line|
5 | key, val = line.split("\t")
6 | if last_key && last_key != key
7 | puts "#{last_key}\t#{max_val}"
8 | last_key, max_val = key, val.to_i
9 | else
10 | last_key, max_val = key, [max_val, val.to_i].max
11 | end
12 | end
13 | puts "#{last_key}\t#{max_val}" if last_key
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/sh/max_temp.sh:
--------------------------------------------------------------------------------
1 | : == max_temp_java
2 | : == max_temp_java_output
3 | : == max_temp_ruby_map
4 | : == max_temp_ruby_pipeline
5 | : == max_temp_python_pipeline
6 | rm -r /Users/tom/workspace/htdg/output
7 | : vv max_temp_java
8 | export HADOOP_CLASSPATH=build/classes
9 | hadoop MaxTemperature input/ncdc/sample.txt output
10 | : ^^ max_temp_java
11 | : vv max_temp_java_output
12 | cat output/part-00000
13 | : ^^ max_temp_java_output
14 | : vv max_temp_ruby_map
15 | cat input/ncdc/sample.txt | ch02-mr-intro/src/main/ruby/max_temperature_map.rb
16 | : ^^ max_temp_ruby_map
17 | : vv max_temp_ruby_pipeline
18 | cat input/ncdc/sample.txt | \
19 | ch02-mr-intro/src/main/ruby/max_temperature_map.rb | \
20 | sort | ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
21 | : ^^ max_temp_ruby_pipeline
22 | : vv max_temp_python_pipeline
23 | cat input/ncdc/sample.txt | \
24 | ch02-mr-intro/src/main/python/max_temperature_map.py | \
25 | sort | ch02-mr-intro/src/main/python/max_temperature_reduce.py
26 | : ^^ max_temp_python_pipeline
27 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!-- core-site.xml -->
3 | <configuration>
4 |   <property>
5 |     <name>fs.defaultFS</name>
6 |     <value>hdfs://localhost/</value>
7 |   </property>
8 | </configuration>
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!-- hdfs-site.xml -->
3 | <configuration>
4 |   <property>
5 |     <name>dfs.replication</name>
6 |     <value>1</value>
7 |   </property>
8 | </configuration>
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/java/FileSystemCat.java:
--------------------------------------------------------------------------------
1 | // cc FileSystemCat Displays files from a Hadoop filesystem on standard output by using the FileSystem directly
2 | import java.io.InputStream;
3 | import java.net.URI;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.fs.FileSystem;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.IOUtils;
9 |
10 | // vv FileSystemCat
11 | public class FileSystemCat {
12 |
13 | public static void main(String[] args) throws Exception {
14 | String uri = args[0];
15 | Configuration conf = new Configuration();
16 | FileSystem fs = FileSystem.get(URI.create(uri), conf);
17 | InputStream in = null;
18 | try {
19 | in = fs.open(new Path(uri));
20 | IOUtils.copyBytes(in, System.out, 4096, false);
21 | } finally {
22 | IOUtils.closeStream(in);
23 | }
24 | }
25 | }
26 | // ^^ FileSystemCat
27 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/java/FileSystemDoubleCat.java:
--------------------------------------------------------------------------------
1 | // cc FileSystemDoubleCat Displays files from a Hadoop filesystem on standard output twice, by using seek
2 | import java.net.URI;
3 |
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.fs.FSDataInputStream;
6 | import org.apache.hadoop.fs.FileSystem;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.IOUtils;
9 |
10 | // vv FileSystemDoubleCat
11 | public class FileSystemDoubleCat {
12 |
13 | public static void main(String[] args) throws Exception {
14 | String uri = args[0];
15 | Configuration conf = new Configuration();
16 | FileSystem fs = FileSystem.get(URI.create(uri), conf);
17 | FSDataInputStream in = null;
18 | try {
19 | in = fs.open(new Path(uri));
20 | IOUtils.copyBytes(in, System.out, 4096, false);
21 | in.seek(0); // go back to the start of the file
22 | IOUtils.copyBytes(in, System.out, 4096, false);
23 | } finally {
24 | IOUtils.closeStream(in);
25 | }
26 | }
27 | }
28 | // ^^ FileSystemDoubleCat
29 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/java/ListStatus.java:
--------------------------------------------------------------------------------
1 | // cc ListStatus Shows the file statuses for a collection of paths in a Hadoop filesystem
2 | import java.net.URI;
3 |
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.fs.FileStatus;
6 | import org.apache.hadoop.fs.FileSystem;
7 | import org.apache.hadoop.fs.FileUtil;
8 | import org.apache.hadoop.fs.Path;
9 |
10 | // vv ListStatus
11 | public class ListStatus {
12 |
13 | public static void main(String[] args) throws Exception {
14 | String uri = args[0];
15 | Configuration conf = new Configuration();
16 | FileSystem fs = FileSystem.get(URI.create(uri), conf);
17 |
18 | Path[] paths = new Path[args.length];
19 | for (int i = 0; i < paths.length; i++) {
20 | paths[i] = new Path(args[i]);
21 | }
22 |
23 | FileStatus[] status = fs.listStatus(paths);
24 | Path[] listedPaths = FileUtil.stat2Paths(status);
25 | for (Path p : listedPaths) {
26 | System.out.println(p);
27 | }
28 | }
29 | }
30 | // ^^ ListStatus
31 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/java/RegexExcludePathFilter.java:
--------------------------------------------------------------------------------
1 | // cc RegexExcludePathFilter A PathFilter for excluding paths that match a regular expression
2 | import org.apache.hadoop.fs.Path;
3 | import org.apache.hadoop.fs.PathFilter;
4 |
5 | // vv RegexExcludePathFilter
6 | public class RegexExcludePathFilter implements PathFilter {
7 |
8 | private final String regex;
9 |
10 | public RegexExcludePathFilter(String regex) {
11 | this.regex = regex;
12 | }
13 |
14 | public boolean accept(Path path) {
15 | return !path.toString().matches(regex);
16 | }
17 | }
18 | // ^^ RegexExcludePathFilter
19 |
--------------------------------------------------------------------------------
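
Note: a PathFilter like the one above is typically passed to FileSystem.globStatus() (or listStatus()) to prune the matched paths. A minimal usage sketch; the glob pattern and exclusion regex here are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobWithFilter {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Expand the glob, then drop any path matching the exclusion regex,
    // e.g. keep everything under /2007 except December 31st.
    FileStatus[] matches = fs.globStatus(new Path("/2007/*/*"),
        new RegexExcludePathFilter("^.*/2007/12/31$"));
    for (FileStatus status : matches) {
      System.out.println(status.getPath());
    }
  }
}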
/ch03-hdfs/src/main/java/RegexPathFilter.java:
--------------------------------------------------------------------------------
1 | import org.apache.hadoop.fs.Path;
2 | import org.apache.hadoop.fs.PathFilter;
3 |
4 | public class RegexPathFilter implements PathFilter {
5 |
6 | private final String regex;
7 | private final boolean include;
8 |
9 | public RegexPathFilter(String regex) {
10 | this(regex, true);
11 | }
12 |
13 | public RegexPathFilter(String regex, boolean include) {
14 | this.regex = regex;
15 | this.include = include;
16 | }
17 |
18 | public boolean accept(Path path) {
19 | return (path.toString().matches(regex)) ? include : !include;
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/java/URLCat.java:
--------------------------------------------------------------------------------
1 | // cc URLCat Displays files from a Hadoop filesystem on standard output using a URLStreamHandler
2 | import java.io.InputStream;
3 | import java.net.URL;
4 |
5 | import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
6 | import org.apache.hadoop.io.IOUtils;
7 |
8 | // vv URLCat
9 | public class URLCat {
10 |
11 | static {
12 | URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
13 | }
14 |
15 | public static void main(String[] args) throws Exception {
16 | InputStream in = null;
17 | try {
18 | in = new URL(args[0]).openStream();
19 | IOUtils.copyBytes(in, System.out, 4096, false);
20 | } finally {
21 | IOUtils.closeStream(in);
22 | }
23 | }
24 | }
25 | // ^^ URLCat
26 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/sh/file.sh:
--------------------------------------------------------------------------------
1 | : == url_cat
2 | : == filesystem_cat
3 | : == filesystem_double_cat
4 | : == list_status
5 | : == file_copy_with_progress
6 | rm -r /Users/tom/workspace/htdg/output
7 | export HADOOP_CLASSPATH=build/classes
8 | : vv url_cat
9 | hadoop URLCat hdfs://localhost/user/tom/quangle.txt
10 | : ^^ url_cat
11 | : vv filesystem_cat
12 | hadoop FileSystemCat hdfs://localhost/user/tom/quangle.txt
13 | : ^^ filesystem_cat
14 | : vv filesystem_double_cat
15 | hadoop FileSystemDoubleCat hdfs://localhost/user/tom/quangle.txt
16 | : ^^ filesystem_double_cat
17 | : vv list_status
18 | hadoop ListStatus hdfs://localhost/ hdfs://localhost/user/tom
19 | : ^^ list_status
20 | : vv file_copy_with_progress
21 | hadoop FileCopyWithProgress input/docs/1400-8.txt hdfs://localhost/user/tom/1400-8.txt
22 | : ^^ file_copy_with_progress
23 |
24 |
25 |
--------------------------------------------------------------------------------
/ch03-hdfs/src/main/sh/hars.sh:
--------------------------------------------------------------------------------
1 | : == har_ls_files
2 | : == har_create
3 | : == har_inspect
4 | : == har_ls
5 | : == har_ls_long
6 | : == har_rmr
7 | rsync -avz --exclude '.svn' /Users/tom/workspace/htdg/input/fileinput/ /tmp/fileinput
8 | hadoop fs -copyFromLocal /tmp/fileinput /my/files
9 | rm -rf /tmp/fileinput
10 | : vv har_ls_files
11 | hadoop fs -lsr /my/files
12 | : ^^ har_ls_files
13 | : vv har_create
14 | hadoop archive -archiveName files.har /my/files /my
15 | : ^^ har_create
16 | : vv har_inspect
17 | hadoop fs -ls /my
18 | hadoop fs -ls /my/files.har
19 | : ^^ har_inspect
20 | : vv har_ls
21 | hadoop fs -lsr har:///my/files.har
22 | : ^^ har_ls
23 | : vv har_ls_long
24 | hadoop fs -lsr har:///my/files.har/my/files/dir
25 | hadoop fs -lsr har://hdfs-localhost:8020/my/files.har/my/files/dir
26 | : ^^ har_ls_long
27 | : vv har_rmr
28 | hadoop fs -rmr /my/files.har
29 | : ^^ har_rmr
30 | hadoop fs -rmr /my/files
--------------------------------------------------------------------------------
/ch04-yarn/capacity-scheduler.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>yarn.scheduler.capacity.root.queues</name>
5 |     <value>prod,dev</value>
6 |   </property>
7 |   <property>
8 |     <name>yarn.scheduler.capacity.root.dev.queues</name>
9 |     <value>eng,science</value>
10 |   </property>
11 |   <property>
12 |     <name>yarn.scheduler.capacity.root.prod.capacity</name>
13 |     <value>40</value>
14 |   </property>
15 |   <property>
16 |     <name>yarn.scheduler.capacity.root.dev.capacity</name>
17 |     <value>60</value>
18 |   </property>
19 |   <property>
20 |     <name>yarn.scheduler.capacity.root.dev.maximum-capacity</name>
21 |     <value>75</value>
22 |   </property>
23 |   <property>
24 |     <name>yarn.scheduler.capacity.root.dev.eng.capacity</name>
25 |     <value>50</value>
26 |   </property>
27 |   <property>
28 |     <name>yarn.scheduler.capacity.root.dev.science.capacity</name>
29 |     <value>50</value>
30 |   </property>
31 | </configuration>
--------------------------------------------------------------------------------
/ch04-yarn/fair-scheduler.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <allocations>
3 |   <defaultQueueSchedulingPolicy>fair</defaultQueueSchedulingPolicy>
4 |
5 |   <queue name="prod">
6 |     <weight>40</weight>
7 |     <schedulingPolicy>fifo</schedulingPolicy>
8 |   </queue>
9 |
10 |   <queue name="dev">
11 |     <weight>60</weight>
12 |   </queue>
13 |
14 |   <queuePlacementPolicy>
15 |     <rule name="specified" create="false" />
16 |     <rule name="primaryGroup" create="false" />
17 |     <rule name="default" queue="dev" />
18 |   </queuePlacementPolicy>
19 | </allocations>
--------------------------------------------------------------------------------
/ch05-io/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <parent>
5 |     <groupId>com.hadoopbook</groupId>
6 |     <artifactId>hadoop-meta</artifactId>
7 |     <version>4.0</version>
8 |     <relativePath>../hadoop-meta/pom.xml</relativePath>
9 |   </parent>
10 |   <groupId>com.hadoopbook</groupId>
11 |   <artifactId>ch05-io</artifactId>
12 |   <packaging>jar</packaging>
13 |   <version>4.0</version>
14 |   <name>Chapter 5: Hadoop I/O</name>
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>com.hadoopbook</groupId>
18 |       <artifactId>ch02-mr-intro</artifactId>
19 |       <version>4.0</version>
20 |     </dependency>
21 |     <dependency>
22 |       <groupId>junit</groupId>
23 |       <artifactId>junit</artifactId>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>org.hamcrest</groupId>
27 |       <artifactId>hamcrest-all</artifactId>
28 |     </dependency>
29 |   </dependencies>
30 | </project>
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/FileDecompressor.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop FileDecompressor file.gz
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-data-head.input.txt:
--------------------------------------------------------------------------------
1 | hadoop fs -text numbers.map/data | head
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-data-head.output.txt:
--------------------------------------------------------------------------------
1 | 1 One, two, buckle my shoe
2 | 2 Three, four, shut the door
3 | 3 Five, six, pick up sticks
4 | 4 Seven, eight, lay them straight
5 | 5 Nine, ten, a big fat hen
6 | 6 One, two, buckle my shoe
7 | 7 Three, four, shut the door
8 | 8 Five, six, pick up sticks
9 | 9 Seven, eight, lay them straight
10 | 10 Nine, ten, a big fat hen
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.input.txt:
--------------------------------------------------------------------------------
1 | hadoop fs -text numbers.map/index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.output.txt:
--------------------------------------------------------------------------------
1 | 1 128
2 | 129 6079
3 | 257 12054
4 | 385 18030
5 | 513 24002
6 | 641 29976
7 | 769 35947
8 | 897 41922
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.input.txt:
--------------------------------------------------------------------------------
1 | ls -l numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.output.txt:
--------------------------------------------------------------------------------
1 | total 104
2 | -rw-r--r-- 1 tom tom 47898 Jul 29 22:06 data
3 | -rw-r--r-- 1 tom tom 251 Jul 29 22:06 index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFileWriteDemo.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop MapFileWriteDemo numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MaxTemperatureWithCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MaxTemperatureWithMapOutputCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSort.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
2 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
3 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
4 | -outKey org.apache.hadoop.io.IntWritable \
5 | -outValue org.apache.hadoop.io.Text \
6 | numbers.seq sorted
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop fs -text sorted/part-00000 | head
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.output.txt:
--------------------------------------------------------------------------------
1 | 1 Nine, ten, a big fat hen
2 | 2 Seven, eight, lay them straight
3 | 3 Five, six, pick up sticks
4 | 4 Three, four, shut the door
5 | 5 One, two, buckle my shoe
6 | 6 Nine, ten, a big fat hen
7 | 7 Seven, eight, lay them straight
8 | 8 Five, six, pick up sticks
9 | 9 Three, four, shut the door
10 | 10 One, two, buckle my shoe
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.pre.sh:
--------------------------------------------------------------------------------
1 | # Produce sorted seq file
2 | hadoop SequenceFileWriteDemo numbers.seq
3 |
4 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
5 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
6 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
7 | -outKey org.apache.hadoop.io.IntWritable \
8 | -outValue org.apache.hadoop.io.Text \
9 | numbers.seq sorted
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileReadDemo.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop SequenceFileReadDemo numbers.seq
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileReadDemo.java.output.txt:
--------------------------------------------------------------------------------
1 | [128] 100 One, two, buckle my shoe
2 | [173] 99 Three, four, shut the door
3 | [220] 98 Five, six, pick up sticks
4 | [264] 97 Seven, eight, lay them straight
5 | [314] 96 Nine, ten, a big fat hen
6 | [359] 95 One, two, buckle my shoe
7 | [404] 94 Three, four, shut the door
8 | [451] 93 Five, six, pick up sticks
9 | [495] 92 Seven, eight, lay them straight
10 | [545] 91 Nine, ten, a big fat hen
11 | [590] 90 One, two, buckle my shoe
12 | ...
13 | [1976] 60 One, two, buckle my shoe
14 | [2021*] 59 Three, four, shut the door
15 | [2088] 58 Five, six, pick up sticks
16 | [2132] 57 Seven, eight, lay them straight
17 | [2182] 56 Nine, ten, a big fat hen
18 | ...
19 | [4557] 5 One, two, buckle my shoe
20 | [4602] 4 Three, four, shut the door
21 | [4649] 3 Five, six, pick up sticks
22 | [4693] 2 Seven, eight, lay them straight
23 | [4743] 1 Nine, ten, a big fat hen
--------------------------------------------------------------------------------
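
The bracketed numbers in this output are the byte offsets of each record; the asterisk against offset 2021 marks a sync point, a position in the stream from which a reader can resynchronize after seeking to an arbitrary offset.
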
/ch05-io/src/main/examples/SequenceFileReadDemo.java.pre.sh:
--------------------------------------------------------------------------------
1 | # Make sure file is there to be read
2 | hadoop SequenceFileWriteDemo numbers.seq
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileToMapFileConverter-fix.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop MapFileFixer numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileToMapFileConverter-mv.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop fs -mv numbers.map/part-00000 numbers.map/data
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileToMapFileConverter-sort.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
2 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
3 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
4 | -outKey org.apache.hadoop.io.IntWritable \
5 | -outValue org.apache.hadoop.io.Text \
6 | numbers.seq numbers.map
--------------------------------------------------------------------------------
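
The three SequenceFileToMapFileConverter inputs above are listed alphabetically (fix, mv, sort), but the logical order of the conversion is the reverse: first sort the SequenceFile into a single sorted output, then rename part-00000 to the data file that MapFile expects, and finally run MapFileFixer to rebuild the index.
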
/ch05-io/src/main/examples/SequenceFileWriteDemo.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop SequenceFileWriteDemo numbers.seq
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileWriteDemo.java.output.txt:
--------------------------------------------------------------------------------
1 | [128] 100 One, two, buckle my shoe
2 | [173] 99 Three, four, shut the door
3 | [220] 98 Five, six, pick up sticks
4 | [264] 97 Seven, eight, lay them straight
5 | [314] 96 Nine, ten, a big fat hen
6 | [359] 95 One, two, buckle my shoe
7 | [404] 94 Three, four, shut the door
8 | [451] 93 Five, six, pick up sticks
9 | [495] 92 Seven, eight, lay them straight
10 | [545] 91 Nine, ten, a big fat hen
11 | ...
12 | [1976] 60 One, two, buckle my shoe
13 | [2021] 59 Three, four, shut the door
14 | [2088] 58 Five, six, pick up sticks
15 | [2132] 57 Seven, eight, lay them straight
16 | [2182] 56 Nine, ten, a big fat hen
17 | ...
18 | [4557] 5 One, two, buckle my shoe
19 | [4602] 4 Three, four, shut the door
20 | [4649] 3 Five, six, pick up sticks
21 | [4693] 2 Seven, eight, lay them straight
22 | [4743] 1 Nine, ten, a big fat hen
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/StreamCompressor.java.input.txt:
--------------------------------------------------------------------------------
1 | echo "Text" | hadoop StreamCompressor org.apache.hadoop.io.compress.GzipCodec \
2 | | gunzip -
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/StreamCompressor.java.output.txt:
--------------------------------------------------------------------------------
1 | Text
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/TextIterator.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop TextIterator
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/TextIterator.java.output.txt:
--------------------------------------------------------------------------------
1 | 41
2 | df
3 | 6771
4 | 10400
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/hadoop-fs-text.input.txt:
--------------------------------------------------------------------------------
1 | hadoop fs -text numbers.seq | head
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/hadoop-fs-text.output.txt:
--------------------------------------------------------------------------------
1 | 100 One, two, buckle my shoe
2 | 99 Three, four, shut the door
3 | 98 Five, six, pick up sticks
4 | 97 Seven, eight, lay them straight
5 | 96 Nine, ten, a big fat hen
6 | 95 One, two, buckle my shoe
7 | 94 Three, four, shut the door
8 | 93 Five, six, pick up sticks
9 | 92 Seven, eight, lay them straight
10 | 91 Nine, ten, a big fat hen
--------------------------------------------------------------------------------
/ch05-io/src/main/java/StreamCompressor.java:
--------------------------------------------------------------------------------
1 | // cc StreamCompressor A program to compress data read from standard input and write it to standard output
2 | import org.apache.hadoop.conf.Configuration;
3 | import org.apache.hadoop.io.IOUtils;
4 | import org.apache.hadoop.io.compress.CompressionCodec;
5 | import org.apache.hadoop.io.compress.CompressionOutputStream;
6 | import org.apache.hadoop.util.ReflectionUtils;
7 |
8 | // vv StreamCompressor
9 | public class StreamCompressor {
10 |
11 | public static void main(String[] args) throws Exception {
12 | String codecClassname = args[0];
13 |     Class<?> codecClass = Class.forName(codecClassname);
14 | Configuration conf = new Configuration();
15 | CompressionCodec codec = (CompressionCodec)
16 | ReflectionUtils.newInstance(codecClass, conf);
17 |
18 | CompressionOutputStream out = codec.createOutputStream(System.out);
19 | IOUtils.copyBytes(System.in, out, 4096, false);
20 | out.finish();
21 | }
22 | }
23 | // ^^ StreamCompressor
24 |
--------------------------------------------------------------------------------
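
StreamCompressor's inverse, reading a compressed stream, would use the same codec's createInputStream() method. A minimal sketch under the same API (StreamDecompressor is a hypothetical class, not part of this chapter's sources):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionInputStream;
    import org.apache.hadoop.util.ReflectionUtils;

    public class StreamDecompressor {
      public static void main(String[] args) throws Exception {
        Class<?> codecClass = Class.forName(args[0]);
        Configuration conf = new Configuration();
        CompressionCodec codec = (CompressionCodec)
            ReflectionUtils.newInstance(codecClass, conf);
        // Wrap stdin in a decompressing stream and copy it to stdout
        CompressionInputStream in = codec.createInputStream(System.in);
        IOUtils.copyBytes(in, System.out, 4096, false);
        in.close();
      }
    }
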
/ch05-io/src/main/java/TextArrayWritable.java:
--------------------------------------------------------------------------------
1 | // == TextArrayWritable
2 | import org.apache.hadoop.io.ArrayWritable;
3 | import org.apache.hadoop.io.Text;
4 |
5 | // vv TextArrayWritable
6 | public class TextArrayWritable extends ArrayWritable {
7 | public TextArrayWritable() {
8 | super(Text.class);
9 | }
10 | }
11 | // ^^ TextArrayWritable
12 |
--------------------------------------------------------------------------------
/ch05-io/src/main/java/TextIterator.java:
--------------------------------------------------------------------------------
1 | // cc TextIterator Iterating over the characters in a Text object
2 | import java.nio.ByteBuffer;
3 |
4 | import org.apache.hadoop.io.Text;
5 |
6 | // vv TextIterator
7 | public class TextIterator {
8 |
9 | public static void main(String[] args) {
10 | Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");
11 |
12 | ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength());
13 | int cp;
14 | while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) {
15 | System.out.println(Integer.toHexString(cp));
16 | }
17 | }
18 | }
19 | // ^^ TextIterator
20 |
--------------------------------------------------------------------------------
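
The demo string contains four logical characters: U+0041, U+00DF, U+6771, and the surrogate pair U+D801 U+DC00, which decodes to the single supplementary code point U+10400. These are exactly the four hex values in TextIterator.java.output.txt above (41, df, 6771, 10400).
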
/ch05-io/src/test/java/ArrayWritableTest.java:
--------------------------------------------------------------------------------
1 | // == ArrayWritableTest
2 | import static org.hamcrest.CoreMatchers.is;
3 | import static org.junit.Assert.assertThat;
4 |
5 | import java.io.IOException;
6 | import org.apache.hadoop.io.*;
7 | import org.junit.Test;
8 |
9 | public class ArrayWritableTest extends WritableTestBase {
10 |
11 | @Test
12 | public void test() throws IOException {
13 | // vv ArrayWritableTest
14 | ArrayWritable writable = new ArrayWritable(Text.class);
15 | // ^^ ArrayWritableTest
16 | writable.set(new Text[] { new Text("cat"), new Text("dog") });
17 |
18 | TextArrayWritable dest = new TextArrayWritable();
19 | WritableUtils.cloneInto(dest, writable);
20 | assertThat(dest.get().length, is(2));
21 | // TODO: fix cast, also use single assert
22 | assertThat((Text) dest.get()[0], is(new Text("cat")));
23 | assertThat((Text) dest.get()[1], is(new Text("dog")));
24 |
25 | Text[] copy = (Text[]) dest.toArray();
26 | assertThat(copy[0], is(new Text("cat")));
27 | assertThat(copy[1], is(new Text("dog")));
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/BinaryOrTextWritable.java:
--------------------------------------------------------------------------------
1 | import org.apache.hadoop.io.BytesWritable;
2 | import org.apache.hadoop.io.GenericWritable;
3 | import org.apache.hadoop.io.Text;
4 | import org.apache.hadoop.io.Writable;
5 |
6 | public class BinaryOrTextWritable extends GenericWritable {
7 | private static Class[] TYPES = { BytesWritable.class, Text.class };
8 |
9 | @Override
10 | @SuppressWarnings("unchecked")
11 |   protected Class<? extends Writable>[] getTypes() {
12 | return TYPES;
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/BooleanWritableTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.IOException;
5 | import org.apache.hadoop.io.BooleanWritable;
6 | import org.junit.Test;
7 |
8 | public class BooleanWritableTest extends WritableTestBase {
9 |
10 | @Test
11 | public void test() throws IOException {
12 | BooleanWritable src = new BooleanWritable(true);
13 | BooleanWritable dest = new BooleanWritable();
14 | assertThat(writeTo(src, dest), is("01"));
15 | assertThat(dest.get(), is(src.get()));
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
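
The tests in this directory extend WritableTestBase, which is not reproduced in this listing. The helpers they call (serialize(), serializeToString(), writeTo()) amount to roughly the following sketch, assuming only java.io streams plus Hadoop's Writable and StringUtils APIs:

    import java.io.*;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.util.StringUtils;

    public class WritableTestBase {

      // Write a Writable's binary form into a byte array
      public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
      }

      // Populate a Writable from a byte array
      public static byte[] deserialize(Writable writable, byte[] bytes)
          throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
      }

      public static String serializeToString(Writable src) throws IOException {
        return StringUtils.byteToHexString(serialize(src));
      }

      // Round-trip src into dest, returning the hex form of the bytes
      public static String writeTo(Writable src, Writable dest) throws IOException {
        byte[] data = deserialize(dest, serialize(src));
        return StringUtils.byteToHexString(data);
      }
    }
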
/ch05-io/src/test/java/BytesWritableTest.java:
--------------------------------------------------------------------------------
1 | // == BytesWritableTest
2 | // == BytesWritableTest-Capacity
3 | import static org.hamcrest.CoreMatchers.is;
4 | import static org.junit.Assert.assertThat;
5 |
6 | import java.io.IOException;
7 | import org.apache.hadoop.io.BytesWritable;
8 | import org.apache.hadoop.util.StringUtils;
9 | import org.junit.Test;
10 |
11 | public class BytesWritableTest extends WritableTestBase {
12 |
13 | @Test
14 | public void test() throws IOException {
15 | // vv BytesWritableTest
16 | BytesWritable b = new BytesWritable(new byte[] { 3, 5 });
17 | byte[] bytes = serialize(b);
18 | assertThat(StringUtils.byteToHexString(bytes), is("000000020305"));
19 | // ^^ BytesWritableTest
20 |
21 | // vv BytesWritableTest-Capacity
22 | b.setCapacity(11);
23 | assertThat(b.getLength(), is(2));
24 | assertThat(b.getBytes().length, is(11));
25 | // ^^ BytesWritableTest-Capacity
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/FileDecompressorTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.*;
5 | import java.util.Scanner;
6 | import org.apache.hadoop.fs.FileUtil;
7 | import org.apache.hadoop.io.IOUtils;
8 | import org.junit.Test;
9 |
10 | public class FileDecompressorTest {
11 |
12 | @Test
13 | public void decompressesGzippedFile() throws Exception {
14 | File file = File.createTempFile("file", ".gz");
15 | file.deleteOnExit();
16 | InputStream in = this.getClass().getResourceAsStream("/file.gz");
17 | IOUtils.copyBytes(in, new FileOutputStream(file), 4096, true);
18 |
19 | String path = file.getAbsolutePath();
20 | FileDecompressor.main(new String[] { path });
21 |
22 | String decompressedPath = path.substring(0, path.length() - 3);
23 | assertThat(readFile(new File(decompressedPath)), is("Text\n"));
24 | }
25 |
26 | private String readFile(File file) throws IOException {
27 | return new Scanner(file).useDelimiter("\\A").next();
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/GenericWritableTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.IOException;
5 | import org.apache.hadoop.io.*;
6 | import org.junit.Test;
7 |
8 | public class GenericWritableTest extends WritableTestBase {
9 |
10 | @Test
11 | public void test() throws IOException {
12 | BinaryOrTextWritable src = new BinaryOrTextWritable();
13 | src.set(new Text("text"));
14 | BinaryOrTextWritable dest = new BinaryOrTextWritable();
15 | WritableUtils.cloneInto(dest, src);
16 | assertThat((Text) dest.get(), is(new Text("text")));
17 |
18 | src.set(new BytesWritable(new byte[] {3, 5}));
19 | WritableUtils.cloneInto(dest, src);
20 | assertThat(((BytesWritable) dest.get()).getLength(), is(2)); // TODO proper assert
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/NullWritableTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.IOException;
5 | import org.apache.hadoop.io.NullWritable;
6 | import org.junit.Test;
7 |
8 | public class NullWritableTest extends WritableTestBase {
9 |
10 | @Test
11 | public void test() throws IOException {
12 | NullWritable writable = NullWritable.get();
13 | assertThat(serialize(writable).length, is(0));
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/ObjectWritableTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.IOException;
5 | import org.apache.hadoop.io.*;
6 | import org.junit.Test;
7 |
8 | public class ObjectWritableTest extends WritableTestBase {
9 |
10 | @Test
11 | public void test() throws IOException {
12 | ObjectWritable src = new ObjectWritable(Integer.TYPE, 163);
13 | ObjectWritable dest = new ObjectWritable();
14 | WritableUtils.cloneInto(dest, src);
15 | assertThat((Integer) dest.get(), is(163));
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/ch05-io/src/test/java/VLongWritableTest.java:
--------------------------------------------------------------------------------
1 | import static org.hamcrest.CoreMatchers.is;
2 | import static org.junit.Assert.assertThat;
3 |
4 | import java.io.IOException;
5 | import org.apache.hadoop.io.VLongWritable;
6 | import org.junit.Test;
7 |
8 | public class VLongWritableTest extends WritableTestBase {
9 |
10 | @Test
11 | public void test() throws IOException {
12 | assertThat(serializeToString(new VLongWritable(1)), is("01")); // 1 byte
13 | assertThat(serializeToString(new VLongWritable(127)), is("7f")); // 1 byte
14 |     assertThat(serializeToString(new VLongWritable(128)), is("8f80")); // 2 bytes
15 |     assertThat(serializeToString(new VLongWritable(163)), is("8fa3")); // 2 bytes
16 |     assertThat(serializeToString(new VLongWritable(Long.MAX_VALUE)), is("887fffffffffffffff")); // 9 bytes
17 |     assertThat(serializeToString(new VLongWritable(Long.MIN_VALUE)), is("807fffffffffffffff")); // 9 bytes
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
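
To read the expected hex strings in VLongWritableTest: a single byte in the range -112 to 127 stands for itself (hence 01 and 7f), while a first byte of 0x8f announces a positive value carried in one following byte, so 128 encodes as 8f80 and 163 as 8fa3. The nine-byte encodings use a first byte announcing eight payload bytes (0x88 for positive, 0x80 for negative); negative payloads store the bitwise complement, which is why Long.MAX_VALUE and Long.MIN_VALUE share the payload 7fffffffffffffff.
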
/ch05-io/src/test/resources/file.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/test/resources/file.gz
--------------------------------------------------------------------------------
/ch06-mr-dev/input/ncdc/micro/sample.txt:
--------------------------------------------------------------------------------
1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999
2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999
3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999
4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999
5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999
--------------------------------------------------------------------------------
/ch06-mr-dev/output/._SUCCESS.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch06-mr-dev/output/._SUCCESS.crc
--------------------------------------------------------------------------------
/ch06-mr-dev/output/.part-r-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch06-mr-dev/output/.part-r-00000.crc
--------------------------------------------------------------------------------
/ch06-mr-dev/output/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch06-mr-dev/output/_SUCCESS
--------------------------------------------------------------------------------
/ch06-mr-dev/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch06-mr-dev/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <parent>
5 |     <groupId>com.hadoopbook</groupId>
6 |     <artifactId>hadoop-meta</artifactId>
7 |     <version>4.0</version>
8 |     <relativePath>../hadoop-meta/pom.xml</relativePath>
9 |   </parent>
10 |   <groupId>com.hadoopbook</groupId>
11 |   <artifactId>ch06-mr-dev</artifactId>
12 |   <packaging>jar</packaging>
13 |   <version>4.0</version>
14 |   <name>Chapter 6: Developing a MapReduce Application</name>
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>junit</groupId>
18 |       <artifactId>junit</artifactId>
19 |     </dependency>
20 |   </dependencies>
21 | </project>
22 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterSystem.java.input.txt:
--------------------------------------------------------------------------------
1 | HADOOP_OPTS="-Dcolor=yellow" \
2 |   hadoop ConfigurationPrinter | grep color
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \
2 | | grep mapred.job.tracker=
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.output.txt:
--------------------------------------------------------------------------------
1 | mapred.job.tracker=localhost:8021
2 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConfAndD.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \
2 | -D mapred.job.tracker=example.com:8021 \
3 | | grep mapred.job.tracker
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop ConfigurationPrinter -D color=yellow | grep color
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.output.txt:
--------------------------------------------------------------------------------
1 | color=yellow
2 |
--------------------------------------------------------------------------------
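
ConfigurationPrinter itself does not appear in this listing. A minimal sketch of a Tool that behaves like the commands above (ToolRunner's GenericOptionsParser is what honours -conf and -D) might look like this:

    import java.util.Map.Entry;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class ConfigurationPrinter extends Configured implements Tool {

      @Override
      public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // Configuration is Iterable over its key/value entries
        for (Entry<String, String> entry : conf) {
          System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
        }
        return 0;
      }

      public static void main(String[] args) throws Exception {
        // ToolRunner applies the generic options (-conf, -D, -fs, -jt)
        // to the configuration before run() is called
        System.exit(ToolRunner.run(new ConfigurationPrinter(), args));
      }
    }
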
/ch06-mr-dev/src/main/examples/MaxTemperatureDriver.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar hadoop-examples.jar v3.MaxTemperatureDriver -conf conf/hadoop-cluster.xml \
2 | input/ncdc/all max-temp
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2.ignore/input.txt:
--------------------------------------------------------------------------------
1 | hadoop v2.MaxTemperatureDriver -conf conf/hadoop-local.xml \
2 | input/ncdc/micro output
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2GOP.ignore/input.txt:
--------------------------------------------------------------------------------
1 | hadoop v2.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/input.txt:
--------------------------------------------------------------------------------
1 | hadoop v3.MaxTemperatureDriver -conf conf/hadoop-local.xml \
2 | input/ncdc/micro output
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/input.txt:
--------------------------------------------------------------------------------
1 | hadoop v3.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/output/part-r-00000:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/java/LoggingIdentityMapper.java:
--------------------------------------------------------------------------------
1 | //cc LoggingIdentityMapper An identity mapper that writes to standard output and also uses the Apache Commons Logging API
2 | import java.io.IOException;
3 |
4 | //vv LoggingIdentityMapper
5 | import org.apache.commons.logging.Log;
6 | import org.apache.commons.logging.LogFactory;
7 | import org.apache.hadoop.mapreduce.Mapper;
8 |
9 | public class LoggingIdentityMapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
10 |   extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
11 |
12 | private static final Log LOG = LogFactory.getLog(LoggingIdentityMapper.class);
13 |
14 | @Override
15 | @SuppressWarnings("unchecked")
16 | public void map(KEYIN key, VALUEIN value, Context context)
17 | throws IOException, InterruptedException {
18 | // Log to stdout file
19 | System.out.println("Map key: " + key);
20 |
21 | // Log to syslog file
22 | LOG.info("Map key: " + key);
23 | if (LOG.isDebugEnabled()) {
24 | LOG.debug("Map value: " + value);
25 | }
26 | context.write((KEYOUT) key, (VALUEOUT) value);
27 | }
28 | }
29 | //^^ LoggingIdentityMapper
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/java/v1/MaxTemperatureMapper.java:
--------------------------------------------------------------------------------
1 | package v1;
2 | // cc MaxTemperatureMapperV1 First version of a Mapper that passes MaxTemperatureMapperTest
3 | import java.io.IOException;
4 | import org.apache.hadoop.io.*;
5 | import org.apache.hadoop.mapreduce.*;
6 | //vv MaxTemperatureMapperV1
7 | public class MaxTemperatureMapper
8 |   extends Mapper<LongWritable, Text, Text, IntWritable> {
9 |
10 | @Override
11 | public void map(LongWritable key, Text value, Context context)
12 | throws IOException, InterruptedException {
13 |
14 | String line = value.toString();
15 | String year = line.substring(15, 19);
16 | int airTemperature = Integer.parseInt(line.substring(87, 92));
17 | context.write(new Text(year), new IntWritable(airTemperature));
18 | }
19 | }
20 | //^^ MaxTemperatureMapperV1
21 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/java/v1/MaxTemperatureReducer.java:
--------------------------------------------------------------------------------
1 | package v1;
2 | //cc MaxTemperatureReducerV1 Reducer for maximum temperature example
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Reducer;
8 |
9 | // vv MaxTemperatureReducerV1
10 | public class MaxTemperatureReducer
11 |   extends Reducer<Text, IntWritable, Text, IntWritable> {
12 |
13 | @Override
14 |   public void reduce(Text key, Iterable<IntWritable> values,
15 | Context context)
16 | throws IOException, InterruptedException {
17 |
18 | int maxValue = Integer.MIN_VALUE;
19 | for (IntWritable value : values) {
20 | maxValue = Math.max(maxValue, value.get());
21 | }
22 | context.write(key, new IntWritable(maxValue));
23 | }
24 | }
25 | // ^^ MaxTemperatureReducerV1
26 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/java/v2/MaxTemperatureMapper.java:
--------------------------------------------------------------------------------
1 | package v2;
2 | // cc MaxTemperatureMapperV2 A Mapper that uses a utility class to parse records
3 |
4 | import java.io.IOException;
5 |
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.LongWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.mapreduce.Mapper;
10 | import v2.NcdcRecordParser;
11 |
12 | // vv MaxTemperatureMapperV2
13 | public class MaxTemperatureMapper
14 |   extends Mapper<LongWritable, Text, Text, IntWritable> {
15 |
16 | /*[*/private NcdcRecordParser parser = new NcdcRecordParser();/*]*/
17 |
18 | @Override
19 | public void map(LongWritable key, Text value, Context context)
20 | throws IOException, InterruptedException {
21 |
22 | /*[*/parser.parse(value);/*]*/
23 | if (/*[*/parser.isValidTemperature()/*]*/) {
24 | context.write(new Text(/*[*/parser.getYear()/*]*/),
25 | new IntWritable(/*[*/parser.getAirTemperature()/*]*/));
26 | }
27 | }
28 | }
29 | // ^^ MaxTemperatureMapperV2
30 |
--------------------------------------------------------------------------------
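
v2.NcdcRecordParser is referenced above but not reproduced in this listing. The parts of its contract the mapper relies on amount to roughly this sketch (the field offsets follow the NCDC format used throughout these chapters; the quality-code check is the usual one from this example):

    package v2;

    import org.apache.hadoop.io.Text;

    public class NcdcRecordParser {

      private static final int MISSING_TEMPERATURE = 9999;

      private String year;
      private int airTemperature;
      private String quality;

      public void parse(String record) {
        year = record.substring(15, 19);
        // A leading '+' must be stripped before parsing the temperature
        String temp = record.charAt(87) == '+'
            ? record.substring(88, 92) : record.substring(87, 92);
        airTemperature = Integer.parseInt(temp);
        quality = record.substring(92, 93);
      }

      public void parse(Text record) {
        parse(record.toString());
      }

      public boolean isValidTemperature() {
        return airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]");
      }

      public String getYear() { return year; }

      public int getAirTemperature() { return airTemperature; }
    }
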
/ch06-mr-dev/src/main/resources/configuration-1.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>color</name>
5 |     <value>yellow</value>
6 |     <description>Color</description>
7 |   </property>
8 |
9 |   <property>
10 |     <name>size</name>
11 |     <value>10</value>
12 |     <description>Size</description>
13 |   </property>
14 |
15 |   <property>
16 |     <name>weight</name>
17 |     <value>heavy</value>
18 |     <final>true</final>
19 |     <description>Weight</description>
20 |   </property>
21 |
22 |   <property>
23 |     <name>size-weight</name>
24 |     <value>${size},${weight}</value>
25 |     <description>Size and weight</description>
26 |   </property>
27 | </configuration>
--------------------------------------------------------------------------------
/ch06-mr-dev/src/main/resources/configuration-2.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>size</name>
5 |     <value>12</value>
6 |   </property>
7 |
8 |   <property>
9 |     <name>weight</name>
10 |     <value>light</value>
11 |   </property>
12 | </configuration>
--------------------------------------------------------------------------------
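
When configuration-2.xml is layered over configuration-1.xml, properties from the later resource override earlier ones, except where the earlier definition is marked final. A sketch of the resulting assertions, in the style of SingleResourceConfigurationTest below:

    Configuration conf = new Configuration();
    conf.addResource("configuration-1.xml");
    conf.addResource("configuration-2.xml");
    assertThat(conf.getInt("size", 0), is(12));          // later resource wins
    assertThat(conf.get("weight"), is("heavy"));         // final, so not overridden
    assertThat(conf.get("size-weight"), is("12,heavy")); // expansion sees the override
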
/ch06-mr-dev/src/main/resources/max-temp-workflow.properties:
--------------------------------------------------------------------------------
1 | # A properties file used to submit an Oozie workflow job.
2 | # This file is not bundled as a part of the workflow application.
3 | nameNode=hdfs://localhost:8020
4 | resourceManager=localhost:8032
5 | oozie.wf.application.path=${nameNode}/user/${user.name}/max-temp-workflow
--------------------------------------------------------------------------------
/ch06-mr-dev/src/test/java/SingleResourceConfigurationTest.java:
--------------------------------------------------------------------------------
1 | // == SingleResourceConfigurationTest
2 | import static org.hamcrest.CoreMatchers.is;
3 | import static org.junit.Assert.assertThat;
4 |
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.junit.Test;
9 |
10 | public class SingleResourceConfigurationTest {
11 |
12 | @Test
13 | public void get() throws IOException {
14 | // vv SingleResourceConfigurationTest
15 | Configuration conf = new Configuration();
16 | conf.addResource("configuration-1.xml");
17 | assertThat(conf.get("color"), is("yellow"));
18 | assertThat(conf.getInt("size", 0), is(10));
19 | assertThat(conf.get("breadth", "wide"), is("wide"));
20 | // ^^ SingleResourceConfigurationTest
21 | }
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/ch06-mr-dev/src/test/java/v1/MaxTemperatureReducerTest.java:
--------------------------------------------------------------------------------
1 | package v1;
2 | // == MaxTemperatureReducerTestV1
3 | import java.io.IOException;
4 | import java.util.*;
5 | import org.apache.hadoop.io.*;
6 | import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
7 | import org.junit.*;
8 |
9 | public class MaxTemperatureReducerTest {
10 |
11 | //vv MaxTemperatureReducerTestV1
12 | @Test
13 | public void returnsMaximumIntegerInValues() throws IOException,
14 | InterruptedException {
15 |     new ReduceDriver<Text, IntWritable, Text, IntWritable>()
16 | .withReducer(new MaxTemperatureReducer())
17 | .withInput(new Text("1950"),
18 | Arrays.asList(new IntWritable(10), new IntWritable(5)))
19 | .withOutput(new Text("1950"), new IntWritable(10))
20 | .runTest();
21 | }
22 | //^^ MaxTemperatureReducerTestV1
23 | }
24 |
--------------------------------------------------------------------------------
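
The v1 mapper's header comment mentions MaxTemperatureMapperTest, which is not included in this listing. Its core MRUnit check would look roughly like the reducer test above, using MapDriver (the record literal is one of the NCDC sample lines, whose temperature field holds -0011):

    new MapDriver<LongWritable, Text, Text, IntWritable>()
        .withMapper(new MaxTemperatureMapper())
        .withInput(new LongWritable(0), new Text(
            "0043011990999991950051518004+68750+023550FM-12+0382" +
            "99999V0203201N00261220001CN9999999N9-00111+99999999999"))
        .withOutput(new Text("1950"), new IntWritable(-11))
        .runTest();
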
/ch06-mr-dev/src/test/resources/expected.txt:
--------------------------------------------------------------------------------
1 | 1949 111
2 | 1950 22
3 |
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/MaxTemperatureWithMultipleInputs/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MaxTemperatureWithMultipleInputs input/ncdc/micro/sample.txt input/metoffice output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/MinimalMapReduce.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/MinimalMapReduce/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/MinimalMapReduceWithDefaults/input.txt:
--------------------------------------------------------------------------------
1 | hadoop MinimalMapReduceWithDefaults "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputFormat.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar hadoop-examples.jar PartitionByStationUsingMultipleOutputFormat 'input/ncdc/all/190?.gz' output-part-by-station
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/input.txt:
--------------------------------------------------------------------------------
1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/input.txt:
--------------------------------------------------------------------------------
1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/input.txt:
--------------------------------------------------------------------------------
1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/input.txt:
--------------------------------------------------------------------------------
1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.ignore/input.txt:
--------------------------------------------------------------------------------
1 | hadoop SmallFilesToSequenceFileConverter input/smallfiles output
2 | hadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \
3 |   -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.java.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \
2 | -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/default_streaming.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -input input/ncdc/sample.txt \
3 | -output output \
4 | -inputformat org.apache.hadoop.mapred.TextInputFormat \
5 | -mapper /bin/cat \
6 | -partitioner org.apache.hadoop.mapred.lib.HashPartitioner \
7 | -numReduceTasks 1 \
8 | -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
9 | -outputformat org.apache.hadoop.mapred.TextOutputFormat
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/examples/minimal_streaming.input.txt:
--------------------------------------------------------------------------------
1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
2 | -input input/ncdc/sample.txt \
3 | -output output \
4 | -mapper /bin/cat
--------------------------------------------------------------------------------
/ch08-mr-types/src/main/java/NonSplittableTextInputFormat.java:
--------------------------------------------------------------------------------
1 | // == NonSplittableTextInputFormat
2 | import org.apache.hadoop.fs.Path;
3 | import org.apache.hadoop.mapreduce.JobContext;
4 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
5 |
6 | public class NonSplittableTextInputFormat extends TextInputFormat {
7 | @Override
8 | protected boolean isSplitable(JobContext context, Path file) {
9 | return false;
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
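
An input format like this only takes effect once it is registered on the job, so that each file is processed whole by a single mapper. A hypothetical driver fragment (the job name is illustrative):

    Job job = Job.getInstance(new Configuration(), "process whole files");
    job.setInputFormatClass(NonSplittableTextInputFormat.class);
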
/ch08-mr-types/src/main/java/StationPartitioner.java:
--------------------------------------------------------------------------------
1 | // == StationPartitioner
2 | import org.apache.hadoop.io.LongWritable;
3 | import org.apache.hadoop.io.Text;
4 | import org.apache.hadoop.mapreduce.Partitioner;
5 |
6 | //vv StationPartitioner
7 | public class StationPartitioner extends Partitioner<LongWritable, Text> {
8 |
9 | private NcdcRecordParser parser = new NcdcRecordParser();
10 |
11 | @Override
12 | public int getPartition(LongWritable key, Text value, int numPartitions) {
13 | parser.parse(value);
14 | return getPartition(parser.getStationId());
15 | }
16 |
17 | private int getPartition(String stationId) {
18 | /*...*/
19 | // ^^ StationPartitioner
20 | return 0;
21 | // vv StationPartitioner
22 | }
23 |
24 | }
25 | //^^ StationPartitioner
--------------------------------------------------------------------------------
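
A custom partitioner likewise only takes effect when registered on the job, paired with a matching number of reduce tasks. A hypothetical driver fragment:

    job.setPartitionerClass(StationPartitioner.class);
    job.setNumReduceTasks(20); // illustrative; one output partition per reducer
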
/ch08-mr-types/src/main/java/oldapi/MinimalMapReduce.java:
--------------------------------------------------------------------------------
1 | package oldapi;
2 |
3 | import org.apache.hadoop.conf.Configured;
4 | import org.apache.hadoop.fs.Path;
5 | import org.apache.hadoop.mapred.*;
6 | import org.apache.hadoop.util.*;
7 |
8 | public class MinimalMapReduce extends Configured implements Tool {
9 |
10 | @Override
11 | public int run(String[] args) throws Exception {
12 | if (args.length != 2) {
13 |       System.err.printf("Usage: %s [generic options] <input> <output>\n",