├── .gitignore ├── README.md ├── appc └── src │ └── main │ └── sh │ ├── create_ncdc_files.sh │ ├── load_ncdc.sh │ ├── load_ncdc_map.sh │ └── ncdc_files.txt ├── book ├── pom.xml └── src │ └── main │ └── assembly │ ├── jar.xml │ └── oozie-workflow-application.xml ├── ch02-mr-intro ├── pom.xml └── src │ └── main │ ├── awk │ └── max_temperature.sh │ ├── cpp │ ├── Makefile │ └── max_temperature.cpp │ ├── examples │ ├── MaxTemperature │ │ ├── input.txt │ │ └── output │ │ │ └── part-r-00000 │ ├── MaxTemperatureWithCombiner │ │ ├── input.txt │ │ └── output │ │ │ └── part-r-00000 │ ├── OldMaxTemperature │ │ ├── input.txt │ │ └── output │ │ │ └── part-00000 │ ├── max_temperature.cpp.input.txt │ ├── max_temperature_hadoop.input.txt │ ├── max_temperature_hadoop_cluster.input.txt │ └── max_temperature_py │ │ ├── 2 │ │ ├── input.txt │ │ └── output │ │ │ └── part-00000 │ │ ├── input.txt │ │ ├── output │ │ └── part-r-00000 │ │ └── pseudo │ │ ├── input.txt │ │ └── output │ │ └── part-00000 │ ├── java │ ├── MaxTemperature.java │ ├── MaxTemperatureMapper.java │ ├── MaxTemperatureReducer.java │ ├── MaxTemperatureWithCombiner.java │ ├── OldMaxTemperature.java │ └── oldapi │ │ ├── MaxTemperature.java │ │ ├── MaxTemperatureMapper.java │ │ ├── MaxTemperatureReducer.java │ │ └── MaxTemperatureWithCombiner.java │ ├── python │ ├── max_temperature_map.py │ └── max_temperature_reduce.py │ ├── ruby │ ├── max_temperature_map.rb │ └── max_temperature_reduce.rb │ └── sh │ └── max_temp.sh ├── ch03-hdfs ├── pom.xml └── src │ ├── main │ ├── conf │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ ├── java │ │ ├── DateRangePathFilter.java │ │ ├── FileCopyWithProgress.java │ │ ├── FileSystemCat.java │ │ ├── FileSystemDoubleCat.java │ │ ├── ListStatus.java │ │ ├── RegexExcludePathFilter.java │ │ ├── RegexPathFilter.java │ │ └── URLCat.java │ └── sh │ │ ├── file.sh │ │ └── hars.sh │ └── test │ └── java │ ├── CoherencyModelTest.java │ ├── FileSystemDeleteTest.java │ ├── FileSystemGlobTest.java │ └── ShowFileStatusTest.java ├── ch04-yarn ├── capacity-scheduler.xml └── fair-scheduler.xml ├── ch05-io ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── FileDecompressor.java.input.txt │ │ ├── MapFile-data-head.input.txt │ │ ├── MapFile-data-head.output.txt │ │ ├── MapFile-index.input.txt │ │ ├── MapFile-index.output.txt │ │ ├── MapFile-ls.input.txt │ │ ├── MapFile-ls.output.txt │ │ ├── MapFileWriteDemo.java.input.txt │ │ ├── MaxTemperatureWithCompression │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000.gz │ │ ├── MaxTemperatureWithMapOutputCompression.ignore │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── SequenceFileMapReduceSort.java.input.txt │ │ ├── SequenceFileMapReduceSortResults.java.input.txt │ │ ├── SequenceFileMapReduceSortResults.java.output.txt │ │ ├── SequenceFileMapReduceSortResults.java.pre.sh │ │ ├── SequenceFileReadDemo.java.input.txt │ │ ├── SequenceFileReadDemo.java.output.txt │ │ ├── SequenceFileReadDemo.java.pre.sh │ │ ├── SequenceFileToMapFileConverter-fix.java.input.txt │ │ ├── SequenceFileToMapFileConverter-mv.java.input.txt │ │ ├── SequenceFileToMapFileConverter-sort.java.input.txt │ │ ├── SequenceFileWriteDemo.java.input.txt │ │ ├── SequenceFileWriteDemo.java.output.txt │ │ ├── StreamCompressor.java.input.txt │ │ ├── StreamCompressor.java.output.txt │ │ ├── TextIterator.java.input.txt │ │ ├── TextIterator.java.output.txt │ │ ├── hadoop-fs-text.input.txt │ │ └── hadoop-fs-text.output.txt │ └── java │ │ ├── FileDecompressor.java │ │ ├── IntPair.java │ │ ├── MapFileFixer.java │ │ 
├── MapFileWriteDemo.java │ │ ├── MaxTemperatureWithCompression.java │ │ ├── MaxTemperatureWithMapOutputCompression.java │ │ ├── PooledStreamCompressor.java │ │ ├── SequenceFileReadDemo.java │ │ ├── SequenceFileWriteDemo.java │ │ ├── StreamCompressor.java │ │ ├── TextArrayWritable.java │ │ ├── TextIterator.java │ │ ├── TextPair.java │ │ └── oldapi │ │ ├── IntPair.java │ │ ├── MaxTemperatureWithCompression.java │ │ ├── MaxTemperatureWithMapOutputCompression.java │ │ └── TextPair.java │ └── test │ ├── java │ ├── ArrayWritableTest.java │ ├── BinaryOrTextWritable.java │ ├── BooleanWritableTest.java │ ├── BytesWritableTest.java │ ├── FileDecompressorTest.java │ ├── GenericWritableTest.java │ ├── IntPairTest.java │ ├── IntWritableTest.java │ ├── MapFileSeekTest.java │ ├── MapWritableTest.java │ ├── NullWritableTest.java │ ├── ObjectWritableTest.java │ ├── SequenceFileSeekAndSyncTest.java │ ├── StringTextComparisonTest.java │ ├── TextPairTest.java │ ├── TextTest.java │ ├── VIntWritableTest.java │ ├── VLongWritableTest.java │ └── WritableTestBase.java │ └── resources │ └── file.gz ├── ch06-mr-dev ├── input │ └── ncdc │ │ └── micro │ │ └── sample.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── ConfigurationPrinterSystem.java.input.txt │ │ ├── ConfigurationPrinterWithConf.java.input.txt │ │ ├── ConfigurationPrinterWithConf.java.output.txt │ │ ├── ConfigurationPrinterWithConfAndD.java.input.txt │ │ ├── ConfigurationPrinterWithD.java.input.txt │ │ ├── ConfigurationPrinterWithD.java.output.txt │ │ ├── MaxTemperatureDriver.java.input.txt │ │ ├── MaxTemperatureDriverV2.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureDriverV2GOP.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureDriverV3 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ └── MaxTemperatureDriverV3GOP │ │ │ ├── input.txt │ │ │ └── output │ │ │ └── part-r-00000 │ ├── java │ │ ├── ConfigurationPrinter.java │ │ ├── LoggingDriver.java │ │ ├── LoggingIdentityMapper.java │ │ ├── v1 │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── MaxTemperatureReducer.java │ │ ├── v2 │ │ │ ├── MaxTemperatureDriver.java │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── NcdcRecordParser.java │ │ ├── v3 │ │ │ ├── MaxTemperatureDriver.java │ │ │ └── MaxTemperatureMapper.java │ │ └── v4 │ │ │ ├── MaxTemperatureDriver.java │ │ │ ├── MaxTemperatureMapper.java │ │ │ └── NcdcRecordParser.java │ └── resources │ │ ├── configuration-1.xml │ │ ├── configuration-2.xml │ │ ├── max-temp-workflow.properties │ │ └── max-temp-workflow │ │ └── workflow.xml │ └── test │ ├── java │ ├── MultipleResourceConfigurationTest.java │ ├── SingleResourceConfigurationTest.java │ ├── v1 │ │ ├── MaxTemperatureMapperTest.java │ │ └── MaxTemperatureReducerTest.java │ ├── v2 │ │ ├── MaxTemperatureDriverMiniTest.java │ │ ├── MaxTemperatureDriverTest.java │ │ └── MaxTemperatureMapperTest.java │ └── v4 │ │ └── MaxTemperatureMapperTest.java │ └── resources │ └── expected.txt ├── ch08-mr-types ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── MaxTemperatureWithMultipleInputs │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MinimalMapReduce.java.input.txt │ │ ├── MinimalMapReduce │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-00000 │ │ ├── MinimalMapReduceWithDefaults │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-00000 │ │ ├── PartitionByStationUsingMultipleOutputFormat.java.input.txt │ │ ├── PartitionByStationUsingMultipleOutputs │ │ │ ├── 2 │ │ │ │ ├── 
input.txt │ │ │ │ └── output │ │ │ │ │ ├── 029070-99999-r-00000 │ │ │ │ │ ├── 029500-99999-r-00000 │ │ │ │ │ ├── 029600-99999-r-00000 │ │ │ │ │ ├── 029720-99999-r-00000 │ │ │ │ │ ├── 029810-99999-r-00000 │ │ │ │ │ ├── 227070-99999-r-00000 │ │ │ │ │ └── part-r-00000 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── 029070-99999-r-00000 │ │ │ │ ├── 029500-99999-r-00000 │ │ │ │ ├── 029600-99999-r-00000 │ │ │ │ ├── 029720-99999-r-00000 │ │ │ │ ├── 029810-99999-r-00000 │ │ │ │ ├── 227070-99999-r-00000 │ │ │ │ └── part-r-00000 │ │ ├── PartitionByStationYearUsingMultipleOutputs │ │ │ ├── 2 │ │ │ │ ├── input.txt │ │ │ │ └── output │ │ │ │ │ ├── 029070-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029500-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029600-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029720-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 029810-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ ├── 227070-99999 │ │ │ │ │ ├── 1901 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── 1902 │ │ │ │ │ │ └── part-r-00000 │ │ │ │ │ └── part-r-00000 │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── 029070-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029500-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029600-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029720-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 029810-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ ├── 227070-99999 │ │ │ │ ├── 1901 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── 1902 │ │ │ │ │ └── part-r-00000 │ │ │ │ └── part-r-00000 │ │ ├── SmallFilesToSequenceFileConverter.ignore │ │ │ └── input.txt │ │ ├── SmallFilesToSequenceFileConverter.java.input.txt │ │ ├── default_streaming.input.txt │ │ └── minimal_streaming.input.txt │ ├── java │ │ ├── MaxTemperatureWithMultipleInputs.java │ │ ├── MinimalMapReduce.java │ │ ├── MinimalMapReduceWithDefaults.java │ │ ├── NonSplittableTextInputFormat.java │ │ ├── PartitionByStationUsingMultipleOutputs.java │ │ ├── PartitionByStationYearUsingMultipleOutputs.java │ │ ├── SmallFilesToSequenceFileConverter.java │ │ ├── StationPartitioner.java │ │ ├── WholeFileInputFormat.java │ │ ├── WholeFileRecordReader.java │ │ └── oldapi │ │ │ ├── MaxTemperatureWithMultipleInputs.java │ │ │ ├── MinimalMapReduce.java │ │ │ ├── MinimalMapReduceWithDefaults.java │ │ │ ├── NonSplittableTextInputFormat.java │ │ │ ├── PartitionByStationUsingMultipleOutputFormat.java │ │ │ ├── PartitionByStationUsingMultipleOutputs.java │ │ │ ├── PartitionByStationYearUsingMultipleOutputFormat.java │ │ │ ├── SmallFilesToSequenceFileConverter.java │ │ │ ├── StationPartitioner.java │ │ │ ├── WholeFileInputFormat.java │ │ │ └── WholeFileRecordReader.java │ └── sh │ │ └── streaming.sh │ └── test │ └── java │ └── TextInputFormatsTest.java ├── ch09-mr-features ├── pom.xml └── src │ ├── main │ ├── examples │ │ ├── JoinRecordWithStationName │ │ │ ├── 2 │ │ │ │ ├── input.txt │ │ │ │ └── output │ │ │ │ │ └── part-r-00000 │ │ │ ├── 
input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── LookupRecordByTemperature.java.input.txt │ │ ├── LookupRecordByTemperature.java.output.txt │ │ ├── LookupRecordsByTemperature.java.input.txt │ │ ├── LookupRecordsByTemperature.java.output.txt │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java.input.txt │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.ignore │ │ │ └── input.txt │ │ ├── MaxTemperatureUsingSecondarySort │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MaxTemperatureWithCounters.java.input.txt │ │ ├── MaxTemperatureWithCounters │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ └── part-r-00000 │ │ ├── MissingTemperatureFields.java.input.txt │ │ ├── SortByTemperatureToMapFile.ignore │ │ │ └── input.txt │ │ ├── SortByTemperatureUsingHashPartitioner.ignore │ │ │ └── input.txt │ │ ├── SortByTemperatureUsingHashPartitioner.java.input.txt │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java.input.txt │ │ ├── SortDataPreprocessor.ignore │ │ │ └── input.txt │ │ └── SortDataPreprocessor.java.input.txt │ ├── java │ │ ├── JoinRecordMapper.java │ │ ├── JoinRecordWithStationName.java │ │ ├── JoinReducer.java │ │ ├── JoinStationMapper.java │ │ ├── LookupRecordByTemperature.java │ │ ├── LookupRecordsByTemperature.java │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java │ │ ├── MaxTemperatureUsingSecondarySort.java │ │ ├── MaxTemperatureWithCounters.java │ │ ├── MissingTemperatureFields.java │ │ ├── SortByTemperatureToMapFile.java │ │ ├── SortByTemperatureUsingHashPartitioner.java │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java │ │ ├── SortDataPreprocessor.java │ │ ├── TemperatureDistribution.java │ │ └── oldapi │ │ │ ├── JoinRecordMapper.java │ │ │ ├── JoinRecordWithStationName.java │ │ │ ├── JoinReducer.java │ │ │ ├── JoinStationMapper.java │ │ │ ├── LookupRecordByTemperature.java │ │ │ ├── LookupRecordsByTemperature.java │ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFile.java │ │ │ ├── MaxTemperatureByStationNameUsingDistributedCacheFileApi.java │ │ │ ├── MaxTemperatureUsingSecondarySort.java │ │ │ ├── MaxTemperatureWithCounters.java │ │ │ ├── MissingTemperatureFields.java │ │ │ ├── SortByTemperatureToMapFile.java │ │ │ ├── SortByTemperatureUsingHashPartitioner.java │ │ │ ├── SortByTemperatureUsingTotalOrderPartitioner.java │ │ │ ├── SortDataPreprocessor.java │ │ │ └── TemperatureDistribution.java │ ├── python │ │ ├── max_daily_temp_map.py │ │ ├── max_daily_temp_reduce.py │ │ ├── mean_max_daily_temp.sh │ │ ├── mean_max_daily_temp_map.py │ │ ├── mean_max_daily_temp_reduce.py │ │ ├── secondary_sort.sh │ │ ├── secondary_sort_map.py │ │ └── secondary_sort_reduce.py │ ├── r │ │ ├── fixed-partitions │ │ ├── output │ │ ├── output_sorted │ │ ├── sampled-partitions │ │ ├── temperature_distribution.png │ │ └── temperature_distribution.r │ └── resources │ │ ├── MaxTemperatureWithCounters_Temperature.properties │ │ └── oldapi │ │ └── MaxTemperatureWithCounters_Temperature.properties │ └── test │ └── java │ └── KeyFieldBasedComparatorTest.java ├── ch10-setup └── src │ └── main │ ├── conf │ ├── core-site.xml │ ├── hdfs-site.xml │ └── yarn-site.xml │ └── sh │ └── trash.sh ├── ch12-avro ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── job.xml │ ├── c │ │ └── dump_pairs.c │ ├── examples │ │ ├── AvroGenericMaxTemperature │ │ │ ├── input.txt │ │ │ └── output │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-r-00000.avro.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-r-00000.avro │ │ └── AvroSort │ │ │ ├── input.txt │ │ │ 
└── output │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.avro.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000.avro │ ├── java │ │ ├── AvroGenericMaxTemperature.java │ │ ├── AvroSort.java │ │ ├── NcdcRecordParser.java │ │ └── oldapi │ │ │ ├── AvroGenericMaxTemperature.java │ │ │ ├── AvroProjection.java │ │ │ ├── AvroSort.java │ │ │ ├── AvroSpecificMaxTemperature.java │ │ │ └── NcdcRecordParser.java │ ├── py │ │ ├── test_avro.py │ │ └── write_pairs.py │ └── resources │ │ ├── AliasedStringPair.avsc │ │ ├── Array.avsc │ │ ├── Enum.avsc │ │ ├── Fixed.avsc │ │ ├── Map.avsc │ │ ├── NewStringPair.avsc │ │ ├── NewStringPairWithNull.avsc │ │ ├── ProjectedStringPair.avsc │ │ ├── SortedStringPair.avsc │ │ ├── StringPair.avsc │ │ ├── SwitchedStringPair.avsc │ │ ├── Union.avsc │ │ └── WeatherRecord.avsc │ └── test │ └── java │ └── AvroTest.java ├── ch13-parquet ├── pom.xml └── src │ ├── main │ ├── assembly │ │ └── job.xml │ ├── examples │ │ └── TextToParquetWithAvro │ │ │ ├── input.txt │ │ │ └── output │ │ │ ├── _SUCCESS │ │ │ ├── _metadata │ │ │ └── part-m-00000.parquet │ └── java │ │ ├── ParquetToTextWithAvro.java │ │ ├── ParquetToTextWithExample.java │ │ ├── TextToParquetWithAvro.java │ │ └── TextToParquetWithExample.java │ └── test │ ├── java │ ├── ParquetMRWithAvroTest.java │ ├── ParquetMRWithExampleTest.java │ └── ParquetTest.java │ └── resources │ ├── NewStringPair.avsc │ ├── ProjectedStringPair.avsc │ ├── StringPair.avsc │ └── fruit.txt ├── ch14-flume ├── spool-to-hdfs-and-logger.properties ├── spool-to-hdfs-avro.properties ├── spool-to-hdfs-partitioned.properties ├── spool-to-hdfs-tiered-load-balance.properties ├── spool-to-hdfs-tiered.properties ├── spool-to-hdfs.properties └── spool-to-logger.properties ├── ch15-sqoop ├── pom.xml ├── src │ └── main │ │ └── java │ │ ├── MaxWidgetId.java │ │ ├── MaxWidgetIdGenericAvro.java │ │ └── Widget.java └── widgets │ └── part-m-00000.avro ├── ch16-pig ├── pom.xml └── src │ ├── main │ ├── grunt │ │ ├── combine.grunt │ │ ├── disambiguate.grunt │ │ ├── flatten.grunt │ │ ├── foreach.grunt │ │ ├── group.grunt │ │ ├── join.grunt │ │ ├── max_temp.grunt │ │ ├── missing.grunt │ │ ├── multiquery.grunt │ │ ├── null.grunt │ │ ├── schema.grunt │ │ ├── set.grunt │ │ ├── sort.grunt │ │ ├── store.grunt │ │ ├── stream.grunt │ │ ├── tuples.grunt │ │ ├── types.grunt │ │ └── udfs.grunt │ ├── java │ │ └── com │ │ │ └── hadoopbook │ │ │ └── pig │ │ │ ├── CutLoadFunc.java │ │ │ ├── IsGoodQuality.java │ │ │ ├── Range.java │ │ │ └── Trim.java │ ├── pig │ │ ├── comment_c-style.pig │ │ ├── comment_single_line.pig │ │ ├── max_temp.macro │ │ ├── max_temp.pig │ │ ├── max_temp_filter_stream.pig │ │ ├── max_temp_filter_udf.pig │ │ ├── max_temp_macro.pig │ │ ├── max_temp_macro_import.pig │ │ ├── max_temp_param.param │ │ ├── max_temp_param.pig │ │ ├── max_temp_station_name.pig │ │ └── year_stats.pig │ └── python │ │ └── is_good_quality.py │ └── test │ └── java │ └── com │ └── hadoopbook │ └── pig │ ├── IsGoodQualityTest.java │ └── RangeTest.java ├── ch17-hive ├── pom.xml └── src │ └── main │ ├── hive │ ├── buckets.hive │ ├── conversions.hive │ ├── indexes.hive │ ├── joins.hive │ ├── mapreduce.hive │ ├── max_temp.hive │ ├── multitable_insert.hive │ ├── partitions.hive │ ├── regex_serde.hive │ ├── set.hive │ ├── sort.hive │ ├── storage.hive │ ├── types.hive │ └── udfs.hive │ ├── java │ └── com │ │ └── hadoopbook │ │ └── hive │ │ ├── Maximum.java │ │ ├── Mean.java │ │ └── Strip.java │ └── python │ ├── is_good_quality.py │ └── max_temperature_reduce.py ├── ch18-crunch ├── pom.xml └── src │ 
├── main │ ├── assembly │ │ └── hadoop-job.xml │ └── java │ │ └── crunch │ │ ├── AvroGenericMaxTemperatureCrunch.java │ │ ├── JoinRecordWithStationNameCrunch.java │ │ ├── MaxTemperatureByStationNameCrunch.java │ │ ├── MaxTemperatureCrunch.java │ │ ├── MaxTemperatureCrunchWithShutdownHook.java │ │ ├── MaxTemperatureUsingSecondarySortCrunch.java │ │ ├── MaxTemperatureWithCompressionCrunch.java │ │ ├── MaxTemperatureWithCountersCrunch.java │ │ ├── MaxTemperatureWithMultipleInputsCrunch.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadataParser.java │ │ ├── SortByTemperatureCrunch.java │ │ └── SplitCrunch.java │ └── test │ ├── java │ └── crunch │ │ ├── CheckpointTest.java │ │ ├── CountValuesFn.java │ │ ├── CustomDoFn.java │ │ ├── InversePairFn.java │ │ ├── JoinTest.java │ │ ├── MaterializeTest.java │ │ ├── NonSerializableOuterClass.java │ │ ├── ObjectReuseTest.java │ │ ├── PCollections.java │ │ ├── PageRankTest.java │ │ ├── PipelineDebugTest.java │ │ ├── PipelineExecutionTest.java │ │ ├── PrimitiveOperationsTest.java │ │ ├── SerializableFunctionsTest.java │ │ ├── SortTest.java │ │ ├── SourcesAndTargetsTest.java │ │ ├── ToLowerFn.java │ │ ├── TypesTest.java │ │ └── WeatherRecord.java │ └── resources │ ├── A │ ├── B │ ├── fruit.txt │ ├── ints.txt │ ├── log4j.properties │ ├── numbers.seq │ ├── sample.txt │ ├── set1.txt │ ├── set2.txt │ └── urls.txt ├── ch19-spark ├── pom.xml └── src │ ├── main │ ├── java │ │ └── MaxTemperatureSpark.java │ ├── python │ │ └── MaxTemperature.py │ └── scala │ │ ├── MaxTemperature.scala │ │ └── MaxTemperatureWithPlacement.scala │ └── test │ ├── avro │ ├── IntWrapper.avsc │ └── WeatherRecord.avsc │ ├── java │ └── SimpleTest.java │ ├── resources │ ├── fruit.txt │ ├── log4j.properties │ ├── numbers.seq │ ├── quangle.txt │ └── set2.txt │ └── scala │ ├── CustomKryoRegistrator.scala │ ├── DataSerializationTest.scala │ ├── FunctionSerializationTest.scala │ ├── RDDCreationTest.scala │ ├── ReflectWeatherRecord.scala │ ├── SharedDataTest.scala │ ├── TransformationsAndActionsTest.scala │ └── WordCountHistogramTest.scala ├── ch20-hbase ├── pom.xml └── src │ └── main │ └── java │ ├── ExampleClient.java │ ├── HBaseStationImporter.java │ ├── HBaseStationQuery.java │ ├── HBaseTemperatureBulkImporter.java │ ├── HBaseTemperatureDirectImporter.java │ ├── HBaseTemperatureImporter.java │ ├── HBaseTemperatureQuery.java │ ├── NewExampleClient.java │ ├── NewHBaseStationImporter.java │ ├── NewHBaseStationQuery.java │ ├── NewHBaseTemperatureQuery.java │ ├── RowKeyConverter.java │ └── SimpleRowCounter.java ├── ch21-zk ├── pom.xml └── src │ └── main │ ├── java │ ├── ActiveKeyValueStore.java │ ├── ConfigUpdater.java │ ├── ConfigWatcher.java │ ├── ConnectionWatcher.java │ ├── CreateGroup.java │ ├── DeleteGroup.java │ ├── JoinGroup.java │ ├── ListGroup.java │ ├── ResilientActiveKeyValueStore.java │ └── ResilientConfigUpdater.java │ └── sh │ └── group.sh ├── ch22-case-studies ├── pom.xml └── src │ └── main │ └── java │ ├── TrackStats.jr │ └── fm │ └── last │ └── hadoop │ ├── io │ └── records │ │ └── TrackStats.java │ └── programs │ └── labs │ └── trackstats │ └── TrackStatisticsProgram.java ├── common ├── pom.xml └── src │ ├── main │ └── java │ │ ├── JobBuilder.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadata.java │ │ ├── NcdcStationMetadataParser.java │ │ └── oldapi │ │ ├── JobBuilder.java │ │ ├── MetOfficeRecordParser.java │ │ ├── NcdcRecordParser.java │ │ ├── NcdcStationMetadata.java │ │ └── 
NcdcStationMetadataParser.java │ └── test │ └── java │ ├── MetOfficeRecordParserTest.java │ ├── NcdcRecordParserTest.java │ └── NcdcStationMetadataParserTest.java ├── conf ├── hadoop-cluster.template.xml ├── hadoop-local.xml ├── hadoop-localhost.xml ├── hadoop │ └── pseudo-distributed │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── yarn-site.xml ├── pig │ └── localhost │ │ └── pig.properties └── zookeeper │ ├── cluster │ └── zoo.cfg │ ├── localhost │ └── zoo.cfg │ └── log4j.properties ├── hadoop-examples └── pom.xml ├── hadoop-meta └── pom.xml ├── input ├── avro │ └── pairs.avro ├── badrecords │ ├── a │ ├── b │ └── c ├── docs │ ├── 1400-8.txt │ └── quangle.txt ├── fileglob │ ├── 2007 │ │ └── 12 │ │ │ ├── 30 │ │ │ ├── data-2007-12-30 │ │ │ └── data[2007-12-30] │ │ │ └── 31 │ │ │ └── data-2007-12-31 │ └── 2008 │ │ └── 01 │ │ └── 01 │ │ └── data-2008-01-01 ├── fileinput │ ├── a │ └── dir │ │ └── b ├── hive │ ├── README │ ├── dummy.txt │ ├── joins │ │ ├── sales.txt │ │ └── things.txt │ ├── partitions │ │ ├── file1 │ │ ├── file2 │ │ ├── file3 │ │ ├── file4 │ │ ├── file5 │ │ └── file6 │ ├── tables │ │ ├── users.txt │ │ └── users_extended.txt │ ├── tmp.txt │ ├── types │ │ ├── complex.txt │ │ └── nested.txt │ └── udfs │ │ ├── arrays.txt │ │ ├── fruit.txt │ │ ├── max1.txt │ │ └── max2.txt ├── metoffice │ ├── aberporthdata.txt │ ├── armaghdata.txt │ ├── bradforddata.txt │ ├── braemardata.txt │ ├── cambridgedata.txt │ ├── cardiffdata.txt │ ├── durhamdata.txt │ ├── eastbournedata.txt │ ├── greenwichdata.txt │ ├── hurndata.txt │ ├── lerwickdata.txt │ ├── leucharsdata.txt │ ├── newtonriggdata.txt │ ├── oxforddata.txt │ ├── paisleydata.txt │ ├── ringwaydata.txt │ ├── rossonwyedata.txt │ ├── shawburydata.txt │ ├── sheffielddata.txt │ ├── southamptondata.txt │ ├── stmawgandata.txt │ ├── stornowaydata.txt │ ├── suttonbonningtondata.txt │ ├── tireedata.txt │ ├── valleydata.txt │ └── yeoviltondata.txt ├── ncdc │ ├── all │ │ ├── 1901.gz │ │ └── 1902.gz │ ├── metadata │ │ ├── ish-history.txt │ │ └── stations-fixed-width.txt │ ├── micro-tab │ │ ├── sample.txt │ │ ├── sample2.txt │ │ └── sample_corrupt.txt │ ├── micro │ │ └── sample.txt │ ├── sample.txt │ └── sample.txt.gz ├── pig │ ├── combine │ │ ├── A │ │ └── B │ ├── corrupt │ │ └── missing_fields │ ├── foreach │ │ └── A │ ├── group │ │ └── A │ ├── join │ │ ├── A │ │ └── B │ ├── multiquery │ │ └── A │ ├── nested │ │ ├── A │ │ └── B │ ├── pairwise │ │ └── postings │ ├── schema │ │ └── A │ ├── sort │ │ └── A │ ├── tuples │ │ └── A │ ├── types │ │ ├── A │ │ ├── B │ │ ├── C │ │ └── one │ └── udfs │ │ └── A ├── smallfiles │ ├── a │ ├── b │ ├── c │ ├── d │ ├── e │ └── f └── wikipedia │ ├── example.xml │ └── sample.xml ├── pom.xml └── snippet ├── README ├── bin ├── check_expected.sh ├── check_manuscript.py ├── check_manuscript.sh ├── generate_listings.sh ├── grunter.sh ├── hiver.sh ├── phragmite_db.pl ├── phragmite_hive.py └── phragmite_pig.py ├── conf ├── local │ ├── capacity-scheduler.xml │ ├── configuration.xsl │ ├── container-executor.cfg │ ├── core-site.xml │ ├── hadoop-env.cmd │ ├── hadoop-env.sh │ ├── hadoop-metrics.properties │ ├── hadoop-metrics2.properties │ ├── hadoop-policy.xml │ ├── hdfs-site.xml │ ├── httpfs-env.sh │ ├── httpfs-log4j.properties │ ├── httpfs-signature.secret │ ├── httpfs-site.xml │ ├── log4j.properties │ ├── mapred-env.cmd │ ├── mapred-env.sh │ ├── mapred-queues.xml.template │ ├── mapred-site.xml │ ├── mapred-site.xml.template │ ├── slaves │ ├── ssl-client.xml.example │ ├── ssl-server.xml.example │ ├── 
yarn-env.cmd │ ├── yarn-env.sh │ └── yarn-site.xml └── pseudo │ ├── capacity-scheduler.xml │ ├── capacity-scheduler.xml.old │ ├── configuration.xsl │ ├── container-executor.cfg │ ├── core-site.xml │ ├── fair-scheduler.xml │ ├── hadoop-env.cmd │ ├── hadoop-env.sh │ ├── hadoop-metrics.properties │ ├── hadoop-metrics2.properties │ ├── hadoop-policy.xml │ ├── hdfs-site.xml │ ├── httpfs-env.sh │ ├── httpfs-log4j.properties │ ├── httpfs-signature.secret │ ├── httpfs-site.xml │ ├── log4j.properties │ ├── mapred-env.cmd │ ├── mapred-env.sh │ ├── mapred-queues.xml.template │ ├── mapred-site.xml │ ├── mapred-site.xml.template │ ├── slaves │ ├── ssl-client.xml.example │ ├── ssl-server.xml.example │ ├── yarn-env.cmd │ ├── yarn-env.sh │ └── yarn-site.xml ├── expected └── ch11 │ └── grunt │ ├── combine_schema.xml │ ├── combine_union.xml │ ├── foreach_generate.xml │ ├── group_all.xml │ ├── group_dump.xml │ ├── group_expression.xml │ ├── join_cogroup.xml │ ├── join_cogroup_flatten.xml │ ├── join_cogroup_inner.xml │ ├── join_cogroup_join.xml │ ├── join_dump.xml │ ├── join_frj.xml │ ├── join_join.xml │ ├── max_temp_describe_records.xml │ ├── max_temp_dump_grouped_records.xml │ ├── max_temp_dump_records.xml │ ├── max_temp_filter_records.xml │ ├── max_temp_load.xml │ ├── max_temp_max_temp.xml │ ├── max_temp_result.xml │ ├── missing_fields.xml │ ├── null_corrupt.xml │ ├── null_count.xml │ ├── null_dump.xml │ ├── null_split.xml │ ├── null_undetected.xml │ ├── schema_absent.xml │ ├── schema_absent_projected.xml │ ├── schema_names_only.xml │ ├── schema_one_type_only.xml │ ├── schema_types.xml │ ├── set_debug_on.xml │ ├── sort_dump.xml │ ├── sort_limit.xml │ ├── sort_no_order.xml │ ├── sort_order.xml │ ├── store_colon_delimited.xml │ ├── stream_cut.xml │ ├── udfs_invoke_long.xml │ ├── udfs_invoke_short.xml │ ├── udfs_load.xml │ ├── udfs_register.xml │ └── udfs_schema.xml ├── pom.xml └── src └── test ├── java └── ExamplesIT.java └── resources ├── copyoutput.sh └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | /*.jar 2 | *.log 3 | /build 4 | /lib 5 | /out 6 | /output* 7 | ch*/maxwidget 8 | snippet/actual 9 | target 10 | /target 11 | .classpath 12 | .project 13 | .pydevproject 14 | .settings 15 | metastore_db 16 | -------------------------------------------------------------------------------- /appc/src/main/sh/create_ncdc_files.sh: -------------------------------------------------------------------------------- 1 | for ((i=1901;i<=2000;i+=1)) 2 | do 3 | echo s3n://hadoopbook/ncdc/raw/isd-$i.tar.bz2 4 | done 5 | -------------------------------------------------------------------------------- /appc/src/main/sh/load_ncdc.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -D mapred.reduce.tasks=0 \ 3 | -D mapred.map.tasks.speculative.execution=false \ 4 | -D mapred.task.timeout=12000000 \ 5 | -input ncdc_files.txt \ 6 | -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat \ 7 | -output output \ 8 | -mapper load_ncdc_map.sh \ 9 | -file load_ncdc_map.sh 10 | 11 | -------------------------------------------------------------------------------- /appc/src/main/sh/load_ncdc_map.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # NLineInputFormat gives a single line: key is offset, value is S3 URI 4 | read offset s3file 5 | 6 | # Retrieve file from S3 to local disk 7 | echo 
"reporter:status:Retrieving $s3file" >&2 8 | $HADOOP_INSTALL/bin/hadoop fs -get $s3file . 9 | 10 | # Un-bzip and un-tar the local file 11 | target=`basename $s3file .tar.bz2` 12 | mkdir -p $target 13 | echo "reporter:status:Un-tarring $s3file to $target" >&2 14 | tar jxf `basename $s3file` -C $target 15 | 16 | # Un-gzip each station file and concat into one file 17 | echo "reporter:status:Un-gzipping $target" >&2 18 | for file in $target/*/* 19 | do 20 | gunzip -c $file >> $target.all 21 | echo "reporter:status:Processed $file" >&2 22 | done 23 | 24 | # Put gzipped version into HDFS 25 | echo "reporter:status:Gzipping $target and putting in HDFS" >&2 26 | gzip -c $target.all | $HADOOP_INSTALL/bin/hadoop fs -put - gz/$target.gz -------------------------------------------------------------------------------- /book/src/main/assembly/jar.xml: -------------------------------------------------------------------------------- 1 | 4 | jar 5 | 6 | jar 7 | 8 | false 9 | 10 | 11 | / 12 | true 13 | true 14 | runtime 15 | false 16 | 17 | com.hadoopbook:* 18 | 19 | 20 | 21 | 22 | 23 | target/classes 24 | / 25 | 26 | 27 | -------------------------------------------------------------------------------- /book/src/main/assembly/oozie-workflow-application.xml: -------------------------------------------------------------------------------- 1 | 4 | oozie-workflow-application 5 | 6 | dir 7 | 8 | false 9 | 10 | 11 | ../ch06-mr-dev/src/main/resources/max-temp-workflow 12 | max-temp-workflow 13 | 14 | 15 | 16 | 17 | ../hadoop-examples.jar 18 | max-temp-workflow/lib 19 | 20 | 21 | -------------------------------------------------------------------------------- /ch02-mr-intro/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch02-mr-intro 12 | jar 13 | 4.0 14 | Chapter 2: MapReduce 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/awk/max_temperature.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | for year in all/* 3 | do 4 | echo -ne `basename $year .gz`"\t" 5 | gunzip -c $year | \ 6 | awk '{ temp = substr($0, 88, 5) + 0; 7 | q = substr($0, 93, 1); 8 | if (temp !=9999 && q ~ /[01459]/ && temp > max) max = temp } 9 | END { print max }' 10 | done -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/cpp/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CPPFLAGS = -m32 -I$(HADOOP_INSTALL)/c++/$(PLATFORM)/include 3 | 4 | max_temperature: max_temperature.cpp 5 | $(CC) $(CPPFLAGS) $< -Wall -L$(HADOOP_INSTALL)/c++/$(PLATFORM)/lib -lhadooppipes \ 6 | -lhadooputils -lpthread -g -O2 -o $@ 7 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/MaxTemperature/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperature input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/MaxTemperature/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | 
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithCombiner input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/MaxTemperatureWithCombiner/output/part-r-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/input.txt:
--------------------------------------------------------------------------------
hadoop OldMaxTemperature input/ncdc/sample.txt output
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/OldMaxTemperature/output/part-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature.cpp.input.txt:
--------------------------------------------------------------------------------
hadoop pipes \
  -D hadoop.pipes.java.recordreader=true \
  -D hadoop.pipes.java.recordwriter=true \
  -input sample.txt \
  -output output \
  -program bin/max_temperature
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input/ncdc/sample.txt \
  -output output \
  -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
  -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_hadoop_cluster.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -files ch02-mr-intro/src/main/ruby/max_temperature_map.rb,\
ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
  -input input/ncdc/all \
  -output output \
  -mapper ch02-mr-intro/src/main/ruby/max_temperature_map.rb \
  -combiner ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb \
  -reducer ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input/ncdc/sample.txt \
  -output output \
  -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \
  -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py
--------------------------------------------------------------------------------
/ch02-mr-intro/src/main/examples/max_temperature_py/2/output/part-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
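Editor's note: the streaming invocations above drive the Ruby and Python mappers; their Java counterpart, ch02-mr-intro/src/main/java/MaxTemperatureMapper.java, appears in the tree but is not reproduced in this excerpt. The sketch below is an illustration, not the repository's exact source: it assumes the same fixed-width NCDC record layout the scripted mappers use (year at offsets 15-19, temperature at 87-92, quality flag at 92, missing value 9999).

// vv MaxTemperatureMapperSketch (illustrative, assumed layout)
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MaxTemperatureMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final int MISSING = 9999;

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    String line = value.toString();
    String year = line.substring(15, 19);
    int airTemperature;
    if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
      airTemperature = Integer.parseInt(line.substring(88, 92));
    } else {
      airTemperature = Integer.parseInt(line.substring(87, 92));
    }
    String quality = line.substring(92, 93);
    if (airTemperature != MISSING && quality.matches("[01459]")) {
      context.write(new Text(year), new IntWritable(airTemperature));
    }
  }
}
// ^^ MaxTemperatureMapperSketch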
/ch02-mr-intro/src/main/examples/max_temperature_py/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \ 5 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -files ch02-mr-intro/src/main/python/max_temperature_map.py,\ 3 | ch02-mr-intro/src/main/python/max_temperature_reduce.py \ 4 | -input input/ncdc/sample.txt \ 5 | -output output \ 6 | -mapper ch02-mr-intro/src/main/python/max_temperature_map.py \ 7 | -reducer ch02-mr-intro/src/main/python/max_temperature_reduce.py 8 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/examples/max_temperature_py/pseudo/output/part-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/java/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | // cc MaxTemperatureReducer Reducer for maximum temperature example 2 | // vv MaxTemperatureReducer 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class MaxTemperatureReducer 10 | extends Reducer { 11 | 12 | @Override 13 | public void reduce(Text key, Iterable values, 14 | Context context) 15 | throws IOException, InterruptedException { 16 | 17 | int maxValue = Integer.MIN_VALUE; 18 | for (IntWritable value : values) { 19 | maxValue = Math.max(maxValue, value.get()); 20 | } 21 | context.write(key, new IntWritable(maxValue)); 22 | } 23 | } 24 | // ^^ MaxTemperatureReducer 25 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/java/oldapi/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapred.MapReduceBase; 9 | import org.apache.hadoop.mapred.OutputCollector; 10 | import org.apache.hadoop.mapred.Reducer; 11 | import org.apache.hadoop.mapred.Reporter; 12 | 13 | public class MaxTemperatureReducer extends MapReduceBase 14 | implements Reducer { 15 | 16 | public void reduce(Text key, Iterator values, 17 | OutputCollector output, Reporter reporter) 18 | throws IOException { 19 | 20 | int maxValue = Integer.MIN_VALUE; 21 | while (values.hasNext()) { 22 | maxValue = Math.max(maxValue, values.next().get()); 23 | } 24 | output.collect(key, new IntWritable(maxValue)); 25 | } 26 | } 27 | 
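Editor's note: ch02-mr-intro/src/main/java/MaxTemperature.java, the driver run by `hadoop MaxTemperature input/ncdc/sample.txt output` earlier in this listing, is also only present in the tree. The sketch below shows how a driver could wire a mapper together with the new-API MaxTemperatureReducer shown just above; it assumes the standard Job.getInstance() setup and is not the repository's exact source.

// vv MaxTemperatureDriverSketch (illustrative)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxTemperature {

  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("Usage: MaxTemperature <input path> <output path>");
      System.exit(-1);
    }

    Job job = Job.getInstance();
    job.setJarByClass(MaxTemperature.class);
    job.setJobName("Max temperature");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
// ^^ MaxTemperatureDriverSketch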
-------------------------------------------------------------------------------- /ch02-mr-intro/src/main/python/max_temperature_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp, q) = (val[15:19], val[87:92], val[92:93]) 9 | if (temp != "+9999" and re.match("[01459]", q)): 10 | print "%s\t%s" % (year, temp) 11 | -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/python/max_temperature_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, -sys.maxint) 6 | for line in sys.stdin: 7 | (key, val) = line.strip().split("\t") 8 | if last_key and last_key != key: 9 | print "%s\t%s" % (last_key, max_val) 10 | (last_key, max_val) = (key, int(val)) 11 | else: 12 | (last_key, max_val) = (key, max(max_val, int(val))) 13 | 14 | if last_key: 15 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/ruby/max_temperature_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | STDIN.each_line do |line| 4 | val = line 5 | year, temp, q = val[15,4], val[87,5], val[92,1] 6 | puts "#{year}\t#{temp}" if (temp != "+9999" && q =~ /[01459]/) 7 | end -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | last_key, max_val = nil, -1000000 4 | STDIN.each_line do |line| 5 | key, val = line.split("\t") 6 | if last_key && last_key != key 7 | puts "#{last_key}\t#{max_val}" 8 | last_key, max_val = key, val.to_i 9 | else 10 | last_key, max_val = key, [max_val, val.to_i].max 11 | end 12 | end 13 | puts "#{last_key}\t#{max_val}" if last_key -------------------------------------------------------------------------------- /ch02-mr-intro/src/main/sh/max_temp.sh: -------------------------------------------------------------------------------- 1 | : == max_temp_java 2 | : == max_temp_java_output 3 | : == max_temp_ruby_map 4 | : == max_temp_ruby_pipeline 5 | : == max_temp_python_pipeline 6 | rm -r /Users/tom/workspace/htdg/output 7 | : vv max_temp_java 8 | export HADOOP_CLASSPATH=build/classes 9 | hadoop MaxTemperature input/ncdc/sample.txt output 10 | : ^^ max_temp_java 11 | : vv max_temp_java_output 12 | cat output/part-00000 13 | : ^^ max_temp_java_output 14 | : vv max_temp_ruby_map 15 | cat input/ncdc/sample.txt | ch02-mr-intro/src/main/ruby/max_temperature_map.rb 16 | : ^^ max_temp_ruby_map 17 | : vv max_temp_ruby_pipeline 18 | cat input/ncdc/sample.txt | \ 19 | ch02-mr-intro/src/main/ruby/max_temperature_map.rb | \ 20 | sort | ch02-mr-intro/src/main/ruby/max_temperature_reduce.rb 21 | : ^^ max_temp_ruby_pipeline 22 | : vv max_temp_python_pipeline 23 | cat input/ncdc/sample.txt | \ 24 | ch02-mr-intro/src/main/python/max_temperature_map.py | \ 25 | sort | ch02-mr-intro/src/main/python/max_temperature_reduce.py 26 | : ^^ max_temp_python_pipeline 27 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | 
2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/FileSystemCat.java: -------------------------------------------------------------------------------- 1 | // cc FileSystemCat Displays files from a Hadoop filesystem on standard output by using the FileSystem directly 2 | import java.io.InputStream; 3 | import java.net.URI; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | 10 | // vv FileSystemCat 11 | public class FileSystemCat { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | InputStream in = null; 18 | try { 19 | in = fs.open(new Path(uri)); 20 | IOUtils.copyBytes(in, System.out, 4096, false); 21 | } finally { 22 | IOUtils.closeStream(in); 23 | } 24 | } 25 | } 26 | // ^^ FileSystemCat 27 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/FileSystemDoubleCat.java: -------------------------------------------------------------------------------- 1 | // cc FileSystemDoubleCat Displays files from a Hadoop filesystem on standard output twice, by using seek 2 | import java.net.URI; 3 | 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FSDataInputStream; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | 10 | // vv FileSystemDoubleCat 11 | public class FileSystemDoubleCat { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | FSDataInputStream in = null; 18 | try { 19 | in = fs.open(new Path(uri)); 20 | IOUtils.copyBytes(in, System.out, 4096, false); 21 | in.seek(0); // go back to the start of the file 22 | IOUtils.copyBytes(in, System.out, 4096, false); 23 | } finally { 24 | IOUtils.closeStream(in); 25 | } 26 | } 27 | } 28 | // ^^ FileSystemDoubleCat 29 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/ListStatus.java: -------------------------------------------------------------------------------- 1 | // cc ListStatus Shows the file statuses for a collection of paths in a Hadoop filesystem 2 | import java.net.URI; 3 | 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileStatus; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.FileUtil; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | // vv ListStatus 11 | public class ListStatus { 12 | 13 | public static void main(String[] args) throws Exception { 14 | String uri = args[0]; 15 | Configuration conf = new Configuration(); 16 | FileSystem fs = FileSystem.get(URI.create(uri), conf); 17 | 18 | Path[] paths = new Path[args.length]; 19 | for (int i = 0; i < paths.length; i++) { 20 | paths[i] = new Path(args[i]); 21 | } 22 | 23 | FileStatus[] status = fs.listStatus(paths); 24 | Path[] 
listedPaths = FileUtil.stat2Paths(status); 25 | for (Path p : listedPaths) { 26 | System.out.println(p); 27 | } 28 | } 29 | } 30 | // ^^ ListStatus 31 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/RegexExcludePathFilter.java: -------------------------------------------------------------------------------- 1 | // cc RegexExcludePathFilter A PathFilter for excluding paths that match a regular expression 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.fs.PathFilter; 4 | 5 | // vv RegexExcludePathFilter 6 | public class RegexExcludePathFilter implements PathFilter { 7 | 8 | private final String regex; 9 | 10 | public RegexExcludePathFilter(String regex) { 11 | this.regex = regex; 12 | } 13 | 14 | public boolean accept(Path path) { 15 | return !path.toString().matches(regex); 16 | } 17 | } 18 | // ^^ RegexExcludePathFilter 19 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/RegexPathFilter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.fs.Path; 2 | import org.apache.hadoop.fs.PathFilter; 3 | 4 | public class RegexPathFilter implements PathFilter { 5 | 6 | private final String regex; 7 | private final boolean include; 8 | 9 | public RegexPathFilter(String regex) { 10 | this(regex, true); 11 | } 12 | 13 | public RegexPathFilter(String regex, boolean include) { 14 | this.regex = regex; 15 | this.include = include; 16 | } 17 | 18 | public boolean accept(Path path) { 19 | return (path.toString().matches(regex)) ? include : !include; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/java/URLCat.java: -------------------------------------------------------------------------------- 1 | // cc URLCat Displays files from a Hadoop filesystem on standard output using a URLStreamHandler 2 | import java.io.InputStream; 3 | import java.net.URL; 4 | 5 | import org.apache.hadoop.fs.FsUrlStreamHandlerFactory; 6 | import org.apache.hadoop.io.IOUtils; 7 | 8 | // vv URLCat 9 | public class URLCat { 10 | 11 | static { 12 | URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); 13 | } 14 | 15 | public static void main(String[] args) throws Exception { 16 | InputStream in = null; 17 | try { 18 | in = new URL(args[0]).openStream(); 19 | IOUtils.copyBytes(in, System.out, 4096, false); 20 | } finally { 21 | IOUtils.closeStream(in); 22 | } 23 | } 24 | } 25 | // ^^ URLCat 26 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/sh/file.sh: -------------------------------------------------------------------------------- 1 | : == url_cat 2 | : == filesystem_cat 3 | : == filesystem_double_cat 4 | : == list_status 5 | : == file_copy_with_progress 6 | rm -r /Users/tom/workspace/htdg/output 7 | export HADOOP_CLASSPATH=build/classes 8 | : vv url_cat 9 | hadoop URLCat hdfs://localhost/user/tom/quangle.txt 10 | : ^^ url_cat 11 | : vv filesystem_cat 12 | hadoop FileSystemCat hdfs://localhost/user/tom/quangle.txt 13 | : ^^ filesystem_cat 14 | : vv filesystem_double_cat 15 | hadoop FileSystemDoubleCat hdfs://localhost/user/tom/quangle.txt 16 | : ^^ filesystem_double_cat 17 | : vv list_status 18 | hadoop ListStatus hdfs://localhost/ hdfs://localhost/user/tom 19 | : ^^ list_status 20 | : vv file_copy_with_progress 21 | hadoop FileCopyWithProgress input/docs/1400-8.txt 
hdfs://localhost/user/tom/1400-8.txt 22 | : ^^ file_copy_with_progress 23 | 24 | 25 | -------------------------------------------------------------------------------- /ch03-hdfs/src/main/sh/hars.sh: -------------------------------------------------------------------------------- 1 | : == har_ls_files 2 | : == har_create 3 | : == har_inspect 4 | : == har_ls 5 | : == har_ls_long 6 | : == har_rmr 7 | rsync -avz --exclude '.svn' /Users/tom/workspace/htdg/input/fileinput/ /tmp/fileinput 8 | hadoop fs -copyFromLocal /tmp/fileinput /my/files 9 | rm -rf /tmp/fileinput 10 | : vv har_ls_files 11 | hadoop fs -lsr /my/files 12 | : ^^ har_ls_files 13 | : vv har_create 14 | hadoop archive -archiveName files.har /my/files /my 15 | : ^^ har_create 16 | : vv har_inspect 17 | hadoop fs -ls /my 18 | hadoop fs -ls /my/files.har 19 | : ^^ har_inspect 20 | : vv har_ls 21 | hadoop fs -lsr har:///my/files.har 22 | : ^^ har_ls 23 | : vv har_ls_long 24 | hadoop fs -lsr har:///my/files.har/my/files/dir 25 | hadoop fs -lsr har://hdfs-localhost:8020/my/files.har/my/files/dir 26 | : ^^ har_ls_long 27 | : vv har_rmr 28 | hadoop fs -rmr /my/files.har 29 | : ^^ har_rmr 30 | hadoop fs -rmr /my/files -------------------------------------------------------------------------------- /ch04-yarn/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.scheduler.capacity.root.queues 5 | prod,dev 6 | 7 | 8 | yarn.scheduler.capacity.root.dev.queues 9 | eng,science 10 | 11 | 12 | yarn.scheduler.capacity.root.prod.capacity 13 | 40 14 | 15 | 16 | yarn.scheduler.capacity.root.dev.capacity 17 | 60 18 | 19 | 20 | yarn.scheduler.capacity.root.dev.maximum-capacity 21 | 75 22 | 23 | 24 | yarn.scheduler.capacity.root.dev.eng.capacity 25 | 50 26 | 27 | 28 | yarn.scheduler.capacity.root.dev.science.capacity 29 | 50 30 | 31 | -------------------------------------------------------------------------------- /ch04-yarn/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fair 4 | 5 | 6 | 40 7 | fifo 8 | 9 | 10 | 11 | 60 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /ch05-io/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch05-io 12 | jar 13 | 4.0 14 | Chapter 5: Hadoop I/O 15 | 16 | 17 | com.hadoopbook 18 | ch02-mr-intro 19 | 4.0 20 | 21 | 22 | junit 23 | junit 24 | 25 | 26 | org.hamcrest 27 | hamcrest-all 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /ch05-io/src/main/examples/FileDecompressor.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop FileDecompressor file.gz -------------------------------------------------------------------------------- /ch05-io/src/main/examples/MapFile-data-head.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text numbers.map/data | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/MapFile-data-head.output.txt: -------------------------------------------------------------------------------- 1 | 1 One, two, buckle my shoe 2 | 2 Three, four, shut the door 3 | 3 Five, six, pick up sticks 4 | 4 Seven, eight, lay them straight 
5   Nine, ten, a big fat hen
6   One, two, buckle my shoe
7   Three, four, shut the door
8   Five, six, pick up sticks
9   Seven, eight, lay them straight
10  Nine, ten, a big fat hen
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.input.txt:
--------------------------------------------------------------------------------
hadoop fs -text numbers.map/index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-index.output.txt:
--------------------------------------------------------------------------------
1    128
129  6079
257  12054
385  18030
513  24002
641  29976
769  35947
897  41922
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.input.txt:
--------------------------------------------------------------------------------
ls -l numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFile-ls.output.txt:
--------------------------------------------------------------------------------
total 104
-rw-r--r--  1 tom  tom  47898 Jul 29 22:06 data
-rw-r--r--  1 tom  tom    251 Jul 29 22:06 index
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MapFileWriteDemo.java.input.txt:
--------------------------------------------------------------------------------
hadoop MapFileWriteDemo numbers.map
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/main/examples/MaxTemperatureWithCompression/output/part-r-00000.gz
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/input.txt:
--------------------------------------------------------------------------------
hadoop MaxTemperatureWithMapOutputCompression input/ncdc/sample.txt.gz output
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/MaxTemperatureWithMapOutputCompression.ignore/output/part-r-00000:
--------------------------------------------------------------------------------
1949  111
1950  22
--------------------------------------------------------------------------------
/ch05-io/src/main/examples/SequenceFileMapReduceSort.java.input.txt:
--------------------------------------------------------------------------------
hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
  -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  -outKey org.apache.hadoop.io.IntWritable \
  -outValue org.apache.hadoop.io.Text \
  numbers.seq sorted
--------------------------------------------------------------------------------
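Editor's note: the sort command above consumes the numbers.seq file written by ch05-io/src/main/java/SequenceFileWriteDemo.java (invoked as `hadoop SequenceFileWriteDemo numbers.seq` later in this listing), whose source is not reproduced in this excerpt. The sketch below shows one way to produce the pattern visible in the outputs — IntWritable keys counting down from 100 paired with Text nursery-rhyme lines. The class name SequenceFileWriteSketch is hypothetical, and the FileSystem-based createWriter signature is an assumption about the demo's approach, not the repository's exact code.

// vv SequenceFileWriteSketch (illustrative)
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileWriteSketch {

  private static final String[] DATA = {
    "One, two, buckle my shoe",
    "Three, four, shut the door",
    "Five, six, pick up sticks",
    "Seven, eight, lay them straight",
    "Nine, ten, a big fat hen"
  };

  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path,
          key.getClass(), value.getClass());
      for (int i = 0; i < 100; i++) {
        key.set(100 - i);                  // keys count down from 100
        value.set(DATA[i % DATA.length]);  // values cycle through the rhyme
        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
}
// ^^ SequenceFileWriteSketch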
/ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text sorted/part-00000 | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.output.txt: -------------------------------------------------------------------------------- 1 | 1 Nine, ten, a big fat hen 2 | 2 Seven, eight, lay them straight 3 | 3 Five, six, pick up sticks 4 | 4 Three, four, shut the door 5 | 5 One, two, buckle my shoe 6 | 6 Nine, ten, a big fat hen 7 | 7 Seven, eight, lay them straight 8 | 8 Five, six, pick up sticks 9 | 9 Three, four, shut the door 10 | 10 One, two, buckle my shoe -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileMapReduceSortResults.java.pre.sh: -------------------------------------------------------------------------------- 1 | # Produce sorted seq file 2 | hadoop SequenceFileWriteDemo numbers.seq 3 | 4 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \ 5 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \ 6 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \ 7 | -outKey org.apache.hadoop.io.IntWritable \ 8 | -outValue org.apache.hadoop.io.Text \ 9 | numbers.seq sorted -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop SequenceFileReadDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.output.txt: -------------------------------------------------------------------------------- 1 | [128] 100 One, two, buckle my shoe 2 | [173] 99 Three, four, shut the door 3 | [220] 98 Five, six, pick up sticks 4 | [264] 97 Seven, eight, lay them straight 5 | [314] 96 Nine, ten, a big fat hen 6 | [359] 95 One, two, buckle my shoe 7 | [404] 94 Three, four, shut the door 8 | [451] 93 Five, six, pick up sticks 9 | [495] 92 Seven, eight, lay them straight 10 | [545] 91 Nine, ten, a big fat hen 11 | [590] 90 One, two, buckle my shoe 12 | ... 13 | [1976] 60 One, two, buckle my shoe 14 | [2021*] 59 Three, four, shut the door 15 | [2088] 58 Five, six, pick up sticks 16 | [2132] 57 Seven, eight, lay them straight 17 | [2182] 56 Nine, ten, a big fat hen 18 | ... 
19 | [4557] 5 One, two, buckle my shoe 20 | [4602] 4 Three, four, shut the door 21 | [4649] 3 Five, six, pick up sticks 22 | [4693] 2 Seven, eight, lay them straight 23 | [4743] 1 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileReadDemo.java.pre.sh: -------------------------------------------------------------------------------- 1 | # Make sure file is there to be read 2 | hadoop SequenceFileWriteDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-fix.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop MapFileFixer numbers.map -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-mv.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -mv numbers.map/part-00000 numbers.map/data -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileToMapFileConverter-sort.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \ 2 | -inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \ 3 | -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \ 4 | -outKey org.apache.hadoop.io.IntWritable \ 5 | -outValue org.apache.hadoop.io.Text \ 6 | numbers.seq numbers.map -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileWriteDemo.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop SequenceFileWriteDemo numbers.seq -------------------------------------------------------------------------------- /ch05-io/src/main/examples/SequenceFileWriteDemo.java.output.txt: -------------------------------------------------------------------------------- 1 | [128] 100 One, two, buckle my shoe 2 | [173] 99 Three, four, shut the door 3 | [220] 98 Five, six, pick up sticks 4 | [264] 97 Seven, eight, lay them straight 5 | [314] 96 Nine, ten, a big fat hen 6 | [359] 95 One, two, buckle my shoe 7 | [404] 94 Three, four, shut the door 8 | [451] 93 Five, six, pick up sticks 9 | [495] 92 Seven, eight, lay them straight 10 | [545] 91 Nine, ten, a big fat hen 11 | ... 12 | [1976] 60 One, two, buckle my shoe 13 | [2021] 59 Three, four, shut the door 14 | [2088] 58 Five, six, pick up sticks 15 | [2132] 57 Seven, eight, lay them straight 16 | [2182] 56 Nine, ten, a big fat hen 17 | ... 
18 | [4557] 5 One, two, buckle my shoe 19 | [4602] 4 Three, four, shut the door 20 | [4649] 3 Five, six, pick up sticks 21 | [4693] 2 Seven, eight, lay them straight 22 | [4743] 1 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/examples/StreamCompressor.java.input.txt: -------------------------------------------------------------------------------- 1 | echo "Text" | hadoop StreamCompressor org.apache.hadoop.io.compress.GzipCodec \ 2 | | gunzip - -------------------------------------------------------------------------------- /ch05-io/src/main/examples/StreamCompressor.java.output.txt: -------------------------------------------------------------------------------- 1 | Text -------------------------------------------------------------------------------- /ch05-io/src/main/examples/TextIterator.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop TextIterator -------------------------------------------------------------------------------- /ch05-io/src/main/examples/TextIterator.java.output.txt: -------------------------------------------------------------------------------- 1 | 41 2 | df 3 | 6771 4 | 10400 -------------------------------------------------------------------------------- /ch05-io/src/main/examples/hadoop-fs-text.input.txt: -------------------------------------------------------------------------------- 1 | hadoop fs -text numbers.seq | head -------------------------------------------------------------------------------- /ch05-io/src/main/examples/hadoop-fs-text.output.txt: -------------------------------------------------------------------------------- 1 | 100 One, two, buckle my shoe 2 | 99 Three, four, shut the door 3 | 98 Five, six, pick up sticks 4 | 97 Seven, eight, lay them straight 5 | 96 Nine, ten, a big fat hen 6 | 95 One, two, buckle my shoe 7 | 94 Three, four, shut the door 8 | 93 Five, six, pick up sticks 9 | 92 Seven, eight, lay them straight 10 | 91 Nine, ten, a big fat hen -------------------------------------------------------------------------------- /ch05-io/src/main/java/StreamCompressor.java: -------------------------------------------------------------------------------- 1 | // cc StreamCompressor A program to compress data read from standard input and write it to standard output 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.io.IOUtils; 4 | import org.apache.hadoop.io.compress.CompressionCodec; 5 | import org.apache.hadoop.io.compress.CompressionOutputStream; 6 | import org.apache.hadoop.util.ReflectionUtils; 7 | 8 | // vv StreamCompressor 9 | public class StreamCompressor { 10 | 11 | public static void main(String[] args) throws Exception { 12 | String codecClassname = args[0]; 13 | Class codecClass = Class.forName(codecClassname); 14 | Configuration conf = new Configuration(); 15 | CompressionCodec codec = (CompressionCodec) 16 | ReflectionUtils.newInstance(codecClass, conf); 17 | 18 | CompressionOutputStream out = codec.createOutputStream(System.out); 19 | IOUtils.copyBytes(System.in, out, 4096, false); 20 | out.finish(); 21 | } 22 | } 23 | // ^^ StreamCompressor 24 | -------------------------------------------------------------------------------- /ch05-io/src/main/java/TextArrayWritable.java: -------------------------------------------------------------------------------- 1 | // == TextArrayWritable 2 | import org.apache.hadoop.io.ArrayWritable; 3 | import org.apache.hadoop.io.Text; 4 | 5 | // 
vv TextArrayWritable 6 | public class TextArrayWritable extends ArrayWritable { 7 | public TextArrayWritable() { 8 | super(Text.class); 9 | } 10 | } 11 | // ^^ TextArrayWritable 12 | -------------------------------------------------------------------------------- /ch05-io/src/main/java/TextIterator.java: -------------------------------------------------------------------------------- 1 | // cc TextIterator Iterating over the characters in a Text object 2 | import java.nio.ByteBuffer; 3 | 4 | import org.apache.hadoop.io.Text; 5 | 6 | // vv TextIterator 7 | public class TextIterator { 8 | 9 | public static void main(String[] args) { 10 | Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00"); 11 | 12 | ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength()); 13 | int cp; 14 | while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) { 15 | System.out.println(Integer.toHexString(cp)); 16 | } 17 | } 18 | } 19 | // ^^ TextIterator 20 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/ArrayWritableTest.java: -------------------------------------------------------------------------------- 1 | // == ArrayWritableTest 2 | import static org.hamcrest.CoreMatchers.is; 3 | import static org.junit.Assert.assertThat; 4 | 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.*; 7 | import org.junit.Test; 8 | 9 | public class ArrayWritableTest extends WritableTestBase { 10 | 11 | @Test 12 | public void test() throws IOException { 13 | // vv ArrayWritableTest 14 | ArrayWritable writable = new ArrayWritable(Text.class); 15 | // ^^ ArrayWritableTest 16 | writable.set(new Text[] { new Text("cat"), new Text("dog") }); 17 | 18 | TextArrayWritable dest = new TextArrayWritable(); 19 | WritableUtils.cloneInto(dest, writable); 20 | assertThat(dest.get().length, is(2)); 21 | // TODO: fix cast, also use single assert 22 | assertThat((Text) dest.get()[0], is(new Text("cat"))); 23 | assertThat((Text) dest.get()[1], is(new Text("dog"))); 24 | 25 | Text[] copy = (Text[]) dest.toArray(); 26 | assertThat(copy[0], is(new Text("cat"))); 27 | assertThat(copy[1], is(new Text("dog"))); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BinaryOrTextWritable.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.io.BytesWritable; 2 | import org.apache.hadoop.io.GenericWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.io.Writable; 5 | 6 | public class BinaryOrTextWritable extends GenericWritable { 7 | private static Class[] TYPES = { BytesWritable.class, Text.class }; 8 | 9 | @Override 10 | @SuppressWarnings("unchecked") 11 | protected Class[] getTypes() { 12 | return TYPES; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BooleanWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.BooleanWritable; 6 | import org.junit.Test; 7 | 8 | public class BooleanWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | BooleanWritable src = new BooleanWritable(true); 13 | BooleanWritable dest = new BooleanWritable(); 14 | assertThat(writeTo(src, dest), is("01")); 15 | 
assertThat(dest.get(), is(src.get())); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/BytesWritableTest.java: -------------------------------------------------------------------------------- 1 | // == BytesWritableTest 2 | // == BytesWritableTest-Capacity 3 | import static org.hamcrest.CoreMatchers.is; 4 | import static org.junit.Assert.assertThat; 5 | 6 | import java.io.IOException; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.util.StringUtils; 9 | import org.junit.Test; 10 | 11 | public class BytesWritableTest extends WritableTestBase { 12 | 13 | @Test 14 | public void test() throws IOException { 15 | // vv BytesWritableTest 16 | BytesWritable b = new BytesWritable(new byte[] { 3, 5 }); 17 | byte[] bytes = serialize(b); 18 | assertThat(StringUtils.byteToHexString(bytes), is("000000020305")); 19 | // ^^ BytesWritableTest 20 | 21 | // vv BytesWritableTest-Capacity 22 | b.setCapacity(11); 23 | assertThat(b.getLength(), is(2)); 24 | assertThat(b.getBytes().length, is(11)); 25 | // ^^ BytesWritableTest-Capacity 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/FileDecompressorTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.*; 5 | import java.util.Scanner; 6 | import org.apache.hadoop.fs.FileUtil; 7 | import org.apache.hadoop.io.IOUtils; 8 | import org.junit.Test; 9 | 10 | public class FileDecompressorTest { 11 | 12 | @Test 13 | public void decompressesGzippedFile() throws Exception { 14 | File file = File.createTempFile("file", ".gz"); 15 | file.deleteOnExit(); 16 | InputStream in = this.getClass().getResourceAsStream("/file.gz"); 17 | IOUtils.copyBytes(in, new FileOutputStream(file), 4096, true); 18 | 19 | String path = file.getAbsolutePath(); 20 | FileDecompressor.main(new String[] { path }); 21 | 22 | String decompressedPath = path.substring(0, path.length() - 3); 23 | assertThat(readFile(new File(decompressedPath)), is("Text\n")); 24 | } 25 | 26 | private String readFile(File file) throws IOException { 27 | return new Scanner(file).useDelimiter("\\A").next(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/GenericWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.*; 6 | import org.junit.Test; 7 | 8 | public class GenericWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | BinaryOrTextWritable src = new BinaryOrTextWritable(); 13 | src.set(new Text("text")); 14 | BinaryOrTextWritable dest = new BinaryOrTextWritable(); 15 | WritableUtils.cloneInto(dest, src); 16 | assertThat((Text) dest.get(), is(new Text("text"))); 17 | 18 | src.set(new BytesWritable(new byte[] {3, 5})); 19 | WritableUtils.cloneInto(dest, src); 20 | assertThat(((BytesWritable) dest.get()).getLength(), is(2)); // TODO proper assert 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/NullWritableTest.java: -------------------------------------------------------------------------------- 1 
| import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.junit.Test; 7 | 8 | public class NullWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | NullWritable writable = NullWritable.get(); 13 | assertThat(serialize(writable).length, is(0)); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/ObjectWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.*; 6 | import org.junit.Test; 7 | 8 | public class ObjectWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | ObjectWritable src = new ObjectWritable(Integer.TYPE, 163); 13 | ObjectWritable dest = new ObjectWritable(); 14 | WritableUtils.cloneInto(dest, src); 15 | assertThat((Integer) dest.get(), is(163)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ch05-io/src/test/java/VLongWritableTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | 4 | import java.io.IOException; 5 | import org.apache.hadoop.io.VLongWritable; 6 | import org.junit.Test; 7 | 8 | public class VLongWritableTest extends WritableTestBase { 9 | 10 | @Test 11 | public void test() throws IOException { 12 | assertThat(serializeToString(new VLongWritable(1)), is("01")); // 1 byte 13 | assertThat(serializeToString(new VLongWritable(127)), is("7f")); // 1 byte 14 | assertThat(serializeToString(new VLongWritable(128)), is("8f80")); // 2 byte 15 | assertThat(serializeToString(new VLongWritable(163)), is("8fa3")); // 2 byte 16 | assertThat(serializeToString(new VLongWritable(Long.MAX_VALUE)), is("887fffffffffffffff")); // 9 byte 17 | assertThat(serializeToString(new VLongWritable(Long.MIN_VALUE)), is("807fffffffffffffff")); // 9 byte 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /ch05-io/src/test/resources/file.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch05-io/src/test/resources/file.gz -------------------------------------------------------------------------------- /ch06-mr-dev/input/ncdc/micro/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /ch06-mr-dev/output/._SUCCESS.crc: 
-------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /ch06-mr-dev/output/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc=)|$ -------------------------------------------------------------------------------- /ch06-mr-dev/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch06-mr-dev/output/_SUCCESS -------------------------------------------------------------------------------- /ch06-mr-dev/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch06-mr-dev 12 | jar 13 | 4.0 14 | Chapter 6: Developing a MapReduce Application 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterSystem.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop -Dcolor=yellow ConfigurationPrinter | grep color -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \ 2 | | grep mapred.job.tracker= -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConf.java.output.txt: -------------------------------------------------------------------------------- 1 | mapred.job.tracker=localhost:8021 2 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithConfAndD.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -conf conf/hadoop-localhost.xml \ 2 | -D mapred.job.tracker=example.com:8021 \ 3 | | grep mapred.job.tracker -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop ConfigurationPrinter -D color=yellow | grep color -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/ConfigurationPrinterWithD.java.output.txt: -------------------------------------------------------------------------------- 1 | color=yellow 2 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriver.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar v3.MaxTemperatureDriver -conf conf/hadoop-cluster.xml \ 2 | input/ncdc/all max-temp -------------------------------------------------------------------------------- 
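The ConfigurationPrinter transcripts above rely on the driver being run through ToolRunner, which applies GenericOptionsParser so that -conf files and -D key=value overrides are merged into the Configuration before run() is called. A minimal sketch of such a Tool follows; the class name ConfigurationPrinterSketch is assumed here, and this is not the repo's ConfigurationPrinter verbatim.

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ConfigurationPrinterSketch extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    // getConf() already reflects any -conf and -D options from the command line
    Configuration conf = getConf();
    for (Map.Entry<String, String> entry : conf) {   // Configuration is Iterable
      System.out.printf("%s=%s%n", entry.getKey(), entry.getValue());
    }
    return 0;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new ConfigurationPrinterSketch(), args));
  }
}

Run it exactly as in the transcripts above, e.g. with -D color=yellow, and grep the output for the property of interest.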
/ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v2.MaxTemperatureDriver -conf conf/hadoop-local.xml \ 2 | input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV2GOP.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v2.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v3.MaxTemperatureDriver -conf conf/hadoop-local.xml \ 2 | input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/input.txt: -------------------------------------------------------------------------------- 1 | hadoop v3.MaxTemperatureDriver -fs file:/// -jt local input/ncdc/micro output -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/examples/MaxTemperatureDriverV3GOP/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/LoggingIdentityMapper.java: -------------------------------------------------------------------------------- 1 | //cc LoggingIdentityMapper An identity mapper that writes to standard output and also uses the Apache Commons Logging API 2 | import java.io.IOException; 3 | 4 | //vv LoggingIdentityMapper 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class LoggingIdentityMapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> 10 | extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> { 11 | 12 | private static final Log LOG = LogFactory.getLog(LoggingIdentityMapper.class); 13 | 14 | @Override 15 | @SuppressWarnings("unchecked") 16 | public void map(KEYIN key, VALUEIN value, Context context) 17 | throws IOException, InterruptedException { 18 | // Log to stdout file 19 | System.out.println("Map key: " + key); 20 | 21 | // Log to syslog file 22 | LOG.info("Map key: " + key); 23 | if (LOG.isDebugEnabled()) { 24 | LOG.debug("Map value: " + value); 25 | } 26 | context.write((KEYOUT) key, (VALUEOUT) value); 27 | } 28 | } 29 | //^^ LoggingIdentityMapper -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v1/MaxTemperatureMapper.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | // cc MaxTemperatureMapperV1 First version of a Mapper that passes MaxTemperatureMapperTest 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.*; 5 | import org.apache.hadoop.mapreduce.*; 6 | //vv MaxTemperatureMapperV1 7 | public class MaxTemperatureMapper 8 | extends Mapper<LongWritable, Text, Text, IntWritable> { 9 | 10 | @Override 11 | public void map(LongWritable key, Text value, Context context) 12 | throws
IOException, InterruptedException { 13 | 14 | String line = value.toString(); 15 | String year = line.substring(15, 19); 16 | int airTemperature = Integer.parseInt(line.substring(87, 92)); 17 | context.write(new Text(year), new IntWritable(airTemperature)); 18 | } 19 | } 20 | //^^ MaxTemperatureMapperV1 21 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v1/MaxTemperatureReducer.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | //cc MaxTemperatureReducerV1 Reducer for maximum temperature example 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | // vv MaxTemperatureReducerV1 10 | public class MaxTemperatureReducer 11 | extends Reducer<Text, IntWritable, Text, IntWritable> { 12 | 13 | @Override 14 | public void reduce(Text key, Iterable<IntWritable> values, 15 | Context context) 16 | throws IOException, InterruptedException { 17 | 18 | int maxValue = Integer.MIN_VALUE; 19 | for (IntWritable value : values) { 20 | maxValue = Math.max(maxValue, value.get()); 21 | } 22 | context.write(key, new IntWritable(maxValue)); 23 | } 24 | } 25 | // ^^ MaxTemperatureReducerV1 26 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/java/v2/MaxTemperatureMapper.java: -------------------------------------------------------------------------------- 1 | package v2; 2 | // cc MaxTemperatureMapperV2 A Mapper that uses a utility class to parse records 3 | 4 | import java.io.IOException; 5 | 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import v2.NcdcRecordParser; 11 | 12 | // vv MaxTemperatureMapperV2 13 | public class MaxTemperatureMapper 14 | extends Mapper<LongWritable, Text, Text, IntWritable> { 15 | 16 | /*[*/private NcdcRecordParser parser = new NcdcRecordParser();/*]*/ 17 | 18 | @Override 19 | public void map(LongWritable key, Text value, Context context) 20 | throws IOException, InterruptedException { 21 | 22 | /*[*/parser.parse(value);/*]*/ 23 | if (/*[*/parser.isValidTemperature()/*]*/) { 24 | context.write(new Text(/*[*/parser.getYear()/*]*/), 25 | new IntWritable(/*[*/parser.getAirTemperature()/*]*/)); 26 | } 27 | } 28 | } 29 | // ^^ MaxTemperatureMapperV2 30 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/configuration-1.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <configuration> 3 | <property> 4 | <name>color</name> 5 | <value>yellow</value> 6 | <description>Color</description> 7 | </property> 8 | 9 | <property> 10 | <name>size</name> 11 | <value>10</value> 12 | <description>Size</description> 13 | </property> 14 | 15 | <property> 16 | <name>weight</name> 17 | <value>heavy</value> 18 | <final>true</final> 19 | <description>Weight</description> 20 | </property> 21 | 22 | <property> 23 | <name>size-weight</name> 24 | <value>${size},${weight}</value> 25 | <description>Size and weight</description> 26 | </property> 27 | </configuration> -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/configuration-2.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <configuration> 3 | <property> 4 | <name>size</name> 5 | <value>12</value> 6 | </property> 7 | 8 | <property> 9 | <name>weight</name> 10 | <value>light</value> 11 | </property> 12 | </configuration> -------------------------------------------------------------------------------- /ch06-mr-dev/src/main/resources/max-temp-workflow.properties: -------------------------------------------------------------------------------- 1 | # A properties file used to submit an Oozie workflow job. 2 | # This file is not bundled as a part of the workflow application.
3 | nameNode=hdfs://localhost:8020 4 | resourceManager=localhost:8032 5 | oozie.wf.application.path=${nameNode}/user/${user.name}/max-temp-workflow -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/java/SingleResourceConfigurationTest.java: -------------------------------------------------------------------------------- 1 | // == SingleResourceConfigurationTest 2 | import static org.hamcrest.CoreMatchers.is; 3 | import static org.junit.Assert.assertThat; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.junit.Test; 9 | 10 | public class SingleResourceConfigurationTest { 11 | 12 | @Test 13 | public void get() throws IOException { 14 | // vv SingleResourceConfigurationTest 15 | Configuration conf = new Configuration(); 16 | conf.addResource("configuration-1.xml"); 17 | assertThat(conf.get("color"), is("yellow")); 18 | assertThat(conf.getInt("size", 0), is(10)); 19 | assertThat(conf.get("breadth", "wide"), is("wide")); 20 | // ^^ SingleResourceConfigurationTest 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/java/v1/MaxTemperatureReducerTest.java: -------------------------------------------------------------------------------- 1 | package v1; 2 | // == MaxTemperatureReducerTestV1 3 | import java.io.IOException; 4 | import java.util.*; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mrunit.mapreduce.ReduceDriver; 7 | import org.junit.*; 8 | 9 | public class MaxTemperatureReducerTest { 10 | 11 | //vv MaxTemperatureReducerTestV1 12 | @Test 13 | public void returnsMaximumIntegerInValues() throws IOException, 14 | InterruptedException { 15 | new ReduceDriver() 16 | .withReducer(new MaxTemperatureReducer()) 17 | .withInput(new Text("1950"), 18 | Arrays.asList(new IntWritable(10), new IntWritable(5))) 19 | .withOutput(new Text("1950"), new IntWritable(10)) 20 | .runTest(); 21 | } 22 | //^^ MaxTemperatureReducerTestV1 23 | } 24 | -------------------------------------------------------------------------------- /ch06-mr-dev/src/test/resources/expected.txt: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MaxTemperatureWithMultipleInputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureWithMultipleInputs input/ncdc/micro/sample.txt input/metoffice output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduce.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduce/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduce "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/MinimalMapReduceWithDefaults/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MinimalMapReduceWithDefaults "input/ncdc/all/190{1,2}.gz" output 
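The MinimalMapReduce and MinimalMapReduceWithDefaults runs above differ only in whether the job's defaults are left implicit or written out. As a reminder of what those defaults are, here is a hedged sketch (class name MinimalDefaultsSketch is assumed; this is not the repo's class verbatim) spelling out the settings a newer-API job falls back to when only input and output paths are supplied.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MinimalDefaultsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance();
    job.setJarByClass(MinimalDefaultsSketch.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Everything below is what the minimal job gets by default anyway
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(Mapper.class);                 // identity mapper
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(HashPartitioner.class);
    job.setNumReduceTasks(1);
    job.setReducerClass(Reducer.class);               // identity reducer

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

This is why the MinimalMapReduce output contains offset/line pairs: the identity mapper and reducer pass the TextInputFormat keys and values straight through.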
-------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputFormat.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar PartitionByStationUsingMultipleOutputFormat 'input/ncdc/all/190?.gz' output-part-by-station -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/2/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationUsingMultipleOutputs/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/2/output/part-r-00000 -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/input.txt: -------------------------------------------------------------------------------- 1 | hadoop PartitionByStationYearUsingMultipleOutputs "input/ncdc/all/190{1,2}.gz" output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch08-mr-types/src/main/examples/PartitionByStationYearUsingMultipleOutputs/output/part-r-00000 
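The PartitionByStationUsingMultipleOutputs runs above produce one output file per weather station. The reducer side of that pattern is sketched below under assumptions (the class name StationMultipleOutputsReducer and the Text/Text input types are placeholders, not the repo's exact signatures): a MultipleOutputs instance is created in setup(), every record is written with the station ID as the base output path, and the instance is closed in cleanup() so the per-station files are flushed.

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class StationMultipleOutputsReducer
    extends Reducer<Text, Text, NullWritable, Text> {

  private MultipleOutputs<NullWritable, Text> multipleOutputs;

  @Override
  protected void setup(Context context) {
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      // The third argument is a base output path, resolved under the job output
      // directory, so each station's records land in their own part file
      multipleOutputs.write(NullWritable.get(), value, key.toString());
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    multipleOutputs.close();
  }
}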
-------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SmallFilesToSequenceFileConverter input/smallfiles outputhadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \ 2 | -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/SmallFilesToSequenceFileConverter.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SmallFilesToSequenceFileConverter \ 2 | -conf conf/hadoop-localhost.xml -D mapred.reduce.tasks=2 input/smallfiles output -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/default_streaming.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -inputformat org.apache.hadoop.mapred.TextInputFormat \ 5 | -mapper /bin/cat \ 6 | -partitioner org.apache.hadoop.mapred.lib.HashPartitioner \ 7 | -numReduceTasks 1 \ 8 | -reducer org.apache.hadoop.mapred.lib.IdentityReducer \ 9 | -outputformat org.apache.hadoop.mapred.TextOutputFormat -------------------------------------------------------------------------------- /ch08-mr-types/src/main/examples/minimal_streaming.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper /bin/cat -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/NonSplittableTextInputFormat.java: -------------------------------------------------------------------------------- 1 | // == NonSplittableTextInputFormat 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.mapreduce.JobContext; 4 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 5 | 6 | public class NonSplittableTextInputFormat extends TextInputFormat { 7 | @Override 8 | protected boolean isSplitable(JobContext context, Path file) { 9 | return false; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/StationPartitioner.java: -------------------------------------------------------------------------------- 1 | // == StationPartitioner 2 | import org.apache.hadoop.io.LongWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | //vv StationPartitioner 7 | public class StationPartitioner extends Partitioner { 8 | 9 | private NcdcRecordParser parser = new NcdcRecordParser(); 10 | 11 | @Override 12 | public int getPartition(LongWritable key, Text value, int numPartitions) { 13 | parser.parse(value); 14 | return getPartition(parser.getStationId()); 15 | } 16 | 17 | private int getPartition(String stationId) { 18 | /*...*/ 19 | // ^^ StationPartitioner 20 | return 0; 21 | // vv StationPartitioner 22 | } 23 | 24 | } 25 | //^^ StationPartitioner -------------------------------------------------------------------------------- 
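StationPartitioner above deliberately leaves the body of its private getPartition(String) elided. Purely as an illustration of one way such a partitioner could be completed (this is not the book's implementation, and the class name is hypothetical), a hash-based version and its job wiring might look like this:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class HashStationPartitioner extends Partitioner<LongWritable, Text> {

  private NcdcRecordParser parser = new NcdcRecordParser();  // same parser as above

  @Override
  public int getPartition(LongWritable key, Text value, int numPartitions) {
    parser.parse(value);
    // Mask the sign bit so the result is always in [0, numPartitions)
    return (parser.getStationId().hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}

// Hypothetical driver wiring:
//   job.setPartitionerClass(HashStationPartitioner.class);
//   job.setNumReduceTasks(numberOfOutputPartitionsYouWant);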
/ch08-mr-types/src/main/java/oldapi/MinimalMapReduce.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapred.*; 6 | import org.apache.hadoop.util.*; 7 | 8 | public class MinimalMapReduce extends Configured implements Tool { 9 | 10 | @Override 11 | public int run(String[] args) throws Exception { 12 | if (args.length != 2) { 13 | System.err.printf("Usage: %s [generic options] \n", 14 | getClass().getSimpleName()); 15 | ToolRunner.printGenericCommandUsage(System.err); 16 | return -1; 17 | } 18 | 19 | JobConf conf = new JobConf(getConf(), getClass()); 20 | FileInputFormat.addInputPath(conf, new Path(args[0])); 21 | FileOutputFormat.setOutputPath(conf, new Path(args[1])); 22 | JobClient.runJob(conf); 23 | return 0; 24 | } 25 | 26 | public static void main(String[] args) throws Exception { 27 | int exitCode = ToolRunner.run(new MinimalMapReduce(), args); 28 | System.exit(exitCode); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/NonSplittableTextInputFormat.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.fs.*; 4 | import org.apache.hadoop.mapred.TextInputFormat; 5 | 6 | public class NonSplittableTextInputFormat extends TextInputFormat { 7 | @Override 8 | protected boolean isSplitable(FileSystem fs, Path file) { 9 | return false; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/StationPartitioner.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.io.*; 4 | import org.apache.hadoop.mapred.*; 5 | 6 | public class StationPartitioner implements Partitioner { 7 | 8 | private NcdcRecordParser parser = new NcdcRecordParser(); 9 | 10 | @Override 11 | public int getPartition(LongWritable key, Text value, int numPartitions) { 12 | parser.parse(value); 13 | return getPartition(parser.getStationId()); 14 | } 15 | 16 | private int getPartition(String stationId) { 17 | return 0; 18 | } 19 | 20 | @Override 21 | public void configure(JobConf conf) { } 22 | } 23 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/java/oldapi/WholeFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.fs.*; 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class WholeFileInputFormat 9 | extends FileInputFormat { 10 | 11 | @Override 12 | protected boolean isSplitable(FileSystem fs, Path filename) { 13 | return false; 14 | } 15 | 16 | @Override 17 | public RecordReader getRecordReader( 18 | InputSplit split, JobConf job, Reporter reporter) throws IOException { 19 | 20 | return new WholeFileRecordReader((FileSplit) split, job); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ch08-mr-types/src/main/sh/streaming.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -input input/ncdc/sample.txt \ 3 | -output output \ 4 | -mapper 
/bin/cat 5 | 6 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 7 | -input input/ncdc/sample.txt \ 8 | -output output \ 9 | -inputformat org.apache.hadoop.mapred.TextInputFormat \ 10 | -mapper /bin/cat \ 11 | -partitioner org.apache.hadoop.mapred.lib.HashPartitioner \ 12 | -numReduceTasks 1 \ 13 | -reducer org.apache.hadoop.mapred.lib.IdentityReducer \ 14 | -outputformat org.apache.hadoop.mapred.TextOutputFormat -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/2/input.txt: -------------------------------------------------------------------------------- 1 | hadoop JoinRecordWithStationName input/ncdc/sample.txt input/ncdc/metadata/stations-fixed-width.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/2/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 011990-99999 SIHCCAJAVRI 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 2 | 011990-99999 SIHCCAJAVRI 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 011990-99999 SIHCCAJAVRI 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 4 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 5 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/input.txt: -------------------------------------------------------------------------------- 1 | hadoop JoinRecordWithStationName input/ncdc/sample.txt input/ncdc/metadata/stations-fixed-width.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/JoinRecordWithStationName/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 011990-99999 SIHCCAJAVRI 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 011990-99999 SIHCCAJAVRI 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 011990-99999 SIHCCAJAVRI 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 012650-99999 TYNSET-HANSMOEN 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordByTemperature.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar LookupRecordByTemperature output-hashmapsort -100 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordByTemperature.java.output.txt: 
-------------------------------------------------------------------------------- 1 | 357460-99999 1956 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordsByTemperature.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar LookupRecordsByTemperature output-hashmapsort -100 \ 2 | 2> /dev/null | wc -l -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/LookupRecordsByTemperature.java.output.txt: -------------------------------------------------------------------------------- 1 | 1489272 -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureByStationNameUsingDistributedCacheFile.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MaxTemperatureByStationNameUsingDistributedCacheFile \ 2 | -files input/ncdc/metadata/stations-fixed-width.txt input/ncdc/all output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureByStationNameUsingDistributedCacheFileApi.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureByStationNameUsingDistributedCacheFileApi \ 2 | -files input/ncdc/metadata/stations-fixed-width.txt input/ncdc/micro output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureUsingSecondarySort/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureUsingSecondarySort input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureUsingSecondarySort/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MaxTemperatureWithCounters input/ncdc/all output-counters 2 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters/input.txt: -------------------------------------------------------------------------------- 1 | hadoop MaxTemperatureWithCounters input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MaxTemperatureWithCounters/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1949 111 2 | 1950 22 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/MissingTemperatureFields.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar MissingTemperatureFields job_200904200610_0003 -------------------------------------------------------------------------------- 
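The MaxTemperatureWithCounters and MissingTemperatureFields commands above revolve around user-defined counters. The mapper-side pattern is sketched here with assumptions (the class name and the parser's isMalformedTemperature() method are assumed, not taken from this listing): an enum defines the counter names, and context.getCounter(...).increment(1) records each bad record. The enum constants mirror the MISSING and MALFORMED names in MaxTemperatureWithCounters_Temperature.properties further down this listing.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TemperatureCountersMapperSketch
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  enum Temperature {
    MISSING,
    MALFORMED
  }

  private NcdcRecordParser parser = new NcdcRecordParser();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    parser.parse(value);
    if (parser.isValidTemperature()) {
      context.write(new Text(parser.getYear()),
          new IntWritable(parser.getAirTemperature()));
    } else if (parser.isMalformedTemperature()) {      // assumed parser method
      context.getCounter(Temperature.MALFORMED).increment(1);
    } else {
      context.getCounter(Temperature.MISSING).increment(1);
    }
  }
}

Counter totals are printed when the job completes and can also be fetched afterwards by job ID, which is what the MissingTemperatureFields command above does.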
/ch09-mr-features/src/main/examples/SortByTemperatureToMapFile.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output-seq 2 | hadoop SortByTemperatureToMapFile output-seq output 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingHashPartitioner.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output-seq 2 | hadoop SortByTemperatureUsingHashPartitioner output-seq output 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingHashPartitioner.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortByTemperatureUsingHashPartitioner \ 2 | -D mapred.reduce.tasks=30 input/ncdc/all-seq output-hashsort 3 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortByTemperatureUsingTotalOrderPartitioner.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortByTemperatureUsingTotalOrderPartitioner \ 2 | -D mapred.reduce.tasks=30 input/ncdc/all-seq output-totalsort -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortDataPreprocessor.ignore/input.txt: -------------------------------------------------------------------------------- 1 | hadoop SortDataPreprocessor input/ncdc/micro output -------------------------------------------------------------------------------- /ch09-mr-features/src/main/examples/SortDataPreprocessor.java.input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar hadoop-examples.jar SortDataPreprocessor input/ncdc/all \ 2 | input/ncdc/all-seq -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinRecordMapper.java: -------------------------------------------------------------------------------- 1 | // cc JoinRecordMapper Mapper for tagging weather records for a reduce-side join 2 | import java.io.IOException; 3 | 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | //vv JoinRecordMapper 9 | public class JoinRecordMapper 10 | extends Mapper { 11 | private NcdcRecordParser parser = new NcdcRecordParser(); 12 | 13 | @Override 14 | protected void map(LongWritable key, Text value, Context context) 15 | throws IOException, InterruptedException { 16 | parser.parse(value); 17 | context.write(new TextPair(parser.getStationId(), "1"), value); 18 | } 19 | 20 | } 21 | //^^ JoinRecordMapper -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinReducer.java: -------------------------------------------------------------------------------- 1 | // cc JoinReducer Reducer for joining tagged station records with tagged weather records 2 | import java.io.IOException; 3 | import java.util.Iterator; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | // vv JoinReducer 9 | public class JoinReducer extends Reducer { 10 | 11 | @Override 12 | 
protected void reduce(TextPair key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | Iterator iter = values.iterator(); 15 | Text stationName = new Text(iter.next()); 16 | while (iter.hasNext()) { 17 | Text record = iter.next(); 18 | Text outValue = new Text(stationName.toString() + "\t" + record.toString()); 19 | context.write(key.getFirst(), outValue); 20 | } 21 | } 22 | } 23 | // ^^ JoinReducer -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/JoinStationMapper.java: -------------------------------------------------------------------------------- 1 | // cc JoinStationMapper Mapper for tagging station records for a reduce-side join 2 | import java.io.IOException; 3 | 4 | import org.apache.hadoop.io.*; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | // vv JoinStationMapper 8 | public class JoinStationMapper 9 | extends Mapper { 10 | private NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); 11 | 12 | @Override 13 | protected void map(LongWritable key, Text value, Context context) 14 | throws IOException, InterruptedException { 15 | if (parser.parse(value)) { 16 | context.write(new TextPair(parser.getStationId(), "0"), 17 | new Text(parser.getStationName())); 18 | } 19 | } 20 | } 21 | // ^^ JoinStationMapper -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinRecordMapper.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class JoinRecordMapper extends MapReduceBase 9 | implements Mapper { 10 | private NcdcRecordParser parser = new NcdcRecordParser(); 11 | 12 | public void map(LongWritable key, Text value, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | parser.parse(value); 17 | output.collect(new TextPair(parser.getStationId(), "1"), value); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinReducer.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.*; 8 | 9 | public class JoinReducer extends MapReduceBase implements 10 | Reducer { 11 | 12 | public void reduce(TextPair key, Iterator values, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | Text stationName = new Text(values.next()); 17 | while (values.hasNext()) { 18 | Text record = values.next(); 19 | Text outValue = new Text(stationName.toString() + "\t" + record.toString()); 20 | output.collect(key.getFirst(), outValue); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/java/oldapi/JoinStationMapper.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.*; 6 | import org.apache.hadoop.mapred.*; 7 | 8 | public class JoinStationMapper extends MapReduceBase 9 | implements Mapper { 10 | private NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); 11 | 12 | public void map(LongWritable key, 
Text value, 13 | OutputCollector output, Reporter reporter) 14 | throws IOException { 15 | 16 | if (parser.parse(value)) { 17 | output.collect(new TextPair(parser.getStationId(), "0"), 18 | new Text(parser.getStationName())); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/max_daily_temp_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (usaf, wban, date, temp, q) = (val[4:10], val[10:15], val[15:23], 9 | int(val[87:92]), val[92:93]) 10 | if (temp != 9999 and re.match("[01459]", q)): 11 | print "%s-%s\t%s\t%s" % (usaf, wban, date, temp) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/max_daily_temp_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, 0) 6 | for line in sys.stdin: 7 | (station, date, temp) = line.strip().split("\t") 8 | key = "%s\t%s" % (station, date) 9 | if last_key and last_key != key: 10 | print "%s\t%s" % (last_key, max_val) 11 | (last_key, max_val) = (key, int(temp)) 12 | else: 13 | (last_key, max_val) = (key, max(max_val, int(temp))) 14 | 15 | if last_key: 16 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp.sh: -------------------------------------------------------------------------------- 1 | STREAM="hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar -conf conf/hadoop-localhost.xml" 2 | 3 | $STREAM \ 4 | -D stream.num.map.output.key.fields=2 \ 5 | -files ch09-mr-features/src/main/python/max_daily_temp_map.py,\ 6 | ch09-mr-features/src/main/python/max_daily_temp_reduce.py \ 7 | -input input/ncdc/all \ 8 | -output out_max_daily \ 9 | -mapper ch09-mr-features/src/main/python/max_daily_temp_map.py \ 10 | -reducer ch09-mr-features/src/main/python/max_daily_temp_reduce.py 11 | 12 | $STREAM \ 13 | -D stream.num.map.output.key.fields=2 \ 14 | -files ch09-mr-features/src/main/python/mean_max_daily_temp_map.py,\ 15 | ch09-mr-features/src/main/python/mean_max_daily_temp_map.py \ 16 | -input out_max_daily \ 17 | -output out_mean_max_daily \ 18 | -mapper ch09-mr-features/src/main/python/mean_max_daily_temp_map.py \ 19 | -reducer ch09-mr-features/src/main/python/mean_max_daily_temp_reduce.py 20 | 21 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # Change date to month and day 6 | for line in sys.stdin: 7 | (station, date, temp) = line.strip().split("\t") 8 | print "%s\t%s\t%s" % (station, date[4:8], temp) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/mean_max_daily_temp_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, count, sum) = (None, 0, 0) 6 | for line in sys.stdin: 7 | (station, month_day, temp) = line.strip().split("\t") 8 | key = "%s\t%s" % (station, month_day) 9 | if last_key and last_key != key: 10 
| print "%s\t%s" % (last_key, sum / count) 11 | (last_key, count, sum) = (key, 1, int(temp)) 12 | else: 13 | (last_key, count, sum) = (key, count + 1, sum + int(temp)) 14 | 15 | if last_key: 16 | print "%s\t%s" % (last_key, sum / count) -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort.sh: -------------------------------------------------------------------------------- 1 | hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 2 | -D stream.num.map.output.key.fields=2 \ 3 | -D mapreduce.partition.keypartitioner.options=-k1,1 \ 4 | -D mapreduce.job.output.key.comparator.class=\ 5 | org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \ 6 | -D mapreduce.partition.keycomparator.options="-k1n -k2nr" \ 7 | -files secondary_sort_map.py,secondary_sort_reduce.py \ 8 | -input input/ncdc/all \ 9 | -output output-secondarysort-streaming \ 10 | -mapper ch09-mr-features/src/main/python/secondary_sort_map.py \ 11 | -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \ 12 | -reducer ch09-mr-features/src/main/python/secondary_sort_reduce.py 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp, q) = (val[15:19], int(val[87:92]), val[92:93]) 9 | if temp == 9999: 10 | sys.stderr.write("reporter:counter:Temperature,Missing,1\n") 11 | elif re.match("[01459]", q): 12 | print "%s\t%s" % (year, temp) 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/python/secondary_sort_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | last_group = None 6 | for line in sys.stdin: 7 | val = line.strip() 8 | (year, temp) = val.split("\t") 9 | group = year 10 | if last_group != group: 11 | print val 12 | last_group = group 13 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/fixed-partitions: -------------------------------------------------------------------------------- 1 | 0 124013605 2 | 1 151590303 3 | 2 191822960 4 | 3 675051684 5 | 6 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/sampled-partitions: -------------------------------------------------------------------------------- 1 | 0 331955753 2 | 1 276755563 3 | 2 263474844 4 | 3 270292395 5 | -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/temperature_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch09-mr-features/src/main/r/temperature_distribution.png -------------------------------------------------------------------------------- /ch09-mr-features/src/main/r/temperature_distribution.r: -------------------------------------------------------------------------------- 1 | png("temperature_distribution.png") 2 | data <- read.table("output_sorted") 3 | plot(data, xlab="Temperature", ylab="Number of readings") 4 | dev.off() 
-------------------------------------------------------------------------------- /ch09-mr-features/src/main/resources/MaxTemperatureWithCounters_Temperature.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Air Temperature Records 2 | MISSING.name=Missing 3 | MALFORMED.name=Malformed -------------------------------------------------------------------------------- /ch09-mr-features/src/main/resources/oldapi/MaxTemperatureWithCounters_Temperature.properties: -------------------------------------------------------------------------------- 1 | CounterGroupName=Air Temperature Records 2 | MISSING.name=Missing 3 | MALFORMED.name=Malformed -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- core-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>fs.defaultFS</name> 6 | <value>hdfs://namenode/</value> 7 | </property> 8 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- hdfs-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>dfs.namenode.name.dir</name> 6 | <value>/disk1/hdfs/name,/remote/hdfs/name</value> 7 | </property> 8 | 9 | <property> 10 | <name>dfs.datanode.data.dir</name> 11 | <value>/disk1/hdfs/data,/disk2/hdfs/data</value> 12 | </property> 13 | 14 | <property> 15 | <name>dfs.namenode.checkpoint.dir</name> 16 | <value>/disk1/hdfs/namesecondary,/disk2/hdfs/namesecondary</value> 17 | </property> 18 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/conf/yarn-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!-- yarn-site.xml --> 3 | <configuration> 4 | <property> 5 | <name>yarn.resourcemanager.hostname</name> 6 | <value>resourcemanager</value> 7 | </property> 8 | 9 | <property> 10 | <name>yarn.nodemanager.local-dirs</name> 11 | <value>/disk1/nm-local-dir,/disk2/nm-local-dir</value> 12 | </property> 13 | 14 | <property> 15 | <name>yarn.nodemanager.aux-services</name> 16 | <value>mapreduce_shuffle</value> 17 | </property> 18 | 19 | <property> 20 | <name>yarn.nodemanager.resource.memory-mb</name> 21 | <value>16384</value> 22 | </property> 23 | 24 | <property> 25 | <name>yarn.nodemanager.resource.cpu-vcores</name> 26 | <value>16</value> 27 | </property> 28 | </configuration> -------------------------------------------------------------------------------- /ch10-setup/src/main/sh/trash.sh: -------------------------------------------------------------------------------- 1 | hadoop fs -touchz quangle 2 | hadoop fs -rm quangle 3 | hadoop fs -lsr .Trash 4 | hadoop fs -mv .Trash/Current/quangle . 5 | hadoop fs -ls .
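trash.sh above only exercises the trash if it is enabled on the client, which happens when fs.trash.interval (in minutes) is set in core-site.xml. A short, hedged sketch of related commands (the interval value is an illustration, not taken from the repo):
# enable trash by adding fs.trash.interval to core-site.xml, e.g. 1440 for one day
hadoop fs -rm -skipTrash quangle   # delete immediately, bypassing the trash
hadoop fs -expunge                 # remove trash checkpoints older than the interval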
-------------------------------------------------------------------------------- /ch12-avro/src/main/c/dump_pairs.c: -------------------------------------------------------------------------------- 1 | #include <avro.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | 5 | int main(int argc, char *argv[]) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: dump_pairs <avrofile>\n"); 8 | exit(EXIT_FAILURE); 9 | } 10 | 11 | const char *avrofile = argv[1]; 12 | avro_schema_error_t error; 13 | avro_file_reader_t filereader; 14 | avro_datum_t pair; 15 | avro_datum_t left; 16 | avro_datum_t right; 17 | int rval; 18 | char *p; 19 | 20 | avro_file_reader(avrofile, &filereader); 21 | while (1) { 22 | rval = avro_file_reader_read(filereader, NULL, &pair); 23 | if (rval) break; 24 | if (avro_record_get(pair, "left", &left) == 0) { 25 | avro_string_get(left, &p); 26 | fprintf(stdout, "%s,", p); 27 | } 28 | if (avro_record_get(pair, "right", &right) == 0) { 29 | avro_string_get(right, &p); 30 | fprintf(stdout, "%s\n", p); 31 | } 32 | } 33 | avro_file_reader_close(filereader); 34 | return 0; 35 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar avro-examples.jar AvroGenericMaxTemperature \ 2 | input/ncdc/sample.txt output -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/.part-r-00000.avro.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/.part-r-00000.avro.crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/_SUCCESS -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroGenericMaxTemperature/output/part-r-00000.avro -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar avro-examples.jar AvroSort input/avro/pairs.avro output \ 2 | ch12-avro/src/main/resources/SortedStringPair.avsc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc --------------------------------------------------------------------------------
/ch12-avro/src/main/examples/AvroSort/output/.part-r-00000.avro.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/.part-r-00000.avro.crc -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/_SUCCESS -------------------------------------------------------------------------------- /ch12-avro/src/main/examples/AvroSort/output/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch12-avro/src/main/examples/AvroSort/output/part-r-00000.avro -------------------------------------------------------------------------------- /ch12-avro/src/main/py/write_pairs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import sys 4 | 5 | from avro import schema 6 | from avro import io 7 | from avro import datafile 8 | 9 | if __name__ == '__main__': 10 | if len(sys.argv) != 2: 11 | sys.exit('Usage: %s ' % sys.argv[0]) 12 | avro_file = sys.argv[1] 13 | writer = open(avro_file, 'wb') 14 | datum_writer = io.DatumWriter() 15 | schema_object = schema.parse("""\ 16 | { "type": "record", 17 | "name": "StringPair", 18 | "doc": "A pair of strings.", 19 | "fields": [ 20 | {"name": "left", "type": "string"}, 21 | {"name": "right", "type": "string"} 22 | ] 23 | }""") 24 | dfw = datafile.DataFileWriter(writer, datum_writer, schema_object) 25 | for line in sys.stdin.readlines(): 26 | (left, right) = string.split(line.strip(), ',') 27 | dfw.append({'left':left, 'right':right}); 28 | dfw.close() -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/AliasedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with aliased field names.", 5 | "fields": [ 6 | {"name": "first", "type": "string", "aliases": ["left"]}, 7 | {"name": "second", "type": "string", "aliases": ["right"]} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Array.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "array", 3 | "items": "long" 4 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Enum.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "enum", 3 | "name": "Cutlery", 4 | "doc": "An eating utensil.", 5 | "symbols": ["KNIFE", "FORK", "SPOON"] 6 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Fixed.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "fixed", 3 | "name": "Md5Hash", 4 | "size": 16 5 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Map.avsc: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "map", 3 | "values": "string" 4 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/NewStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": "string", "default": ""} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/NewStringPairWithNull.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added (nullable) field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": ["null", "string"], "default": null} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/ProjectedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "The right field of a pair of strings.", 5 | "fields": [ 6 | {"name": "right", "type": "string"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/SortedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings, sorted by right field descending.", 5 | "fields": [ 6 | {"name": "left", "type": "string", "order": "ignore"}, 7 | {"name": "right", "type": "string", "order": "descending"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/StringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/SwitchedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings, sorted by right then left.", 5 | "fields": [ 6 | {"name": "right", "type": "string"}, 7 | {"name": "left", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/Union.avsc: -------------------------------------------------------------------------------- 1 | [ 2 | "null", 3 | "string", 4 | {"type": "map", "values": "string"} 5 | ] -------------------------------------------------------------------------------- /ch12-avro/src/main/resources/WeatherRecord.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "WeatherRecord", 4 | "namespace": "specific", 5 | "doc": "A weather reading.", 6 | "fields": [ 7 | {"name": "year", "type": "int"}, 8 | {"name": 
"temperature", "type": "int"}, 9 | {"name": "stationId", "type": "string"} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/input.txt: -------------------------------------------------------------------------------- 1 | hadoop jar parquet-examples.jar TextToParquetWithAvro \ 2 | input/docs/quangle.txt output -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_SUCCESS -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/_metadata -------------------------------------------------------------------------------- /ch13-parquet/src/main/examples/TextToParquetWithAvro/output/part-m-00000.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch13-parquet/src/main/examples/TextToParquetWithAvro/output/part-m-00000.parquet -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/NewStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings with an added field.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"}, 8 | {"name": "description", "type": "string", "default": ""} 9 | ] 10 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/ProjectedStringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "The right field of a pair of strings.", 5 | "fields": [ 6 | {"name": "right", "type": "string"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/StringPair.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "StringPair", 4 | "doc": "A pair of strings.", 5 | "fields": [ 6 | {"name": "left", "type": "string"}, 7 | {"name": "right", "type": "string"} 8 | ] 9 | } -------------------------------------------------------------------------------- /ch13-parquet/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-and-logger.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1a sink1b 3 | agent1.channels = channel1a channel1b 4 | 5 | agent1.sources.source1.channels = channel1a channel1b 6 | 
agent1.sources.source1.selector.type = replicating 7 | agent1.sources.source1.selector.optional = channel1b 8 | agent1.sinks.sink1a.channel = channel1a 9 | agent1.sinks.sink1b.channel = channel1b 10 | 11 | agent1.sources.source1.type = spooldir 12 | agent1.sources.source1.spoolDir = /tmp/spooldir 13 | 14 | agent1.sinks.sink1a.type = hdfs 15 | agent1.sinks.sink1a.hdfs.path = /tmp/flume 16 | agent1.sinks.sink1a.hdfs.filePrefix = events 17 | agent1.sinks.sink1a.hdfs.fileSuffix = .log 18 | agent1.sinks.sink1a.hdfs.fileType = DataStream 19 | 20 | agent1.sinks.sink1b.type = logger 21 | 22 | agent1.channels.channel1a.type = file 23 | agent1.channels.channel1b.type = memory 24 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-avro.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = hdfs 12 | agent1.sinks.sink1.hdfs.path = /tmp/flume 13 | agent1.sinks.sink1.hdfs.filePrefix = events 14 | agent1.sinks.sink1.hdfs.fileSuffix = .avro 15 | agent1.sinks.sink1.hdfs.fileType = DataStream 16 | agent1.sinks.sink1.serializer = avro_event 17 | agent1.sinks.sink1.serializer.compressionCodec = snappy 18 | 19 | agent1.channels.channel1.type = file 20 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs-partitioned.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | agent1.sources.source1.interceptors = interceptor1 11 | agent1.sources.source1.interceptors.interceptor1.type = timestamp 12 | 13 | agent1.sinks.sink1.type = hdfs 14 | agent1.sinks.sink1.hdfs.path = /tmp/flume/year=%Y/month=%m/day=%d 15 | agent1.sinks.sink1.hdfs.filePrefix = events 16 | agent1.sinks.sink1.hdfs.fileSuffix = .log 17 | agent1.sinks.sink1.hdfs.fileType = DataStream 18 | 19 | agent1.channels.channel1.type = file 20 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-hdfs.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = hdfs 12 | agent1.sinks.sink1.hdfs.path = /tmp/flume 13 | agent1.sinks.sink1.hdfs.filePrefix = events 14 | agent1.sinks.sink1.hdfs.fileSuffix = .log 15 | agent1.sinks.sink1.hdfs.inUsePrefix = _ 16 | agent1.sinks.sink1.hdfs.fileType = DataStream 17 | 18 | agent1.channels.channel1.type = file 19 | -------------------------------------------------------------------------------- /ch14-flume/spool-to-logger.properties: -------------------------------------------------------------------------------- 1 | agent1.sources = source1 2 | agent1.sinks = sink1 3 | 
agent1.channels = channel1 4 | 5 | agent1.sources.source1.channels = channel1 6 | agent1.sinks.sink1.channel = channel1 7 | 8 | agent1.sources.source1.type = spooldir 9 | agent1.sources.source1.spoolDir = /tmp/spooldir 10 | 11 | agent1.sinks.sink1.type = logger 12 | 13 | agent1.channels.channel1.type = file 14 | -------------------------------------------------------------------------------- /ch15-sqoop/widgets/part-m-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch15-sqoop/widgets/part-m-00000.avro -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/combine.grunt: -------------------------------------------------------------------------------- 1 | -- == combine_union 2 | -- == combine_schema 3 | A = LOAD 'input/pig/combine/A' AS (f0:int, f1:int); 4 | B = LOAD 'input/pig/combine/B' AS (f0:chararray, f1:chararray, f2:int); 5 | -- vv combine_union 6 | DUMP A; 7 | DUMP B; 8 | C = UNION A, B; 9 | DUMP C; 10 | -- ^^ combine_union 11 | -- vv combine_schema 12 | DESCRIBE A; 13 | DESCRIBE B; 14 | DESCRIBE C; 15 | -- ^^ combine_schema 16 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/disambiguate.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/join/A' AS (id:int, name:chararray); 2 | B = LOAD 'input/pig/join/B' AS (name:chararray, id:int); 3 | C = JOIN A by id, B by id; 4 | D = FOREACH C GENERATE A::name; 5 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/flatten.grunt: -------------------------------------------------------------------------------- 1 | -- a demonstration of the different effects of FLATTEN 2 | 3 | C = LOAD 'input/pig/types/C' 4 | AS (f0:chararray, f1:chararray); 5 | 6 | D = FOREACH C GENERATE TOTUPLE(TOTUPLE(f0), TOTUPLE(f1)); 7 | -- D is 8 | -- (((a),(pomegranate))) 9 | -- (((b),(apple))) 10 | DUMP D 11 | 12 | F = FOREACH D GENERATE FLATTEN($0); 13 | -- F is 14 | -- ((a),(pomegranate)) 15 | -- ((b),(apple)) 16 | -- One level of nesting removed 17 | DUMP F 18 | 19 | 20 | B = FOREACH C GENERATE TOBAG(f0, f1); 21 | -- B is 22 | -- ({(a),(pomegranate)}) 23 | -- ({(b),(apple)}) 24 | DUMP B 25 | 26 | F = FOREACH B GENERATE FLATTEN($0); 27 | -- F is 28 | -- (a) 29 | -- (pomegranate) 30 | -- (b) 31 | -- (apple) 32 | -- Tuples in bags are turned into tuples 33 | DUMP F 34 | 35 | B = FOREACH C GENERATE f0, TOBAG(f1, f1); 36 | -- B is 37 | -- (a,{(pomegranate),(pomegranate)}) 38 | -- (b,{(apple),(apple)}) 39 | DUMP B 40 | 41 | F = FOREACH B GENERATE $0, FLATTEN($1); 42 | -- F is 43 | -- (a,pomegranate) 44 | -- (a,pomegranate) 45 | -- (b,apple) 46 | -- (b,apple) 47 | -- Tuples in bags can be added to elements at the top level 48 | DUMP F -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/foreach.grunt: -------------------------------------------------------------------------------- 1 | -- == foreach_generate 2 | A = LOAD 'input/pig/foreach/A' 3 | AS (f0:chararray, f1:chararray, f2:int); 4 | -- vv foreach_generate 5 | DUMP A; 6 | B = FOREACH A GENERATE $0, $2+1, 'Constant'; 7 | DUMP B; 8 | -- ^^ foreach_generate 9 | DESCRIBE B; 10 | C = FOREACH A GENERATE $0, (int) $2 AS f1, 'Constant' AS f2; 11 | DUMP C; 12 | DESCRIBE C; 13 | 14 | -- C = FOREACH A GENERATE $0, 
(int) $2 AS f1, 'Constant' AS f2, ($2 > 3 ? 1 : 0); 15 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/group.grunt: -------------------------------------------------------------------------------- 1 | -- == group_dump 2 | -- == group_expression 3 | -- == group_all 4 | A = LOAD 'input/pig/group/A'; 5 | -- vv group_dump 6 | DUMP A; 7 | -- ^^ group_dump 8 | -- vv group_expression 9 | B = GROUP A BY SIZE($1); 10 | DUMP B; 11 | -- ^^ group_expression 12 | -- vv group_all 13 | C = GROUP A ALL; 14 | DUMP C; 15 | -- ^^ group_all 16 | D = FOREACH C GENERATE COUNT(A); 17 | DUMP D; 18 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/missing.grunt: -------------------------------------------------------------------------------- 1 | -- == missing_fields 2 | 3 | -- vv missing_fields 4 | A = LOAD 'input/pig/corrupt/missing_fields'; 5 | DUMP A; 6 | B = FILTER A BY SIZE(TOTUPLE(*)) > 1; 7 | DUMP B; 8 | -- ^^ missing_fields -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/multiquery.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/multiquery/A'; 2 | B = FILTER A BY $1 == 'banana'; 3 | C = FILTER A BY $1 != 'banana'; 4 | STORE B INTO 'output/b'; 5 | STORE C INTO 'output/c'; -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/set.grunt: -------------------------------------------------------------------------------- 1 | -- == set_debug_on 2 | -- vv set_debug_on 3 | set debug on 4 | -- ^^ set_debug_on 5 | -- == set_default_parallel 6 | -- vv set_default_parallel 7 | set default_parallel 30 8 | -- ^^ set_default_parallel 9 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/sort.grunt: -------------------------------------------------------------------------------- 1 | -- == sort_dump 2 | -- == sort_order 3 | -- == sort_no_order 4 | -- == sort_limit 5 | A = LOAD 'input/pig/sort/A'; 6 | -- vv sort_dump 7 | DUMP A; 8 | -- ^^ sort_dump 9 | -- vv sort_order 10 | B = ORDER A BY $0, $1 DESC; 11 | DUMP B; 12 | -- ^^ sort_order 13 | -- vv sort_no_order 14 | C = FOREACH B GENERATE *; 15 | -- ^^ sort_no_order 16 | DUMP C; 17 | -- vv sort_limit 18 | D = LIMIT B 2; 19 | DUMP D; 20 | -- ^^ sort_limit -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/store.grunt: -------------------------------------------------------------------------------- 1 | -- == store_colon_delimited 2 | A = LOAD 'input/pig/foreach/A'; 3 | -- vv store_colon_delimited 4 | STORE A INTO 'out' USING PigStorage(':'); 5 | cat out 6 | -- ^^ store_colon_delimited 7 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/stream.grunt: -------------------------------------------------------------------------------- 1 | -- == stream_cut 2 | A = LOAD 'input/pig/foreach/A' 3 | AS (f0:chararray, f1:chararray, f2:int); 4 | -- vv stream_cut 5 | C = STREAM A THROUGH `cut -f 2`; 6 | DUMP C; 7 | -- ^^ stream_cut 8 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/tuples.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/types/A' 2 | AS (f0, t0:tuple(f1:int, f2:chararray, t1:tuple(f3:int, f4:chararray))); 
3 | DUMP A; 4 | -------------------------------------------------------------------------------- /ch16-pig/src/main/grunt/types.grunt: -------------------------------------------------------------------------------- 1 | A = LOAD 'input/pig/tuples/A' 2 | AS (t0:tuple(f0:int, f2:chararray)); 3 | DUMP A; 4 | DESCRIBE A; 5 | one = LOAD 'input/pig/types/one'; 6 | B = FOREACH one GENERATE (1,'pomegranate') 7 | AS t0:tuple(f0:int, f2:chararray); 8 | DUMP B; 9 | DESCRIBE B; 10 | C = FOREACH one GENERATE ['a'#'pomegranate'] 11 | AS t0:map[]; 12 | DUMP C; 13 | DESCRIBE C; 14 | 15 | C = LOAD 'input/pig/types/C' 16 | AS (f0:chararray, f1:chararray); 17 | D = FOREACH C GENERATE TOTUPLE(f0, f1); 18 | DUMP D; 19 | D = FOREACH C GENERATE (f0, f1); 20 | DUMP D; 21 | E = FOREACH C GENERATE TOBAG(f0, f1); 22 | DUMP E; 23 | E = FOREACH C GENERATE {f0, f1}; 24 | DUMP E; 25 | F = FOREACH C GENERATE TOMAP(f0, f1); 26 | DUMP F; 27 | F = FOREACH C GENERATE [f0, f1]; 28 | DUMP F; 29 | 30 | G = FOREACH one GENERATE true AS f0:boolean, 1 as f1:int, 1L as f2:long, 31 | 1.0F as f3:float, 1.0 as f4:double, '10000000000' as f5:biginteger, 32 | '0.110001000000000000000001' as f6:bigdecimal, 'a' as f7:chararray, 33 | ToDate('2012-01-02T03:04:05.678Z') as f8:datetime; 34 | DUMP G; 35 | DESCRIBE G; 36 | -------------------------------------------------------------------------------- /ch16-pig/src/main/java/com/hadoopbook/pig/Trim.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.pig; 2 | 3 | import org.apache.pig.PrimitiveEvalFunc; 4 | 5 | //cc Trim An EvalFunc UDF to trim leading and trailing whitespace from chararray values 6 | //vv Trim 7 | public class Trim extends PrimitiveEvalFunc { 8 | @Override 9 | public String exec(String input) { 10 | return input.trim(); 11 | } 12 | } 13 | // ^^ Trim -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/comment_c-style.pig: -------------------------------------------------------------------------------- 1 | /* 2 | * Description of my program spanning 3 | * multiple lines. 4 | */ 5 | A = LOAD 'input/pig/join/A'; 6 | B = LOAD 'input/pig/join/B'; 7 | C = JOIN A BY $0, /* ignored */ B BY $1; 8 | DUMP C; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/comment_single_line.pig: -------------------------------------------------------------------------------- 1 | -- My program 2 | DUMP A; -- What's in A? 
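The Trim UDF above is easiest to try from a local Grunt session. A minimal sketch, assuming the UDF has been packaged into pig-examples.jar in the working directory (the same jar name the later scripts REGISTER) and reusing the input/pig/types/C data referenced by the grunt scripts above:
pig -x local <<'EOF'
REGISTER pig-examples.jar;
C = LOAD 'input/pig/types/C' AS (f0:chararray, f1:chararray);
T = FOREACH C GENERATE com.hadoopbook.pig.Trim(f1);
DUMP T;
EOF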
-------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp.macro: -------------------------------------------------------------------------------- 1 | DEFINE max_by_group(X, group_key, max_field) RETURNS Y { 2 | A = GROUP $X by $group_key; 3 | $Y = FOREACH A GENERATE group, MAX($X.$max_field); 4 | }; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp.pig: -------------------------------------------------------------------------------- 1 | -- max_temp.pig: Finds the maximum temperature by year 2 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 3 | AS (year:chararray, temperature:int, quality:int); 4 | filtered_records = FILTER records BY temperature != 9999 AND 5 | quality IN (0, 1, 4, 5, 9); 6 | grouped_records = GROUP filtered_records BY year; 7 | max_temp = FOREACH grouped_records GENERATE group, 8 | MAX(filtered_records.temperature); 9 | DUMP max_temp; 10 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_filter_stream.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_filter_stream.pig 2 | DEFINE is_good_quality `is_good_quality.py` 3 | SHIP ('ch16-pig/src/main/python/is_good_quality.py'); 4 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 5 | AS (year:chararray, temperature:int, quality:int); 6 | filtered_records = STREAM records THROUGH is_good_quality 7 | AS (year:chararray, temperature:int); 8 | grouped_records = GROUP filtered_records BY year; 9 | max_temp = FOREACH grouped_records GENERATE group, 10 | MAX(filtered_records.temperature); 11 | DUMP max_temp; 12 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_filter_udf.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_filter_udf.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 5 | AS (year:chararray, temperature:int, quality:int); 6 | filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); 7 | grouped_records = GROUP filtered_records BY year; 8 | max_temp = FOREACH grouped_records GENERATE group, 9 | MAX(filtered_records.temperature); 10 | DUMP max_temp; 11 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_macro.pig: -------------------------------------------------------------------------------- 1 | DEFINE max_by_group(X, group_key, max_field) RETURNS Y { 2 | A = GROUP $X by $group_key; 3 | $Y = FOREACH A GENERATE group, MAX($X.$max_field); 4 | }; 5 | 6 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 7 | AS (year:chararray, temperature:int, quality:int); 8 | filtered_records = FILTER records BY temperature != 9999 AND 9 | quality IN (0, 1, 4, 5, 9); 10 | max_temp = max_by_group(filtered_records, year, temperature); 11 | DUMP max_temp -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_macro_import.pig: -------------------------------------------------------------------------------- 1 | IMPORT './ch16-pig/src/main/pig/max_temp.macro'; 2 | records = LOAD 'input/ncdc/micro-tab/sample.txt' 3 | AS (year:chararray, temperature:int, quality:int); 4 | filtered_records = FILTER records BY temperature != 9999 AND 5 | quality IN (0, 1, 4, 5, 9); 6 | 
max_temp = max_by_group(filtered_records, year, temperature); 7 | DUMP max_temp -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_param.param: -------------------------------------------------------------------------------- 1 | # Input file 2 | input=/user/tom/input/ncdc/micro-tab/sample.txt 3 | # Output file 4 | output=/tmp/out -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_param.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_param.pig 2 | records = LOAD '$input' AS (year:chararray, temperature:int, quality:int); 3 | filtered_records = FILTER records BY temperature != 9999 AND 4 | quality IN (0, 1, 4, 5, 9); 5 | grouped_records = GROUP filtered_records BY year; 6 | max_temp = FOREACH grouped_records GENERATE group, 7 | MAX(filtered_records.temperature); 8 | STORE max_temp into '$output'; 9 | -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/max_temp_station_name.pig: -------------------------------------------------------------------------------- 1 | -- max_temp_station_name.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | 5 | stations = LOAD 'input/ncdc/metadata/stations-fixed-width.txt' 6 | USING com.hadoopbook.pig.CutLoadFunc('1-6,8-12,14-42') 7 | AS (usaf:chararray, wban:chararray, name:chararray); 8 | 9 | trimmed_stations = FOREACH stations GENERATE usaf, wban, TRIM(name); 10 | 11 | records = LOAD 'input/ncdc/all/191*' 12 | USING com.hadoopbook.pig.CutLoadFunc('5-10,11-15,88-92,93-93') 13 | AS (usaf:chararray, wban:chararray, temperature:int, quality:int); 14 | 15 | filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); 16 | grouped_records = GROUP filtered_records BY (usaf, wban) PARALLEL 30; 17 | max_temp = FOREACH grouped_records GENERATE FLATTEN(group), 18 | MAX(filtered_records.temperature); 19 | max_temp_named = JOIN max_temp BY (usaf, wban), trimmed_stations BY (usaf, wban) 20 | PARALLEL 30; 21 | max_temp_result = FOREACH max_temp_named GENERATE $0, $1, $5, $2; 22 | 23 | STORE max_temp_result INTO 'max_temp_by_station'; -------------------------------------------------------------------------------- /ch16-pig/src/main/pig/year_stats.pig: -------------------------------------------------------------------------------- 1 | -- year_stats.pig 2 | REGISTER pig-examples.jar; 3 | DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 4 | records = LOAD 'input/ncdc/all/19{1,2,3,4,5}0*' 5 | USING com.hadoopbook.pig.CutLoadFunc('5-10,11-15,16-19,88-92,93-93') 6 | AS (usaf:chararray, wban:chararray, year:int, temperature:int, quality:int); 7 | 8 | grouped_records = GROUP records BY year PARALLEL 30; 9 | 10 | year_stats = FOREACH grouped_records { 11 | uniq_stations = DISTINCT records.usaf; 12 | good_records = FILTER records BY isGood(quality); 13 | GENERATE FLATTEN(group), COUNT(uniq_stations) AS station_count, 14 | COUNT(good_records) AS good_record_count, COUNT(records) AS record_count; 15 | } 16 | 17 | DUMP year_stats; 18 | 19 | -------------------------------------------------------------------------------- /ch16-pig/src/main/python/is_good_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | (year, temp, q) = line.strip().split() 8 | if (temp != "9999" and 
re.match("[01459]", q)): 9 | print "%s\t%s" % (year, temp) 10 | -------------------------------------------------------------------------------- /ch16-pig/src/test/java/com/hadoopbook/pig/RangeTest.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.pig; 2 | 3 | import static org.hamcrest.CoreMatchers.is; 4 | import static org.junit.Assert.assertThat; 5 | 6 | import java.util.List; 7 | 8 | import org.junit.*; 9 | 10 | public class RangeTest { 11 | 12 | @Test 13 | public void parsesEmptyRangeSpec() { 14 | assertThat(Range.parse("").size(), is(0)); 15 | } 16 | 17 | @Test 18 | public void parsesSingleRangeSpec() { 19 | List ranges = Range.parse("1-3"); 20 | assertThat(ranges.size(), is(1)); 21 | assertThat(ranges.get(0), is(new Range(1, 3))); 22 | } 23 | 24 | @Test 25 | public void parsesMultipleRangeSpec() { 26 | List ranges = Range.parse("1-3,5-10"); 27 | assertThat(ranges.size(), is(2)); 28 | assertThat(ranges.get(0), is(new Range(1, 3))); 29 | assertThat(ranges.get(1), is(new Range(5, 10))); 30 | } 31 | 32 | @Test(expected = IllegalArgumentException.class) 33 | public void failsOnInvalidSpec() { 34 | Range.parse("1-n"); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/conversions.hive: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS dummy; 2 | CREATE TABLE dummy (value STRING); 3 | LOAD DATA LOCAL INPATH 'input/hive/dummy.txt' 4 | OVERWRITE INTO TABLE dummy; 5 | 6 | SELECT CAST('X' AS INT) from dummy; 7 | 8 | SELECT 2 + TRUE FROM dummy; 9 | 10 | SELECT 2 + CAST(TRUE AS INT) FROM dummy; 11 | 12 | SELECT 2 + '2' FROM dummy; 13 | 14 | SELECT concat('Truth: ', TRUE) FROM simple; 15 | 16 | DROP TABLE IF EXISTS simple; 17 | CREATE TABLE simple ( 18 | col1 TIMESTAMP 19 | ); 20 | 21 | INSERT OVERWRITE TABLE simple 22 | SELECT '2012-01-02 03:04:05.123456789' FROM dummy; 23 | 24 | SELECT 2 + col1 FROM simple; 25 | 26 | SELECT 2L + col1 FROM simple; 27 | 28 | SELECT 2.0 + col1 FROM simple; 29 | 30 | SELECT 2 + CAST(col1 AS BIGINT) FROM simple; 31 | 32 | SELECT concat('Date: ', col1) FROM simple; 33 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/indexes.hive: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS users_extended; 2 | 3 | CREATE TABLE users_extended (id INT, name STRING, gender STRING); 4 | 5 | LOAD DATA LOCAL INPATH 'input/hive/tables/users_extended.txt' 6 | OVERWRITE INTO TABLE users_extended; 7 | 8 | DROP INDEX IF EXISTS users_index; 9 | 10 | CREATE INDEX users_index 11 | ON TABLE users_extended (gender) 12 | AS 'BITMAP' WITH DEFERRED REBUILD; 13 | ALTER INDEX users_index ON users_extended REBUILD; 14 | 15 | SELECT * FROM users_extended WHERE gender = 'F'; 16 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/max_temp.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == max_temp_select; 2 | 3 | DROP TABLE IF EXISTS records; 4 | 5 | CREATE TABLE records (year STRING, temperature INT, quality INT) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t'; 8 | 9 | LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample.txt' 10 | OVERWRITE INTO TABLE records; 11 | 12 | ! 
echo; # vv max_temp_select; 13 | SELECT year, MAX(temperature) 14 | FROM records 15 | WHERE temperature != 9999 AND quality IN (0, 1, 4, 5, 9) 16 | GROUP BY year; 17 | ! echo; # ^^ max_temp_select; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/regex_serde.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == select_stations; 2 | 3 | DROP TABLE IF EXISTS stations; 4 | 5 | CREATE TABLE stations (usaf STRING, wban STRING, name STRING) 6 | ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' 7 | WITH SERDEPROPERTIES ( 8 | "input.regex" = "(\\d{6}) (\\d{5}) (.{29}) .*" 9 | ); 10 | 11 | LOAD DATA LOCAL INPATH "input/ncdc/metadata/stations-fixed-width.txt" 12 | INTO TABLE stations; 13 | 14 | ! echo; # vv select_stations; 15 | SELECT * FROM stations LIMIT 4; 16 | ! echo; # ^^ select_stations; 17 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/set.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == define_function; 2 | ! echo; # == set_value; 3 | ! echo; # == set_show_value; 4 | 5 | ! echo; # vv define_function; 6 | DESCRIBE FUNCTION length; 7 | ! echo; # ^^ define_function; 8 | 9 | ! echo; # vv set_value; 10 | SET hive.enforce.bucketing=true; 11 | ! echo; # ^^ set_value; 12 | 13 | ! echo; # vv set_show_value; 14 | SET hive.enforce.bucketing; 15 | ! echo; # ^^ set_show_value; 16 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/sort.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == sort_by_year; 2 | 3 | DROP TABLE IF EXISTS records2; 4 | 5 | CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t'; 8 | 9 | LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt' 10 | OVERWRITE INTO TABLE records2; 11 | 12 | ! echo; # vv sort_by_year; 13 | FROM records2 14 | SELECT year, temperature 15 | DISTRIBUTE BY year 16 | SORT BY year ASC, temperature DESC; 17 | ! echo; # ^^ sort_by_year; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/hive/types.hive: -------------------------------------------------------------------------------- 1 | ! echo; # == complex_types; 2 | 3 | DROP TABLE IF EXISTS complex; 4 | 5 | CREATE TABLE complex ( 6 | c1 ARRAY<INT>, 7 | c2 MAP<STRING, INT>, 8 | c3 STRUCT<a:STRING, b:INT, c:DOUBLE>, 9 | c4 UNIONTYPE<STRING, INT> 10 | ); 11 | 12 | LOAD DATA LOCAL INPATH 'input/hive/types/complex.txt' 13 | OVERWRITE INTO TABLE complex; 14 | 15 | ! echo; # vv complex_types; 16 | SELECT c1[0], c2['b'], c3.c, c4 FROM complex; 17 | !
echo; # ^^ complex_types; 18 | -------------------------------------------------------------------------------- /ch17-hive/src/main/java/com/hadoopbook/hive/Strip.java: -------------------------------------------------------------------------------- 1 | package com.hadoopbook.hive; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.Text; 6 | 7 | public class Strip extends UDF { 8 | private Text result = new Text(); 9 | 10 | public Text evaluate(Text str) { 11 | if (str == null) { 12 | return null; 13 | } 14 | result.set(StringUtils.strip(str.toString())); 15 | return result; 16 | } 17 | 18 | public Text evaluate(Text str, String stripChars) { 19 | if (str == null) { 20 | return null; 21 | } 22 | result.set(StringUtils.strip(str.toString(), stripChars)); 23 | return result; 24 | } 25 | } -------------------------------------------------------------------------------- /ch17-hive/src/main/python/is_good_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | for line in sys.stdin: 7 | (year, temp, q) = line.strip().split() 8 | if (temp != "9999" and re.match("[01459]", q)): 9 | print "%s\t%s" % (year, temp) 10 | -------------------------------------------------------------------------------- /ch17-hive/src/main/python/max_temperature_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | (last_key, max_val) = (None, 0) 6 | for line in sys.stdin: 7 | (key, val) = line.strip().split("\t") 8 | if last_key and last_key != key: 9 | print "%s\t%s" % (last_key, max_val) 10 | (last_key, max_val) = (key, int(val)) 11 | else: 12 | (last_key, max_val) = (key, max(max_val, int(val))) 13 | 14 | if last_key: 15 | print "%s\t%s" % (last_key, max_val) -------------------------------------------------------------------------------- /ch18-crunch/src/main/assembly/hadoop-job.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | job 8 | 9 | jar 10 | 11 | false 12 | 13 | 14 | false 15 | runtime 16 | lib 17 | 18 | ${groupId}:${artifactId} 19 | 20 | 21 | 22 | true 23 | 24 | ${groupId}:${artifactId} 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /ch18-crunch/src/main/java/crunch/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.io.Serializable; 4 | import org.apache.hadoop.io.Text; 5 | 6 | // Serializable copy of NcdcStationMetadataParser 7 | public class NcdcStationMetadataParser implements Serializable { 8 | 9 | private String stationId; 10 | private String stationName; 11 | 12 | public boolean parse(String record) { 13 | if (record.length() < 42) { // header 14 | return false; 15 | } 16 | String usaf = record.substring(0, 6); 17 | String wban = record.substring(7, 12); 18 | stationId = usaf + "-" + wban; 19 | stationName = record.substring(13, 42); 20 | try { 21 | Integer.parseInt(usaf); // USAF identifiers are numeric 22 | return true; 23 | } catch (NumberFormatException e) { 24 | return false; 25 | } 26 | } 27 | 28 | public boolean parse(Text record) { 29 | return parse(record.toString()); 30 | } 31 | 32 | public String getStationId() { 33 | return stationId; 34 | } 35 | 36 | public String getStationName() { 37 | return stationName; 
38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/CountValuesFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.util.Iterator; 4 | import org.apache.crunch.MapFn; 5 | 6 | public class CountValuesFn extends MapFn, Integer> { 7 | @Override 8 | public Integer map(Iterable input) { 9 | int count = 0; 10 | for (Iterator i = input.iterator(); i.hasNext(); ) { 11 | i.next(); 12 | count++; 13 | } 14 | return count; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/CustomDoFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | 6 | public class CustomDoFn extends DoFn { 7 | 8 | static class NonSerializableHelper { } 9 | 10 | transient NonSerializableHelper helper; 11 | 12 | @Override 13 | public void initialize() { 14 | helper = new NonSerializableHelper(); 15 | } 16 | 17 | @SuppressWarnings("unchecked") 18 | @Override 19 | public void process(S input, Emitter emitter) { 20 | // use helper here 21 | emitter.emit((T) input); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/InversePairFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | import org.apache.crunch.Pair; 6 | 7 | public class InversePairFn extends DoFn, Pair> { 8 | @Override 9 | public void process(Pair input, Emitter> emitter) { 10 | emitter.emit(Pair.of(input.second(), input.first())); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/PipelineDebugTest.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.PCollection; 4 | import org.apache.crunch.Pipeline; 5 | import org.apache.crunch.impl.mr.MRPipeline; 6 | import org.apache.crunch.test.TemporaryPath; 7 | import org.junit.Rule; 8 | import org.junit.Test; 9 | 10 | public class PipelineDebugTest { 11 | @Rule 12 | public transient TemporaryPath tmpDir = new TemporaryPath(); 13 | 14 | @Test 15 | public void testDebug() throws Exception { 16 | String inputPath = tmpDir.copyResourceFileName("set1.txt"); 17 | Pipeline pipeline = new MRPipeline(getClass()); 18 | pipeline.enableDebug(); 19 | pipeline.getConfiguration().setBoolean("crunch.log.job.progress", true); 20 | PCollection lines = pipeline.readTextFile(inputPath); 21 | pipeline.writeTextFile(lines, tmpDir.getFileName("out")); 22 | pipeline.done(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/SerializableFunctionsTest.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import java.io.IOException; 4 | import org.apache.crunch.PCollection; 5 | import org.apache.crunch.Pipeline; 6 | import org.apache.crunch.impl.mr.MRPipeline; 7 | import org.apache.crunch.test.TemporaryPath; 8 | import org.junit.Rule; 9 | import org.junit.Test; 10 | 11 | import static org.apache.crunch.types.avro.Avros.strings; 12 | import static 
org.junit.Assert.assertEquals; 13 | 14 | public class SerializableFunctionsTest { 15 | 16 | @Rule 17 | public transient TemporaryPath tmpDir = new TemporaryPath(); 18 | 19 | @Test 20 | public void testInitialize() throws IOException { 21 | String inputPath = tmpDir.copyResourceFileName("set1.txt"); 22 | Pipeline pipeline = new MRPipeline(getClass()); 23 | PCollection lines = pipeline.readTextFile(inputPath); 24 | long len = lines.parallelDo(new CustomDoFn(), strings()) 25 | .length().getValue(); 26 | assertEquals(4, len); 27 | pipeline.done(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/java/crunch/ToLowerFn.java: -------------------------------------------------------------------------------- 1 | package crunch; 2 | 3 | import org.apache.crunch.DoFn; 4 | import org.apache.crunch.Emitter; 5 | 6 | public class ToLowerFn extends DoFn { 7 | @Override 8 | public void process(String input, Emitter emitter) { 9 | emitter.emit(input.toLowerCase()); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/B: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 6 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/ints.txt: -------------------------------------------------------------------------------- 1 | 2 2 | 3 3 | 1 4 | 3 -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/numbers.seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch18-crunch/src/test/resources/numbers.seq -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/set1.txt: -------------------------------------------------------------------------------- 1 | b 2 | c 3 | a 4 | e -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/set2.txt: -------------------------------------------------------------------------------- 1 
| b 2 | c 3 | a 4 | e 5 | b 6 | -------------------------------------------------------------------------------- /ch18-crunch/src/test/resources/urls.txt: -------------------------------------------------------------------------------- 1 | www.A.com www.B.com 2 | www.A.com www.C.com 3 | www.A.com www.D.com 4 | www.A.com www.E.com 5 | www.B.com www.D.com 6 | www.B.com www.E.com 7 | www.C.com www.D.com 8 | www.D.com www.B.com 9 | www.E.com www.A.com 10 | www.F.com www.B.com 11 | www.F.com www.C.com 12 | -------------------------------------------------------------------------------- /ch19-spark/src/main/python/MaxTemperature.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | import re, sys 3 | 4 | sc = SparkContext("local", "Max Temperature") 5 | sc.textFile(sys.argv[1]) \ 6 | .map(lambda s: s.split("\t")) \ 7 | .filter(lambda rec: (rec[1] != "9999" and re.match("[01459]", rec[2]))) \ 8 | .map(lambda rec: (int(rec[0]), int(rec[1]))) \ 9 | .reduceByKey(max) \ 10 | .saveAsTextFile(sys.argv[2]) 11 | -------------------------------------------------------------------------------- /ch19-spark/src/main/scala/MaxTemperature.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext._ 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | 4 | object MaxTemperature { 5 | def main(args: Array[String]) { 6 | val conf = new SparkConf().setAppName("Max Temperature") 7 | val sc = new SparkContext(conf) 8 | 9 | sc.textFile(args(0)) 10 | .map(_.split("\t")) 11 | .filter(rec => (rec(1) != "9999" && rec(2).matches("[01459]"))) 12 | .map(rec => (rec(0).toInt, rec(1).toInt)) 13 | .reduceByKey((a, b) => Math.max(a, b)) 14 | .saveAsTextFile(args(1)) 15 | } 16 | } -------------------------------------------------------------------------------- /ch19-spark/src/main/scala/MaxTemperatureWithPlacement.scala: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration 2 | import org.apache.hadoop.mapred.TextInputFormat 3 | import org.apache.spark.SparkContext._ 4 | import org.apache.spark.scheduler.InputFormatInfo 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object MaxTemperatureWithPlacement { 8 | def main(args: Array[String]) { 9 | val inputPath = args(0) 10 | val conf = new SparkConf().setAppName("Max Temperature") 11 | val preferredLocations = InputFormatInfo.computePreferredLocations( 12 | Seq(new InputFormatInfo(new Configuration(), classOf[TextInputFormat], 13 | inputPath))) 14 | val sc = new SparkContext(conf, preferredLocations) 15 | 16 | sc.textFile(args(0)) 17 | .map(_.split("\t")) 18 | .filter(rec => (rec(1) != "9999" && rec(2).matches("[01459]"))) 19 | .map(rec => (rec(0).toInt, rec(1).toInt)) 20 | .reduceByKey((a, b) => Math.max(a, b)) 21 | .saveAsTextFile(args(1)) 22 | } 23 | } -------------------------------------------------------------------------------- /ch19-spark/src/test/avro/IntWrapper.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "example.IntWrapper", 4 | "doc": "A record with a single int value field.", 5 | "fields": [ 6 | {"name": "value", "type": "int"} 7 | ] 8 | } -------------------------------------------------------------------------------- /ch19-spark/src/test/avro/WeatherRecord.avsc: -------------------------------------------------------------------------------- 1 | { 2 | 
"type": "record", 3 | "name": "WeatherRecord", 4 | "namespace": "specific", 5 | "doc": "A weather reading.", 6 | "fields": [ 7 | {"name": "year", "type": "int"}, 8 | {"name": "temperature", "type": "int"}, 9 | {"name": "stationId", "type": "string"} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/fruit.txt: -------------------------------------------------------------------------------- 1 | cherry 2 | apple 3 | banana 4 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/numbers.seq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/ch19-spark/src/test/resources/numbers.seq -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/quangle.txt: -------------------------------------------------------------------------------- 1 | On the top of the Crumpetty Tree 2 | The Quangle Wangle sat, 3 | But his face you could not see, 4 | On account of his Beaver Hat. 5 | -------------------------------------------------------------------------------- /ch19-spark/src/test/resources/set2.txt: -------------------------------------------------------------------------------- 1 | b 2 | c 3 | a 4 | e 5 | b 6 | -------------------------------------------------------------------------------- /ch19-spark/src/test/scala/CustomKryoRegistrator.scala: -------------------------------------------------------------------------------- 1 | import com.esotericsoftware.kryo.Kryo 2 | import org.apache.spark.serializer.KryoRegistrator 3 | import specific.WeatherRecord 4 | 5 | class CustomKryoRegistrator extends KryoRegistrator { 6 | override def registerClasses(kryo: Kryo) { 7 | kryo.register(classOf[WeatherRecord]) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ch19-spark/src/test/scala/ReflectWeatherRecord.scala: -------------------------------------------------------------------------------- 1 | case class ReflectWeatherRecord(year: Int, temperature: Int, stationId: String) { 2 | def this() = this(0, 0, null) 3 | } -------------------------------------------------------------------------------- /ch20-hbase/src/main/java/RowKeyConverter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.util.Bytes; 2 | 3 | public class RowKeyConverter { 4 | 5 | private static final int STATION_ID_LENGTH = 12; 6 | 7 | /** 8 | * @return A row key whose format is: 9 | */ 10 | public static byte[] makeObservationRowKey(String stationId, 11 | long observationTime) { 12 | byte[] row = new byte[STATION_ID_LENGTH + Bytes.SIZEOF_LONG]; 13 | Bytes.putBytes(row, 0, Bytes.toBytes(stationId), 0, STATION_ID_LENGTH); 14 | long reverseOrderTimestamp = Long.MAX_VALUE - observationTime; 15 | Bytes.putLong(row, STATION_ID_LENGTH, reverseOrderTimestamp); 16 | return row; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /ch21-zk/src/main/java/DeleteGroup.java: -------------------------------------------------------------------------------- 1 | //cc DeleteGroup A program to delete a group and its members 2 | import java.util.List; 3 | 4 | import org.apache.zookeeper.KeeperException; 5 | 6 | // vv DeleteGroup 7 | public class DeleteGroup extends ConnectionWatcher { 8 | 9 
| public void delete(String groupName) throws KeeperException, 10 | InterruptedException { 11 | String path = "/" + groupName; 12 | 13 | try { 14 | List children = zk.getChildren(path, false); 15 | for (String child : children) { 16 | zk.delete(path + "/" + child, -1); 17 | } 18 | zk.delete(path, -1); 19 | } catch (KeeperException.NoNodeException e) { 20 | System.out.printf("Group %s does not exist\n", groupName); 21 | System.exit(1); 22 | } 23 | } 24 | 25 | public static void main(String[] args) throws Exception { 26 | DeleteGroup deleteGroup = new DeleteGroup(); 27 | deleteGroup.connect(args[0]); 28 | deleteGroup.delete(args[1]); 29 | deleteGroup.close(); 30 | } 31 | } 32 | // ^^ DeleteGroup 33 | -------------------------------------------------------------------------------- /ch21-zk/src/main/java/JoinGroup.java: -------------------------------------------------------------------------------- 1 | //cc JoinGroup A program that joins a group 2 | 3 | import org.apache.zookeeper.CreateMode; 4 | import org.apache.zookeeper.KeeperException; 5 | import org.apache.zookeeper.ZooDefs.Ids; 6 | 7 | // vv JoinGroup 8 | public class JoinGroup extends ConnectionWatcher { 9 | 10 | public void join(String groupName, String memberName) throws KeeperException, 11 | InterruptedException { 12 | String path = "/" + groupName + "/" + memberName; 13 | String createdPath = zk.create(path, null/*data*/, Ids.OPEN_ACL_UNSAFE, 14 | CreateMode.EPHEMERAL); 15 | System.out.println("Created " + createdPath); 16 | } 17 | 18 | public static void main(String[] args) throws Exception { 19 | JoinGroup joinGroup = new JoinGroup(); 20 | joinGroup.connect(args[0]); 21 | joinGroup.join(args[1], args[2]); 22 | 23 | // stay alive until process is killed or thread is interrupted 24 | Thread.sleep(Long.MAX_VALUE); 25 | } 26 | } 27 | // ^^ JoinGroup 28 | -------------------------------------------------------------------------------- /ch21-zk/src/main/sh/group.sh: -------------------------------------------------------------------------------- 1 | : == group_create 2 | : == group_list_empty 3 | : == group_join 4 | : == group_list_after_join 5 | : == group_kill_goat 6 | : == group_list_after_kill 7 | : == group_delete 8 | : vv group_create 9 | java CreateGroup localhost zoo 10 | : ^^ group_create 11 | : vv group_list_empty 12 | java ListGroup localhost zoo 13 | : ^^ group_list_empty 14 | : vv group_join 15 | java JoinGroup localhost zoo duck & 16 | duck_pid=$! 17 | java JoinGroup localhost zoo cow & 18 | cow_pid=$! 19 | java JoinGroup localhost zoo goat & 20 | goat_pid=$! 
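# Each JoinGroup process registers an ephemeral znode under /zoo, so killing a
# member's process (the goat, below) removes it from subsequent group listings.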
21 | : ^^ group_join 22 | sleep 5 23 | : vv group_list_after_join 24 | java ListGroup localhost zoo 25 | : ^^ group_list_after_join 26 | : vv group_kill_goat 27 | kill $goat_pid 28 | : ^^ group_kill_goat 29 | sleep 5 30 | sleep 5 # be sure goat process has died 31 | : vv group_list_after_kill 32 | java ListGroup localhost zoo 33 | : ^^ group_list_after_kill 34 | kill $duck_pid 35 | kill $cow_pid 36 | : vv group_delete 37 | java DeleteGroup localhost zoo 38 | java ListGroup localhost zoo 39 | : ^^ group_delete -------------------------------------------------------------------------------- /ch22-case-studies/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | ch22-case-studies 12 | jar 13 | 4.0 14 | Chapter 22: Case Studies 15 | 16 | 17 | -------------------------------------------------------------------------------- /ch22-case-studies/src/main/java/TrackStats.jr: -------------------------------------------------------------------------------- 1 | module fm.last.hadoop.io.records { 2 | 3 | class TrackStats { 4 | int listeners; 5 | int plays; 6 | int scrobbles; 7 | int radioPlays; 8 | int skips; 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.hadoopbook 6 | hadoop-meta 7 | 4.0 8 | ../hadoop-meta/pom.xml 9 | 10 | com.hadoopbook 11 | common 12 | jar 13 | 4.0 14 | Common Code 15 | 16 | 17 | junit 18 | junit 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /common/src/main/java/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.io.Text; 2 | 3 | public class NcdcStationMetadataParser { 4 | 5 | private String stationId; 6 | private String stationName; 7 | 8 | public boolean parse(String record) { 9 | if (record.length() < 42) { // header 10 | return false; 11 | } 12 | String usaf = record.substring(0, 6); 13 | String wban = record.substring(7, 12); 14 | stationId = usaf + "-" + wban; 15 | stationName = record.substring(13, 42); 16 | try { 17 | Integer.parseInt(usaf); // USAF identifiers are numeric 18 | return true; 19 | } catch (NumberFormatException e) { 20 | return false; 21 | } 22 | } 23 | 24 | public boolean parse(Text record) { 25 | return parse(record.toString()); 26 | } 27 | 28 | public String getStationId() { 29 | return stationId; 30 | } 31 | 32 | public String getStationName() { 33 | return stationName; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /common/src/main/java/oldapi/NcdcStationMetadataParser.java: -------------------------------------------------------------------------------- 1 | package oldapi; 2 | 3 | import org.apache.hadoop.io.Text; 4 | 5 | public class NcdcStationMetadataParser { 6 | 7 | private String stationId; 8 | private String stationName; 9 | 10 | public boolean parse(String record) { 11 | if (record.length() < 42) { // header 12 | return false; 13 | } 14 | String usaf = record.substring(0, 6); 15 | String wban = record.substring(7, 12); 16 | stationId = usaf + "-" + wban; 17 | stationName = record.substring(13, 42); 18 | try { 19 | Integer.parseInt(usaf); // USAF identifiers are numeric 20 | return true; 21 | } catch 
(NumberFormatException e) { 22 | return false; 23 | } 24 | } 25 | 26 | public boolean parse(Text record) { 27 | return parse(record.toString()); 28 | } 29 | 30 | public String getStationId() { 31 | return stationId; 32 | } 33 | 34 | public String getStationName() { 35 | return stationName; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /common/src/test/java/NcdcStationMetadataParserTest.java: -------------------------------------------------------------------------------- 1 | import static org.hamcrest.CoreMatchers.is; 2 | import static org.junit.Assert.assertThat; 3 | import org.junit.*; 4 | 5 | public class NcdcStationMetadataParserTest { 6 | 7 | private NcdcStationMetadataParser parser; 8 | 9 | @Before 10 | public void setUp() { 11 | parser = new NcdcStationMetadataParser(); 12 | } 13 | 14 | @Test 15 | public void parsesValidRecord() { 16 | assertThat(parser.parse("715390 99999 MOOSE JAW CS CN CA SA CZMJ +50317 -105550 +05770"), is(true)); 17 | assertThat(parser.getStationId(), is("715390-99999")); 18 | assertThat(parser.getStationName().trim(), is("MOOSE JAW CS")); 19 | } 20 | 21 | @Test 22 | public void parsesHeader() { 23 | assertThat(parser.parse("Integrated Surface Database Station History, November 2007"), is(false)); 24 | } 25 | 26 | public void parsesBlankLine() { 27 | assertThat(parser.parse(""), is(false)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /conf/hadoop-cluster.template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://namenode/ 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | yarn 12 | 13 | 14 | 15 | yarn.resourcemanager.address 16 | resourcemanager:8032 17 | 18 | 19 | -------------------------------------------------------------------------------- /conf/hadoop-local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | file:/// 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | local 12 | 13 | 14 | -------------------------------------------------------------------------------- /conf/hadoop-localhost.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | 9 | 10 | mapreduce.framework.name 11 | yarn 12 | 13 | 14 | 15 | yarn.resourcemanager.address 16 | localhost:8032 17 | 18 | 19 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | -------------------------------------------------------------------------------- /conf/hadoop/pseudo-distributed/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 
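<!-- Pseudo-distributed YARN: run the ResourceManager on localhost and enable the
     MapReduce shuffle auxiliary service on the NodeManager. -->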
yarn.resourcemanager.hostname 6 | localhost 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | -------------------------------------------------------------------------------- /conf/pig/localhost/pig.properties: -------------------------------------------------------------------------------- 1 | fs.defaultFS=hdfs://localhost/ 2 | mapred.job.tracker=localhost:8021 -------------------------------------------------------------------------------- /conf/zookeeper/cluster/zoo.cfg: -------------------------------------------------------------------------------- 1 | tickTime=2000 2 | dataDir=/disk1/zookeeper 3 | dataLogDir=/disk2/zookeeper 4 | clientPort=2181 5 | initLimit=5 6 | syncLimit=2 7 | server.1=zookeeper1:2888:3888 8 | server.2=zookeeper2:2888:3888 9 | server.3=zookeeper3:2888:3888 10 | -------------------------------------------------------------------------------- /conf/zookeeper/localhost/zoo.cfg: -------------------------------------------------------------------------------- 1 | tickTime=2000 2 | dataDir=/Users/tom/zookeeper 3 | clientPort=2181 4 | -------------------------------------------------------------------------------- /input/avro/pairs.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/avro/pairs.avro -------------------------------------------------------------------------------- /input/badrecords/a: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | S 7 | G 8 | I 9 | 8 10 | 9 -------------------------------------------------------------------------------- /input/badrecords/b: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 -------------------------------------------------------------------------------- /input/badrecords/c: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /input/docs/1400-8.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/docs/1400-8.txt -------------------------------------------------------------------------------- /input/docs/quangle.txt: -------------------------------------------------------------------------------- 1 | On the top of the Crumpetty Tree 2 | The Quangle Wangle sat, 3 | But his face you could not see, 4 | On account of his Beaver Hat. 
5 | -------------------------------------------------------------------------------- /input/fileglob/2007/12/30/data-2007-12-30: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/30/data-2007-12-30 -------------------------------------------------------------------------------- /input/fileglob/2007/12/30/data[2007-12-30]: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/30/data[2007-12-30] -------------------------------------------------------------------------------- /input/fileglob/2007/12/31/data-2007-12-31: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2007/12/31/data-2007-12-31 -------------------------------------------------------------------------------- /input/fileglob/2008/01/01/data-2008-01-01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/fileglob/2008/01/01/data-2008-01-01 -------------------------------------------------------------------------------- /input/fileinput/a: -------------------------------------------------------------------------------- 1 | a -------------------------------------------------------------------------------- /input/fileinput/dir/b: -------------------------------------------------------------------------------- 1 | b -------------------------------------------------------------------------------- /input/hive/README: -------------------------------------------------------------------------------- 1 | Commands used to create some of the binary files: 2 | 3 | echo -e '0\x01Nat' > tables/users.txt 4 | echo -e '2\x01Joe' >> tables/users.txt 5 | echo -e '3\x01Kay' >> tables/users.txt 6 | echo -e '4\x01Ann' >> tables/users.txt 7 | 8 | echo -e '1\x022\x01a\x031\x02b\x032\x01a\x021\x021.0' > types/complex.txt 9 | 10 | echo -e '0\x01Nat\x01M' > tables/users_extended.txt 11 | echo -e '2\x01Joe\x01M' >> tables/users_extended.txt 12 | echo -e '3\x01Kay\x01F' >> tables/users_extended.txt 13 | echo -e '4\x01Ann\x01F' >> tables/users_extended.txt 14 | -------------------------------------------------------------------------------- /input/hive/dummy.txt: -------------------------------------------------------------------------------- 1 | X 2 | -------------------------------------------------------------------------------- /input/hive/joins/sales.txt: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 -------------------------------------------------------------------------------- /input/hive/joins/things.txt: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf -------------------------------------------------------------------------------- /input/hive/partitions/file1: -------------------------------------------------------------------------------- 1 | 1Log line 1 -------------------------------------------------------------------------------- /input/hive/partitions/file2: 
-------------------------------------------------------------------------------- 1 | 2Log line 2 -------------------------------------------------------------------------------- /input/hive/partitions/file3: -------------------------------------------------------------------------------- 1 | 3Log line 3 -------------------------------------------------------------------------------- /input/hive/partitions/file4: -------------------------------------------------------------------------------- 1 | 4Log line 4 -------------------------------------------------------------------------------- /input/hive/partitions/file5: -------------------------------------------------------------------------------- 1 | 5Log line 5 -------------------------------------------------------------------------------- /input/hive/partitions/file6: -------------------------------------------------------------------------------- 1 | 6Log line 6 -------------------------------------------------------------------------------- /input/hive/tables/users.txt: -------------------------------------------------------------------------------- 1 | 0Nat 2 | 2Joe 3 | 3Kay 4 | 4Ann 5 | -------------------------------------------------------------------------------- /input/hive/tables/users_extended.txt: -------------------------------------------------------------------------------- 1 | 0NatM 2 | 2JoeM 3 | 3KayF 4 | 4AnnF 5 | -------------------------------------------------------------------------------- /input/hive/tmp.txt: -------------------------------------------------------------------------------- 1 | 1 a 2 | 2 b 3 | 3 c -------------------------------------------------------------------------------- /input/hive/types/complex.txt: -------------------------------------------------------------------------------- 1 | 12b2a1a11.0163 2 | -------------------------------------------------------------------------------- /input/hive/types/nested.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/hive/types/nested.txt -------------------------------------------------------------------------------- /input/hive/udfs/arrays.txt: -------------------------------------------------------------------------------- 1 | ab 2 | cde 3 | -------------------------------------------------------------------------------- /input/hive/udfs/fruit.txt: -------------------------------------------------------------------------------- 1 | pomegranate 2 | banana 3 | apple 4 | lychee 5 | -------------------------------------------------------------------------------- /input/hive/udfs/max1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 -------------------------------------------------------------------------------- /input/hive/udfs/max2.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 3 -------------------------------------------------------------------------------- /input/ncdc/all/1901.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/all/1901.gz -------------------------------------------------------------------------------- /input/ncdc/all/1902.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/all/1902.gz -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample.txt: -------------------------------------------------------------------------------- 1 | 1950 0 1 2 | 1950 22 1 3 | 1950 -11 1 4 | 1949 111 1 5 | 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample2.txt: -------------------------------------------------------------------------------- 1 | A 1950 0 1 2 | B 1950 22 1 3 | A 1950 -11 1 4 | B 1949 111 1 5 | A 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro-tab/sample_corrupt.txt: -------------------------------------------------------------------------------- 1 | 1950 0 1 2 | 1950 22 1 3 | 1950 e 1 4 | 1949 111 1 5 | 1949 78 1 6 | -------------------------------------------------------------------------------- /input/ncdc/micro/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /input/ncdc/sample.txt: -------------------------------------------------------------------------------- 1 | 0067011990999991950051507004+68750+023550FM-12+038299999V0203301N00671220001CN9999999N9+00001+99999999999 2 | 0043011990999991950051512004+68750+023550FM-12+038299999V0203201N00671220001CN9999999N9+00221+99999999999 3 | 0043011990999991950051518004+68750+023550FM-12+038299999V0203201N00261220001CN9999999N9-00111+99999999999 4 | 0043012650999991949032412004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+01111+99999999999 5 | 0043012650999991949032418004+62300+010750FM-12+048599999V0202701N00461220001CN0500001N9+00781+99999999999 -------------------------------------------------------------------------------- /input/ncdc/sample.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/ncdc/sample.txt.gz -------------------------------------------------------------------------------- /input/pig/combine/A: -------------------------------------------------------------------------------- 1 | 2 3 2 | 1 2 3 | 2 4 4 | -------------------------------------------------------------------------------- /input/pig/combine/B: -------------------------------------------------------------------------------- 1 | z x 8 2 | w y 1 3 | -------------------------------------------------------------------------------- /input/pig/corrupt/missing_fields: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/foreach/A: 
-------------------------------------------------------------------------------- 1 | Joe cherry 2 2 | Ali apple 3 3 | Joe banana 2 4 | Eve apple 7 5 | -------------------------------------------------------------------------------- /input/pig/group/A: -------------------------------------------------------------------------------- 1 | Joe cherry 2 | Ali apple 3 | Joe banana 4 | Eve apple 5 | -------------------------------------------------------------------------------- /input/pig/join/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/join/B: -------------------------------------------------------------------------------- 1 | Joe 2 2 | Hank 4 3 | Ali 0 4 | Eve 3 5 | Hank 2 6 | -------------------------------------------------------------------------------- /input/pig/multiquery/A: -------------------------------------------------------------------------------- 1 | Joe cherry 2 | Ali apple 3 | Joe banana 4 | Eve apple 5 | -------------------------------------------------------------------------------- /input/pig/nested/A: -------------------------------------------------------------------------------- 1 | popcorn {(cherry, 1), (cranberry, 3), (pomegranate, 2)} [a:1, b:2] 2 | burger {(apple, 1), (banana, 3), (tangerine, 2)} [a:2, b:1] 3 | -------------------------------------------------------------------------------- /input/pig/nested/B: -------------------------------------------------------------------------------- 1 | popcorn [a:1, b:2] 2 | burger [a:2, b:1] 3 | -------------------------------------------------------------------------------- /input/pig/pairwise/postings: -------------------------------------------------------------------------------- 1 | A {(d1,2),(d3,1)} 2 | B {(d1,1),(d2,1),(d3,2)} 3 | -------------------------------------------------------------------------------- /input/pig/schema/A: -------------------------------------------------------------------------------- 1 | 2 Tie 2 | 4 Coat 3 | 3 Hat 4 | 1 Scarf 5 | -------------------------------------------------------------------------------- /input/pig/sort/A: -------------------------------------------------------------------------------- 1 | 2 3 2 | 1 2 3 | 2 4 4 | -------------------------------------------------------------------------------- /input/pig/tuples/A: -------------------------------------------------------------------------------- 1 | (1,pomegranate) 2 | -------------------------------------------------------------------------------- /input/pig/types/A: -------------------------------------------------------------------------------- 1 | 1 (1,'pomegranate',(2,'apple')) 2 | 2 (3,'banana',(4,lychee)) 3 | -------------------------------------------------------------------------------- /input/pig/types/B: -------------------------------------------------------------------------------- 1 | [a#pomegranate] 2 | -------------------------------------------------------------------------------- /input/pig/types/C: -------------------------------------------------------------------------------- 1 | a pomegranate 2 | b apple 3 | -------------------------------------------------------------------------------- /input/pig/types/one: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /input/pig/udfs/A: 
-------------------------------------------------------------------------------- 1 | pomegranate 2 | banana 3 | apple 4 | lychee 5 | -------------------------------------------------------------------------------- /input/smallfiles/a: -------------------------------------------------------------------------------- 1 | aaaaaaaaaa -------------------------------------------------------------------------------- /input/smallfiles/b: -------------------------------------------------------------------------------- 1 | bbbbbbbbbb -------------------------------------------------------------------------------- /input/smallfiles/c: -------------------------------------------------------------------------------- 1 | cccccccccc -------------------------------------------------------------------------------- /input/smallfiles/d: -------------------------------------------------------------------------------- 1 | dddddddddd -------------------------------------------------------------------------------- /input/smallfiles/e: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomwhite/hadoop-book/472f6a86ed5865de2abde79eed1c267d0621ed00/input/smallfiles/e -------------------------------------------------------------------------------- /input/smallfiles/f: -------------------------------------------------------------------------------- 1 | ffffffffff -------------------------------------------------------------------------------- /input/wikipedia/example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page title 4 | edit=sysop:move=sysop 5 | 6 | 2001-01-15T13:15:00Z 7 | Foobar 8 | I have just one thing to say! 9 | A bunch of [[text]] here. 10 | 11 | 12 | 13 | 2001-01-15T13:10:27Z 14 | 10.0.0.2 15 | new! 16 | An earlier [[revision]]. 17 | 18 | 19 | 20 | 21 | Talk:Page title 22 | 23 | 2001-01-15T14:03:00Z 24 | 10.0.0.2 25 | hey 26 | WHYD YOU LOCK PAGE??!!! i was editing that jerk 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /snippet/README: -------------------------------------------------------------------------------- 1 | This directory contains tools for generating code snippets for the book and 2 | testing that they are as expected. 3 | 4 | Example invocations: 5 | 6 | # First set the location of the Hadoop installation you are testing 7 | # You need to have an unpacked copy of Hadoop in this directory 8 | export HADOOP_HOME=~/dev/hadoop-1.0.0/ 9 | 10 | # From the top level 11 | mvn verify -Phadoop.version=1.0.0 12 | 13 | # From the snippet directory 14 | mvn verify 15 | 16 | # Run against a pseudo cluster (you need to start it first) 17 | mvn verify -DargLine="-Dexample.mode=pseudo" 18 | 19 | # Run the examples from chapter 2 only 20 | mvn verify -DargLine="-Dexample.chapters=ch02-mr-intro" -------------------------------------------------------------------------------- /snippet/bin/check_expected.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=`dirname "$0"` 4 | bin=`cd "$bin"; pwd` 5 | 6 | actual="$bin"/../actual 7 | expected="$bin"/../expected 8 | 9 | for f in $expected/ch16-pig/grunt/*.xml; do 10 | echo $f 11 | f_actual=$actual/ch16-pig/grunt/`basename $f` 12 | diff $f $f_actual > /dev/null 13 | if [ $? 
!= 0 ]; then 14 | echo "Expected file $f different to actual $f_actual:" 15 | diff $f $f_actual 16 | #exit 1 17 | fi 18 | done -------------------------------------------------------------------------------- /snippet/bin/check_manuscript.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Check that the expected (or actual) snippets are in the manuscript. E.g. 4 | # bin/check_manuscript.py ~/book-workspace/htdg-git/ch16-pig.xml expected/ch16-pig/grunt/* 5 | 6 | import sys 7 | 8 | manuscript = open(sys.argv[1], 'r').read() 9 | 10 | for snippet_file in sys.argv[2:]: 11 | lines = open(snippet_file, 'r').readlines() 12 | if lines[0].startswith(" 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/httpfs-signature.secret: -------------------------------------------------------------------------------- 1 | hadoop httpfs secret 2 | -------------------------------------------------------------------------------- /snippet/conf/local/httpfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-env.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | @rem Licensed to the Apache Software Foundation (ASF) under one or more 3 | @rem contributor license agreements. See the NOTICE file distributed with 4 | @rem this work for additional information regarding copyright ownership. 5 | @rem The ASF licenses this file to You under the Apache License, Version 2.0 6 | @rem (the "License"); you may not use this file except in compliance with 7 | @rem the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, software 12 | @rem distributed under the License is distributed on an "AS IS" BASIS, 13 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | @rem See the License for the specific language governing permissions and 15 | @rem limitations under the License. 
16 | 17 | set HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000 18 | 19 | set HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 20 | 21 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /snippet/conf/local/mapred-site.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /snippet/conf/local/slaves: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /snippet/conf/local/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.scheduler.capacity.root.queues 5 | prod,dev 6 | 7 | 8 | yarn.scheduler.capacity.root.dev.queues 9 | eng,science 10 | 11 | 12 | yarn.scheduler.capacity.root.prod.capacity 13 | 40 14 | 15 | 16 | yarn.scheduler.capacity.root.dev.capacity 17 | 60 18 | 19 | 20 | yarn.scheduler.capacity.root.dev.maximum-capacity 21 | 75 22 | 23 | 24 | yarn.scheduler.capacity.root.dev.eng.capacity 25 | 50 26 | 27 | 28 | yarn.scheduler.capacity.root.dev.science.capacity 29 | 50 30 | 31 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/container-executor.cfg: -------------------------------------------------------------------------------- 1 | yarn.nodemanager.linux-container-executor.group=#configured value of yarn.nodemanager.linux-container-executor.group 2 | banned.users=#comma separated list of users who can not run applications 3 | min.user.id=1000#Prevent other super-users 4 | allowed.system.users=##comma separated list of system users who CAN run applications 5 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://localhost/ 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fair 4 | 5 | 6 | 40 7 | fifo 8 | 9 | 10 | 11 | 60 12 | 13 | 14 | 15 | 16 | 5 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.replication 6 | 1 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/httpfs-signature.secret: -------------------------------------------------------------------------------- 1 | hadoop httpfs secret 2 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/httpfs-site.xml: -------------------------------------------------------------------------------- 1 
| 2 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-env.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | @rem Licensed to the Apache Software Foundation (ASF) under one or more 3 | @rem contributor license agreements. See the NOTICE file distributed with 4 | @rem this work for additional information regarding copyright ownership. 5 | @rem The ASF licenses this file to You under the Apache License, Version 2.0 6 | @rem (the "License"); you may not use this file except in compliance with 7 | @rem the License. You may obtain a copy of the License at 8 | @rem 9 | @rem http://www.apache.org/licenses/LICENSE-2.0 10 | @rem 11 | @rem Unless required by applicable law or agreed to in writing, software 12 | @rem distributed under the License is distributed on an "AS IS" BASIS, 13 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | @rem See the License for the specific language governing permissions and 15 | @rem limitations under the License. 16 | 17 | set HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000 18 | 19 | set HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 20 | 21 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapreduce.framework.name 6 | yarn 7 | 8 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/mapred-site.xml.template: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/slaves: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /snippet/conf/pseudo/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | yarn.resourcemanager.hostname 6 | localhost 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/combine_schema.xml: -------------------------------------------------------------------------------- 1 | grunt> DESCRIBE A; 2 | A: {f0: int,f1: int} 3 | grunt> DESCRIBE B; 4 | B: {f0: chararray,f1: chararray,f2: int} 5 | grunt> DESCRIBE C; 6 | Schema for C unknown. 
-------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/combine_union.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,3) 3 | (1,2) 4 | (2,4) 5 | grunt> DUMP B; 6 | (z,x,8) 7 | (w,y,1) 8 | grunt> C = UNION A, B; 9 | grunt> DUMP C; 10 | (2,3) 11 | (1,2) 12 | (2,4) 13 | (z,x,8) 14 | (w,y,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/foreach_generate.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (Joe,cherry,2) 3 | (Ali,apple,3) 4 | (Joe,banana,2) 5 | (Eve,apple,7) 6 | grunt> B = FOREACH A GENERATE $0, $2+1, 'Constant'; 7 | grunt> DUMP B; 8 | (Joe,3,Constant) 9 | (Ali,4,Constant) 10 | (Joe,3,Constant) 11 | (Eve,8,Constant) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_all.xml: -------------------------------------------------------------------------------- 1 | grunt> C = GROUP A ALL; 2 | grunt> DUMP C; 3 | (all,{(Joe,cherry),(Ali,apple),(Joe,banana),(Eve,apple)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (Joe,cherry) 3 | (Ali,apple) 4 | (Joe,banana) 5 | (Eve,apple) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/group_expression.xml: -------------------------------------------------------------------------------- 1 | grunt> B = GROUP A BY SIZE($1); 2 | grunt> DUMP B; 3 | (5,{(Ali,apple),(Eve,apple)}) 4 | (6,{(Joe,cherry),(Joe,banana)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup.xml: -------------------------------------------------------------------------------- 1 | grunt> D = COGROUP A BY $0, B BY $1; 2 | grunt> DUMP D; 3 | (0,{},{(Ali,0)}) 4 | (1,{(1,Scarf)},{}) 5 | (2,{(2,Tie)},{(Joe,2),(Hank,2)}) 6 | (3,{(3,Hat)},{(Eve,3)}) 7 | (4,{(4,Coat)},{(Hank,4)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_flatten.xml: -------------------------------------------------------------------------------- 1 | grunt> F = FOREACH E GENERATE FLATTEN(A), B.$0; 2 | grunt> DUMP F; 3 | (1,Scarf,{}) 4 | (2,Tie,{(Joe),(Hank)}) 5 | (3,Hat,{(Eve)}) 6 | (4,Coat,{(Hank)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_inner.xml: -------------------------------------------------------------------------------- 1 | grunt> E = COGROUP A BY $0 INNER, B BY $1; 2 | grunt> DUMP E; 3 | (1,{(1,Scarf)},{}) 4 | (2,{(2,Tie)},{(Joe,2),(Hank,2)}) 5 | (3,{(3,Hat)},{(Eve,3)}) 6 | (4,{(4,Coat)},{(Hank,4)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_cogroup_join.xml: -------------------------------------------------------------------------------- 1 | grunt> G = COGROUP A BY $0 INNER, B BY $1 INNER; 2 | grunt> H = FOREACH G GENERATE FLATTEN($1), FLATTEN($2); 3 | grunt> DUMP H; 4 | (2,Tie,Joe,2) 5 | (2,Tie,Hank,2) 6 | (3,Hat,Eve,3) 7 | (4,Coat,Hank,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_dump.xml: 
-------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,Tie) 3 | (4,Coat) 4 | (3,Hat) 5 | (1,Scarf) 6 | grunt> DUMP B; 7 | (Joe,2) 8 | (Hank,4) 9 | (Ali,0) 10 | (Eve,3) 11 | (Hank,2) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_frj.xml: -------------------------------------------------------------------------------- 1 | grunt> C = JOIN A BY $0, B BY $1 USING "replicated"; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/join_join.xml: -------------------------------------------------------------------------------- 1 | grunt> C = JOIN A BY $0, B BY $1; 2 | grunt> DUMP C; 3 | (2,Tie,Joe,2) 4 | (2,Tie,Hank,2) 5 | (3,Hat,Eve,3) 6 | (4,Coat,Hank,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_describe_records.xml: -------------------------------------------------------------------------------- 1 | grunt> DESCRIBE records; 2 | records: {year: chararray,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_dump_grouped_records.xml: -------------------------------------------------------------------------------- 1 | grunt> grouped_records = GROUP filtered_records BY year; 2 | grunt> DUMP grouped_records; 3 | (1949,{(1949,111,1),(1949,78,1)}) 4 | (1950,{(1950,0,1),(1950,22,1),(1950,-11,1)}) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_dump_records.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP records; 2 | (1950,0,1) 3 | (1950,22,1) 4 | (1950,-11,1) 5 | (1949,111,1) 6 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_filter_records.xml: -------------------------------------------------------------------------------- 1 | grunt> filtered_records = FILTER records BY temperature != 9999 AND 2 | >> (quality == 0 OR quality == 1 OR quality == 4 OR quality == 5 OR quality == 9); 3 | grunt> DUMP filtered_records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1950,-11,1) 7 | (1949,111,1) 8 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_load.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year:chararray, temperature:int, quality:int); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_max_temp.xml: -------------------------------------------------------------------------------- 1 | grunt> max_temp = FOREACH grouped_records GENERATE group, 2 | >> MAX(filtered_records.temperature); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/max_temp_result.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP max_temp; 2 | (1949,111) 3 | (1950,22) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/missing_fields.xml: -------------------------------------------------------------------------------- 1 | grunt> A = LOAD 'input/pig/corrupt/missing_fields'; 2 | 
grunt> DUMP A; 3 | (2,Tie) 4 | (4,Coat) 5 | (3) 6 | (1,Scarf) 7 | grunt> B = FILTER A BY SIZE(TOTUPLE(*)) > 1; 8 | grunt> DUMP B; 9 | (2,Tie) 10 | (4,Coat) 11 | (1,Scarf) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_corrupt.xml: -------------------------------------------------------------------------------- 1 | grunt> corrupt_records = FILTER records BY temperature is null; 2 | grunt> DUMP corrupt_records; 3 | (1950,,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_count.xml: -------------------------------------------------------------------------------- 1 | grunt> grouped = GROUP corrupt_records ALL; 2 | grunt> all_grouped = FOREACH grouped GENERATE group, COUNT(corrupt_records); 3 | grunt> DUMP all_grouped; 4 | (all,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample_corrupt.txt' 2 | >> AS (year:chararray, temperature:int, quality:int); 3 | grunt> DUMP records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1950,,1) 7 | (1949,111,1) 8 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/null_split.xml: -------------------------------------------------------------------------------- 1 | grunt> SPLIT records INTO good_records IF temperature is not null, 2 | >> bad_records IF temperature is null; 3 | grunt> DUMP good_records; 4 | (1950,0,1) 5 | (1950,22,1) 6 | (1949,111,1) 7 | (1949,78,1) 8 | grunt> DUMP bad_records; 9 | (1950,,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_absent.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt'; 2 | grunt> DESCRIBE records; 3 | Schema for records unknown. 
-------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_absent_projected.xml: -------------------------------------------------------------------------------- 1 | grunt> projected_records = FOREACH records GENERATE $0, $1, $2; 2 | grunt> DUMP projected_records; 3 | (1950,0,1) 4 | (1950,22,1) 5 | (1950,-11,1) 6 | (1949,111,1) 7 | (1949,78,1) 8 | grunt> DESCRIBE projected_records; 9 | projected_records: {bytearray,bytearray,bytearray} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_names_only.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year, temperature, quality); 3 | grunt> DESCRIBE records; 4 | records: {year: bytearray,temperature: bytearray,quality: bytearray} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_one_type_only.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year, temperature:int, quality:int); 3 | grunt> DESCRIBE records; 4 | records: {year: bytearray,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/schema_types.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro-tab/sample.txt' 2 | >> AS (year:int, temperature:int, quality:int); 3 | grunt> DESCRIBE records; 4 | records: {year: int,temperature: int,quality: int} -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/set_debug_on.xml: -------------------------------------------------------------------------------- 1 | grunt> set debug on -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_dump.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | (2,3) 3 | (1,2) 4 | (2,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_limit.xml: -------------------------------------------------------------------------------- 1 | grunt> D = LIMIT B 2; 2 | grunt> DUMP D; 3 | (1,2) 4 | (2,4) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_no_order.xml: -------------------------------------------------------------------------------- 1 | grunt> C = FOREACH B GENERATE *; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/sort_order.xml: -------------------------------------------------------------------------------- 1 | grunt> B = ORDER A BY $0, $1 DESC; 2 | grunt> DUMP B; 3 | (1,2) 4 | (2,4) 5 | (2,3) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/store_colon_delimited.xml: -------------------------------------------------------------------------------- 1 | grunt> STORE A INTO 'out' USING PigStorage(':'); 2 | grunt> cat out 3 | Joe:cherry:2 4 | Ali:apple:3 5 | Joe:banana:2 6 | Eve:apple:7 -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/stream_cut.xml: 
-------------------------------------------------------------------------------- 1 | grunt> C = STREAM A THROUGH `cut -f 2`; 2 | grunt> DUMP C; 3 | (cherry) 4 | (apple) 5 | (banana) 6 | (apple) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_invoke_long.xml: -------------------------------------------------------------------------------- 1 | grunt> filtered_records = FILTER records BY temperature != 9999 AND 2 | >> com.hadoopbook.pig.IsGoodQuality(quality); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_invoke_short.xml: -------------------------------------------------------------------------------- 1 | grunt> DEFINE isGood com.hadoopbook.pig.IsGoodQuality(); 2 | grunt> filtered_records = FILTER records BY temperature != 9999 AND isGood(quality); -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_load.xml: -------------------------------------------------------------------------------- 1 | grunt> records = LOAD 'input/ncdc/micro/sample.txt' 2 | >> USING com.hadoopbook.pig.CutLoadFunc('16-19,88-92,93-93') 3 | >> AS (year:int, temperature:int, quality:int); 4 | grunt> DUMP records; 5 | (1950,0,1) 6 | (1950,22,1) 7 | (1950,-11,1) 8 | (1949,111,1) 9 | (1949,78,1) -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_register.xml: -------------------------------------------------------------------------------- 1 | grunt> REGISTER pig-examples.jar; -------------------------------------------------------------------------------- /snippet/expected/ch11/grunt/udfs_schema.xml: -------------------------------------------------------------------------------- 1 | grunt> DUMP A; 2 | ( pomegranate) 3 | (banana ) 4 | (apple) 5 | ( lychee ) 6 | grunt> DESCRIBE A; 7 | A: {fruit: chararray} 8 | grunt> B = FOREACH A GENERATE com.hadoopbook.pig.Trim(fruit); 9 | grunt> DUMP B; 10 | (pomegranate) 11 | (banana) 12 | (apple) 13 | (lychee) 14 | grunt> DESCRIBE B; 15 | B: {chararray} -------------------------------------------------------------------------------- /snippet/src/test/resources/copyoutput.sh: -------------------------------------------------------------------------------- 1 | if [ ! -e output ]; then 2 | hadoop fs -get output . 3 | fi 4 | -------------------------------------------------------------------------------- /snippet/src/test/resources/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if ! hadoop fs -test -e input; then 3 | hadoop fs -put input . 4 | fi 5 | if hadoop fs -test -e output; then 6 | hadoop fs -rmr output 7 | fi 8 | if [ -e output ]; then 9 | rm -r output 10 | fi 11 | --------------------------------------------------------------------------------
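
The two helper scripts above are the glue the snippet tests place around each example run: setup.sh stages the local input/ directory in HDFS and clears any stale output directory (both in HDFS and locally), and copyoutput.sh pulls the HDFS output directory back to the local filesystem so it can be compared against the expected snippets. A minimal sketch of wrapping them around a single run by hand, assuming a pseudo-distributed cluster is up and an example JAR has been built (the JAR and class names below are illustrative assumptions, not taken from the scripts themselves):

    #!/bin/sh
    # Stage input in HDFS and remove any stale output directory (setup.sh above)
    snippet/src/test/resources/setup.sh

    # Run one example against the staged input; JAR and class names are assumptions
    hadoop jar hadoop-examples.jar MaxTemperature input/ncdc/sample.txt output

    # Copy the HDFS output directory back to the local filesystem (copyoutput.sh above)
    snippet/src/test/resources/copyoutput.sh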