├── .gitignore ├── .travis.yml ├── Changes.md ├── LICENSE ├── Readme.md ├── cascading-protobuf ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── cascading │ └── protobuf │ ├── ProtobufComparator.java │ ├── ProtobufDeserializer.java │ ├── ProtobufReflectionUtil.java │ ├── ProtobufSerialization.java │ └── ProtobufSerializer.java ├── cascading2 ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── cascading2 │ │ └── scheme │ │ ├── CombinedSequenceFile.java │ │ ├── CombinedWritableSequenceFile.java │ │ ├── LzoBinaryScheme.java │ │ ├── LzoByteArrayScheme.java │ │ ├── LzoProtobufB64LineScheme.java │ │ ├── LzoProtobufBlockScheme.java │ │ ├── LzoProtobufScheme.java │ │ ├── LzoTextDelimited.java │ │ ├── LzoTextLine.java │ │ ├── LzoThriftB64LineScheme.java │ │ └── LzoThriftScheme.java │ └── test │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── cascading2 │ └── scheme │ └── TestCombinedSequenceFile.java ├── cascading3 ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── cascading3 │ │ └── scheme │ │ ├── CombinedSequenceFile.java │ │ ├── LzoBinaryScheme.java │ │ ├── LzoByteArrayScheme.java │ │ ├── LzoProtobufScheme.java │ │ ├── LzoTextDelimited.java │ │ ├── LzoTextLine.java │ │ └── LzoThriftScheme.java │ └── test │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── cascading3 │ └── scheme │ └── TestCombinedSequenceFile.java ├── core ├── pom.xml ├── src │ ├── main │ │ ├── java │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── elephantbird │ │ │ │ ├── mapred │ │ │ │ ├── input │ │ │ │ │ ├── DeprecatedFileInputFormatWrapper.java │ │ │ │ │ ├── DeprecatedInputFormatValueCopier.java │ │ │ │ │ ├── DeprecatedInputFormatWrapper.java │ │ │ │ │ ├── DeprecatedLzoJsonInputFormat.java │ │ │ │ │ ├── DeprecatedLzoTextInputFormat.java │ │ │ │ │ ├── DeprecatedMultiInputFormat.java │ │ │ │ │ └── DeprecatedRawMultiInputFormat.java │ │ │ │ └── output │ │ 
│ │ │ ├── DeprecatedFileOutputFormatWrapper.java │ │ │ │ │ ├── DeprecatedLzoTextOutputFormat.java │ │ │ │ │ └── DeprecatedOutputFormatWrapper.java │ │ │ │ ├── mapreduce │ │ │ │ ├── input │ │ │ │ │ ├── Base64Codec.java │ │ │ │ │ ├── BinaryConverterProvider.java │ │ │ │ │ ├── FilterRecordReader.java │ │ │ │ │ ├── IntegerListInputFormat.java │ │ │ │ │ ├── IntegerListInputSplit.java │ │ │ │ │ ├── IntegerListRecordReader.java │ │ │ │ │ ├── LzoBinaryB64LineRecordReader.java │ │ │ │ │ ├── LzoBinaryBlockRecordReader.java │ │ │ │ │ ├── LzoGenericB64LineRecordReader.java │ │ │ │ │ ├── LzoGenericBlockRecordReader.java │ │ │ │ │ ├── LzoGenericProtobufBlockInputFormat.java │ │ │ │ │ ├── LzoGenericProtobufBlockRecordReader.java │ │ │ │ │ ├── LzoInputFormat.java │ │ │ │ │ ├── LzoJsonInputFormat.java │ │ │ │ │ ├── LzoJsonRecordReader.java │ │ │ │ │ ├── LzoLineRecordReader.java │ │ │ │ │ ├── LzoProtobufB64LineInputFormat.java │ │ │ │ │ ├── LzoProtobufB64LineRecordReader.java │ │ │ │ │ ├── LzoProtobufBlockInputFormat.java │ │ │ │ │ ├── LzoProtobufBlockRecordReader.java │ │ │ │ │ ├── LzoRecordReader.java │ │ │ │ │ ├── LzoTextInputFormat.java │ │ │ │ │ ├── LzoThriftB64LineInputFormat.java │ │ │ │ │ ├── LzoThriftB64LineRecordReader.java │ │ │ │ │ ├── LzoThriftBlockInputFormat.java │ │ │ │ │ ├── LzoThriftBlockRecordReader.java │ │ │ │ │ ├── LzoW3CLogInputFormat.java │ │ │ │ │ ├── LzoW3CLogRecordReader.java │ │ │ │ │ ├── MapReduceInputFormatWrapper.java │ │ │ │ │ ├── MapredInputFormatCompatible.java │ │ │ │ │ ├── MultiInputFormat.java │ │ │ │ │ ├── RawMultiInputFormat.java │ │ │ │ │ ├── RawSequenceFileInputFormat.java │ │ │ │ │ ├── RawSequenceFileRecordReader.java │ │ │ │ │ └── combine │ │ │ │ │ │ ├── CompositeInputSplit.java │ │ │ │ │ │ ├── CompositeRecordReader.java │ │ │ │ │ │ └── DelegateCombineFileInputFormat.java │ │ │ │ ├── io │ │ │ │ │ ├── BinaryBlockReader.java │ │ │ │ │ ├── BinaryBlockWriter.java │ │ │ │ │ ├── BinaryConverter.java │ │ │ │ │ ├── BinaryWritable.java │ │ │ │ │ ├── 
DecodeException.java │ │ │ │ │ ├── GenericWritable.java │ │ │ │ │ ├── IdentityBinaryConverter.java │ │ │ │ │ ├── ProtobufBlockReader.java │ │ │ │ │ ├── ProtobufBlockWriter.java │ │ │ │ │ ├── ProtobufConverter.java │ │ │ │ │ ├── ProtobufWritable.java │ │ │ │ │ ├── RawBlockReader.java │ │ │ │ │ ├── RawBlockWriter.java │ │ │ │ │ ├── RawBytesWritable.java │ │ │ │ │ ├── SerializedBlock.java │ │ │ │ │ ├── ThriftBlockReader.java │ │ │ │ │ ├── ThriftBlockWriter.java │ │ │ │ │ ├── ThriftConverter.java │ │ │ │ │ ├── ThriftWritable.java │ │ │ │ │ └── TypedProtobufWritable.java │ │ │ │ └── output │ │ │ │ │ ├── LzoBinaryB64LineRecordWriter.java │ │ │ │ │ ├── LzoBinaryBlockOutputFormat.java │ │ │ │ │ ├── LzoBinaryBlockRecordWriter.java │ │ │ │ │ ├── LzoGenericBlockOutputFormat.java │ │ │ │ │ ├── LzoOutputFormat.java │ │ │ │ │ ├── LzoProtobufB64LineOutputFormat.java │ │ │ │ │ ├── LzoProtobufB64LineRecordWriter.java │ │ │ │ │ ├── LzoProtobufBlockOutputFormat.java │ │ │ │ │ ├── LzoProtobufBlockRecordWriter.java │ │ │ │ │ ├── LzoTextOutputFormat.java │ │ │ │ │ ├── LzoThriftB64LineOutputFormat.java │ │ │ │ │ ├── LzoThriftB64LineRecordWriter.java │ │ │ │ │ ├── LzoThriftBlockOutputFormat.java │ │ │ │ │ ├── LzoThriftBlockRecordWriter.java │ │ │ │ │ └── WorkFileOverride.java │ │ │ │ ├── thrift │ │ │ │ ├── TStructDescriptor.java │ │ │ │ ├── ThriftBinaryDeserializer.java │ │ │ │ ├── ThriftBinaryProtocol.java │ │ │ │ └── ThriftProtocolWrapper.java │ │ │ │ └── util │ │ │ │ ├── Codecs.java │ │ │ │ ├── CoreTestUtil.java │ │ │ │ ├── ExecuteOnClusterTool.java │ │ │ │ ├── HadoopUtils.java │ │ │ │ ├── HdfsUtils.java │ │ │ │ ├── Inflection.java │ │ │ │ ├── ListHelper.java │ │ │ │ ├── LzoUtils.java │ │ │ │ ├── Pair.java │ │ │ │ ├── PathFilters.java │ │ │ │ ├── Protobufs.java │ │ │ │ ├── SplitUtil.java │ │ │ │ ├── StreamSearcher.java │ │ │ │ ├── Strings.java │ │ │ │ ├── TaskHeartbeatThread.java │ │ │ │ ├── ThriftToDynamicProto.java │ │ │ │ ├── ThriftToProto.java │ │ │ │ ├── ThriftUtils.java │ │ │ │ 
├── TypeRef.java │ │ │ │ ├── Utils.java │ │ │ │ └── W3CLogParser.java │ │ └── protobuf │ │ │ ├── address_book.proto │ │ │ └── thrift_fixtures.proto │ └── test │ │ ├── java │ │ ├── com │ │ │ └── twitter │ │ │ │ └── elephantbird │ │ │ │ ├── mapreduce │ │ │ │ ├── input │ │ │ │ │ ├── TestBase64Codec.java │ │ │ │ │ ├── TestIntegerListInputFormat.java │ │ │ │ │ ├── TestLzoJsonRecordReader.java │ │ │ │ │ ├── TestLzoProtobufBlockInputFormat.java │ │ │ │ │ └── TestLzoTextInputFormat.java │ │ │ │ ├── io │ │ │ │ │ ├── TestProtobufWritable.java │ │ │ │ │ └── TestTypedProtobufWritable.java │ │ │ │ └── output │ │ │ │ │ ├── TestLzoTextOutputFormat.java │ │ │ │ │ └── TestSimpleProtobufOutputFormat.java │ │ │ │ ├── thrift │ │ │ │ └── TestThriftBinaryProtocol.java │ │ │ │ └── util │ │ │ │ ├── TestCodecs.java │ │ │ │ ├── TestHadoopUtils.java │ │ │ │ ├── TestHdfsUtils.java │ │ │ │ ├── TestPathFilters.java │ │ │ │ ├── TestSplitUtil.java │ │ │ │ ├── TestStrings.java │ │ │ │ ├── TestThriftToDynamicProto.java │ │ │ │ └── TestThriftToProto.java │ │ └── org │ │ │ └── apache │ │ │ └── thrift │ │ │ └── Fixtures.java │ │ ├── resources │ │ └── com │ │ │ └── twitter │ │ │ └── elephantbird │ │ │ └── util │ │ │ └── sample_dir │ │ │ ├── a.txt │ │ │ ├── b.txt │ │ │ └── nested │ │ │ ├── c.txt │ │ │ ├── d.txt │ │ │ └── double_nested │ │ │ └── e.txt │ │ └── thrift │ │ ├── DebugProtoTest.thrift │ │ ├── address_book.thrift │ │ └── test.thrift ├── thrift7 │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── twitter │ │ │ └── elephantbird │ │ │ └── thrift │ │ │ ├── AbstractThriftBinaryDeserializer.java │ │ │ └── AbstractThriftBinaryProtocol.java │ │ └── test │ │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── thrift │ │ └── TestThrift7BinaryProtocol.java └── thrift9 │ └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── thrift │ │ ├── AbstractThriftBinaryDeserializer.java │ │ └── AbstractThriftBinaryProtocol.java │ └── test │ └── java │ └── com │ 
└── twitter │ └── elephantbird │ └── thrift │ └── TestThrift9BinaryProtocol.java ├── crunch ├── Readme.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── crunch │ ├── CrunchElephantBirdExample.java │ ├── EBTypes.java │ ├── LzoProtobufSource.java │ ├── LzoProtobufSourceTarget.java │ ├── LzoProtobufTarget.java │ ├── LzoThriftSource.java │ ├── LzoThriftSourceTarget.java │ ├── LzoThriftTarget.java │ ├── ProtobufFileReaderFactory.java │ ├── ProtobufReadableData.java │ ├── ThriftFileReaderFactory.java │ └── ThriftReadableData.java ├── examples ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── examples │ │ ├── DeprecatedWrapperWordCount.java │ │ ├── LzoJsonWordCount.java │ │ ├── LzoWordCount.java │ │ ├── PhoneNumberCounter.java │ │ ├── ProtobufMRExample.java │ │ └── ThriftMRExample.java │ ├── pig │ ├── json_word_count.pig │ ├── nested_json_get_distinct_items_from_nested_array.pig │ ├── nested_json_get_top_level_property_values.pig │ ├── nested_json_get_values_count_for_property_inside_nested_array.pig │ ├── nested_json_pizza_sample_data.json │ ├── people_phone_number_count.pig │ └── people_phone_number_count_thrift.pig │ ├── protobuf │ ├── address_book.proto │ └── examples.proto │ └── thrift │ ├── address_book.thrift │ └── simple_age.thrift ├── hadoop-compat ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── util │ └── HadoopCompat.java ├── hive ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ ├── hive │ │ └── serde │ │ │ ├── LzoProtobufHiveSerde.java │ │ │ ├── ProtobufDeserializer.java │ │ │ ├── ProtobufStructObjectInspector.java │ │ │ └── ThriftSerDe.java │ │ └── mapred │ │ └── input │ │ └── HiveMultiInputFormat.java │ └── test │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── hive │ └── serde │ └── ProtobufDeserializerTest.java ├── lucene ├── pom.xml └── src │ ├── main │ └── java │ │ 
└── com │ │ └── twitter │ │ └── elephantbird │ │ ├── lucene │ │ └── HdfsMergeTool.java │ │ └── mapreduce │ │ ├── input │ │ ├── LuceneHdfsDirectory.java │ │ ├── LuceneIndexCollectAllRecordReader.java │ │ ├── LuceneIndexCountHitsRecordReader.java │ │ ├── LuceneIndexInputFormat.java │ │ └── LuceneIndexRecordReader.java │ │ └── output │ │ └── LuceneIndexOutputFormat.java │ └── test │ ├── java │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── mapreduce │ │ ├── LuceneIndexingIntegrationTest.java │ │ └── input │ │ ├── TestLuceneIndexInputFormat.java │ │ └── TestLuceneIndexRecordReader.java │ └── resources │ └── com │ └── twitter │ └── elephantbird │ └── mapreduce │ ├── input │ └── sample_indexes │ │ ├── index-1 │ │ └── index-1.txt │ │ ├── index-2 │ │ └── data.txt │ │ ├── more-indexes │ │ └── index-3 │ │ │ └── data.txt │ │ └── unrelated │ │ └── index-unrelated.txt │ ├── test_documents1.txt │ ├── test_documents2.txt │ └── test_documents3.txt ├── mahout ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── pig │ │ └── mahout │ │ └── VectorWritableConverter.java │ └── test │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── pig │ └── mahout │ ├── TestDenseVectorWritableConverter.java │ └── TestSequentialAccessSparseVectorWritableConverter.java ├── pig-lucene ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── pig │ │ ├── load │ │ └── LuceneIndexLoader.java │ │ └── store │ │ └── LuceneIndexStorage.java │ └── test │ ├── java │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── pig │ │ ├── PigLuceneIndexingIntegrationTest.java │ │ └── load │ │ └── TestLuceneIndexLoader.java │ └── resources │ └── com │ └── twitter │ └── elephantbird │ └── pig │ ├── index.pig │ ├── load │ └── queryfile.txt │ ├── search_file.pig │ └── search_queries.pig ├── pig ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ └── pig │ │ ├── load │ │ ├── 
FilterLoadFunc.java │ │ ├── HBaseLoader.java │ │ ├── JsonLoader.java │ │ ├── LocationAsTuple.java │ │ ├── LzoBaseLoadFunc.java │ │ ├── LzoBaseRegexLoader.java │ │ ├── LzoJsonLoader.java │ │ ├── LzoProtobufB64LinePigLoader.java │ │ ├── LzoProtobufBlockPigLoader.java │ │ ├── LzoRawBytesLoader.java │ │ ├── LzoRegexLoader.java │ │ ├── LzoTextLoader.java │ │ ├── LzoThriftB64LinePigLoader.java │ │ ├── LzoThriftBlockPigLoader.java │ │ ├── LzoTokenizedLoader.java │ │ ├── LzoW3CLogLoader.java │ │ ├── MultiFormatLoader.java │ │ ├── ProtobufPigLoader.java │ │ ├── SequenceFileLoader.java │ │ └── ThriftPigLoader.java │ │ ├── piggybank │ │ ├── BytesToThriftTuple.java │ │ ├── GenericInvoker.java │ │ ├── InvokeForDouble.java │ │ ├── InvokeForFloat.java │ │ ├── InvokeForInt.java │ │ ├── InvokeForLong.java │ │ ├── InvokeForString.java │ │ ├── Invoker.java │ │ ├── JsonStringToMap.java │ │ ├── ProtobufBytesToTuple.java │ │ └── ThriftBytesToTuple.java │ │ ├── store │ │ ├── BaseStoreFunc.java │ │ ├── Bz2PigStorage.java │ │ ├── LzoJsonStorage.java │ │ ├── LzoPigStorage.java │ │ ├── LzoProtobufB64LinePigStorage.java │ │ ├── LzoProtobufBlockPigStorage.java │ │ ├── LzoRawBytesStorage.java │ │ ├── LzoThriftB64LinePigStorage.java │ │ ├── LzoThriftBlockPigStorage.java │ │ ├── LzoTokenizedStorage.java │ │ └── SequenceFileStorage.java │ │ └── util │ │ ├── AbstractLazyTuple.java │ │ ├── AbstractWritableConverter.java │ │ ├── BytesWritableConverter.java │ │ ├── GenericWritableConverter.java │ │ ├── IntWritableConverter.java │ │ ├── LazyThriftWritableConverter.java │ │ ├── LoadFuncTupleIterator.java │ │ ├── LongWritableConverter.java │ │ ├── LzoBufferedPositionedInputStream.java │ │ ├── NullWritableConverter.java │ │ ├── PigCounterHelper.java │ │ ├── PigToProtobuf.java │ │ ├── PigToThrift.java │ │ ├── PigTokenHelper.java │ │ ├── PigUtil.java │ │ ├── ProjectedProtobufTupleFactory.java │ │ ├── ProjectedThriftTupleFactory.java │ │ ├── ProtobufToPig.java │ │ ├── ProtobufTuple.java │ │ ├── 
ProtobufWritableConverter.java │ │ ├── ResourceSchemaUtil.java │ │ ├── SequenceFileConfig.java │ │ ├── TextConverter.java │ │ ├── ThriftToPig.java │ │ ├── ThriftWritableConverter.java │ │ ├── WritableConverter.java │ │ ├── WritableLoadCaster.java │ │ └── WritableStoreCaster.java │ └── test │ ├── java │ └── com │ │ └── twitter │ │ └── elephantbird │ │ ├── pig │ │ ├── load │ │ │ ├── TestBinaryLoaderWithManySplits.java │ │ │ ├── TestErrorsInInput.java │ │ │ ├── TestJsonLoader.java │ │ │ ├── TestLocationAsTuple.java │ │ │ ├── TestLzoTextLoader.java │ │ │ ├── TestProtobufMultiFormatLoader.java │ │ │ └── TestThriftMultiFormatLoader.java │ │ ├── piggybank │ │ │ ├── Fixtures.java │ │ │ ├── TestInvoker.java │ │ │ ├── TestJsonStringToMap.java │ │ │ ├── TestPigToProto.java │ │ │ ├── TestProtoToPig.java │ │ │ └── TimeProtoConversions.java │ │ ├── store │ │ │ ├── FixedArgsConstructorIntWritableConverter.java │ │ │ ├── TestLzoRawBytesStorage.java │ │ │ ├── TestSequenceFileStorage.java │ │ │ └── VarArgsConstructorIntWritableConverter.java │ │ └── util │ │ │ ├── AbstractTestProtobufWritableConverter.java │ │ │ ├── AbstractTestThriftNameWritableConverter.java │ │ │ ├── AbstractTestThriftWritableConverter.java │ │ │ ├── AbstractTestWritableConverter.java │ │ │ ├── IntegrationTestIntWritableConverter.java │ │ │ ├── IntegrationTestLongWritableConverter.java │ │ │ ├── IntegrationTestTextConverter.java │ │ │ ├── PigTestUtil.java │ │ │ ├── TestLoadFuncTupleIterator.java │ │ │ ├── TestPigToProtobuf.java │ │ │ ├── TestPigToThrift.java │ │ │ ├── TestProtobufWritableConverter.java │ │ │ ├── TestThriftNameWritableConverter.java │ │ │ ├── TestThriftNameWritableConverterCustom.java │ │ │ ├── TestThriftToPig.java │ │ │ └── ThriftNameWritable.java │ │ └── util │ │ ├── TestProtobufs.java │ │ └── TestW3CLogParser.java │ ├── resources │ ├── W3CLogParser.field.txt │ ├── W3CLogParser.invalid.txt │ ├── W3CLogParser.valid.txt │ └── test-log4j.properties │ └── thrift │ └── map_keys.thrift ├── pom.xml ├── 
rcfile ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── twitter │ │ └── elephantbird │ │ ├── mapreduce │ │ ├── input │ │ │ ├── RCFileBaseInputFormat.java │ │ │ ├── RCFileProtobufInputFormat.java │ │ │ ├── RCFileProtobufTupleInputFormat.java │ │ │ ├── RCFileThriftInputFormat.java │ │ │ └── RCFileThriftTupleInputFormat.java │ │ └── output │ │ │ ├── RCFileOutputFormat.java │ │ │ ├── RCFileProtobufOutputFormat.java │ │ │ └── RCFileThriftOutputFormat.java │ │ ├── pig │ │ ├── load │ │ │ ├── RCFileProtobufPigLoader.java │ │ │ └── RCFileThriftPigLoader.java │ │ └── store │ │ │ ├── RCFilePigStorage.java │ │ │ ├── RCFileProtobufPigStorage.java │ │ │ └── RCFileThriftPigStorage.java │ │ └── util │ │ ├── ColumnarMetadata.java │ │ └── RCFileUtil.java │ └── test │ └── java │ └── com │ └── twitter │ └── elephantbird │ └── pig │ └── load │ ├── TestRCFilePigStorage.java │ ├── TestRCFileProtobufStorage.java │ └── TestRCFileThriftStorage.java ├── release.sh └── repo └── com └── twitter └── elephant-bird ├── 2.1.10 ├── elephant-bird-2.1.10.jar ├── elephant-bird-2.1.10.jar.md5 ├── elephant-bird-2.1.10.jar.sha1 ├── elephant-bird-2.1.10.pom ├── elephant-bird-2.1.10.pom.md5 └── elephant-bird-2.1.10.pom.sha1 ├── 2.1.11 ├── elephant-bird-2.1.11.jar ├── elephant-bird-2.1.11.jar.md5 ├── elephant-bird-2.1.11.jar.sha1 ├── elephant-bird-2.1.11.pom ├── elephant-bird-2.1.11.pom.md5 └── elephant-bird-2.1.11.pom.sha1 ├── 2.1.5 ├── elephant-bird-2.1.5.jar ├── elephant-bird-2.1.5.jar.md5 ├── elephant-bird-2.1.5.jar.sha1 ├── elephant-bird-2.1.5.pom ├── elephant-bird-2.1.5.pom.md5 └── elephant-bird-2.1.5.pom.sha1 ├── 2.1.6 ├── elephant-bird-2.1.6.jar ├── elephant-bird-2.1.6.jar.md5 ├── elephant-bird-2.1.6.jar.sha1 ├── elephant-bird-2.1.6.pom ├── elephant-bird-2.1.6.pom.md5 └── elephant-bird-2.1.6.pom.sha1 ├── 2.1.7 ├── elephant-bird-2.1.7.jar ├── elephant-bird-2.1.7.jar.md5 ├── elephant-bird-2.1.7.jar.sha1 ├── elephant-bird-2.1.7.pom ├── elephant-bird-2.1.7.pom.md5 └── 
elephant-bird-2.1.7.pom.sha1 ├── 2.1.8 ├── elephant-bird-2.1.8.jar ├── elephant-bird-2.1.8.jar.md5 ├── elephant-bird-2.1.8.jar.sha1 ├── elephant-bird-2.1.8.pom ├── elephant-bird-2.1.8.pom.md5 └── elephant-bird-2.1.8.pom.sha1 ├── 2.1.9 ├── elephant-bird-2.1.9.jar ├── elephant-bird-2.1.9.jar.md5 ├── elephant-bird-2.1.9.jar.sha1 ├── elephant-bird-2.1.9.pom ├── elephant-bird-2.1.9.pom.md5 └── elephant-bird-2.1.9.pom.sha1 ├── 2.2.0 ├── elephant-bird-2.2.0.jar ├── elephant-bird-2.2.0.jar.md5 ├── elephant-bird-2.2.0.jar.sha1 ├── elephant-bird-2.2.0.pom ├── elephant-bird-2.2.0.pom.md5 └── elephant-bird-2.2.0.pom.sha1 ├── 2.2.1 ├── elephant-bird-2.2.1.jar ├── elephant-bird-2.2.1.jar.md5 ├── elephant-bird-2.2.1.jar.sha1 ├── elephant-bird-2.2.1.pom ├── elephant-bird-2.2.1.pom.md5 └── elephant-bird-2.2.1.pom.sha1 ├── 2.2.2 ├── elephant-bird-2.2.2.jar ├── elephant-bird-2.2.2.jar.md5 ├── elephant-bird-2.2.2.jar.sha1 ├── elephant-bird-2.2.2.pom ├── elephant-bird-2.2.2.pom.md5 └── elephant-bird-2.2.2.pom.sha1 ├── 2.2.3 ├── elephant-bird-2.2.3.jar ├── elephant-bird-2.2.3.jar.md5 ├── elephant-bird-2.2.3.jar.sha1 ├── elephant-bird-2.2.3.pom ├── elephant-bird-2.2.3.pom.md5 └── elephant-bird-2.2.3.pom.sha1 ├── maven-metadata.xml ├── maven-metadata.xml.md5 └── maven-metadata.xml.sha1 /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | *.ipr 3 | *.iws 4 | *.tmproj 5 | *.swp 6 | *~ 7 | .DS_Store 8 | .classpath 9 | .generators 10 | .idea 11 | .project 12 | .settings 13 | **/target 14 | target 15 | # TODO fix this 16 | core/test.txt 17 | core/test2.txt 18 | core/test3.txt 19 | build 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | before_install: 4 | # An attempt to fix the buffer overflow in the pig tests 5 | # 
/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/libnet.so(Java_java_net_Inet4AddressImpl_getLocalHostName+0x190)[ 6 | # https://github.com/travis-ci/travis-ci/issues/1484 7 | - echo "127.0.0.1 " `hostname` | sudo tee /etc/hosts 8 | - sudo apt-get update -qq 9 | 10 | install: true 11 | matrix: 12 | include: 13 | - jdk: openjdk7 14 | env: THRIFT_TAG=0.7.0 15 | - jdk: openjdk7 16 | env: THRIFT_TAG=0.7.0 HADOOP_PROFILE=-Phadoop2 17 | - jdk: openjdk7 18 | env: THRIFT_TAG=0.10.0 19 | - jdk: openjdk7 20 | env: THRIFT_TAG=0.10.0 HADOOP_PROFILE=-Phadoop2 21 | 22 | script: "./release.sh -c travis" 23 | -------------------------------------------------------------------------------- /cascading-protobuf/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-cascading-protobuf 11 | Elephant Bird Cascading Protobuf 12 | Cascading Protobuf utilities. 13 | 14 | 15 | conjars.org 16 | https://conjars.org/repo 17 | 18 | 19 | 20 | 21 | com.google.protobuf 22 | protobuf-java 23 | ${protobuf.version} 24 | 25 | 26 | cascading 27 | cascading-hadoop 28 | 29 | ${cascading3.version} 30 | provided 31 | 32 | 33 | org.apache.hadoop 34 | hadoop-client 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufDeserializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading.protobuf; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.lang.reflect.Method; 6 | 7 | import com.google.protobuf.Message; 8 | 9 | import org.apache.hadoop.io.serializer.Deserializer; 10 | 11 | /** 12 | * Deserializes delimited protobufs from input stream 13 | * @author Ning Liang 14 | */ 15 | public class ProtobufDeserializer 
implements Deserializer { 16 | 17 | private Method parseMethod; 18 | private InputStream in; 19 | 20 | public ProtobufDeserializer(Class klass) { 21 | parseMethod = ProtobufReflectionUtil.parseMethodFor(klass); 22 | } 23 | 24 | @Override 25 | public void open(InputStream inStream) throws IOException { 26 | in = inStream; 27 | } 28 | 29 | @Override 30 | public Message deserialize(Message message) throws IOException { 31 | return ProtobufReflectionUtil.parseMessage(parseMethod, in); 32 | } 33 | 34 | @Override 35 | public void close() throws IOException { 36 | if (in != null) { 37 | in.close(); 38 | } 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufReflectionUtil.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading.protobuf; 2 | 3 | import java.io.InputStream; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.lang.reflect.Method; 6 | 7 | import com.google.protobuf.Message; 8 | 9 | /** 10 | * Utility methods for reflection based protobuf deserialization 11 | * @author Ning Liang 12 | */ 13 | public final class ProtobufReflectionUtil { 14 | private ProtobufReflectionUtil() { } 15 | 16 | /** 17 | * Parse the method for a message 18 | * @param klass the class containing the message 19 | * @return the parsed method 20 | */ 21 | public static Method parseMethodFor(Class klass) { 22 | try { 23 | return klass.getMethod("parseDelimitedFrom", new Class[] {InputStream.class }); 24 | } catch (SecurityException e) { 25 | throw new RuntimeException(e); 26 | } catch (NoSuchMethodException e) { 27 | throw new RuntimeException(e); 28 | } 29 | } 30 | 31 | /** 32 | * Parse the message in a given InputStream using scpecified Method 33 | * @param parseMethod the method used for parsing 34 | * @param in the input stream 35 | * @return the parsed 
message 36 | */ 37 | public static Message parseMessage(Method parseMethod, InputStream in) { 38 | try { 39 | return (Message) parseMethod.invoke(null, in); 40 | } catch (IllegalArgumentException e) { 41 | throw new RuntimeException(e); 42 | } catch (IllegalAccessException e) { 43 | throw new RuntimeException(e); 44 | } catch (InvocationTargetException e) { 45 | throw new RuntimeException(e); 46 | } 47 | } 48 | 49 | /** 50 | * Parse the message in a given Message container 51 | * @param klass the class containing the message 52 | * @param in the input stream 53 | * @return the parsed Message 54 | */ 55 | public static Message parseMessage(Class klass, InputStream in) { 56 | Method parseMethod = parseMethodFor(klass); 57 | return parseMessage(parseMethod, in); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerialization.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading.protobuf; 2 | 3 | import java.util.Comparator; 4 | 5 | import com.google.protobuf.Message; 6 | 7 | import org.apache.hadoop.io.serializer.Deserializer; 8 | import org.apache.hadoop.io.serializer.Serialization; 9 | import org.apache.hadoop.io.serializer.Serializer; 10 | 11 | import cascading.tuple.Comparison; 12 | 13 | /** 14 | * Serialization format class 15 | * @author Ning Liang 16 | */ 17 | public class ProtobufSerialization implements Serialization, Comparison { 18 | 19 | @Override 20 | public boolean accept(Class klass) { 21 | boolean accept = Message.class.isAssignableFrom(klass); 22 | return accept; 23 | } 24 | 25 | @Override 26 | public Deserializer getDeserializer(Class klass) { 27 | return new ProtobufDeserializer(klass); 28 | } 29 | 30 | @Override 31 | public Serializer getSerializer(Class klass) { 32 | return new ProtobufSerializer(); 33 | } 34 | 35 | @Override 36 | 
public Comparator getComparator(Class klass) { 37 | return new ProtobufComparator(); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading.protobuf; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | 6 | import com.google.protobuf.Message; 7 | 8 | import org.apache.hadoop.io.serializer.Serializer; 9 | 10 | /** 11 | * Serializes protobufs with delimiters 12 | * @author Ning Liang 13 | */ 14 | public class ProtobufSerializer implements Serializer { 15 | 16 | private OutputStream out; 17 | 18 | @Override 19 | public void open(OutputStream outStream) throws IOException { 20 | out = outStream; 21 | } 22 | 23 | @Override 24 | public void serialize(Message message) throws IOException { 25 | message.writeDelimitedTo(out); 26 | } 27 | 28 | @Override 29 | public void close() throws IOException { 30 | if (out != null) { 31 | out.close(); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /cascading2/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-cascading2 11 | Elephant Bird Cascading2 12 | Cascading utilities. 
13 | 14 | 15 | conjars.org 16 | https://conjars.org/repo 17 | 18 | 19 | 20 | 21 | com.twitter.elephantbird 22 | elephant-bird-core 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-client 27 | 28 | 29 | org.slf4j 30 | slf4j-simple 31 | 32 | 33 | cascading 34 | cascading-hadoop 35 | ${cascading2.version} 36 | provided 37 | 38 | 39 | junit 40 | junit 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/CombinedWritableSequenceFile.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import org.apache.hadoop.io.Writable; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.OutputCollector; 6 | import org.apache.hadoop.mapred.RecordReader; 7 | 8 | import cascading.flow.FlowProcess; 9 | import cascading.scheme.hadoop.WritableSequenceFile; 10 | import cascading.tap.Tap; 11 | import cascading.tuple.Fields; 12 | 13 | public class CombinedWritableSequenceFile extends WritableSequenceFile { 14 | public CombinedWritableSequenceFile(Fields fields, Class valueType) { 15 | super(fields, valueType); 16 | } 17 | 18 | public CombinedWritableSequenceFile(Fields fields, Class keyType, Class valueType) { 19 | super(fields, keyType, valueType); 20 | } 21 | 22 | @Override 23 | public void sourceConfInit(FlowProcess flowProcess, Tap tap, JobConf conf) { 24 | super.sourceConfInit(flowProcess, tap, conf); 25 | 26 | CombinedSequenceFile.sourceConfInit(conf); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoProtobufB64LineScheme.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import com.google.protobuf.Message; 4 | 5 | import 
org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * Scheme for Protobuf B64 line encoded files. 12 | * @deprecated please use {@link LzoProtobufScheme} 13 | * @author Argyris Zymnis 14 | */ 15 | @Deprecated 16 | public class LzoProtobufB64LineScheme extends 17 | LzoProtobufScheme { 18 | private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufB64LineScheme.class); 19 | public LzoProtobufB64LineScheme(Class protoClass) { 20 | super(protoClass); 21 | LOG.warn("LzoProtobufB64LineScheme is deprecated, please use LzoProtobufScheme"); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoProtobufBlockScheme.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import com.google.protobuf.Message; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * Scheme for Protobuf block encoded files. 
12 | * @deprecated please use {@link LzoProtobufScheme} 13 | * @author Argyris Zymnis 14 | */ 15 | @Deprecated 16 | public class LzoProtobufBlockScheme extends 17 | LzoProtobufScheme { 18 | private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufBlockScheme.class); 19 | public LzoProtobufBlockScheme(Class protoClass) { 20 | super(protoClass); 21 | LOG.warn("LzoProtobufBlockScheme is deprecated, please use LzoProtobufScheme"); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoProtobufScheme.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.OutputCollector; 6 | import org.apache.hadoop.mapred.RecordReader; 7 | 8 | import com.google.protobuf.Message; 9 | 10 | import com.twitter.elephantbird.mapred.output.DeprecatedOutputFormatWrapper; 11 | import com.twitter.elephantbird.mapreduce.input.MultiInputFormat; 12 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 13 | import com.twitter.elephantbird.mapreduce.output.LzoProtobufBlockOutputFormat; 14 | import com.twitter.elephantbird.util.Protobufs; 15 | import com.twitter.elephantbird.util.TypeRef; 16 | 17 | import cascading.flow.FlowProcess; 18 | import cascading.tap.Tap; 19 | 20 | /** 21 | * Scheme for Protobuf lzo compressed files. 
22 | * 23 | * @author Avi Bryant, Ning Liang 24 | */ 25 | public class LzoProtobufScheme extends 26 | LzoBinaryScheme> { 27 | 28 | private static final long serialVersionUID = -5011096855302946105L; 29 | private Class protoClass; 30 | 31 | public LzoProtobufScheme(Class protoClass) { 32 | this.protoClass = protoClass; 33 | } 34 | 35 | protected ProtobufWritable prepareBinaryWritable() { 36 | TypeRef typeRef = (TypeRef) Protobufs.getTypeRef(protoClass.getName()); 37 | return new ProtobufWritable(typeRef); 38 | } 39 | 40 | @Override 41 | public void sinkConfInit(FlowProcess hfp, Tap tap, JobConf conf) { 42 | LzoProtobufBlockOutputFormat.setClassConf(protoClass, conf); 43 | DeprecatedOutputFormatWrapper.setOutputFormat(LzoProtobufBlockOutputFormat.class, conf); 44 | } 45 | 46 | @Override 47 | public void sourceConfInit(FlowProcess hfp, Tap tap, JobConf conf) { 48 | MultiInputFormat.setClassConf(protoClass, conf); 49 | DelegateCombineFileInputFormat.setDelegateInputFormat(conf, MultiInputFormat.class); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoTextLine.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import com.twitter.elephantbird.mapreduce.input.LzoTextInputFormat; 4 | import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.RecordReader; 8 | 9 | import com.twitter.elephantbird.mapred.output.DeprecatedLzoTextOutputFormat; 10 | 11 | import cascading.flow.FlowProcess; 12 | import cascading.scheme.hadoop.TextLine; 13 | import cascading.tap.Tap; 14 | import cascading.tuple.Fields; 15 | 16 | /** 17 | * Scheme for LZO encoded text files. 
18 | * 19 | * @author Ning Liang 20 | */ 21 | public class LzoTextLine extends TextLine { 22 | 23 | public LzoTextLine() { 24 | super(); 25 | } 26 | 27 | public LzoTextLine(int numSinkParts) { 28 | super(numSinkParts); 29 | } 30 | 31 | public LzoTextLine(Fields sourceFields, Fields sinkFields) { 32 | super(sourceFields, sinkFields); 33 | } 34 | 35 | public LzoTextLine(Fields sourceFields, Fields sinkFields, int numSinkParts) { 36 | super(sourceFields, sinkFields, numSinkParts); 37 | } 38 | 39 | public LzoTextLine(Fields sourceFields) { 40 | super(sourceFields); 41 | } 42 | 43 | public LzoTextLine(Fields sourceFields, int numSinkParts) { 44 | super(sourceFields, numSinkParts); 45 | } 46 | 47 | @Override 48 | public void sourceConfInit(FlowProcess flowProcess, Tap tap, JobConf conf ) { 49 | DelegateCombineFileInputFormat.setDelegateInputFormat(conf, LzoTextInputFormat.class); 50 | } 51 | 52 | @Override 53 | public void sinkConfInit(FlowProcess flowProcess, Tap tap, JobConf conf ) { 54 | conf.setOutputFormat(DeprecatedLzoTextOutputFormat.class); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoThriftB64LineScheme.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * Scheme for Thrift B64 encoded files. 
12 | * @deprecated please use {@link LzoThriftScheme} 13 | * @author Argyris Zymnis 14 | */ 15 | @Deprecated 16 | public class LzoThriftB64LineScheme> extends 17 | LzoThriftScheme { 18 | private static final Logger LOG = LoggerFactory.getLogger(LzoThriftB64LineScheme.class); 19 | public LzoThriftB64LineScheme(Class thriftClass) { 20 | super(thriftClass); 21 | LOG.warn("LzoThriftB64LineScheme is deprecated, please use LzoThriftScheme"); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /cascading2/src/main/java/com/twitter/elephantbird/cascading2/scheme/LzoThriftScheme.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.OutputCollector; 6 | import org.apache.hadoop.mapred.RecordReader; 7 | 8 | import com.twitter.elephantbird.mapred.output.DeprecatedOutputFormatWrapper; 9 | import com.twitter.elephantbird.mapreduce.input.MultiInputFormat; 10 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 11 | import com.twitter.elephantbird.mapreduce.output.LzoThriftBlockOutputFormat; 12 | import com.twitter.elephantbird.util.ThriftUtils; 13 | import com.twitter.elephantbird.util.TypeRef; 14 | 15 | import cascading.flow.FlowProcess; 16 | import cascading.tap.Tap; 17 | 18 | import org.apache.thrift.TBase; 19 | 20 | /** 21 | * Scheme for Thrift lzo compressed files. 
22 | * 23 | * @author Argyris Zymnis 24 | */ 25 | public class LzoThriftScheme> extends 26 | LzoBinaryScheme> { 27 | 28 | private static final long serialVersionUID = -5011096855302946109L; 29 | private Class thriftClass; 30 | 31 | public LzoThriftScheme(Class thriftClass) { 32 | this.thriftClass = thriftClass; 33 | } 34 | 35 | @Override 36 | public void sinkConfInit(FlowProcess hfp, Tap tap, JobConf conf) { 37 | LzoThriftBlockOutputFormat.setClassConf(thriftClass, conf); 38 | DeprecatedOutputFormatWrapper.setOutputFormat(LzoThriftBlockOutputFormat.class, conf); 39 | } 40 | 41 | protected ThriftWritable prepareBinaryWritable() { 42 | TypeRef typeRef = (TypeRef) ThriftUtils.getTypeRef(thriftClass); 43 | return new ThriftWritable(typeRef); 44 | } 45 | 46 | @Override 47 | public void sourceConfInit(FlowProcess hfp, Tap tap, JobConf conf) { 48 | MultiInputFormat.setClassConf(thriftClass, conf); 49 | DelegateCombineFileInputFormat.setDelegateInputFormat(conf, MultiInputFormat.class); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /cascading2/src/test/java/com/twitter/elephantbird/cascading2/scheme/TestCombinedSequenceFile.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading2.scheme; 2 | 3 | import org.apache.hadoop.mapred.JobConf; 4 | import org.apache.hadoop.mapred.OutputCollector; 5 | import org.apache.hadoop.mapred.RecordReader; 6 | import org.junit.Test; 7 | 8 | import com.twitter.elephantbird.mapred.input.DeprecatedInputFormatWrapper; 9 | import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper; 10 | import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat; 11 | 12 | import cascading.flow.FlowProcess; 13 | import cascading.flow.hadoop.HadoopFlowProcess; 14 | import cascading.tap.Tap; 15 | import cascading.tap.hadoop.util.TempHfs; 16 | import cascading.tuple.Fields; 17 | import static 
org.junit.Assert.assertEquals; 18 | 19 | public class TestCombinedSequenceFile { 20 | 21 | @Test 22 | public void testHadoopConf() { 23 | CombinedSequenceFile csfScheme = new CombinedSequenceFile(Fields.ALL); 24 | JobConf conf = new JobConf(); 25 | FlowProcess fp = new HadoopFlowProcess(); 26 | Tap tap = 27 | new TempHfs(conf, "test", CombinedSequenceFile.class, false); 28 | 29 | csfScheme.sourceConfInit(fp, tap, conf); 30 | 31 | assertEquals( 32 | "MapReduceInputFormatWrapper shold wrap mapred.SequenceFileinputFormat", 33 | "org.apache.hadoop.mapred.SequenceFileInputFormat", 34 | conf.get(MapReduceInputFormatWrapper.CLASS_CONF_KEY) 35 | ); 36 | assertEquals( 37 | "Delegate combiner should wrap MapReduceInputFormatWrapper", 38 | "com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper", 39 | conf.get(DelegateCombineFileInputFormat.COMBINED_INPUT_FORMAT_DELEGATE) 40 | ); 41 | assertEquals( 42 | "DeprecatedInputFormatWrapper should wrap Delegate combiner", 43 | "com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat", 44 | conf.get(DeprecatedInputFormatWrapper.CLASS_CONF_KEY) 45 | ); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /cascading3/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-cascading3 11 | Elephant Bird Cascading3 12 | Cascading utilities. 
13 | 14 | 15 | conjars.org 16 | https://conjars.org/repo 17 | 18 | 19 | 20 | 21 | com.twitter.elephantbird 22 | elephant-bird-core 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-client 27 | 28 | 29 | org.slf4j 30 | slf4j-simple 31 | 32 | 33 | cascading 34 | cascading-hadoop 35 | ${cascading3.version} 36 | provided 37 | 38 | 39 | junit 40 | junit 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /cascading3/src/test/java/com/twitter/elephantbird/cascading3/scheme/TestCombinedSequenceFile.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.cascading3.scheme; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.mapred.JobConf; 5 | import org.apache.hadoop.mapred.OutputCollector; 6 | import org.apache.hadoop.mapred.RecordReader; 7 | import org.junit.Test; 8 | 9 | import com.twitter.elephantbird.mapred.input.DeprecatedInputFormatWrapper; 10 | import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper; 11 | import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat; 12 | 13 | import cascading.flow.FlowProcess; 14 | import cascading.flow.hadoop.HadoopFlowProcess; 15 | import cascading.tap.Tap; 16 | import cascading.tap.hadoop.util.TempHfs; 17 | import cascading.tuple.Fields; 18 | import static org.junit.Assert.assertEquals; 19 | 20 | public class TestCombinedSequenceFile { 21 | 22 | @Test 23 | public void testHadoopConf() { 24 | CombinedSequenceFile csfScheme = new CombinedSequenceFile(Fields.ALL); 25 | JobConf conf = new JobConf(); 26 | FlowProcess fp = new HadoopFlowProcess(); 27 | Tap tap = 28 | new TempHfs(conf, "test", CombinedSequenceFile.class, false); 29 | 30 | csfScheme.sourceConfInit(fp, tap, conf); 31 | 32 | assertEquals( 33 | "MapReduceInputFormatWrapper shold wrap mapred.SequenceFileinputFormat", 34 | "org.apache.hadoop.mapred.SequenceFileInputFormat", 35 | 
conf.get(MapReduceInputFormatWrapper.CLASS_CONF_KEY) 36 | ); 37 | assertEquals( 38 | "Delegate combiner should wrap MapReduceInputFormatWrapper", 39 | "com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper", 40 | conf.get(DelegateCombineFileInputFormat.COMBINED_INPUT_FORMAT_DELEGATE) 41 | ); 42 | assertEquals( 43 | "DeprecatedInputFormatWrapper should wrap Delegate combiner", 44 | "com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat", 45 | conf.get(DeprecatedInputFormatWrapper.CLASS_CONF_KEY) 46 | ); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/input/DeprecatedInputFormatValueCopier.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.input; 2 | 3 | public interface DeprecatedInputFormatValueCopier { 4 | public abstract void copyValue(T oldValue, T newValue); 5 | } 6 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/input/DeprecatedLzoJsonInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.input; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.MapWritable; 5 | 6 | import com.twitter.elephantbird.mapreduce.input.LzoJsonInputFormat; 7 | 8 | /** 9 | * mapred version of {@link LzoJsonInputFormat}. 
10 | */ 11 | public class DeprecatedLzoJsonInputFormat extends DeprecatedFileInputFormatWrapper{ 12 | public DeprecatedLzoJsonInputFormat() { 13 | super(new LzoJsonInputFormat()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/input/DeprecatedLzoTextInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.input; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | 6 | import com.twitter.elephantbird.mapreduce.input.LzoTextInputFormat; 7 | 8 | /** 9 | * mapred version of {@link LzoTextInputFormat}. 10 | */ 11 | public class DeprecatedLzoTextInputFormat extends DeprecatedFileInputFormatWrapper { 12 | public DeprecatedLzoTextInputFormat() { 13 | super(new LzoTextInputFormat()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/input/DeprecatedMultiInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.input; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.io.LongWritable; 5 | 6 | import com.twitter.elephantbird.mapreduce.input.MultiInputFormat; 7 | import com.twitter.elephantbird.mapreduce.io.BinaryWritable; 8 | import com.twitter.elephantbird.util.TypeRef; 9 | 10 | /** 11 | * mapred version of {@link MultiInputFormat} 12 | */ 13 | public class DeprecatedMultiInputFormat 14 | extends DeprecatedFileInputFormatWrapper>{ 15 | 16 | public DeprecatedMultiInputFormat() { 17 | super(new MultiInputFormat()); 18 | } 19 | 20 | public DeprecatedMultiInputFormat(TypeRef typeRef) { 21 | super(new MultiInputFormat(typeRef)); 22 | } 23 | 24 | /** 25 | * Stores supplied class name in configuration. 
This configuration is 26 | * read on the remote tasks to initialize the input format correctly. 27 | */ 28 | public static void setClassConf(Class clazz, Configuration conf) { 29 | MultiInputFormat.setClassConf(clazz, conf); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/input/DeprecatedRawMultiInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.input; 2 | 3 | import com.twitter.elephantbird.mapreduce.input.RawMultiInputFormat; 4 | 5 | /** 6 | * mapred version of {@link RawMultiInputFormat} 7 | */ 8 | public class DeprecatedRawMultiInputFormat extends DeprecatedFileInputFormatWrapper { 9 | 10 | @SuppressWarnings("unchecked") 11 | public DeprecatedRawMultiInputFormat() { 12 | super(new RawMultiInputFormat()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/output/DeprecatedFileOutputFormatWrapper.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.output; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.RecordWriter; 8 | import org.apache.hadoop.mapred.Reporter; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | import org.apache.hadoop.util.Progressable; 11 | 12 | /** 13 | * The wrapper enables an {@link FileOutputFormat} written for new 14 | * mapreduce interface to be used in contexts where 15 | * a {@link org.apache.hadoop.mapred.FileOutputFormat} old mapred 16 | * interface is required.

17 | * 18 | * Note that this does not have a no args constructed, so it cannot currently 19 | * be used as an output format. Instead, it must be extended, such as in 20 | * {@link DeprecatedLzoTextOutputFormat}. 21 | * 22 | * @see DeprecatedOutputFormatWrapper 23 | * 24 | * @author Jonathan Coveney 25 | */ 26 | public class DeprecatedFileOutputFormatWrapper 27 | extends org.apache.hadoop.mapred.FileOutputFormat { 28 | 29 | private DeprecatedOutputFormatWrapper wrapped; 30 | 31 | public DeprecatedFileOutputFormatWrapper(FileOutputFormat wrapped) { 32 | this.wrapped = new DeprecatedOutputFormatWrapper(wrapped); 33 | } 34 | 35 | @Override 36 | public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException { 37 | wrapped.checkOutputSpecs(ignored, job); 38 | } 39 | 40 | @Override 41 | public RecordWriter getRecordWriter(FileSystem ignored, JobConf job, 42 | String name, Progressable progress) throws IOException { 43 | return wrapped.getRecordWriter(ignored, job, name, progress); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapred/output/DeprecatedLzoTextOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapred.output; 2 | 3 | import com.twitter.elephantbird.mapreduce.output.LzoTextOutputFormat; 4 | 5 | /** 6 | * mapred version of {@link LzoTextOutputFormat}. 
7 | */ 8 | public class DeprecatedLzoTextOutputFormat 9 | extends DeprecatedFileOutputFormatWrapper { 10 | 11 | public DeprecatedLzoTextOutputFormat() { 12 | super(new LzoTextOutputFormat()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/BinaryConverterProvider.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter; 4 | import org.apache.hadoop.conf.Configuration; 5 | 6 | 7 | /** 8 | * A simple interface to serialize and deserialize objects 9 | */ 10 | public interface BinaryConverterProvider { 11 | BinaryConverter getConverter(Configuration conf); 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/FilterRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.InputSplit; 6 | import org.apache.hadoop.mapreduce.RecordReader; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | 9 | /** 10 | * A RecordReader equivalent of FilterInputStream 11 | */ 12 | public class FilterRecordReader extends RecordReader { 13 | 14 | protected RecordReader reader; 15 | 16 | public FilterRecordReader(RecordReader reader) { 17 | this.reader = reader; 18 | 19 | } 20 | 21 | @Override 22 | public void close() throws IOException { 23 | reader.close(); 24 | } 25 | 26 | @Override 27 | public K getCurrentKey() throws IOException, InterruptedException { 28 | return reader.getCurrentKey(); 29 | } 30 | 31 | @Override 32 | public V getCurrentValue() throws IOException, InterruptedException { 33 | return reader.getCurrentValue(); 34 | } 35 | 36 | 
@Override 37 | public float getProgress() throws IOException, InterruptedException { 38 | return reader.getProgress(); 39 | } 40 | 41 | @Override 42 | public void initialize(InputSplit split, TaskAttemptContext context) 43 | throws IOException, InterruptedException { 44 | reader.initialize(split, context); 45 | } 46 | 47 | @Override 48 | public boolean nextKeyValue() throws IOException, InterruptedException { 49 | return reader.nextKeyValue(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/IntegerListInputSplit.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.InputSplit; 9 | 10 | import org.apache.commons.logging.Log; 11 | import org.apache.commons.logging.LogFactory; 12 | 13 | public class IntegerListInputSplit extends InputSplit implements Writable { 14 | private static final Log LOG = LogFactory.getLog(IntegerListInputSplit.class); 15 | 16 | protected long min; 17 | protected long max; 18 | 19 | public IntegerListInputSplit() {} 20 | 21 | public IntegerListInputSplit(long min, long max) { 22 | if (min > max) { 23 | throw new IllegalArgumentException("Attempt to create IntegerListInputSplit with min > max, min = " + 24 | min + " and max = " + max); 25 | } 26 | LOG.info("Creating IntegerListInputSplit with InputSplit [" + min + ", " + max + "]"); 27 | this.min = min; 28 | this.max = max; 29 | } 30 | 31 | public long getMin() { 32 | return min; 33 | } 34 | 35 | public long getMax() { 36 | return max; 37 | } 38 | 39 | @Override 40 | public long getLength() throws IOException, InterruptedException { 41 | return max - min + 1; 42 | } 43 | 44 | @Override 45 | public String[] 
getLocations() throws IOException, InterruptedException { 46 | return new String[] {}; 47 | } 48 | 49 | @Override 50 | public void write(DataOutput dataOutput) throws IOException { 51 | dataOutput.writeLong(min); 52 | dataOutput.writeLong(max); 53 | } 54 | 55 | @Override 56 | public void readFields(DataInput dataInput) throws IOException { 57 | min = dataInput.readLong(); 58 | max = dataInput.readLong(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoGenericB64LineRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.apache.thrift.TBase; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import com.twitter.elephantbird.mapreduce.io.GenericWritable; 8 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | public class LzoGenericB64LineRecordReader extends LzoBinaryB64LineRecordReader> { 12 | public LzoGenericB64LineRecordReader(TypeRef typeRef, BinaryConverter converter) { 13 | super(typeRef, new GenericWritable(converter), converter); 14 | } 15 | } 16 | 17 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoGenericBlockRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.twitter.elephantbird.mapreduce.io.GenericWritable; 4 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter; 5 | import com.twitter.elephantbird.util.TypeRef; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.io.IOException; 11 | import java.io.InputStream; 12 | 13 | import com.google.protobuf.ByteString; 14 | 15 | 
import com.twitter.elephantbird.mapreduce.input.MapredInputFormatCompatible; 16 | import com.twitter.elephantbird.mapreduce.io.BinaryBlockReader; 17 | import com.twitter.elephantbird.util.TypeRef; 18 | 19 | import org.slf4j.Logger; 20 | 21 | /** 22 | * Generic reader for LZO-encoded protobuf blocks. Uses the supplied BinaryConverter for deserialization. 23 | */ 24 | public class LzoGenericBlockRecordReader 25 | extends LzoBinaryBlockRecordReader> { 26 | 27 | public LzoGenericBlockRecordReader(TypeRef typeRef, BinaryConverter binaryConverter) { 28 | super(typeRef, 29 | new BinaryBlockReader(null, binaryConverter), 30 | new GenericWritable(binaryConverter)); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoGenericProtobufBlockInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.BytesWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | 11 | /** 12 | * Similar to the LzoProtobufBlockInputFormat class, but instead of being specific to 13 | * a given protocol buffer (via codegen of a LzoProtobufBlockInputFormat-derived class), 14 | * this InputFormat returns pairs and leaves the parsing of 15 | * each individual protobuf into object form to the user. 16 | * 17 | * This has two advantages. One, if your data is composed of multiple types of protobufs which 18 | * you know via some other method, you need to use this to decide the type at runtime. 
More 19 |  * common are situations where you just want an aggregate over all protobufs (such as a count) 20 |  * without caring about the individual protobuf fields, in which case this class is faster 21 |  * because you don't pay for protobuf object deserialization. 22 |  */ 23 | 24 | public class LzoGenericProtobufBlockInputFormat extends LzoInputFormat { 25 | 26 |   public LzoGenericProtobufBlockInputFormat() { 27 |   } 28 | 29 |   @Override 30 |   public RecordReader createRecordReader(InputSplit split, 31 |       TaskAttemptContext taskAttempt) throws IOException, InterruptedException { 32 | 33 |     return new LzoGenericProtobufBlockRecordReader(); 34 |   } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoJsonInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.MapWritable; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | 11 | /** 12 |  * An inputformat for LZO-compressed JSON files. Returns 13 |  * pairs, where the json object in java is essentially a Map. 14 |  * 15 |  * WARNING: The RecordReader-derived class used here may not handle multi-line json 16 |  * well, if at all. Please improve this. 
17 | */ 18 | public class LzoJsonInputFormat extends LzoInputFormat { 19 | 20 | @Override 21 | public RecordReader createRecordReader(InputSplit split, 22 | TaskAttemptContext taskAttempt) throws IOException, InterruptedException { 23 | 24 | return new LzoJsonRecordReader(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoProtobufB64LineInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.google.protobuf.Message; 4 | import com.twitter.elephantbird.util.TypeRef; 5 | 6 | /** 7 | * This is the base class for all base64 encoded, line-oriented protocol buffer based input formats. 8 | * Data is expected to be one base64 encoded serialized protocol buffer per line. 9 | *

10 | * 11 | * A small fraction of bad records are tolerated. See {@link LzoRecordReader} 12 | * for more information on error handling. 13 | * 14 | * @Deprecated use {@link MultiInputFormat} 15 | */ 16 | public class LzoProtobufB64LineInputFormat extends MultiInputFormat { 17 | 18 | public LzoProtobufB64LineInputFormat() { 19 | } 20 | 21 | public LzoProtobufB64LineInputFormat(TypeRef typeRef) { 22 | super(typeRef); 23 | } 24 | 25 | public static LzoProtobufB64LineInputFormat newInstance(TypeRef typeRef) { 26 | return new LzoProtobufB64LineInputFormat(typeRef); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoProtobufB64LineRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.google.protobuf.Message; 7 | import com.twitter.elephantbird.mapreduce.io.ProtobufConverter; 8 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | /** 12 | * Reads line from an lzo compressed text file, base64 decodes it, and then 13 | * deserializes that into the templatized protobuf object. 14 | * Returns pairs. 
15 | */ 16 | public class LzoProtobufB64LineRecordReader extends LzoBinaryB64LineRecordReader> { 17 | private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufB64LineRecordReader.class); 18 | 19 | public LzoProtobufB64LineRecordReader(TypeRef typeRef) { 20 | super(typeRef, new ProtobufWritable(typeRef), ProtobufConverter.newInstance(typeRef)); 21 | LOG.info("LzoProtobufB64LineRecordReader, type args are " + typeRef.getRawClass()); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoProtobufBlockInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.google.protobuf.Message; 4 | import com.twitter.elephantbird.util.TypeRef; 5 | 6 | /** 7 | * This is the base class for all blocked protocol buffer based input formats. That is, if you use 8 | * the ProtobufBlockWriter to write your data, this input format can read it. 9 | *

10 | * 11 | * A small fraction of bad records are tolerated. See {@link LzoRecordReader} 12 | * for more information on error handling. 13 | * 14 | * @Deprecated use {@link MultiInputFormat} 15 | */ 16 | public class LzoProtobufBlockInputFormat extends MultiInputFormat { 17 | 18 | public LzoProtobufBlockInputFormat() { 19 | } 20 | 21 | public LzoProtobufBlockInputFormat(TypeRef typeRef) { 22 | super(typeRef); 23 | } 24 | 25 | public static LzoProtobufBlockInputFormat newInstance(TypeRef typeRef) { 26 | return new LzoProtobufBlockInputFormat(typeRef); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoProtobufBlockRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import com.google.protobuf.Message; 7 | import com.twitter.elephantbird.mapreduce.io.ProtobufBlockReader; 8 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | /** 12 | * A reader for LZO-encoded protobuf blocks, generally written by 13 | * a ProtobufBlockWriter or similar. Returns pairs. 
14 | */ 15 | public class LzoProtobufBlockRecordReader extends LzoBinaryBlockRecordReader> { 16 | private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufBlockRecordReader.class); 17 | 18 | public LzoProtobufBlockRecordReader(TypeRef typeRef) { 19 | // input stream for the reader will be set by LzoBinaryBlockRecordReader 20 | super(typeRef, new ProtobufBlockReader(null, typeRef), new ProtobufWritable(typeRef)); 21 | LOG.info("LzoProtobufBlockRecordReader, type args are " + typeRef.getRawClass()); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoTextInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | 11 | /** 12 | * A copy of the TextInputFormat class for use with LZO-encoded data. Should be 13 | * identical to TextInputFormat in use. 
14 | */ 15 | 16 | public class LzoTextInputFormat extends LzoInputFormat { 17 | 18 | @Override 19 | public RecordReader createRecordReader(InputSplit split, 20 | TaskAttemptContext taskAttempt) throws IOException, InterruptedException { 21 | 22 | return new LzoLineRecordReader(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoThriftB64LineInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | import com.twitter.elephantbird.util.TypeRef; 6 | 7 | /** 8 | * Reads line from an lzo compressed text file, base64 decodes it, and then 9 | * deserializes that into the Thrift object. 10 | * Returns pairs.

11 | * 12 | * A small fraction of bad records are tolerated. See {@link LzoRecordReader} 13 | * for more information on error handling. 14 | * 15 | * @Deprecated use {@link MultiInputFormat} 16 | */ 17 | public class LzoThriftB64LineInputFormat> extends MultiInputFormat { 18 | 19 | public LzoThriftB64LineInputFormat() {} 20 | 21 | public LzoThriftB64LineInputFormat(TypeRef typeRef) { 22 | super(typeRef); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoThriftB64LineRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.apache.thrift.TBase; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 8 | import com.twitter.elephantbird.mapreduce.io.ThriftConverter; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | public class LzoThriftB64LineRecordReader> extends LzoBinaryB64LineRecordReader> { 12 | private static final Logger LOG = LoggerFactory.getLogger(LzoThriftB64LineRecordReader.class); 13 | 14 | public LzoThriftB64LineRecordReader(TypeRef typeRef) { 15 | super(typeRef, new ThriftWritable(typeRef), new ThriftConverter(typeRef)); 16 | LOG.info("record type is " + typeRef.getRawClass()); 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoThriftBlockInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.twitter.elephantbird.util.TypeRef; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.thrift.TBase; 7 | 8 | /** 9 | * Reads Thrift objects written in blocks using LzoThriftBlockOutputFormat 10 | *

11 | * 12 | * A small fraction of bad records are tolerated. See {@link LzoRecordReader} 13 | * for more information on error handling. 14 | */ 15 | public class LzoThriftBlockInputFormat> extends MultiInputFormat { 16 | 17 | public LzoThriftBlockInputFormat() {} 18 | 19 | public LzoThriftBlockInputFormat(TypeRef typeRef) { 20 | super(typeRef); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoThriftBlockRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import com.twitter.elephantbird.mapreduce.io.ThriftBlockReader; 4 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 5 | import com.twitter.elephantbird.util.TypeRef; 6 | 7 | import org.apache.thrift.TBase; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | /** 12 | * A reader for LZO-encoded protobuf blocks, generally written by 13 | * a ProtobufBlockWriter or similar. Returns pairs. 
14 |  */ 15 | public class LzoThriftBlockRecordReader> extends LzoBinaryBlockRecordReader> { 16 |   private static final Logger LOG = LoggerFactory.getLogger(LzoThriftBlockRecordReader.class); 17 | 18 |   public LzoThriftBlockRecordReader(TypeRef typeRef) { 19 |     // input stream for the reader will be set by LzoBinaryBlockRecordReader 20 |     super(typeRef, new ThriftBlockReader(null, typeRef), new ThriftWritable(typeRef)); 21 |     LOG.info("LzoThriftBlockRecordReader, type args are " + typeRef.getRawClass()); 22 |   } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/MapredInputFormatCompatible.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | /** 4 |  * A {@link org.apache.hadoop.mapred.RecordReader} should implement 5 |  * MapredInputFormatCompatible if it intends to be compatible 6 |  * with {@link com.twitter.elephantbird.mapred.input.DeprecatedInputFormatWrapper} 7 |  * and {@link org.apache.hadoop.mapred.lib.CombinedFileInputFormat} 8 |  * 9 |  * DeprecatedInputFormatWrapper enables you to use a mapreduce 10 |  * {@link org.apache.hadoop.mapreduce.InputFormat} in contexts 11 |  * where the old mapred interface is required. 12 |  * 13 |  * RecordReaders written for the deprecated mapred interface reuse 14 |  * the key and value objects. This is not a requirement for 15 |  * the RecordReaders written for the newer mapreduce interface 16 |  * 17 |  * This interface allows DeprecatedInputFormatWrapper to 18 |  * manually set key and value on the RecordReader to satisfy 19 |  * the old mapred interface. 20 |  */ 21 | public interface MapredInputFormatCompatible { 22 |   /** 23 |    * Set the RecordReader's existing key and value objects 24 |    * to be equal to the key and value objects passed in. 
25 | * 26 | * When implemented, DeprecatedInputFormatWrapper calls 27 | * this before every call to nextKeyValue(). 28 | */ 29 | public void setKeyValue(K key, V value); 30 | } 31 | 32 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/RawMultiInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.BytesWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Writable; 8 | import org.apache.hadoop.mapreduce.InputSplit; 9 | import org.apache.hadoop.mapreduce.RecordReader; 10 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 11 | 12 | import com.twitter.elephantbird.mapreduce.io.RawBytesWritable; 13 | import com.twitter.elephantbird.util.TypeRef; 14 | 15 | /** 16 | * A {@link MultiInputFormat} that returns records as uninterpreted 17 | * {@link BytesWritable}. Converts {@link RawBytesWritable} 18 | * returned by {@link MultiInputFormat} to a BytesWritable.

19 | * 20 | * Use MultiInputFormat if RawBytesWritable is required or suffices. 21 | */ 22 | @SuppressWarnings("rawtypes") 23 | public class RawMultiInputFormat extends MultiInputFormat { 24 | 25 | @SuppressWarnings("unchecked") 26 | public RawMultiInputFormat() { 27 | super(new TypeRef(byte[].class){}); 28 | } 29 | 30 | @SuppressWarnings("unchecked") 31 | @Override 32 | public RecordReader createRecordReader(InputSplit split, 33 | TaskAttemptContext taskAttempt) throws IOException, InterruptedException { 34 | // use FilterRecord Reader to convert RawBytesWritable to BytesWritable. 35 | 36 | return new FilterRecordReader( 37 | super.createRecordReader(split, taskAttempt)) { 38 | 39 | // extend BytesWritable to avoid a copy. 40 | byte[] bytes; 41 | BytesWritable value = new BytesWritable() { 42 | public byte[] getBytes() { 43 | return bytes; 44 | } 45 | 46 | public int getLength() { 47 | return bytes.length; 48 | } 49 | }; 50 | 51 | @Override 52 | public Writable getCurrentValue() throws IOException, InterruptedException { 53 | bytes = ((RawBytesWritable)super.getCurrentValue()).get(); 54 | return value; 55 | } 56 | }; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/input/RawSequenceFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.DataInputBuffer; 6 | import org.apache.hadoop.mapreduce.InputSplit; 7 | import org.apache.hadoop.mapreduce.RecordReader; 8 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 9 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 10 | 11 | /** 12 | * InputFormat which uses {@link RawSequenceFileRecordReader} to read keys and values from 13 | * SequenceFiles as {@link DataInputBuffer} instances. 
14 | * 15 | * @author Andy Schlaikjer 16 | */ 17 | public class RawSequenceFileInputFormat extends 18 | SequenceFileInputFormat { 19 | @Override 20 | public RecordReader createRecordReader(InputSplit split, 21 | TaskAttemptContext context) throws IOException { 22 | return new RawSequenceFileRecordReader(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/BinaryConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | /** 4 | * A simple interface to serialize and deserialize objects 5 | */ 6 | public interface BinaryConverter { 7 | /* TODO : What about exceptions? 8 | */ 9 | 10 | /** Returns deserialized object. Throws if deserialization fails. */ 11 | M fromBytes(byte[] messageBuffer) throws DecodeException; 12 | 13 | byte[] toBytes(M message); 14 | 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/DecodeException.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Thrown by BinaryConverter if it fails to deserialize bytes. 
7 | */ 8 | public class DecodeException extends IOException { 9 | public DecodeException(Throwable cause) { 10 | super("BinaryConverter failed to decode", cause); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/GenericWritable.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | /** 4 | * {@link BinaryWritable} for Generics 5 | */ 6 | public class GenericWritable extends BinaryWritable { 7 | public GenericWritable(BinaryConverter converter) { 8 | this(null, converter); 9 | } 10 | 11 | public GenericWritable(M message, BinaryConverter converter) { 12 | super(message, converter); 13 | } 14 | 15 | @Override 16 | protected BinaryConverter getConverterFor(Class clazz) { 17 | throw new UnsupportedOperationException(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/IdentityBinaryConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | /** 4 | * A noop {@link BinaryConverter} that returns the input bytes unmodified. 
5 | */ 6 | public class IdentityBinaryConverter implements BinaryConverter { 7 | 8 | @Override 9 | public byte[] fromBytes(byte[] messageBuffer) { 10 | return messageBuffer; 11 | } 12 | 13 | @Override 14 | public byte[] toBytes(byte[] message) { 15 | return message; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ProtobufBlockReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import com.google.protobuf.Message; 7 | import com.twitter.elephantbird.util.TypeRef; 8 | 9 | import org.apache.hadoop.io.BytesWritable; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | /* A class to read blocks of protobuf data of type M. To use, just instantiate 14 | * with an InputStream and a TypeRef, call readProtobuf until it returns false, and 15 | * then close the protobuf. For example, 16 | * 17 | * TypeRef personRef = new TypeRef(); 18 | * ProtobufBlockReader reader = new ProtobufBlockReader( 19 | * new FileInputStream("person_data"), personRef); 20 | * ProtobufWritable writable = new ProtobufWritable(personRef); 21 | * while (reader.readProtobuf(writable)) { 22 | * Person person = writable.get(); 23 | * // do something with the protobuf. 24 | * } 25 | * reader.close(); 26 | * 27 | * 28 | * See the ProtobufBlockWriter for how to write data files like "person_data" above. 
29 | */ 30 | 31 | public class ProtobufBlockReader extends BinaryBlockReader { 32 | private static final Logger LOG = LoggerFactory.getLogger(ProtobufBlockReader.class); 33 | 34 | public ProtobufBlockReader(InputStream in, TypeRef typeRef) { 35 | super(in, ProtobufConverter.newInstance(typeRef)); 36 | LOG.info("ProtobufReader, my typeClass is " + typeRef.getRawClass()); 37 | } 38 | 39 | // for backward compatibility : 40 | 41 | public boolean readProtobuf(ProtobufWritable message) throws IOException { 42 | return readNext(message); 43 | } 44 | 45 | public boolean readProtobufBytes(BytesWritable message) throws IOException { 46 | return readNextProtoBytes(message); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ProtobufBlockWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.OutputStream; 4 | 5 | import com.google.protobuf.Message; 6 | import com.twitter.elephantbird.util.TypeRef; 7 | 8 | /** 9 | * A class to write blocks of protobuf data of type M. To use, just instantiate 10 | * with an OutputStream and a TypeRef, call write until you're done, call finish, and 11 | * then close the writer. For example, 12 | * 13 | * ProtobufBlockWriter writer = new ProtobufBlockWriter( 14 | * new FileOutputStream("person_data"), Person.class); 15 | * writer.write(person1); 16 | * ... 
17 | * writer.write(person100000); 18 | * writer.finish(); 19 | * writer.close(); 20 | * 21 | * To make an output stream for an lzo-compressed file in a Path named lzoPath in HDFS, 22 | * use the following code: 23 | * 24 | * Configuration conf = new Configuration(); 25 | * FileSystem fs = lzoPath.getFileSystem(conf); 26 | * FSDataOutputStream outputStream = fs.create(lzoPath, true); 27 | * LzopCodec codec = new LzopCodec(); 28 | * codec.setConf(conf); 29 | * OutputStream lzopOutputStream = codec.createOutputStream(outputStream); 30 | * 31 | * 32 | * See the ProtobufBlockReader for how to read data files like "person_data" above. 33 | */ 34 | public class ProtobufBlockWriter extends BinaryBlockWriter { 35 | 36 | public ProtobufBlockWriter(OutputStream out, Class protoClass) { 37 | super(out, protoClass, ProtobufConverter.newInstance(protoClass), DEFAULT_NUM_RECORDS_PER_BLOCK); 38 | } 39 | 40 | public ProtobufBlockWriter(OutputStream out, Class protoClass, int numRecordsPerBlock) { 41 | super(out, protoClass, ProtobufConverter.newInstance(protoClass), numRecordsPerBlock); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ProtobufWritable.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import com.google.protobuf.Message; 4 | import com.twitter.elephantbird.util.TypeRef; 5 | 6 | /** 7 | * A Hadoop Writable wrapper around a protocol buffer of type M. 
8 | */ 9 | 10 | public class ProtobufWritable extends BinaryWritable { 11 | 12 | public ProtobufWritable() { 13 | super(null, null); 14 | } 15 | 16 | public ProtobufWritable(TypeRef typeRef) { 17 | this(null, typeRef); 18 | } 19 | 20 | public ProtobufWritable(M message, TypeRef typeRef) { 21 | super(message, new ProtobufConverter(typeRef)); 22 | } 23 | 24 | /** 25 | * Returns a ProtobufWritable for a given Protobuf class. 26 | */ 27 | public static ProtobufWritable newInstance(Class tClass) { 28 | return new ProtobufWritable(new TypeRef(tClass){}); 29 | } 30 | 31 | @Override 32 | protected BinaryConverter getConverterFor(Class clazz) { 33 | return ProtobufConverter.newInstance(clazz); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/RawBlockReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.InputStream; 4 | 5 | /** 6 | * A {@link BinaryBlockReader} that returns each record as uninterpreted 7 | * raw bytes. 8 | */ 9 | public class RawBlockReader extends BinaryBlockReader { 10 | 11 | public RawBlockReader(InputStream in) { 12 | super(in, new IdentityBinaryConverter(), false); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/RawBlockWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.OutputStream; 4 | 5 | /** 6 | * A {@link BinaryBlockWriter} where each record is a byte array. 
7 | */ 8 | public class RawBlockWriter extends BinaryBlockWriter { 9 | 10 | public RawBlockWriter(OutputStream out) { 11 | super(out, byte[].class, 12 | new IdentityBinaryConverter(), DEFAULT_NUM_RECORDS_PER_BLOCK); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/RawBytesWritable.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | /** 4 | * A {@link BinaryWritable} that returns the raw bytes. 5 | */ 6 | public class RawBytesWritable extends BinaryWritable { 7 | 8 | public RawBytesWritable() { 9 | super(null, new IdentityBinaryConverter()); 10 | } 11 | 12 | @Override 13 | protected BinaryConverter getConverterFor(Class clazz) { 14 | return null; // not expected to be invoked since converter is always set. 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ThriftBlockReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.InputStream; 4 | 5 | import com.twitter.elephantbird.util.TypeRef; 6 | 7 | import org.apache.thrift.TBase; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | /** A class to read blocks of Thrift objects. 12 | * See the {@link ProtobufBlockReader} for more info. 
13 | */ 14 | public class ThriftBlockReader> extends BinaryBlockReader { 15 | private static final Logger LOG = LoggerFactory.getLogger(ThriftBlockReader.class); 16 | 17 | public ThriftBlockReader(InputStream in, TypeRef typeRef) { 18 | super(in, new ThriftConverter(typeRef)); 19 | LOG.info("ThriftBlockReader, my typeClass is " + typeRef.getRawClass()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ThriftBlockWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import java.io.OutputStream; 4 | 5 | import org.apache.thrift.TBase; 6 | 7 | import com.twitter.elephantbird.util.TypeRef; 8 | 9 | /** 10 | * A class to write blocks of Thrift data of type M. 11 | * See {@link ProtobufBlockWriter} for more documentation. 12 | */ 13 | public class ThriftBlockWriter> extends BinaryBlockWriter { 14 | 15 | public ThriftBlockWriter(OutputStream out, Class protoClass) { 16 | super(out, protoClass, new ThriftConverter(new TypeRef(protoClass){}), DEFAULT_NUM_RECORDS_PER_BLOCK); 17 | } 18 | 19 | public ThriftBlockWriter(OutputStream out, Class protoClass, int numRecordsPerBlock) { 20 | super(out, protoClass, new ThriftConverter(new TypeRef(protoClass){}), numRecordsPerBlock); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/io/ThriftWritable.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.io; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | import com.twitter.elephantbird.util.TypeRef; 6 | 7 | /** 8 | * {@link BinaryWritable} for Thrift 9 | */ 10 | public class ThriftWritable> extends BinaryWritable { 11 | /** 12 | * Returns a ThriftWritable for a given Thrift class. 
13 | */ 14 | public static > ThriftWritable newInstance(Class tClass) { 15 | return new ThriftWritable(new TypeRef(tClass){}); 16 | } 17 | 18 | public ThriftWritable() { 19 | super(null, null); 20 | } 21 | 22 | public ThriftWritable(TypeRef typeRef) { 23 | this(null, typeRef); 24 | } 25 | 26 | public ThriftWritable(M message, TypeRef typeRef) { 27 | super(message, new ThriftConverter(typeRef)); 28 | } 29 | 30 | @Override 31 | protected BinaryConverter getConverterFor(Class clazz) { 32 | return ThriftConverter.newInstance(clazz); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoBinaryBlockOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.RecordWriter; 6 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 7 | 8 | import com.twitter.elephantbird.mapreduce.io.RawBlockWriter; 9 | import com.twitter.elephantbird.mapreduce.io.RawBytesWritable; 10 | 11 | /** 12 | * Output format for LZO block-compressed byte[] records. 
13 | * 14 | * @author Andy Schlaikjer 15 | * @see LzoBinaryBlockRecordWriter 16 | * @see RawBytesWritable 17 | */ 18 | public class LzoBinaryBlockOutputFormat extends LzoOutputFormat { 19 | @Override 20 | public RecordWriter getRecordWriter(TaskAttemptContext job) 21 | throws IOException, InterruptedException { 22 | return new LzoBinaryBlockRecordWriter(new RawBlockWriter( 23 | getOutputStream(job))); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoBinaryBlockRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.BinaryBlockWriter; 6 | import com.twitter.elephantbird.mapreduce.io.BinaryWritable; 7 | 8 | import org.apache.hadoop.mapreduce.RecordWriter; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | 11 | /** 12 | * A writer for LZO-encoded blocks of protobuf or Thrift objects, generally read by 13 | * a ProtobufBlockWriter or similar. 
14 | */ 15 | public class LzoBinaryBlockRecordWriter> 16 | extends RecordWriter { 17 | 18 | private BinaryBlockWriter writer_; 19 | 20 | public LzoBinaryBlockRecordWriter(BinaryBlockWriter writer) { 21 | writer_ = writer; 22 | } 23 | 24 | public void write(M nullWritable, W protoWritable) 25 | throws IOException, InterruptedException { 26 | writer_.write(protoWritable.get()); 27 | // the counters are not accessible 28 | } 29 | 30 | public void close(TaskAttemptContext taskAttemptContext) 31 | throws IOException, InterruptedException { 32 | writer_.finish(); 33 | writer_.close(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.DataOutputStream; 4 | import java.io.IOException; 5 | 6 | import com.twitter.elephantbird.util.HadoopCompat; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.hadoop.compression.lzo.LzopCodec; 13 | import com.twitter.elephantbird.util.LzoUtils; 14 | 15 | /** 16 | * Base class for Lzo outputformats. 17 | * provides an helper method to create lzo output stream. 
18 | */ 19 | public abstract class LzoOutputFormat extends WorkFileOverride.FileOutputFormat { 20 | 21 | public static final Logger LOG = LoggerFactory.getLogger(LzoOutputFormat.class); 22 | 23 | /** 24 | * Helper method to create lzo output file needed to create RecordWriter 25 | */ 26 | protected DataOutputStream getOutputStream(TaskAttemptContext job) 27 | throws IOException, InterruptedException { 28 | 29 | return LzoUtils.getIndexedLzoOutputStream( 30 | HadoopCompat.getConfiguration(job), 31 | getDefaultWorkFile(job, LzopCodec.DEFAULT_LZO_EXTENSION)); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoProtobufB64LineRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.DataOutputStream; 4 | 5 | import com.google.protobuf.Message; 6 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter; 7 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 8 | 9 | /** 10 | * This class is not strictly necessary, you can use LzoBinaryB64LineRecordWriter directly.
11 | * It is just there to make the Protobuf dependency clear. 12 | * 13 | * @param Message you are writing 14 | * @param Writable of this message 15 | */ 16 | public class LzoProtobufB64LineRecordWriter> 17 | extends LzoBinaryB64LineRecordWriter { 18 | 19 | public LzoProtobufB64LineRecordWriter(BinaryConverter converter, DataOutputStream out) { 20 | super(converter, out); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoProtobufBlockRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import com.google.protobuf.Message; 4 | import com.twitter.elephantbird.mapreduce.io.BinaryBlockWriter; 5 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 6 | 7 | public class LzoProtobufBlockRecordWriter > 8 | extends LzoBinaryBlockRecordWriter { 9 | 10 | public LzoProtobufBlockRecordWriter(BinaryBlockWriter writer) { 11 | super(writer); 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoTextOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | 5 | import com.twitter.elephantbird.util.HadoopCompat; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.mapreduce.RecordWriter; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 11 | 12 | import com.hadoop.compression.lzo.LzopCodec; 13 | import com.twitter.elephantbird.util.LzoUtils; 14 | 15 | public class LzoTextOutputFormat extends WorkFileOverride.TextOutputFormat { 16 | @Override 17 | public RecordWriter 
getRecordWriter(TaskAttemptContext job) 18 | throws IOException, InterruptedException { 19 | 20 | Configuration conf = HadoopCompat.getConfiguration(job); 21 | Path path = getDefaultWorkFile(job, LzopCodec.DEFAULT_LZO_EXTENSION); 22 | 23 | return new LineRecordWriter( 24 | LzoUtils.getIndexedLzoOutputStream(conf, path), 25 | conf.get("mapred.textoutputformat.separator", "\t") 26 | ); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoThriftB64LineOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.ThriftConverter; 6 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 7 | import com.twitter.elephantbird.util.HadoopCompat; 8 | import com.twitter.elephantbird.util.ThriftUtils; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.mapreduce.RecordWriter; 13 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 14 | import org.apache.thrift.TBase; 15 | 16 | /** 17 | * Data is written as one base64 encoded serialized thrift per line.

18 | * 19 | * Do not forget to set Thrift class using setClassConf(). 20 | */ 21 | public class LzoThriftB64LineOutputFormat> 22 | extends LzoOutputFormat> { 23 | 24 | protected TypeRef typeRef_; 25 | 26 | public LzoThriftB64LineOutputFormat() {} 27 | 28 | public LzoThriftB64LineOutputFormat(TypeRef typeRef) { 29 | typeRef_ = typeRef; 30 | } 31 | 32 | /** 33 | * Sets an internal configuration in jobConf so that remote Tasks 34 | * instantiate appropriate object for this generic class based on thriftClass 35 | */ 36 | public static > 37 | void setClassConf(Class thriftClass, Configuration jobConf) { 38 | ThriftUtils.setClassConf(jobConf, 39 | LzoThriftB64LineOutputFormat.class, 40 | thriftClass); 41 | } 42 | 43 | @Override 44 | public RecordWriter> getRecordWriter(TaskAttemptContext job) 45 | throws IOException, InterruptedException { 46 | if (typeRef_ == null) { 47 | typeRef_ = ThriftUtils.getTypeRef(HadoopCompat.getConfiguration(job), LzoThriftB64LineOutputFormat.class); 48 | } 49 | return new LzoBinaryB64LineRecordWriter>(new ThriftConverter(typeRef_), getOutputStream(job)); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoThriftB64LineRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.DataOutputStream; 4 | 5 | import org.apache.thrift.TBase; 6 | 7 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter; 8 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 9 | 10 | /** 11 | * This class is not strictly necessary, you can use LzoBinaryB64LineRecordWriter directly.
12 | * It is just there to make the Thrift dependency clear. 13 | * 14 | * @param <M> thrift message that will be written 15 | * @param <W> writable that wraps this message 16 | */ 17 | public class LzoThriftB64LineRecordWriter<M extends TBase<?, ?>, W extends ThriftWritable<M>> 18 | extends LzoBinaryB64LineRecordWriter<M, W> { 19 | 20 | public LzoThriftB64LineRecordWriter(BinaryConverter<M> converter, DataOutputStream out) { 21 | super(converter, out); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoThriftBlockOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.ThriftBlockWriter; 6 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 7 | import com.twitter.elephantbird.util.HadoopCompat; 8 | import com.twitter.elephantbird.util.ThriftUtils; 9 | import com.twitter.elephantbird.util.TypeRef; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.mapreduce.RecordWriter; 13 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 14 | import org.apache.thrift.TBase; 15 | 16 | /** 17 | * Data is written as LZO-compressed blocks of serialized thrift objects.

18 | * 19 | * Do not forget to set Thrift class using setClassConf(). 20 | */ 21 | public class LzoThriftBlockOutputFormat> 22 | extends LzoOutputFormat> { 23 | 24 | protected TypeRef typeRef_; 25 | 26 | public LzoThriftBlockOutputFormat() {} 27 | 28 | public LzoThriftBlockOutputFormat(TypeRef typeRef) { 29 | typeRef_ = typeRef; 30 | } 31 | 32 | /** 33 | * Sets an internal configuration in jobConf so that remote Tasks 34 | * instantiate appropriate object for this generic class based on thriftClass 35 | */ 36 | public static > 37 | void setClassConf(Class thriftClass, Configuration jobConf) { 38 | ThriftUtils.setClassConf(jobConf, 39 | LzoThriftBlockOutputFormat.class, 40 | thriftClass); 41 | } 42 | 43 | public RecordWriter> getRecordWriter(TaskAttemptContext job) 44 | throws IOException, InterruptedException { 45 | if (typeRef_ == null) { 46 | typeRef_ = ThriftUtils.getTypeRef(HadoopCompat.getConfiguration(job), LzoThriftBlockOutputFormat.class); 47 | } 48 | return new LzoBinaryBlockRecordWriter>( 49 | new ThriftBlockWriter(getOutputStream(job), typeRef_.getRawClass())); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/mapreduce/output/LzoThriftBlockRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.output; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.BinaryBlockWriter; 6 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 7 | 8 | public class LzoThriftBlockRecordWriter , W extends ThriftWritable> 9 | extends LzoBinaryBlockRecordWriter { 10 | 11 | public LzoThriftBlockRecordWriter(BinaryBlockWriter writer) { 12 | super(writer); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/util/Codecs.java: 
-------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import java.lang.reflect.InvocationTargetException; 4 | 5 | import org.apache.commons.codec.binary.Base64; 6 | 7 | /** 8 | * Various Codecs specific utilities. 9 | */ 10 | public final class Codecs { 11 | private Codecs() { 12 | 13 | } 14 | 15 | /** 16 | * Get a instance of standard base64 implementation from apache 17 | * commons-codec library 18 | * @return standard base64 instance 19 | */ 20 | public static Base64 createStandardBase64() { 21 | /* with constructor Base64() in commons-codec-1.4 22 | * encode() inserts a newline after every 76 characters. 23 | * Base64(0) disables that incompatibility. 24 | */ 25 | try { 26 | return Base64.class.getConstructor(int.class).newInstance(0); 27 | } catch (SecurityException e) { 28 | } catch (NoSuchMethodException e) { 29 | } catch (IllegalArgumentException e) { 30 | } catch (InstantiationException e) { 31 | } catch (IllegalAccessException e) { 32 | } catch (InvocationTargetException e) { 33 | } 34 | return new Base64(); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/util/CoreTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import com.hadoop.compression.lzo.LzoCodec; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * Common test utilities 12 | */ 13 | public class CoreTestUtil { 14 | 15 | private static final Logger LOG = LoggerFactory.getLogger(CoreTestUtil.class); 16 | 17 | static public String getTestDataDir(Class testClass) { 18 | return System.getProperty("test.build.data") + "/" + testClass.getSimpleName(); 19 | } 20 | 21 | /** 22 | * @return true if "require.lzo.tests" 
system property is set or if native 23 | * lzo libraries are loaded. 24 | */ 25 | static public boolean okToRunLzoTests(Configuration conf) throws IOException { 26 | 27 | if (Boolean.parseBoolean(System.getProperty("require.lzo.tests"))) { 28 | return true; 29 | } 30 | try { 31 | return LzoCodec.isNativeLzoLoaded(conf); 32 | } catch (UnsatisfiedLinkError e) { 33 | LOG.warn("Unable to load native LZO, skipping tests that require it.", e); 34 | } 35 | return false; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/util/ListHelper.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import java.util.List; 4 | 5 | import com.google.common.base.Predicate; 6 | import com.google.common.collect.Lists; 7 | 8 | /** 9 | * Functional list utilities that google collections is for some reason lacking. 10 | */ 11 | 12 | public class ListHelper { 13 | public static List filter(List input, Predicate predicate) { 14 | List output = Lists.newArrayList(); 15 | for (K val: input) { 16 | if (predicate.apply(val)) { 17 | output.add(val); 18 | } 19 | } 20 | return output; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/util/Pair.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | public class Pair { 4 | private final A first; 5 | private final B second; 6 | 7 | public Pair(A first, B second) { 8 | this.first = first; 9 | this.second = second; 10 | } 11 | 12 | @Override 13 | public int hashCode() { 14 | int hashFirst = first != null ? first.hashCode() : 0; 15 | int hashSecond = second != null ? 
second.hashCode() : 0; 16 | return (hashFirst + hashSecond) * hashSecond + hashFirst; 17 | } 18 | 19 | @Override 20 | public boolean equals(Object other) { 21 | if (other instanceof Pair) { 22 | Pair otherPair = (Pair) other; 23 | if ((first == null && otherPair.first == null) || 24 | (first != null && first.equals(otherPair.first))) { 25 | if ((second == null && otherPair.second == null) || 26 | (second != null && second.equals(otherPair.second))) { 27 | return true; 28 | } 29 | } 30 | } 31 | return false; 32 | } 33 | 34 | @Override 35 | public String toString() 36 | { 37 | return "(" + first + ", " + second + ")"; 38 | } 39 | 40 | public A getFirst() { 41 | return first; 42 | } 43 | 44 | public B getSecond() { 45 | return second; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /core/src/main/java/com/twitter/elephantbird/util/Utils.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | public class Utils { 4 | /** 5 | * returns Class.forName(className, true, classLoader).
6 | * Throws a RuntimeException if the class is not found. 7 | * 8 | * @see {@link Class#forName(String, boolean, ClassLoader)} 9 | */ 10 | public static Class<?> classForName(String className, ClassLoader classLoader) { 11 | try { 12 | return Class.forName(className, true, classLoader); 13 | } catch (ClassNotFoundException e) { 14 | throw new RuntimeException("failed to load class " + className, e); 15 | } 16 | } 17 | 18 | /** 19 | * Ensures the classLoader is 'consistent' with the original 20 | * class loader that created existingClass. Asserts
21 | * classLoader.loadClass(existingClass.getName()) == existingClass. 22 | *

23 | * 24 | * If classLoader fails to load the class, this returns silently.
25 | * Throws a RuntimeException with detailed message if the consistency 26 | * check fails. 27 | * 28 | * @param existingClass 29 | * @param classLoader 30 | */ 31 | public static void ensureClassLoaderConsistency(Class<?> existingClass, 32 | ClassLoader classLoader) { 33 | Class<?> loadedClass; 34 | try { 35 | loadedClass = Class.forName(existingClass.getName(), true, classLoader); 36 | } catch (ClassNotFoundException e) { 37 | return; // let class loading fail somewhere else. 38 | } 39 | 40 | if (!loadedClass.equals(existingClass)) { 41 | throw new RuntimeException("The class loader is inconsistent with the " 42 | + "class loader that initially loaded " 43 | + existingClass.getClass() 44 | + ". This can lead to various unexpected side effects."); 45 | 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/protobuf/address_book.proto: -------------------------------------------------------------------------------- 1 | package com.twitter.data.proto.tutorial; 2 | 3 | // The sample protocol buffer file that Google uses in their examples at 4 | // https://code.google.com/p/protobuf. 5 | // Used in this project for tests and examples. 
6 | 7 | option java_outer_classname = "AddressBookProtos"; 8 | 9 | message Person { 10 | required string name = 1; 11 | required int32 id = 2; 12 | optional string email = 3; 13 | 14 | enum PhoneType { 15 | MOBILE = 0; 16 | HOME = 1; 17 | WORK = 2; 18 | } 19 | 20 | message PhoneNumber { 21 | required string number = 1; 22 | optional PhoneType type = 2 [default = HOME]; 23 | } 24 | 25 | repeated PhoneNumber phone = 4; 26 | } 27 | 28 | message AddressBook { 29 | repeated Person person = 1; 30 | optional bytes byteData = 2; 31 | } 32 | 33 | // used testing handling of unknown fields 34 | message PersonWithoutEmail { 35 | required string name = 1; 36 | required int32 id = 2; 37 | repeated Person.PhoneNumber phone = 4; 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/protobuf/thrift_fixtures.proto: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.examples.proto; 2 | 3 | message OneOfEach { 4 | optional bool im_true = 1; 5 | optional bool im_false = 2; 6 | optional int32 a_bite = 3; 7 | optional int32 integer16 = 4; 8 | optional int32 integer32 = 5; 9 | optional int64 integer64 = 6; 10 | optional double double_precision = 7; 11 | optional string some_characters = 8; 12 | optional string zomg_unicode = 9; 13 | optional bool what_who = 10; 14 | optional bytes base64 = 11; 15 | repeated int32 byte_list = 12; 16 | repeated int32 i16_list = 13; 17 | repeated int64 i64_list = 14; 18 | }; -------------------------------------------------------------------------------- /core/src/test/java/com/twitter/elephantbird/mapreduce/input/TestLzoJsonRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import junit.framework.TestCase; 4 | import org.apache.hadoop.io.MapWritable; 5 | import org.apache.hadoop.io.Text; 6 | import 
org.json.simple.parser.JSONParser; 7 | import org.junit.Test; 8 | 9 | /** 10 | * Test the LzoJsonRecordReader, make sure it reads the data properly. 11 | */ 12 | public class TestLzoJsonRecordReader extends TestCase { 13 | 14 | /** 15 | * {@link LzoJsonRecordReader#decodeLineToJson(JSONParser, Text, MapWritable)} 16 | * must not choke on lines containing the word "null" (i.e. not the null 17 | * value but the string "null"). 18 | * 19 | * This can happen when the original input line to JSONParser contains "null" 20 | * as a string. In this case {@link JSONParser#parse(java.io.Reader)} will 21 | * return a null reference. 22 | * 23 | */ 24 | @Test 25 | public void testNullString() { 26 | Text line = new Text("null"); 27 | boolean result = LzoJsonRecordReader.decodeLineToJson(new JSONParser(), line, new MapWritable()); 28 | assertEquals("Parsing line with contents 'null'", false, result); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /core/src/test/java/com/twitter/elephantbird/mapreduce/input/TestLzoProtobufBlockInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import org.junit.Test; 6 | 7 | public class TestLzoProtobufBlockInputFormat { 8 | 9 | @Test 10 | public void testCreation() { 11 | assertTrue("we need to write a real test", true); 12 | /* 13 | * LzoDeviceInputFormat a = new LzoDeviceInputFormat(); 14 | 15 | System.out.println("a's class is " + a.getClass()); 16 | 17 | LzoStatusInputFormat b = new LzoStatusInputFormat(); 18 | System.out.println("b's class is " + b.getClass()); 19 | 20 | Message m = Protobufs.instantiateFromClassName("com.twitter.elephantbird.data.proto.Tables.Device"); 21 | System.out.println("m's classname is " + m.getClass() + " and it looks like[ " + m + " ]"); 22 | 23 | Device m2 = 
Protobufs.instantiateFromClass(Device.class); 24 | System.out.println("m2's classname is " + m2.getClass()); 25 | 26 | Device m3 = Protobufs.parseFrom(Device.class, new byte[] {}); 27 | System.out.println("m3's classname is " + m3.getClass()); 28 | */ 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /core/src/test/java/com/twitter/elephantbird/util/TestCodecs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | 4 | import static org.junit.Assert.assertArrayEquals; 5 | 6 | import org.junit.After; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | public class TestCodecs { 11 | 12 | @Before 13 | public void setUp() { 14 | } 15 | 16 | 17 | @After 18 | public void tearDown() { 19 | } 20 | 21 | @Test 22 | public void testcreateStandardBase64() { 23 | String quote = "Man is distinguished, not only by his reason, but " + 24 | "by this singular passion from other animals, which is a" + 25 | " lust of the mind, that by a perseverance of delight in" + 26 | " the continued and indefatigable generation of knowledge," + 27 | " exceeds the short vehemence of any carnal pleasure."; 28 | String expected = "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpc" + 29 | "yByZWFzb24sIGJ1dCBieSB0aGlzIHNpbmd1bGFyIHBhc3Npb24gZnJvbSB" + 30 | "vdGhlci" + 31 | "BhbmltYWxzLCB3aGljaCBpcyBhIGx1c3Qgb2YgdGhlIG1pbmQsIHRoYXQgYn" + 32 | "kgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0aGUgY29udGlu" + 33 | "dWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xl" + 34 | "ZGdlLCBleGNlZWRzIHRoZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5" + 35 | "hbCBwbGVhc3VyZS4="; 36 | assertArrayEquals(Codecs.createStandardBase64().encode(quote.getBytes()), 37 | expected.getBytes()); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /core/src/test/java/com/twitter/elephantbird/util/TestHadoopUtils.java: 
-------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import java.util.Map; 4 | import java.util.Set; 5 | 6 | import com.google.common.collect.Maps; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertNull; 13 | import static org.junit.Assert.fail; 14 | 15 | /** 16 | * @author Alex Levenson 17 | */ 18 | public class TestHadoopUtils { 19 | 20 | @Test 21 | public void testReadWriteObjectToConfAsBase64() throws Exception { 22 | Map anObject = Maps.newHashMap(); 23 | anObject.put(7, "seven"); 24 | anObject.put(8, "eight"); 25 | 26 | Configuration conf = new Configuration(); 27 | 28 | HadoopUtils.writeObjectToConfAsBase64("anobject", anObject, conf); 29 | Map copy = HadoopUtils.readObjectFromConfAsBase64("anobject", conf); 30 | assertEquals(anObject, copy); 31 | 32 | try { 33 | Set bad = HadoopUtils.readObjectFromConfAsBase64("anobject", conf); 34 | fail("This should throw a ClassCastException"); 35 | } catch (ClassCastException e) { 36 | 37 | } 38 | 39 | conf = new Configuration(); 40 | Object nullObj = null; 41 | 42 | HadoopUtils.writeObjectToConfAsBase64("anobject", null, conf); 43 | Object copyObj = HadoopUtils.readObjectFromConfAsBase64("anobject", conf); 44 | assertEquals(nullObj, copyObj); 45 | } 46 | 47 | @Test 48 | public void readObjectFromConfAsBase64UnsetKey() throws Exception { 49 | assertNull(HadoopUtils.readObjectFromConfAsBase64("non-existant-key", new Configuration())); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /core/src/test/java/com/twitter/elephantbird/util/TestThriftToProto.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.IOException; 6 | 
7 | import org.apache.thrift.Fixtures; 8 | import org.apache.thrift.TException; 9 | import org.junit.Test; 10 | 11 | import thrift.test.OneOfEach; 12 | 13 | import com.google.protobuf.Descriptors.DescriptorValidationException; 14 | import com.google.protobuf.DynamicMessage; 15 | import com.twitter.elephantbird.thrift.test.PhoneNumber; 16 | import com.twitter.elephantbird.thrift.test.PhoneType; 17 | import com.twitter.elephantbird.examples.proto.ThriftFixtures; 18 | import com.twitter.elephantbird.util.Protobufs; 19 | import com.twitter.elephantbird.util.ThriftToProto; 20 | 21 | public class TestThriftToProto { 22 | @Test 23 | public void testThriftToProto() throws TException, IOException { 24 | OneOfEach ooe = Fixtures.oneOfEach; 25 | ThriftToProto thriftToProto = 26 | ThriftToProto.newInstance(ooe, ThriftFixtures.OneOfEach.newBuilder().build()); 27 | ThriftFixtures.OneOfEach proto = thriftToProto.convert(ooe); 28 | assertEquals(ooe.im_true, proto.getImTrue()); 29 | assertEquals(ooe.im_false, proto.getImFalse()); 30 | assertEquals(ooe.a_bite, proto.getABite()); 31 | assertEquals(ooe.integer16, proto.getInteger16()); 32 | assertEquals(ooe.integer32, proto.getInteger32()); 33 | assertEquals(ooe.integer64, proto.getInteger64()); 34 | assertEquals(ooe.double_precision, proto.getDoublePrecision(), 0.00001); 35 | assertEquals(ooe.some_characters, proto.getSomeCharacters()); 36 | assertEquals(ooe.zomg_unicode, proto.getZomgUnicode()); 37 | assertEquals(ooe.what_who, proto.getWhatWho()); 38 | 39 | assertEquals(new String(ooe.getBase64(), "UTF-8"), proto.getBase64().toStringUtf8()); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/test/resources/com/twitter/elephantbird/util/sample_dir/a.txt: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- 
/core/src/test/resources/com/twitter/elephantbird/util/sample_dir/b.txt: -------------------------------------------------------------------------------- 1 | bee 2 | -------------------------------------------------------------------------------- /core/src/test/resources/com/twitter/elephantbird/util/sample_dir/nested/c.txt: -------------------------------------------------------------------------------- 1 | 123456789 -------------------------------------------------------------------------------- /core/src/test/resources/com/twitter/elephantbird/util/sample_dir/nested/d.txt: -------------------------------------------------------------------------------- 1 | d 2 | -------------------------------------------------------------------------------- /core/src/test/resources/com/twitter/elephantbird/util/sample_dir/nested/double_nested/e.txt: -------------------------------------------------------------------------------- 1 | e -------------------------------------------------------------------------------- /core/src/test/thrift/address_book.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | namespace java com.twitter.elephantbird.thrift.test 20 | 21 | enum PhoneType { 22 | MOBILE = 0, 23 | HOME = 1, 24 | WORK = 2 25 | } 26 | 27 | struct PhoneNumber { 28 | 1: string number, 29 | 2: optional PhoneType type 30 | } 31 | 32 | struct Name { 33 | 1: string first_name, 34 | 2: string last_name 35 | } 36 | 37 | struct Person { 38 | 1: required Name name, 39 | 2: i32 id, 40 | 3: string email, 41 | 4: list phones 42 | } 43 | 44 | struct AddressBook { 45 | 1: list persons 46 | } 47 | -------------------------------------------------------------------------------- /core/thrift7/src/main/java/com/twitter/elephantbird/thrift/AbstractThriftBinaryDeserializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.thrift; 2 | 3 | import org.apache.thrift.TDeserializer; 4 | import org.apache.thrift.protocol.TBinaryProtocol; 5 | import org.apache.thrift.protocol.TProtocolFactory; 6 | 7 | /** 8 | * A shim on top of thrift to allow for thrift 0.7/0.9 compatibility 9 | * 10 | * This one is designed for thrift 0.7 11 | * 12 | */ 13 | class AbstractThriftBinaryDeserializer extends TDeserializer { 14 | public AbstractThriftBinaryDeserializer(TProtocolFactory protocolFactory) { 15 | super(protocolFactory); 16 | } 17 | 18 | protected void resetAndInitialize(TBinaryProtocol protocol, int newLength) { 19 | protocol.reset(); 20 | protocol.setReadLength(newLength); // reduces OutOfMemoryError exceptions 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /core/thrift7/src/main/java/com/twitter/elephantbird/thrift/AbstractThriftBinaryProtocol.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.thrift; 2 | 3 | import org.apache.thrift.protocol.TBinaryProtocol; 4 | import org.apache.thrift.protocol.TProtocolException; 5 | import org.apache.thrift.transport.TTransport; 6 | 7 | /** 8 | * A 
shim on top of thrift to allow for thrift 0.7/0.9 compatibility. 9 | * 10 | * This one is designed for thrift 0.7 11 | * 12 | */ 13 | abstract class AbstractThriftBinaryProtocol extends TBinaryProtocol { 14 | public AbstractThriftBinaryProtocol(TTransport trans) { 15 | super(trans); 16 | } 17 | 18 | public AbstractThriftBinaryProtocol(TTransport trans, boolean strictRead, boolean strictWrite) { 19 | super(trans, strictRead, strictWrite); 20 | } 21 | 22 | @SuppressWarnings("unused") 23 | public AbstractThriftBinaryProtocol(TTransport trans, long stringLengthLimit, long containerLengthLimit) { 24 | super(trans); 25 | } 26 | 27 | protected void resetAndInitialize(TBinaryProtocol protocol, int newLength) { 28 | protocol.reset(); 29 | protocol.setReadLength(newLength); 30 | } 31 | 32 | /** 33 | * Check if the container size is valid. 34 | * 35 | * NOTE: This assumes that the elements are one byte each. So this does not 36 | * catch all cases, but does increase the chances of handling malformed 37 | * lengths when the number of remaining bytes in the underlying Transport is 38 | * clearly less than the container size that the Transport provides. 
39 | */ 40 | protected void checkContainerSize(int size) throws TProtocolException { 41 | if (size < 0) { 42 | throw new TProtocolException("Negative container size: " + size); 43 | } 44 | 45 | if (checkReadLength_ && (readLength_ - size) < 0) { 46 | throw new TProtocolException("Remaining message length is " + readLength_ 47 | + " but container size in underlying TTransport is set to at least: " + size); 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /core/thrift9/src/main/java/com/twitter/elephantbird/thrift/AbstractThriftBinaryDeserializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.thrift; 2 | 3 | import org.apache.thrift.TDeserializer; 4 | import org.apache.thrift.protocol.TBinaryProtocol; 5 | import org.apache.thrift.protocol.TProtocolFactory; 6 | 7 | /** 8 | * A shim on top of thrift to allow for thrift 0.7/0.9 compatibility. 9 | * 10 | * This one is designed for thrift 0.9 and above 11 | * 12 | */ 13 | abstract class AbstractThriftBinaryDeserializer extends TDeserializer { 14 | public AbstractThriftBinaryDeserializer(TProtocolFactory protocolFactory) { 15 | super(protocolFactory); 16 | } 17 | 18 | protected void resetAndInitialize(TBinaryProtocol protocol, int newLength) { 19 | protocol.reset(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /core/thrift9/src/main/java/com/twitter/elephantbird/thrift/AbstractThriftBinaryProtocol.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.thrift; 2 | 3 | import org.apache.thrift.protocol.TBinaryProtocol; 4 | import org.apache.thrift.protocol.TProtocolException; 5 | import org.apache.thrift.transport.TTransport; 6 | 7 | /** 8 | * A shim on top of thrift to allow for thrift 0.7/0.9 compatibility. 
9 | * 10 | * This one is designed for thrift 0.9 and above 11 | * 12 | */ 13 | abstract class AbstractThriftBinaryProtocol extends TBinaryProtocol { 14 | public AbstractThriftBinaryProtocol(TTransport trans) { 15 | super(trans); 16 | } 17 | 18 | public AbstractThriftBinaryProtocol(TTransport trans, long stringLengthLimit, long containerLengthLimit) { 19 | super(trans, stringLengthLimit, containerLengthLimit); 20 | } 21 | 22 | /** 23 | * Check if the container size is valid. 24 | */ 25 | protected void checkContainerSize(int size) throws TProtocolException { 26 | if (size < 0) { 27 | throw new TProtocolException("Negative container size: " + size); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /crunch/Readme.md: -------------------------------------------------------------------------------- 1 | # Elephant Bird for [Apache Crunch](https://crunch.apache.org) 2 | 3 | [Apache Crunch](https://crunch.apache.org/intro.html) is a Java library for writing, testing, and running MapReduce pipelines. One of Crunch's 4 | goals is to make it easy to write and test pipelines that process complex records containing nested and repeated 5 | data structures, like protocol buffers and Thrift records. This module contains support for Crunch's 6 | [PType](https://crunch.apache.org/apidocs/0.8.0/org/apache/crunch/types/PType.html) serialization for 7 | Elephant Bird's `ProtobufWritable` and `ThriftWritable` classes, along with 8 | [Source](https://crunch.apache.org/apidocs/0.8.0/org/apache/crunch/Source.html), [Target](https://crunch.apache.org/apidocs/0.8.0/org/apache/crunch/Target.html), 9 | and [SourceTarget](https://crunch.apache.org/apidocs/0.8.0/org/apache/crunch/SourceTarget.html) implementations to support 10 | Elephant Bird's `LzoProtobufBlockInputFormat`, `LzoThriftBlockInputFormat`, `LzoProtobufBlockOutputFormat`, and 11 | `LzoThriftBlockOutputFormat`. 
12 | -------------------------------------------------------------------------------- /crunch/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-crunch 11 | Elephant Bird Crunch 12 | Crunch utilities. 13 | 14 | 15 | com.twitter.elephantbird 16 | elephant-bird-core 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-client 21 | 22 | 23 | org.slf4j 24 | slf4j-simple 25 | 26 | 27 | org.apache.crunch 28 | crunch-core 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/LzoProtobufSourceTarget.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import com.google.protobuf.Message; 4 | import org.apache.crunch.io.impl.ReadableSourceTargetImpl; 5 | import org.apache.crunch.types.PType; 6 | import org.apache.hadoop.fs.Path; 7 | 8 | /** 9 | * A Crunch {@code SourceTarget} for writing files out using a 10 | * {@link com.twitter.elephantbird.mapreduce.output.LzoProtobufBlockOutputFormat} and then reading them back with a 11 | * {@link com.twitter.elephantbird.mapreduce.input.LzoProtobufBlockInputFormat}. 12 | */ 13 | public class LzoProtobufSourceTarget extends ReadableSourceTargetImpl { 14 | 15 | /** 16 | * Factory method for creating a new {@code LzoProtobufSourceTarget} from a given path and protocol buffer 17 | * message class. 
18 | * 19 | * @param path path to the data 20 | * @param protoClass the Message class to read 21 | * @return a new {@code LzoProtobufSourceTarget} 22 | */ 23 | public static LzoProtobufSourceTarget at(Path path, Class protoClass) { 24 | return new LzoProtobufSourceTarget(path, EBTypes.protos(protoClass)); 25 | } 26 | 27 | public LzoProtobufSourceTarget(Path path, PType ptype) { 28 | super(new LzoProtobufSource(path, ptype), new LzoProtobufTarget(path)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/LzoThriftSourceTarget.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import org.apache.crunch.io.impl.ReadableSourceTargetImpl; 4 | import org.apache.crunch.types.PType; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.thrift.TBase; 7 | 8 | /** 9 | * A Crunch {@code SourceTarget} for writing files with the 10 | * {@link com.twitter.elephantbird.mapreduce.output.LzoThriftBlockOutputFormat} and reading them back with the 11 | * {@link com.twitter.elephantbird.mapreduce.input.LzoThriftBlockInputFormat}. 12 | */ 13 | public class LzoThriftSourceTarget> extends ReadableSourceTargetImpl { 14 | 15 | /** 16 | * Factory method for creating a new {@code LzoThriftSourceTarget} from a given path and Thrift 17 | * record class. 
18 | * 19 | * @param path path to the data 20 | * @param thriftClass the Thrift class to read 21 | * @return a new {@code LzoThriftSourceTarget} 22 | */ 23 | public static > LzoThriftSourceTarget at(Path path, Class thriftClass) { 24 | return new LzoThriftSourceTarget(path, EBTypes.thrifts(thriftClass)); 25 | } 26 | 27 | public LzoThriftSourceTarget(Path path, PType ptype) { 28 | super(new LzoThriftSource(path, ptype), new LzoThriftTarget(path)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/ProtobufFileReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import com.google.common.collect.Iterators; 4 | import com.google.common.collect.UnmodifiableIterator; 5 | import com.google.protobuf.Message; 6 | import com.twitter.elephantbird.mapreduce.io.ProtobufBlockReader; 7 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 8 | import com.twitter.elephantbird.util.TypeRef; 9 | import org.apache.crunch.io.FileReaderFactory; 10 | import org.apache.crunch.io.impl.AutoClosingIterator; 11 | import org.apache.crunch.types.PType; 12 | import org.apache.hadoop.fs.FSDataInputStream; 13 | import org.apache.hadoop.fs.FileSystem; 14 | import org.apache.hadoop.fs.Path; 15 | 16 | import java.io.IOException; 17 | import java.util.Iterator; 18 | 19 | class ProtobufFileReaderFactory implements FileReaderFactory { 20 | 21 | private final PType ptype; 22 | 23 | public ProtobufFileReaderFactory(PType ptype) { 24 | this.ptype = ptype; 25 | } 26 | 27 | @Override 28 | public Iterator read(FileSystem fs, Path path) { 29 | try { 30 | final FSDataInputStream in = fs.open(path); 31 | return new AutoClosingIterator(in, new UnmodifiableIterator() { 32 | TypeRef typeRef = new TypeRef(ptype.getTypeClass()) {}; 33 | ProtobufBlockReader reader = new ProtobufBlockReader(in, typeRef); 34 
| ProtobufWritable pw = new ProtobufWritable(typeRef); 35 | 36 | @Override 37 | public boolean hasNext() { 38 | try { 39 | return reader.readNext(pw); 40 | } catch (IOException e) { 41 | //TODO 42 | return false; 43 | } 44 | } 45 | 46 | @Override 47 | public T next() { 48 | return pw.get(); 49 | } 50 | }); 51 | } catch (IOException e) { 52 | //TODO 53 | return Iterators.emptyIterator(); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/ProtobufReadableData.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import com.google.protobuf.Message; 4 | import org.apache.crunch.ReadableData; 5 | import org.apache.crunch.io.FileReaderFactory; 6 | import org.apache.crunch.io.impl.ReadableDataImpl; 7 | import org.apache.crunch.types.PType; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | import java.util.List; 11 | 12 | class ProtobufReadableData extends ReadableDataImpl { 13 | private final PType ptype; 14 | 15 | public ProtobufReadableData(List paths, PType ptype) { 16 | super(paths); 17 | this.ptype = ptype; 18 | } 19 | 20 | @Override 21 | protected FileReaderFactory getFileReaderFactory() { 22 | return new ProtobufFileReaderFactory(ptype); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/ThriftFileReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import com.google.common.collect.Iterators; 4 | import com.google.common.collect.UnmodifiableIterator; 5 | import com.twitter.elephantbird.mapreduce.io.ThriftBlockReader; 6 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 7 | import com.twitter.elephantbird.util.TypeRef; 8 | import 
org.apache.crunch.io.FileReaderFactory; 9 | import org.apache.crunch.io.impl.AutoClosingIterator; 10 | import org.apache.crunch.types.PType; 11 | import org.apache.hadoop.fs.FSDataInputStream; 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.thrift.TBase; 15 | 16 | import java.io.IOException; 17 | import java.util.Iterator; 18 | 19 | class ThriftFileReaderFactory> implements FileReaderFactory { 20 | 21 | private final PType ptype; 22 | public ThriftFileReaderFactory(PType ptype) { 23 | this.ptype = ptype; 24 | } 25 | 26 | @Override 27 | public Iterator read(FileSystem fs, Path path) { 28 | try { 29 | final FSDataInputStream in = fs.open(path); 30 | return new AutoClosingIterator(in, new UnmodifiableIterator() { 31 | TypeRef typeRef = new TypeRef(ptype.getTypeClass()) {}; 32 | ThriftBlockReader reader = new ThriftBlockReader(in, typeRef); 33 | ThriftWritable tw = new ThriftWritable(typeRef); 34 | 35 | @Override 36 | public boolean hasNext() { 37 | try { 38 | return reader.readNext(tw); 39 | } catch (IOException e) { 40 | //TODO 41 | return false; 42 | } 43 | } 44 | 45 | @Override 46 | public T next() { 47 | return tw.get(); 48 | } 49 | }); 50 | } catch (IOException e) { 51 | //TODO 52 | return Iterators.emptyIterator(); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /crunch/src/main/java/com/twitter/elephantbird/crunch/ThriftReadableData.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.crunch; 2 | 3 | import org.apache.crunch.io.FileReaderFactory; 4 | import org.apache.crunch.io.impl.ReadableDataImpl; 5 | import org.apache.crunch.types.PType; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.thrift.TBase; 8 | 9 | import java.util.List; 10 | 11 | class ThriftReadableData> extends ReadableDataImpl { 12 | 13 | private final PType ptype; 14 | 15 | ThriftReadableData(List 
paths, PType ptype) { 16 | super(paths); 17 | this.ptype = ptype; 18 | } 19 | 20 | @Override 21 | protected FileReaderFactory getFileReaderFactory() { 22 | return new ThriftFileReaderFactory(ptype); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/src/main/pig/json_word_count.pig: -------------------------------------------------------------------------------- 1 | register ../../../dist/elephant-bird-1.0.jar; 2 | 3 | -- To generate data for use with this pig script, take a json data file of the form 4 | -- { "key1": 16, "key2": 1, "key3": 28 } 5 | -- { "key4": 66, "key1": 1, "key5": 38, "key6": 77 } 6 | -- ... 7 | -- { "key82383": 29, "key1": 22 } 8 | -- run lzop over it, and place the resulting compressed file in the directory you use 9 | -- as the first argument to this class on the command line. 10 | raw_data = load '/path/to/your_lzop_data' using com.twitter.elephantbird.pig.load.LzoJsonLoader() 11 | as ( 12 | json: map[] 13 | ); 14 | 15 | certain_keys = foreach raw_data generate (int)json#'key1' as key1_count, (int)json#'key3' as key3_count; 16 | 17 | -- etc. 
18 | 19 | 20 | -------------------------------------------------------------------------------- /examples/src/main/pig/nested_json_get_distinct_items_from_nested_array.pig: -------------------------------------------------------------------------------- 1 | -- nested_json_pizza_sample_data.json 2 | -- { "Name": "BBQ Chicken", "Sizes": [{ "Size": "Large", "Price": 14.99 }, { "Size": "Medium", "Price": 12.99 }], "Toppings": [ "Barbecue Sauce", "Chicken", "Cheese" ] } 3 | -- { "Name": "Hawaiian", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Ham", "Pineapple", "Cheese" ] } 4 | -- { "Name": "Vegetable", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Broccoli", "Tomato", "Cheese" ] } 5 | -- { "Name": "Pepperoni", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }, { "Size": "Small", "Price": 7.49 }], "Toppings": [ "Pepperoni", "Cheese" ] } 6 | -- { "Name": "Cheese", "Sizes": [{ "Size": "Large", "Price": 10.99 }, { "Size": "Medium", "Price": 9.99 }, { "Size": "Small", "Price": 5.49 }], "Toppings": [ "Cheese" ] } 7 | 8 | register /path/to/json-simple.jar; 9 | register /path/to/elephant-bird-core.jar; 10 | register /path/to/elephant-bird-pig.jar; 11 | 12 | json_data = load '/path/to/nested_json_pizza_sample_data.json' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 13 | toppings = foreach json_data generate flatten($0#'Toppings'); 14 | distinct_toppings = distinct toppings; 15 | dump distinct_toppings; 16 | -------------------------------------------------------------------------------- /examples/src/main/pig/nested_json_get_top_level_property_values.pig: -------------------------------------------------------------------------------- 1 | -- nested_json_pizza_sample_data.json 2 | -- { "Name": "BBQ Chicken", "Sizes": [{ "Size": "Large", "Price": 14.99 }, { "Size": "Medium", "Price": 12.99 }], "Toppings": [ "Barbecue 
Sauce", "Chicken", "Cheese" ] } 3 | -- { "Name": "Hawaiian", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Ham", "Pineapple", "Cheese" ] } 4 | -- { "Name": "Vegetable", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Broccoli", "Tomato", "Cheese" ] } 5 | -- { "Name": "Pepperoni", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }, { "Size": "Small", "Price": 7.49 }], "Toppings": [ "Pepperoni", "Cheese" ] } 6 | -- { "Name": "Cheese", "Sizes": [{ "Size": "Large", "Price": 10.99 }, { "Size": "Medium", "Price": 9.99 }, { "Size": "Small", "Price": 5.49 }], "Toppings": [ "Cheese" ] } 7 | 8 | register /path/to/json-simple.jar; 9 | register /path/to/elephant-bird-core.jar; 10 | register /path/to/elephant-bird-pig.jar; 11 | 12 | json_data = load '/path/to/nested_json_pizza_sample_data.json' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 13 | names = foreach json_data generate $0#'Name' as name; 14 | ordered = order names by name; 15 | dump ordered; 16 | -------------------------------------------------------------------------------- /examples/src/main/pig/nested_json_get_values_count_for_property_inside_nested_array.pig: -------------------------------------------------------------------------------- 1 | -- nested_json_pizza_sample_data.json 2 | -- { "Name": "BBQ Chicken", "Sizes": [{ "Size": "Large", "Price": 14.99 }, { "Size": "Medium", "Price": 12.99 }], "Toppings": [ "Barbecue Sauce", "Chicken", "Cheese" ] } 3 | -- { "Name": "Hawaiian", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Ham", "Pineapple", "Cheese" ] } 4 | -- { "Name": "Vegetable", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Broccoli", "Tomato", "Cheese" ] } 5 | -- { "Name": "Pepperoni", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { 
"Size": "Medium", "Price": 10.99 }, { "Size": "Small", "Price": 7.49 }], "Toppings": [ "Pepperoni", "Cheese" ] } 6 | -- { "Name": "Cheese", "Sizes": [{ "Size": "Large", "Price": 10.99 }, { "Size": "Medium", "Price": 9.99 }, { "Size": "Small", "Price": 5.49 }], "Toppings": [ "Cheese" ] } 7 | 8 | register /path/to/json-simple.jar; 9 | register /path/to/elephant-bird-core.jar; 10 | register /path/to/elephant-bird-pig.jar; 11 | 12 | json_data = load '/path/to/nested_json_pizza_sample_data.json' using com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); 13 | sizes = foreach json_data generate flatten($0#'Sizes'); 14 | grouped = group sizes by $0#'Size'; 15 | size_and_count = foreach grouped generate group as size, COUNT($1) as count; 16 | dump size_and_count; 17 | -------------------------------------------------------------------------------- /examples/src/main/pig/nested_json_pizza_sample_data.json: -------------------------------------------------------------------------------- 1 | { "Name": "BBQ Chicken", "Sizes": [{ "Size": "Large", "Price": 14.99 }, { "Size": "Medium", "Price": 12.99 }], "Toppings": [ "Barbecue Sauce", "Chicken", "Cheese" ] } 2 | { "Name": "Hawaiian", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Ham", "Pineapple", "Cheese" ] } 3 | { "Name": "Vegetable", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }], "Toppings": [ "Broccoli", "Tomato", "Cheese" ] } 4 | { "Name": "Pepperoni", "Sizes": [{ "Size": "Large", "Price": 12.99 }, { "Size": "Medium", "Price": 10.99 }, { "Size": "Small", "Price": 7.49 }], "Toppings": [ "Pepperoni", "Cheese" ] } 5 | { "Name": "Cheese", "Sizes": [{ "Size": "Large", "Price": 10.99 }, { "Size": "Medium", "Price": 9.99 }, { "Size": "Small", "Price": 5.49 }], "Toppings": [ "Cheese" ] } 6 | -------------------------------------------------------------------------------- /examples/src/main/pig/people_phone_number_count.pig: 
-------------------------------------------------------------------------------- 1 | register '$EB_HOME/*/target/*.jar'; 2 | 3 | raw_data = load '/path/to/input_files' using ProtobufPigLoader('com.twitter.elephantbird.examples.proto.AddressBookProtos.Person'); 4 | 5 | person_phone_numbers = foreach raw_data generate name, FLATTEN(phone.phone_tuple.number) as phone_number; 6 | 7 | phones_by_person = group person_phone_numbers by name; 8 | 9 | person_phone_count = foreach phones_by_person generate group as name, COUNT(person_phone_numbers) as phone_count; 10 | 11 | dump person_phone_count; 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/src/main/pig/people_phone_number_count_thrift.pig: -------------------------------------------------------------------------------- 1 | register ../../../dist/elephant-bird-1.0.jar; 2 | 3 | -- the schema does not need to be explicitly specified. Pig queries the loader 4 | -- for the schema. Specifying the schema in comments here helps the readers of 5 | -- the script. It is commented out so that it is does not override the most 6 | -- up to date schema when the Thrift class changes. 
7 | -- you use ThriftToPig class to pretty print : 8 | -- $ java -cp "[..]elephant-bird-1.0.jar" com.twitter.elephantbird.pig.piggybank.ThriftToPig thrift.class.name 9 | 10 | raw_data = load '/path/to/input_files' using com.twitter.elephantbird.pig.load.LzoThriftB64LinePigLoader('com.twitter.elephantbird.examples.thrift.Person'); 11 | -- as ( 12 | -- name: chararray, 13 | -- id: int, 14 | -- email: chararray, 15 | -- phones: { 16 | -- phones_tuple: ( 17 | -- number: chararray, 18 | -- type: chararray 19 | -- ) 20 | -- } 21 | -- ) 22 | 23 | person_phone_numbers = foreach raw_data generate name, FLATTEN(phone.phone_tuple.number) as phone_number; 24 | 25 | phones_by_person = group person_phone_numbers by name; 26 | 27 | person_phone_count = foreach phones_by_person generate group as name, COUNT(person_phone_numbers) as phone_count; 28 | 29 | dump person_phone_count; 30 | 31 | 32 | -------------------------------------------------------------------------------- /examples/src/main/protobuf/address_book.proto: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.examples.proto; 2 | 3 | option java_outer_classname = "AddressBookProtos"; 4 | 5 | message Person { 6 | required string name = 1; 7 | required int32 id = 2; 8 | optional string email = 3; 9 | 10 | enum PhoneType { 11 | MOBILE = 0; 12 | HOME = 1; 13 | WORK = 2; 14 | } 15 | 16 | message PhoneNumber { 17 | required string number = 1; 18 | optional PhoneType type = 2 [default = HOME]; 19 | } 20 | 21 | repeated PhoneNumber phone = 4; 22 | } 23 | 24 | message AddressBook { 25 | repeated Person person = 1; 26 | } -------------------------------------------------------------------------------- /examples/src/main/protobuf/examples.proto: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.examples.proto; 2 | 3 | option java_outer_classname = "Examples"; 4 | 5 | message Age { 6 | optional 
string name = 1; 7 | optional int32 age = 2; 8 | } 9 | -------------------------------------------------------------------------------- /examples/src/main/thrift/address_book.thrift: -------------------------------------------------------------------------------- 1 | namespace java com.twitter.elephantbird.examples.thrift 2 | 3 | enum PhoneType { 4 | MOBILE = 0, 5 | HOME = 1, 6 | WORK = 2 7 | } 8 | 9 | struct PhoneNumber { 10 | 1: string number, 11 | 2: optional PhoneType type 12 | } 13 | 14 | struct Name { 15 | 1: string first_name, 16 | 2: string last_name 17 | } 18 | 19 | struct Person { 20 | 1: required Name name, 21 | 2: i32 id, 22 | 3: string email, 23 | 4: list phones 24 | } 25 | 26 | struct AddressBook { 27 | 1: list persons 28 | } 29 | -------------------------------------------------------------------------------- /examples/src/main/thrift/simple_age.thrift: -------------------------------------------------------------------------------- 1 | namespace java com.twitter.elephantbird.examples.thrift 2 | 3 | // a simple class with name of a person and the age. 4 | 5 | struct Age { 6 | 1: string name, 7 | 2: i32 age 8 | } 9 | -------------------------------------------------------------------------------- /hadoop-compat/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-hadoop-compat 11 | Elephant Bird Hadoop Compatibility 12 | Utilities for dealing with Hadoop incompatibilities between 1.x and 2.x 13 | 14 | 15 | org.apache.hadoop 16 | hadoop-client 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /hive/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-hive 11 | Elephant Bird Hive 12 | Hive utilities. 
13 | 14 | 15 | 16 | twitter 17 | Twitter 18 | https://maven.twttr.com/ 19 | 20 | true 21 | 22 | 23 | false 24 | 25 | 26 | 27 | 28 | 29 | com.twitter.elephantbird 30 | elephant-bird-core 31 | 32 | 33 | org.apache.hadoop 34 | hadoop-client 35 | 36 | 37 | org.slf4j 38 | slf4j-simple 39 | 40 | 41 | org.apache.hive 42 | hive-serde 43 | 44 | 45 | org.apache.hive 46 | hive-exec 47 | 48 | 49 | org.apache.hcatalog 50 | hcatalog 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /hive/src/main/java/com/twitter/elephantbird/hive/serde/LzoProtobufHiveSerde.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.hive.serde; 2 | 3 | import java.util.Properties; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.hive.serde2.SerDe; 7 | import org.apache.hadoop.hive.serde2.SerDeException; 8 | import org.apache.hadoop.hive.serde2.SerDeStats; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 10 | import org.apache.hadoop.io.Writable; 11 | 12 | public abstract class LzoProtobufHiveSerde implements SerDe { 13 | 14 | @Override 15 | public void initialize(Configuration conf, Properties props) throws SerDeException { 16 | } 17 | 18 | @Override 19 | public abstract ObjectInspector getObjectInspector() throws SerDeException; 20 | 21 | @Override 22 | public abstract Object deserialize(Writable w) throws SerDeException; 23 | 24 | @Override 25 | public Class getSerializedClass() { 26 | return org.apache.hadoop.io.Text.class; 27 | //serialization not supported 28 | } 29 | 30 | @Override 31 | public Writable serialize(Object arg0, ObjectInspector arg1) throws SerDeException { 32 | return null; 33 | //serialization not supported 34 | } 35 | 36 | @Override 37 | public SerDeStats getSerDeStats() { 38 | return null; 39 | // stats not supported 40 | } 41 | } 42 | 
-------------------------------------------------------------------------------- /lucene/src/main/java/com/twitter/elephantbird/mapreduce/input/LuceneIndexCountHitsRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import com.google.common.collect.ImmutableList; 7 | 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.lucene.search.IndexSearcher; 10 | import org.apache.lucene.search.Query; 11 | import org.apache.lucene.search.TotalHitCountCollector; 12 | 13 | /** 14 | * Only counts the number of hits for each query 15 | * 16 | * @author Alex Levenson 17 | */ 18 | public abstract class LuceneIndexCountHitsRecordReader 19 | extends LuceneIndexRecordReader { 20 | 21 | @Override 22 | protected Iterator search(IndexSearcher searcher, Query query) throws IOException { 23 | TotalHitCountCollector collector = new TotalHitCountCollector(); 24 | searcher.search(query, collector); 25 | return ImmutableList.of(new IntWritable(collector.getTotalHits())).iterator(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lucene/src/test/resources/com/twitter/elephantbird/mapreduce/input/sample_indexes/index-1/index-1.txt: -------------------------------------------------------------------------------- 1 | one 2 | -------------------------------------------------------------------------------- /lucene/src/test/resources/com/twitter/elephantbird/mapreduce/input/sample_indexes/index-2/data.txt: -------------------------------------------------------------------------------- 1 | 123456789123456789 2 | 3 | -------------------------------------------------------------------------------- /lucene/src/test/resources/com/twitter/elephantbird/mapreduce/input/sample_indexes/more-indexes/index-3/data.txt: 
-------------------------------------------------------------------------------- 1 | data 2 | 3 | -------------------------------------------------------------------------------- /lucene/src/test/resources/com/twitter/elephantbird/mapreduce/input/sample_indexes/unrelated/index-unrelated.txt: -------------------------------------------------------------------------------- 1 | unrelated 2 | -------------------------------------------------------------------------------- /mahout/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | com.twitter.elephantbird 6 | elephant-bird 7 | 4.17-SNAPSHOT 8 | .. 9 | 10 | elephant-bird-mahout 11 | Elephant Bird Mahout 12 | Mahout utilities. 13 | 14 | 15 | com.twitter.elephantbird 16 | elephant-bird-pig 17 | 18 | 19 | com.twitter.elephantbird 20 | elephant-bird-pig 21 | test-jar 22 | 23 | 24 | org.apache.hadoop 25 | hadoop-client 26 | 27 | 28 | org.apache.pig 29 | pig 30 | ${apache.pig.classifier} 31 | 32 | 33 | org.antlr 34 | antlr 35 | 36 | 37 | joda-time 38 | joda-time 39 | 40 | 41 | log4j 42 | log4j 43 | 44 | 45 | org.apache.mahout 46 | mahout-collections 47 | 48 | 49 | org.apache.mahout 50 | mahout-core 51 | 52 | 53 | org.apache.mahout 54 | mahout-math 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /pig-lucene/src/test/resources/com/twitter/elephantbird/pig/index.pig: -------------------------------------------------------------------------------- 1 | lines = load '$INPUT' using TextLoader() as (line:chararray); 2 | store lines into '$OUTPUT' using com.twitter.elephantbird.pig.store.LuceneIndexStorage('com.twitter.elephantbird.pig.PigLuceneIndexingIntegrationTest\$IndexOutputFormat'); 3 | -------------------------------------------------------------------------------- /pig-lucene/src/test/resources/com/twitter/elephantbird/pig/load/queryfile.txt: 
-------------------------------------------------------------------------------- 1 | +hello -goodbye 2 | +test 3 | 4 | +こにちは 5 | -------------------------------------------------------------------------------- /pig-lucene/src/test/resources/com/twitter/elephantbird/pig/search_file.pig: -------------------------------------------------------------------------------- 1 | hits = load '$INPUT' using com.twitter.elephantbird.pig.PigLuceneIndexingIntegrationTest\$Loader('--file', '$QUERY_FILE'); 2 | store hits into '$OUTPUT' using PigStorage('\t'); 3 | -------------------------------------------------------------------------------- /pig-lucene/src/test/resources/com/twitter/elephantbird/pig/search_queries.pig: -------------------------------------------------------------------------------- 1 | -- we have to hard code the query literals because pig substitution via pigserver can't handle this kind of substitution 2 | hits = load '$INPUT' using com.twitter.elephantbird.pig.PigLuceneIndexingIntegrationTest\$Loader('--queries','+(macbeth achilles)','+shield','+out, +"candle!"'); 3 | store hits into '$OUTPUT' using PigStorage('\t'); 4 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/HBaseLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.commons.cli.ParseException; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | import org.apache.pig.backend.hadoop.hbase.HBaseStorage; 9 | 10 | /** 11 | * @deprecated replaced by {@link HBaseStorage}. 12 | */ 13 | @Deprecated 14 | public class HBaseLoader extends HBaseStorage { 15 | private static final Log LOG = LogFactory.getLog(HBaseLoader.class); 16 | 17 | static private void warn() { 18 | LOG.warn("HBaseLoader is deprecated and will be removed soon." 
19 | + " Please use " + HBaseStorage.class.getName() 20 | + ". HBaseStorage is a drop in replacement."); 21 | } 22 | 23 | public HBaseLoader(String columnList) throws ParseException, IOException { 24 | super(columnList); 25 | warn(); 26 | } 27 | 28 | public HBaseLoader(String columnList, String optString) throws ParseException, IOException { 29 | super(columnList, optString); 30 | warn(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoJsonLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import com.twitter.elephantbird.mapreduce.input.LzoTextInputFormat; 4 | 5 | public class LzoJsonLoader extends JsonLoader { 6 | 7 | /** 8 | * Constructor. Construct a LzoJsonLoader LoadFunc to load. 9 | * @param optString Loader options. For available options, 10 | * see {@link JsonLoader#JsonLoader(String)}. 11 | * Notice that the -inputFormat option is overridden. 12 | */ 13 | public LzoJsonLoader(String optString) { 14 | super(optString); 15 | this.setInputFormatClassName(LzoTextInputFormat.class.getName()); 16 | } 17 | 18 | public LzoJsonLoader() { 19 | // defaults to no options 20 | this(""); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoProtobufB64LinePigLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import com.google.protobuf.Message; 4 | 5 | /** 6 | * @Deprecated use {@link ProtobufPigLoader} 7 | */ 8 | public class LzoProtobufB64LinePigLoader extends ProtobufPigLoader { 9 | 10 | public LzoProtobufB64LinePigLoader(String protoClassName) { 11 | super(protoClassName); 12 | LOG.warn("LzoProtobufB64LinePigLoader is deprecated and will be removed in future. 
" + 13 | "please use ProtobufPigLoader"); 14 | } 15 | } -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoProtobufBlockPigLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import com.google.protobuf.Message; 4 | 5 | /** 6 | * @Deprecated use {@link ProtobufPigLoader} 7 | */ 8 | public class LzoProtobufBlockPigLoader extends ProtobufPigLoader { 9 | 10 | public LzoProtobufBlockPigLoader(String protoClassName) { 11 | super(protoClassName); 12 | LOG.warn("LzoProtobufBlockPigLoader is deprecated and will be removed in future. " + 13 | "please use ProtobufPigLoader"); 14 | } 15 | } -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoRawBytesLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.mapreduce.InputFormat; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.pig.ResourceSchema; 9 | import org.apache.pig.data.DataByteArray; 10 | import org.apache.pig.data.Tuple; 11 | import org.apache.pig.data.TupleFactory; 12 | import org.apache.pig.impl.util.Utils; 13 | 14 | import com.twitter.elephantbird.mapreduce.input.MultiInputFormat; 15 | import com.twitter.elephantbird.mapreduce.io.BinaryWritable; 16 | import com.twitter.elephantbird.util.TypeRef; 17 | 18 | /** 19 | * Loads raw bytes. 
20 | */ 21 | public class LzoRawBytesLoader extends LzoBaseLoadFunc { 22 | 23 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); 24 | 25 | private TypeRef typeRef = new TypeRef(byte[].class){}; 26 | 27 | @Override 28 | public InputFormat> getInputFormat() throws IOException { 29 | return new MultiInputFormat(typeRef); 30 | } 31 | 32 | @Override 33 | public Tuple getNext() throws IOException { 34 | byte[] bytes = getNextBinaryValue(typeRef); 35 | return bytes != null ? 36 | tupleFactory.newTuple(new DataByteArray(bytes)) : null; 37 | } 38 | 39 | @Override 40 | public ResourceSchema getSchema(String filename, Job job) throws IOException { 41 | return new ResourceSchema(Utils.getSchemaFromString("bytes : bytearray")); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoRegexLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import java.util.regex.Pattern; 4 | 5 | 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | /** 10 | * LzoRegexLoader extends LzoBaseRegexLoader, allowing regular expressions to be passed by argument through pig latin 11 | * via a line like: 12 | * A = LOAD 'test.txt' USING com.twitter.elephantbird.pig.storage.LzoRegexLoader('(\\d+)::(\\w+)\|\|(\\w+)'); 13 | * which would parse lines like 14 | * 1::one||i 2::two||ii 3::three--iii 15 | * into arrays like 16 | * {1, "one", "i"}, {2, "two", "ii"}, {3, "three", "iii"} 17 | */ 18 | public class LzoRegexLoader extends LzoBaseRegexLoader { 19 | private static final Logger LOG = LoggerFactory.getLogger(LzoRegexLoader.class); 20 | 21 | private final Pattern pattern_; 22 | 23 | /** 24 | * The regex is passed in via the constructor. 25 | * @param pattern the regex. 
26 | */ 27 | public LzoRegexLoader(String pattern) { 28 | LOG.info("LzoRegexLoader with regex = " + pattern); 29 | 30 | pattern = pattern.replace("\\\\","\\"); 31 | pattern_ = Pattern.compile(pattern); 32 | } 33 | 34 | /** 35 | * Implement the abstract part of the class by returning the pattern. 36 | */ 37 | @Override 38 | public Pattern getPattern() { 39 | return pattern_; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoTextLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import com.twitter.elephantbird.pig.store.LzoPigStorage; 4 | 5 | /** 6 | * Load the LZO file line by line, passing each line as a single-field Tuple to Pig. 7 | */ 8 | public class LzoTextLoader extends LzoPigStorage { 9 | 10 | public LzoTextLoader() { 11 | super("\n"); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoThriftB64LinePigLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | /** 6 | * @Deprecated use {@link ThriftPigLoader} 7 | */ 8 | public class LzoThriftB64LinePigLoader> extends ThriftPigLoader { 9 | 10 | public LzoThriftB64LinePigLoader(String thriftClassName) { 11 | super(thriftClassName); 12 | LOG.warn("LzoThriftB64LinePigLoader is deprecated and will be removed in future. 
" + 13 | "please use ThriftPigLoader"); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoThriftBlockPigLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | /** 6 | * @Deprecated use {@link ThriftPigLoader} 7 | */ 8 | public class LzoThriftBlockPigLoader> extends ThriftPigLoader { 9 | 10 | public LzoThriftBlockPigLoader(String thriftClassName) { 11 | super(thriftClassName); 12 | LOG.warn("LzoThriftBlockPigLoader is deprecated and will be removed in future " + 13 | "please use ThriftPigLoader"); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/load/LzoTokenizedLoader.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.load; 2 | 3 | import com.twitter.elephantbird.pig.store.LzoPigStorage; 4 | 5 | /** 6 | * Same as {@link LzoPigStorage}. 
/**
 * A loader based on {@link MultiInputFormat} to read input written in
 * different file formats.
 *
 * @see MultiInputFormat
 */
public class MultiFormatLoader extends FilterLoadFunc {

  // Type of the Thrift or Protobuf records being loaded; fixed in the constructor.
  private TypeRef typeRef = null;

  /**
   * @param className Thrift or Protobuf class
   */
  public MultiFormatLoader(String className) {
    super(null);
    Class clazz = PigUtil.getClass(className);
    typeRef = new TypeRef(clazz){};

    /* Initialize the delegate loader. It is required to handle
     * functionality of LoadFunc, LoadMetadata etc,
     * even though it does not affect the inputformat.
     */
    LoadFunc ldr;
    if (Message.class.isAssignableFrom(clazz)) {
      // Protobuf records: delegate schema/metadata handling to ProtobufPigLoader.
      ldr = new ProtobufPigLoader(className);

    } else if (TBase.class.isAssignableFrom(clazz)) {
      // Thrift records: delegate schema/metadata handling to ThriftPigLoader.
      ldr = new ThriftPigLoader>(className);

    } else {
      // Only Thrift and Protobuf classes are supported by MultiInputFormat.
      throw new RuntimeException(className + " is not a Protobuf or Thrift class");
    }

    setLoader(ldr);
  }

  /**
   * Always returns a {@link MultiInputFormat}, regardless of which delegate
   * loader was chosen above.
   */
  @Override
  public InputFormat> getInputFormat() throws IOException {
    return new MultiInputFormat(typeRef);
  }
}
17 | */ 18 | package com.twitter.elephantbird.pig.piggybank; 19 | 20 | import org.apache.pig.impl.logicalLayer.FrontendException; 21 | 22 | /** 23 | * @see GenericInvoker 24 | */ 25 | public class InvokeForDouble extends GenericInvoker { 26 | 27 | public InvokeForDouble() {} 28 | 29 | public InvokeForDouble(String fullName) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 30 | super(fullName); 31 | } 32 | 33 | public InvokeForDouble(String fullName, String paramSpecsStr) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 34 | super(fullName, paramSpecsStr); 35 | } 36 | 37 | public InvokeForDouble(String fullName, String paramSpecsStr, String isStatic) 38 | throws ClassNotFoundException, FrontendException, SecurityException, NoSuchMethodException { 39 | super(fullName, paramSpecsStr, isStatic); 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/piggybank/InvokeForFloat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.twitter.elephantbird.pig.piggybank; 19 | 20 | import org.apache.pig.impl.logicalLayer.FrontendException; 21 | 22 | /** 23 | * @see GenericInvoker 24 | */ 25 | 26 | public class InvokeForFloat extends GenericInvoker { 27 | 28 | public InvokeForFloat() {} 29 | 30 | public InvokeForFloat(String fullName) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 31 | super(fullName); 32 | } 33 | 34 | public InvokeForFloat(String fullName, String paramSpecsStr) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 35 | super(fullName, paramSpecsStr); 36 | } 37 | 38 | public InvokeForFloat(String fullName, String paramSpecsStr, String isStatic) 39 | throws ClassNotFoundException, FrontendException, SecurityException, NoSuchMethodException { 40 | super(fullName, paramSpecsStr, isStatic); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/piggybank/InvokeForInt.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.twitter.elephantbird.pig.piggybank; 19 | 20 | import org.apache.pig.impl.logicalLayer.FrontendException; 21 | 22 | /** 23 | * @see GenericInvoker 24 | */ 25 | public class InvokeForInt extends GenericInvoker { 26 | 27 | public InvokeForInt() {} 28 | 29 | public InvokeForInt(String fullName, String paramSpecsStr) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 30 | super(fullName, paramSpecsStr); 31 | } 32 | 33 | public InvokeForInt(String fullName) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 34 | super(fullName); 35 | } 36 | 37 | public InvokeForInt(String fullName, String paramSpecsStr, String isStatic) 38 | throws ClassNotFoundException, FrontendException, SecurityException, NoSuchMethodException { 39 | super(fullName, paramSpecsStr, isStatic); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/piggybank/InvokeForLong.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.twitter.elephantbird.pig.piggybank; 19 | 20 | import org.apache.pig.impl.logicalLayer.FrontendException; 21 | 22 | /** 23 | * @see GenericInvoker 24 | */ 25 | public class InvokeForLong extends GenericInvoker { 26 | 27 | public InvokeForLong() {} 28 | 29 | public InvokeForLong(String fullName) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 30 | super(fullName); 31 | } 32 | 33 | public InvokeForLong(String fullName, String paramSpecsStr) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 34 | super(fullName, paramSpecsStr); 35 | } 36 | 37 | public InvokeForLong(String fullName, String paramSpecsStr, String isStatic) 38 | throws ClassNotFoundException, FrontendException, SecurityException, NoSuchMethodException { 39 | super(fullName, paramSpecsStr, isStatic); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/piggybank/InvokeForString.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.twitter.elephantbird.pig.piggybank; 20 | 21 | import org.apache.pig.impl.logicalLayer.FrontendException; 22 | 23 | /** 24 | * @see GenericInvoker 25 | */ 26 | public class InvokeForString extends GenericInvoker { 27 | 28 | public InvokeForString() {} 29 | 30 | public InvokeForString(String fullName, String paramSpecsStr) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 31 | super(fullName, paramSpecsStr); 32 | } 33 | 34 | public InvokeForString(String fullName) throws FrontendException, SecurityException, ClassNotFoundException, NoSuchMethodException { 35 | super(fullName); 36 | } 37 | 38 | public InvokeForString(String fullName, String paramSpecsStr, String isStatic) 39 | throws ClassNotFoundException, FrontendException, SecurityException, NoSuchMethodException { 40 | super(fullName, paramSpecsStr, isStatic); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/store/Bz2PigStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import com.twitter.elephantbird.util.HadoopCompat; 6 | import org.apache.hadoop.io.compress.BZip2Codec; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.pig.builtin.PigStorage; 9 | 10 | /** 11 | * Enables bzip2 compression for storage.
12 | * This is similar to: 13 | *

14 |  *   set output.compression.enabled true;
15 |  *   set output.compression.codec org.apache.hadoop.io.compress.BZip2Codec;
16 |  *   storage alias using PigStorage();
17 |  * 
18 | */ 19 | public class Bz2PigStorage extends PigStorage { 20 | // Ideally, PigStorage it self should take more options like compression 21 | // codec etc. 22 | public Bz2PigStorage() { 23 | super(); 24 | } 25 | 26 | public Bz2PigStorage(String delimiter) { 27 | super(delimiter); 28 | } 29 | 30 | @Override 31 | public void setStoreLocation(String location, Job job) throws IOException { 32 | HadoopCompat.getConfiguration(job).set("output.compression.enabled", "true"); 33 | HadoopCompat.getConfiguration(job).set("output.compression.codec", BZip2Codec.class.getName()); 34 | super.setStoreLocation(location, job); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/store/LzoRawBytesStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.OutputFormat; 6 | import org.apache.pig.data.DataByteArray; 7 | import org.apache.pig.data.Tuple; 8 | 9 | import com.twitter.elephantbird.mapreduce.io.RawBytesWritable; 10 | import com.twitter.elephantbird.mapreduce.output.LzoBinaryBlockOutputFormat; 11 | import com.twitter.elephantbird.pig.load.LzoBaseLoadFunc; 12 | import com.twitter.elephantbird.pig.load.LzoRawBytesLoader; 13 | 14 | /** 15 | * Stores raw byte[] records with LZO block-compression, suitable for reading via 16 | * {@link LzoRawBytesLoader} or some other {@link LzoBaseLoadFunc}. 
17 | * 18 | * @author Andy Schlaikjer 19 | */ 20 | public class LzoRawBytesStorage extends BaseStoreFunc { 21 | private final RawBytesWritable writable = new RawBytesWritable(); 22 | 23 | @Override 24 | @SuppressWarnings("rawtypes") 25 | public OutputFormat getOutputFormat() throws IOException { 26 | return new LzoBinaryBlockOutputFormat(); 27 | } 28 | 29 | @Override 30 | public void putNext(Tuple t) throws IOException { 31 | DataByteArray data = null; 32 | if (t == null || t.size() < 1 || (data = (DataByteArray) t.get(0)) == null) { 33 | // TODO(Andy Schlaikjer): Signal error 34 | return; 35 | } 36 | writable.set(data.get()); 37 | writeRecord(null, writable); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/store/LzoThriftB64LinePigStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.OutputFormat; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.thrift.TBase; 8 | 9 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 10 | import com.twitter.elephantbird.mapreduce.output.LzoThriftB64LineOutputFormat; 11 | import com.twitter.elephantbird.pig.util.PigToThrift; 12 | import com.twitter.elephantbird.pig.util.PigUtil; 13 | import com.twitter.elephantbird.util.TypeRef; 14 | 15 | /** 16 | * Serializes Pig Tuples into Base-64 encoded, line-delimited Thrift objects. 17 | * The fields in the pig tuple must correspond exactly to the fields in 18 | * the Thrift object, as no name-matching is performed (names of the tuple 19 | * fields are not currently accessible to a StoreFunc. 
It will be in 0.7, 20 | * so something more flexible will be possible) 21 | */ 22 | public class LzoThriftB64LinePigStorage> extends BaseStoreFunc { 23 | 24 | private TypeRef typeRef; 25 | private ThriftWritable writable; 26 | private PigToThrift pigToThrift; 27 | 28 | public LzoThriftB64LinePigStorage(String thriftClassName) { 29 | typeRef = PigUtil.getThriftTypeRef(thriftClassName); 30 | writable = ThriftWritable.newInstance(typeRef.getRawClass()); 31 | pigToThrift = PigToThrift.newInstance(typeRef); 32 | } 33 | 34 | @Override 35 | @SuppressWarnings("unchecked") 36 | public void putNext(Tuple f) throws IOException { 37 | if (f == null) return; 38 | try { 39 | writable.set(pigToThrift.getThriftObject(f)); 40 | writer.write(null, writable); 41 | } catch (InterruptedException e) { 42 | throw new IOException(e); 43 | } 44 | } 45 | 46 | @Override 47 | public OutputFormat> getOutputFormat() throws IOException { 48 | return new LzoThriftB64LineOutputFormat(typeRef); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/store/LzoThriftBlockPigStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.OutputFormat; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.thrift.TBase; 8 | 9 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 10 | import com.twitter.elephantbird.mapreduce.output.LzoThriftBlockOutputFormat; 11 | import com.twitter.elephantbird.pig.util.PigToThrift; 12 | import com.twitter.elephantbird.pig.util.PigUtil; 13 | import com.twitter.elephantbird.util.TypeRef; 14 | 15 | /** 16 | * Serializes Pig Tuples into Base-64 encoded, line-delimited Thrift objects. 
17 | * The fields in the pig tuple must correspond exactly to the fields in 18 | * the Thrift object, as no name-matching is performed (names of the tuple 19 | * fields are not currently accessible to a StoreFunc. It will be in 0.7, 20 | * so something more flexible will be possible) 21 | */ 22 | public class LzoThriftBlockPigStorage> extends BaseStoreFunc { 23 | 24 | private TypeRef typeRef; 25 | private ThriftWritable writable; 26 | private PigToThrift pigToThrift; 27 | 28 | public LzoThriftBlockPigStorage(String thriftClassName) { 29 | typeRef = PigUtil.getThriftTypeRef(thriftClassName); 30 | writable = ThriftWritable.newInstance(typeRef.getRawClass()); 31 | pigToThrift = PigToThrift.newInstance(typeRef); 32 | } 33 | 34 | @Override 35 | @SuppressWarnings("unchecked") 36 | public void putNext(Tuple f) throws IOException { 37 | if (f == null) return; 38 | try { 39 | writable.set(pigToThrift.getThriftObject(f)); 40 | writer.write(null, writable); 41 | } catch (InterruptedException e) { 42 | throw new IOException(e); 43 | } 44 | } 45 | 46 | @Override 47 | public OutputFormat> getOutputFormat() throws IOException { 48 | return new LzoThriftBlockOutputFormat(typeRef); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/store/LzoTokenizedStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | /** 4 | * @deprecated use {@link LzoPigStorage} instead 5 | */ 6 | @Deprecated 7 | public class LzoTokenizedStorage extends LzoPigStorage { 8 | 9 | public LzoTokenizedStorage() { 10 | super(); 11 | } 12 | 13 | public LzoTokenizedStorage(String delimiter) { 14 | super(delimiter); 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/BytesWritableConverter.java: 
-------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.BytesWritable; 6 | import org.apache.hadoop.io.DataInputBuffer; 7 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 8 | import org.apache.pig.data.DataByteArray; 9 | import org.apache.pig.data.DataType; 10 | 11 | /** 12 | * Supports conversion between Pig bytearray and {@link org.apache.hadoop.io.BytesWritable}. 13 | * 14 | * @author Andy Schlaikjer 15 | */ 16 | public class BytesWritableConverter extends AbstractWritableConverter { 17 | private final DataInputBuffer in = new DataInputBuffer(); 18 | 19 | public BytesWritableConverter() { 20 | super(new BytesWritable()); 21 | } 22 | 23 | @Override 24 | public ResourceFieldSchema getLoadSchema() throws IOException { 25 | ResourceFieldSchema schema = new ResourceFieldSchema(); 26 | schema.setType(DataType.BYTEARRAY); 27 | return schema; 28 | } 29 | 30 | @Override 31 | public Object bytesToObject(DataByteArray dataByteArray) throws IOException { 32 | byte[] bytes = dataByteArray.get(); 33 | // test leading 4 bytes encode run length of rest of data 34 | in.reset(bytes, bytes.length); 35 | int length = in.readInt(); 36 | if (length != bytes.length - 4) { 37 | throw new IOException(String.format( 38 | "Int value '%d' of leading four bytes does not match run length of data '%d'", 39 | length, bytes.length - 4)); 40 | } 41 | return new DataByteArray(bytes, 4, bytes.length); 42 | } 43 | 44 | @Override 45 | public void checkStoreSchema(ResourceFieldSchema schema) throws IOException { 46 | switch (schema.getType()) { 47 | case DataType.BYTEARRAY: 48 | return; 49 | } 50 | throw new IOException("Pig type '" + DataType.findTypeName(schema.getType()) + "' unsupported"); 51 | } 52 | 53 | @Override 54 | protected BytesWritable toWritable(DataByteArray value) throws IOException { 55 | // avoid array copy at the sake of new 
BytesWritable 56 | return new BytesWritable(value.get()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/GenericWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | 5 | import com.google.common.base.Preconditions; 6 | 7 | import org.apache.hadoop.io.DataInputBuffer; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 10 | import org.apache.pig.data.DataByteArray; 11 | import org.apache.pig.data.DataType; 12 | 13 | /** 14 | * Supports conversion between Pig bytearray ({@link DataByteArray}) and an arbitrary 15 | * {@link Writable} implementation type. Useful for loading data from a SequenceFile when the key or 16 | * value must be passed through to output, but otherwise goes untouched by Pig. 17 | * 18 | * @author Andy Schlaikjer 19 | */ 20 | public class GenericWritableConverter extends AbstractWritableConverter { 21 | private final DataInputBuffer ibuf = new DataInputBuffer(); 22 | 23 | @Override 24 | public ResourceFieldSchema getLoadSchema() throws IOException { 25 | ResourceFieldSchema schema = new ResourceFieldSchema(); 26 | schema.setType(DataType.BYTEARRAY); 27 | return schema; 28 | } 29 | 30 | @Override 31 | public void checkStoreSchema(ResourceFieldSchema schema) throws IOException { 32 | Preconditions.checkNotNull(schema); 33 | if (schema.getType() != DataType.BYTEARRAY) 34 | throw new IOException("Expected Pig type '" + DataType.findTypeName(DataType.BYTEARRAY) 35 | + "' but found '" + DataType.findTypeName(schema.getType()) + "'"); 36 | } 37 | 38 | @Override 39 | protected Writable toWritable(DataByteArray value) throws IOException { 40 | Preconditions.checkNotNull(writable, "Writable is null"); 41 | byte[] bytes = value.get(); 42 | ibuf.reset(bytes, 
bytes.length); 43 | writable.readFields(ibuf); 44 | return writable; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/LazyThriftWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.thrift.TBase; 8 | 9 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 10 | 11 | /** 12 | * Supports conversion between Pig {@link Tuple} and {@link ThriftWritable} types. This is a 13 | * specialization of {@link ThriftWritableConverter} which uses 14 | * {@link ThriftToPig#getLazyTuple(TBase)} to generate Tuples from ThriftWritable instances. 15 | * 16 | * @author Andy Schlaikjer 17 | * @see ThriftWritableConverter 18 | */ 19 | public class LazyThriftWritableConverter> extends ThriftWritableConverter { 20 | public LazyThriftWritableConverter(String thriftClassName) { 21 | super(thriftClassName); 22 | } 23 | 24 | @Override 25 | protected Tuple toTuple(ThriftWritable writable, ResourceFieldSchema schema) 26 | throws IOException { 27 | return thriftToPig.getLazyTuple(writable.get()); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/LoadFuncTupleIterator.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.NoSuchElementException; 6 | 7 | import org.apache.pig.LoadFunc; 8 | import org.apache.pig.data.Tuple; 9 | 10 | /** 11 | * Utility to simplify iteration over Tuples loaded by some {@link LoadFunc}. 
12 | * 13 | * @author Andy Schlaikjer 14 | */ 15 | public class LoadFuncTupleIterator implements Iterator { 16 | private final LoadFunc loadFunc; 17 | private boolean hasNextCalled; 18 | private Tuple tuple; 19 | 20 | public LoadFuncTupleIterator(LoadFunc loadFunc) { 21 | super(); 22 | this.loadFunc = loadFunc; 23 | } 24 | 25 | @Override 26 | public boolean hasNext() { 27 | if (!hasNextCalled) { 28 | hasNextCalled = true; 29 | if (tuple == null) { 30 | try { 31 | tuple = loadFunc.getNext(); 32 | } catch (IOException e) { 33 | throw new RuntimeException(e); 34 | } 35 | } 36 | } 37 | return tuple != null; 38 | } 39 | 40 | @Override 41 | public Tuple next() { 42 | if (!hasNext()) 43 | throw new NoSuchElementException(); 44 | Tuple next = tuple; 45 | hasNextCalled = false; 46 | tuple = null; 47 | return next; 48 | } 49 | 50 | @Override 51 | public void remove() { 52 | throw new UnsupportedOperationException(); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/NullWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 7 | import org.apache.pig.data.DataByteArray; 8 | import org.apache.pig.data.DataType; 9 | 10 | /** 11 | * Supports conversion from NullWritable to Pig null, and from all Pig types to {@link NullWritable} 12 | * . 
13 | * 14 | * @author Andy Schlaikjer 15 | */ 16 | public class NullWritableConverter extends AbstractWritableConverter { 17 | public NullWritableConverter() { 18 | super(NullWritable.get()); 19 | } 20 | 21 | @Override 22 | public ResourceFieldSchema getLoadSchema() throws IOException { 23 | ResourceFieldSchema schema = new ResourceFieldSchema(); 24 | schema.setType(DataType.NULL); 25 | return schema; 26 | } 27 | 28 | @Override 29 | public Object bytesToObject(DataByteArray dataByteArray) throws IOException { 30 | return null; 31 | } 32 | 33 | @Override 34 | public NullWritable toWritable(Object value) throws IOException { 35 | return writable; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/PigTokenHelper.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | /** 4 | * A helper class to deal with standard Pig tokens and delimiters. 5 | */ 6 | public class PigTokenHelper { 7 | public static final byte DEFAULT_RECORD_DELIMITER = '\n'; 8 | public static final byte DEFAULT_FIELD_DELIMITER = '\t'; 9 | public static final String DEFAULT_FIELD_DELIMITER_STRING = "\\t"; 10 | 11 | // String constants for each delimiter 12 | public static final String TUPLE_BEGIN = "("; 13 | public static final String TUPLE_END = ")"; 14 | public static final String BAG_BEGIN = "{"; 15 | public static final String BAG_END = "}"; 16 | public static final String MAP_BEGIN = "["; 17 | public static final String MAP_END = "]"; 18 | public static final String MAP_KV = "#"; 19 | 20 | /** 21 | * Parse an input delimiter string, as with PigStorage, and return the byte it represents. 22 | * @param inputDelimiter the string passed in from the pig script. 23 | * @return the corresponding byte that will serve as the field separator. 
24 | */ 25 | public static byte evaluateDelimiter(String inputDelimiter) { 26 | if (inputDelimiter.length() == 1) { 27 | return inputDelimiter.getBytes()[0]; 28 | } else if (inputDelimiter.length() > 1 && inputDelimiter.charAt(0) == '\\') { 29 | switch (inputDelimiter.charAt(1)) { 30 | case 't': 31 | return (byte)'\t'; 32 | 33 | case 'x': 34 | case 'u': 35 | return Integer.valueOf(inputDelimiter.substring(2)).byteValue(); 36 | 37 | default: 38 | throw new IllegalArgumentException("Unknown delimiter " + inputDelimiter); 39 | } 40 | } else { 41 | throw new IllegalArgumentException("LzoTokenizedStorage delimeter must be a single character"); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/ProtobufTuple.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.util.List; 4 | 5 | import com.google.protobuf.Message; 6 | import com.google.protobuf.Descriptors.Descriptor; 7 | import com.google.protobuf.Descriptors.FieldDescriptor; 8 | 9 | @SuppressWarnings("serial") 10 | /** 11 | * This class wraps a protocol buffer message and attempts to delay parsing until individual 12 | * fields are requested. 
13 | */ 14 | public class ProtobufTuple extends AbstractLazyTuple { 15 | 16 | private final Message msg_; 17 | private final Descriptor descriptor_; 18 | private final List fieldDescriptors_; 19 | private final ProtobufToPig protoConv_; 20 | private final int protoSize_; 21 | 22 | public ProtobufTuple(Message msg) { 23 | msg_ = msg; 24 | descriptor_ = msg.getDescriptorForType(); 25 | fieldDescriptors_ = descriptor_.getFields(); 26 | protoSize_ = fieldDescriptors_.size(); 27 | protoConv_ = new ProtobufToPig(); 28 | initRealTuple(protoSize_); 29 | } 30 | 31 | protected Object getObjectAt(int idx) { 32 | FieldDescriptor fieldDescriptor = fieldDescriptors_.get(idx); 33 | Object fieldValue = msg_.getField(fieldDescriptor); 34 | return protoConv_.fieldToPig(fieldDescriptor, fieldValue); 35 | } 36 | 37 | @Override 38 | public long getMemorySize() { 39 | // The protobuf estimate is obviously inaccurate. 40 | return msg_.getSerializedSize() + realTuple.getMemorySize(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pig/src/main/java/com/twitter/elephantbird/pig/util/ResourceSchemaUtil.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.pig.LoadPushDown.RequiredField; 7 | import org.apache.pig.ResourceSchema; 8 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 9 | 10 | /** 11 | * Utilities for {@link ResourceSchema} and friends. 12 | * 13 | * @author Andy Schlaikjer 14 | */ 15 | public final class ResourceSchemaUtil { 16 | /** 17 | * Creates a new ResourceFieldSchema which reflects data from an input RequiredField. 18 | * 19 | * @param field 20 | * @return new ResourceFieldSchema which reflects {@code field}. 
21 | * @throws IOException 22 | */ 23 | public static ResourceFieldSchema createResourceFieldSchema(RequiredField field) 24 | throws IOException { 25 | ResourceFieldSchema schema = 26 | new ResourceFieldSchema().setName(field.getAlias()).setType(field.getType()); 27 | List subFields = field.getSubFields(); 28 | if (subFields != null && !subFields.isEmpty()) { 29 | ResourceFieldSchema[] subSchemaFields = new ResourceFieldSchema[subFields.size()]; 30 | int i = 0; 31 | for (RequiredField subField : subFields) { 32 | subSchemaFields[i++] = createResourceFieldSchema(subField); 33 | } 34 | ResourceSchema subSchema = new ResourceSchema(); 35 | subSchema.setFields(subSchemaFields); 36 | schema.setSchema(subSchema); 37 | } 38 | return schema; 39 | } 40 | 41 | private ResourceSchemaUtil() { 42 | // hide ctor 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/piggybank/TestPigToProto.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.piggybank; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import org.apache.pig.backend.executionengine.ExecException; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.thrift.TException; 8 | import org.junit.Test; 9 | 10 | import com.google.protobuf.Message; 11 | import com.twitter.data.proto.tutorial.AddressBookProtos.AddressBook; 12 | import com.twitter.elephantbird.examples.proto.ThriftFixtures.OneOfEach; 13 | import com.twitter.elephantbird.pig.util.PigToProtobuf; 14 | import com.twitter.elephantbird.pig.util.ThriftToPig; 15 | import com.twitter.elephantbird.util.ThriftToProto; 16 | 17 | 18 | public class TestPigToProto { 19 | 20 | @Test 21 | public void testPigToProto() throws ExecException, TException { 22 | Tuple abTuple = Fixtures.buildAddressBookTuple(); 23 | Message proto = PigToProtobuf.tupleToMessage(AddressBook.newBuilder(), abTuple); 24 
| assertEquals(Fixtures.buildAddressBookProto(), proto); 25 | 26 | // test with OneOfEach. 27 | thrift.test.OneOfEach thrift_ooe = org.apache.thrift.Fixtures.oneOfEach; 28 | OneOfEach proto_ooe = ThriftToProto.newInstance(thrift_ooe, OneOfEach.newBuilder().build()).convert(thrift_ooe); 29 | //tuple from Thrift ooe : 30 | Tuple tuple_ooe = ThriftToPig.newInstance(thrift.test.OneOfEach.class).getPigTuple(thrift_ooe); 31 | 32 | assertEquals(proto_ooe, PigToProtobuf.tupleToMessage(OneOfEach.class, tuple_ooe)); 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/piggybank/TimeProtoConversions.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.piggybank; 2 | 3 | import org.apache.commons.lang.time.StopWatch; 4 | import org.apache.pig.backend.executionengine.ExecException; 5 | import org.apache.pig.data.Tuple; 6 | 7 | import com.twitter.data.proto.tutorial.AddressBookProtos.Person; 8 | import com.twitter.elephantbird.pig.util.ProtobufToPig; 9 | import com.twitter.elephantbird.pig.util.ProtobufTuple; 10 | 11 | public class TimeProtoConversions { 12 | 13 | /** 14 | * @param args 15 | * @throws ExecException 16 | */ 17 | public static void main(String[] args) throws ExecException { 18 | int iterations = 100000; 19 | ProtobufToPig protoConv = new ProtobufToPig(); 20 | for (int i = 0; i < iterations; i++) { 21 | Person proto = Fixtures.buildPersonProto(); 22 | Tuple t = protoConv.toTuple(proto); 23 | t.get(0); 24 | t = new ProtobufTuple(proto); 25 | t.get(0); 26 | } 27 | StopWatch timer = new StopWatch(); 28 | timer.start(); 29 | for (int i = 0; i < iterations; i++) { 30 | Person proto = Fixtures.buildPersonProto(); 31 | Tuple t = protoConv.toTuple(proto); 32 | t.get(0); 33 | } 34 | timer.split(); 35 | System.err.println(timer.getSplitTime()); 36 | timer.reset(); 37 | timer.start(); 38 | for (int i = 
0; i < iterations; i++) { 39 | Person proto = Fixtures.buildPersonProto(); 40 | Tuple t = new ProtobufTuple(proto); 41 | t.get(0); 42 | } 43 | timer.split(); 44 | System.err.println(timer.getSplitTime()); 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/store/FixedArgsConstructorIntWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import com.twitter.elephantbird.pig.util.IntWritableConverter; 4 | 5 | /** 6 | * Simple WritableConverter impl which has no default constructor-- String arguments are required. 7 | * 8 | * @author Andy Schlaikjer 9 | */ 10 | public class FixedArgsConstructorIntWritableConverter extends IntWritableConverter { 11 | private final String a; 12 | private final String b; 13 | 14 | public FixedArgsConstructorIntWritableConverter(String a, String b) { 15 | this.a = a; 16 | this.b = b; 17 | } 18 | 19 | public String getA() { 20 | return a; 21 | } 22 | 23 | public String getB() { 24 | return b; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/store/VarArgsConstructorIntWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import com.twitter.elephantbird.pig.util.IntWritableConverter; 4 | 5 | /** 6 | * Simple WritableConverter impl which has no default constructor-- String arguments are required. 7 | * 8 | * @author Andy Schlaikjer 9 | */ 10 | public class VarArgsConstructorIntWritableConverter extends IntWritableConverter { 11 | private final String[] args; 12 | 13 | public VarArgsConstructorIntWritableConverter(String... 
args) { 14 | this.args = args; 15 | } 16 | 17 | public String[] getArgs() { 18 | return args; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/AbstractTestProtobufWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import com.google.protobuf.Message; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 6 | 7 | /** 8 | * @author Andy Schlaikjer 9 | */ 10 | public abstract class AbstractTestProtobufWritableConverter extends 11 | AbstractTestWritableConverter, ProtobufWritableConverter> { 12 | public AbstractTestProtobufWritableConverter(Class protobufClass, ProtobufWritable[] data, 13 | String[] expected, String valueSchema) { 14 | super(castProtobufWritableClass(protobufClass, ProtobufWritable.class), 15 | castProtobufWritableConverterClass(protobufClass, ProtobufWritableConverter.class), 16 | protobufClass.getName(), data, expected, valueSchema); 17 | } 18 | 19 | @SuppressWarnings("unchecked") 20 | private static Class> castProtobufWritableClass( 21 | Class protobufClass, Class cls) { 22 | return (Class>) cls; 23 | } 24 | 25 | @SuppressWarnings("unchecked") 26 | private static Class> castProtobufWritableConverterClass( 27 | Class protobufClass, Class cls) { 28 | return (Class>) cls; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/AbstractTestThriftNameWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.lang.reflect.Array; 4 | 5 | import com.twitter.elephantbird.thrift.test.Name; 6 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 7 | import com.twitter.elephantbird.util.ThriftUtils; 8 | import 
com.twitter.elephantbird.util.TypeRef; 9 | 10 | /** 11 | * @author Andy Schlaikjer 12 | */ 13 | public abstract class AbstractTestThriftNameWritableConverter, C extends ThriftWritableConverter> 14 | extends AbstractTestThriftWritableConverter { 15 | public static final TypeRef TYPE_REF = ThriftUtils.getTypeRef(Name.class); 16 | private static final Name V1 = new Name("Jon", "Smith"); 17 | private static final Name V2 = new Name("John", "Doe"); 18 | private static final Name V3 = new Name("Mary", "Jane"); 19 | private static final String[] EXPECTED = { "(Jon,Smith)", "(John,Doe)", "(Mary,Jane)" }; 20 | 21 | public AbstractTestThriftNameWritableConverter(Class writableClass, 22 | Class writableConverterClass) { 23 | super(Name.class, writableClass, writableConverterClass, getData(writableClass), EXPECTED, 24 | "tuple()"); 25 | } 26 | 27 | protected static > W[] getData(Class writableClass) { 28 | try { 29 | @SuppressWarnings("unchecked") 30 | W[] ws = (W[]) Array.newInstance(writableClass, 3); 31 | ws[0] = writableClass.newInstance(); 32 | ws[1] = writableClass.newInstance(); 33 | ws[2] = writableClass.newInstance(); 34 | ws[0].setConverter(Name.class); 35 | ws[1].setConverter(Name.class); 36 | ws[2].setConverter(Name.class); 37 | ws[0].set(V1); 38 | ws[1].set(V2); 39 | ws[2].set(V3); 40 | return ws; 41 | } catch (Exception e) { 42 | throw new RuntimeException(e); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/AbstractTestThriftWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import org.apache.thrift.TBase; 4 | 5 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 6 | 7 | /** 8 | * @author Andy Schlaikjer 9 | */ 10 | public abstract class AbstractTestThriftWritableConverter> extends 11 | AbstractTestWritableConverter, 
ThriftWritableConverter> { 12 | public AbstractTestThriftWritableConverter(Class thriftClass, 13 | Class> writableClass, 14 | Class> writableConverterClass, ThriftWritable[] data, 15 | String[] expected, String valueSchema) { 16 | super(writableClass, writableConverterClass, thriftClass.getName(), data, expected, valueSchema); 17 | } 18 | 19 | @SuppressWarnings("unchecked") 20 | public static , W extends ThriftWritable> Class getWritableClass( 21 | Class thriftClass, Class writableClass) { 22 | return (Class) writableClass; 23 | } 24 | 25 | @SuppressWarnings("unchecked") 26 | public static , W extends ThriftWritable, C extends ThriftWritableConverter> Class getWritableConverterClass( 27 | Class thriftClass, Class writableClass, Class writableConverterClass) { 28 | return (Class) writableConverterClass; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/IntegrationTestIntWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | 5 | /** 6 | * @author Andy Schlaikjer 7 | */ 8 | public class IntegrationTestIntWritableConverter extends 9 | AbstractTestWritableConverter { 10 | private static final IntWritable[] DATA = { new IntWritable(1), new IntWritable(2), 11 | new IntWritable(3) }; 12 | private static final String[] EXPECTED = { "1", "2", "3" }; 13 | 14 | public IntegrationTestIntWritableConverter() { 15 | super(IntWritable.class, IntWritableConverter.class, "", DATA, EXPECTED, "int"); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/IntegrationTestLongWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import 
org.apache.hadoop.io.LongWritable; 4 | 5 | /** 6 | * @author Andy Schlaikjer 7 | */ 8 | public class IntegrationTestLongWritableConverter extends 9 | AbstractTestWritableConverter { 10 | private static final LongWritable[] DATA = { new LongWritable(1), new LongWritable(2), 11 | new LongWritable(4294967296l) }; 12 | private static final String[] EXPECTED = { "1", "2", "4294967296" }; 13 | 14 | public IntegrationTestLongWritableConverter() { 15 | super(LongWritable.class, LongWritableConverter.class, "", DATA, EXPECTED, "long"); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/IntegrationTestTextConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.junit.Test; 7 | 8 | import com.twitter.elephantbird.pig.store.SequenceFileStorage; 9 | 10 | /** 11 | * @author Andy Schlaikjer 12 | */ 13 | public class IntegrationTestTextConverter extends 14 | AbstractTestWritableConverter { 15 | private static final String V1 = "one, two, buckle my shoe"; 16 | private static final String V2 = "three, four, knock on my door"; 17 | private static final String V3 = "five, six, pickup sticks"; 18 | private static final Text[] DATA = { new Text(V1), new Text(V2), new Text(V3) }; 19 | private static final String[] EXPECTED = { V1, V2, V3 }; 20 | 21 | public IntegrationTestTextConverter() { 22 | super(Text.class, TextConverter.class, "", DATA, EXPECTED, "chararray"); 23 | } 24 | 25 | @Test 26 | public void testDefaultCtor() throws IOException { 27 | pigServer.registerQuery(String.format("A = LOAD 'file:%s' USING %s();", tempFilename, 28 | SequenceFileStorage.class.getName())); 29 | validate(pigServer.openIterator("A")); 30 | } 31 | 32 | @Test 33 | public void testDefaultCtor02() throws IOException { 34 | 
pigServer.registerQuery(String.format("A = LOAD 'file:%s' USING %s('', '');", tempFilename, 35 | SequenceFileStorage.class.getName())); 36 | validate(pigServer.openIterator("A")); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/PigTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import org.apache.pig.ExecType; 4 | import org.apache.pig.PigServer; 5 | import org.apache.pig.backend.executionengine.ExecException; 6 | 7 | /** 8 | * Common test utilities 9 | */ 10 | public class PigTestUtil { 11 | 12 | /** 13 | * Creates a new PigServer in local mode. 14 | * Sets pig properties for lzo codec and temp directory. 15 | */ 16 | static public PigServer makePigServer() throws ExecException { 17 | 18 | PigServer pigServer = new PigServer(ExecType.LOCAL); 19 | // set lzo codec: 20 | pigServer.getPigContext().getProperties().setProperty( 21 | "io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec"); 22 | 23 | pigServer.getPigContext().getProperties().setProperty( 24 | "pig.temp.dir", System.getProperty("test.build.data") + "/pig-temp"); 25 | 26 | return pigServer; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/TestProtobufWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import com.twitter.data.proto.tutorial.AddressBookProtos.Person; 4 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 5 | import com.twitter.elephantbird.util.Protobufs; 6 | import com.twitter.elephantbird.util.TypeRef; 7 | 8 | /** 9 | * @author Andy Schlaikjer 10 | */ 11 | public class TestProtobufWritableConverter extends AbstractTestProtobufWritableConverter { 12 | private 
static final TypeRef TYPE_REF = Protobufs.getTypeRef(Person.class.getName()); 13 | private static final Person V1 = Person.newBuilder().setId(1).setName("Jon Smith").build(); 14 | private static final Person V3 = Person.newBuilder().setId(3).setName("Mary Jane").build(); 15 | private static final Person V2 = Person.newBuilder().setId(2).setName("John Doe").build(); 16 | private static final ProtobufWritable[] DATA = { new ProtobufWritable(V1, TYPE_REF), 17 | new ProtobufWritable(V2, TYPE_REF), new ProtobufWritable(V3, TYPE_REF) }; 18 | private static final String[] EXPECTED = { "(Jon Smith,1,,{})", "(John Doe,2,,{})", 19 | "(Mary Jane,3,,{})" }; 20 | 21 | @SuppressWarnings("unchecked") 22 | public TestProtobufWritableConverter() { 23 | super(Person.class, (ProtobufWritable[]) DATA, EXPECTED, "()"); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/TestThriftNameWritableConverter.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import com.twitter.elephantbird.thrift.test.Name; 4 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 5 | 6 | /** 7 | * @author Andy Schlaikjer 8 | */ 9 | public class TestThriftNameWritableConverter extends 10 | AbstractTestThriftNameWritableConverter, ThriftWritableConverter> { 11 | public TestThriftNameWritableConverter() { 12 | super(getWritableClass(Name.class, ThriftWritable.class), getWritableConverterClass(Name.class, 13 | getWritableClass(Name.class, ThriftWritable.class), ThriftWritableConverter.class)); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/TestThriftNameWritableConverterCustom.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 
3 | import com.twitter.elephantbird.thrift.test.Name; 4 | 5 | /** 6 | * @author Andy Schlaikjer 7 | */ 8 | public class TestThriftNameWritableConverterCustom extends 9 | AbstractTestThriftNameWritableConverter> { 10 | public TestThriftNameWritableConverterCustom() { 11 | super(ThriftNameWritable.class, getWritableConverterClass(Name.class, ThriftNameWritable.class, 12 | ThriftWritableConverter.class)); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/pig/util/ThriftNameWritable.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.util; 2 | 3 | import com.twitter.elephantbird.thrift.test.Name; 4 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 5 | import com.twitter.elephantbird.util.ThriftUtils; 6 | 7 | /** 8 | * @author Andy Schlaikjer 9 | */ 10 | public class ThriftNameWritable extends ThriftWritable { 11 | public ThriftNameWritable() { 12 | super(ThriftUtils.getTypeRef(Name.class)); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/test/java/com/twitter/elephantbird/util/TestProtobufs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.util; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import org.junit.Test; 6 | 7 | import com.google.common.base.Function; 8 | import com.google.protobuf.Message; 9 | import com.twitter.data.proto.tutorial.AddressBookProtos.AddressBook; 10 | import com.twitter.data.proto.tutorial.AddressBookProtos.Person; 11 | import com.twitter.elephantbird.mapreduce.io.DecodeException; 12 | import com.twitter.elephantbird.mapreduce.io.ProtobufConverter; 13 | import com.twitter.elephantbird.pig.piggybank.Fixtures; 14 | import com.twitter.elephantbird.util.Protobufs; 15 | 16 | public class TestProtobufs { 17 | 18 | private 
static final AddressBook ab_ = Fixtures.buildAddressBookProto(); 19 | private static final byte[] abBytes_ = ab_.toByteArray(); 20 | 21 | @Test 22 | public void testGetInnerProtobufClass() { 23 | String canonicalClassName = "com.twitter.data.proto.tutorial.AddressBookProtos.Person"; 24 | Class klass = Protobufs.getInnerProtobufClass(canonicalClassName); 25 | assertEquals(klass, Person.class); 26 | } 27 | 28 | @Test 29 | public void testDynamicParsing() { 30 | assertEquals(ab_, Protobufs.parseDynamicFrom(AddressBook.class, abBytes_)); 31 | } 32 | 33 | @Test 34 | public void testStaticParsing() { 35 | assertEquals(ab_, Protobufs.parseFrom(AddressBook.class, abBytes_)); 36 | } 37 | 38 | @Test 39 | public void testConverterParsing() throws DecodeException { 40 | ProtobufConverter protoConverter = ProtobufConverter.newInstance(AddressBook.class); 41 | assertEquals(ab_, protoConverter.fromBytes(abBytes_)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pig/src/test/resources/W3CLogParser.invalid.txt: -------------------------------------------------------------------------------- 1 | 487077199 1261350000 1261350000 searchweb008.twitter.com 1261350047-27526-10265 - - - - - 10134881.800 26496.750 0.107 0.027 0.010 0.080 0.070 0.057 0.047 0.000 0.010 0.010 - - - - - 0.142 0.132 0.000 0.010 0.010 - - - - - 141276 - 141277 0.000 - - - - - 6671269565 202697636 0.001 0.001 0.000 0.000 0.000 31683409 3063908 0.000 - 0.000 10265 740552 229.480 3 359.469 0.000 - - SearchController::search SearchController - 0.006 0.006 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 - 0.002 0.002 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.397 0.257 0.010 0.140 0.130 application/json - 1 - GET /search.json - rpp=100&q=%23neh - 68.255.61.253 0.000 0.000 0.000 0.000 0.000 0.700 0.550 0.010 0.150 0.140 0.401 0.261 0.010 0.140 0.130 Tweetie1.2.2_CFNetwork/438.14_Darwin/9.8.0_(i386)_(MacBookPro5%2C1) 0.000 0.000 0.000 0.000 0.000 68.255.61.253 
9669 200 20 0.003 0.003 0.000 0.000 0.000 529935 - - - - - - - - - - - 1260820112 0.000 0.000 0.000 0.000 0.000 2 | 487077198 1261350047 1261350000 1261350000 searchweb008.twitter.com 1261350047-12279-9607 - - - - - 4625285.133 12268.100 - - - - - - - - - - - - - - - 0.062 0.062 0.000 0.000 0.000 0.104 0.074 0.000 0.030 0.030 73744 - 73745 0.000 - - - - - 6735210058 138758554 - - - - - 31894719 3080021 0.000 - 0.000 9607 741029 234.312 3 358.473 0.000 - - SearchController::search SearchController - 0.011 0.001 0.000 0.010 0.010 0.000 0.000 0.000 0.000 0.000 - 0.001 0.001 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.166 0.106 0.000 0.060 0.060 application/json - 1 en-us,en;q=0.5 GET /search.json - _=&lang=en&q=%23ab09&rpp=30 https://www.arabloggers.com/2009/12/19/interview-with-lina-ben-mhenni/ 88.110.220.34 0.000 0.000 0.000 0.000 0.000 0.324 0.254 0.000 0.070 0.070 0.170 0.110 0.000 0.060 0.060 Mozilla/5.0_(Macintosh;_U;_Intel_Mac_OS_X_10.5;_en-US;_rv:1.9.1.5)_Gecko/20091102_Firefox/3.5.5 0.000 0.000 0.000 0.000 0.000 88.110.220.34 14192 200 30 0.002 0.002 0.000 0.000 0.000 368043 - - - - - - - - - - 1 1260982004 0.016 0.006 0.000 0.010 0.010 3 | -------------------------------------------------------------------------------- /pig/src/test/resources/test-log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Set root logger level to INFO and its only appender to A1. 17 | log4j.rootLogger=INFO, A1 18 | 19 | # A1 is set to be a ConsoleAppender. 20 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 21 | 22 | # A1 uses PatternLayout. 23 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 25 | -------------------------------------------------------------------------------- /pig/src/test/thrift/map_keys.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | namespace java com.twitter.elephantbird.pig.test.thrift 20 | 21 | enum KeyEnum { 22 | A, 23 | B, 24 | C 25 | } 26 | 27 | struct MapKeyTest { 28 | 1: optional map booleans 29 | 2: optional map bytes 30 | 3: optional map shorts 31 | 4: optional map ints 32 | 5: optional map longs 33 | 6: optional map doubles 34 | 7: optional map enums 35 | 8: optional map strings 36 | 9: optional map binaries 37 | } 38 | -------------------------------------------------------------------------------- /rcfile/src/main/java/com/twitter/elephantbird/mapreduce/input/RCFileBaseInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.mapreduce.input; 2 | 3 | import org.apache.hadoop.hive.ql.io.RCFileInputFormat; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Writable; 6 | import org.apache.hadoop.mapreduce.InputSplit; 7 | import org.apache.hadoop.mapreduce.RecordReader; 8 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * Base input format for Thrift and Protobuf RCFile input formats.
14 | * contains a few common common utility methods. 15 | */ 16 | public abstract class RCFileBaseInputFormat extends MapReduceInputFormatWrapper { 17 | 18 | /** internal, for MR use only. */ 19 | @SuppressWarnings({ "unchecked", "rawtypes" }) 20 | public RCFileBaseInputFormat() { 21 | super(new RCFileInputFormat()); 22 | } 23 | 24 | /** 25 | * returns super.createRecordReader(split, taskAttempt). This is useful when 26 | * a sub class has its own their own wrapper over the base recordreader. 27 | */ 28 | protected final RecordReader 29 | createUnwrappedRecordReader(InputSplit split, TaskAttemptContext taskAttempt) 30 | throws IOException, InterruptedException { 31 | return super.createRecordReader(split, taskAttempt); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /rcfile/src/main/java/com/twitter/elephantbird/pig/store/RCFileProtobufPigStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.OutputFormat; 6 | import org.apache.pig.data.Tuple; 7 | 8 | import com.google.protobuf.Message; 9 | import com.google.protobuf.Message.Builder; 10 | import com.twitter.elephantbird.mapreduce.io.ProtobufWritable; 11 | import com.twitter.elephantbird.mapreduce.output.RCFileProtobufOutputFormat; 12 | import com.twitter.elephantbird.pig.util.PigToProtobuf; 13 | import com.twitter.elephantbird.util.Protobufs; 14 | import com.twitter.elephantbird.util.TypeRef; 15 | 16 | /** 17 | * StoreFunc for storing Protobuf messages in RCFiles.

18 | * 19 | * @see {@link RCFileProtobufOutputFormat} 20 | */ 21 | public class RCFileProtobufPigStorage extends BaseStoreFunc { 22 | // add stats? 23 | 24 | private TypeRef typeRef; 25 | private Builder msgBuilder; 26 | private ProtobufWritable writable; 27 | 28 | public RCFileProtobufPigStorage(String protoClassName) { 29 | typeRef = Protobufs.getTypeRef(protoClassName); 30 | msgBuilder = Protobufs.getMessageBuilder(typeRef.getRawClass()); 31 | writable = ProtobufWritable.newInstance(Message.class); 32 | } 33 | 34 | @Override @SuppressWarnings("unchecked") 35 | public OutputFormat getOutputFormat() throws IOException { 36 | return new RCFileProtobufOutputFormat(typeRef); 37 | } 38 | 39 | public void putNext(Tuple t) throws IOException { 40 | Message msg = PigToProtobuf.tupleToMessage(msgBuilder.clone(), t); 41 | writable.set(msg); 42 | writeRecord(null, writable); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /rcfile/src/main/java/com/twitter/elephantbird/pig/store/RCFileThriftPigStorage.java: -------------------------------------------------------------------------------- 1 | package com.twitter.elephantbird.pig.store; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.mapreduce.OutputFormat; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.thrift.TBase; 8 | 9 | import com.twitter.elephantbird.mapreduce.io.ThriftWritable; 10 | import com.twitter.elephantbird.mapreduce.output.RCFileThriftOutputFormat; 11 | import com.twitter.elephantbird.pig.util.PigToThrift; 12 | import com.twitter.elephantbird.util.ThriftUtils; 13 | import com.twitter.elephantbird.util.TypeRef; 14 | 15 | /** 16 | * StoreFunc for storing Thrift objects in RCFiles.

17 | * 18 | * @see RCFileThriftOutputFormat 19 | */ 20 | public class RCFileThriftPigStorage extends BaseStoreFunc { 21 | // add stats? 22 | 23 | private final TypeRef> typeRef; 24 | private final ThriftWritable> writable; 25 | private final PigToThrift> pigToThrift; 26 | 27 | @SuppressWarnings("unchecked") 28 | public RCFileThriftPigStorage(String thriftClassName) { 29 | typeRef = ThriftUtils.getTypeRef(thriftClassName); 30 | pigToThrift = (PigToThrift>) PigToThrift.newInstance(typeRef); 31 | writable = new ThriftWritable(typeRef); 32 | } 33 | 34 | @Override @SuppressWarnings("unchecked") 35 | public OutputFormat getOutputFormat() throws IOException { 36 | return new RCFileThriftOutputFormat(typeRef); 37 | } 38 | 39 | public void putNext(Tuple t) throws IOException { 40 | TBase tObj = pigToThrift.getThriftObject(t); 41 | writable.set(tObj); 42 | writeRecord(null, writable); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.jar.md5: -------------------------------------------------------------------------------- 1 | 55b6900f14c9f9c29e74b05b977d1ab6 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.jar.sha1: -------------------------------------------------------------------------------- 1 | 9b49f75f6f91f17c32049adbdadb327e98fef1e0 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.pom.md5: 
-------------------------------------------------------------------------------- 1 | 30cc37678f4c503e33dd3a2f335c5743 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.10/elephant-bird-2.1.10.pom.sha1: -------------------------------------------------------------------------------- 1 | e7fee57f8fa5262186b42f52b79a76a1a9edca91 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.jar.md5: -------------------------------------------------------------------------------- 1 | 6e8f4950da3af1258c83003c022ec0b8 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.jar.sha1: -------------------------------------------------------------------------------- 1 | 86408d17c4d17778ba10e037d98e5cf2daa39ec0 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.pom.md5: -------------------------------------------------------------------------------- 1 | 23faa3a212ea2e6c9e9a509952d8383d -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.11/elephant-bird-2.1.11.pom.sha1: -------------------------------------------------------------------------------- 1 | 2be69f981a585b1c6150a10ab2bbfdeed812d44c -------------------------------------------------------------------------------- 
/repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.jar.md5: -------------------------------------------------------------------------------- 1 | 178476b9384af4a8280182989e8431ec -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.jar.sha1: -------------------------------------------------------------------------------- 1 | 0b8ed162891872d6d34171795d18d6a85de1f0ef -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.pom.md5: -------------------------------------------------------------------------------- 1 | 4b3d26540bed5122e4c064c71d272621 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.5/elephant-bird-2.1.5.pom.sha1: -------------------------------------------------------------------------------- 1 | 534164537caa351cc210dda82b2c6e353be8200d -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.jar.md5: -------------------------------------------------------------------------------- 1 | 
a1704e4693f14b94816120ff12df3e86 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.jar.sha1: -------------------------------------------------------------------------------- 1 | b8cb52cab21750f582d5d37c82d70df348e245a1 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.pom.md5: -------------------------------------------------------------------------------- 1 | de400137b656a88de219dfd2f335aaed -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.6/elephant-bird-2.1.6.pom.sha1: -------------------------------------------------------------------------------- 1 | b80d034ea84e9e7ab6e53fba8eb0f3813f91f90c -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.jar.md5: -------------------------------------------------------------------------------- 1 | fb0440f29625e97524f467ac0d830282 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.jar.sha1: -------------------------------------------------------------------------------- 1 | 27f94b63a0bd9ef4b13c05a8a5a7c670aa4fdc10 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.pom.md5: 
-------------------------------------------------------------------------------- 1 | 5ea3a78406f34e6cc8f28b324754b87c -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.7/elephant-bird-2.1.7.pom.sha1: -------------------------------------------------------------------------------- 1 | 2c76ec725b23ae6a6d292be61eb42c0b6adb9a1a -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.jar.md5: -------------------------------------------------------------------------------- 1 | 1839af2428a499c6515f14bae55fb3ac -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.jar.sha1: -------------------------------------------------------------------------------- 1 | 60d7badd7ab42adbb4ee6375b8dccf698b4b00ae -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.pom.md5: -------------------------------------------------------------------------------- 1 | abc6012175bedf892ad75590f0bc0339 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.8/elephant-bird-2.1.8.pom.sha1: -------------------------------------------------------------------------------- 1 | d86b6bfa54102712dfe1c9611297f8b0299a691f -------------------------------------------------------------------------------- 
/repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.jar.md5: -------------------------------------------------------------------------------- 1 | 8b3c9d5629e10e4f5908f229b67fbc17 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.jar.sha1: -------------------------------------------------------------------------------- 1 | 1e189844b4c213a0522e82e3e1bd23733f8c9420 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.pom.md5: -------------------------------------------------------------------------------- 1 | 6a03f2abc7ce6b7c347cdce78fefdab1 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.1.9/elephant-bird-2.1.9.pom.sha1: -------------------------------------------------------------------------------- 1 | 8b329a404287af1f4089d68d71dc7def787615d5 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.jar.md5: -------------------------------------------------------------------------------- 1 | 
c4c0a613e1c6758efa8f9cc16ab81a00 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.jar.sha1: -------------------------------------------------------------------------------- 1 | 7c92711d1eaa6281dca74777608ed3209e6c9cc3 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.pom.md5: -------------------------------------------------------------------------------- 1 | 1e088cc4afcd993fe9bfaf0ff798b00d -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.0/elephant-bird-2.2.0.pom.sha1: -------------------------------------------------------------------------------- 1 | 6d29c0ccfc5b72989f97ef9806b0a323453992b4 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.jar.md5: -------------------------------------------------------------------------------- 1 | 4c0e4d222c7a6fc00aa8cfad80b171a9 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.jar.sha1: -------------------------------------------------------------------------------- 1 | bb0302e00012716b0e2e3464552b1ab66cf581dc -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.pom.md5: 
-------------------------------------------------------------------------------- 1 | a3b0db1b730050c9c16beb9918e0b683 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.1/elephant-bird-2.2.1.pom.sha1: -------------------------------------------------------------------------------- 1 | 337b5327c7772ad96d43824b7cf50ecc32db17b8 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.jar.md5: -------------------------------------------------------------------------------- 1 | 9888cb206ab0011486e3fc396b61ddb2 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.jar.sha1: -------------------------------------------------------------------------------- 1 | 9b69c73390ffe41f08983fed7412ded5cb6e5715 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.pom.md5: -------------------------------------------------------------------------------- 1 | 7ea8f5a469b74c9d9f7efed275f6aa71 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.2/elephant-bird-2.2.2.pom.sha1: -------------------------------------------------------------------------------- 1 | d9c4bb5e584a9ec9a646fbb4e8d786c8eaab92ec -------------------------------------------------------------------------------- 
/repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/elephant-bird/3ae48b10bc56b2d66de45739ef7d6aad821c06e0/repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.jar -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.jar.md5: -------------------------------------------------------------------------------- 1 | 0ea27110934bcc56a59f963d4956846b -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.jar.sha1: -------------------------------------------------------------------------------- 1 | 5f9f0b258c730ec2ae5315ddd079ba7152b58629 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.pom.md5: -------------------------------------------------------------------------------- 1 | e73dc763d3dee6df114236be8439914e -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/2.2.3/elephant-bird-2.2.3.pom.sha1: -------------------------------------------------------------------------------- 1 | 005eefe557c5919a2f58e025ed0bed1c03ff1190 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/maven-metadata.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | com.twitter 4 | elephant-bird 5 | 2.1.5 6 | 7 | 8 | 2.1.5 9 | 2.1.6 10 | 2.1.7 11 | 2.1.8 12 | 2.1.9 13 | 2.1.10 14 | 2.1.11 15 | 2.2.0 16 | 2.2.1 17 | 2.2.2 18 | 2.2.3 19 | 20 | 20120525000108 21 | 22 | 23 | -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/maven-metadata.xml.md5: 
-------------------------------------------------------------------------------- 1 | ec903bf8a92920fcbc5f77c6dae31b10 -------------------------------------------------------------------------------- /repo/com/twitter/elephant-bird/maven-metadata.xml.sha1: -------------------------------------------------------------------------------- 1 | c7291c6930e70ea9019542e9f1e11fc39b7d8b2b --------------------------------------------------------------------------------