├── NOTICE.txt ├── src ├── main │ └── java │ │ └── com │ │ └── m6d │ │ └── filecrush │ │ ├── crush │ │ ├── ReducerCounter.java │ │ ├── MapperCounter.java │ │ ├── FileStatusHasSize.java │ │ ├── CountersMapper.java │ │ ├── CrushPartitioner.java │ │ ├── CountersInputFormat.java │ │ ├── KeyValuePreservingTextInputFormat.java │ │ ├── Bucketer.java │ │ └── CrushReducer.java │ │ └── clean │ │ └── Clean.java └── test │ ├── java │ └── com │ │ └── m6d │ │ └── filecrush │ │ ├── crush │ │ ├── CountersMapperTest.java │ │ ├── BucketerTest.java │ │ ├── KeyValuePreservingRecordReaderDelegationTest.java │ │ ├── KeyValuePreservingRecordReaderNextTest.java │ │ ├── CrushStandAloneSequenceFileTest.java │ │ ├── CrushStandAloneTextTest.java │ │ ├── CrushPartitionerTest.java │ │ ├── BucketerParameterizedTest.java │ │ ├── CrushReducerTest.java │ │ ├── CrushOptionParsingTest.java │ │ ├── CrushReducerParameterizedTest.java │ │ └── CrushTest.java │ │ └── clean │ │ └── TestClean.java │ └── resources │ └── help.txt ├── pom.xml └── README /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Hadoop Filecrush 2 | Copyright 2010-2013 m6d Media6degrees 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/ReducerCounter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | public enum ReducerCounter { 19 | FILES_CRUSHED, RECORDS_CRUSHED 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/MapperCounter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
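ReducerCounter above and the MapperCounter enum that follows are plain enums reported through the old mapred counter API. A minimal sketch of how a planning-phase mapper would bump them, assuming an illustrative mapper shell that is not part of this repository (only the MapperCounter values are real):

    // Sketch only: incrementing the crush counters from an old-API mapper.
    public class CountingMapperSketch extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> out, Reporter reporter)
                throws IOException {
            reporter.incrCounter(MapperCounter.FILES_FOUND, 1);    // one input file examined
            reporter.incrCounter(MapperCounter.FILES_ELIGIBLE, 1); // small enough to be crushed
            out.collect(key, value);
        }
    }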
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | public enum MapperCounter { 19 | DIRS_FOUND, DIRS_SKIPPED, DIRS_ELIGIBLE, FILES_FOUND, FILES_SKIPPED, FILES_ELIGIBLE 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/FileStatusHasSize.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import org.apache.hadoop.fs.FileStatus; 19 | 20 | import com.m6d.filecrush.crush.Bucketer.HasSize; 21 | 22 | 23 | class FileStatusHasSize implements HasSize { 24 | 25 | private final FileStatus fileStatus; 26 | 27 | public FileStatusHasSize(FileStatus fileStatus) { 28 | super(); 29 | 30 | if (null == fileStatus) { 31 | throw new NullPointerException("File status"); 32 | } 33 | 34 | this.fileStatus = fileStatus; 35 | } 36 | 37 | @Override 38 | public String id() { 39 | return fileStatus.getPath().toUri().getPath(); 40 | } 41 | 42 | @Override 43 | public long size() { 44 | return fileStatus.getLen(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CountersMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.io.NullWritable; 21 | import org.apache.hadoop.io.Text; 22 | import org.apache.hadoop.mapred.Counters; 23 | import org.apache.hadoop.mapred.Counters.Counter; 24 | import org.apache.hadoop.mapred.Counters.Group; 25 | import org.apache.hadoop.mapred.JobConf; 26 | import org.apache.hadoop.mapred.Mapper; 27 | import org.apache.hadoop.mapred.OutputCollector; 28 | import org.apache.hadoop.mapred.Reporter; 29 | 30 | /** 31 | * Exists only to load the counters created during the planning phase into the reporter. 32 | */ 33 | @SuppressWarnings("deprecation") 34 | public class CountersMapper implements Mapper { 35 | 36 | @Override 37 | public void configure(JobConf job) { 38 | /* 39 | * Nothing to do here. 
40 | */ 41 | } 42 | 43 | @Override 44 | public void map(Counters key, NullWritable value, OutputCollector collector, Reporter reporter) throws IOException { 45 | for (Group group : key) { 46 | for (Counter counter : group) { 47 | reporter.incrCounter(group.getName(), counter.getName(), counter.getValue()); 48 | } 49 | } 50 | } 51 | 52 | @Override 53 | public void close() throws IOException { 54 | /* 55 | * Nothing to do here. 56 | */ 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CountersMapperTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.mapred.Counters; 21 | import org.apache.hadoop.mapred.Reporter; 22 | import org.easymock.EasyMockSupport; 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | 26 | import com.m6d.filecrush.crush.CountersMapper; 27 | import com.m6d.filecrush.crush.MapperCounter; 28 | 29 | @SuppressWarnings("deprecation") 30 | public class CountersMapperTest extends EasyMockSupport { 31 | 32 | private Reporter reporter; 33 | 34 | private CountersMapper mapper; 35 | 36 | @Before 37 | public void before() { 38 | reporter = createMock("reporter", Reporter.class); 39 | 40 | mapper = new CountersMapper(); 41 | } 42 | 43 | @Test 44 | public void map() throws IOException { 45 | Counters counters = new Counters(); 46 | 47 | counters.incrCounter(MapperCounter.DIRS_FOUND, 1); 48 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_FOUND.name(), 1); 49 | 50 | counters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 2); 51 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_ELIGIBLE.name(), 2); 52 | 53 | counters.incrCounter(MapperCounter.DIRS_SKIPPED, 3); 54 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_SKIPPED.name(), 3); 55 | 56 | counters.incrCounter(MapperCounter.FILES_FOUND, 4); 57 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.FILES_FOUND.name(), 4); 58 | 59 | counters.incrCounter(MapperCounter.FILES_SKIPPED, 5); 60 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.FILES_SKIPPED.name(), 5); 61 | 62 | replayAll(); 63 | 64 | mapper.map(counters, null, null, reporter); 65 | 66 | verifyAll(); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/BucketerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.util.Collections.emptyList; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.junit.Assert.assertThat; 21 | import static org.junit.Assert.fail; 22 | 23 | import org.apache.hadoop.fs.FileStatus; 24 | import org.junit.Before; 25 | import org.junit.Test; 26 | 27 | import com.m6d.filecrush.crush.Bucketer; 28 | import com.m6d.filecrush.crush.FileStatusHasSize; 29 | import com.m6d.filecrush.crush.Bucketer.HasSize; 30 | 31 | 32 | public class BucketerTest { 33 | 34 | private Bucketer bucketer; 35 | 36 | @Before 37 | public void before() { 38 | bucketer = new Bucketer(5, 50, true); 39 | } 40 | 41 | @Test(expected = IllegalStateException.class) 42 | public void callAddBeforeReset() { 43 | bucketer.add(new FileStatusHasSize(new FileStatus())); 44 | } 45 | 46 | @Test(expected = IllegalStateException.class) 47 | public void callCreateBeforeReset() { 48 | bucketer.createBuckets(); 49 | } 50 | 51 | @Test 52 | public void addNullCheck() { 53 | bucketer.reset("foo"); 54 | 55 | try { 56 | bucketer.add(null); 57 | fail(); 58 | } catch (NullPointerException ok) { 59 | } 60 | } 61 | 62 | @Test(expected = NullPointerException.class) 63 | public void resestNullCheck() { 64 | bucketer.reset(null); 65 | } 66 | 67 | @Test(expected = IllegalArgumentException.class) 68 | public void resestEmptyCheck() { 69 | bucketer.reset(""); 70 | } 71 | 72 | @Test 73 | public void nothingAdded() { 74 | bucketer.reset("test"); 75 | 76 | assertThat(bucketer.createBuckets(), equalTo((Object) emptyList())); 77 | } 78 | 79 | @Test 80 | public void addZeroSize() { 81 | bucketer.reset("test"); 82 | 83 | bucketer.add(new HasSize() { 84 | @Override 85 | public String id() { 86 | return "test"; 87 | } 88 | 89 | @Override 90 | public long size() { 91 | return 0; 92 | } 93 | }); 94 | 95 | assertThat(bucketer.createBuckets(), equalTo((Object) emptyList())); 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/KeyValuePreservingRecordReaderDelegationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.not; 19 | import static org.hamcrest.Matchers.nullValue; 20 | import static org.hamcrest.Matchers.sameInstance; 21 | import static org.junit.Assert.assertThat; 22 | import static org.mockito.Mockito.verify; 23 | 24 | import java.io.IOException; 25 | 26 | import org.apache.hadoop.io.LongWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.RecordReader; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | import org.junit.runner.RunWith; 32 | import org.mockito.Mock; 33 | import org.mockito.runners.MockitoJUnitRunner; 34 | 35 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat.KeyValuePreservingRecordReader; 36 | 37 | 38 | @RunWith(MockitoJUnitRunner.class) 39 | public class KeyValuePreservingRecordReaderDelegationTest { 40 | 41 | @Mock 42 | private PartialRecordReader delegate; 43 | 44 | private KeyValuePreservingRecordReader reader; 45 | 46 | @Before 47 | public void before() { 48 | reader = new KeyValuePreservingRecordReader(delegate); 49 | } 50 | 51 | @Test 52 | public void createValueDelegation() { 53 | reader.createValue(); 54 | 55 | verify(delegate).createValue(); 56 | } 57 | 58 | @Test 59 | public void getPosDelegation() throws IOException { 60 | reader.getPos(); 61 | 62 | verify(delegate).getPos(); 63 | } 64 | 65 | @Test 66 | public void closeDelegation() throws IOException { 67 | reader.close(); 68 | 69 | verify(delegate).close(); 70 | } 71 | 72 | public void createKeyDoesNotDelegate() { 73 | Text key = reader.createKey(); 74 | 75 | assertThat(key, not(nullValue())); 76 | assertThat(reader.createKey(), not(sameInstance(key))); 77 | } 78 | 79 | public static abstract class PartialRecordReader implements RecordReader { 80 | @Override 81 | public boolean next(LongWritable key, Text value) throws IOException { 82 | throw new AssertionError(); 83 | } 84 | 85 | @Override 86 | public LongWritable createKey() { 87 | throw new AssertionError(); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
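The CrushPartitioner source that follows reads a partition map from the SequenceFile named by the crush.partition.map property: Text bucket ids keyed to IntWritable reduce-task numbers. A minimal sketch of producing such a file, assuming a JobConf named job and illustrative paths and bucket ids:

    // Sketch: writing a partition map that CrushPartitioner.configure() can load.
    FileSystem fs = FileSystem.get(job);
    Path partitionMap = new Path("tmp/crush/partition-map");         // hypothetical location
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    writer.append(new Text("in/subdir-0"), new IntWritable(0));      // bucket id -> reduce task
    writer.append(new Text("in/subdir-1"), new IntWritable(1));
    writer.close();
    job.set("crush.partition.map", partitionMap.toString());
    job.setNumReduceTasks(2);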
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | import java.util.HashMap; 20 | import java.util.HashSet; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.IntWritable; 26 | import org.apache.hadoop.io.SequenceFile.Reader; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.Partitioner; 30 | 31 | @SuppressWarnings("deprecation") 32 | public class CrushPartitioner implements Partitioner { 33 | 34 | private Map bucketToPartition; 35 | 36 | @Override 37 | public void configure(JobConf job) { 38 | String path = job.get("crush.partition.map"); 39 | int expPartitions = job.getNumReduceTasks(); 40 | 41 | bucketToPartition = new HashMap(100); 42 | 43 | try { 44 | FileSystem fs = FileSystem.get(job); 45 | 46 | Reader reader = new Reader(fs, new Path(path), job); 47 | 48 | Text bucket = new Text(); 49 | IntWritable partNum = new IntWritable(); 50 | 51 | while (reader.next(bucket, partNum)) { 52 | int partNumValue = partNum.get(); 53 | 54 | if (partNumValue < 0 || partNumValue >= expPartitions) { 55 | throw new IllegalArgumentException("Partition " + partNumValue + " not allowed with " + expPartitions + " reduce tasks"); 56 | } 57 | 58 | Integer prev = bucketToPartition.put(new Text(bucket), partNumValue); 59 | 60 | if (null != prev) { 61 | throw new IllegalArgumentException("Bucket " + bucket + " appears more than once in " + path); 62 | } 63 | } 64 | } catch (IOException e) { 65 | throw new RuntimeException("Could not read partition map from " + path, e); 66 | } 67 | 68 | if (new HashSet(bucketToPartition.values()).size() > expPartitions) { 69 | throw new IllegalArgumentException(path + " contains more than " + expPartitions + " distinct partitions"); 70 | } 71 | } 72 | 73 | @Override 74 | public int getPartition(Text bucketId, Text fileName, int numPartitions) { 75 | return bucketToPartition.get(bucketId); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
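CountersInputFormat, shown next, replays a Counters object that the planning phase serialized to a single file; its reader simply calls readFields on the first record. A sketch of writing a file it can consume, assuming a FileSystem named fs and an illustrative path:

    // Sketch: persisting planning counters so CountersInputFormat can hand them to CountersMapper.
    Counters planned = new Counters();
    planned.incrCounter(MapperCounter.DIRS_FOUND, 3);
    planned.incrCounter(MapperCounter.FILES_ELIGIBLE, 42);
    FSDataOutputStream out = fs.create(new Path("tmp/crush/counters"));
    planned.write(out);   // standard Writable form, read back by CountersReader.next()
    out.close();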
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.fs.FSDataInputStream; 21 | import org.apache.hadoop.fs.FileSystem; 22 | import org.apache.hadoop.fs.Path; 23 | import org.apache.hadoop.io.NullWritable; 24 | import org.apache.hadoop.mapred.Counters; 25 | import org.apache.hadoop.mapred.FileInputFormat; 26 | import org.apache.hadoop.mapred.FileSplit; 27 | import org.apache.hadoop.mapred.InputSplit; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.mapred.Reporter; 31 | 32 | @SuppressWarnings("deprecation") 33 | public class CountersInputFormat extends FileInputFormat { 34 | 35 | @Override 36 | protected boolean isSplitable(FileSystem fs, Path filename) { 37 | return false; 38 | } 39 | 40 | @Override 41 | public RecordReader getRecordReader(InputSplit inputSplit, JobConf jobconf, Reporter reporter) 42 | throws IOException { 43 | 44 | if (!(inputSplit instanceof FileSplit)) { 45 | throw new AssertionError(); 46 | } 47 | 48 | FileSplit fSplit = (FileSplit) inputSplit; 49 | 50 | Path path = fSplit.getPath(); 51 | long length = fSplit.getLength(); 52 | 53 | FileSystem fs = FileSystem.get(jobconf); 54 | 55 | FSDataInputStream is = fs.open(path); 56 | 57 | return new CountersReader(is, length); 58 | } 59 | 60 | private static class CountersReader implements RecordReader { 61 | 62 | private final FSDataInputStream in; 63 | 64 | private final long length; 65 | 66 | public CountersReader(FSDataInputStream in, long length) { 67 | super(); 68 | 69 | this.in = in; 70 | this.length = length; 71 | } 72 | 73 | @Override 74 | public Counters createKey() { 75 | return new Counters(); 76 | } 77 | 78 | @Override 79 | public NullWritable createValue() { 80 | return NullWritable.get(); 81 | } 82 | 83 | @Override 84 | public long getPos() throws IOException { 85 | return in.getPos(); 86 | } 87 | 88 | @Override 89 | public float getProgress() throws IOException { 90 | float percent = ((float) length) / in.getPos(); 91 | 92 | return percent; 93 | } 94 | 95 | @Override 96 | public boolean next(Counters key, NullWritable value) throws IOException { 97 | if (0 == in.getPos()) { 98 | key.readFields(in); 99 | 100 | return true; 101 | } 102 | 103 | return false; 104 | } 105 | 106 | @Override 107 | public void close() throws IOException { 108 | in.close(); 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/KeyValuePreservingTextInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
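KeyValuePreservingTextInputFormat, which follows, wraps TextInputFormat and re-splits each line on its first tab so that the reducer sees the original key and value instead of a byte offset. A sketch of the effect on a single line, assuming an illustrative delegate RecordReader<LongWritable, Text> positioned at that line:

    // Sketch: effect of the preserving reader on the line "userA\tclick\t2013-01-01".
    KeyValuePreservingRecordReader reader = new KeyValuePreservingRecordReader(delegate);
    Text key = reader.createKey();
    Text value = reader.createValue();
    reader.next(key, value);
    // key.toString()   -> "userA"
    // value.toString() -> "click\t2013-01-01"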
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.fs.FileSystem; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.io.LongWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.FileSplit; 26 | import org.apache.hadoop.mapred.InputSplit; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.LineRecordReader; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.mapred.Reporter; 31 | import org.apache.hadoop.mapred.TextInputFormat; 32 | 33 | /** 34 | * {@link TextInputFormat} creates keys of {@link LongWritable} offsets and {@link Text} values, which contain the line. For file 35 | * crushing, we need to preserve the keys and values as they appear in the file, which means we must discard the byte offsets and 36 | * divide the value into the original key and value pairs. 37 | */ 38 | @SuppressWarnings("deprecation") 39 | public class KeyValuePreservingTextInputFormat extends FileInputFormat { 40 | 41 | private TextInputFormat delegate; 42 | 43 | public void configure(JobConf conf) { 44 | delegate = new TextInputFormat(); 45 | delegate.configure(conf); 46 | } 47 | 48 | @Override 49 | protected boolean isSplitable(FileSystem fs, Path file) { 50 | /* 51 | * Return false because the reducer opens the file from beginning to end. 52 | */ 53 | return false; 54 | } 55 | 56 | @Override 57 | public RecordReader getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException { 58 | 59 | reporter.setStatus(genericSplit.toString()); 60 | 61 | return new KeyValuePreservingRecordReader(new LineRecordReader(job, (FileSplit) genericSplit)); 62 | } 63 | 64 | static class KeyValuePreservingRecordReader implements RecordReader { 65 | 66 | private final RecordReader delegate; 67 | 68 | private final LongWritable delKey = new LongWritable(); 69 | 70 | private final Text delValue = new Text(); 71 | 72 | public KeyValuePreservingRecordReader(RecordReader delegate) { 73 | super(); 74 | 75 | this.delegate = delegate; 76 | } 77 | 78 | @Override 79 | public Text createKey() { 80 | return new Text(); 81 | } 82 | 83 | @Override 84 | public Text createValue() { 85 | return delegate.createValue(); 86 | } 87 | 88 | @Override 89 | public long getPos() throws IOException { 90 | return delegate.getPos(); 91 | } 92 | 93 | @Override 94 | public void close() throws IOException { 95 | delegate.close(); 96 | } 97 | 98 | @Override 99 | public float getProgress() throws IOException { 100 | return delegate.getProgress(); 101 | } 102 | 103 | @Override 104 | public boolean next(Text key, Text value) throws IOException { 105 | boolean next = delegate.next(delKey, delValue); 106 | 107 | if (next) { 108 | int first = delValue.find("\t"); 109 | 110 | if (first >= 0) { 111 | key.set(delValue.getBytes(), 0, first); 112 | 113 | if (delValue.getLength() > first) { 114 | value.set(delValue.getBytes(), first + 1, delValue.getLength() - first - 1); 115 | } else { 116 | value.clear(); 117 | } 118 | } else { 119 | key.set(delValue); 120 | } 121 | } 122 | 123 | return next; 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/KeyValuePreservingRecordReaderNextTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed 
under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.equalTo; 19 | import static org.hamcrest.Matchers.is; 20 | import static org.junit.Assert.assertThat; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapred.RecordReader; 27 | import org.junit.Before; 28 | import org.junit.Test; 29 | 30 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat.KeyValuePreservingRecordReader; 31 | 32 | 33 | public class KeyValuePreservingRecordReaderNextTest implements RecordReader { 34 | 35 | private final Text key = new Text(); 36 | 37 | private final Text value = new Text(); 38 | 39 | private boolean next; 40 | 41 | private long offset; 42 | 43 | private String line; 44 | 45 | private KeyValuePreservingRecordReader reader; 46 | 47 | @Before 48 | public void before() { 49 | reader = new KeyValuePreservingRecordReader(this); 50 | } 51 | 52 | @Test 53 | public void nextDelegation() throws IOException { 54 | next = false; 55 | 56 | assertThat(reader.next(key, value), is(false)); 57 | } 58 | 59 | @Test 60 | public void keyAndValueArePreserved() throws IOException { 61 | next = true; 62 | 63 | /* 64 | * Key with multiple values. 65 | */ 66 | offset = 0; 67 | line = "key\tvalue0\tvalue1\tvalue2"; 68 | 69 | assertThat(reader.next(key, value), is(true)); 70 | 71 | assertThat(key.toString(), equalTo("key")); 72 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2")); 73 | 74 | 75 | /* 76 | * No key with tab and value. 77 | */ 78 | offset = offset + line.length() + 1; 79 | line = "\tvalue0\tvalue1\tvalue2"; 80 | assertThat(reader.next(key, value), is(true)); 81 | 82 | assertThat(key.toString(), equalTo("")); 83 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2")); 84 | 85 | 86 | /* 87 | * Key and tab, no value. 88 | */ 89 | offset = offset + line.length() + 1; 90 | line = "key and tab\t"; 91 | assertThat(reader.next(key, value), is(true)); 92 | 93 | assertThat(key.toString(), equalTo("key and tab")); 94 | assertThat(value.toString(), equalTo("")); 95 | 96 | 97 | /* 98 | * Key only. No tab or value. 99 | */ 100 | offset = offset + line.length() + 1; 101 | line = "key only"; 102 | assertThat(reader.next(key, value), is(true)); 103 | 104 | assertThat(key.toString(), equalTo("key only")); 105 | assertThat(value.toString(), equalTo("")); 106 | 107 | 108 | /* 109 | * Key and value again. 
110 | */ 111 | offset = offset + line.length() + 1; 112 | line = "a reeeeeeeally long key\tvalue0\tvalue1\tvalue2\tvalue3\tvalue4"; 113 | assertThat(reader.next(key, value), is(true)); 114 | 115 | assertThat(key.toString(), equalTo("a reeeeeeeally long key")); 116 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2\tvalue3\tvalue4")); 117 | } 118 | 119 | @Override 120 | public boolean next(LongWritable key, Text value) throws IOException { 121 | if (next) { 122 | key.set(offset); 123 | value.set(line); 124 | } 125 | 126 | return next; 127 | } 128 | 129 | @Override 130 | public LongWritable createKey() { 131 | throw new AssertionError(); 132 | } 133 | 134 | @Override 135 | public Text createValue() { 136 | throw new AssertionError(); 137 | } 138 | 139 | @Override 140 | public long getPos() throws IOException { 141 | throw new AssertionError(); 142 | } 143 | 144 | @Override 145 | public void close() throws IOException { 146 | throw new AssertionError(); 147 | } 148 | 149 | @Override 150 | public float getProgress() throws IOException { 151 | throw new AssertionError(); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/clean/TestClean.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
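TestClean, which follows, drives the Clean tool through ToolRunner using configuration keys rather than command-line flags. An equivalent programmatic invocation outside the tests looks like this; the directory and cutoff are illustrative, and Clean rejects a configuration that sets both clean.cutoff.millis and clean.target.expr:

    // Sketch: age-based cleanup of a directory tree, mirroring what the tests below exercise.
    Configuration conf = new Configuration();
    conf.set(Clean.TARGET_DIR, "/data/tmp/reports");               // hypothetical target
    conf.setLong(Clean.CUTTOFF_MILLIS, 7L * 24 * 60 * 60 * 1000);  // remove entries older than a week
    int exitCode = ToolRunner.run(conf, new Clean(), new String[] {});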
15 | */ 16 | package com.m6d.filecrush.clean; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.fs.FSDataOutputStream; 22 | import org.apache.hadoop.fs.FileSystem; 23 | import org.apache.hadoop.fs.Path; 24 | import org.apache.hadoop.mapred.HadoopTestCase; 25 | import org.apache.hadoop.util.ToolRunner; 26 | import org.junit.Test; 27 | 28 | import com.m6d.filecrush.clean.Clean; 29 | 30 | public class TestClean extends HadoopTestCase{ 31 | 32 | private static final Path ROOT_DIR = new Path("testing"); 33 | 34 | public TestClean() throws IOException { 35 | super(HadoopTestCase.LOCAL_MR, HadoopTestCase.LOCAL_FS, 1, 1); 36 | } 37 | 38 | private Path getDir(Path dir) { 39 | if (isLocalFS()) { 40 | String localPathRoot = System 41 | .getProperty("test.build.data", "/tmp").replace(' ', '+'); 42 | dir = new Path(localPathRoot, dir); 43 | } 44 | return dir; 45 | } 46 | 47 | public void setUp() throws Exception { 48 | super.setUp(); 49 | Path rootDir = getDir(ROOT_DIR); 50 | Configuration conf = createJobConf(); 51 | FileSystem fs = FileSystem.get(conf); 52 | fs.delete(rootDir, true); 53 | } 54 | 55 | @Test 56 | public void testAge() throws Exception{ 57 | Configuration conf = createJobConf(); 58 | FileSystem fs = FileSystem.get(conf); 59 | fs.mkdirs( new Path(ROOT_DIR,"a") ); 60 | fs.mkdirs( new Path( new Path(ROOT_DIR,"a"),"1") ); 61 | fs.mkdirs( new Path(ROOT_DIR,"b") ); 62 | fs.mkdirs( new Path(ROOT_DIR,"c") ); 63 | fs.mkdirs( new Path( new Path(ROOT_DIR,"c"),"2") ); 64 | 65 | Path oldFile = new Path(new Path( new Path(ROOT_DIR,"a"),"1"),"oldfile"); 66 | FSDataOutputStream out = fs.create(oldFile); 67 | out.write("bla".getBytes()); 68 | out.close(); 69 | 70 | Path cFile = new Path(new Path( new Path(ROOT_DIR,"c"),"1"),"cfile"); 71 | FSDataOutputStream out2 = fs.create(cFile); 72 | out2.write("wah".getBytes()); 73 | out2.close(); 74 | 75 | assertEquals(true,fs.exists(cFile)); 76 | assertEquals(true,fs.exists(oldFile)); 77 | 78 | Clean cleanWarn = new Clean(); 79 | Configuration warnConf = createJobConf(); 80 | warnConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 81 | warnConf.set(Clean.TARGET_EXPR, "cfile"); 82 | warnConf.set(Clean.WARN_MODE, "true"); 83 | ToolRunner.run(warnConf, cleanWarn, new String[]{}); 84 | assertEquals(true,fs.exists(cFile)); 85 | assertEquals(true,fs.exists(oldFile)); 86 | 87 | Clean cleanReg = new Clean(); 88 | Configuration regConf = createJobConf(); 89 | regConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 90 | regConf.set(Clean.TARGET_EXPR, "cfile"); 91 | ToolRunner.run(regConf, cleanReg, new String[]{}); 92 | assertEquals(false,fs.exists(cFile)); 93 | assertEquals(true,fs.exists(oldFile)); 94 | 95 | Clean clean = new Clean(); 96 | Configuration cleanConf = createJobConf(); 97 | cleanConf.setLong(Clean.CUTTOFF_MILLIS, 20000); 98 | cleanConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 99 | ToolRunner.run(cleanConf, clean, new String[]{}); 100 | assertEquals(true,fs.exists(oldFile)); 101 | Thread.sleep(3); 102 | 103 | Clean clean2 = new Clean(); 104 | Configuration cleanConf2 = createJobConf(); 105 | cleanConf2.setLong(Clean.CUTTOFF_MILLIS, 1); 106 | cleanConf2.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 107 | ToolRunner.run(cleanConf2, clean2, new String[]{}); 108 | assertEquals(false,fs.exists(oldFile)); 109 | 110 | } 111 | 112 | @Test 113 | public void testNegatives() throws Exception{ 114 | Clean clean = new Clean(); 115 | Configuration cleanConf = createJobConf(); 116 | 
cleanConf.setLong(Clean.CUTTOFF_MILLIS, 20000); 117 | cleanConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 118 | cleanConf.set(Clean.TARGET_EXPR, "bla"); 119 | int res = ToolRunner.run(cleanConf, clean, new String[]{}); 120 | assertEquals(9,res); 121 | } 122 | 123 | @Test 124 | public void testRootClean() throws Exception{ 125 | Clean clean = new Clean(); 126 | Configuration cleanConf = createJobConf(); 127 | cleanConf.set(Clean.TARGET_DIR, "/"); 128 | cleanConf.set(Clean.TARGET_EXPR, "bla"); 129 | int res = ToolRunner.run(cleanConf, clean, new String[]{}); 130 | assertEquals(2,res); 131 | } 132 | } -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/clean/Clean.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.clean; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.conf.Configured; 22 | import org.apache.hadoop.fs.FileStatus; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.util.Tool; 26 | import org.apache.hadoop.util.ToolRunner; 27 | 28 | @SuppressWarnings("deprecation") 29 | public class Clean extends Configured implements Tool{ 30 | 31 | public static final String TARGET_DIR="clean.target.dir"; 32 | public static final String CUTTOFF_MILLIS="clean.cutoff.millis"; 33 | public static final String TARGET_EXPR="clean.target.expr"; 34 | public static final String WARN_MODE="clean.warn.mode"; 35 | 36 | protected FileSystem fs; 37 | protected Configuration conf; 38 | protected long cutoff; 39 | 40 | public Clean(){ 41 | super(); 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | Clean clean = new Clean(); 46 | int exitCode = ToolRunner.run(new Configuration(),clean, args); 47 | System.exit(exitCode); 48 | } 49 | 50 | @Override 51 | public int run(String[] args) throws Exception { 52 | conf = getConf(); 53 | 54 | try { 55 | fs=FileSystem.get(getConf()); 56 | } catch (IOException e) { 57 | throw new RuntimeException("Could not open filesystem"); 58 | } 59 | int pre = preFlightCheck(); 60 | if (pre!=0){ 61 | return pre; 62 | } 63 | 64 | if (conf.get(CUTTOFF_MILLIS)!=null){ 65 | long now=System.currentTimeMillis(); 66 | long targetAge= Long.parseLong(conf.get(CUTTOFF_MILLIS)); 67 | cutoff=now-targetAge; 68 | } 69 | 70 | return cleanup (new Path(conf.get(TARGET_DIR))); 71 | 72 | } 73 | 74 | public void warnOrDelete(Path p) throws IOException{ 75 | if (conf.getBoolean(WARN_MODE, false)){ 76 | System.out.println("DELETE "+p); 77 | } else { 78 | if ( p.equals( new Path(conf.get(TARGET_DIR)) )){ 79 | 80 | } else { 81 | fs.delete(p); 82 | } 83 | } 84 | } 85 | 86 | 87 | public int cleanup(Path p){ 88 | try { 89 | if (fs.isFile(p)){ 90 | if (conf.get(TARGET_EXPR)!=null){ 91 | if 
(p.getName().matches(conf.get(TARGET_EXPR))){ 92 | warnOrDelete(p); 93 | } 94 | } 95 | if (conf.get(CUTTOFF_MILLIS)!=null){ 96 | if (fs.getFileStatus(p).getModificationTime() < cutoff ){ 97 | warnOrDelete(p); 98 | } 99 | } 100 | } 101 | 102 | if (fs.isDirectory(p)){ 103 | for (FileStatus stat: fs.listStatus(p)){ 104 | cleanup( stat.getPath() ); 105 | } 106 | if (fs.listStatus(p).length == 0){ 107 | if (conf.get(TARGET_EXPR)!=null){ 108 | if (p.getName().matches(conf.get(TARGET_EXPR))){ 109 | warnOrDelete(p); 110 | } 111 | } 112 | if (conf.get(CUTTOFF_MILLIS)!=null){ 113 | if (fs.getFileStatus(p).getModificationTime() < cutoff ){ 114 | warnOrDelete(p); 115 | } 116 | } 117 | } 118 | } 119 | } catch (IOException e) { 120 | System.out.println("exception "+e); 121 | return 7; 122 | } 123 | return 0; 124 | } 125 | 126 | public int preFlightCheck(){ 127 | Configuration conf = getConf(); 128 | if (conf.get(TARGET_DIR) == null){ 129 | System.err.println("You must specify a target.dir"); 130 | return 1; 131 | } 132 | if (conf.get(TARGET_DIR).equals("/")){ 133 | System.err.println("Will not clean / !!!!!!"); 134 | return 2; 135 | } 136 | if ( fs.getHomeDirectory().equals( new Path(conf.get(TARGET_DIR)) ) ){ 137 | System.err.println("Will not clean home directory"); 138 | return 3; 139 | } 140 | if (conf.get(CUTTOFF_MILLIS)==null && conf.get(TARGET_EXPR)==null){ 141 | System.err.println("You must specify "+CUTTOFF_MILLIS+" or "+TARGET_EXPR); 142 | return 4; 143 | } 144 | if (!(conf.get(CUTTOFF_MILLIS)==null) && !(conf.get(TARGET_EXPR)==null)){ 145 | System.err.println("You can not specify "+CUTTOFF_MILLIS+" and "+TARGET_EXPR); 146 | return 9; 147 | } 148 | if (conf.get(CUTTOFF_MILLIS)!=null) { 149 | try { 150 | Long.parseLong(conf.get(CUTTOFF_MILLIS)); 151 | } catch (NumberFormatException ex){ 152 | System.err.println(CUTTOFF_MILLIS+" was specified as "+conf.get(CUTTOFF_MILLIS)+" this is not a long integer"); 153 | return 15; 154 | } 155 | } 156 | try { 157 | if (! 
fs.exists( new Path(conf.get(TARGET_DIR)))) { 158 | System.err.println(conf.get(TARGET_DIR)+" does not exist"); 159 | } 160 | } catch (IOException e) { 161 | System.err.println("IOEXCEPTION"+ e); 162 | return 6; 163 | } 164 | return 0; 165 | } 166 | 167 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.m6d 5 | filecrush 6 | M6D App - Filecrush 7 | 2.2.2-SNAPSHOT 8 | filecrush utility 9 | jar 10 | 11 | 0.20.2 12 | 1.2 13 | 1.0.4 14 | 2.3 15 | 3.0.1 16 | 1.2.13 17 | 1.6.1 18 | 1.1 19 | 4.8.2 20 | 1.8.5 21 | 1.2 22 | 3.0 23 | 6.1.14 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-eclipse-plugin 33 | 2.5.1 34 | 35 | [artifactId] 36 | true 37 | true 38 | 1.5 39 | 40 | org.eclipse.jdt.core.javabuilder 41 | org.maven.ide.eclipse.maven2Builder 42 | 43 | 44 | org.eclipse.jdt.core.javanature 45 | org.maven.ide.eclipse.maven2Nature 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | maven-compiler-plugin 56 | 57 | 1.6 58 | 1.6 59 | 60 | 61 | 62 | 63 | maven-jar-plugin 64 | 65 | 66 | 67 | 68 | 69 | 70 | jar 71 | 72 | jar 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | org.apache.hadoop 84 | hadoop-core 85 | ${hadoop.version} 86 | provided 87 | 88 | 89 | commons-logging 90 | commons-logging 91 | ${commons-logging.version} 92 | provided 93 | 94 | 95 | log4j 96 | log4j 97 | ${log4j.version} 98 | provided 99 | 100 | 101 | commons-httpclient 102 | commons-httpclient 103 | ${commons-httpclient.version} 104 | provided 105 | 106 | 107 | commons-lang 108 | commons-lang 109 | ${commons-lang.version} 110 | 111 | 112 | org.codehaus.plexus 113 | plexus-utils 114 | ${plexus-utils.version} 115 | provided 116 | 117 | 118 | commons-cli 119 | commons-cli 120 | ${commons-cli.version} 121 | provided 122 | 123 | 124 | 125 | org.mockito 126 | mockito-all 127 | ${mockito.version} 128 | test 129 | 130 | 131 | org.hamcrest 132 | hamcrest-core 133 | ${hamcrest.version} 134 | test 135 | 136 | 137 | org.hamcrest 138 | hamcrest-library 139 | ${hamcrest.version} 140 | test 141 | 142 | 143 | org.easymock 144 | easymock 145 | ${easymock.version} 146 | test 147 | 148 | 149 | junit 150 | junit 151 | ${junit.version} 152 | test 153 | 154 | 155 | org.apache.hadoop 156 | hadoop-test 157 | ${hadoop.version} 158 | provided 159 | 160 | 161 | org.slf4j 162 | slf4j-api 163 | ${slf4j.version} 164 | test 165 | 166 | 167 | org.slf4j 168 | slf4j-log4j12 169 | ${slf4j.version} 170 | test 171 | 172 | 173 | 174 | org.mortbay.jetty 175 | jetty 176 | ${jetty.version} 177 | test 178 | 179 | 180 | 181 | org.mortbay.jetty 182 | jetty-util 183 | ${jetty.version} 184 | test 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushStandAloneSequenceFileTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.junit.Assert.assertThat; 23 | 24 | import java.io.File; 25 | import java.io.IOException; 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | 29 | import org.apache.hadoop.fs.FileSystem; 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.io.IntWritable; 32 | import org.apache.hadoop.io.SequenceFile; 33 | import org.apache.hadoop.io.SequenceFile.Reader; 34 | import org.apache.hadoop.io.SequenceFile.Writer; 35 | import org.apache.hadoop.io.Text; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.util.ToolRunner; 38 | import org.junit.After; 39 | import org.junit.Before; 40 | import org.junit.Rule; 41 | import org.junit.Test; 42 | import org.junit.rules.TemporaryFolder; 43 | 44 | import com.m6d.filecrush.crush.Crush; 45 | 46 | /** 47 | * Dfs block size will be set to 50 and threshold set to 20%. 48 | */ 49 | @SuppressWarnings("deprecation") 50 | public class CrushStandAloneSequenceFileTest { 51 | @Rule 52 | public final TemporaryFolder tmp = new TemporaryFolder(); 53 | 54 | private JobConf job; 55 | 56 | @Before 57 | public void setup() throws Exception { 58 | job = new JobConf(false); 59 | 60 | job.set("fs.default.name", "file:///"); 61 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 62 | job.setLong("dfs.block.size", 50); 63 | } 64 | 65 | /** 66 | * Crush creates a subdirectory in tmp to store all its transient data. Since this test uses the local file system, the present 67 | * working directory is the parent of tmp. We delete it here since it's not so useful to clutter the build directory with 68 | * empty directories. 69 | */ 70 | @After 71 | public void deleteTmp() throws IOException { 72 | File tmp = new File("tmp"); 73 | 74 | if (tmp.exists()) { 75 | assertThat(tmp.delete(), is(true)); 76 | } 77 | } 78 | 79 | @Test 80 | public void standAloneOutput() throws Exception { 81 | 82 | File in = tmp.newFolder("in"); 83 | 84 | createFile(in, "skipped-0", 0, 25); 85 | createFile(in, "skipped-1", 1, 25); 86 | createFile(in, "skipped-2", 2, 25); 87 | createFile(in, "skipped-3", 3, 25); 88 | 89 | File subdir = tmp.newFolder("in/subdir"); 90 | 91 | createFile(subdir, "lil-0", 0, 1); 92 | createFile(subdir, "lil-1", 1, 2); 93 | createFile(subdir, "big-2", 2, 5); 94 | createFile(subdir, "big-3", 3, 5); 95 | 96 | File subsubdir = tmp.newFolder("in/subdir/subsubdir"); 97 | 98 | createFile(subsubdir, "skipped-4", 4, 25); 99 | createFile(subsubdir, "skipped-5", 5, 25); 100 | 101 | File out = new File(tmp.getRoot(), "out"); 102 | 103 | ToolRunner.run(job, new Crush(), new String[] { 104 | subdir.getAbsolutePath(), out.getAbsolutePath() 105 | }); 106 | 107 | /* 108 | * Make sure the original files are still there. 
109 | */ 110 | verifyFile(in, "skipped-0", 0, 25); 111 | verifyFile(in, "skipped-1", 1, 25); 112 | verifyFile(in, "skipped-2", 2, 25); 113 | verifyFile(in, "skipped-3", 3, 25); 114 | 115 | verifyFile(subdir, "lil-0", 0, 1); 116 | verifyFile(subdir, "lil-1", 1, 2); 117 | verifyFile(subdir, "big-2", 2, 5); 118 | verifyFile(subdir, "big-3", 3, 5); 119 | 120 | verifyFile(subsubdir, "skipped-4", 4, 25); 121 | verifyFile(subsubdir, "skipped-5", 5, 25); 122 | 123 | /* 124 | * Verify the crush output. 125 | */ 126 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 127 | } 128 | 129 | @Test 130 | public void noFiles() throws Exception { 131 | File in = tmp.newFolder("in"); 132 | 133 | File out = new File(tmp.getRoot(), "out"); 134 | 135 | ToolRunner.run(job, new Crush(), new String[] { 136 | in.getAbsolutePath(), out.getAbsolutePath() 137 | }); 138 | 139 | assertThat(out.exists(), is(false)); 140 | } 141 | 142 | private void verifyCrushOutput(File crushOutput, int[]... keyCounts) throws IOException { 143 | 144 | List actual = new ArrayList(); 145 | 146 | Text text = new Text(); 147 | IntWritable value = new IntWritable(); 148 | 149 | Reader reader = new Reader(FileSystem.get(job), new Path(crushOutput.getAbsolutePath()), job); 150 | 151 | while (reader.next(text, value)) { 152 | actual.add(format("%s\t%d", text, value.get())); 153 | } 154 | 155 | reader.close(); 156 | 157 | int expLines = 0; 158 | List> expected = new ArrayList>(); 159 | 160 | 161 | for (int[] keyCount : keyCounts) { 162 | int key = keyCount[0]; 163 | int count = keyCount[1]; 164 | 165 | List lines = new ArrayList(); 166 | expected.add(lines); 167 | 168 | for (int i = 0, j = 0; i < count; i++, j = j == 9 ? 0 : j + 1) { 169 | String line = format("%d\t%d", key, j); 170 | lines.add(line); 171 | } 172 | 173 | expLines += count; 174 | } 175 | 176 | /* 177 | * Make sure each file's data is contiguous in the crush output file. 178 | */ 179 | for (List list : expected) { 180 | int idx = actual.indexOf(list.get(0)); 181 | 182 | assertThat(idx, greaterThanOrEqualTo(0)); 183 | 184 | assertThat(actual.subList(idx, idx + list.size()), equalTo(list)); 185 | } 186 | 187 | assertThat(actual.size(), equalTo(expLines)); 188 | } 189 | 190 | private void createFile(File dir, String fileName, int key, int count) throws IOException { 191 | File file = new File(dir, fileName); 192 | 193 | Writer writer = SequenceFile.createWriter(FileSystem.get(job), job, new Path(file.getAbsolutePath()), Text.class, IntWritable.class); 194 | 195 | Text text = new Text(Integer.toString(key)); 196 | IntWritable value = new IntWritable(); 197 | 198 | for (int i = 0, j = 0; i < count; i++, j = j == 9 ? 
0 : j + 1) { 199 | value.set(j); 200 | 201 | writer.append(text, value); 202 | } 203 | 204 | writer.close(); 205 | } 206 | 207 | private void verifyFile(File dir, String fileName, int key, int count) throws IOException { 208 | File file = new File(dir, fileName); 209 | 210 | Reader reader = new Reader(FileSystem.get(job), new Path(file.getAbsolutePath()), job); 211 | 212 | int i = 0; 213 | int actual = 0; 214 | 215 | Text text = new Text(); 216 | IntWritable value = new IntWritable(); 217 | 218 | while (reader.next(text, value)) { 219 | assertThat(text.toString(), equalTo(Integer.toString(key))); 220 | assertThat(value.get(), equalTo(i)); 221 | 222 | if (i == 9) { 223 | i = 0; 224 | } else { 225 | i++; 226 | } 227 | 228 | actual++; 229 | } 230 | 231 | reader.close(); 232 | 233 | assertThat(actual, equalTo(count)); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushStandAloneTextTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.hamcrest.Matchers.nullValue; 23 | import static org.junit.Assert.assertThat; 24 | 25 | import java.io.BufferedReader; 26 | import java.io.File; 27 | import java.io.FileReader; 28 | import java.io.IOException; 29 | import java.io.PrintWriter; 30 | import java.util.ArrayList; 31 | import java.util.List; 32 | 33 | import org.apache.hadoop.mapred.JobConf; 34 | import org.apache.hadoop.util.ToolRunner; 35 | import org.junit.After; 36 | import org.junit.Before; 37 | import org.junit.Rule; 38 | import org.junit.Test; 39 | import org.junit.rules.TemporaryFolder; 40 | 41 | import com.m6d.filecrush.crush.Crush; 42 | 43 | /** 44 | * Dfs block size will be set to 50 and threshold set to 20%. 45 | */ 46 | @SuppressWarnings("deprecation") 47 | public class CrushStandAloneTextTest { 48 | @Rule 49 | public final TemporaryFolder tmp = new TemporaryFolder(); 50 | 51 | private JobConf job; 52 | 53 | @Before 54 | public void setup() throws Exception { 55 | job = new JobConf(false); 56 | 57 | job.set("fs.default.name", "file:///"); 58 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 59 | job.setLong("dfs.block.size", 50); 60 | } 61 | 62 | /** 63 | * Crush creates a subdirectory in tmp to store all its transient data. Since this test uses the local file system, the present 64 | * working directory is the parent of tmp. We delete it here since it's not so useful to clutter the build directory with 65 | * empty directories. 
66 | */ 67 | @After 68 | public void deleteTmp() throws IOException { 69 | File tmp = new File("tmp"); 70 | 71 | if (tmp.exists()) { 72 | assertThat(tmp.delete(), is(true)); 73 | } 74 | } 75 | 76 | @Test 77 | public void standAloneOutput() throws Exception { 78 | 79 | File in = tmp.newFolder("in"); 80 | 81 | createFile(in, "skipped-0", 0, 25); 82 | createFile(in, "skipped-1", 1, 25); 83 | createFile(in, "skipped-2", 2, 25); 84 | createFile(in, "skipped-3", 3, 25); 85 | 86 | File subdir = tmp.newFolder("in/subdir"); 87 | 88 | createFile(subdir, "lil-0", 0, 1); 89 | createFile(subdir, "lil-1", 1, 2); 90 | createFile(subdir, "big-2", 2, 5); 91 | createFile(subdir, "big-3", 3, 5); 92 | 93 | File subsubdir = tmp.newFolder("in/subdir/subsubdir"); 94 | 95 | createFile(subsubdir, "skipped-4", 4, 25); 96 | createFile(subsubdir, "skipped-5", 5, 25); 97 | 98 | File out = new File(tmp.getRoot(), "out"); 99 | 100 | ToolRunner.run(job, new Crush(), new String[] { 101 | "--input-format=text", 102 | "--output-format=text", 103 | "--compress=none", 104 | 105 | subdir.getAbsolutePath(), out.getAbsolutePath() 106 | }); 107 | 108 | /* 109 | * Make sure the original files are still there. 110 | */ 111 | verifyFile(in, "skipped-0", 0, 25); 112 | verifyFile(in, "skipped-1", 1, 25); 113 | verifyFile(in, "skipped-2", 2, 25); 114 | verifyFile(in, "skipped-3", 3, 25); 115 | 116 | verifyFile(subdir, "lil-0", 0, 1); 117 | verifyFile(subdir, "lil-1", 1, 2); 118 | verifyFile(subdir, "big-2", 2, 5); 119 | verifyFile(subdir, "big-3", 3, 5); 120 | 121 | verifyFile(subsubdir, "skipped-4", 4, 25); 122 | verifyFile(subsubdir, "skipped-5", 5, 25); 123 | 124 | /* 125 | * Verify the crush output. 126 | */ 127 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 128 | } 129 | 130 | @Test 131 | public void noFiles() throws Exception { 132 | File in = tmp.newFolder("in"); 133 | 134 | File out = new File(tmp.getRoot(), "out"); 135 | 136 | ToolRunner.run(job, new Crush(), new String[] { 137 | in.getAbsolutePath(), out.getAbsolutePath() 138 | }); 139 | 140 | assertThat(out.exists(), is(false)); 141 | } 142 | 143 | @Test 144 | public void ignoreRegexTest() throws Exception { 145 | 146 | File in = tmp.newFolder("skip_test"); 147 | 148 | createFile(in, "lil-0", 0, 1); 149 | createFile(in, "lil-1", 1, 2); 150 | createFile(in, "big-2", 2, 5); 151 | createFile(in, "big-3", 3, 5); 152 | // Files to be ignored 153 | createFile(in, "lil-0.index", 0, 10); 154 | createFile(in, "lil-1.index", 1, 20); 155 | createFile(in, "big-2.index", 2, 50); 156 | createFile(in, "big-3.index", 3, 50); 157 | 158 | File out = new File(tmp.getRoot(), "skip_test_out"); 159 | 160 | ToolRunner.run(job, new Crush(), new String[] { 161 | "--input-format=text", 162 | "--output-format=text", 163 | "--ignore-regex=.*\\.index", 164 | "--compress=none", 165 | 166 | in.getAbsolutePath(), out.getAbsolutePath() 167 | }); 168 | 169 | /* 170 | * Make sure the original files are still there. 171 | */ 172 | verifyFile(in, "lil-0", 0, 1); 173 | verifyFile(in, "lil-1", 1, 2); 174 | verifyFile(in, "big-2", 2, 5); 175 | verifyFile(in, "big-3", 3, 5); 176 | verifyFile(in, "lil-0.index", 0, 10); 177 | verifyFile(in, "lil-1.index", 1, 20); 178 | verifyFile(in, "big-2.index", 2, 50); 179 | verifyFile(in, "big-3.index", 3, 50); 180 | 181 | /* 182 | * Verify the crush output. 
183 | */ 184 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 185 | } 186 | 187 | private void verifyCrushOutput(File crushOutput, int[]... keyCounts) throws IOException { 188 | 189 | List actual = new ArrayList(); 190 | BufferedReader reader = new BufferedReader(new FileReader(crushOutput)); 191 | 192 | String line; 193 | 194 | while (null != (line = reader.readLine())) { 195 | actual.add(line); 196 | } 197 | 198 | reader.close(); 199 | 200 | int expLines = 0; 201 | List> expected = new ArrayList>(); 202 | 203 | for (int[] kc : keyCounts) { 204 | int key = kc[0]; 205 | int count = kc[1]; 206 | 207 | List lines = new ArrayList(); 208 | expected.add(lines); 209 | 210 | for (int idx = 0, i = 0; idx < count; idx++, i = i == 9 ? 0 : i + 1) { 211 | line = format("%d\t%d", key, i); 212 | lines.add(line); 213 | } 214 | 215 | expLines += count; 216 | } 217 | 218 | /* 219 | * Make sure each file's data is contiguous in the crush output file. 220 | */ 221 | for (List list : expected) { 222 | int idx = actual.indexOf(list.get(0)); 223 | 224 | assertThat(idx, greaterThanOrEqualTo(0)); 225 | 226 | assertThat(actual.subList(idx, idx + list.size()), equalTo(list)); 227 | } 228 | 229 | assertThat(actual.size(), equalTo(expLines)); 230 | } 231 | 232 | private void createFile(File dir, String fileName, int key, int count) throws IOException { 233 | File file = new File(dir, fileName); 234 | 235 | PrintWriter writer = new PrintWriter(file); 236 | 237 | for (int idx = 0, i = 0; idx < count; idx++, i = i == 9 ? 0 : i + 1) { 238 | String line = format("%d\t%d\n", key, i); 239 | 240 | assertThat(line.length(), equalTo(4)); 241 | 242 | writer.write(line); 243 | } 244 | 245 | writer.close(); 246 | } 247 | 248 | private void verifyFile(File dir, String fileName, int key, int count) throws IOException { 249 | File file = new File(dir, fileName); 250 | 251 | assertThat(file.isFile(), is(true)); 252 | assertThat(file.length(), equalTo((long) count * 4)); 253 | 254 | BufferedReader reader = new BufferedReader(new FileReader(file)); 255 | 256 | String line; 257 | int i = 0; 258 | int actualCount = 0; 259 | 260 | while (null != (line = reader.readLine())) { 261 | assertThat(line.length(), equalTo(3)); 262 | 263 | actualCount++; 264 | 265 | String[] split = line.split("\t"); 266 | 267 | assertThat(line, split[0], equalTo(Integer.toString(key))); 268 | assertThat(line, split[1], equalTo(Integer.toString(i))); 269 | 270 | if (i == 9) { 271 | i = 0; 272 | } else { 273 | i++; 274 | } 275 | } 276 | 277 | assertThat(reader.readLine(), nullValue()); 278 | 279 | reader.close(); 280 | 281 | assertThat(actualCount, equalTo(count)); 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/Bucketer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
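Bucketer, whose source follows, is the planning-phase helper that groups a directory's small files into roughly block-sized buckets. A sketch of driving it in the reset/add/createBuckets order its javadoc describes, assuming a FileSystem named fs and illustrative directory, block size and bucket cap:

    // Sketch: bucketing one directory's files; each returned bucket holds at least two files
    // because the third constructor argument excludes single-item buckets.
    Bucketer bucketer = new Bucketer(20, 128L * 1024 * 1024, true);
    bucketer.reset("in/subdir");
    for (FileStatus status : fs.listStatus(new Path("in/subdir"))) {
        bucketer.add(new FileStatusHasSize(status));
    }
    List<?> buckets = bucketer.createBuckets();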
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.util.Collections.unmodifiableList; 20 | 21 | import java.util.Collections; 22 | import java.util.Comparator; 23 | import java.util.Iterator; 24 | import java.util.LinkedList; 25 | import java.util.List; 26 | import java.util.ListIterator; 27 | 28 | import org.apache.hadoop.fs.FileStatus; 29 | 30 | /** 31 | *

32 | * Arranges files into buckets. Callers must interact with this class in the following order:
33 | *
34 | * <ol>
35 | *   <li>Invoke {@link #reset(String)}.</li>
36 | *   <li>Invoke {@link #add(HasSize)} zero or more times.</li>
37 | *   <li>Invoke {@link #createBuckets()}.</li>
38 | *   <li>Go to 1 or throw away the instance.</li>
39 | * </ol>
40 | *
41 | * The bucketing algorithm is:
42 | *
43 | * <ol>
44 | *   <li>Calculate the number of buckets as floor(total bytes / bucket size). Add one if there is a remainder.</li>
45 | *   <li>Sort the files in order of descending size.</li>
46 | *   <li>Add each file to the bucket that currently has the smallest total size.</li>
47 | *   <li>Remove any buckets that contain only one file, if the bucketer is configured to exclude single-item buckets.</li>
48 | * </ol>
49 | *
50 | *
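 * As a rough worked example (the sizes are taken from one of the cases in BucketerParameterizedTest):
 * with a bucket size of 50 and files of 18, 20, 37, 19, 17 and 10 bytes (121 bytes in total),
 * floor(121 / 50) = 2 plus one for the remainder gives 3 buckets. After the descending sort
 * (37, 20, 19, 18, 17, 10), adding each file to the currently smallest bucket yields buckets of
 * {37, 10}, {20, 17} and {19, 18} bytes.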
51 | */ 52 | class Bucketer { 53 | /** 54 | * The maximum number of buckets to create. 55 | */ 56 | private final int maxBuckets; 57 | 58 | /** 59 | * The size of the files to create. Used in the bucketing algorithm. 60 | */ 61 | private final long bucketSize; 62 | 63 | /** 64 | * The items to consider for bucketing. 65 | */ 66 | private final List items = new LinkedList(); 67 | 68 | /** 69 | * The total number of bytes represented by the files in {@link #items}. 70 | */ 71 | private long size; 72 | 73 | /** 74 | * The directory being bucketed. 75 | */ 76 | private String dir; 77 | 78 | /** 79 | * Do not return buckets containing a single item from {@link #createBuckets()}. 80 | */ 81 | private final boolean excludeSingleItemBuckets; 82 | 83 | public Bucketer(int numBuckets, boolean excludeSingleItemBuckets) { 84 | this(numBuckets, 0, excludeSingleItemBuckets); 85 | } 86 | 87 | public Bucketer(int maxBuckets, long bucketSize, boolean excludeSingleItemBuckets) { 88 | super(); 89 | 90 | if (1 > maxBuckets) { 91 | throw new IllegalArgumentException("Must have at least one bucket: " + maxBuckets); 92 | } 93 | 94 | this.maxBuckets = maxBuckets; 95 | 96 | if (0 > bucketSize) { 97 | throw new IllegalArgumentException("Bucket size must be zero or positive: " + bucketSize); 98 | } 99 | 100 | this.bucketSize = bucketSize; 101 | this.excludeSingleItemBuckets = excludeSingleItemBuckets; 102 | } 103 | 104 | /** 105 | * Returns map from bucket to files that are in that bucket. Buckets are guaranteed to contain more than one file and will be 106 | * approximately the same size in bytes (summing the sizes of all the files in that bucket). After this method returns, 107 | * {@link #reset(String)} must be called before this instance can be called again. 108 | */ 109 | public List createBuckets() { 110 | if (null == dir) { 111 | throw new IllegalStateException("No directory set"); 112 | } 113 | 114 | /* 115 | * Sort the files in order of descending size. 116 | */ 117 | Collections.sort(items, DESCENDING_SIZE); 118 | 119 | LinkedList buckets = new LinkedList(); 120 | 121 | for (long remaining = size; remaining > 0 && buckets.size() < maxBuckets; remaining -= bucketSize) { 122 | buckets.add(new Bucket(format("%s-%d", dir, buckets.size()))); 123 | } 124 | 125 | int numBuckets = buckets.size(); 126 | 127 | if (1 == numBuckets) { 128 | Bucket bucket = buckets.getFirst(); 129 | 130 | for (HasSize file : items) { 131 | bucket.add(file); 132 | } 133 | } else { 134 | /* 135 | * Add the files to the smallest bucket. 136 | */ 137 | for (HasSize item : items) { 138 | ListIterator iterator = buckets.listIterator(); 139 | 140 | Bucket bucket = iterator.next(); 141 | bucket.add(item); 142 | 143 | iterator.remove(); 144 | 145 | /* 146 | * Reposition the bucket in the list to preserve order by ascending bucket size. 147 | */ 148 | while (buckets.size() < numBuckets && iterator.hasNext()) { 149 | Bucket other = iterator.next(); 150 | 151 | if (other.bytes > bucket.bytes) { 152 | iterator.previous(); 153 | iterator.add(bucket); 154 | } 155 | } 156 | 157 | if (buckets.size() < numBuckets) { 158 | /* 159 | * This bucket is now the biggest one. 160 | */ 161 | buckets.add(bucket); 162 | } 163 | } 164 | } 165 | 166 | if (excludeSingleItemBuckets) { 167 | for (Iterator iter = buckets.iterator(); iter.hasNext(); ) { 168 | Bucket bucket = iter.next(); 169 | 170 | if (bucket.contents.size() < 2) { 171 | iter.remove(); 172 | } 173 | } 174 | } 175 | 176 | /* 177 | * Empty the state for the next invocation of reset. 
178 | */ 179 | dir = null; 180 | items.clear(); 181 | size = 0; 182 | 183 | return buckets; 184 | } 185 | 186 | /** 187 | * Add an item for consideration. If the item has zero size, then it is ignored. 188 | */ 189 | public void add(HasSize item) { 190 | if (null == dir) { 191 | throw new IllegalStateException("No directory set"); 192 | } 193 | 194 | long itemSize = item.size(); 195 | 196 | if (0 != itemSize) { 197 | items.add(item); 198 | size += itemSize; 199 | } 200 | } 201 | 202 | /** 203 | * Returns the count of items being considered. 204 | */ 205 | int count() { 206 | return items.size(); 207 | } 208 | 209 | /** 210 | * Returns the total size of all the items being considered. 211 | */ 212 | long size() { 213 | return size; 214 | } 215 | 216 | /** 217 | * Resets the instance for the directory. The given name is used to name the buckets. 218 | * 219 | * @param dir 220 | * Directory name. Must not be null or empty. 221 | */ 222 | public void reset(String dir) { 223 | if (dir.equals("")) { 224 | throw new IllegalArgumentException("Directory is empty"); 225 | } 226 | 227 | this.dir = dir; 228 | 229 | items.clear(); 230 | size = 0; 231 | } 232 | 233 | String dir() { 234 | return dir; 235 | } 236 | 237 | public static class Bucket implements HasSize { 238 | 239 | private final List contents; 240 | 241 | private final String name; 242 | 243 | private long bytes; 244 | 245 | public Bucket(String name) { 246 | super(); 247 | 248 | this.name = name; 249 | this.contents = new LinkedList(); 250 | } 251 | 252 | public Bucket(String name, List contents, long bytes) { 253 | super(); 254 | 255 | this.contents = contents; 256 | this.name = name; 257 | this.bytes = bytes; 258 | } 259 | 260 | private void add(HasSize hasSize) { 261 | contents.add(hasSize.id()); 262 | bytes += hasSize.size(); 263 | } 264 | 265 | public List contents() { 266 | return unmodifiableList(contents); 267 | } 268 | 269 | public String name() { 270 | return name; 271 | } 272 | 273 | public long bytes() { 274 | return bytes; 275 | } 276 | 277 | @Override 278 | public String id() { 279 | return name(); 280 | } 281 | 282 | @Override 283 | public long size() { 284 | return bytes(); 285 | } 286 | 287 | @Override 288 | public String toString() { 289 | return format("%s[%s, %d, %s]", getClass().getSimpleName(), name, bytes, contents); 290 | } 291 | 292 | @Override 293 | public boolean equals(Object obj) { 294 | if (!(obj instanceof Bucket)) { 295 | return false; 296 | } 297 | 298 | Bucket other = (Bucket) obj; 299 | 300 | return name.equals(other.name) && bytes == other.bytes && contents.equals(other.contents); 301 | } 302 | 303 | @Override 304 | public int hashCode() { 305 | return name.hashCode(); 306 | } 307 | } 308 | 309 | private static final Comparator DESCENDING_SIZE = new Comparator() { 310 | @Override 311 | public int compare(HasSize o1, HasSize o2) { 312 | long l1 = o1.size(); 313 | long l2 = o2.size(); 314 | 315 | if (l1 < l2) { 316 | return 1; 317 | } 318 | 319 | if (l1 > l2) { 320 | return -1; 321 | } 322 | 323 | return 0; 324 | } 325 | }; 326 | 327 | interface HasSize { 328 | String id(); 329 | 330 | long size(); 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushPartitionerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the 
License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.equalTo; 19 | import static org.junit.Assert.assertThat; 20 | import static org.junit.Assert.fail; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.io.IntWritable; 27 | import org.apache.hadoop.io.SequenceFile; 28 | import org.apache.hadoop.io.SequenceFile.Writer; 29 | import org.apache.hadoop.io.Text; 30 | import org.apache.hadoop.mapred.JobConf; 31 | import org.junit.Before; 32 | import org.junit.Rule; 33 | import org.junit.Test; 34 | import org.junit.rules.TemporaryFolder; 35 | 36 | import com.m6d.filecrush.crush.CrushPartitioner; 37 | 38 | @SuppressWarnings("deprecation") 39 | public class CrushPartitionerTest { 40 | @Rule 41 | public final TemporaryFolder tmp = new TemporaryFolder(); 42 | 43 | private JobConf job; 44 | 45 | private FileSystem fs; 46 | 47 | private Path partitionMap; 48 | 49 | private CrushPartitioner partitioner; 50 | 51 | @Before 52 | public void setupPartitionMap() throws IOException { 53 | job = new JobConf(false); 54 | 55 | job.set("fs.default.name", "file:///"); 56 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 57 | job.set("crush.partition.map", tmp.getRoot().getAbsolutePath() + "/partition-map"); 58 | 59 | fs = FileSystem.get(job); 60 | 61 | partitionMap = new Path(tmp.getRoot().getAbsolutePath(), "partition-map"); 62 | 63 | partitioner = new CrushPartitioner(); 64 | } 65 | 66 | @Test 67 | public void partition() throws IOException { 68 | 69 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 70 | 71 | Text key = new Text(); 72 | IntWritable partNum = new IntWritable(); 73 | 74 | key.set("bucket-1"); 75 | partNum.set(0); 76 | writer.append(key, partNum); 77 | 78 | key.set("bucket-2"); 79 | partNum.set(0); 80 | writer.append(key, partNum); 81 | 82 | key.set("bucket-3"); 83 | partNum.set(1); 84 | writer.append(key, partNum); 85 | 86 | key.set("bucket-4"); 87 | partNum.set(2); 88 | writer.append(key, partNum); 89 | 90 | key.set("bucket-5"); 91 | partNum.set(2); 92 | writer.append(key, partNum); 93 | 94 | key.set("bucket-6"); 95 | partNum.set(2); 96 | writer.append(key, partNum); 97 | 98 | writer.close(); 99 | 100 | job.setNumReduceTasks(3); 101 | 102 | 103 | partitioner.configure(job); 104 | 105 | 106 | Text fileName = new Text(); 107 | 108 | key.set("bucket-1"); 109 | 110 | for (int file = 0; file < 4; file++) { 111 | fileName.set("file" + file); 112 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 113 | } 114 | 115 | 116 | key.set("bucket-2"); 117 | 118 | for (int file = 0; file < 4; file++) { 119 | fileName.set("file" + file); 120 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 121 | } 122 | 123 | 124 | key.set("bucket-3"); 125 | 126 | for (int file = 0; file < 4; file++) { 127 | fileName.set("file" + file); 128 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1)); 129 | } 130 | 131 | 132 | key.set("bucket-4"); 
133 | 134 | for (int file = 0; file < 4; file++) { 135 | fileName.set("file" + file); 136 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 137 | } 138 | 139 | 140 | key.set("bucket-5"); 141 | 142 | for (int file = 0; file < 4; file++) { 143 | fileName.set("file" + file); 144 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 145 | } 146 | 147 | 148 | key.set("bucket-6"); 149 | 150 | for (int file = 0; file < 4; file++) { 151 | fileName.set("file" + file); 152 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 153 | } 154 | } 155 | 156 | 157 | @Test 158 | public void partitionWithFewerPartitionsThanReduceTasks() throws IOException { 159 | 160 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 161 | 162 | Text key = new Text(); 163 | IntWritable partNum = new IntWritable(); 164 | 165 | key.set("bucket-1"); 166 | partNum.set(0); 167 | writer.append(key, partNum); 168 | 169 | key.set("bucket-2"); 170 | partNum.set(0); 171 | writer.append(key, partNum); 172 | 173 | key.set("bucket-3"); 174 | partNum.set(1); 175 | writer.append(key, partNum); 176 | 177 | key.set("bucket-4"); 178 | partNum.set(2); 179 | writer.append(key, partNum); 180 | 181 | key.set("bucket-5"); 182 | partNum.set(2); 183 | writer.append(key, partNum); 184 | 185 | key.set("bucket-6"); 186 | partNum.set(2); 187 | writer.append(key, partNum); 188 | 189 | writer.close(); 190 | 191 | job.setNumReduceTasks(40); 192 | 193 | 194 | partitioner.configure(job); 195 | 196 | 197 | Text fileName = new Text(); 198 | 199 | key.set("bucket-1"); 200 | 201 | for (int file = 0; file < 4; file++) { 202 | fileName.set("file" + file); 203 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 204 | } 205 | 206 | 207 | key.set("bucket-2"); 208 | 209 | for (int file = 0; file < 4; file++) { 210 | fileName.set("file" + file); 211 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 212 | } 213 | 214 | 215 | key.set("bucket-3"); 216 | 217 | for (int file = 0; file < 4; file++) { 218 | fileName.set("file" + file); 219 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1)); 220 | } 221 | 222 | 223 | key.set("bucket-4"); 224 | 225 | for (int file = 0; file < 4; file++) { 226 | fileName.set("file" + file); 227 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 228 | } 229 | 230 | 231 | key.set("bucket-5"); 232 | 233 | for (int file = 0; file < 4; file++) { 234 | fileName.set("file" + file); 235 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 236 | } 237 | 238 | 239 | key.set("bucket-6"); 240 | 241 | for (int file = 0; file < 4; file++) { 242 | fileName.set("file" + file); 243 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 244 | } 245 | } 246 | 247 | @Test 248 | public void noDupes() throws IOException { 249 | 250 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 251 | 252 | Text key = new Text(); 253 | IntWritable value = new IntWritable(); 254 | 255 | key.set("bucket-1"); 256 | value.set(0); 257 | writer.append(key, value); 258 | 259 | key.set("bucket-2"); 260 | value.set(0); 261 | writer.append(key, value); 262 | 263 | key.set("bucket-2"); 264 | value.set(1); 265 | writer.append(key, value); 266 | 267 | writer.close(); 268 | 269 | job.setNumReduceTasks(3); 270 | 271 | try { 272 | partitioner.configure(job); 273 | fail(); 274 | } catch (IllegalArgumentException e) { 275 | if 
(!e.getMessage().contains("bucket-2")) { 276 | throw e; 277 | } 278 | } 279 | } 280 | 281 | @Test 282 | public void partitionTooLow() throws IOException { 283 | 284 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 285 | 286 | Text key = new Text(); 287 | IntWritable partNum = new IntWritable(); 288 | 289 | key.set("bucket-1"); 290 | partNum.set(0); 291 | writer.append(key, partNum); 292 | 293 | key.set("bucket-2"); 294 | partNum.set(0); 295 | writer.append(key, partNum); 296 | 297 | key.set("bucket-4"); 298 | partNum.set(2); 299 | writer.append(key, partNum); 300 | 301 | key.set("bucket-5"); 302 | partNum.set(2); 303 | writer.append(key, partNum); 304 | 305 | key.set("bucket-6"); 306 | partNum.set(-1); 307 | writer.append(key, partNum); 308 | 309 | writer.close(); 310 | 311 | 312 | job.setNumReduceTasks(3); 313 | 314 | try { 315 | partitioner.configure(job); 316 | fail("No such thing as a negitave partition"); 317 | } catch (IllegalArgumentException e) { 318 | if (!e.getMessage().contains("Partition -1")) { 319 | throw e; 320 | } 321 | } 322 | } 323 | 324 | @Test 325 | public void partitionTooHigh() throws IOException { 326 | 327 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 328 | 329 | Text key = new Text(); 330 | IntWritable partNum = new IntWritable(); 331 | 332 | key.set("bucket-1"); 333 | partNum.set(0); 334 | writer.append(key, partNum); 335 | 336 | key.set("bucket-2"); 337 | partNum.set(0); 338 | writer.append(key, partNum); 339 | 340 | key.set("bucket-4"); 341 | partNum.set(2); 342 | writer.append(key, partNum); 343 | 344 | key.set("bucket-5"); 345 | partNum.set(2); 346 | writer.append(key, partNum); 347 | 348 | key.set("bucket-6"); 349 | partNum.set(3); 350 | writer.append(key, partNum); 351 | 352 | writer.close(); 353 | 354 | 355 | job.setNumReduceTasks(3); 356 | 357 | try { 358 | partitioner.configure(job); 359 | fail("Parition with id 3 is not allowed with 3 reduce tasks"); 360 | } catch (IllegalArgumentException e) { 361 | if (!e.getMessage().contains("Partition 3")) { 362 | throw e; 363 | } 364 | } 365 | } 366 | } 367 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/BucketerParameterizedTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.System.currentTimeMillis; 19 | import static java.util.Arrays.asList; 20 | import static java.util.Collections.emptyList; 21 | import static org.hamcrest.Matchers.equalTo; 22 | import static org.hamcrest.Matchers.nullValue; 23 | import static org.junit.Assert.assertThat; 24 | 25 | import java.util.ArrayList; 26 | import java.util.Collection; 27 | import java.util.Collections; 28 | import java.util.Comparator; 29 | import java.util.List; 30 | 31 | import org.apache.hadoop.fs.FileStatus; 32 | import org.apache.hadoop.fs.Path; 33 | import org.junit.Test; 34 | import org.junit.runner.RunWith; 35 | import org.junit.runners.Parameterized; 36 | import org.junit.runners.Parameterized.Parameters; 37 | 38 | import com.m6d.filecrush.crush.Bucketer; 39 | import com.m6d.filecrush.crush.FileStatusHasSize; 40 | import com.m6d.filecrush.crush.Bucketer.Bucket; 41 | 42 | 43 | /** 44 | * Block size 50 and threshold 75%. 45 | */ 46 | @RunWith(Parameterized.class) 47 | public class BucketerParameterizedTest { 48 | @Parameters 49 | public static Collection testCases() { 50 | List testCases = new ArrayList(); 51 | 52 | String dir; 53 | List input; 54 | List expected; 55 | 56 | /* 57 | * Three buckets of two each. 58 | * 59 | * 0 1 2 60 | * file3 37 file 2 20 file 4 19 61 | * file6 10 file 5 17 file 1 18 62 | */ 63 | dir = "three buckets of two each"; 64 | 65 | input = asList( statusFor("file1", 18), 66 | statusFor("file2", 20), 67 | statusFor("file3", 37), 68 | statusFor("file4", 19), 69 | statusFor("file5", 17), 70 | statusFor("file6", 10)); 71 | 72 | expected = asList(new Bucket("three buckets of two each-0", asList("file3", "file6"), 47), 73 | new Bucket("three buckets of two each-1", asList("file2", "file5"), 37), 74 | new Bucket("three buckets of two each-2", asList("file4", "file1"), 37)); 75 | 76 | testCases.add(new Object[] { dir, true, input, expected }); 77 | 78 | 79 | /* 80 | * Not enough data to fill all the buckets. Data should be packed into as few buckets as possible. 81 | */ 82 | dir = "not/enough/data/for/max/buckets"; 83 | 84 | input = asList( statusFor("file1", 1), 85 | statusFor("file2", 2), 86 | statusFor("file3", 3), 87 | statusFor("file4", 4), 88 | statusFor("file5", 5), 89 | statusFor("file6", 6)); 90 | 91 | expected = asList(new Bucket("not/enough/data/for/max/buckets-0", asList("file6", "file5", "file4", "file3", "file2", "file1"), 21)); 92 | 93 | testCases.add(new Object[] { dir, true, input, expected }); 94 | 95 | /* 96 | * A directory with one file should be ignored. 97 | */ 98 | dir = "dir/with/one/file"; 99 | 100 | input = asList(statusFor("loner", 1)); 101 | 102 | expected = emptyList(); 103 | 104 | testCases.add(new Object[] { dir, true, input, expected }); 105 | 106 | 107 | /* 108 | * Test case with enough data to fill up all the buckets but no one bucket is more than twice the bucket size. 
109 | * 110 | * 0 1 2 3 4 111 | * file 9 35 file 1 30 file 3 30 file 5 30 file 7 30 112 | * file 6 20 file 8 25 file 0 20 file 2 20 file 4 20 113 | * file 11 20 file 10 10 114 | */ 115 | dir = "enough/data/for/max/buckets"; 116 | 117 | input = asList( statusFor("file0", 20), 118 | statusFor("file1", 30), 119 | statusFor("file2", 20), 120 | statusFor("file3", 30), 121 | statusFor("file4", 20), 122 | statusFor("file5", 30), 123 | statusFor("file6", 20), 124 | statusFor("file7", 30), 125 | statusFor("file8", 25), 126 | statusFor("file9", 35), 127 | statusFor("file10", 10), 128 | statusFor("file11", 20)); 129 | 130 | expected = asList( 131 | new Bucket("enough/data/for/max/buckets-0", asList("file9", "file6"), 55), 132 | new Bucket("enough/data/for/max/buckets-1", asList("file1", "file8"), 55), 133 | new Bucket("enough/data/for/max/buckets-2", asList("file3", "file0", "file11"), 70), 134 | new Bucket("enough/data/for/max/buckets-3", asList("file5", "file2", "file10"), 60), 135 | new Bucket("enough/data/for/max/buckets-4", asList("file7", "file4"), 50)); 136 | 137 | testCases.add(new Object[] { dir, true, input, expected }); 138 | 139 | 140 | /* 141 | * Test case with enough data to fill up all the buckets with some of the buckets more than twice the bucket size. 142 | * 143 | * 1 2 3 4 5 144 | * file 0 35 file 2 35 file 4 35 file 6 35 file 8 35 145 | * file 10 35 file 12 35 file 14 35 file 1 30 file 3 30 146 | * file 9 30 file 11 30 file 13 30 file 5 30 file 7 30 147 | * file 15 30 file 16 20 148 | */ 149 | dir = "enough/data/for/max/buckets/and/big/buckets"; 150 | 151 | input = asList( statusFor("file0", 35), 152 | statusFor("file1", 30), 153 | statusFor("file2", 35), 154 | statusFor("file3", 30), 155 | statusFor("file4", 35), 156 | statusFor("file5", 30), 157 | statusFor("file6", 35), 158 | statusFor("file7", 30), 159 | statusFor("file8", 35), 160 | statusFor("file9", 30), 161 | statusFor("file10", 35), 162 | statusFor("file11", 30), 163 | statusFor("file12", 35), 164 | statusFor("file13", 30), 165 | statusFor("file14", 35), 166 | statusFor("file15", 30), 167 | statusFor("file16", 20)); 168 | 169 | expected = asList( 170 | new Bucket("enough/data/for/max/buckets/and/big/buckets-0", asList("file0", "file10", "file9"), 100), 171 | new Bucket("enough/data/for/max/buckets/and/big/buckets-1", asList("file2", "file12", "file11"), 100), 172 | new Bucket("enough/data/for/max/buckets/and/big/buckets-2", asList("file4", "file14", "file13"), 100), 173 | new Bucket("enough/data/for/max/buckets/and/big/buckets-3", asList("file6", "file1", "file5", "file15"), 125), 174 | new Bucket("enough/data/for/max/buckets/and/big/buckets-4", asList("file8", "file3", "file7", "file16"), 115)); 175 | 176 | testCases.add(new Object[] { dir, true, input, expected }); 177 | 178 | 179 | /* 180 | * Exactly enough data for five buckets of 50. 
181 | */ 182 | dir = "exactly/enough/data/for/max/buckets"; 183 | 184 | input = asList( statusFor("file0", 20), 185 | statusFor("file1", 30), 186 | statusFor("file2", 20), 187 | statusFor("file3", 30), 188 | statusFor("file4", 20), 189 | statusFor("file5", 30), 190 | statusFor("file6", 20), 191 | statusFor("file7", 30), 192 | statusFor("file8", 20), 193 | statusFor("file9", 30)); 194 | 195 | expected = asList( 196 | new Bucket("exactly/enough/data/for/max/buckets-0", asList("file1", "file0"), 50), 197 | new Bucket("exactly/enough/data/for/max/buckets-1", asList("file3", "file2"), 50), 198 | new Bucket("exactly/enough/data/for/max/buckets-2", asList("file5", "file4"), 50), 199 | new Bucket("exactly/enough/data/for/max/buckets-3", asList("file7", "file6"), 50), 200 | new Bucket("exactly/enough/data/for/max/buckets-4", asList("file9", "file8"), 50)); 201 | 202 | testCases.add(new Object[] { dir, true, input, expected }); 203 | 204 | 205 | /* 206 | * Exactly enough data for four buckets of 50. 207 | */ 208 | dir = "exactly/enough/data/for/four/buckets"; 209 | 210 | input = asList( statusFor("file0", 20), 211 | statusFor("file1", 30), 212 | statusFor("file2", 20), 213 | statusFor("file3", 30), 214 | statusFor("file4", 20), 215 | statusFor("file5", 30), 216 | statusFor("file6", 20), 217 | statusFor("file7", 30)); 218 | 219 | expected = asList( 220 | new Bucket("exactly/enough/data/for/four/buckets-0", asList("file1", "file0"), 50), 221 | new Bucket("exactly/enough/data/for/four/buckets-1", asList("file3", "file2"), 50), 222 | new Bucket("exactly/enough/data/for/four/buckets-2", asList("file5", "file4"), 50), 223 | new Bucket("exactly/enough/data/for/four/buckets-3", asList("file7", "file6"), 50)); 224 | 225 | testCases.add(new Object[] { dir, true, input, expected }); 226 | 227 | 228 | /* 229 | * Buckets that end up with one file are ignored. 230 | * 231 | * 0 1 232 | * file 3 35 file 2 30 233 | * file 1 25 234 | * 235 | * What would have been bucket 0 is dropped since it has only one file in it. 236 | */ 237 | dir = "buckets/with/one/file/are/ignored"; 238 | 239 | input = asList( statusFor("file1", 25), 240 | statusFor("file2", 30), 241 | statusFor("file3", 35)); 242 | 243 | expected = asList(new Bucket("buckets/with/one/file/are/ignored-1", asList("file2", "file1"), 55)); 244 | 245 | testCases.add(new Object[] { dir, true, input, expected }); 246 | 247 | 248 | /* 249 | * Set the flag so that single item buckets are returned. 250 | * 251 | * 0 1 252 | * file 3 35 file 2 30 253 | * file 1 25 254 | * 255 | * What would have been bucket 0 is dropped since it has only one file in it. 
256 | */ 257 | dir = "include/buckets/with/one/file"; 258 | 259 | input = asList( statusFor("file1", 25), 260 | statusFor("file2", 30), 261 | statusFor("file3", 35)); 262 | 263 | expected = asList( 264 | new Bucket("include/buckets/with/one/file-0", asList("file3"), 35), 265 | new Bucket("include/buckets/with/one/file-1", asList("file2", "file1"), 55)); 266 | 267 | testCases.add(new Object[] { dir, false, input, expected }); 268 | 269 | return testCases; 270 | } 271 | 272 | private final Bucketer bucketer; 273 | 274 | private final String dir; 275 | 276 | private final List input; 277 | 278 | private final List expected; 279 | 280 | public BucketerParameterizedTest(String dir, boolean excludeSingleItemBuckets, List input, List expected) { 281 | super(); 282 | 283 | this.dir = dir; 284 | this.input = input; 285 | this.expected = expected; 286 | 287 | this.bucketer = new Bucketer(5, 50, excludeSingleItemBuckets); 288 | } 289 | 290 | @Test 291 | public void test() { 292 | bucketer.reset(dir); 293 | 294 | for (int i = 0; i < input.size(); i++) { 295 | FileStatus file = input.get(i); 296 | 297 | bucketer.add(new FileStatusHasSize(file)); 298 | 299 | assertThat(dir, bucketer.count(), equalTo(i + 1)); 300 | } 301 | 302 | List actual = bucketer.createBuckets(); 303 | 304 | Collections.sort(expected, BUCKET_CMP); 305 | Collections.sort(actual, BUCKET_CMP); 306 | 307 | assertThat(dir, actual, equalTo(expected)); 308 | 309 | assertThat(dir, bucketer.count(), equalTo(0)); 310 | assertThat(dir, bucketer.dir(), nullValue()); 311 | assertThat(dir, bucketer.size(), equalTo(0L)); 312 | } 313 | 314 | private static FileStatus statusFor(String path, long size) { 315 | return new FileStatus(size, false, 3, 1024, currentTimeMillis(), new Path(path)); 316 | } 317 | 318 | private static final Comparator BUCKET_CMP = new Comparator() { 319 | @Override 320 | public int compare(Bucket o1, Bucket o2) { 321 | return o1.name().compareTo(o2.name()); 322 | } 323 | }; 324 | } 325 | -------------------------------------------------------------------------------- /src/test/resources/help.txt: -------------------------------------------------------------------------------- 1 | Crush 2 | 3 | NAME 4 | 5 | Crush - Crush small files in dfs to fewer, larger files 6 | 7 | SYNOPSIS 8 | Crush [OPTION]... 9 | 10 | DESCRIPTION 11 | 12 | Crush consumes directories containing many small files with the same key and value types and creates fewer, larger files containing the same data. Crush is gives you the control to: 13 | 14 | * Name the output files 15 | * Ignore files that are "big enough" 16 | * Limit the size of each output file 17 | * Control the output compression codec 18 | * Swap smaller files with generated large files in-place 19 | * No long-running task problem 20 | 21 | See the EXAMPLES section 22 | 23 | ARGUMENTS 24 | 25 | input dir 26 | The root of the directory tree to crush. Directories are found recursively. 27 | 28 | output dir 29 | In non-clone mode, the directory where the output files should be written. In clone mode, the directory where the original files (that were combine into larger files) should be moved. 30 | 31 | timestamp 32 | A 14 digit job timestamp used to uniquely name files. E.g. 20100221175612. Generate in a script with: date +%Y%m%d%H%M%S 33 | 34 | GLOBAL OPTIONS 35 | 36 | -?, --help 37 | Print this help message. 38 | 39 | --threshold 40 | Percent threshold relative to the dfs block size over which a file becomes eligible for crushing. Must be in the (0, 1]. 
Default is 0.75, which means files smaller than or equal to 75% of a dfs block will be eligible for crushing. File greater than 75% of a dfs block will be left untouched. 41 | 42 | --max-file-blocks 43 | The maximum number of dfs blocks per output file. Must be a positive integer. Small input files are associated with an output file under the assumption that input and output compression codecs have similar efficiency. Also, a directory containing a lot of data in many small files will be converted into a directory containing a fewer number of large files rather than one super-massive file. With the default value 8, 80 small files, each being 1/10th of a dfs block will be grouped into to a single output file since 8 * 1/10 = 8 dfs blocks. If there are 81 small files, each being 1/10th of a dfs block, two output files will be created. One output file contain the combined contents of 41 files and the second will contain the combined contents of the other 40. A directory of many small files will be converted into fewer number of larger files where each output file is roughly the same size. 44 | 45 | --compress 46 | Fully qualified class name of the compression codec to use when writing data. It is permissible to use "none" and "gzip" to indicate no compression and org.apache.hadoop.io.compress.GzipCodec, respectively. 47 | 48 | --clone 49 | Use clone mode. Useful for external Hive tables. In clone mode, the small files are replaced with the larger files. The small files are moved to a subdirectory of the output dir argument. The subdirectory is same as the original directory rooted at output dir. For example, assume the input dir argument and output dir argument are /user/example/input and /user/example/output, respectively. If a file was originally /user/example/input/my-dir/smallfile, then after the clone, the original file would be located in /user/example/output/user/example/input/my-dir/smallfile. 50 | 51 | --info 52 | Print information to the console about what the crush is doing. 53 | 54 | --verbose 55 | Print even more information to the console about what the crush is doing. 56 | 57 | DIRECTORY OPTIONS 58 | 59 | If specified, these options must be appear as a group. When specifying multiple groups of these options, order matters. Defaults for directory options are not used if any are specified. See the EXAMPLES section. 60 | 61 | --regex 62 | Regular expression that matches a directory name. Defaults to .+ if no directory options are specified at all. Empty directories are not required to have a matching regex. Conceptually similar to the first argument of String.replaceAll(). 63 | 64 | --replacement 65 | Replacement string used with corresponding regex to name output files. Defaults to crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num} if no directory options are specified at all. The placeholder ${crush.timestamp} refers to the command line argument. ${crush.task.num} refers to the reducer number. ${crush.file.num} is a zero-based count of files producer by a specific reducer. The first file written by a reducer will have ${crush.file.num} = 0, the second = 1, the third = 2, etc. Conceptually similar to the second argument of String.replaceAll(). 66 | 67 | --input-format 68 | Fully qualified class name of the input format for the data in a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextInputFormat and org.apache.hadoop.mapred.SequenceFileInputFormat, respectively. Defaults to sequence if no directory options are specified. 
69 | 70 | --output-format 71 | Fully qualified class name of the output format to use when writing the output file for a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextOutputFormat and org.apache.hadoop.mapred.SequenceFileOutputFormat, respectively. Defaults to sequence if no directory options are specified. 72 | 73 | EXAMPLES 74 | 75 | Say we have the following files: 76 | 77 | /user/example/work/input/ 78 | small-file1 79 | small-file2 80 | small-file3 81 | small-file4 82 | big-enough-file 83 | subdir/ 84 | small-file6 85 | small-file7 86 | small-file8 87 | medium-file1 88 | medium-file2 89 | 90 | And we invoke the crush like this: 91 | 92 | Crush /user/example/work/input /user/example/work/output 20100221175612 93 | 94 | Since we have not specified any of the directory options, the default regex, replacement, input-format, and output-format are used. We will get: 95 | 96 | /user/example/work/ 97 | input/ 98 | small-file1 99 | small-file2 100 | small-file3 101 | small-file4 102 | subdir/ 103 | small-file6 104 | small-file7 105 | small-file8 106 | medium-file1 107 | medium-file2 108 | output/ 109 | crushed_file-20100221175612-0-0 110 | big-enough-file 111 | subdir/ 112 | crushed_file-20100221175612-1-0 113 | crushed_file-20100221175612-1-1 114 | 115 | Where: 116 | 117 | crushed_file-20100221175612-0-0 = small-file1 + small-file2 + small-file3 + small-file4 118 | 119 | crushed_file-20100221175612-1-0 = medium-file1 + small-file6 + small-file8 120 | 121 | crushed_file-20100221175612-1-1 = medium-file2 + small-file7 122 | 123 | Notice how big-enough-file was moved to the output directory. The input directory contains only the files that were combined into the larger files. 124 | 125 | By default, the output file names end with two numbers. The first number is the task number of the reducer that wrote the file. The second number is the zero-based file count of that specific reducer. So a file ending with 0-0 was produced by reducer 0 and was the first file written by that reducer. A file ending 0-1 is the second file written by that reducer. A file ending 1-0 was produced by reducer 1 and was the first file written by that reducer. In the example, notice how the directory subdir was converted into two files. If mapred.reduce.tasks permits, multiple reducers can cooperate to crush a large directory. 126 | 127 | Now a clone example. Say we invoked the crush like this: 128 | 129 | Crush --clone /user/example/work/input /user/example/clone 20100221175612 130 | 131 | With the clone option. We would end up with: 132 | 133 | /user/example/ 134 | work/input/ 135 | crushed_file-20100221175612-0-0 136 | big-enough-file 137 | subdir/ 138 | crushed_file-20100221175612-1-0 139 | crushed_file-20100221175612-1-1 140 | clone/user/example/input/ 141 | small-file1 142 | small-file2 143 | small-file3 144 | small-file4 145 | subdir/ 146 | small-file6 147 | small-file7 148 | small-file8 149 | medium-file1 150 | medium-file2 151 | 152 | Note how the original directory structure of /user/example/input as it appeared before the crush is reproduced in /user/example/clone. The small files that were combined are moved to the clone directory while the output files and file that were "big enough" are now in the inpu directory. Clone mode is useful for crushing external Hive tables. Just make sure that there are no Hive queries running on the table because they will fail when the small files are moved to the clone directory. 
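Putting the pieces together, a small wrapper script might generate the timestamp inline. This is only a sketch that combines the date command shown in the ARGUMENTS section with the clone invocation above; the TIMESTAMP variable name is illustrative and not part of the tool:

  TIMESTAMP=`date +%Y%m%d%H%M%S`
  Crush --clone /user/example/work/input /user/example/clone $TIMESTAMP
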
153 | 154 | Now we try an example using the directory options. Say we invoke the crush like this to control the output file names: 155 | 156 | Crush \ 157 | --regex=.*/(.+) \ 158 | --replacement=$1-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 159 | --input=sequence \ 160 | --output=sequence \ 161 | /user/example/work/input /user/example/work/output 20100221175612 162 | 163 | The --regex and --replacement arguments are similar to the arguments passed to String.replaceAll(). The regex argument matches the final part of a directory path. For /user/example/work/input, it will match input. For /user/example/work/input/subdir, it will match subdir. For matching purposes, a directory path does not have a trailing slash. The replacement argument refers to the match group by number to rename the file. The result is: 164 | 165 | /user/example/work/output/ 166 | input-20100221175612-0-0 167 | big-enough-file 168 | subdir/ 169 | subdir-20100221175612-1-0 170 | subdir-20100221175612-1-1 171 | 172 | The regex and replacement options are useful for naming the output files when crushing external Hive tables that are partitioned into directories whose names have business significance. 173 | 174 | The following invocation fails: 175 | 176 | Crush \ 177 | --regex=.*/input \ 178 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 179 | --input=sequence \ 180 | --output=sequence \ 181 | /user/example/work/input /user/example/work/output 20100221175612 182 | 183 | Since we have specified some directory options, we must ensure that all directories in hierarchy rooted at the input argument have a matching regex (since the default regex is no longer applicable). In this invocation, there is no regex argument that matches /user/example/work/input/subdir. We must change it to: 184 | 185 | Crush \ 186 | --regex=.*/input \ 187 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 188 | --input=sequence \ 189 | --output=sequence \ 190 | --regex=.*/subdir \ 191 | --replacement=as-text-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 192 | --input=sequence \ 193 | --output=text \ 194 | /user/example/work/input /user/example/work/output 20100221175612 195 | 196 | This will yield: 197 | 198 | /user/example/work/output/ 199 | input-20100221175612-0-0 200 | big-enough-file 201 | subdir/ 202 | as-text-20100221175612-1-0 203 | as-text-20100221175612-1-1 204 | 205 | Notice subdir has two files whose names differ only by the ${crush.file.num} value. Without the ${crush.file.num}, file names are not guaranteed to be unique. 206 | 207 | NOTES 208 | 209 | This program creates a temporary directories in "tmp" of the executing user's home directory in dfs. 210 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Hadoop filecrusher. 2 | 3 | Turn many small files into fewer larger ones. Also change from text to sequence and other compression options in one pass. 4 | Crush 5 | 6 | NAME 7 | 8 | Crush - Crush small files in dfs to fewer, larger files 9 | 10 | SYNOPSIS 11 | Crush [OPTION]... 12 | 13 | DESCRIPTION 14 | 15 | Crush consumes directories containing many small files with the same key and value types and creates fewer, larger files containing the same data. 
Crush is gives you the control to: 16 | 17 | * Name the output files 18 | * Ignore files that are "big enough" 19 | * Limit the size of each output file 20 | * Control the output compression codec 21 | * Swap smaller files with generated large files in-place 22 | * No long-running task problem 23 | 24 | See the EXAMPLES section 25 | 26 | ARGUMENTS 27 | 28 | input dir 29 | The root of the directory tree to crush. Directories are found recursively. 30 | 31 | output dir 32 | In non-clone mode, the directory where the output files should be written. In clone mode, the directory where the original files (that were combine into larger files) should be moved. 33 | 34 | timestamp 35 | A 14 digit job timestamp used to uniquely name files. E.g. 20100221175612. Generate in a script with: date +%Y%m%d%H%M%S 36 | 37 | GLOBAL OPTIONS 38 | 39 | -?, --help 40 | Print this help message. 41 | 42 | --threshold 43 | Percent threshold relative to the dfs block size over which a file becomes eligible for crushing. Must be in the (0, 1]. Default is 0.75, which means files smaller than or equal to 75% of a dfs block will be eligible for crushing. File greater than 75% of a dfs block will be left untouched. 44 | 45 | --max-file-blocks 46 | The maximum number of dfs blocks per output file. Must be a positive integer. Small input files are associated with an output file under the assumption that input and output compression codecs have similar efficiency. Also, a directory containing a lot of data in many small files will be converted into a directory containing a fewer number of large files rather than one super-massive file. With the default value 8, 80 small files, each being 1/10th of a dfs block will be grouped into to a single output file since 8 * 1/10 = 8 dfs blocks. If there are 81 small files, each being 1/10th of a dfs block, two output files will be created. One output file contain the combined contents of 41 files and the second will contain the combined contents of the other 40. A directory of many small files will be converted into fewer number of larger files where each output file is roughly the same size. 47 | 48 | --compress 49 | Fully qualified class name of the compression codec to use when writing data. It is permissible to use "none" and "gzip" to indicate no compression and org.apache.hadoop.io.compress.GzipCodec, respectively. 50 | 51 | --clone 52 | Use clone mode. Useful for external Hive tables. In clone mode, the small files are replaced with the larger files. The small files are moved to a subdirectory of the output dir argument. The subdirectory is same as the original directory rooted at output dir. For example, assume the input dir argument and output dir argument are /user/example/input and /user/example/output, respectively. If a file was originally /user/example/input/my-dir/smallfile, then after the clone, the original file would be located in /user/example/output/user/example/input/my-dir/smallfile. 53 | 54 | --info 55 | Print information to the console about what the crush is doing. 56 | 57 | --verbose 58 | Print even more information to the console about what the crush is doing. 59 | 60 | DIRECTORY OPTIONS 61 | 62 | If specified, these options must be appear as a group. When specifying multiple groups of these options, order matters. Defaults for directory options are not used if any are specified. See the EXAMPLES section. 63 | 64 | --regex 65 | Regular expression that matches a directory name. Defaults to .+ if no directory options are specified at all. 
Empty directories are not required to have a matching regex. Conceptually similar to the first argument of String.replaceAll(). 66 | 67 | --replacement 68 | Replacement string used with corresponding regex to name output files. Defaults to crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num} if no directory options are specified at all. The placeholder ${crush.timestamp} refers to the command line argument. ${crush.task.num} refers to the reducer number. ${crush.file.num} is a zero-based count of files producer by a specific reducer. The first file written by a reducer will have ${crush.file.num} = 0, the second = 1, the third = 2, etc. Conceptually similar to the second argument of String.replaceAll(). 69 | 70 | --input-format 71 | Fully qualified class name of the input format for the data in a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextInputFormat and org.apache.hadoop.mapred.SequenceFileInputFormat, respectively. Defaults to sequence if no directory options are specified. 72 | 73 | --output-format 74 | Fully qualified class name of the output format to use when writing the output file for a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextOutputFormat and org.apache.hadoop.mapred.SequenceFileOutputFormat, respectively. Defaults to sequence if no directory options are specified. 75 | 76 | EXAMPLES 77 | 78 | Say we have the following files: 79 | 80 | /user/example/work/input/ 81 | small-file1 82 | small-file2 83 | small-file3 84 | small-file4 85 | big-enough-file 86 | subdir/ 87 | small-file6 88 | small-file7 89 | small-file8 90 | medium-file1 91 | medium-file2 92 | 93 | And we invoke the crush like this: 94 | 95 | Crush /user/example/work/input /user/example/work/output 20100221175612 96 | 97 | Since we have not specified any of the directory options, the default regex, replacement, input-format, and output-format are used. We will get: 98 | 99 | /user/example/work/ 100 | input/ 101 | small-file1 102 | small-file2 103 | small-file3 104 | small-file4 105 | subdir/ 106 | small-file6 107 | small-file7 108 | small-file8 109 | medium-file1 110 | medium-file2 111 | output/ 112 | crushed_file-20100221175612-0-0 113 | big-enough-file 114 | subdir/ 115 | crushed_file-20100221175612-1-0 116 | crushed_file-20100221175612-1-1 117 | 118 | Where: 119 | 120 | crushed_file-20100221175612-0-0 = small-file1 + small-file2 + small-file3 + small-file4 121 | 122 | crushed_file-20100221175612-1-0 = medium-file1 + small-file6 + small-file8 123 | 124 | crushed_file-20100221175612-1-1 = medium-file2 + small-file7 125 | 126 | Notice how big-enough-file was moved to the output directory. The input directory contains only the files that were combined into the larger files. 127 | 128 | By default, the output file names end with two numbers. The first number is the task number of the reducer that wrote the file. The second number is the zero-based file count of that specific reducer. So a file ending with 0-0 was produced by reducer 0 and was the first file written by that reducer. A file ending 0-1 is the second file written by that reducer. A file ending 1-0 was produced by reducer 1 and was the first file written by that reducer. In the example, notice how the directory subdir was converted into two files. If mapred.reduce.tasks permits, multiple reducers can cooperate to crush a large directory. 129 | 130 | Now a clone example. 
Say we invoked the crush like this: 131 | 132 | Crush --clone /user/example/work/input /user/example/clone 20100221175612 133 | 134 | With the clone option. We would end up with: 135 | 136 | /user/example/ 137 | work/input/ 138 | crushed_file-20100221175612-0-0 139 | big-enough-file 140 | subdir/ 141 | crushed_file-20100221175612-1-0 142 | crushed_file-20100221175612-1-1 143 | clone/user/example/input/ 144 | small-file1 145 | small-file2 146 | small-file3 147 | small-file4 148 | subdir/ 149 | small-file6 150 | small-file7 151 | small-file8 152 | medium-file1 153 | medium-file2 154 | 155 | Note how the original directory structure of /user/example/input as it appeared before the crush is reproduced in /user/example/clone. The small files that were combined are moved to the clone directory while the output files and file that were "big enough" are now in the inpu directory. Clone mode is useful for crushing external Hive tables. Just make sure that there are no Hive queries running on the table because they will fail when the small files are moved to the clone directory. 156 | 157 | Now we try an example using the directory options. Say we invoke the crush like this to control the output file names: 158 | 159 | Crush \ 160 | --regex=.*/(.+) \ 161 | --replacement=$1-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 162 | --input=sequence \ 163 | --output=sequence \ 164 | /user/example/work/input /user/example/work/output 20100221175612 165 | 166 | The --regex and --replacement arguments are similar to the arguments passed to String.replaceAll(). The regex argument matches the final part of a directory path. For /user/example/work/input, it will match input. For /user/example/work/input/subdir, it will match subdir. For matching purposes, a directory path does not have a trailing slash. The replacement argument refers to the match group by number to rename the file. The result is: 167 | 168 | /user/example/work/output/ 169 | input-20100221175612-0-0 170 | big-enough-file 171 | subdir/ 172 | subdir-20100221175612-1-0 173 | subdir-20100221175612-1-1 174 | 175 | The regex and replacement options are useful for naming the output files when crushing external Hive tables that are partitioned into directories whose names have business significance. 176 | 177 | The following invocation fails: 178 | 179 | Crush \ 180 | --regex=.*/input \ 181 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 182 | --input=sequence \ 183 | --output=sequence \ 184 | /user/example/work/input /user/example/work/output 20100221175612 185 | 186 | Since we have specified some directory options, we must ensure that all directories in hierarchy rooted at the input argument have a matching regex (since the default regex is no longer applicable). In this invocation, there is no regex argument that matches /user/example/work/input/subdir. 
We must change it to: 187 | 188 | Crush \ 189 | --regex=.*/input \ 190 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 191 | --input=sequence \ 192 | --output=sequence \ 193 | --regex=.*/subdir \ 194 | --replacement=as-text-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 195 | --input=sequence \ 196 | --output=text \ 197 | /user/example/work/input /user/example/work/output 20100221175612 198 | 199 | This will yield: 200 | 201 | /user/example/work/output/ 202 | input-20100221175612-0-0 203 | big-enough-file 204 | subdir/ 205 | as-text-20100221175612-1-0 206 | as-text-20100221175612-1-1 207 | 208 | Notice subdir has two files whose names differ only by the ${crush.file.num} value. Without the ${crush.file.num}, file names are not guaranteed to be unique. 209 | 210 | NOTES 211 | 212 | This program creates a temporary directories in "tmp" of the executing user's home directory in dfs. 213 | 214 | https://zenodo.org/badge/doi/10.5281/zenodo.11038.png 215 | 216 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushReducerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.util.Arrays.asList; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.junit.Assert.assertThat; 21 | import static org.junit.Assert.fail; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.util.Arrays; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 30 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 31 | import org.apache.hadoop.mapred.TextInputFormat; 32 | import org.apache.hadoop.mapred.TextOutputFormat; 33 | import org.junit.Before; 34 | import org.junit.Rule; 35 | import org.junit.Test; 36 | import org.junit.rules.TemporaryFolder; 37 | 38 | import com.m6d.filecrush.crush.CrushReducer; 39 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat; 40 | 41 | @SuppressWarnings("deprecation") 42 | public class CrushReducerTest { 43 | 44 | @Rule 45 | public final TemporaryFolder tmp = new TemporaryFolder(); 46 | 47 | private File outDir; 48 | 49 | private CrushReducer reducer; 50 | 51 | @Before 52 | public void setupReducer() { 53 | JobConf job = new JobConf(false); 54 | 55 | job.set("mapred.tip.id", "task_201011081200_014527_r_001234"); 56 | job.set("mapred.task.id", "attempt_201011081200_14527_r_001234_0"); 57 | 58 | outDir = tmp.newFolder("out"); 59 | tmp.newFolder("out/_temporary"); 60 | 61 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 62 | 63 | job.set("fs.default.name", "file:///"); 64 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 65 | 66 | job.setLong("crush.timestamp", 98765); 67 | 68 | job.setInt("crush.num.specs", 3); 69 | job.set("crush.0.regex", ".+/dir"); 70 | job.set("crush.0.regex.replacement", "firstregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 71 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 72 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 73 | 74 | job.set("crush.1.regex", ".+/dir/([^/]+/)*(.+)"); 75 | job.set("crush.1.regex.replacement", "secondregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 76 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 77 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 78 | 79 | job.set("crush.2.regex", ".+/other"); 80 | job.set("crush.2.regex.replacement", "${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail"); 81 | job.set("crush.2.input.format", TextInputFormat.class.getName()); 82 | job.set("crush.2.output.format", SequenceFileOutputFormat.class.getName()); 83 | 84 | reducer = new CrushReducer(); 85 | 86 | reducer.configure(job); 87 | } 88 | 89 | @Test 90 | public void taskNum() { 91 | assertThat("task_201011081200_14527_r_1234 => 1234", reducer.getTaskNum(), equalTo(1234)); 92 | } 93 | 94 | @Test 95 | public void timestamp() { 96 | assertThat(reducer.getTimestamp(), equalTo(98765L)); 97 | } 98 | 99 | @Test 100 | public void inputRegexList() { 101 | assertThat(reducer.getInputRegexList(), equalTo(asList(".+/dir", ".+/dir/([^/]+/)*(.+)", ".+/other"))); 102 | } 103 | 104 | @Test 105 | public void outputReplacementList() { 106 | /* 107 | * Job configuration already performs some token substitution. 
108 | */ 109 | assertThat(reducer.getOutputReplacementList(), equalTo(asList("firstregex-98765-${crush.task.num}-${crush.file.num}", 110 | "secondregex-$2-98765-${crush.task.num}-${crush.file.num}", 111 | "98765-${crush.task.num}-middle-${crush.file.num}-tail"))); 112 | } 113 | 114 | @Test 115 | public void inputFormatList() { 116 | assertThat(reducer.getInputFormatList(), equalTo(Arrays.> asList(SequenceFileInputFormat.class, 117 | KeyValuePreservingTextInputFormat.class, 118 | KeyValuePreservingTextInputFormat.class))); 119 | } 120 | 121 | @Test 122 | public void outputFormatList() { 123 | assertThat(reducer.getOutputFormatList(), equalTo(Arrays.> asList( TextOutputFormat.class, 124 | TextOutputFormat.class, 125 | SequenceFileOutputFormat.class))); 126 | } 127 | 128 | @Test 129 | public void calculateOutputfile() { 130 | assertThat(reducer.findMatcher("/path/to/a/dir"), equalTo(0)); 131 | assertThat(reducer.calculateOutputFile(0, "/path/to/a/dir"), equalTo("/path/to/a/dir/firstregex-98765-1234-0")); 132 | 133 | assertThat(reducer.findMatcher("/path/to/a/dir/foo/dir"), equalTo(0)); 134 | assertThat(reducer.calculateOutputFile(0, "/path/to/a/dir/foo/dir"), equalTo("/path/to/a/dir/foo/dir/firstregex-98765-1234-1")); 135 | 136 | assertThat(reducer.findMatcher("/path/to/a/dir/subdir"), equalTo(1)); 137 | assertThat(reducer.calculateOutputFile(1, "/path/to/a/dir/subdir"), equalTo("/path/to/a/dir/subdir/secondregex-subdir-98765-1234-2")); 138 | 139 | assertThat(reducer.findMatcher("/x/dir/foo/bar"), equalTo(1)); 140 | assertThat(reducer.calculateOutputFile(1, "/x/dir/foo/bar"), equalTo("/x/dir/foo/bar/secondregex-bar-98765-1234-3")); 141 | 142 | assertThat(reducer.findMatcher("/x/other"), equalTo(2)); 143 | assertThat(reducer.calculateOutputFile(2, "/x/other"), equalTo("/x/other/98765-1234-middle-4-tail")); 144 | 145 | assertThat(reducer.findMatcher("/x/foo/other"), equalTo(2)); 146 | assertThat(reducer.calculateOutputFile(2, "/x/foo/other"), equalTo("/x/foo/other/98765-1234-middle-5-tail")); 147 | } 148 | 149 | @Test 150 | public void fileNotFound() throws IOException { 151 | try { 152 | reducer.reduce(new Text("/path/to/a/dir-4"), asList(new Text("/file/does/not/exist")).iterator(), null, null); 153 | fail(); 154 | } catch (IOException e) { 155 | if (!e.getMessage().contains("/file/does/not/exist")) { 156 | throw e; 157 | } 158 | } 159 | } 160 | 161 | @Test(expected = IllegalArgumentException.class) 162 | public void noMatchingInputPattern() { 163 | reducer.findMatcher("nothing matches me"); 164 | } 165 | 166 | @Test 167 | public void missingInputRegex() { 168 | JobConf job = new JobConf(false); 169 | 170 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 171 | 172 | job.set("fs.default.name", "file:///"); 173 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 174 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 175 | 176 | job.setLong("crush.timestamp", 98765); 177 | 178 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 179 | 180 | job.setInt("crush.num.specs", 2); 181 | job.set("crush.0.regex", "foo"); 182 | job.set("crush.0.regex.replacement", "bar"); 183 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 184 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 185 | 186 | job.set("crush.1.regex.replacement", "bar"); 187 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 188 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 189 | 190 | reducer = new CrushReducer(); 
191 | 192 | try { 193 | reducer.configure(job); 194 | fail(); 195 | } catch (IllegalArgumentException e) { 196 | if (!"No input regex: crush.1.regex".equals(e.getMessage())) { 197 | throw e; 198 | } 199 | } 200 | } 201 | 202 | @Test 203 | public void missingOutputRegex() { 204 | JobConf job = new JobConf(false); 205 | 206 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 207 | 208 | job.set("fs.default.name", "file:///"); 209 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 210 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 211 | 212 | job.setLong("crush.timestamp", 98765); 213 | 214 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 215 | 216 | job.setInt("crush.num.specs", 2); 217 | job.set("crush.0.regex", "foo"); 218 | job.set("crush.0.regex.replacement", "bar"); 219 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 220 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 221 | 222 | job.set("crush.1.regex", "hello"); 223 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 224 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 225 | 226 | reducer = new CrushReducer(); 227 | 228 | try { 229 | reducer.configure(job); 230 | fail(); 231 | } catch (IllegalArgumentException e) { 232 | if (!"No output replacement: crush.1.regex.replacement".equals(e.getMessage())) { 233 | throw e; 234 | } 235 | } 236 | } 237 | 238 | @Test 239 | public void missingInputFormat() { 240 | JobConf job = new JobConf(false); 241 | 242 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 243 | 244 | job.set("fs.default.name", "file:///"); 245 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 246 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 247 | 248 | job.setLong("crush.timestamp", 98765); 249 | 250 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 251 | 252 | job.setInt("crush.num.specs", 2); 253 | job.set("crush.0.regex", "foo"); 254 | job.set("crush.0.regex.replacement", "bar"); 255 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 256 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 257 | 258 | job.set("crush.1.regex", "hello"); 259 | job.set("crush.1.regex.replacement", "hello"); 260 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 261 | 262 | reducer = new CrushReducer(); 263 | 264 | try { 265 | reducer.configure(job); 266 | fail(); 267 | } catch (IllegalArgumentException e) { 268 | if (!"No input format: crush.1.input.format".equals(e.getMessage())) { 269 | throw e; 270 | } 271 | } 272 | } 273 | 274 | @Test 275 | public void inputFormatWrongType() { 276 | JobConf job = new JobConf(false); 277 | 278 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 279 | 280 | job.set("fs.default.name", "file:///"); 281 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 282 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 283 | 284 | job.setLong("crush.timestamp", 98765); 285 | 286 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 287 | 288 | job.setInt("crush.num.specs", 2); 289 | job.set("crush.0.regex", "foo"); 290 | job.set("crush.0.regex.replacement", "bar"); 291 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 292 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 293 | 294 | job.set("crush.1.regex", "hello"); 295 | job.set("crush.1.regex.replacement", "hello"); 296 | 
job.set("crush.1.input.format", Object.class.getName()); 297 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 298 | 299 | reducer = new CrushReducer(); 300 | 301 | try { 302 | reducer.configure(job); 303 | fail(); 304 | } catch (IllegalArgumentException e) { 305 | if (!"Not a file input format: crush.1.input.format=java.lang.Object".equals(e.getMessage())) { 306 | throw e; 307 | } 308 | } 309 | } 310 | 311 | @Test 312 | public void missingOutputFormat() { 313 | JobConf job = new JobConf(false); 314 | 315 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 316 | 317 | job.set("fs.default.name", "file:///"); 318 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 319 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 320 | 321 | job.setLong("crush.timestamp", 98765); 322 | 323 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 324 | 325 | job.setInt("crush.num.specs", 2); 326 | job.set("crush.0.regex", "foo"); 327 | job.set("crush.0.regex.replacement", "bar"); 328 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 329 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 330 | 331 | job.set("crush.1.regex", "hello"); 332 | job.set("crush.1.regex.replacement", "hello"); 333 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 334 | 335 | reducer = new CrushReducer(); 336 | 337 | try { 338 | reducer.configure(job); 339 | fail(); 340 | } catch (IllegalArgumentException e) { 341 | if (!"No output format: crush.1.output.format".equals(e.getMessage())) { 342 | throw e; 343 | } 344 | } 345 | } 346 | 347 | @Test 348 | public void outputFormatWrongType() { 349 | JobConf job = new JobConf(false); 350 | 351 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 352 | 353 | job.set("fs.default.name", "file:///"); 354 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 355 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 356 | 357 | job.setLong("crush.timestamp", 98765); 358 | 359 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 360 | 361 | job.setInt("crush.num.specs", 2); 362 | job.set("crush.0.regex", "foo"); 363 | job.set("crush.0.regex.replacement", "bar"); 364 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 365 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 366 | 367 | job.set("crush.1.regex", "hello"); 368 | job.set("crush.1.regex.replacement", "hello"); 369 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 370 | job.set("crush.1.output.format", Object.class.getName()); 371 | 372 | reducer = new CrushReducer(); 373 | 374 | try { 375 | reducer.configure(job); 376 | fail(); 377 | } catch (IllegalArgumentException e) { 378 | if (!"Not an output format: crush.1.output.format=java.lang.Object".equals(e.getMessage())) { 379 | throw e; 380 | } 381 | } 382 | } 383 | } 384 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CrushReducer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.HashMap; 23 | import java.util.Iterator; 24 | import java.util.List; 25 | import java.util.Map; 26 | import java.util.regex.Matcher; 27 | import java.util.regex.Pattern; 28 | 29 | import org.apache.commons.logging.Log; 30 | import org.apache.commons.logging.LogFactory; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapred.FileInputFormat; 35 | import org.apache.hadoop.mapred.InputSplit; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.mapred.JobConfigurable; 38 | import org.apache.hadoop.mapred.MapReduceBase; 39 | import org.apache.hadoop.mapred.OutputCollector; 40 | import org.apache.hadoop.mapred.OutputFormat; 41 | import org.apache.hadoop.mapred.RecordReader; 42 | import org.apache.hadoop.mapred.RecordWriter; 43 | import org.apache.hadoop.mapred.Reducer; 44 | import org.apache.hadoop.mapred.Reporter; 45 | import org.apache.hadoop.mapred.TextInputFormat; 46 | 47 | @SuppressWarnings("deprecation") 48 | public class CrushReducer extends MapReduceBase implements Reducer { 49 | 50 | private final Text valueOut = new Text(); 51 | 52 | /** 53 | * Internal counter for the number of input groups processed. Used to report status. 54 | */ 55 | private int fileNum; 56 | 57 | /** 58 | * The number of source files that have been crushed. 59 | */ 60 | private int recordNumber; 61 | 62 | /** 63 | * Report status when after processing this number of files. 64 | */ 65 | private int reportRecordNumber = 100; 66 | 67 | private int taskNum; 68 | 69 | private long timestamp; 70 | 71 | private JobConf job; 72 | 73 | private FileSystem fs; 74 | 75 | /** 76 | * Matched against dir names to calculate the crush output file name. 77 | */ 78 | private List inputRegexList; 79 | 80 | /** 81 | * Used with corresponding element in {@link #inputRegexList} to calculate the crush ouput file name. 82 | */ 83 | private List outputReplacementList; 84 | 85 | /** 86 | * Input formats that correspond with {@link #inputRegexList}. 87 | */ 88 | private List> inFormatClsList; 89 | 90 | /** 91 | * Output formats that correspond with {@link #inputRegexList}. 92 | */ 93 | private List> outFormatClsList; 94 | 95 | /** 96 | * Used to substitute values into placeholders. 97 | */ 98 | private Map placeHolderToValue = new HashMap(3); 99 | 100 | /** 101 | * Used to locate placeholders in the replacement strings. 102 | */ 103 | private Matcher placeholderMatcher = Pattern.compile("\\$\\{([a-zA-Z]([a-zA-Z\\.]*))\\}").matcher("dummy"); 104 | 105 | /** 106 | * Path to the output dir of the job. Used to compute the final output file names for the crush files, which are the values in 107 | * the reducer output. 
108 | */ 109 | private String outDirPath; 110 | 111 | @Override 112 | public void configure(JobConf job) { 113 | super.configure(job); 114 | 115 | this.job = job; 116 | 117 | taskNum = Integer.parseInt(job.get("mapred.tip.id").replaceFirst(".+_(\\d+)", "$1")); 118 | timestamp = Long.parseLong(job.get("crush.timestamp")); 119 | 120 | outDirPath = job.get("mapred.output.dir"); 121 | 122 | if (null == outDirPath || outDirPath.isEmpty()) { 123 | throw new IllegalArgumentException("mapred.output.dir has no value"); 124 | } 125 | 126 | /* 127 | * The files we write should be rooted in the "crush" subdir of the output directory to distinguish them from the files 128 | * created by the collector. 129 | */ 130 | outDirPath = new Path(outDirPath + "/crush").toUri().getPath(); 131 | 132 | /* 133 | * Configure the regular expressions and replacements we use to convert dir names to crush output file names. Also get the 134 | * directory data formats. 135 | */ 136 | int numSpecs = job.getInt("crush.num.specs", 0); 137 | 138 | if (numSpecs <= 0) { 139 | throw new IllegalArgumentException("Number of regular expressions must be zero or greater: " + numSpecs); 140 | } 141 | 142 | readCrushSpecs(numSpecs); 143 | 144 | placeHolderToValue.put("crush.task.num", Integer.toString(taskNum)); 145 | placeHolderToValue.put("crush.timestamp", job.get("crush.timestamp")); 146 | 147 | try { 148 | fs = FileSystem.get(job); 149 | } catch (RuntimeException e) { 150 | throw e; 151 | } catch (Exception e) { 152 | throw new RuntimeException(e); 153 | } 154 | } 155 | 156 | /** 157 | * Populates the following fields with non-default values from the configuration. 158 | * 159 | *
160 | *   <li>{@link #inputRegexList}</li>
161 | *   <li>{@link #outputReplacementList}</li>
162 | *   <li>{@link #inFormatClsList}</li>
163 | *   <li>{@link #outFormatClsList}</li>
164 | * </ul>
165 | */ 166 | private void readCrushSpecs(int numSpecs) { 167 | inputRegexList = new ArrayList(numSpecs); 168 | outputReplacementList = new ArrayList(numSpecs); 169 | inFormatClsList = new ArrayList>(numSpecs); 170 | outFormatClsList = new ArrayList>(numSpecs); 171 | 172 | for (int i = 0; i < numSpecs; i++) { 173 | String key; 174 | String value; 175 | 176 | /* 177 | * Regex. 178 | */ 179 | key = format("crush.%d.regex", i); 180 | value = job.get(key); 181 | 182 | if (null == value || value.isEmpty()) { 183 | throw new IllegalArgumentException("No input regex: " + key); 184 | } 185 | 186 | inputRegexList.add(Pattern.compile(value).matcher("dummy")); 187 | 188 | /* 189 | * Replacement for regex. 190 | */ 191 | key = format("crush.%d.regex.replacement", i); 192 | value = job.get(key); 193 | 194 | if (null == value || value.isEmpty()) { 195 | throw new IllegalArgumentException("No output replacement: " + key); 196 | } 197 | 198 | outputReplacementList.add(value); 199 | 200 | /* 201 | * Input format 202 | */ 203 | key = format("crush.%d.input.format", i); 204 | value = job.get(key); 205 | 206 | if (null == value || value.isEmpty()) { 207 | throw new IllegalArgumentException("No input format: " + key); 208 | } 209 | 210 | try { 211 | Class inFormatCls; 212 | 213 | if (value.equals(TextInputFormat.class.getName())) { 214 | inFormatCls = KeyValuePreservingTextInputFormat.class; 215 | } else { 216 | inFormatCls = Class.forName(value); 217 | 218 | if (!FileInputFormat.class.isAssignableFrom(inFormatCls)) { 219 | throw new IllegalArgumentException(format("Not a file input format: %s=%s", key, value)); 220 | } 221 | } 222 | 223 | inFormatClsList.add(inFormatCls); 224 | } catch (ClassNotFoundException e) { 225 | throw new IllegalArgumentException(format("Not a valid class: %s=%s", key, value)); 226 | } 227 | 228 | /* 229 | * Output format. 230 | */ 231 | key = format("crush.%d.output.format", i); 232 | value = job.get(key); 233 | 234 | if (null == value || value.isEmpty()) { 235 | throw new IllegalArgumentException("No output format: " + key); 236 | } 237 | 238 | try { 239 | Class outFormatCls = Class.forName(value); 240 | 241 | if (!OutputFormat.class.isAssignableFrom(outFormatCls)) { 242 | throw new IllegalArgumentException(format("Not an output format: %s=%s", key, value)); 243 | } 244 | 245 | outFormatClsList.add(outFormatCls); 246 | } catch (ClassNotFoundException e) { 247 | throw new IllegalArgumentException(format("Not a valid class: %s=%s", key, value)); 248 | } 249 | } 250 | } 251 | 252 | @Override 253 | public void reduce(Text bucketId, Iterator values, OutputCollector collector, Reporter reporter) throws IOException { 254 | String bucket = bucketId.toString(); 255 | 256 | String dirName = bucket.substring(0, bucket.lastIndexOf('-')); 257 | 258 | int idx = findMatcher(dirName); 259 | 260 | String outputFileName = calculateOutputFile(idx, dirName); 261 | 262 | /* 263 | * Don't need to separate the paths because the output file name is already absolute. 264 | */ 265 | valueOut.set(outDirPath + outputFileName); 266 | 267 | LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName)); 268 | 269 | /* 270 | * Strip the leading slash to make the path relative. the output format will relativize it to the task attempt work dir. 
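* Prepending "crush" to the absolute output file name does exactly that: for instance, a name like "/path/to/a/dir/firstregex-98765-1234-0" (the value exercised in the unit test) becomes the relative writer path "crush/path/to/a/dir/firstregex-98765-1234-0".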
271 | */ 272 | RecordWriter sink = null; 273 | Exception rootCause = null; 274 | 275 | Object key = null; 276 | Object value = null; 277 | 278 | try { 279 | while (null == rootCause && values.hasNext()) { 280 | Text srcFile = values.next(); 281 | Path inputPath = new Path(srcFile.toString()); 282 | 283 | RecordReader reader = createRecordReader(idx, inputPath, reporter); 284 | 285 | try { 286 | if (null == key) { 287 | key = reader.createKey(); 288 | value = reader.createValue(); 289 | 290 | /* 291 | * Set the key and value class in the conf, which the output format uses to get type information. 292 | */ 293 | job.setOutputKeyClass(key.getClass()); 294 | job.setOutputValueClass(value.getClass()); 295 | 296 | /* 297 | * Output file name is absolute so we can just add it to the crush prefix. 298 | */ 299 | sink = createRecordWriter(idx, "crush" + outputFileName); 300 | } else { 301 | 302 | Class other = reader.createKey().getClass(); 303 | 304 | if (!(key.getClass().equals(other))) { 305 | throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s !- %s", inputPath, key.getClass(), other)); 306 | } 307 | 308 | other = reader.createValue().getClass(); 309 | 310 | if (!value.getClass().equals(other)) { 311 | throw new IllegalArgumentException(format("Heterogeneous values detected in %s: %s !- %s", inputPath, value.getClass(), other)); 312 | } 313 | } 314 | 315 | while (reader.next(key, value)) { 316 | sink.write(key, value); 317 | reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1); 318 | } 319 | } catch (Exception e) { 320 | rootCause = e; 321 | } finally { 322 | try { 323 | reader.close(); 324 | } catch (Exception e) { 325 | if (null == rootCause) { 326 | rootCause = e; 327 | } else { 328 | LOG.debug("Swallowing exception on close of " + inputPath, e); 329 | } 330 | } 331 | } 332 | 333 | /* 334 | * Output of the reducer is the source file => crushed file (in the final output dir, no the task attempt work dir. 335 | */ 336 | collector.collect(srcFile, valueOut); 337 | reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1); 338 | 339 | recordNumber++; 340 | 341 | if (reportRecordNumber == recordNumber) { 342 | reportRecordNumber += reportRecordNumber; 343 | 344 | reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath)); 345 | } 346 | } 347 | } catch (Exception e) { 348 | rootCause = e; 349 | } finally { 350 | if (null != sink) { 351 | try { 352 | sink.close(reporter); 353 | } catch (Exception e) { 354 | if (null == rootCause) { 355 | rootCause = e; 356 | } else { 357 | LOG.error("Swallowing exception on close of " + outputFileName, e); 358 | } 359 | } 360 | } 361 | 362 | /* 363 | * Let the exception bubble up with a minimum of wrapping. 364 | */ 365 | if (null != rootCause) { 366 | if (rootCause instanceof RuntimeException) { 367 | throw (RuntimeException) rootCause; 368 | } 369 | 370 | if (rootCause instanceof IOException) { 371 | throw (IOException) rootCause; 372 | } 373 | 374 | throw new RuntimeException(rootCause); 375 | } 376 | } 377 | } 378 | 379 | /** 380 | * Returns a record writer that creates files in the task attempt work directory. Path must be relative! 
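* With the old mapred API, a file-based output format resolves a relative name against the task attempt's work output directory (mapred.work.output.dir in the tests), which is why reduce() hands this method "crush" + outputFileName instead of the absolute path.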
381 | */ 382 | @SuppressWarnings("unchecked") 383 | private RecordWriter createRecordWriter(int idx, String path) throws IOException { 384 | Class> cls = (Class>) outFormatClsList.get(idx); 385 | 386 | try { 387 | OutputFormat format = (OutputFormat) cls.newInstance(); 388 | 389 | return format.getRecordWriter(fs, job, path, null); 390 | } catch (RuntimeException e) { 391 | throw e; 392 | } catch (IOException e) { 393 | throw e; 394 | } catch (Exception e) { 395 | throw new RuntimeException(e); 396 | } 397 | } 398 | 399 | @SuppressWarnings("unchecked") 400 | private RecordReader createRecordReader(int idx, Path inputPath, Reporter reporter) throws IOException { 401 | 402 | LOG.info(format("Opening '%s'", inputPath)); 403 | 404 | Class> cls = (Class>) inFormatClsList.get(idx); 405 | 406 | try { 407 | FileInputFormat.setInputPaths(job, inputPath); 408 | 409 | FileInputFormat instance = cls.newInstance(); 410 | 411 | if (instance instanceof JobConfigurable) { 412 | ((JobConfigurable) instance).configure(job); 413 | } 414 | 415 | InputSplit[] splits = instance.getSplits(job, 1); 416 | 417 | if (1 != splits.length) { 418 | throw new IllegalArgumentException("Could not get input splits: " + inputPath); 419 | } 420 | 421 | return (RecordReader) instance.getRecordReader(splits[0], job, reporter); 422 | } catch (RuntimeException e) { 423 | throw e; 424 | } catch (IOException e) { 425 | throw e; 426 | } catch (Exception e) { 427 | throw new RuntimeException(e); 428 | } 429 | } 430 | 431 | /** 432 | * Converts the name of a directory to a path to the crush output file using the specs at the given index. The path will the 433 | * directory and file name separated by a slash /. Performs placeholder substitution on the corresponding replacement string in 434 | * {@link #outputReplacementList}. The final replacement string is then used to form the final path. 435 | */ 436 | String calculateOutputFile(int idx, String srcDir) { 437 | 438 | StringBuffer sb = new StringBuffer(srcDir); 439 | sb.append("/"); 440 | 441 | String replacement = outputReplacementList.get(idx); 442 | 443 | placeHolderToValue.put("crush.file.num", Integer.toString(fileNum++)); 444 | 445 | placeholderMatcher.reset(replacement); 446 | 447 | while (placeholderMatcher.find()) { 448 | String key = placeholderMatcher.group(1); 449 | 450 | String value = placeHolderToValue.get(key); 451 | 452 | if (null == value) { 453 | throw new IllegalArgumentException("No value for key: " + key); 454 | } 455 | 456 | placeholderMatcher.appendReplacement(sb, value); 457 | } 458 | 459 | placeholderMatcher.appendTail(sb); 460 | 461 | Matcher matcher = inputRegexList.get(idx); 462 | matcher.reset(srcDir); 463 | 464 | String finalOutputName = matcher.replaceAll(sb.toString()); 465 | 466 | return finalOutputName; 467 | } 468 | 469 | /** 470 | * Returns the index into {@link #inputRegexList} of first pattern that matches the argument. 
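* Specs are tried in order, so a directory such as "/path/to/a/dir/foo/dir" that satisfies both ".+/dir" and ".+/dir/([^/]+/)*(.+)" resolves to the earlier spec (index 0 in the unit test); an IllegalArgumentException is thrown when no pattern matches.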
471 | */ 472 | int findMatcher(String dir) { 473 | 474 | String outputNameWithPlaceholders = null; 475 | 476 | for (int i = 0; i < inputRegexList.size() && outputNameWithPlaceholders == null; i++) { 477 | Matcher matcher = inputRegexList.get(i); 478 | 479 | matcher.reset(dir); 480 | 481 | if (matcher.matches()) { 482 | return i; 483 | } 484 | } 485 | 486 | throw new IllegalArgumentException("No matching input regex: " + dir); 487 | } 488 | 489 | int getTaskNum() { 490 | return taskNum; 491 | } 492 | 493 | long getTimestamp() { 494 | return timestamp; 495 | } 496 | 497 | List getInputRegexList() { 498 | ArrayList list = new ArrayList(inputRegexList.size()); 499 | 500 | for (Matcher matcher : inputRegexList) { 501 | list.add(matcher.pattern().pattern()); 502 | } 503 | 504 | return list; 505 | } 506 | 507 | List getOutputReplacementList() { 508 | return new ArrayList(outputReplacementList); 509 | } 510 | 511 | List> getInputFormatList() { 512 | return new ArrayList>(inFormatClsList); 513 | } 514 | 515 | List> getOutputFormatList() { 516 | return new ArrayList>(outFormatClsList); 517 | } 518 | 519 | private static final Log LOG = LogFactory.getLog(CrushReducer.class); 520 | } 521 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushOptionParsingTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.System.currentTimeMillis; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.junit.Assert.assertThat; 23 | import static org.junit.Assert.fail; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.fs.FileSystem; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.junit.Before; 31 | import org.junit.Rule; 32 | import org.junit.Test; 33 | import org.junit.rules.TemporaryFolder; 34 | 35 | import com.m6d.filecrush.crush.Crush; 36 | 37 | @SuppressWarnings("deprecation") 38 | public class CrushOptionParsingTest { 39 | @Rule 40 | public final TemporaryFolder tmp = new TemporaryFolder(); 41 | 42 | private Crush crush; 43 | 44 | @Before 45 | public void before() throws IOException { 46 | crush = new Crush(); 47 | 48 | JobConf job = new JobConf(false); 49 | crush.setConf(job); 50 | 51 | job.set("fs.default.name", "file:///"); 52 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 53 | job.setInt("mapred.reduce.tasks", 20); 54 | job.setLong("dfs.block.size", 1024 * 1024 * 64); 55 | 56 | FileSystem fs = FileSystem.get(job); 57 | fs.setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath())); 58 | 59 | crush.setFileSystem(fs); 60 | } 61 | 62 | @Test 63 | public void unrecognizedOption() { 64 | try { 65 | crush.createJobConfAndParseArgs("-bad", "in", "out", "20101116123015"); 66 | fail(); 67 | } catch (Exception e) { 68 | } 69 | } 70 | 71 | @Test 72 | public void badRegexCount() throws Exception { 73 | try { 74 | crush.createJobConfAndParseArgs( 75 | "--regex", ".+/ads/.+", 76 | "--replacement", "foo", 77 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 78 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 79 | "--replacement", "bar", 80 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 81 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 82 | "in", "out", "20101116123015"); 83 | fail(); 84 | } catch (IllegalArgumentException e) { 85 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 86 | throw e; 87 | } 88 | } 89 | } 90 | 91 | @Test 92 | public void badCompressCodec() throws Exception { 93 | try { 94 | crush.createJobConfAndParseArgs( 95 | "--regex", ".+/ads/.+", 96 | "--replacement", "foo", 97 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 98 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 99 | "--compress", "java.lang.Object", 100 | "in", "out", "20101116123015"); 101 | fail(); 102 | } catch (IllegalArgumentException e) { 103 | if (!e.getMessage().contains("java.lang.Object")) { 104 | throw e; 105 | } 106 | } 107 | } 108 | 109 | @Test 110 | public void badCompressCodecNotAClass() throws Exception { 111 | try { 112 | crush.createJobConfAndParseArgs( 113 | "--regex", ".+/ads/.+", 114 | "--replacement", "foo", 115 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 116 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 117 | "--compress", "foo", 118 | "in", "out", "20101116123015"); 119 | fail(); 120 | } catch (IllegalArgumentException e) { 121 | if (!e.getMessage().contains("foo")) { 122 | throw e; 123 | } 124 | } 125 | } 126 | 127 | @Test 128 | public void badReplacementCount() throws Exception { 129 | try { 130 | 
crush.createJobConfAndParseArgs( 131 | "--regex", ".+/ads/.+", 132 | "--replacement", "foo", 133 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 134 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 135 | "--regex", ".+/act/.+", 136 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 137 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 138 | "in", "out", "20101116123015"); 139 | fail(); 140 | } catch (IllegalArgumentException e) { 141 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 142 | throw e; 143 | } 144 | } 145 | } 146 | 147 | @Test 148 | public void badInputFormatCount() throws Exception { 149 | try { 150 | crush.createJobConfAndParseArgs( 151 | "--regex", ".+/ads/.+", 152 | "--replacement", "foo", 153 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 154 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 155 | "--regex", ".+/act/.+", 156 | "--replacement", "bar", 157 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 158 | "in", "out", "20101116123015"); 159 | fail(); 160 | } catch (IllegalArgumentException e) { 161 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 162 | throw e; 163 | } 164 | } 165 | } 166 | 167 | @Test 168 | public void badOutputFormatCount() throws Exception { 169 | try { 170 | crush.createJobConfAndParseArgs( 171 | "--regex", ".+/ads/.+", 172 | "--replacement", "foo", 173 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 174 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 175 | "--regex", ".+/act/.+", 176 | "--replacement", "bar", 177 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 178 | "in", "out", "20101116123015"); 179 | fail(); 180 | } catch (IllegalArgumentException e) { 181 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 182 | throw e; 183 | } 184 | } 185 | } 186 | 187 | @Test 188 | public void badInputFormat() throws Exception { 189 | try { 190 | crush.createJobConfAndParseArgs( 191 | "--regex", ".+/ads/.+", 192 | "--replacement", "foo", 193 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 194 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 195 | "--regex", ".+/act/.+", 196 | "--replacement", "bar", 197 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 198 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 199 | "--regex", ".+/bid/.+", 200 | "--replacement", "hello", 201 | "--input-format", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 202 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 203 | "--threshold", "0.5", 204 | "--max-file-blocks", "100", 205 | "in", "out", "20101116123015"); 206 | fail(); 207 | } catch (IllegalArgumentException e) { 208 | if (!e.getMessage().contains("org.apache.hadoop.mapreduce.lib.input.TextInputFormat")) { 209 | throw e; 210 | } 211 | } 212 | } 213 | 214 | @Test 215 | public void badInputFormatNotAClass() throws Exception { 216 | try { 217 | crush.createJobConfAndParseArgs( 218 | "--regex", ".+/ads/.+", 219 | "--replacement", "foo", 220 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 221 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 222 | "--regex", ".+/act/.+", 223 | "--replacement", "bar", 224 | "--input-format", 
"org.apache.hadoop.mapred.TextInputFormat", 225 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 226 | "--regex", ".+/bid/.+", 227 | "--replacement", "hello", 228 | "--input-format", "foo", 229 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 230 | "--threshold", "0.5", 231 | "--max-file-blocks", "100", 232 | "in", "out", "20101116123015"); 233 | fail(); 234 | } catch (IllegalArgumentException e) { 235 | if (!e.getMessage().contains("foo")) { 236 | throw e; 237 | } 238 | } 239 | } 240 | 241 | @Test 242 | public void badOutputFormat() throws Exception { 243 | try { 244 | crush.createJobConfAndParseArgs( 245 | "--regex", ".+/ads/.+", 246 | "--replacement", "foo", 247 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 248 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 249 | "--regex", ".+/act/.+", 250 | "--replacement", "bar", 251 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 252 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 253 | "--regex", ".+/bid/.+", 254 | "--replacement", "hello", 255 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 256 | "--output-format", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", 257 | "--threshold", "0.5", 258 | "--max-file-blocks", "100", 259 | "in", "out", "20101116123015"); 260 | fail(); 261 | } catch (IllegalArgumentException e) { 262 | if (!e.getMessage().contains("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")) { 263 | throw e; 264 | } 265 | } 266 | } 267 | 268 | @Test 269 | public void badOutputFormatNotAClass() throws Exception { 270 | try { 271 | crush.createJobConfAndParseArgs( 272 | "--regex", ".+/ads/.+", 273 | "--replacement", "foo", 274 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 275 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 276 | "--regex", ".+/act/.+", 277 | "--replacement", "bar", 278 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 279 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 280 | "--regex", ".+/bid/.+", 281 | "--replacement", "hello", 282 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 283 | "--output-format", "foo", 284 | "--threshold", "0.5", 285 | "--max-file-blocks", "100", 286 | "in", "out", "20101116123015"); 287 | fail(); 288 | } catch (IllegalArgumentException e) { 289 | if (!e.getMessage().contains("foo")) { 290 | throw e; 291 | } 292 | } 293 | } 294 | 295 | @Test 296 | public void badSourceDir() throws Exception { 297 | try { 298 | crush.createJobConfAndParseArgs("does not exist", tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 299 | } catch (IOException e) { 300 | if (!e.getMessage().contains("does not exist")) { 301 | throw e; 302 | } 303 | } 304 | } 305 | 306 | @Test 307 | public void defaults() throws Exception { 308 | crush.createJobConfAndParseArgs(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 309 | 310 | JobConf job = crush.getJob(); 311 | 312 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 313 | assertThat(job.get("mapred.output.compress"), equalTo("true")); 314 | assertThat(job.get("mapred.output.compression.type"), equalTo("BLOCK")); 315 | assertThat(job.get("mapred.output.compression.codec"), equalTo("org.apache.hadoop.io.compress.DefaultCodec")); 316 | 317 | assertThat(crush.getMaxFileBlocks(), equalTo(8)); 318 | 319 | 
assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 320 | 321 | assertThat(job.get("crush.num.specs"), equalTo("1")); 322 | 323 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 324 | assertThat(job.get("crush.0.regex.replacement"), equalTo("crushed_file-20101116123015-${crush.task.num}-${crush.file.num}")); 325 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 326 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 327 | } 328 | 329 | @Test 330 | public void disableCompression() throws Exception { 331 | crush.createJobConfAndParseArgs( 332 | "--compress=none", 333 | tmp.newFolder("in").getAbsolutePath(), 334 | tmp.newFolder("out").getAbsolutePath(), 335 | "20101116123015"); 336 | 337 | JobConf job = crush.getJob(); 338 | 339 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 340 | assertThat(job.get("mapred.output.compress"), equalTo("false")); 341 | 342 | assertThat(crush.getMaxFileBlocks(), equalTo(8)); 343 | 344 | assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 345 | 346 | assertThat(job.get("crush.num.specs"), equalTo("1")); 347 | 348 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 349 | assertThat(job.get("crush.0.regex.replacement"), equalTo("crushed_file-20101116123015-${crush.task.num}-${crush.file.num}")); 350 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 351 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 352 | } 353 | 354 | @Test 355 | public void parse() throws Exception { 356 | crush.createJobConfAndParseArgs( 357 | "--regex", ".+/ads/.+", 358 | "--replacement", "foo", 359 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 360 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 361 | "--regex", ".+/act/.+", 362 | "--replacement", "bar", 363 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 364 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 365 | "--regex", ".+/bid/.+", 366 | "--replacement", "hello", 367 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 368 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 369 | "--threshold", "0.5", 370 | "--max-file-blocks", "100", 371 | "--compress", "org.apache.hadoop.io.compress.DefaultCodec", 372 | 373 | tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 374 | 375 | JobConf job = crush.getJob(); 376 | 377 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 378 | assertThat(job.get("mapred.output.compress"), equalTo("true")); 379 | assertThat(job.get("mapred.output.compression.codec"), equalTo("org.apache.hadoop.io.compress.DefaultCodec")); 380 | 381 | assertThat(crush.getMaxFileBlocks(), equalTo(100)); 382 | 383 | assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 384 | 385 | assertThat(job.get("crush.num.specs"), equalTo("3")); 386 | 387 | assertThat(job.get("crush.0.regex"), equalTo(".+/ads/.+")); 388 | assertThat(job.get("crush.0.regex.replacement"), equalTo("foo")); 389 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 390 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 391 | 392 | assertThat(job.get("crush.1.regex"), equalTo(".+/act/.+")); 393 | 
assertThat(job.get("crush.1.regex.replacement"), equalTo("bar")); 394 | assertThat(job.get("crush.1.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 395 | assertThat(job.get("crush.1.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 396 | 397 | assertThat(job.get("crush.2.regex"), equalTo(".+/bid/.+")); 398 | assertThat(job.get("crush.2.regex.replacement"), equalTo("hello")); 399 | assertThat(job.get("crush.2.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 400 | assertThat(job.get("crush.2.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 401 | } 402 | 403 | @Test 404 | public void parseOldNoType() throws Exception { 405 | long millis = currentTimeMillis(); 406 | 407 | crush.createJobConfAndParseArgs( 408 | tmp.newFolder("in").getAbsolutePath(), 409 | tmp.newFolder("out").getAbsolutePath(), 410 | "80"); 411 | 412 | JobConf job = crush.getJob(); 413 | 414 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 415 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 416 | assertThat(job.get("crush.num.specs"), equalTo("1")); 417 | 418 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 419 | 420 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 421 | assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 422 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 423 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 424 | } 425 | 426 | @Test 427 | public void parseOldSequence() throws Exception { 428 | long millis = currentTimeMillis(); 429 | 430 | crush.createJobConfAndParseArgs( 431 | tmp.newFolder("in").getAbsolutePath(), 432 | tmp.newFolder("out").getAbsolutePath(), 433 | "80", 434 | "SEQUENCE"); 435 | 436 | JobConf job = crush.getJob(); 437 | 438 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 439 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 440 | assertThat(job.get("crush.num.specs"), equalTo("1")); 441 | 442 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 443 | 444 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 445 | assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 446 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 447 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 448 | } 449 | 450 | @Test 451 | public void parseOldText() throws Exception { 452 | long millis = currentTimeMillis(); 453 | 454 | crush.createJobConfAndParseArgs( 455 | tmp.newFolder("in").getAbsolutePath(), 456 | tmp.newFolder("out").getAbsolutePath(), 457 | "80", 458 | "TEXT"); 459 | 460 | JobConf job = crush.getJob(); 461 | 462 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 463 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 464 | assertThat(job.get("crush.num.specs"), equalTo("1")); 465 | 466 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 467 | 468 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 469 | 
assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 470 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 471 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 472 | } 473 | 474 | @Test 475 | public void parseOldBadType() throws Exception { 476 | try { 477 | crush.createJobConfAndParseArgs("in", 478 | "out", 479 | "80", 480 | "FOO"); 481 | fail(); 482 | } catch (IllegalArgumentException e) { 483 | if (!e.getMessage().contains("FOO")) { 484 | throw e; 485 | } 486 | } 487 | } 488 | } 489 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushReducerParameterizedTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.util.Arrays.asList; 20 | import static org.easymock.EasyMock.expectLastCall; 21 | import static org.easymock.EasyMock.isA; 22 | import static org.hamcrest.Matchers.equalTo; 23 | import static org.hamcrest.Matchers.is; 24 | import static org.hamcrest.Matchers.nullValue; 25 | import static org.junit.Assert.assertThat; 26 | import static org.junit.Assert.fail; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.File; 30 | import java.io.FileInputStream; 31 | import java.io.FileOutputStream; 32 | import java.io.IOException; 33 | import java.io.InputStreamReader; 34 | import java.io.PrintWriter; 35 | import java.util.ArrayList; 36 | import java.util.Collection; 37 | import java.util.LinkedHashMap; 38 | import java.util.List; 39 | import java.util.Map; 40 | import java.util.Map.Entry; 41 | 42 | import org.apache.hadoop.fs.FileSystem; 43 | import org.apache.hadoop.fs.Path; 44 | import org.apache.hadoop.io.LongWritable; 45 | import org.apache.hadoop.io.SequenceFile; 46 | import org.apache.hadoop.io.SequenceFile.CompressionType; 47 | import org.apache.hadoop.io.SequenceFile.Reader; 48 | import org.apache.hadoop.io.SequenceFile.Writer; 49 | import org.apache.hadoop.io.Text; 50 | import org.apache.hadoop.io.compress.DefaultCodec; 51 | import org.apache.hadoop.mapred.JobConf; 52 | import org.apache.hadoop.mapred.OutputCollector; 53 | import org.apache.hadoop.mapred.Reporter; 54 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 55 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 56 | import org.apache.hadoop.mapred.TextInputFormat; 57 | import org.apache.hadoop.mapred.TextOutputFormat; 58 | import org.easymock.EasyMockSupport; 59 | import org.junit.Before; 60 | import org.junit.Rule; 61 | import org.junit.Test; 62 | import org.junit.rules.TemporaryFolder; 63 | import org.junit.runner.RunWith; 64 | import org.junit.runners.Parameterized; 65 | import 
org.junit.runners.Parameterized.Parameters; 66 | 67 | import com.m6d.filecrush.crush.CrushReducer; 68 | import com.m6d.filecrush.crush.ReducerCounter; 69 | 70 | @RunWith(Parameterized.class) 71 | @SuppressWarnings("deprecation") 72 | public class CrushReducerParameterizedTest extends EasyMockSupport { 73 | @Parameters 74 | public static Collection testCases() { 75 | List testCases = new ArrayList(); 76 | 77 | for (Object[] testCase : new Object[][] { new Object[] { CompressionType.NONE }, 78 | new Object[] { CompressionType.BLOCK }, 79 | new Object[] { CompressionType.RECORD }}) { 80 | testCases.add(testCase); 81 | } 82 | 83 | return testCases; 84 | } 85 | 86 | @Rule 87 | public final TemporaryFolder tmp = new TemporaryFolder(); 88 | 89 | private final CompressionType compressionType; 90 | 91 | private OutputCollector collector; 92 | 93 | private Reporter reporter; 94 | 95 | private CrushReducer reducer; 96 | 97 | private JobConf job; 98 | 99 | private FileSystem fs; 100 | 101 | /** 102 | * Simulates the task attempt work dir that is created by Hadoop. 103 | */ 104 | private File workDir; 105 | 106 | /** 107 | * Simulates the output dir to which the attempt's output will be copied. 108 | */ 109 | private File outDir; 110 | 111 | public CrushReducerParameterizedTest(CompressionType compressionType) { 112 | super(); 113 | 114 | this.compressionType = compressionType; 115 | } 116 | 117 | @Before 118 | public void setupReducer() throws IOException { 119 | job = new JobConf(false); 120 | 121 | job.set("mapred.tip.id", "task_201011081200_014527_r_001234"); 122 | job.set("mapred.task.id", "attempt_201011081200_14527_r_001234_0"); 123 | 124 | /* 125 | * This logic tree around compression simulates what the output formats do. 126 | */ 127 | if (CompressionType.NONE == compressionType) { 128 | job.setBoolean("mapred.output.compress", false); 129 | } else { 130 | job.setBoolean("mapred.output.compress", true); 131 | job.set("mapred.output.compression.type", compressionType.name()); 132 | job.set("mapred.output.compression.codec", CustomCompressionCodec.class.getName()); 133 | } 134 | 135 | outDir = tmp.newFolder("out"); 136 | tmp.newFolder("out/_temporary"); 137 | workDir = tmp.newFolder("out/_temporary/_" + job.get("mapred.task.id")); 138 | 139 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 140 | job.set("mapred.work.output.dir", workDir.getAbsolutePath()); 141 | 142 | job.set("fs.default.name", "file:///"); 143 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 144 | 145 | job.setLong("crush.timestamp", 98765); 146 | 147 | job.setInt("crush.num.specs", 4); 148 | job.set("crush.0.regex", ".+/other"); 149 | job.set("crush.0.regex.replacement", "${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail"); 150 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 151 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 152 | 153 | job.set("crush.1.regex", ".+/dir"); 154 | job.set("crush.1.regex.replacement", "secondregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 155 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 156 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 157 | 158 | job.set("crush.2.regex", ".+/dir/([^/]+/)*(.+)"); 159 | job.set("crush.2.regex.replacement", "thirdregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 160 | job.set("crush.2.input.format", SequenceFileInputFormat.class.getName()); 161 | job.set("crush.2.output.format", 
SequenceFileOutputFormat.class.getName()); 162 | 163 | job.set("crush.3.regex", ".+/text"); 164 | job.set("crush.3.regex.replacement", "fourthregex-${crush.task.num}-${crush.timestamp}-${crush.file.num}"); 165 | job.set("crush.3.input.format", TextInputFormat.class.getName()); 166 | job.set("crush.3.output.format", TextOutputFormat.class.getName()); 167 | 168 | reducer = new CrushReducer(); 169 | 170 | reducer.configure(job); 171 | 172 | fs = FileSystem.get(job); 173 | } 174 | 175 | @Before 176 | @SuppressWarnings("unchecked") 177 | public void setupMocks() { 178 | collector = createMock("collector", OutputCollector.class); 179 | reporter = createMock("reporter", Reporter.class); 180 | } 181 | 182 | @Test 183 | public void reduce() throws IOException { 184 | reporter.setStatus(isA(String.class)); 185 | expectLastCall().anyTimes(); 186 | 187 | /* 188 | * We setup a few directories to exercise regexes. In this comment, dirs are distinguished by a trailing slash. The 189 | * file name is followed by the bucket id. 190 | * 191 | * dir/ 192 | * file10 0 193 | * file11 0 194 | * file12 1 195 | * file13 1 196 | * subdir/ 197 | * file20 0 198 | * file21 0 199 | * file22 1 200 | * file23 1 201 | * file24 1 202 | * subsubdir/ 203 | * file30 0 204 | * file31 0 205 | * file32 0 206 | * file33 1 207 | * file34 1 208 | * other/ 209 | * file40 1 210 | * file41 1 211 | * file42 2 212 | * file43 2 213 | * other/ 214 | * file50 0 215 | * file51 0 216 | * file52 1 217 | * file53 1 218 | * file54 3 219 | * file55 3 220 | * text/ 221 | * file60 2 222 | * file61 2 223 | * file62 3 224 | * file63 3 225 | * 226 | * Now setup the dir so the reducer has some data to work with. 227 | */ 228 | 229 | Map> inputGroups = new LinkedHashMap>(); 230 | 231 | 232 | /* 233 | * dir/ 234 | * file10 0 235 | * file11 0 236 | * file12 1 237 | * file13 1 238 | * 239 | * These files match the first regex. 
240 | */ 241 | File dir = tmp.newFolder("dir"); 242 | 243 | inputGroups.put(new Text(dir.getAbsolutePath() + "-0"), asList( writeFile(dir, "file10", Format.TEXT), 244 | writeFile(dir, "file11", Format.TEXT))); 245 | 246 | inputGroups.put(new Text(dir.getAbsolutePath() + "-1"), asList( writeFile(dir, "file12", Format.TEXT), 247 | writeFile(dir, "file13", Format.TEXT))); 248 | 249 | recordCollectForFile(dir, "file10", "secondregex-98765-1234-0"); 250 | recordCollectForFile(dir, "file11", "secondregex-98765-1234-0"); 251 | recordCollectForFile(dir, "file12", "secondregex-98765-1234-1"); 252 | recordCollectForFile(dir, "file13", "secondregex-98765-1234-1"); 253 | 254 | 255 | /* 256 | * dir/ 257 | * subdir/ 258 | * file20 0 259 | * file21 0 260 | * file22 1 261 | * file23 1 262 | * file24 1 263 | */ 264 | File subdir = tmp.newFolder("dir/subdir"); 265 | 266 | inputGroups.put(new Text(subdir.getAbsolutePath() + "-0"), asList( writeFile(subdir, "file20", Format.SEQUENCE), 267 | writeFile(subdir, "file21", Format.SEQUENCE))); 268 | 269 | inputGroups.put(new Text(subdir.getAbsolutePath() + "-1"), asList( writeFile(subdir, "file22", Format.SEQUENCE), 270 | writeFile(subdir, "file23", Format.SEQUENCE), 271 | writeFile(subdir, "file24", Format.SEQUENCE))); 272 | 273 | recordCollectForFile(subdir, "file20", "thirdregex-subdir-98765-1234-2"); 274 | recordCollectForFile(subdir, "file21", "thirdregex-subdir-98765-1234-2"); 275 | recordCollectForFile(subdir, "file22", "thirdregex-subdir-98765-1234-3"); 276 | recordCollectForFile(subdir, "file23", "thirdregex-subdir-98765-1234-3"); 277 | recordCollectForFile(subdir, "file24", "thirdregex-subdir-98765-1234-3"); 278 | 279 | 280 | /* 281 | * dir/ 282 | * subdir/ 283 | * subsubdir/ 284 | * file30 0 285 | * file31 0 286 | * file32 0 287 | * file33 1 288 | * file34 1 289 | */ 290 | File subsubdir = tmp.newFolder("dir/subdir/subsubdir"); 291 | 292 | inputGroups.put(new Text(subsubdir.getAbsolutePath() + "-0"), asList( writeFile(subsubdir, "file30", Format.SEQUENCE), 293 | writeFile(subsubdir, "file31", Format.SEQUENCE), 294 | writeFile(subsubdir, "file32", Format.SEQUENCE))); 295 | 296 | inputGroups.put(new Text(subsubdir.getAbsolutePath() + "-1"), asList( writeFile(subsubdir, "file33", Format.SEQUENCE), 297 | writeFile(subsubdir, "file34", Format.SEQUENCE))); 298 | 299 | recordCollectForFile(subsubdir, "file30", "thirdregex-subsubdir-98765-1234-4"); 300 | recordCollectForFile(subsubdir, "file31", "thirdregex-subsubdir-98765-1234-4"); 301 | recordCollectForFile(subsubdir, "file32", "thirdregex-subsubdir-98765-1234-4"); 302 | recordCollectForFile(subsubdir, "file33", "thirdregex-subsubdir-98765-1234-5"); 303 | recordCollectForFile(subsubdir, "file34", "thirdregex-subsubdir-98765-1234-5"); 304 | 305 | 306 | /* 307 | * dir/ 308 | * subdir/ 309 | * other/ 310 | * file40 1 311 | * file41 1 312 | * file42 2 313 | * file43 2 314 | */ 315 | File other1 = tmp.newFolder("dir/subdir/other"); 316 | 317 | inputGroups.put(new Text(other1.getAbsolutePath() + "-1"), asList( writeFile(other1, "file40", Format.SEQUENCE), 318 | writeFile(other1, "file41", Format.SEQUENCE))); 319 | 320 | inputGroups.put(new Text(other1.getAbsolutePath() + "-2"), asList( writeFile(other1, "file42", Format.SEQUENCE), 321 | writeFile(other1, "file43", Format.SEQUENCE))); 322 | 323 | recordCollectForFile(other1, "file40", "98765-1234-middle-6-tail"); 324 | recordCollectForFile(other1, "file41", "98765-1234-middle-6-tail"); 325 | recordCollectForFile(other1, "file42", "98765-1234-middle-7-tail"); 326 | 
recordCollectForFile(other1, "file43", "98765-1234-middle-7-tail"); 327 | 328 | 329 | /* 330 | * dir/ 331 | * other/ 332 | * file50 0 333 | * file51 0 334 | * file52 1 335 | * file53 1 336 | * file54 3 337 | * file55 3 338 | */ 339 | File other2 = tmp.newFolder("dir/other"); 340 | 341 | inputGroups.put(new Text(other2.getAbsolutePath() + "-0"), asList( writeFile(other2, "file50", Format.SEQUENCE), 342 | writeFile(other2, "file51", Format.SEQUENCE))); 343 | 344 | inputGroups.put(new Text(other2.getAbsolutePath() + "-1"), asList( writeFile(other2, "file52", Format.SEQUENCE), 345 | writeFile(other2, "file53", Format.SEQUENCE))); 346 | 347 | inputGroups.put(new Text(other2.getAbsolutePath() + "-3"), asList( writeFile(other2, "file54", Format.SEQUENCE), 348 | writeFile(other2, "file55", Format.SEQUENCE))); 349 | 350 | recordCollectForFile(other2, "file50", "98765-1234-middle-8-tail"); 351 | recordCollectForFile(other2, "file51", "98765-1234-middle-8-tail"); 352 | recordCollectForFile(other2, "file52", "98765-1234-middle-9-tail"); 353 | recordCollectForFile(other2, "file53", "98765-1234-middle-9-tail"); 354 | recordCollectForFile(other2, "file54", "98765-1234-middle-10-tail"); 355 | recordCollectForFile(other2, "file55", "98765-1234-middle-10-tail"); 356 | 357 | /* 358 | * text/ 359 | * file60 2 360 | * file61 2 361 | * file62 3 362 | * file63 3 363 | */ 364 | File text = tmp.newFolder("text"); 365 | 366 | inputGroups.put(new Text(text.getAbsolutePath() + "-2"), asList(writeFile(text, "file60", Format.TEXT), 367 | writeFile(text, "file61", Format.TEXT))); 368 | 369 | inputGroups.put(new Text(text.getAbsolutePath() + "-3"), asList(writeFile(text, "file62", Format.TEXT), 370 | writeFile(text, "file63", Format.TEXT))); 371 | 372 | recordCollectForFile(text, "file60", "fourthregex-1234-98765-11"); 373 | recordCollectForFile(text, "file61", "fourthregex-1234-98765-11"); 374 | recordCollectForFile(text, "file62", "fourthregex-1234-98765-12"); 375 | recordCollectForFile(text, "file63", "fourthregex-1234-98765-12"); 376 | 377 | replayAll(); 378 | 379 | for (Entry> e : inputGroups.entrySet()) { 380 | reducer.reduce(e.getKey(), e.getValue().iterator(), collector, reporter); 381 | } 382 | 383 | verifyAll(); 384 | 385 | verifyWorkOutput(dir, "secondregex-98765-1234-0", Format.TEXT, Format.SEQUENCE, "file10", "file11"); 386 | verifyWorkOutput(dir, "secondregex-98765-1234-1", Format.TEXT, Format.SEQUENCE, "file12", "file13"); 387 | verifyWorkOutput(subdir, "thirdregex-subdir-98765-1234-2", Format.SEQUENCE, Format.SEQUENCE, "file20", "file21"); 388 | verifyWorkOutput(subdir, "thirdregex-subdir-98765-1234-3", Format.SEQUENCE, Format.SEQUENCE, "file22", "file23", "file24"); 389 | verifyWorkOutput(subsubdir, "thirdregex-subsubdir-98765-1234-4", Format.SEQUENCE, Format.SEQUENCE, "file30", "file31", "file32"); 390 | verifyWorkOutput(subsubdir, "thirdregex-subsubdir-98765-1234-5", Format.SEQUENCE, Format.SEQUENCE, "file33", "file34"); 391 | verifyWorkOutput(other1, "98765-1234-middle-6-tail", Format.SEQUENCE, Format.TEXT, "file40", "file41"); 392 | verifyWorkOutput(other1, "98765-1234-middle-7-tail", Format.SEQUENCE, Format.TEXT, "file42", "file43"); 393 | verifyWorkOutput(other2, "98765-1234-middle-8-tail", Format.SEQUENCE, Format.TEXT, "file50", "file51"); 394 | verifyWorkOutput(other2, "98765-1234-middle-9-tail", Format.SEQUENCE, Format.TEXT, "file52", "file53"); 395 | verifyWorkOutput(other2, "98765-1234-middle-10-tail", Format.SEQUENCE, Format.TEXT, "file54", "file55"); 396 | verifyWorkOutput(text, 
"fourthregex-1234-98765-11", Format.TEXT, Format.TEXT, "file60", "file61"); 397 | verifyWorkOutput(text, "fourthregex-1234-98765-12", Format.TEXT, Format.TEXT, "file62", "file63"); 398 | } 399 | 400 | /** 401 | * Verifies that the work dir has the expected output. 402 | */ 403 | private void verifyWorkOutput(File srcDir, String crushedOutFileName, Format inFmt, Format outFmt, String... fileNames) throws IOException { 404 | 405 | /* 406 | * Read format table 407 | * 408 | * \ out format 409 | * \ 410 | * in format \ seq | text 411 | * ---------------------------- 412 | * seq | Custom | ascii | 413 | * -------------------------- - 414 | * text | Text | ascii | 415 | * ---------------------------- 416 | */ 417 | File crushOutput = new File(workDir.getAbsolutePath() + "/crush" + srcDir.getAbsolutePath() + "/" + crushedOutFileName); 418 | 419 | if (Format.TEXT == outFmt) { 420 | /* 421 | * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually what we want. 422 | * We want to preserve the actualy keys and values in the files, just like SequenceFileInputFormat. So, either way, the 423 | * keys and values should be the text representations of what went in. 424 | */ 425 | BufferedReader reader; 426 | 427 | /* 428 | * Text output format appends the default extension of the codec, if there is one. 429 | */ 430 | if (CompressionType.NONE == compressionType) { 431 | reader = new BufferedReader(new InputStreamReader(new FileInputStream(crushOutput))); 432 | } else { 433 | CustomCompressionCodec codec = new CustomCompressionCodec(); 434 | codec.setConf(job); 435 | 436 | reader = new BufferedReader(new InputStreamReader(codec.createInputStream(new FileInputStream(crushOutput + ".custom")))); 437 | } 438 | 439 | String line = ""; 440 | 441 | for (String fileName : fileNames) { 442 | int max = Integer.parseInt(fileName.substring(4)); 443 | 444 | for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) { 445 | String expectedLine = format("%d\t%d", key, value); 446 | 447 | line = reader.readLine(); 448 | 449 | assertThat(line, equalTo(expectedLine)); 450 | } 451 | } 452 | 453 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.readLine(), nullValue()); 454 | 455 | reader.close(); 456 | } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) { 457 | /* 458 | * Record reader will produce keys that are custom writables and values that are custom writable. 
459 | */ 460 | Reader reader = new Reader(fs, new Path(crushOutput.getAbsolutePath()), job); 461 | 462 | assertThat(reader.isCompressed(), is(compressionType != CompressionType.NONE)); 463 | 464 | if (reader.isCompressed()) { 465 | assertThat(reader.isBlockCompressed(), is(compressionType == CompressionType.BLOCK)); 466 | assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) CustomCompressionCodec.class)); 467 | } 468 | 469 | CustomWritable key = new CustomWritable(); 470 | CustomWritable value = new CustomWritable(); 471 | 472 | for (String fileName : fileNames) { 473 | int max = Integer.parseInt(fileName.substring(4)); 474 | 475 | for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) { 476 | reader.next(key, value); 477 | 478 | assertThat(fileName, key.get(), equalTo((long) k)); 479 | assertThat(fileName, value.get(), equalTo((long) v)); 480 | } 481 | } 482 | 483 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.next(key, value), is(false)); 484 | 485 | reader.close(); 486 | } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) { 487 | 488 | Reader reader = new Reader(fs, new Path(crushOutput.getAbsolutePath()), job); 489 | 490 | assertThat(reader.isCompressed(), is(compressionType != CompressionType.NONE)); 491 | 492 | if (reader.isCompressed()) { 493 | assertThat(reader.isBlockCompressed(), is(compressionType == CompressionType.BLOCK)); 494 | assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) CustomCompressionCodec.class)); 495 | } 496 | 497 | Text key = new Text(); 498 | Text value = new Text(); 499 | 500 | for (String fileName : fileNames) { 501 | int max = Integer.parseInt(fileName.substring(4)); 502 | 503 | for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) { 504 | reader.next(key, value); 505 | 506 | assertThat(fileName, key.toString(), equalTo(Integer.toString(k))); 507 | assertThat(fileName, value.toString(), equalTo(Integer.toString(v))); 508 | } 509 | } 510 | 511 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.next(key, value), is(false)); 512 | 513 | reader.close(); 514 | } else { 515 | fail(); 516 | } 517 | } 518 | 519 | /** 520 | * Records an expectation that a file has been crushed. The key is the absolute path of the crush input file. The value is the 521 | * absolute path of the crush output file, which is rooted in the output dir/crush (not the attempt work dir). 522 | */ 523 | private void recordCollectForFile(File srcDir, String crushInput, String crushOutput) throws IOException { 524 | Text srcFileAbsPath = new Text(new File(srcDir, crushInput).getAbsolutePath()); 525 | Text fileInJobOutputDir = new Text(format("%s/crush%s", outDir.getAbsolutePath(), new File(srcDir, crushOutput).getAbsolutePath())); 526 | 527 | collector.collect(srcFileAbsPath, fileInJobOutputDir); 528 | reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1); 529 | 530 | reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1); 531 | expectLastCall().times(Integer.parseInt(crushInput.substring(4))); 532 | } 533 | 534 | /** 535 | * Every file in this unit test is named "file" followed by a number. This method will create a sequence file with as many lines 536 | * as the number in the file name. The keys in the file will count from one to the number. The values in the file will count 537 | * from 100n + 1 to 100n + n. This way each file will have distinct contents so long as no two files have the same name. 
538 | */ 539 | private Text writeFile(File srcDir, String fileName, Format format) throws IOException { 540 | 541 | int fileNum = Integer.parseInt(fileName.substring(4)); 542 | 543 | File file = new File(srcDir, fileName); 544 | 545 | if (Format.TEXT == format) { 546 | PrintWriter writer = new PrintWriter(new FileOutputStream(file)); 547 | 548 | for (int k = 1, v = 100 * fileNum + 1; k <= fileNum; k++, v++) { 549 | writer.printf("%d\t%d\n", k, v); 550 | } 551 | 552 | writer.close(); 553 | } else { 554 | CustomWritable key = new CustomWritable(); 555 | CustomWritable value = new CustomWritable(); 556 | 557 | DefaultCodec codec = new DefaultCodec(); 558 | codec.setConf(job); 559 | 560 | Writer writer = SequenceFile.createWriter(fs, job, new Path(file.getAbsolutePath()), CustomWritable.class, 561 | CustomWritable.class, compressionType, codec); 562 | 563 | for (int k = 1, v = 100 * fileNum + 1; k <= fileNum; k++, v++) { 564 | key.set(k); 565 | value.set(v); 566 | 567 | writer.append(key, value); 568 | } 569 | 570 | writer.close(); 571 | } 572 | 573 | return new Text(file.getAbsolutePath()); 574 | } 575 | 576 | private enum Format { 577 | TEXT, SEQUENCE 578 | } 579 | 580 | /** 581 | * This only exists to prove that the reducer can read and write custom writables about which it has no a priori knowledge. 582 | */ 583 | public static class CustomWritable extends LongWritable { 584 | } 585 | 586 | /** 587 | * This only exists to prove that the reducer can use custom codecs. 588 | */ 589 | public static class CustomCompressionCodec extends DefaultCodec { 590 | public CustomCompressionCodec() { 591 | super(); 592 | } 593 | 594 | @Override 595 | public String getDefaultExtension() { 596 | return ".custom"; 597 | } 598 | } 599 | } 600 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.lang.System.currentTimeMillis; 20 | import static org.hamcrest.Matchers.equalTo; 21 | import static org.junit.Assert.assertThat; 22 | import static org.junit.Assert.fail; 23 | 24 | import java.io.DataInputStream; 25 | import java.io.File; 26 | import java.io.FileOutputStream; 27 | import java.io.IOException; 28 | import java.net.URI; 29 | import java.util.ArrayList; 30 | import java.util.Arrays; 31 | import java.util.Collections; 32 | import java.util.HashMap; 33 | import java.util.List; 34 | import java.util.Map; 35 | 36 | import org.apache.commons.cli.UnrecognizedOptionException; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.fs.BlockLocation; 39 | import org.apache.hadoop.fs.ContentSummary; 40 | import org.apache.hadoop.fs.FSDataInputStream; 41 | import org.apache.hadoop.fs.FSDataOutputStream; 42 | import org.apache.hadoop.fs.FileChecksum; 43 | import org.apache.hadoop.fs.FileStatus; 44 | import org.apache.hadoop.fs.FileSystem; 45 | import org.apache.hadoop.fs.Path; 46 | import org.apache.hadoop.fs.PathFilter; 47 | import org.apache.hadoop.fs.permission.FsPermission; 48 | import org.apache.hadoop.io.IntWritable; 49 | import org.apache.hadoop.io.SequenceFile.Reader; 50 | import org.apache.hadoop.io.Text; 51 | import org.apache.hadoop.mapred.Counters; 52 | import org.apache.hadoop.mapred.JobConf; 53 | import org.apache.hadoop.util.Progressable; 54 | import org.apache.hadoop.util.ToolRunner; 55 | import org.junit.After; 56 | import org.junit.Before; 57 | import org.junit.Rule; 58 | import org.junit.Test; 59 | import org.junit.rules.TemporaryFolder; 60 | 61 | import com.m6d.filecrush.crush.Crush; 62 | import com.m6d.filecrush.crush.MapperCounter; 63 | 64 | @SuppressWarnings("deprecation") 65 | public class CrushTest { 66 | @Rule 67 | public final TemporaryFolder tmp = new TemporaryFolder(); 68 | 69 | private JobConf job; 70 | 71 | private FileSystem fileSystem; 72 | 73 | private String javaIoTmpDir; 74 | 75 | @Before 76 | public void setupJob() throws IOException { 77 | job = new JobConf(false); 78 | 79 | job.set("fs.default.name", "file:///"); 80 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 81 | job.setInt("mapred.reduce.tasks", 5); 82 | job.setLong("dfs.block.size", 50); 83 | 84 | FileSystem delegate = FileSystem.get(job); 85 | 86 | fileSystem = new SortingFileSystem(delegate); 87 | 88 | /* 89 | * Set the working directory so that all relative paths are rooted in the tmp dir. This will keep the file system clean of 90 | * temporary test files. 91 | */ 92 | FileSystem.get(job).setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath())); 93 | } 94 | 95 | @Before 96 | public void setJavaIoTmpDir() { 97 | javaIoTmpDir = System.setProperty("java.io.tmpdir", tmp.getRoot().getAbsolutePath()); 98 | } 99 | 100 | @After 101 | public void restoreJavaIoTmpDir() { 102 | System.setProperty("java.io.tmpdir", javaIoTmpDir); 103 | } 104 | 105 | private void run(String... 
args) throws Exception { 106 | ToolRunner.run(job, new Crush(), args); 107 | } 108 | 109 | @Test 110 | public void backwardsCompatibleInvocationBadSrcDir() throws Exception { 111 | try { 112 | run("does-not-exist", tmp.getRoot().getAbsolutePath(), "80"); 113 | fail(); 114 | } catch (IOException e) { 115 | if (!e.getMessage().contains("does-not-exist")) { 116 | throw e; 117 | } 118 | } 119 | } 120 | 121 | @Test 122 | public void backwardsCompatibleInvocationBadNumberOfTasks() throws Exception { 123 | try { 124 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "not a number"); 125 | fail(); 126 | } catch (NumberFormatException e) { 127 | if (!e.getMessage().contains("not a number")) { 128 | throw e; 129 | } 130 | } 131 | } 132 | 133 | @Test 134 | public void backwardsCompatibleInvocationNegativeTasks() throws Exception { 135 | try { 136 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "-1"); 137 | fail(); 138 | } catch (UnrecognizedOptionException e) { 139 | if (!e.getMessage().contains("-1")) { 140 | throw e; 141 | } 142 | } 143 | } 144 | 145 | @Test 146 | public void backwardsCompatibleInvocationZeroTasks() throws Exception { 147 | try { 148 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "0"); 149 | fail(); 150 | } catch (IllegalArgumentException e) { 151 | if (!e.getMessage().contains("0")) { 152 | throw e; 153 | } 154 | } 155 | } 156 | 157 | @Test 158 | public void backwardsCompatibleInvocationHugeTasks() throws Exception { 159 | try { 160 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "4001"); 161 | fail(); 162 | } catch (IllegalArgumentException e) { 163 | if (!e.getMessage().contains("4001")) { 164 | throw e; 165 | } 166 | } 167 | } 168 | 169 | @Test 170 | public void backwardsCompatibleInvocationBadSrcDirWithType() throws Exception { 171 | try { 172 | run("does-not-exist", tmp.getRoot().getAbsolutePath(), "80", "TEXT"); 173 | fail(); 174 | } catch (IOException e) { 175 | if (!e.getMessage().contains("does-not-exist")) { 176 | throw e; 177 | } 178 | } 179 | } 180 | 181 | @Test 182 | public void backwardsCompatibleInvocationBadNumberOfTasksWithType() throws Exception { 183 | try { 184 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "not a number", "TEXT"); 185 | fail(); 186 | } catch (NumberFormatException e) { 187 | if (!e.getMessage().contains("not a number")) { 188 | throw e; 189 | } 190 | } 191 | } 192 | 193 | @Test 194 | public void backwardsCompatibleInvocationNegativeTasksWithType() throws Exception { 195 | try { 196 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "-1", "TEXT"); 197 | fail(); 198 | } catch (UnrecognizedOptionException e) { 199 | if (!e.getMessage().contains("-1")) { 200 | throw e; 201 | } 202 | } 203 | } 204 | 205 | @Test 206 | public void backwardsCompatibleInvocationZeroTasksWithType() throws Exception { 207 | try { 208 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "0", "TEXT"); 209 | fail(); 210 | } catch (IllegalArgumentException e) { 211 | if (!e.getMessage().contains("0")) { 212 | throw e; 213 | } 214 | } 215 | } 216 | 217 | @Test 218 | public void backwardsCompatibleInvocationHugeHugeTasksWithType() throws Exception { 219 | try { 220 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "4001", "TEXT"); 221 | fail(); 222 | } catch (IllegalArgumentException e) { 223 | if 
(!e.getMessage().contains("4001")) { 224 | throw e; 225 | } 226 | } 227 | } 228 | 229 | @Test 230 | public void backwardsCompatibleInvocationBadType() throws Exception { 231 | try { 232 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "80", "NEITHER_TEXT_OR_SEQUENCE"); 233 | fail(); 234 | } catch (IllegalArgumentException e) { 235 | if (!e.getMessage().contains("NEITHER_TEXT_OR_SEQUENCE")) { 236 | throw e; 237 | } 238 | } 239 | } 240 | 241 | @Test 242 | public void invocationBadSrcDir() throws Exception { 243 | try { 244 | run("--threshold=0.9", "does-not-exist", tmp.getRoot().getAbsolutePath(), "20101116123015"); 245 | fail(); 246 | } catch (IOException e) { 247 | if (!e.getMessage().contains("does-not-exist")) { 248 | throw e; 249 | } 250 | } 251 | } 252 | 253 | @Test 254 | public void invocationBadTimestamp() throws Exception { 255 | try { 256 | run("--threshold=0.9", tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "not a number"); 257 | fail(); 258 | } catch (IllegalArgumentException e) { 259 | if (!e.getMessage().contains("not a number")) { 260 | throw e; 261 | } 262 | } 263 | } 264 | 265 | @Test 266 | public void invocationShortTimestamp() throws Exception { 267 | try { 268 | run(tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "2010111612301"); 269 | fail(); 270 | } catch (IllegalArgumentException e) { 271 | if (!e.getMessage().contains("2010111612301")) { 272 | throw e; 273 | } 274 | } 275 | } 276 | 277 | @Test 278 | public void invocationLongTimestamp() throws Exception { 279 | try { 280 | run("--threshold=0.5", tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "201011161230150"); 281 | fail(); 282 | } catch (IllegalArgumentException e) { 283 | if (!e.getMessage().contains("201011161230150")) { 284 | throw e; 285 | } 286 | } 287 | } 288 | 289 | @Test 290 | public void dirWithNoMatchingRegex() throws Exception { 291 | /* 292 | * Create a non-empty directory. 293 | */ 294 | File src = tmp.newFolder("src"); 295 | tmp.newFolder("src/foo"); 296 | tmp.newFile("src/foo/file"); 297 | 298 | try { 299 | run("--regex", ".+/in", 300 | "--replacement", "foo", 301 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 302 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 303 | "--threshold", "0.5", 304 | "--max-file-blocks", "100", 305 | src.getAbsolutePath(), "out", "20101116123015"); 306 | 307 | fail(); 308 | } catch (IllegalArgumentException e) { 309 | if (!e.getMessage().contains("src/foo")) { 310 | throw e; 311 | } 312 | } 313 | } 314 | 315 | @Test 316 | public void bucketing() throws Exception { 317 | File in = tmp.newFolder("in"); 318 | 319 | Counters expectedCounters = new Counters(); 320 | List expectedBucketFiles = new ArrayList(); 321 | 322 | /* 323 | * Create a hierarchy of directories. Directories are distinguished by a trailing slash in these comments. 
324 | * 325 | * 1/ 326 | * 1.1/ 327 | * file1 10 bytes 328 | * file2 20 bytes 329 | * file3 30 bytes 330 | * file4 41 bytes 331 | * file5 15 bytes 332 | * file6 30 bytes 333 | * file7 20 bytes 334 | * 1.2/ 335 | * file1 20 bytes 336 | * file2 10 bytes 337 | * 1.3/ 338 | * 2/ 339 | * file1 70 bytes 340 | * file2 30 bytes 341 | * file3 25 bytes 342 | * file4 30 bytes 343 | * file5 35 bytes 344 | * 2.1/ 345 | * file1 10 bytes 346 | * 2.2/ 347 | * file1 25 bytes 348 | * file2 15 bytes 349 | * file3 35 bytes 350 | * 2.3/ 351 | * file1 41 bytes 352 | * file2 10 bytes 353 | * 2.4/ 354 | * 2.4.1/ 355 | * file1 100 bytes 356 | * file2 30 bytes 357 | * 2.4.2/ 358 | * file1 20 bytes 359 | * file2 20 bytes 360 | * file3 10 bytes 361 | */ 362 | 363 | /* 364 | * in contains 2 dirs and no files so it is skipped. 365 | * 366 | * in/ 367 | * 1/ 368 | * 2/ 369 | */ 370 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 371 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 372 | 373 | tmp.newFolder("in/1"); 374 | File dir2 = tmp.newFolder("in/2"); 375 | 376 | 377 | /* 378 | * in/1 contains three dirs and no files so it is skipped. 379 | * 380 | * in/ 381 | * 1/ 382 | * 1.1/ 383 | * 1.2/ 384 | * 1.3/ 385 | */ 386 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 387 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 388 | 389 | File dir1_1 = tmp.newFolder("in/1/1.1"); 390 | File dir1_2 = tmp.newFolder("in/1/1.2"); 391 | tmp.newFolder("in/1/1.3"); 392 | 393 | 394 | /* 395 | * in/2 contains five files and four dirs. 396 | * 397 | * in/ 398 | * 2/ 399 | * file1 70 bytes 400 | * file2 30 bytes 401 | * file3 25 bytes 402 | * file4 30 bytes 403 | * file5 35 bytes 404 | * 2.1/ 405 | * 2.2/ 406 | * 2.3/ 407 | * 2.4/ 408 | * 409 | * 0 1 2 410 | * file5 35 file2 30 file4 30 411 | * file3 25 412 | * 413 | * Buckets 0 and 2 have a single file each so they are ignored. 414 | */ 415 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 416 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 417 | 418 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 5); 419 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 420 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 3); 421 | 422 | File dir2_1 = tmp.newFolder("in/2/2.1"); 423 | File dir2_2 = tmp.newFolder("in/2/2.2"); 424 | File dir2_3 = tmp.newFolder("in/2/2.3"); 425 | tmp.newFolder("in/2/2.4"); 426 | 427 | createFile(dir2, "file1", 70); 428 | createFile(dir2, "file2", 30); 429 | createFile(dir2, "file3", 25); 430 | createFile(dir2, "file4", 30); 431 | createFile(dir2, "file5", 35); 432 | 433 | expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file2").getAbsolutePath())); 434 | expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file3").getAbsolutePath())); 435 | 436 | 437 | /* 438 | * in/1/1.1 contains seven files and no dirs. 439 | * 440 | * in/ 441 | * 1/ 442 | * 1.1/ 443 | * file1 10 bytes 444 | * file2 20 bytes 445 | * file3 30 bytes 446 | * file4 41 bytes 447 | * file5 15 bytes 448 | * file6 30 bytes 449 | * file7 20 bytes 450 | * 451 | * 0 1 2 452 | * file3 30 file6 30 file2 20 453 | * file5 15 file1 10 file7 20 454 | * 455 | * file4 is > 50 * 0.8 so it is ignored. 
456 | */ 457 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 458 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 459 | 460 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 7); 461 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 6); 462 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 463 | 464 | createFile(dir1_1, "file1", 10); 465 | createFile(dir1_1, "file2", 20); 466 | createFile(dir1_1, "file3", 30); 467 | createFile(dir1_1, "file4", 41); 468 | createFile(dir1_1, "file5", 15); 469 | createFile(dir1_1, "file6", 30); 470 | createFile(dir1_1, "file7", 20); 471 | 472 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file3").getAbsolutePath())); 473 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file5").getAbsolutePath())); 474 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file6").getAbsolutePath())); 475 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file1").getAbsolutePath())); 476 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file2").getAbsolutePath())); 477 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file7").getAbsolutePath())); 478 | 479 | 480 | /* 481 | * in/1/1.2 contains two files. 482 | * 483 | * in/ 484 | * 1/ 485 | * 1.2/ 486 | * file1 20 bytes 487 | * file2 10 bytes 488 | */ 489 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 490 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 491 | 492 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 493 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 494 | 495 | createFile(dir1_2, "file1", 20); 496 | createFile(dir1_2, "file2", 10); 497 | 498 | expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file1").getAbsolutePath())); 499 | expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file2").getAbsolutePath())); 500 | 501 | 502 | /* 503 | * in/1/1.3 is empty. 504 | * 505 | * in/ 506 | * 1/ 507 | * 1.3/ 508 | */ 509 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 510 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 511 | 512 | tmp.newFolder("in/1/1.3"); 513 | 514 | 515 | /* 516 | * in/2/2.1 contains one file. 517 | * 518 | * in/ 519 | * 2/ 520 | * 2.1/ 521 | * file1 10 bytes 522 | * 523 | * Single file dirs are ignored. 524 | */ 525 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 526 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 527 | 528 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 1); 529 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 530 | 531 | createFile(dir2_1, "file1", 10); 532 | 533 | 534 | /* 535 | * in/2/2.2 contains three files. 536 | * 537 | * in/ 538 | * 2/ 539 | * 2.2/ 540 | * file1 25 bytes 541 | * file2 15 bytes 542 | * file3 35 bytes 543 | * 544 | * 0 1 545 | * file3 35 file1 25 546 | * file2 15 547 | * 548 | * Bucket 0 with a single file is ignored.
549 | */ 550 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 551 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 552 | 553 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); 554 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 555 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 556 | 557 | createFile(dir2_2, "file1", 25); 558 | createFile(dir2_2, "file2", 15); 559 | createFile(dir2_2, "file3", 35); 560 | 561 | expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file1").getAbsolutePath())); 562 | expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file2").getAbsolutePath())); 563 | 564 | 565 | /* 566 | * in/2/2.3 contains 2 files. 567 | * 568 | * in/ 569 | * 2/ 570 | * 2.3/ 571 | * file1 41 bytes 572 | * file2 10 bytes 573 | * 574 | * file1 is too big, leaving file2 as a single file, which is also ignored. 575 | */ 576 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 577 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 578 | 579 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 580 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); 581 | 582 | createFile(dir2_3, "file1", 41); 583 | createFile(dir2_3, "file2", 10); 584 | 585 | 586 | /* 587 | * in/2/2.4 contains two subdirectories and no files. 588 | * 589 | * in/ 590 | * 2/ 591 | * 2.4/ 592 | * 2.4.1/ 593 | * 2.4.2/ 594 | */ 595 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 596 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 597 | 598 | tmp.newFolder("in/2/2.4"); 599 | 600 | File dir2_4_1 = tmp.newFolder("in/2/2.4/2.4.1"); 601 | File dir2_4_2 = tmp.newFolder("in/2/2.4/2.4.2"); 602 | 603 | 604 | /* 605 | * in/ 606 | * 2/ 607 | * 2.4/ 608 | * 2.4.1/ 609 | * file1 100 bytes 610 | * file2 30 bytes 611 | */ 612 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 613 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 614 | 615 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 616 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); 617 | 618 | createFile(dir2_4_1, "file1", 100); 619 | createFile(dir2_4_1, "file2", 30); 620 | 621 | 622 | /* 623 | * in/ 624 | * 2/ 625 | * 2.4/ 626 | * 2.4.2/ 627 | * file1 20 bytes 628 | * file2 20 bytes 629 | * file3 10 bytes 630 | * 0 631 | * file1 20 632 | * file2 20 633 | * file3 10 634 | */ 635 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 636 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 637 | 638 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); 639 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 3); 640 | 641 | createFile(dir2_4_2, "file1", 20); 642 | createFile(dir2_4_2, "file2", 20); 643 | createFile(dir2_4_2, "file3", 10); 644 | 645 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file1").getAbsolutePath())); 646 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file2").getAbsolutePath())); 647 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file3").getAbsolutePath())); 648 | 649 | 650 | Crush crush = new Crush(); 651 | 652 | crush.setConf(job); 653 | crush.setFileSystem(fileSystem); 654 | 655 | /* 656 | * Call these in the same order that run() does.
657 | */ 658 | crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730"); 659 | crush.writeDirs(); 660 | 661 | 662 | /* 663 | * Verify bucket contents. 664 | */ 665 | 666 | List actualBucketFiles = new ArrayList(); 667 | 668 | Text key = new Text(); 669 | Text value = new Text(); 670 | 671 | Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job); 672 | 673 | while(reader.next(key, value)) { 674 | actualBucketFiles.add(format("%s\t%s", key, value)); 675 | } 676 | 677 | reader.close(); 678 | 679 | Collections.sort(expectedBucketFiles); 680 | Collections.sort(actualBucketFiles); 681 | 682 | assertThat(actualBucketFiles, equalTo(expectedBucketFiles)); 683 | 684 | /* 685 | * Verify the partition map. 686 | */ 687 | Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job); 688 | 689 | IntWritable partNum = new IntWritable(); 690 | 691 | Map actualPartitions = new HashMap(); 692 | 693 | while (partitionMapReader.next(key, partNum)) { 694 | actualPartitions.put(key.toString(), partNum.get()); 695 | } 696 | 697 | partitionMapReader.close(); 698 | 699 | /* 700 | * These crush files need to be allocated into 5 partitions: 701 | * 702 | * in/2-1 55 bytes 703 | * in/1/1.1-0 45 bytes 704 | * in/1/1.1-2 40 bytes 705 | * in/1/1.1-1 40 bytes 706 | * in/1/1.2-0 30 bytes 707 | * in/2/2.2-1 40 bytes 708 | * in/2/2.4/2.4.2-0 50 bytes 709 | * 710 | * 0 1 2 3 4 711 | * in/2-1 55 in/2/2.4/2.4.2-0 50 in/1/1.1-0 45 in/1/1.1-2 40 in/1/1.1-1 40 712 | * in/2/2.2-1 40 in/1/1.2-0 30 713 | */ 714 | Map expectedPartitions = new HashMap(); 715 | 716 | //TODO: this may not be deterministic due to jvm/hashmap/filesystem 717 | expectedPartitions.put(dir2.getAbsolutePath() + "-1", 0); 718 | expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0", 1); 719 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-0", 2); 720 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-2", 4); 721 | expectedPartitions.put(dir2_2.getAbsolutePath() + "-1", 3); 722 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-1", 3); 723 | expectedPartitions.put(dir1_2.getAbsolutePath() + "-0", 4); 724 | 725 | assertThat(actualPartitions, equalTo(expectedPartitions)); 726 | 727 | 728 | /* 729 | * Verify counters. 730 | */ 731 | Counters actualCounters = new Counters(); 732 | 733 | DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters()); 734 | 735 | actualCounters.readFields(countersStream); 736 | 737 | countersStream.close(); 738 | 739 | assertThat(actualCounters, equalTo(expectedCounters)); 740 | } 741 | 742 | /** 743 | * Returns a qualified file status, just like {@link FileSystem#listStatus(Path)} does. 744 | */ 745 | private static FileStatus createFile(File dir, String fileName, int size) { 746 | File file = new File(dir, fileName); 747 | 748 | try { 749 | FileOutputStream os = new FileOutputStream(file); 750 | 751 | os.write(new byte[size]); 752 | 753 | os.close(); 754 | } catch (IOException e) { 755 | throw new RuntimeException(e); 756 | } 757 | 758 | return new FileStatus(size, false, 3, 1024 * 1024 * 60, currentTimeMillis(), new Path("hdfs://hostname.pvt:12345" + file.getAbsolutePath())); 759 | } 760 | 761 | /** 762 | * This exists only so we can impose a specific order on the files that are listed.
763 | */ 764 | private static class SortingFileSystem extends FileSystem { 765 | 766 | private final FileSystem delegate; 767 | 768 | public SortingFileSystem(FileSystem delegate) { 769 | super(); 770 | 771 | this.delegate = delegate; 772 | } 773 | 774 | @Override 775 | public FileStatus[] listStatus(Path arg0) throws IOException { 776 | FileStatus[] contents = delegate.listStatus(arg0); 777 | 778 | Arrays.sort(contents); 779 | 780 | return contents; 781 | } 782 | 783 | @Override 784 | public FSDataOutputStream append(Path arg0, int arg1, Progressable arg2) throws IOException { 785 | return delegate.append(arg0, arg1, arg2); 786 | } 787 | 788 | @Override 789 | public FSDataOutputStream append(Path f, int bufferSize) throws IOException { 790 | return delegate.append(f, bufferSize); 791 | } 792 | 793 | @Override 794 | public FSDataOutputStream append(Path f) throws IOException { 795 | return delegate.append(f); 796 | } 797 | 798 | @Override 799 | public void close() throws IOException { 800 | delegate.close(); 801 | } 802 | 803 | @Override 804 | public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { 805 | delegate.completeLocalOutput(fsOutputFile, tmpLocalFile); 806 | } 807 | 808 | @Override 809 | public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) throws IOException { 810 | delegate.copyFromLocalFile(delSrc, overwrite, src, dst); 811 | } 812 | 813 | @Override 814 | public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) throws IOException { 815 | delegate.copyFromLocalFile(delSrc, overwrite, srcs, dst); 816 | } 817 | 818 | @Override 819 | public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { 820 | delegate.copyFromLocalFile(delSrc, src, dst); 821 | } 822 | 823 | @Override 824 | public void copyFromLocalFile(Path src, Path dst) throws IOException { 825 | delegate.copyFromLocalFile(src, dst); 826 | } 827 | 828 | @Override 829 | public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { 830 | delegate.copyToLocalFile(delSrc, src, dst); 831 | } 832 | 833 | @Override 834 | public void copyToLocalFile(Path src, Path dst) throws IOException { 835 | delegate.copyToLocalFile(src, dst); 836 | } 837 | 838 | @Override 839 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress) throws IOException { 840 | return delegate.create(f, overwrite, bufferSize, progress); 841 | } 842 | 843 | @Override 844 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, 845 | Progressable progress) throws IOException { 846 | return delegate.create(f, overwrite, bufferSize, replication, blockSize, progress); 847 | } 848 | 849 | @Override 850 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) 851 | throws IOException { 852 | return delegate.create(f, overwrite, bufferSize, replication, blockSize); 853 | } 854 | 855 | @Override 856 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException { 857 | return delegate.create(f, overwrite, bufferSize); 858 | } 859 | 860 | @Override 861 | public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { 862 | return delegate.create(f, overwrite); 863 | } 864 | 865 | @Override 866 | public FSDataOutputStream create(Path arg0, FsPermission arg1, boolean arg2, int arg3, short arg4, long arg5, 867 | Progressable 
arg6) throws IOException { 868 | return delegate.create(arg0, arg1, arg2, arg3, arg4, arg5, arg6); 869 | } 870 | 871 | @Override 872 | public FSDataOutputStream create(Path f, Progressable progress) throws IOException { 873 | return delegate.create(f, progress); 874 | } 875 | 876 | @Override 877 | public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { 878 | return delegate.create(f, replication, progress); 879 | } 880 | 881 | @Override 882 | public FSDataOutputStream create(Path f, short replication) throws IOException { 883 | return delegate.create(f, replication); 884 | } 885 | 886 | @Override 887 | public FSDataOutputStream create(Path f) throws IOException { 888 | return delegate.create(f); 889 | } 890 | 891 | @Override 892 | public boolean createNewFile(Path f) throws IOException { 893 | return delegate.createNewFile(f); 894 | } 895 | 896 | @Override 897 | public boolean delete(Path arg0, boolean arg1) throws IOException { 898 | return delegate.delete(arg0, arg1); 899 | } 900 | 901 | @Override 902 | public boolean delete(Path arg0) throws IOException { 903 | return delegate.delete(arg0); 904 | } 905 | 906 | @Override 907 | public boolean deleteOnExit(Path f) throws IOException { 908 | return delegate.deleteOnExit(f); 909 | } 910 | 911 | @Override 912 | public boolean equals(Object obj) { 913 | return delegate.equals(obj); 914 | } 915 | 916 | @Override 917 | public boolean exists(Path arg0) throws IOException { 918 | return delegate.exists(arg0); 919 | } 920 | 921 | @Override 922 | public long getBlockSize(Path f) throws IOException { 923 | return delegate.getBlockSize(f); 924 | } 925 | 926 | @Override 927 | public Configuration getConf() { 928 | return delegate.getConf(); 929 | } 930 | 931 | @Override 932 | public ContentSummary getContentSummary(Path arg0) throws IOException { 933 | return delegate.getContentSummary(arg0); 934 | } 935 | 936 | @Override 937 | public long getDefaultBlockSize() { 938 | return delegate.getDefaultBlockSize(); 939 | } 940 | 941 | @Override 942 | public short getDefaultReplication() { 943 | return delegate.getDefaultReplication(); 944 | } 945 | 946 | @Override 947 | public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException { 948 | return delegate.getFileBlockLocations(file, start, len); 949 | } 950 | 951 | @Override 952 | public FileChecksum getFileChecksum(Path f) throws IOException { 953 | return delegate.getFileChecksum(f); 954 | } 955 | 956 | @Override 957 | public FileStatus getFileStatus(Path arg0) throws IOException { 958 | return delegate.getFileStatus(arg0); 959 | } 960 | 961 | @Override 962 | public Path getHomeDirectory() { 963 | return delegate.getHomeDirectory(); 964 | } 965 | 966 | @Override 967 | public long getLength(Path f) throws IOException { 968 | return delegate.getLength(f); 969 | } 970 | 971 | @Override 972 | public String getName() { 973 | return delegate.getName(); 974 | } 975 | 976 | @Override 977 | public short getReplication(Path src) throws IOException { 978 | return delegate.getReplication(src); 979 | } 980 | 981 | @Override 982 | public URI getUri() { 983 | return delegate.getUri(); 984 | } 985 | 986 | @Override 987 | public long getUsed() throws IOException { 988 | return delegate.getUsed(); 989 | } 990 | 991 | @Override 992 | public Path getWorkingDirectory() { 993 | return delegate.getWorkingDirectory(); 994 | } 995 | 996 | @Override 997 | public FileStatus[] globStatus(Path arg0, PathFilter arg1) throws IOException { 998 | return 
delegate.globStatus(arg0, arg1); 999 | } 1000 | 1001 | @Override 1002 | public FileStatus[] globStatus(Path pathPattern) throws IOException { 1003 | return delegate.globStatus(pathPattern); 1004 | } 1005 | 1006 | @Override 1007 | public int hashCode() { 1008 | return delegate.hashCode(); 1009 | } 1010 | 1011 | @Override 1012 | public void initialize(URI name, Configuration conf) throws IOException { 1013 | delegate.initialize(name, conf); 1014 | } 1015 | 1016 | @Override 1017 | public boolean isDirectory(Path arg0) throws IOException { 1018 | return delegate.isDirectory(arg0); 1019 | } 1020 | 1021 | @Override 1022 | public boolean isFile(Path arg0) throws IOException { 1023 | return delegate.isFile(arg0); 1024 | } 1025 | 1026 | @Override 1027 | public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException { 1028 | return delegate.listStatus(f, filter); 1029 | } 1030 | 1031 | @Override 1032 | public FileStatus[] listStatus(Path[] arg0, PathFilter arg1) throws IOException { 1033 | return delegate.listStatus(arg0, arg1); 1034 | } 1035 | 1036 | @Override 1037 | public FileStatus[] listStatus(Path[] files) throws IOException { 1038 | return delegate.listStatus(files); 1039 | } 1040 | 1041 | @Override 1042 | public Path makeQualified(Path path) { 1043 | return delegate.makeQualified(path); 1044 | } 1045 | 1046 | @Override 1047 | public boolean mkdirs(Path arg0, FsPermission arg1) throws IOException { 1048 | return delegate.mkdirs(arg0, arg1); 1049 | } 1050 | 1051 | @Override 1052 | public boolean mkdirs(Path f) throws IOException { 1053 | return delegate.mkdirs(f); 1054 | } 1055 | 1056 | @Override 1057 | public void moveFromLocalFile(Path src, Path dst) throws IOException { 1058 | delegate.moveFromLocalFile(src, dst); 1059 | } 1060 | 1061 | @Override 1062 | public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { 1063 | delegate.moveFromLocalFile(srcs, dst); 1064 | } 1065 | 1066 | @Override 1067 | public void moveToLocalFile(Path src, Path dst) throws IOException { 1068 | delegate.moveToLocalFile(src, dst); 1069 | } 1070 | 1071 | @Override 1072 | public FSDataInputStream open(Path arg0, int arg1) throws IOException { 1073 | return delegate.open(arg0, arg1); 1074 | } 1075 | 1076 | @Override 1077 | public FSDataInputStream open(Path f) throws IOException { 1078 | return delegate.open(f); 1079 | } 1080 | 1081 | @Override 1082 | public boolean rename(Path arg0, Path arg1) throws IOException { 1083 | return delegate.rename(arg0, arg1); 1084 | } 1085 | 1086 | @Override 1087 | public void setConf(Configuration conf) { 1088 | if (null != delegate) { 1089 | delegate.setConf(conf); 1090 | } 1091 | } 1092 | 1093 | @Override 1094 | public void setOwner(Path p, String username, String groupname) throws IOException { 1095 | delegate.setOwner(p, username, groupname); 1096 | } 1097 | 1098 | @Override 1099 | public void setPermission(Path p, FsPermission permission) throws IOException { 1100 | delegate.setPermission(p, permission); 1101 | } 1102 | 1103 | @Override 1104 | public boolean setReplication(Path src, short replication) throws IOException { 1105 | return delegate.setReplication(src, replication); 1106 | } 1107 | 1108 | @Override 1109 | public void setTimes(Path p, long mtime, long atime) throws IOException { 1110 | delegate.setTimes(p, mtime, atime); 1111 | } 1112 | 1113 | @Override 1114 | public void setVerifyChecksum(boolean verifyChecksum) { 1115 | delegate.setVerifyChecksum(verifyChecksum); 1116 | } 1117 | 1118 | @Override 1119 | public void setWorkingDirectory(Path 
arg0) { 1120 | delegate.setWorkingDirectory(arg0); 1121 | } 1122 | 1123 | @Override 1124 | public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { 1125 | return delegate.startLocalOutput(fsOutputFile, tmpLocalFile); 1126 | } 1127 | 1128 | @Override 1129 | public String toString() { 1130 | return delegate.toString(); 1131 | } 1132 | 1133 | 1134 | } 1135 | } 1136 | --------------------------------------------------------------------------------
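
The tests above drive Crush through ToolRunner with options first, followed by the source directory, the output directory, and a yyyyMMddHHmmss timestamp. The snippet below is a minimal sketch of that same invocation pattern lifted out of CrushTest; the input/output paths, the option values, and the wrapper class name are illustrative assumptions, not part of the project.

// A minimal sketch, assuming the argument conventions exercised in CrushTest:
// options first, then source dir, output dir, and a yyyyMMddHHmmss timestamp.
// Paths and option values here are placeholders.
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;

import com.m6d.filecrush.crush.Crush;

public class CrushInvocationSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // ToolRunner parses generic Hadoop options and passes the rest to Crush.
    int exitCode = ToolRunner.run(job, new Crush(), new String[] {
        "--input-format", "org.apache.hadoop.mapred.TextInputFormat",
        "--output-format", "org.apache.hadoop.mapred.TextOutputFormat",
        "--threshold", "0.5",
        "--max-file-blocks", "100",
        "/user/example/in", "/user/example/out", "20101116123015"
    });

    System.exit(exitCode);
  }
}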