├── NOTICE.txt ├── src ├── main │ └── java │ │ └── com │ │ └── m6d │ │ └── filecrush │ │ ├── crush │ │ ├── ReducerCounter.java │ │ ├── MapperCounter.java │ │ ├── FileStatusHasSize.java │ │ ├── CountersMapper.java │ │ ├── CrushPartitioner.java │ │ ├── CountersInputFormat.java │ │ ├── KeyValuePreservingTextInputFormat.java │ │ ├── Bucketer.java │ │ └── CrushReducer.java │ │ └── clean │ │ └── Clean.java └── test │ ├── java │ └── com │ │ └── m6d │ │ └── filecrush │ │ ├── crush │ │ ├── CountersMapperTest.java │ │ ├── BucketerTest.java │ │ ├── KeyValuePreservingRecordReaderDelegationTest.java │ │ ├── KeyValuePreservingRecordReaderNextTest.java │ │ ├── CrushStandAloneSequenceFileTest.java │ │ ├── CrushStandAloneTextTest.java │ │ ├── CrushPartitionerTest.java │ │ ├── BucketerParameterizedTest.java │ │ ├── CrushReducerTest.java │ │ ├── CrushOptionParsingTest.java │ │ ├── CrushReducerParameterizedTest.java │ │ └── CrushTest.java │ │ └── clean │ │ └── TestClean.java │ └── resources │ └── help.txt ├── pom.xml └── README /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Hadoop Filecrush 2 | Copyright 2010-2013 m6d Media6degrees 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/ReducerCounter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | public enum ReducerCounter { 19 | FILES_CRUSHED, RECORDS_CRUSHED 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/MapperCounter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
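ReducerCounter above and the MapperCounter enum that follows are plain enums reported through the old mapred counter API. A minimal sketch of how a planning-phase mapper would bump them, assuming an illustrative mapper shell that is not part of this repository (only the MapperCounter values are real):

    // Sketch only: incrementing the crush counters from an old-API mapper.
    public class CountingMapperSketch extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> out, Reporter reporter)
                throws IOException {
            reporter.incrCounter(MapperCounter.FILES_FOUND, 1);    // one input file examined
            reporter.incrCounter(MapperCounter.FILES_ELIGIBLE, 1); // small enough to be crushed
            out.collect(key, value);
        }
    }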
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | public enum MapperCounter { 19 | DIRS_FOUND, DIRS_SKIPPED, DIRS_ELIGIBLE, FILES_FOUND, FILES_SKIPPED, FILES_ELIGIBLE 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/FileStatusHasSize.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import org.apache.hadoop.fs.FileStatus; 19 | 20 | import com.m6d.filecrush.crush.Bucketer.HasSize; 21 | 22 | 23 | class FileStatusHasSize implements HasSize { 24 | 25 | private final FileStatus fileStatus; 26 | 27 | public FileStatusHasSize(FileStatus fileStatus) { 28 | super(); 29 | 30 | if (null == fileStatus) { 31 | throw new NullPointerException("File status"); 32 | } 33 | 34 | this.fileStatus = fileStatus; 35 | } 36 | 37 | @Override 38 | public String id() { 39 | return fileStatus.getPath().toUri().getPath(); 40 | } 41 | 42 | @Override 43 | public long size() { 44 | return fileStatus.getLen(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CountersMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.io.NullWritable; 21 | import org.apache.hadoop.io.Text; 22 | import org.apache.hadoop.mapred.Counters; 23 | import org.apache.hadoop.mapred.Counters.Counter; 24 | import org.apache.hadoop.mapred.Counters.Group; 25 | import org.apache.hadoop.mapred.JobConf; 26 | import org.apache.hadoop.mapred.Mapper; 27 | import org.apache.hadoop.mapred.OutputCollector; 28 | import org.apache.hadoop.mapred.Reporter; 29 | 30 | /** 31 | * Exists only to load the counters created during the planning phase into the reporter. 32 | */ 33 | @SuppressWarnings("deprecation") 34 | public class CountersMapper implements Mapper { 35 | 36 | @Override 37 | public void configure(JobConf job) { 38 | /* 39 | * Nothing to do here. 
40 | */ 41 | } 42 | 43 | @Override 44 | public void map(Counters key, NullWritable value, OutputCollector collector, Reporter reporter) throws IOException { 45 | for (Group group : key) { 46 | for (Counter counter : group) { 47 | reporter.incrCounter(group.getName(), counter.getName(), counter.getValue()); 48 | } 49 | } 50 | } 51 | 52 | @Override 53 | public void close() throws IOException { 54 | /* 55 | * Nothing to do here. 56 | */ 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CountersMapperTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.mapred.Counters; 21 | import org.apache.hadoop.mapred.Reporter; 22 | import org.easymock.EasyMockSupport; 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | 26 | import com.m6d.filecrush.crush.CountersMapper; 27 | import com.m6d.filecrush.crush.MapperCounter; 28 | 29 | @SuppressWarnings("deprecation") 30 | public class CountersMapperTest extends EasyMockSupport { 31 | 32 | private Reporter reporter; 33 | 34 | private CountersMapper mapper; 35 | 36 | @Before 37 | public void before() { 38 | reporter = createMock("reporter", Reporter.class); 39 | 40 | mapper = new CountersMapper(); 41 | } 42 | 43 | @Test 44 | public void map() throws IOException { 45 | Counters counters = new Counters(); 46 | 47 | counters.incrCounter(MapperCounter.DIRS_FOUND, 1); 48 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_FOUND.name(), 1); 49 | 50 | counters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 2); 51 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_ELIGIBLE.name(), 2); 52 | 53 | counters.incrCounter(MapperCounter.DIRS_SKIPPED, 3); 54 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.DIRS_SKIPPED.name(), 3); 55 | 56 | counters.incrCounter(MapperCounter.FILES_FOUND, 4); 57 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.FILES_FOUND.name(), 4); 58 | 59 | counters.incrCounter(MapperCounter.FILES_SKIPPED, 5); 60 | reporter.incrCounter(MapperCounter.class.getName(), MapperCounter.FILES_SKIPPED.name(), 5); 61 | 62 | replayAll(); 63 | 64 | mapper.map(counters, null, null, reporter); 65 | 66 | verifyAll(); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/BucketerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.util.Collections.emptyList; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.junit.Assert.assertThat; 21 | import static org.junit.Assert.fail; 22 | 23 | import org.apache.hadoop.fs.FileStatus; 24 | import org.junit.Before; 25 | import org.junit.Test; 26 | 27 | import com.m6d.filecrush.crush.Bucketer; 28 | import com.m6d.filecrush.crush.FileStatusHasSize; 29 | import com.m6d.filecrush.crush.Bucketer.HasSize; 30 | 31 | 32 | public class BucketerTest { 33 | 34 | private Bucketer bucketer; 35 | 36 | @Before 37 | public void before() { 38 | bucketer = new Bucketer(5, 50, true); 39 | } 40 | 41 | @Test(expected = IllegalStateException.class) 42 | public void callAddBeforeReset() { 43 | bucketer.add(new FileStatusHasSize(new FileStatus())); 44 | } 45 | 46 | @Test(expected = IllegalStateException.class) 47 | public void callCreateBeforeReset() { 48 | bucketer.createBuckets(); 49 | } 50 | 51 | @Test 52 | public void addNullCheck() { 53 | bucketer.reset("foo"); 54 | 55 | try { 56 | bucketer.add(null); 57 | fail(); 58 | } catch (NullPointerException ok) { 59 | } 60 | } 61 | 62 | @Test(expected = NullPointerException.class) 63 | public void resestNullCheck() { 64 | bucketer.reset(null); 65 | } 66 | 67 | @Test(expected = IllegalArgumentException.class) 68 | public void resestEmptyCheck() { 69 | bucketer.reset(""); 70 | } 71 | 72 | @Test 73 | public void nothingAdded() { 74 | bucketer.reset("test"); 75 | 76 | assertThat(bucketer.createBuckets(), equalTo((Object) emptyList())); 77 | } 78 | 79 | @Test 80 | public void addZeroSize() { 81 | bucketer.reset("test"); 82 | 83 | bucketer.add(new HasSize() { 84 | @Override 85 | public String id() { 86 | return "test"; 87 | } 88 | 89 | @Override 90 | public long size() { 91 | return 0; 92 | } 93 | }); 94 | 95 | assertThat(bucketer.createBuckets(), equalTo((Object) emptyList())); 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/KeyValuePreservingRecordReaderDelegationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.not; 19 | import static org.hamcrest.Matchers.nullValue; 20 | import static org.hamcrest.Matchers.sameInstance; 21 | import static org.junit.Assert.assertThat; 22 | import static org.mockito.Mockito.verify; 23 | 24 | import java.io.IOException; 25 | 26 | import org.apache.hadoop.io.LongWritable; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.RecordReader; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | import org.junit.runner.RunWith; 32 | import org.mockito.Mock; 33 | import org.mockito.runners.MockitoJUnitRunner; 34 | 35 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat.KeyValuePreservingRecordReader; 36 | 37 | 38 | @RunWith(MockitoJUnitRunner.class) 39 | public class KeyValuePreservingRecordReaderDelegationTest { 40 | 41 | @Mock 42 | private PartialRecordReader delegate; 43 | 44 | private KeyValuePreservingRecordReader reader; 45 | 46 | @Before 47 | public void before() { 48 | reader = new KeyValuePreservingRecordReader(delegate); 49 | } 50 | 51 | @Test 52 | public void createValueDelegation() { 53 | reader.createValue(); 54 | 55 | verify(delegate).createValue(); 56 | } 57 | 58 | @Test 59 | public void getPosDelegation() throws IOException { 60 | reader.getPos(); 61 | 62 | verify(delegate).getPos(); 63 | } 64 | 65 | @Test 66 | public void closeDelegation() throws IOException { 67 | reader.close(); 68 | 69 | verify(delegate).close(); 70 | } 71 | 72 | public void createKeyDoesNotDelegate() { 73 | Text key = reader.createKey(); 74 | 75 | assertThat(key, not(nullValue())); 76 | assertThat(reader.createKey(), not(sameInstance(key))); 77 | } 78 | 79 | public static abstract class PartialRecordReader implements RecordReader { 80 | @Override 81 | public boolean next(LongWritable key, Text value) throws IOException { 82 | throw new AssertionError(); 83 | } 84 | 85 | @Override 86 | public LongWritable createKey() { 87 | throw new AssertionError(); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
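The CrushPartitioner source that follows reads a partition map from the SequenceFile named by the crush.partition.map property: Text bucket ids keyed to IntWritable reduce-task numbers. A minimal sketch of producing such a file, assuming a JobConf named job and illustrative paths and bucket ids:

    // Sketch: writing a partition map that CrushPartitioner.configure() can load.
    FileSystem fs = FileSystem.get(job);
    Path partitionMap = new Path("tmp/crush/partition-map");         // hypothetical location
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    writer.append(new Text("in/subdir-0"), new IntWritable(0));      // bucket id -> reduce task
    writer.append(new Text("in/subdir-1"), new IntWritable(1));
    writer.close();
    job.set("crush.partition.map", partitionMap.toString());
    job.setNumReduceTasks(2);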
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | import java.util.HashMap; 20 | import java.util.HashSet; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.IntWritable; 26 | import org.apache.hadoop.io.SequenceFile.Reader; 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.Partitioner; 30 | 31 | @SuppressWarnings("deprecation") 32 | public class CrushPartitioner implements Partitioner { 33 | 34 | private Map bucketToPartition; 35 | 36 | @Override 37 | public void configure(JobConf job) { 38 | String path = job.get("crush.partition.map"); 39 | int expPartitions = job.getNumReduceTasks(); 40 | 41 | bucketToPartition = new HashMap(100); 42 | 43 | try { 44 | FileSystem fs = FileSystem.get(job); 45 | 46 | Reader reader = new Reader(fs, new Path(path), job); 47 | 48 | Text bucket = new Text(); 49 | IntWritable partNum = new IntWritable(); 50 | 51 | while (reader.next(bucket, partNum)) { 52 | int partNumValue = partNum.get(); 53 | 54 | if (partNumValue < 0 || partNumValue >= expPartitions) { 55 | throw new IllegalArgumentException("Partition " + partNumValue + " not allowed with " + expPartitions + " reduce tasks"); 56 | } 57 | 58 | Integer prev = bucketToPartition.put(new Text(bucket), partNumValue); 59 | 60 | if (null != prev) { 61 | throw new IllegalArgumentException("Bucket " + bucket + " appears more than once in " + path); 62 | } 63 | } 64 | } catch (IOException e) { 65 | throw new RuntimeException("Could not read partition map from " + path, e); 66 | } 67 | 68 | if (new HashSet(bucketToPartition.values()).size() > expPartitions) { 69 | throw new IllegalArgumentException(path + " contains more than " + expPartitions + " distinct partitions"); 70 | } 71 | } 72 | 73 | @Override 74 | public int getPartition(Text bucketId, Text fileName, int numPartitions) { 75 | return bucketToPartition.get(bucketId); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
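CountersInputFormat, shown next, replays a Counters object that the planning phase serialized to a single file; its reader simply calls readFields on the first record. A sketch of writing a file it can consume, assuming a FileSystem named fs and an illustrative path:

    // Sketch: persisting planning counters so CountersInputFormat can hand them to CountersMapper.
    Counters planned = new Counters();
    planned.incrCounter(MapperCounter.DIRS_FOUND, 3);
    planned.incrCounter(MapperCounter.FILES_ELIGIBLE, 42);
    FSDataOutputStream out = fs.create(new Path("tmp/crush/counters"));
    planned.write(out);   // standard Writable form, read back by CountersReader.next()
    out.close();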
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.fs.FSDataInputStream; 21 | import org.apache.hadoop.fs.FileSystem; 22 | import org.apache.hadoop.fs.Path; 23 | import org.apache.hadoop.io.NullWritable; 24 | import org.apache.hadoop.mapred.Counters; 25 | import org.apache.hadoop.mapred.FileInputFormat; 26 | import org.apache.hadoop.mapred.FileSplit; 27 | import org.apache.hadoop.mapred.InputSplit; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.mapred.Reporter; 31 | 32 | @SuppressWarnings("deprecation") 33 | public class CountersInputFormat extends FileInputFormat { 34 | 35 | @Override 36 | protected boolean isSplitable(FileSystem fs, Path filename) { 37 | return false; 38 | } 39 | 40 | @Override 41 | public RecordReader getRecordReader(InputSplit inputSplit, JobConf jobconf, Reporter reporter) 42 | throws IOException { 43 | 44 | if (!(inputSplit instanceof FileSplit)) { 45 | throw new AssertionError(); 46 | } 47 | 48 | FileSplit fSplit = (FileSplit) inputSplit; 49 | 50 | Path path = fSplit.getPath(); 51 | long length = fSplit.getLength(); 52 | 53 | FileSystem fs = FileSystem.get(jobconf); 54 | 55 | FSDataInputStream is = fs.open(path); 56 | 57 | return new CountersReader(is, length); 58 | } 59 | 60 | private static class CountersReader implements RecordReader { 61 | 62 | private final FSDataInputStream in; 63 | 64 | private final long length; 65 | 66 | public CountersReader(FSDataInputStream in, long length) { 67 | super(); 68 | 69 | this.in = in; 70 | this.length = length; 71 | } 72 | 73 | @Override 74 | public Counters createKey() { 75 | return new Counters(); 76 | } 77 | 78 | @Override 79 | public NullWritable createValue() { 80 | return NullWritable.get(); 81 | } 82 | 83 | @Override 84 | public long getPos() throws IOException { 85 | return in.getPos(); 86 | } 87 | 88 | @Override 89 | public float getProgress() throws IOException { 90 | float percent = ((float) length) / in.getPos(); 91 | 92 | return percent; 93 | } 94 | 95 | @Override 96 | public boolean next(Counters key, NullWritable value) throws IOException { 97 | if (0 == in.getPos()) { 98 | key.readFields(in); 99 | 100 | return true; 101 | } 102 | 103 | return false; 104 | } 105 | 106 | @Override 107 | public void close() throws IOException { 108 | in.close(); 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/KeyValuePreservingTextInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
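KeyValuePreservingTextInputFormat, which follows, wraps TextInputFormat and re-splits each line on its first tab so that the reducer sees the original key and value instead of a byte offset. A sketch of the effect on a single line, assuming an illustrative delegate RecordReader<LongWritable, Text> positioned at that line:

    // Sketch: effect of the preserving reader on the line "userA\tclick\t2013-01-01".
    KeyValuePreservingRecordReader reader = new KeyValuePreservingRecordReader(delegate);
    Text key = reader.createKey();
    Text value = reader.createValue();
    reader.next(key, value);
    // key.toString()   -> "userA"
    // value.toString() -> "click\t2013-01-01"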
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.fs.FileSystem; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.io.LongWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapred.FileInputFormat; 25 | import org.apache.hadoop.mapred.FileSplit; 26 | import org.apache.hadoop.mapred.InputSplit; 27 | import org.apache.hadoop.mapred.JobConf; 28 | import org.apache.hadoop.mapred.LineRecordReader; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.mapred.Reporter; 31 | import org.apache.hadoop.mapred.TextInputFormat; 32 | 33 | /** 34 | * {@link TextInputFormat} creates keys of {@link LongWritable} offsets and {@link Text} values, which contain the line. For file 35 | * crushing, we need to preserve the keys and values as they appear in the file, which means we must discard the byte offsets and 36 | * divide the value into the original key and value pairs. 37 | */ 38 | @SuppressWarnings("deprecation") 39 | public class KeyValuePreservingTextInputFormat extends FileInputFormat { 40 | 41 | private TextInputFormat delegate; 42 | 43 | public void configure(JobConf conf) { 44 | delegate = new TextInputFormat(); 45 | delegate.configure(conf); 46 | } 47 | 48 | @Override 49 | protected boolean isSplitable(FileSystem fs, Path file) { 50 | /* 51 | * Return false because the reducer opens the file from beginning to end. 52 | */ 53 | return false; 54 | } 55 | 56 | @Override 57 | public RecordReader getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException { 58 | 59 | reporter.setStatus(genericSplit.toString()); 60 | 61 | return new KeyValuePreservingRecordReader(new LineRecordReader(job, (FileSplit) genericSplit)); 62 | } 63 | 64 | static class KeyValuePreservingRecordReader implements RecordReader { 65 | 66 | private final RecordReader delegate; 67 | 68 | private final LongWritable delKey = new LongWritable(); 69 | 70 | private final Text delValue = new Text(); 71 | 72 | public KeyValuePreservingRecordReader(RecordReader delegate) { 73 | super(); 74 | 75 | this.delegate = delegate; 76 | } 77 | 78 | @Override 79 | public Text createKey() { 80 | return new Text(); 81 | } 82 | 83 | @Override 84 | public Text createValue() { 85 | return delegate.createValue(); 86 | } 87 | 88 | @Override 89 | public long getPos() throws IOException { 90 | return delegate.getPos(); 91 | } 92 | 93 | @Override 94 | public void close() throws IOException { 95 | delegate.close(); 96 | } 97 | 98 | @Override 99 | public float getProgress() throws IOException { 100 | return delegate.getProgress(); 101 | } 102 | 103 | @Override 104 | public boolean next(Text key, Text value) throws IOException { 105 | boolean next = delegate.next(delKey, delValue); 106 | 107 | if (next) { 108 | int first = delValue.find("\t"); 109 | 110 | if (first >= 0) { 111 | key.set(delValue.getBytes(), 0, first); 112 | 113 | if (delValue.getLength() > first) { 114 | value.set(delValue.getBytes(), first + 1, delValue.getLength() - first - 1); 115 | } else { 116 | value.clear(); 117 | } 118 | } else { 119 | key.set(delValue); 120 | } 121 | } 122 | 123 | return next; 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/KeyValuePreservingRecordReaderNextTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed 
under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.equalTo; 19 | import static org.hamcrest.Matchers.is; 20 | import static org.junit.Assert.assertThat; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.io.LongWritable; 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapred.RecordReader; 27 | import org.junit.Before; 28 | import org.junit.Test; 29 | 30 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat.KeyValuePreservingRecordReader; 31 | 32 | 33 | public class KeyValuePreservingRecordReaderNextTest implements RecordReader { 34 | 35 | private final Text key = new Text(); 36 | 37 | private final Text value = new Text(); 38 | 39 | private boolean next; 40 | 41 | private long offset; 42 | 43 | private String line; 44 | 45 | private KeyValuePreservingRecordReader reader; 46 | 47 | @Before 48 | public void before() { 49 | reader = new KeyValuePreservingRecordReader(this); 50 | } 51 | 52 | @Test 53 | public void nextDelegation() throws IOException { 54 | next = false; 55 | 56 | assertThat(reader.next(key, value), is(false)); 57 | } 58 | 59 | @Test 60 | public void keyAndValueArePreserved() throws IOException { 61 | next = true; 62 | 63 | /* 64 | * Key with multiple values. 65 | */ 66 | offset = 0; 67 | line = "key\tvalue0\tvalue1\tvalue2"; 68 | 69 | assertThat(reader.next(key, value), is(true)); 70 | 71 | assertThat(key.toString(), equalTo("key")); 72 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2")); 73 | 74 | 75 | /* 76 | * No key with tab and value. 77 | */ 78 | offset = offset + line.length() + 1; 79 | line = "\tvalue0\tvalue1\tvalue2"; 80 | assertThat(reader.next(key, value), is(true)); 81 | 82 | assertThat(key.toString(), equalTo("")); 83 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2")); 84 | 85 | 86 | /* 87 | * Key and tab, no value. 88 | */ 89 | offset = offset + line.length() + 1; 90 | line = "key and tab\t"; 91 | assertThat(reader.next(key, value), is(true)); 92 | 93 | assertThat(key.toString(), equalTo("key and tab")); 94 | assertThat(value.toString(), equalTo("")); 95 | 96 | 97 | /* 98 | * Key only. No tab or value. 99 | */ 100 | offset = offset + line.length() + 1; 101 | line = "key only"; 102 | assertThat(reader.next(key, value), is(true)); 103 | 104 | assertThat(key.toString(), equalTo("key only")); 105 | assertThat(value.toString(), equalTo("")); 106 | 107 | 108 | /* 109 | * Key and value again. 
110 | */ 111 | offset = offset + line.length() + 1; 112 | line = "a reeeeeeeally long key\tvalue0\tvalue1\tvalue2\tvalue3\tvalue4"; 113 | assertThat(reader.next(key, value), is(true)); 114 | 115 | assertThat(key.toString(), equalTo("a reeeeeeeally long key")); 116 | assertThat(value.toString(), equalTo("value0\tvalue1\tvalue2\tvalue3\tvalue4")); 117 | } 118 | 119 | @Override 120 | public boolean next(LongWritable key, Text value) throws IOException { 121 | if (next) { 122 | key.set(offset); 123 | value.set(line); 124 | } 125 | 126 | return next; 127 | } 128 | 129 | @Override 130 | public LongWritable createKey() { 131 | throw new AssertionError(); 132 | } 133 | 134 | @Override 135 | public Text createValue() { 136 | throw new AssertionError(); 137 | } 138 | 139 | @Override 140 | public long getPos() throws IOException { 141 | throw new AssertionError(); 142 | } 143 | 144 | @Override 145 | public void close() throws IOException { 146 | throw new AssertionError(); 147 | } 148 | 149 | @Override 150 | public float getProgress() throws IOException { 151 | throw new AssertionError(); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/clean/TestClean.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
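TestClean, which follows, drives the Clean tool through ToolRunner using configuration keys rather than command-line flags. An equivalent programmatic invocation outside the tests looks like this; the directory and cutoff are illustrative, and Clean rejects a configuration that sets both clean.cutoff.millis and clean.target.expr:

    // Sketch: age-based cleanup of a directory tree, mirroring what the tests below exercise.
    Configuration conf = new Configuration();
    conf.set(Clean.TARGET_DIR, "/data/tmp/reports");               // hypothetical target
    conf.setLong(Clean.CUTTOFF_MILLIS, 7L * 24 * 60 * 60 * 1000);  // remove entries older than a week
    int exitCode = ToolRunner.run(conf, new Clean(), new String[] {});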
15 | */ 16 | package com.m6d.filecrush.clean; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.fs.FSDataOutputStream; 22 | import org.apache.hadoop.fs.FileSystem; 23 | import org.apache.hadoop.fs.Path; 24 | import org.apache.hadoop.mapred.HadoopTestCase; 25 | import org.apache.hadoop.util.ToolRunner; 26 | import org.junit.Test; 27 | 28 | import com.m6d.filecrush.clean.Clean; 29 | 30 | public class TestClean extends HadoopTestCase{ 31 | 32 | private static final Path ROOT_DIR = new Path("testing"); 33 | 34 | public TestClean() throws IOException { 35 | super(HadoopTestCase.LOCAL_MR, HadoopTestCase.LOCAL_FS, 1, 1); 36 | } 37 | 38 | private Path getDir(Path dir) { 39 | if (isLocalFS()) { 40 | String localPathRoot = System 41 | .getProperty("test.build.data", "/tmp").replace(' ', '+'); 42 | dir = new Path(localPathRoot, dir); 43 | } 44 | return dir; 45 | } 46 | 47 | public void setUp() throws Exception { 48 | super.setUp(); 49 | Path rootDir = getDir(ROOT_DIR); 50 | Configuration conf = createJobConf(); 51 | FileSystem fs = FileSystem.get(conf); 52 | fs.delete(rootDir, true); 53 | } 54 | 55 | @Test 56 | public void testAge() throws Exception{ 57 | Configuration conf = createJobConf(); 58 | FileSystem fs = FileSystem.get(conf); 59 | fs.mkdirs( new Path(ROOT_DIR,"a") ); 60 | fs.mkdirs( new Path( new Path(ROOT_DIR,"a"),"1") ); 61 | fs.mkdirs( new Path(ROOT_DIR,"b") ); 62 | fs.mkdirs( new Path(ROOT_DIR,"c") ); 63 | fs.mkdirs( new Path( new Path(ROOT_DIR,"c"),"2") ); 64 | 65 | Path oldFile = new Path(new Path( new Path(ROOT_DIR,"a"),"1"),"oldfile"); 66 | FSDataOutputStream out = fs.create(oldFile); 67 | out.write("bla".getBytes()); 68 | out.close(); 69 | 70 | Path cFile = new Path(new Path( new Path(ROOT_DIR,"c"),"1"),"cfile"); 71 | FSDataOutputStream out2 = fs.create(cFile); 72 | out2.write("wah".getBytes()); 73 | out2.close(); 74 | 75 | assertEquals(true,fs.exists(cFile)); 76 | assertEquals(true,fs.exists(oldFile)); 77 | 78 | Clean cleanWarn = new Clean(); 79 | Configuration warnConf = createJobConf(); 80 | warnConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 81 | warnConf.set(Clean.TARGET_EXPR, "cfile"); 82 | warnConf.set(Clean.WARN_MODE, "true"); 83 | ToolRunner.run(warnConf, cleanWarn, new String[]{}); 84 | assertEquals(true,fs.exists(cFile)); 85 | assertEquals(true,fs.exists(oldFile)); 86 | 87 | Clean cleanReg = new Clean(); 88 | Configuration regConf = createJobConf(); 89 | regConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 90 | regConf.set(Clean.TARGET_EXPR, "cfile"); 91 | ToolRunner.run(regConf, cleanReg, new String[]{}); 92 | assertEquals(false,fs.exists(cFile)); 93 | assertEquals(true,fs.exists(oldFile)); 94 | 95 | Clean clean = new Clean(); 96 | Configuration cleanConf = createJobConf(); 97 | cleanConf.setLong(Clean.CUTTOFF_MILLIS, 20000); 98 | cleanConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 99 | ToolRunner.run(cleanConf, clean, new String[]{}); 100 | assertEquals(true,fs.exists(oldFile)); 101 | Thread.sleep(3); 102 | 103 | Clean clean2 = new Clean(); 104 | Configuration cleanConf2 = createJobConf(); 105 | cleanConf2.setLong(Clean.CUTTOFF_MILLIS, 1); 106 | cleanConf2.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 107 | ToolRunner.run(cleanConf2, clean2, new String[]{}); 108 | assertEquals(false,fs.exists(oldFile)); 109 | 110 | } 111 | 112 | @Test 113 | public void testNegatives() throws Exception{ 114 | Clean clean = new Clean(); 115 | Configuration cleanConf = createJobConf(); 116 | 
cleanConf.setLong(Clean.CUTTOFF_MILLIS, 20000); 117 | cleanConf.set(Clean.TARGET_DIR, ROOT_DIR.toString()); 118 | cleanConf.set(Clean.TARGET_EXPR, "bla"); 119 | int res = ToolRunner.run(cleanConf, clean, new String[]{}); 120 | assertEquals(9,res); 121 | } 122 | 123 | @Test 124 | public void testRootClean() throws Exception{ 125 | Clean clean = new Clean(); 126 | Configuration cleanConf = createJobConf(); 127 | cleanConf.set(Clean.TARGET_DIR, "/"); 128 | cleanConf.set(Clean.TARGET_EXPR, "bla"); 129 | int res = ToolRunner.run(cleanConf, clean, new String[]{}); 130 | assertEquals(2,res); 131 | } 132 | } -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/clean/Clean.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.clean; 17 | 18 | import java.io.IOException; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.conf.Configured; 22 | import org.apache.hadoop.fs.FileStatus; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.util.Tool; 26 | import org.apache.hadoop.util.ToolRunner; 27 | 28 | @SuppressWarnings("deprecation") 29 | public class Clean extends Configured implements Tool{ 30 | 31 | public static final String TARGET_DIR="clean.target.dir"; 32 | public static final String CUTTOFF_MILLIS="clean.cutoff.millis"; 33 | public static final String TARGET_EXPR="clean.target.expr"; 34 | public static final String WARN_MODE="clean.warn.mode"; 35 | 36 | protected FileSystem fs; 37 | protected Configuration conf; 38 | protected long cutoff; 39 | 40 | public Clean(){ 41 | super(); 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | Clean clean = new Clean(); 46 | int exitCode = ToolRunner.run(new Configuration(),clean, args); 47 | System.exit(exitCode); 48 | } 49 | 50 | @Override 51 | public int run(String[] args) throws Exception { 52 | conf = getConf(); 53 | 54 | try { 55 | fs=FileSystem.get(getConf()); 56 | } catch (IOException e) { 57 | throw new RuntimeException("Could not open filesystem"); 58 | } 59 | int pre = preFlightCheck(); 60 | if (pre!=0){ 61 | return pre; 62 | } 63 | 64 | if (conf.get(CUTTOFF_MILLIS)!=null){ 65 | long now=System.currentTimeMillis(); 66 | long targetAge= Long.parseLong(conf.get(CUTTOFF_MILLIS)); 67 | cutoff=now-targetAge; 68 | } 69 | 70 | return cleanup (new Path(conf.get(TARGET_DIR))); 71 | 72 | } 73 | 74 | public void warnOrDelete(Path p) throws IOException{ 75 | if (conf.getBoolean(WARN_MODE, false)){ 76 | System.out.println("DELETE "+p); 77 | } else { 78 | if ( p.equals( new Path(conf.get(TARGET_DIR)) )){ 79 | 80 | } else { 81 | fs.delete(p); 82 | } 83 | } 84 | } 85 | 86 | 87 | public int cleanup(Path p){ 88 | try { 89 | if (fs.isFile(p)){ 90 | if (conf.get(TARGET_EXPR)!=null){ 91 | if 
(p.getName().matches(conf.get(TARGET_EXPR))){ 92 | warnOrDelete(p); 93 | } 94 | } 95 | if (conf.get(CUTTOFF_MILLIS)!=null){ 96 | if (fs.getFileStatus(p).getModificationTime() < cutoff ){ 97 | warnOrDelete(p); 98 | } 99 | } 100 | } 101 | 102 | if (fs.isDirectory(p)){ 103 | for (FileStatus stat: fs.listStatus(p)){ 104 | cleanup( stat.getPath() ); 105 | } 106 | if (fs.listStatus(p).length == 0){ 107 | if (conf.get(TARGET_EXPR)!=null){ 108 | if (p.getName().matches(conf.get(TARGET_EXPR))){ 109 | warnOrDelete(p); 110 | } 111 | } 112 | if (conf.get(CUTTOFF_MILLIS)!=null){ 113 | if (fs.getFileStatus(p).getModificationTime() < cutoff ){ 114 | warnOrDelete(p); 115 | } 116 | } 117 | } 118 | } 119 | } catch (IOException e) { 120 | System.out.println("exception "+e); 121 | return 7; 122 | } 123 | return 0; 124 | } 125 | 126 | public int preFlightCheck(){ 127 | Configuration conf = getConf(); 128 | if (conf.get(TARGET_DIR) == null){ 129 | System.err.println("You must specify a target.dir"); 130 | return 1; 131 | } 132 | if (conf.get(TARGET_DIR).equals("/")){ 133 | System.err.println("Will not clean / !!!!!!"); 134 | return 2; 135 | } 136 | if ( fs.getHomeDirectory().equals( new Path(conf.get(TARGET_DIR)) ) ){ 137 | System.err.println("Will not clean home directory"); 138 | return 3; 139 | } 140 | if (conf.get(CUTTOFF_MILLIS)==null && conf.get(TARGET_EXPR)==null){ 141 | System.err.println("You must specify "+CUTTOFF_MILLIS+" or "+TARGET_EXPR); 142 | return 4; 143 | } 144 | if (!(conf.get(CUTTOFF_MILLIS)==null) && !(conf.get(TARGET_EXPR)==null)){ 145 | System.err.println("You can not specify "+CUTTOFF_MILLIS+" and "+TARGET_EXPR); 146 | return 9; 147 | } 148 | if (conf.get(CUTTOFF_MILLIS)!=null) { 149 | try { 150 | Long.parseLong(conf.get(CUTTOFF_MILLIS)); 151 | } catch (NumberFormatException ex){ 152 | System.err.println(CUTTOFF_MILLIS+" was specified as "+conf.get(CUTTOFF_MILLIS)+" this is not a long integer"); 153 | return 15; 154 | } 155 | } 156 | try { 157 | if (! 
fs.exists( new Path(conf.get(TARGET_DIR)))) { 158 | System.err.println(conf.get(TARGET_DIR)+" does not exist"); 159 | } 160 | } catch (IOException e) { 161 | System.err.println("IOEXCEPTION"+ e); 162 | return 6; 163 | } 164 | return 0; 165 | } 166 | 167 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.m6d 5 | filecrush 6 | M6D App - Filecrush 7 | 2.2.2-SNAPSHOT 8 | filecrush utility 9 | jar 10 | 11 | 0.20.2 12 | 1.2 13 | 1.0.4 14 | 2.3 15 | 3.0.1 16 | 1.2.13 17 | 1.6.1 18 | 1.1 19 | 4.8.2 20 | 1.8.5 21 | 1.2 22 | 3.0 23 | 6.1.14 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-eclipse-plugin 33 | 2.5.1 34 | 35 | [artifactId] 36 | true 37 | true 38 | 1.5 39 | 40 | org.eclipse.jdt.core.javabuilder 41 | org.maven.ide.eclipse.maven2Builder 42 | 43 | 44 | org.eclipse.jdt.core.javanature 45 | org.maven.ide.eclipse.maven2Nature 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | maven-compiler-plugin 56 | 57 | 1.6 58 | 1.6 59 | 60 | 61 | 62 | 63 | maven-jar-plugin 64 | 65 | 66 | 67 | 68 | 69 | 70 | jar 71 | 72 | jar 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | org.apache.hadoop 84 | hadoop-core 85 | ${hadoop.version} 86 | provided 87 | 88 | 89 | commons-logging 90 | commons-logging 91 | ${commons-logging.version} 92 | provided 93 | 94 | 95 | log4j 96 | log4j 97 | ${log4j.version} 98 | provided 99 | 100 | 101 | commons-httpclient 102 | commons-httpclient 103 | ${commons-httpclient.version} 104 | provided 105 | 106 | 107 | commons-lang 108 | commons-lang 109 | ${commons-lang.version} 110 | 111 | 112 | org.codehaus.plexus 113 | plexus-utils 114 | ${plexus-utils.version} 115 | provided 116 | 117 | 118 | commons-cli 119 | commons-cli 120 | ${commons-cli.version} 121 | provided 122 | 123 | 124 | 125 | org.mockito 126 | mockito-all 127 | ${mockito.version} 128 | test 129 | 130 | 131 | org.hamcrest 132 | hamcrest-core 133 | ${hamcrest.version} 134 | test 135 | 136 | 137 | org.hamcrest 138 | hamcrest-library 139 | ${hamcrest.version} 140 | test 141 | 142 | 143 | org.easymock 144 | easymock 145 | ${easymock.version} 146 | test 147 | 148 | 149 | junit 150 | junit 151 | ${junit.version} 152 | test 153 | 154 | 155 | org.apache.hadoop 156 | hadoop-test 157 | ${hadoop.version} 158 | provided 159 | 160 | 161 | org.slf4j 162 | slf4j-api 163 | ${slf4j.version} 164 | test 165 | 166 | 167 | org.slf4j 168 | slf4j-log4j12 169 | ${slf4j.version} 170 | test 171 | 172 | 173 | 174 | org.mortbay.jetty 175 | jetty 176 | ${jetty.version} 177 | test 178 | 179 | 180 | 181 | org.mortbay.jetty 182 | jetty-util 183 | ${jetty.version} 184 | test 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushStandAloneSequenceFileTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.junit.Assert.assertThat; 23 | 24 | import java.io.File; 25 | import java.io.IOException; 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | 29 | import org.apache.hadoop.fs.FileSystem; 30 | import org.apache.hadoop.fs.Path; 31 | import org.apache.hadoop.io.IntWritable; 32 | import org.apache.hadoop.io.SequenceFile; 33 | import org.apache.hadoop.io.SequenceFile.Reader; 34 | import org.apache.hadoop.io.SequenceFile.Writer; 35 | import org.apache.hadoop.io.Text; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.util.ToolRunner; 38 | import org.junit.After; 39 | import org.junit.Before; 40 | import org.junit.Rule; 41 | import org.junit.Test; 42 | import org.junit.rules.TemporaryFolder; 43 | 44 | import com.m6d.filecrush.crush.Crush; 45 | 46 | /** 47 | * Dfs block size will be set to 50 and threshold set to 20%. 48 | */ 49 | @SuppressWarnings("deprecation") 50 | public class CrushStandAloneSequenceFileTest { 51 | @Rule 52 | public final TemporaryFolder tmp = new TemporaryFolder(); 53 | 54 | private JobConf job; 55 | 56 | @Before 57 | public void setup() throws Exception { 58 | job = new JobConf(false); 59 | 60 | job.set("fs.default.name", "file:///"); 61 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 62 | job.setLong("dfs.block.size", 50); 63 | } 64 | 65 | /** 66 | * Crush creates a subdirectory in tmp to store all its transient data. Since this test uses the local file system, the present 67 | * working directory is the parent of tmp. We delete it here since it's not so useful to clutter the build directory with 68 | * empty directories. 69 | */ 70 | @After 71 | public void deleteTmp() throws IOException { 72 | File tmp = new File("tmp"); 73 | 74 | if (tmp.exists()) { 75 | assertThat(tmp.delete(), is(true)); 76 | } 77 | } 78 | 79 | @Test 80 | public void standAloneOutput() throws Exception { 81 | 82 | File in = tmp.newFolder("in"); 83 | 84 | createFile(in, "skipped-0", 0, 25); 85 | createFile(in, "skipped-1", 1, 25); 86 | createFile(in, "skipped-2", 2, 25); 87 | createFile(in, "skipped-3", 3, 25); 88 | 89 | File subdir = tmp.newFolder("in/subdir"); 90 | 91 | createFile(subdir, "lil-0", 0, 1); 92 | createFile(subdir, "lil-1", 1, 2); 93 | createFile(subdir, "big-2", 2, 5); 94 | createFile(subdir, "big-3", 3, 5); 95 | 96 | File subsubdir = tmp.newFolder("in/subdir/subsubdir"); 97 | 98 | createFile(subsubdir, "skipped-4", 4, 25); 99 | createFile(subsubdir, "skipped-5", 5, 25); 100 | 101 | File out = new File(tmp.getRoot(), "out"); 102 | 103 | ToolRunner.run(job, new Crush(), new String[] { 104 | subdir.getAbsolutePath(), out.getAbsolutePath() 105 | }); 106 | 107 | /* 108 | * Make sure the original files are still there. 
109 | */ 110 | verifyFile(in, "skipped-0", 0, 25); 111 | verifyFile(in, "skipped-1", 1, 25); 112 | verifyFile(in, "skipped-2", 2, 25); 113 | verifyFile(in, "skipped-3", 3, 25); 114 | 115 | verifyFile(subdir, "lil-0", 0, 1); 116 | verifyFile(subdir, "lil-1", 1, 2); 117 | verifyFile(subdir, "big-2", 2, 5); 118 | verifyFile(subdir, "big-3", 3, 5); 119 | 120 | verifyFile(subsubdir, "skipped-4", 4, 25); 121 | verifyFile(subsubdir, "skipped-5", 5, 25); 122 | 123 | /* 124 | * Verify the crush output. 125 | */ 126 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 127 | } 128 | 129 | @Test 130 | public void noFiles() throws Exception { 131 | File in = tmp.newFolder("in"); 132 | 133 | File out = new File(tmp.getRoot(), "out"); 134 | 135 | ToolRunner.run(job, new Crush(), new String[] { 136 | in.getAbsolutePath(), out.getAbsolutePath() 137 | }); 138 | 139 | assertThat(out.exists(), is(false)); 140 | } 141 | 142 | private void verifyCrushOutput(File crushOutput, int[]... keyCounts) throws IOException { 143 | 144 | List actual = new ArrayList(); 145 | 146 | Text text = new Text(); 147 | IntWritable value = new IntWritable(); 148 | 149 | Reader reader = new Reader(FileSystem.get(job), new Path(crushOutput.getAbsolutePath()), job); 150 | 151 | while (reader.next(text, value)) { 152 | actual.add(format("%s\t%d", text, value.get())); 153 | } 154 | 155 | reader.close(); 156 | 157 | int expLines = 0; 158 | List> expected = new ArrayList>(); 159 | 160 | 161 | for (int[] keyCount : keyCounts) { 162 | int key = keyCount[0]; 163 | int count = keyCount[1]; 164 | 165 | List lines = new ArrayList(); 166 | expected.add(lines); 167 | 168 | for (int i = 0, j = 0; i < count; i++, j = j == 9 ? 0 : j + 1) { 169 | String line = format("%d\t%d", key, j); 170 | lines.add(line); 171 | } 172 | 173 | expLines += count; 174 | } 175 | 176 | /* 177 | * Make sure each file's data is contiguous in the crush output file. 178 | */ 179 | for (List list : expected) { 180 | int idx = actual.indexOf(list.get(0)); 181 | 182 | assertThat(idx, greaterThanOrEqualTo(0)); 183 | 184 | assertThat(actual.subList(idx, idx + list.size()), equalTo(list)); 185 | } 186 | 187 | assertThat(actual.size(), equalTo(expLines)); 188 | } 189 | 190 | private void createFile(File dir, String fileName, int key, int count) throws IOException { 191 | File file = new File(dir, fileName); 192 | 193 | Writer writer = SequenceFile.createWriter(FileSystem.get(job), job, new Path(file.getAbsolutePath()), Text.class, IntWritable.class); 194 | 195 | Text text = new Text(Integer.toString(key)); 196 | IntWritable value = new IntWritable(); 197 | 198 | for (int i = 0, j = 0; i < count; i++, j = j == 9 ? 
0 : j + 1) { 199 | value.set(j); 200 | 201 | writer.append(text, value); 202 | } 203 | 204 | writer.close(); 205 | } 206 | 207 | private void verifyFile(File dir, String fileName, int key, int count) throws IOException { 208 | File file = new File(dir, fileName); 209 | 210 | Reader reader = new Reader(FileSystem.get(job), new Path(file.getAbsolutePath()), job); 211 | 212 | int i = 0; 213 | int actual = 0; 214 | 215 | Text text = new Text(); 216 | IntWritable value = new IntWritable(); 217 | 218 | while (reader.next(text, value)) { 219 | assertThat(text.toString(), equalTo(Integer.toString(key))); 220 | assertThat(value.get(), equalTo(i)); 221 | 222 | if (i == 9) { 223 | i = 0; 224 | } else { 225 | i++; 226 | } 227 | 228 | actual++; 229 | } 230 | 231 | reader.close(); 232 | 233 | assertThat(actual, equalTo(count)); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushStandAloneTextTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.hamcrest.Matchers.nullValue; 23 | import static org.junit.Assert.assertThat; 24 | 25 | import java.io.BufferedReader; 26 | import java.io.File; 27 | import java.io.FileReader; 28 | import java.io.IOException; 29 | import java.io.PrintWriter; 30 | import java.util.ArrayList; 31 | import java.util.List; 32 | 33 | import org.apache.hadoop.mapred.JobConf; 34 | import org.apache.hadoop.util.ToolRunner; 35 | import org.junit.After; 36 | import org.junit.Before; 37 | import org.junit.Rule; 38 | import org.junit.Test; 39 | import org.junit.rules.TemporaryFolder; 40 | 41 | import com.m6d.filecrush.crush.Crush; 42 | 43 | /** 44 | * Dfs block size will be set to 50 and threshold set to 20%. 45 | */ 46 | @SuppressWarnings("deprecation") 47 | public class CrushStandAloneTextTest { 48 | @Rule 49 | public final TemporaryFolder tmp = new TemporaryFolder(); 50 | 51 | private JobConf job; 52 | 53 | @Before 54 | public void setup() throws Exception { 55 | job = new JobConf(false); 56 | 57 | job.set("fs.default.name", "file:///"); 58 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 59 | job.setLong("dfs.block.size", 50); 60 | } 61 | 62 | /** 63 | * Crush creates a subdirectory in tmp to store all its transient data. Since this test uses the local file system, the present 64 | * working directory is the parent of tmp. We delete it here since it's not so useful to clutter the build directory with 65 | * empty directories. 
66 | */ 67 | @After 68 | public void deleteTmp() throws IOException { 69 | File tmp = new File("tmp"); 70 | 71 | if (tmp.exists()) { 72 | assertThat(tmp.delete(), is(true)); 73 | } 74 | } 75 | 76 | @Test 77 | public void standAloneOutput() throws Exception { 78 | 79 | File in = tmp.newFolder("in"); 80 | 81 | createFile(in, "skipped-0", 0, 25); 82 | createFile(in, "skipped-1", 1, 25); 83 | createFile(in, "skipped-2", 2, 25); 84 | createFile(in, "skipped-3", 3, 25); 85 | 86 | File subdir = tmp.newFolder("in/subdir"); 87 | 88 | createFile(subdir, "lil-0", 0, 1); 89 | createFile(subdir, "lil-1", 1, 2); 90 | createFile(subdir, "big-2", 2, 5); 91 | createFile(subdir, "big-3", 3, 5); 92 | 93 | File subsubdir = tmp.newFolder("in/subdir/subsubdir"); 94 | 95 | createFile(subsubdir, "skipped-4", 4, 25); 96 | createFile(subsubdir, "skipped-5", 5, 25); 97 | 98 | File out = new File(tmp.getRoot(), "out"); 99 | 100 | ToolRunner.run(job, new Crush(), new String[] { 101 | "--input-format=text", 102 | "--output-format=text", 103 | "--compress=none", 104 | 105 | subdir.getAbsolutePath(), out.getAbsolutePath() 106 | }); 107 | 108 | /* 109 | * Make sure the original files are still there. 110 | */ 111 | verifyFile(in, "skipped-0", 0, 25); 112 | verifyFile(in, "skipped-1", 1, 25); 113 | verifyFile(in, "skipped-2", 2, 25); 114 | verifyFile(in, "skipped-3", 3, 25); 115 | 116 | verifyFile(subdir, "lil-0", 0, 1); 117 | verifyFile(subdir, "lil-1", 1, 2); 118 | verifyFile(subdir, "big-2", 2, 5); 119 | verifyFile(subdir, "big-3", 3, 5); 120 | 121 | verifyFile(subsubdir, "skipped-4", 4, 25); 122 | verifyFile(subsubdir, "skipped-5", 5, 25); 123 | 124 | /* 125 | * Verify the crush output. 126 | */ 127 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 128 | } 129 | 130 | @Test 131 | public void noFiles() throws Exception { 132 | File in = tmp.newFolder("in"); 133 | 134 | File out = new File(tmp.getRoot(), "out"); 135 | 136 | ToolRunner.run(job, new Crush(), new String[] { 137 | in.getAbsolutePath(), out.getAbsolutePath() 138 | }); 139 | 140 | assertThat(out.exists(), is(false)); 141 | } 142 | 143 | @Test 144 | public void ignoreRegexTest() throws Exception { 145 | 146 | File in = tmp.newFolder("skip_test"); 147 | 148 | createFile(in, "lil-0", 0, 1); 149 | createFile(in, "lil-1", 1, 2); 150 | createFile(in, "big-2", 2, 5); 151 | createFile(in, "big-3", 3, 5); 152 | // Files to be ignored 153 | createFile(in, "lil-0.index", 0, 10); 154 | createFile(in, "lil-1.index", 1, 20); 155 | createFile(in, "big-2.index", 2, 50); 156 | createFile(in, "big-3.index", 3, 50); 157 | 158 | File out = new File(tmp.getRoot(), "skip_test_out"); 159 | 160 | ToolRunner.run(job, new Crush(), new String[] { 161 | "--input-format=text", 162 | "--output-format=text", 163 | "--ignore-regex=.*\\.index", 164 | "--compress=none", 165 | 166 | in.getAbsolutePath(), out.getAbsolutePath() 167 | }); 168 | 169 | /* 170 | * Make sure the original files are still there. 171 | */ 172 | verifyFile(in, "lil-0", 0, 1); 173 | verifyFile(in, "lil-1", 1, 2); 174 | verifyFile(in, "big-2", 2, 5); 175 | verifyFile(in, "big-3", 3, 5); 176 | verifyFile(in, "lil-0.index", 0, 10); 177 | verifyFile(in, "lil-1.index", 1, 20); 178 | verifyFile(in, "big-2.index", 2, 50); 179 | verifyFile(in, "big-3.index", 3, 50); 180 | 181 | /* 182 | * Verify the crush output. 
183 | */ 184 | verifyCrushOutput(out, new int[] { 0, 1 }, new int[] { 1, 2}, new int[] { 2, 5 }, new int[] { 3, 5 }); 185 | } 186 | 187 | private void verifyCrushOutput(File crushOutput, int[]... keyCounts) throws IOException { 188 | 189 | List actual = new ArrayList(); 190 | BufferedReader reader = new BufferedReader(new FileReader(crushOutput)); 191 | 192 | String line; 193 | 194 | while (null != (line = reader.readLine())) { 195 | actual.add(line); 196 | } 197 | 198 | reader.close(); 199 | 200 | int expLines = 0; 201 | List> expected = new ArrayList>(); 202 | 203 | for (int[] kc : keyCounts) { 204 | int key = kc[0]; 205 | int count = kc[1]; 206 | 207 | List lines = new ArrayList(); 208 | expected.add(lines); 209 | 210 | for (int idx = 0, i = 0; idx < count; idx++, i = i == 9 ? 0 : i + 1) { 211 | line = format("%d\t%d", key, i); 212 | lines.add(line); 213 | } 214 | 215 | expLines += count; 216 | } 217 | 218 | /* 219 | * Make sure each file's data is contiguous in the crush output file. 220 | */ 221 | for (List list : expected) { 222 | int idx = actual.indexOf(list.get(0)); 223 | 224 | assertThat(idx, greaterThanOrEqualTo(0)); 225 | 226 | assertThat(actual.subList(idx, idx + list.size()), equalTo(list)); 227 | } 228 | 229 | assertThat(actual.size(), equalTo(expLines)); 230 | } 231 | 232 | private void createFile(File dir, String fileName, int key, int count) throws IOException { 233 | File file = new File(dir, fileName); 234 | 235 | PrintWriter writer = new PrintWriter(file); 236 | 237 | for (int idx = 0, i = 0; idx < count; idx++, i = i == 9 ? 0 : i + 1) { 238 | String line = format("%d\t%d\n", key, i); 239 | 240 | assertThat(line.length(), equalTo(4)); 241 | 242 | writer.write(line); 243 | } 244 | 245 | writer.close(); 246 | } 247 | 248 | private void verifyFile(File dir, String fileName, int key, int count) throws IOException { 249 | File file = new File(dir, fileName); 250 | 251 | assertThat(file.isFile(), is(true)); 252 | assertThat(file.length(), equalTo((long) count * 4)); 253 | 254 | BufferedReader reader = new BufferedReader(new FileReader(file)); 255 | 256 | String line; 257 | int i = 0; 258 | int actualCount = 0; 259 | 260 | while (null != (line = reader.readLine())) { 261 | assertThat(line.length(), equalTo(3)); 262 | 263 | actualCount++; 264 | 265 | String[] split = line.split("\t"); 266 | 267 | assertThat(line, split[0], equalTo(Integer.toString(key))); 268 | assertThat(line, split[1], equalTo(Integer.toString(i))); 269 | 270 | if (i == 9) { 271 | i = 0; 272 | } else { 273 | i++; 274 | } 275 | } 276 | 277 | assertThat(reader.readLine(), nullValue()); 278 | 279 | reader.close(); 280 | 281 | assertThat(actualCount, equalTo(count)); 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/Bucketer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
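Bucketer, whose source follows, is the planning-phase helper that groups a directory's small files into roughly block-sized buckets. A sketch of driving it in the reset/add/createBuckets order its javadoc describes, assuming a FileSystem named fs and illustrative directory, block size and bucket cap:

    // Sketch: bucketing one directory's files; each returned bucket holds at least two files
    // because the third constructor argument excludes single-item buckets.
    Bucketer bucketer = new Bucketer(20, 128L * 1024 * 1024, true);
    bucketer.reset("in/subdir");
    for (FileStatus status : fs.listStatus(new Path("in/subdir"))) {
        bucketer.add(new FileStatusHasSize(status));
    }
    List<?> buckets = bucketer.createBuckets();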
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.util.Collections.unmodifiableList; 20 | 21 | import java.util.Collections; 22 | import java.util.Comparator; 23 | import java.util.Iterator; 24 | import java.util.LinkedList; 25 | import java.util.List; 26 | import java.util.ListIterator; 27 | 28 | import org.apache.hadoop.fs.FileStatus; 29 | 30 | /** 31 | *

32 | * Arranges files into buckets. Callers must interact with this class in the following order:
33 | *
34 | * <ol>
35 | *   <li>Invoke {@link #reset(String)}.</li>
36 | *   <li>Invoke {@link #add(HasSize)} zero or more times.</li>
37 | *   <li>Invoke {@link #createBuckets()}.</li>
38 | *   <li>Go to 1 or throw away the instance.</li>
39 | * </ol>
40 | *
41 | * The bucketing algorithm is:
42 | *
43 | * <ol>
44 | *   <li>Calculate the number of buckets as floor(total bytes / bucket size). Add one if there is a remainder.</li>
45 | *   <li>Sort the files in order of descending size.</li>
46 | *   <li>Add each file to the bucket that currently has the smallest total size.</li>
47 | *   <li>Remove any buckets that contain only one file, if the bucketer is configured to exclude single-item buckets.</li>
48 | * </ol>
49 | *
50 | *
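 * As a rough worked example (the sizes are taken from one of the cases in BucketerParameterizedTest):
 * with a bucket size of 50 and files of 18, 20, 37, 19, 17 and 10 bytes (121 bytes in total),
 * floor(121 / 50) = 2 plus one for the remainder gives 3 buckets. After the descending sort
 * (37, 20, 19, 18, 17, 10), adding each file to the currently smallest bucket yields buckets of
 * {37, 10}, {20, 17} and {19, 18} bytes.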
51 | */ 52 | class Bucketer { 53 | /** 54 | * The maximum number of buckets to create. 55 | */ 56 | private final int maxBuckets; 57 | 58 | /** 59 | * The size of the files to create. Used in the bucketing algorithm. 60 | */ 61 | private final long bucketSize; 62 | 63 | /** 64 | * The items to consider for bucketing. 65 | */ 66 | private final List items = new LinkedList(); 67 | 68 | /** 69 | * The total number of bytes represented by the files in {@link #items}. 70 | */ 71 | private long size; 72 | 73 | /** 74 | * The directory being bucketed. 75 | */ 76 | private String dir; 77 | 78 | /** 79 | * Do not return buckets containing a single item from {@link #createBuckets()}. 80 | */ 81 | private final boolean excludeSingleItemBuckets; 82 | 83 | public Bucketer(int numBuckets, boolean excludeSingleItemBuckets) { 84 | this(numBuckets, 0, excludeSingleItemBuckets); 85 | } 86 | 87 | public Bucketer(int maxBuckets, long bucketSize, boolean excludeSingleItemBuckets) { 88 | super(); 89 | 90 | if (1 > maxBuckets) { 91 | throw new IllegalArgumentException("Must have at least one bucket: " + maxBuckets); 92 | } 93 | 94 | this.maxBuckets = maxBuckets; 95 | 96 | if (0 > bucketSize) { 97 | throw new IllegalArgumentException("Bucket size must be zero or positive: " + bucketSize); 98 | } 99 | 100 | this.bucketSize = bucketSize; 101 | this.excludeSingleItemBuckets = excludeSingleItemBuckets; 102 | } 103 | 104 | /** 105 | * Returns map from bucket to files that are in that bucket. Buckets are guaranteed to contain more than one file and will be 106 | * approximately the same size in bytes (summing the sizes of all the files in that bucket). After this method returns, 107 | * {@link #reset(String)} must be called before this instance can be called again. 108 | */ 109 | public List createBuckets() { 110 | if (null == dir) { 111 | throw new IllegalStateException("No directory set"); 112 | } 113 | 114 | /* 115 | * Sort the files in order of descending size. 116 | */ 117 | Collections.sort(items, DESCENDING_SIZE); 118 | 119 | LinkedList buckets = new LinkedList(); 120 | 121 | for (long remaining = size; remaining > 0 && buckets.size() < maxBuckets; remaining -= bucketSize) { 122 | buckets.add(new Bucket(format("%s-%d", dir, buckets.size()))); 123 | } 124 | 125 | int numBuckets = buckets.size(); 126 | 127 | if (1 == numBuckets) { 128 | Bucket bucket = buckets.getFirst(); 129 | 130 | for (HasSize file : items) { 131 | bucket.add(file); 132 | } 133 | } else { 134 | /* 135 | * Add the files to the smallest bucket. 136 | */ 137 | for (HasSize item : items) { 138 | ListIterator iterator = buckets.listIterator(); 139 | 140 | Bucket bucket = iterator.next(); 141 | bucket.add(item); 142 | 143 | iterator.remove(); 144 | 145 | /* 146 | * Reposition the bucket in the list to preserve order by ascending bucket size. 147 | */ 148 | while (buckets.size() < numBuckets && iterator.hasNext()) { 149 | Bucket other = iterator.next(); 150 | 151 | if (other.bytes > bucket.bytes) { 152 | iterator.previous(); 153 | iterator.add(bucket); 154 | } 155 | } 156 | 157 | if (buckets.size() < numBuckets) { 158 | /* 159 | * This bucket is now the biggest one. 160 | */ 161 | buckets.add(bucket); 162 | } 163 | } 164 | } 165 | 166 | if (excludeSingleItemBuckets) { 167 | for (Iterator iter = buckets.iterator(); iter.hasNext(); ) { 168 | Bucket bucket = iter.next(); 169 | 170 | if (bucket.contents.size() < 2) { 171 | iter.remove(); 172 | } 173 | } 174 | } 175 | 176 | /* 177 | * Empty the state for the next invocation of reset. 
178 | */ 179 | dir = null; 180 | items.clear(); 181 | size = 0; 182 | 183 | return buckets; 184 | } 185 | 186 | /** 187 | * Add an item for consideration. If the item has zero size, then it is ignored. 188 | */ 189 | public void add(HasSize item) { 190 | if (null == dir) { 191 | throw new IllegalStateException("No directory set"); 192 | } 193 | 194 | long itemSize = item.size(); 195 | 196 | if (0 != itemSize) { 197 | items.add(item); 198 | size += itemSize; 199 | } 200 | } 201 | 202 | /** 203 | * Returns the count of items being considered. 204 | */ 205 | int count() { 206 | return items.size(); 207 | } 208 | 209 | /** 210 | * Returns the total size of all the items being considered. 211 | */ 212 | long size() { 213 | return size; 214 | } 215 | 216 | /** 217 | * Resets the instance for the directory. The given name is used to name the buckets. 218 | * 219 | * @param dir 220 | * Directory name. Must not be null or empty. 221 | */ 222 | public void reset(String dir) { 223 | if (dir.equals("")) { 224 | throw new IllegalArgumentException("Directory is empty"); 225 | } 226 | 227 | this.dir = dir; 228 | 229 | items.clear(); 230 | size = 0; 231 | } 232 | 233 | String dir() { 234 | return dir; 235 | } 236 | 237 | public static class Bucket implements HasSize { 238 | 239 | private final List contents; 240 | 241 | private final String name; 242 | 243 | private long bytes; 244 | 245 | public Bucket(String name) { 246 | super(); 247 | 248 | this.name = name; 249 | this.contents = new LinkedList(); 250 | } 251 | 252 | public Bucket(String name, List contents, long bytes) { 253 | super(); 254 | 255 | this.contents = contents; 256 | this.name = name; 257 | this.bytes = bytes; 258 | } 259 | 260 | private void add(HasSize hasSize) { 261 | contents.add(hasSize.id()); 262 | bytes += hasSize.size(); 263 | } 264 | 265 | public List contents() { 266 | return unmodifiableList(contents); 267 | } 268 | 269 | public String name() { 270 | return name; 271 | } 272 | 273 | public long bytes() { 274 | return bytes; 275 | } 276 | 277 | @Override 278 | public String id() { 279 | return name(); 280 | } 281 | 282 | @Override 283 | public long size() { 284 | return bytes(); 285 | } 286 | 287 | @Override 288 | public String toString() { 289 | return format("%s[%s, %d, %s]", getClass().getSimpleName(), name, bytes, contents); 290 | } 291 | 292 | @Override 293 | public boolean equals(Object obj) { 294 | if (!(obj instanceof Bucket)) { 295 | return false; 296 | } 297 | 298 | Bucket other = (Bucket) obj; 299 | 300 | return name.equals(other.name) && bytes == other.bytes && contents.equals(other.contents); 301 | } 302 | 303 | @Override 304 | public int hashCode() { 305 | return name.hashCode(); 306 | } 307 | } 308 | 309 | private static final Comparator DESCENDING_SIZE = new Comparator() { 310 | @Override 311 | public int compare(HasSize o1, HasSize o2) { 312 | long l1 = o1.size(); 313 | long l2 = o2.size(); 314 | 315 | if (l1 < l2) { 316 | return 1; 317 | } 318 | 319 | if (l1 > l2) { 320 | return -1; 321 | } 322 | 323 | return 0; 324 | } 325 | }; 326 | 327 | interface HasSize { 328 | String id(); 329 | 330 | long size(); 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushPartitionerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the 
License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static org.hamcrest.Matchers.equalTo; 19 | import static org.junit.Assert.assertThat; 20 | import static org.junit.Assert.fail; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.io.IntWritable; 27 | import org.apache.hadoop.io.SequenceFile; 28 | import org.apache.hadoop.io.SequenceFile.Writer; 29 | import org.apache.hadoop.io.Text; 30 | import org.apache.hadoop.mapred.JobConf; 31 | import org.junit.Before; 32 | import org.junit.Rule; 33 | import org.junit.Test; 34 | import org.junit.rules.TemporaryFolder; 35 | 36 | import com.m6d.filecrush.crush.CrushPartitioner; 37 | 38 | @SuppressWarnings("deprecation") 39 | public class CrushPartitionerTest { 40 | @Rule 41 | public final TemporaryFolder tmp = new TemporaryFolder(); 42 | 43 | private JobConf job; 44 | 45 | private FileSystem fs; 46 | 47 | private Path partitionMap; 48 | 49 | private CrushPartitioner partitioner; 50 | 51 | @Before 52 | public void setupPartitionMap() throws IOException { 53 | job = new JobConf(false); 54 | 55 | job.set("fs.default.name", "file:///"); 56 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 57 | job.set("crush.partition.map", tmp.getRoot().getAbsolutePath() + "/partition-map"); 58 | 59 | fs = FileSystem.get(job); 60 | 61 | partitionMap = new Path(tmp.getRoot().getAbsolutePath(), "partition-map"); 62 | 63 | partitioner = new CrushPartitioner(); 64 | } 65 | 66 | @Test 67 | public void partition() throws IOException { 68 | 69 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 70 | 71 | Text key = new Text(); 72 | IntWritable partNum = new IntWritable(); 73 | 74 | key.set("bucket-1"); 75 | partNum.set(0); 76 | writer.append(key, partNum); 77 | 78 | key.set("bucket-2"); 79 | partNum.set(0); 80 | writer.append(key, partNum); 81 | 82 | key.set("bucket-3"); 83 | partNum.set(1); 84 | writer.append(key, partNum); 85 | 86 | key.set("bucket-4"); 87 | partNum.set(2); 88 | writer.append(key, partNum); 89 | 90 | key.set("bucket-5"); 91 | partNum.set(2); 92 | writer.append(key, partNum); 93 | 94 | key.set("bucket-6"); 95 | partNum.set(2); 96 | writer.append(key, partNum); 97 | 98 | writer.close(); 99 | 100 | job.setNumReduceTasks(3); 101 | 102 | 103 | partitioner.configure(job); 104 | 105 | 106 | Text fileName = new Text(); 107 | 108 | key.set("bucket-1"); 109 | 110 | for (int file = 0; file < 4; file++) { 111 | fileName.set("file" + file); 112 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 113 | } 114 | 115 | 116 | key.set("bucket-2"); 117 | 118 | for (int file = 0; file < 4; file++) { 119 | fileName.set("file" + file); 120 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 121 | } 122 | 123 | 124 | key.set("bucket-3"); 125 | 126 | for (int file = 0; file < 4; file++) { 127 | fileName.set("file" + file); 128 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1)); 129 | } 130 | 131 | 132 | key.set("bucket-4"); 
133 | 134 | for (int file = 0; file < 4; file++) { 135 | fileName.set("file" + file); 136 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 137 | } 138 | 139 | 140 | key.set("bucket-5"); 141 | 142 | for (int file = 0; file < 4; file++) { 143 | fileName.set("file" + file); 144 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 145 | } 146 | 147 | 148 | key.set("bucket-6"); 149 | 150 | for (int file = 0; file < 4; file++) { 151 | fileName.set("file" + file); 152 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 153 | } 154 | } 155 | 156 | 157 | @Test 158 | public void partitionWithFewerPartitionsThanReduceTasks() throws IOException { 159 | 160 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 161 | 162 | Text key = new Text(); 163 | IntWritable partNum = new IntWritable(); 164 | 165 | key.set("bucket-1"); 166 | partNum.set(0); 167 | writer.append(key, partNum); 168 | 169 | key.set("bucket-2"); 170 | partNum.set(0); 171 | writer.append(key, partNum); 172 | 173 | key.set("bucket-3"); 174 | partNum.set(1); 175 | writer.append(key, partNum); 176 | 177 | key.set("bucket-4"); 178 | partNum.set(2); 179 | writer.append(key, partNum); 180 | 181 | key.set("bucket-5"); 182 | partNum.set(2); 183 | writer.append(key, partNum); 184 | 185 | key.set("bucket-6"); 186 | partNum.set(2); 187 | writer.append(key, partNum); 188 | 189 | writer.close(); 190 | 191 | job.setNumReduceTasks(40); 192 | 193 | 194 | partitioner.configure(job); 195 | 196 | 197 | Text fileName = new Text(); 198 | 199 | key.set("bucket-1"); 200 | 201 | for (int file = 0; file < 4; file++) { 202 | fileName.set("file" + file); 203 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 204 | } 205 | 206 | 207 | key.set("bucket-2"); 208 | 209 | for (int file = 0; file < 4; file++) { 210 | fileName.set("file" + file); 211 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(0)); 212 | } 213 | 214 | 215 | key.set("bucket-3"); 216 | 217 | for (int file = 0; file < 4; file++) { 218 | fileName.set("file" + file); 219 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(1)); 220 | } 221 | 222 | 223 | key.set("bucket-4"); 224 | 225 | for (int file = 0; file < 4; file++) { 226 | fileName.set("file" + file); 227 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 228 | } 229 | 230 | 231 | key.set("bucket-5"); 232 | 233 | for (int file = 0; file < 4; file++) { 234 | fileName.set("file" + file); 235 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 236 | } 237 | 238 | 239 | key.set("bucket-6"); 240 | 241 | for (int file = 0; file < 4; file++) { 242 | fileName.set("file" + file); 243 | assertThat(partitioner.getPartition(key, fileName, 3), equalTo(2)); 244 | } 245 | } 246 | 247 | @Test 248 | public void noDupes() throws IOException { 249 | 250 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 251 | 252 | Text key = new Text(); 253 | IntWritable value = new IntWritable(); 254 | 255 | key.set("bucket-1"); 256 | value.set(0); 257 | writer.append(key, value); 258 | 259 | key.set("bucket-2"); 260 | value.set(0); 261 | writer.append(key, value); 262 | 263 | key.set("bucket-2"); 264 | value.set(1); 265 | writer.append(key, value); 266 | 267 | writer.close(); 268 | 269 | job.setNumReduceTasks(3); 270 | 271 | try { 272 | partitioner.configure(job); 273 | fail(); 274 | } catch (IllegalArgumentException e) { 275 | if 
(!e.getMessage().contains("bucket-2")) { 276 | throw e; 277 | } 278 | } 279 | } 280 | 281 | @Test 282 | public void partitionTooLow() throws IOException { 283 | 284 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 285 | 286 | Text key = new Text(); 287 | IntWritable partNum = new IntWritable(); 288 | 289 | key.set("bucket-1"); 290 | partNum.set(0); 291 | writer.append(key, partNum); 292 | 293 | key.set("bucket-2"); 294 | partNum.set(0); 295 | writer.append(key, partNum); 296 | 297 | key.set("bucket-4"); 298 | partNum.set(2); 299 | writer.append(key, partNum); 300 | 301 | key.set("bucket-5"); 302 | partNum.set(2); 303 | writer.append(key, partNum); 304 | 305 | key.set("bucket-6"); 306 | partNum.set(-1); 307 | writer.append(key, partNum); 308 | 309 | writer.close(); 310 | 311 | 312 | job.setNumReduceTasks(3); 313 | 314 | try { 315 | partitioner.configure(job); 316 | fail("No such thing as a negitave partition"); 317 | } catch (IllegalArgumentException e) { 318 | if (!e.getMessage().contains("Partition -1")) { 319 | throw e; 320 | } 321 | } 322 | } 323 | 324 | @Test 325 | public void partitionTooHigh() throws IOException { 326 | 327 | Writer writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); 328 | 329 | Text key = new Text(); 330 | IntWritable partNum = new IntWritable(); 331 | 332 | key.set("bucket-1"); 333 | partNum.set(0); 334 | writer.append(key, partNum); 335 | 336 | key.set("bucket-2"); 337 | partNum.set(0); 338 | writer.append(key, partNum); 339 | 340 | key.set("bucket-4"); 341 | partNum.set(2); 342 | writer.append(key, partNum); 343 | 344 | key.set("bucket-5"); 345 | partNum.set(2); 346 | writer.append(key, partNum); 347 | 348 | key.set("bucket-6"); 349 | partNum.set(3); 350 | writer.append(key, partNum); 351 | 352 | writer.close(); 353 | 354 | 355 | job.setNumReduceTasks(3); 356 | 357 | try { 358 | partitioner.configure(job); 359 | fail("Parition with id 3 is not allowed with 3 reduce tasks"); 360 | } catch (IllegalArgumentException e) { 361 | if (!e.getMessage().contains("Partition 3")) { 362 | throw e; 363 | } 364 | } 365 | } 366 | } 367 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/BucketerParameterizedTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.System.currentTimeMillis; 19 | import static java.util.Arrays.asList; 20 | import static java.util.Collections.emptyList; 21 | import static org.hamcrest.Matchers.equalTo; 22 | import static org.hamcrest.Matchers.nullValue; 23 | import static org.junit.Assert.assertThat; 24 | 25 | import java.util.ArrayList; 26 | import java.util.Collection; 27 | import java.util.Collections; 28 | import java.util.Comparator; 29 | import java.util.List; 30 | 31 | import org.apache.hadoop.fs.FileStatus; 32 | import org.apache.hadoop.fs.Path; 33 | import org.junit.Test; 34 | import org.junit.runner.RunWith; 35 | import org.junit.runners.Parameterized; 36 | import org.junit.runners.Parameterized.Parameters; 37 | 38 | import com.m6d.filecrush.crush.Bucketer; 39 | import com.m6d.filecrush.crush.FileStatusHasSize; 40 | import com.m6d.filecrush.crush.Bucketer.Bucket; 41 | 42 | 43 | /** 44 | * Block size 50 and threshold 75%. 45 | */ 46 | @RunWith(Parameterized.class) 47 | public class BucketerParameterizedTest { 48 | @Parameters 49 | public static Collection testCases() { 50 | List testCases = new ArrayList(); 51 | 52 | String dir; 53 | List input; 54 | List expected; 55 | 56 | /* 57 | * Three buckets of two each. 58 | * 59 | * 0 1 2 60 | * file3 37 file 2 20 file 4 19 61 | * file6 10 file 5 17 file 1 18 62 | */ 63 | dir = "three buckets of two each"; 64 | 65 | input = asList( statusFor("file1", 18), 66 | statusFor("file2", 20), 67 | statusFor("file3", 37), 68 | statusFor("file4", 19), 69 | statusFor("file5", 17), 70 | statusFor("file6", 10)); 71 | 72 | expected = asList(new Bucket("three buckets of two each-0", asList("file3", "file6"), 47), 73 | new Bucket("three buckets of two each-1", asList("file2", "file5"), 37), 74 | new Bucket("three buckets of two each-2", asList("file4", "file1"), 37)); 75 | 76 | testCases.add(new Object[] { dir, true, input, expected }); 77 | 78 | 79 | /* 80 | * Not enough data to fill all the buckets. Data should be packed into as few buckets as possible. 81 | */ 82 | dir = "not/enough/data/for/max/buckets"; 83 | 84 | input = asList( statusFor("file1", 1), 85 | statusFor("file2", 2), 86 | statusFor("file3", 3), 87 | statusFor("file4", 4), 88 | statusFor("file5", 5), 89 | statusFor("file6", 6)); 90 | 91 | expected = asList(new Bucket("not/enough/data/for/max/buckets-0", asList("file6", "file5", "file4", "file3", "file2", "file1"), 21)); 92 | 93 | testCases.add(new Object[] { dir, true, input, expected }); 94 | 95 | /* 96 | * A directory with one file should be ignored. 97 | */ 98 | dir = "dir/with/one/file"; 99 | 100 | input = asList(statusFor("loner", 1)); 101 | 102 | expected = emptyList(); 103 | 104 | testCases.add(new Object[] { dir, true, input, expected }); 105 | 106 | 107 | /* 108 | * Test case with enough data to fill up all the buckets but no one bucket is more than twice the bucket size. 
109 | * 110 | * 0 1 2 3 4 111 | * file 9 35 file 1 30 file 3 30 file 5 30 file 7 30 112 | * file 6 20 file 8 25 file 0 20 file 2 20 file 4 20 113 | * file 11 20 file 10 10 114 | */ 115 | dir = "enough/data/for/max/buckets"; 116 | 117 | input = asList( statusFor("file0", 20), 118 | statusFor("file1", 30), 119 | statusFor("file2", 20), 120 | statusFor("file3", 30), 121 | statusFor("file4", 20), 122 | statusFor("file5", 30), 123 | statusFor("file6", 20), 124 | statusFor("file7", 30), 125 | statusFor("file8", 25), 126 | statusFor("file9", 35), 127 | statusFor("file10", 10), 128 | statusFor("file11", 20)); 129 | 130 | expected = asList( 131 | new Bucket("enough/data/for/max/buckets-0", asList("file9", "file6"), 55), 132 | new Bucket("enough/data/for/max/buckets-1", asList("file1", "file8"), 55), 133 | new Bucket("enough/data/for/max/buckets-2", asList("file3", "file0", "file11"), 70), 134 | new Bucket("enough/data/for/max/buckets-3", asList("file5", "file2", "file10"), 60), 135 | new Bucket("enough/data/for/max/buckets-4", asList("file7", "file4"), 50)); 136 | 137 | testCases.add(new Object[] { dir, true, input, expected }); 138 | 139 | 140 | /* 141 | * Test case with enough data to fill up all the buckets with some of the buckets more than twice the bucket size. 142 | * 143 | * 1 2 3 4 5 144 | * file 0 35 file 2 35 file 4 35 file 6 35 file 8 35 145 | * file 10 35 file 12 35 file 14 35 file 1 30 file 3 30 146 | * file 9 30 file 11 30 file 13 30 file 5 30 file 7 30 147 | * file 15 30 file 16 20 148 | */ 149 | dir = "enough/data/for/max/buckets/and/big/buckets"; 150 | 151 | input = asList( statusFor("file0", 35), 152 | statusFor("file1", 30), 153 | statusFor("file2", 35), 154 | statusFor("file3", 30), 155 | statusFor("file4", 35), 156 | statusFor("file5", 30), 157 | statusFor("file6", 35), 158 | statusFor("file7", 30), 159 | statusFor("file8", 35), 160 | statusFor("file9", 30), 161 | statusFor("file10", 35), 162 | statusFor("file11", 30), 163 | statusFor("file12", 35), 164 | statusFor("file13", 30), 165 | statusFor("file14", 35), 166 | statusFor("file15", 30), 167 | statusFor("file16", 20)); 168 | 169 | expected = asList( 170 | new Bucket("enough/data/for/max/buckets/and/big/buckets-0", asList("file0", "file10", "file9"), 100), 171 | new Bucket("enough/data/for/max/buckets/and/big/buckets-1", asList("file2", "file12", "file11"), 100), 172 | new Bucket("enough/data/for/max/buckets/and/big/buckets-2", asList("file4", "file14", "file13"), 100), 173 | new Bucket("enough/data/for/max/buckets/and/big/buckets-3", asList("file6", "file1", "file5", "file15"), 125), 174 | new Bucket("enough/data/for/max/buckets/and/big/buckets-4", asList("file8", "file3", "file7", "file16"), 115)); 175 | 176 | testCases.add(new Object[] { dir, true, input, expected }); 177 | 178 | 179 | /* 180 | * Exactly enough data for five buckets of 50. 
181 | */ 182 | dir = "exactly/enough/data/for/max/buckets"; 183 | 184 | input = asList( statusFor("file0", 20), 185 | statusFor("file1", 30), 186 | statusFor("file2", 20), 187 | statusFor("file3", 30), 188 | statusFor("file4", 20), 189 | statusFor("file5", 30), 190 | statusFor("file6", 20), 191 | statusFor("file7", 30), 192 | statusFor("file8", 20), 193 | statusFor("file9", 30)); 194 | 195 | expected = asList( 196 | new Bucket("exactly/enough/data/for/max/buckets-0", asList("file1", "file0"), 50), 197 | new Bucket("exactly/enough/data/for/max/buckets-1", asList("file3", "file2"), 50), 198 | new Bucket("exactly/enough/data/for/max/buckets-2", asList("file5", "file4"), 50), 199 | new Bucket("exactly/enough/data/for/max/buckets-3", asList("file7", "file6"), 50), 200 | new Bucket("exactly/enough/data/for/max/buckets-4", asList("file9", "file8"), 50)); 201 | 202 | testCases.add(new Object[] { dir, true, input, expected }); 203 | 204 | 205 | /* 206 | * Exactly enough data for four buckets of 50. 207 | */ 208 | dir = "exactly/enough/data/for/four/buckets"; 209 | 210 | input = asList( statusFor("file0", 20), 211 | statusFor("file1", 30), 212 | statusFor("file2", 20), 213 | statusFor("file3", 30), 214 | statusFor("file4", 20), 215 | statusFor("file5", 30), 216 | statusFor("file6", 20), 217 | statusFor("file7", 30)); 218 | 219 | expected = asList( 220 | new Bucket("exactly/enough/data/for/four/buckets-0", asList("file1", "file0"), 50), 221 | new Bucket("exactly/enough/data/for/four/buckets-1", asList("file3", "file2"), 50), 222 | new Bucket("exactly/enough/data/for/four/buckets-2", asList("file5", "file4"), 50), 223 | new Bucket("exactly/enough/data/for/four/buckets-3", asList("file7", "file6"), 50)); 224 | 225 | testCases.add(new Object[] { dir, true, input, expected }); 226 | 227 | 228 | /* 229 | * Buckets that end up with one file are ignored. 230 | * 231 | * 0 1 232 | * file 3 35 file 2 30 233 | * file 1 25 234 | * 235 | * What would have been bucket 0 is dropped since it has only one file in it. 236 | */ 237 | dir = "buckets/with/one/file/are/ignored"; 238 | 239 | input = asList( statusFor("file1", 25), 240 | statusFor("file2", 30), 241 | statusFor("file3", 35)); 242 | 243 | expected = asList(new Bucket("buckets/with/one/file/are/ignored-1", asList("file2", "file1"), 55)); 244 | 245 | testCases.add(new Object[] { dir, true, input, expected }); 246 | 247 | 248 | /* 249 | * Set the flag so that single item buckets are returned. 250 | * 251 | * 0 1 252 | * file 3 35 file 2 30 253 | * file 1 25 254 | * 255 | * What would have been bucket 0 is dropped since it has only one file in it. 
256 | */ 257 | dir = "include/buckets/with/one/file"; 258 | 259 | input = asList( statusFor("file1", 25), 260 | statusFor("file2", 30), 261 | statusFor("file3", 35)); 262 | 263 | expected = asList( 264 | new Bucket("include/buckets/with/one/file-0", asList("file3"), 35), 265 | new Bucket("include/buckets/with/one/file-1", asList("file2", "file1"), 55)); 266 | 267 | testCases.add(new Object[] { dir, false, input, expected }); 268 | 269 | return testCases; 270 | } 271 | 272 | private final Bucketer bucketer; 273 | 274 | private final String dir; 275 | 276 | private final List input; 277 | 278 | private final List expected; 279 | 280 | public BucketerParameterizedTest(String dir, boolean excludeSingleItemBuckets, List input, List expected) { 281 | super(); 282 | 283 | this.dir = dir; 284 | this.input = input; 285 | this.expected = expected; 286 | 287 | this.bucketer = new Bucketer(5, 50, excludeSingleItemBuckets); 288 | } 289 | 290 | @Test 291 | public void test() { 292 | bucketer.reset(dir); 293 | 294 | for (int i = 0; i < input.size(); i++) { 295 | FileStatus file = input.get(i); 296 | 297 | bucketer.add(new FileStatusHasSize(file)); 298 | 299 | assertThat(dir, bucketer.count(), equalTo(i + 1)); 300 | } 301 | 302 | List actual = bucketer.createBuckets(); 303 | 304 | Collections.sort(expected, BUCKET_CMP); 305 | Collections.sort(actual, BUCKET_CMP); 306 | 307 | assertThat(dir, actual, equalTo(expected)); 308 | 309 | assertThat(dir, bucketer.count(), equalTo(0)); 310 | assertThat(dir, bucketer.dir(), nullValue()); 311 | assertThat(dir, bucketer.size(), equalTo(0L)); 312 | } 313 | 314 | private static FileStatus statusFor(String path, long size) { 315 | return new FileStatus(size, false, 3, 1024, currentTimeMillis(), new Path(path)); 316 | } 317 | 318 | private static final Comparator BUCKET_CMP = new Comparator() { 319 | @Override 320 | public int compare(Bucket o1, Bucket o2) { 321 | return o1.name().compareTo(o2.name()); 322 | } 323 | }; 324 | } 325 | -------------------------------------------------------------------------------- /src/test/resources/help.txt: -------------------------------------------------------------------------------- 1 | Crush 2 | 3 | NAME 4 | 5 | Crush - Crush small files in dfs to fewer, larger files 6 | 7 | SYNOPSIS 8 | Crush [OPTION]... 9 | 10 | DESCRIPTION 11 | 12 | Crush consumes directories containing many small files with the same key and value types and creates fewer, larger files containing the same data. Crush is gives you the control to: 13 | 14 | * Name the output files 15 | * Ignore files that are "big enough" 16 | * Limit the size of each output file 17 | * Control the output compression codec 18 | * Swap smaller files with generated large files in-place 19 | * No long-running task problem 20 | 21 | See the EXAMPLES section 22 | 23 | ARGUMENTS 24 | 25 | input dir 26 | The root of the directory tree to crush. Directories are found recursively. 27 | 28 | output dir 29 | In non-clone mode, the directory where the output files should be written. In clone mode, the directory where the original files (that were combine into larger files) should be moved. 30 | 31 | timestamp 32 | A 14 digit job timestamp used to uniquely name files. E.g. 20100221175612. Generate in a script with: date +%Y%m%d%H%M%S 33 | 34 | GLOBAL OPTIONS 35 | 36 | -?, --help 37 | Print this help message. 38 | 39 | --threshold 40 | Percent threshold relative to the dfs block size over which a file becomes eligible for crushing. Must be in the (0, 1]. 
Default is 0.75, which means files smaller than or equal to 75% of a dfs block will be eligible for crushing. File greater than 75% of a dfs block will be left untouched. 41 | 42 | --max-file-blocks 43 | The maximum number of dfs blocks per output file. Must be a positive integer. Small input files are associated with an output file under the assumption that input and output compression codecs have similar efficiency. Also, a directory containing a lot of data in many small files will be converted into a directory containing a fewer number of large files rather than one super-massive file. With the default value 8, 80 small files, each being 1/10th of a dfs block will be grouped into to a single output file since 8 * 1/10 = 8 dfs blocks. If there are 81 small files, each being 1/10th of a dfs block, two output files will be created. One output file contain the combined contents of 41 files and the second will contain the combined contents of the other 40. A directory of many small files will be converted into fewer number of larger files where each output file is roughly the same size. 44 | 45 | --compress 46 | Fully qualified class name of the compression codec to use when writing data. It is permissible to use "none" and "gzip" to indicate no compression and org.apache.hadoop.io.compress.GzipCodec, respectively. 47 | 48 | --clone 49 | Use clone mode. Useful for external Hive tables. In clone mode, the small files are replaced with the larger files. The small files are moved to a subdirectory of the output dir argument. The subdirectory is same as the original directory rooted at output dir. For example, assume the input dir argument and output dir argument are /user/example/input and /user/example/output, respectively. If a file was originally /user/example/input/my-dir/smallfile, then after the clone, the original file would be located in /user/example/output/user/example/input/my-dir/smallfile. 50 | 51 | --info 52 | Print information to the console about what the crush is doing. 53 | 54 | --verbose 55 | Print even more information to the console about what the crush is doing. 56 | 57 | DIRECTORY OPTIONS 58 | 59 | If specified, these options must be appear as a group. When specifying multiple groups of these options, order matters. Defaults for directory options are not used if any are specified. See the EXAMPLES section. 60 | 61 | --regex 62 | Regular expression that matches a directory name. Defaults to .+ if no directory options are specified at all. Empty directories are not required to have a matching regex. Conceptually similar to the first argument of String.replaceAll(). 63 | 64 | --replacement 65 | Replacement string used with corresponding regex to name output files. Defaults to crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num} if no directory options are specified at all. The placeholder ${crush.timestamp} refers to the command line argument. ${crush.task.num} refers to the reducer number. ${crush.file.num} is a zero-based count of files producer by a specific reducer. The first file written by a reducer will have ${crush.file.num} = 0, the second = 1, the third = 2, etc. Conceptually similar to the second argument of String.replaceAll(). 66 | 67 | --input-format 68 | Fully qualified class name of the input format for the data in a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextInputFormat and org.apache.hadoop.mapred.SequenceFileInputFormat, respectively. Defaults to sequence if no directory options are specified. 
69 | 70 | --output-format 71 | Fully qualified class name of the output format to use when writing the output file for a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextOutputFormat and org.apache.hadoop.mapred.SequenceFileOutputFormat, respectively. Defaults to sequence if no directory options are specified. 72 | 73 | EXAMPLES 74 | 75 | Say we have the following files: 76 | 77 | /user/example/work/input/ 78 | small-file1 79 | small-file2 80 | small-file3 81 | small-file4 82 | big-enough-file 83 | subdir/ 84 | small-file6 85 | small-file7 86 | small-file8 87 | medium-file1 88 | medium-file2 89 | 90 | And we invoke the crush like this: 91 | 92 | Crush /user/example/work/input /user/example/work/output 20100221175612 93 | 94 | Since we have not specified any of the directory options, the default regex, replacement, input-format, and output-format are used. We will get: 95 | 96 | /user/example/work/ 97 | input/ 98 | small-file1 99 | small-file2 100 | small-file3 101 | small-file4 102 | subdir/ 103 | small-file6 104 | small-file7 105 | small-file8 106 | medium-file1 107 | medium-file2 108 | output/ 109 | crushed_file-20100221175612-0-0 110 | big-enough-file 111 | subdir/ 112 | crushed_file-20100221175612-1-0 113 | crushed_file-20100221175612-1-1 114 | 115 | Where: 116 | 117 | crushed_file-20100221175612-0-0 = small-file1 + small-file2 + small-file3 + small-file4 118 | 119 | crushed_file-20100221175612-1-0 = medium-file1 + small-file6 + small-file8 120 | 121 | crushed_file-20100221175612-1-1 = medium-file2 + small-file7 122 | 123 | Notice how big-enough-file was moved to the output directory. The input directory contains only the files that were combined into the larger files. 124 | 125 | By default, the output file names end with two numbers. The first number is the task number of the reducer that wrote the file. The second number is the zero-based file count of that specific reducer. So a file ending with 0-0 was produced by reducer 0 and was the first file written by that reducer. A file ending 0-1 is the second file written by that reducer. A file ending 1-0 was produced by reducer 1 and was the first file written by that reducer. In the example, notice how the directory subdir was converted into two files. If mapred.reduce.tasks permits, multiple reducers can cooperate to crush a large directory. 126 | 127 | Now a clone example. Say we invoked the crush like this: 128 | 129 | Crush --clone /user/example/work/input /user/example/clone 20100221175612 130 | 131 | With the clone option. We would end up with: 132 | 133 | /user/example/ 134 | work/input/ 135 | crushed_file-20100221175612-0-0 136 | big-enough-file 137 | subdir/ 138 | crushed_file-20100221175612-1-0 139 | crushed_file-20100221175612-1-1 140 | clone/user/example/input/ 141 | small-file1 142 | small-file2 143 | small-file3 144 | small-file4 145 | subdir/ 146 | small-file6 147 | small-file7 148 | small-file8 149 | medium-file1 150 | medium-file2 151 | 152 | Note how the original directory structure of /user/example/input as it appeared before the crush is reproduced in /user/example/clone. The small files that were combined are moved to the clone directory while the output files and file that were "big enough" are now in the inpu directory. Clone mode is useful for crushing external Hive tables. Just make sure that there are no Hive queries running on the table because they will fail when the small files are moved to the clone directory. 
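Putting the pieces together, a small wrapper script might generate the timestamp inline. This is only a sketch that combines the date command shown in the ARGUMENTS section with the clone invocation above; the TIMESTAMP variable name is illustrative and not part of the tool:

  TIMESTAMP=`date +%Y%m%d%H%M%S`
  Crush --clone /user/example/work/input /user/example/clone $TIMESTAMP
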
153 | 154 | Now we try an example using the directory options. Say we invoke the crush like this to control the output file names: 155 | 156 | Crush \ 157 | --regex=.*/(.+) \ 158 | --replacement=$1-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 159 | --input=sequence \ 160 | --output=sequence \ 161 | /user/example/work/input /user/example/work/output 20100221175612 162 | 163 | The --regex and --replacement arguments are similar to the arguments passed to String.replaceAll(). The regex argument matches the final part of a directory path. For /user/example/work/input, it will match input. For /user/example/work/input/subdir, it will match subdir. For matching purposes, a directory path does not have a trailing slash. The replacement argument refers to the match group by number to rename the file. The result is: 164 | 165 | /user/example/work/output/ 166 | input-20100221175612-0-0 167 | big-enough-file 168 | subdir/ 169 | subdir-20100221175612-1-0 170 | subdir-20100221175612-1-1 171 | 172 | The regex and replacement options are useful for naming the output files when crushing external Hive tables that are partitioned into directories whose names have business significance. 173 | 174 | The following invocation fails: 175 | 176 | Crush \ 177 | --regex=.*/input \ 178 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 179 | --input=sequence \ 180 | --output=sequence \ 181 | /user/example/work/input /user/example/work/output 20100221175612 182 | 183 | Since we have specified some directory options, we must ensure that all directories in hierarchy rooted at the input argument have a matching regex (since the default regex is no longer applicable). In this invocation, there is no regex argument that matches /user/example/work/input/subdir. We must change it to: 184 | 185 | Crush \ 186 | --regex=.*/input \ 187 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 188 | --input=sequence \ 189 | --output=sequence \ 190 | --regex=.*/subdir \ 191 | --replacement=as-text-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 192 | --input=sequence \ 193 | --output=text \ 194 | /user/example/work/input /user/example/work/output 20100221175612 195 | 196 | This will yield: 197 | 198 | /user/example/work/output/ 199 | input-20100221175612-0-0 200 | big-enough-file 201 | subdir/ 202 | as-text-20100221175612-1-0 203 | as-text-20100221175612-1-1 204 | 205 | Notice subdir has two files whose names differ only by the ${crush.file.num} value. Without the ${crush.file.num}, file names are not guaranteed to be unique. 206 | 207 | NOTES 208 | 209 | This program creates a temporary directories in "tmp" of the executing user's home directory in dfs. 210 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Hadoop filecrusher. 2 | 3 | Turn many small files into fewer larger ones. Also change from text to sequence and other compression options in one pass. 4 | Crush 5 | 6 | NAME 7 | 8 | Crush - Crush small files in dfs to fewer, larger files 9 | 10 | SYNOPSIS 11 | Crush [OPTION]... 12 | 13 | DESCRIPTION 14 | 15 | Crush consumes directories containing many small files with the same key and value types and creates fewer, larger files containing the same data. 
Crush is gives you the control to: 16 | 17 | * Name the output files 18 | * Ignore files that are "big enough" 19 | * Limit the size of each output file 20 | * Control the output compression codec 21 | * Swap smaller files with generated large files in-place 22 | * No long-running task problem 23 | 24 | See the EXAMPLES section 25 | 26 | ARGUMENTS 27 | 28 | input dir 29 | The root of the directory tree to crush. Directories are found recursively. 30 | 31 | output dir 32 | In non-clone mode, the directory where the output files should be written. In clone mode, the directory where the original files (that were combine into larger files) should be moved. 33 | 34 | timestamp 35 | A 14 digit job timestamp used to uniquely name files. E.g. 20100221175612. Generate in a script with: date +%Y%m%d%H%M%S 36 | 37 | GLOBAL OPTIONS 38 | 39 | -?, --help 40 | Print this help message. 41 | 42 | --threshold 43 | Percent threshold relative to the dfs block size over which a file becomes eligible for crushing. Must be in the (0, 1]. Default is 0.75, which means files smaller than or equal to 75% of a dfs block will be eligible for crushing. File greater than 75% of a dfs block will be left untouched. 44 | 45 | --max-file-blocks 46 | The maximum number of dfs blocks per output file. Must be a positive integer. Small input files are associated with an output file under the assumption that input and output compression codecs have similar efficiency. Also, a directory containing a lot of data in many small files will be converted into a directory containing a fewer number of large files rather than one super-massive file. With the default value 8, 80 small files, each being 1/10th of a dfs block will be grouped into to a single output file since 8 * 1/10 = 8 dfs blocks. If there are 81 small files, each being 1/10th of a dfs block, two output files will be created. One output file contain the combined contents of 41 files and the second will contain the combined contents of the other 40. A directory of many small files will be converted into fewer number of larger files where each output file is roughly the same size. 47 | 48 | --compress 49 | Fully qualified class name of the compression codec to use when writing data. It is permissible to use "none" and "gzip" to indicate no compression and org.apache.hadoop.io.compress.GzipCodec, respectively. 50 | 51 | --clone 52 | Use clone mode. Useful for external Hive tables. In clone mode, the small files are replaced with the larger files. The small files are moved to a subdirectory of the output dir argument. The subdirectory is same as the original directory rooted at output dir. For example, assume the input dir argument and output dir argument are /user/example/input and /user/example/output, respectively. If a file was originally /user/example/input/my-dir/smallfile, then after the clone, the original file would be located in /user/example/output/user/example/input/my-dir/smallfile. 53 | 54 | --info 55 | Print information to the console about what the crush is doing. 56 | 57 | --verbose 58 | Print even more information to the console about what the crush is doing. 59 | 60 | DIRECTORY OPTIONS 61 | 62 | If specified, these options must be appear as a group. When specifying multiple groups of these options, order matters. Defaults for directory options are not used if any are specified. See the EXAMPLES section. 63 | 64 | --regex 65 | Regular expression that matches a directory name. Defaults to .+ if no directory options are specified at all. 
Empty directories are not required to have a matching regex. Conceptually similar to the first argument of String.replaceAll(). 66 | 67 | --replacement 68 | Replacement string used with corresponding regex to name output files. Defaults to crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num} if no directory options are specified at all. The placeholder ${crush.timestamp} refers to the command line argument. ${crush.task.num} refers to the reducer number. ${crush.file.num} is a zero-based count of files producer by a specific reducer. The first file written by a reducer will have ${crush.file.num} = 0, the second = 1, the third = 2, etc. Conceptually similar to the second argument of String.replaceAll(). 69 | 70 | --input-format 71 | Fully qualified class name of the input format for the data in a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextInputFormat and org.apache.hadoop.mapred.SequenceFileInputFormat, respectively. Defaults to sequence if no directory options are specified. 72 | 73 | --output-format 74 | Fully qualified class name of the output format to use when writing the output file for a directory. Can use the "text" and "sequence" shortcuts for org.apache.hadoop.mapred.TextOutputFormat and org.apache.hadoop.mapred.SequenceFileOutputFormat, respectively. Defaults to sequence if no directory options are specified. 75 | 76 | EXAMPLES 77 | 78 | Say we have the following files: 79 | 80 | /user/example/work/input/ 81 | small-file1 82 | small-file2 83 | small-file3 84 | small-file4 85 | big-enough-file 86 | subdir/ 87 | small-file6 88 | small-file7 89 | small-file8 90 | medium-file1 91 | medium-file2 92 | 93 | And we invoke the crush like this: 94 | 95 | Crush /user/example/work/input /user/example/work/output 20100221175612 96 | 97 | Since we have not specified any of the directory options, the default regex, replacement, input-format, and output-format are used. We will get: 98 | 99 | /user/example/work/ 100 | input/ 101 | small-file1 102 | small-file2 103 | small-file3 104 | small-file4 105 | subdir/ 106 | small-file6 107 | small-file7 108 | small-file8 109 | medium-file1 110 | medium-file2 111 | output/ 112 | crushed_file-20100221175612-0-0 113 | big-enough-file 114 | subdir/ 115 | crushed_file-20100221175612-1-0 116 | crushed_file-20100221175612-1-1 117 | 118 | Where: 119 | 120 | crushed_file-20100221175612-0-0 = small-file1 + small-file2 + small-file3 + small-file4 121 | 122 | crushed_file-20100221175612-1-0 = medium-file1 + small-file6 + small-file8 123 | 124 | crushed_file-20100221175612-1-1 = medium-file2 + small-file7 125 | 126 | Notice how big-enough-file was moved to the output directory. The input directory contains only the files that were combined into the larger files. 127 | 128 | By default, the output file names end with two numbers. The first number is the task number of the reducer that wrote the file. The second number is the zero-based file count of that specific reducer. So a file ending with 0-0 was produced by reducer 0 and was the first file written by that reducer. A file ending 0-1 is the second file written by that reducer. A file ending 1-0 was produced by reducer 1 and was the first file written by that reducer. In the example, notice how the directory subdir was converted into two files. If mapred.reduce.tasks permits, multiple reducers can cooperate to crush a large directory. 129 | 130 | Now a clone example. 
Say we invoked the crush like this: 131 | 132 | Crush --clone /user/example/work/input /user/example/clone 20100221175612 133 | 134 | With the clone option. We would end up with: 135 | 136 | /user/example/ 137 | work/input/ 138 | crushed_file-20100221175612-0-0 139 | big-enough-file 140 | subdir/ 141 | crushed_file-20100221175612-1-0 142 | crushed_file-20100221175612-1-1 143 | clone/user/example/input/ 144 | small-file1 145 | small-file2 146 | small-file3 147 | small-file4 148 | subdir/ 149 | small-file6 150 | small-file7 151 | small-file8 152 | medium-file1 153 | medium-file2 154 | 155 | Note how the original directory structure of /user/example/input as it appeared before the crush is reproduced in /user/example/clone. The small files that were combined are moved to the clone directory while the output files and file that were "big enough" are now in the inpu directory. Clone mode is useful for crushing external Hive tables. Just make sure that there are no Hive queries running on the table because they will fail when the small files are moved to the clone directory. 156 | 157 | Now we try an example using the directory options. Say we invoke the crush like this to control the output file names: 158 | 159 | Crush \ 160 | --regex=.*/(.+) \ 161 | --replacement=$1-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 162 | --input=sequence \ 163 | --output=sequence \ 164 | /user/example/work/input /user/example/work/output 20100221175612 165 | 166 | The --regex and --replacement arguments are similar to the arguments passed to String.replaceAll(). The regex argument matches the final part of a directory path. For /user/example/work/input, it will match input. For /user/example/work/input/subdir, it will match subdir. For matching purposes, a directory path does not have a trailing slash. The replacement argument refers to the match group by number to rename the file. The result is: 167 | 168 | /user/example/work/output/ 169 | input-20100221175612-0-0 170 | big-enough-file 171 | subdir/ 172 | subdir-20100221175612-1-0 173 | subdir-20100221175612-1-1 174 | 175 | The regex and replacement options are useful for naming the output files when crushing external Hive tables that are partitioned into directories whose names have business significance. 176 | 177 | The following invocation fails: 178 | 179 | Crush \ 180 | --regex=.*/input \ 181 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 182 | --input=sequence \ 183 | --output=sequence \ 184 | /user/example/work/input /user/example/work/output 20100221175612 185 | 186 | Since we have specified some directory options, we must ensure that all directories in hierarchy rooted at the input argument have a matching regex (since the default regex is no longer applicable). In this invocation, there is no regex argument that matches /user/example/work/input/subdir. 
We must change it to: 187 | 188 | Crush \ 189 | --regex=.*/input \ 190 | --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 191 | --input=sequence \ 192 | --output=sequence \ 193 | --regex=.*/subdir \ 194 | --replacement=as-text-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ 195 | --input=sequence \ 196 | --output=text \ 197 | /user/example/work/input /user/example/work/output 20100221175612 198 | 199 | This will yield: 200 | 201 | /user/example/work/output/ 202 | input-20100221175612-0-0 203 | big-enough-file 204 | subdir/ 205 | as-text-20100221175612-1-0 206 | as-text-20100221175612-1-1 207 | 208 | Notice subdir has two files whose names differ only by the ${crush.file.num} value. Without the ${crush.file.num}, file names are not guaranteed to be unique. 209 | 210 | NOTES 211 | 212 | This program creates a temporary directories in "tmp" of the executing user's home directory in dfs. 213 | 214 | https://zenodo.org/badge/doi/10.5281/zenodo.11038.png 215 | 216 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushReducerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.util.Arrays.asList; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.junit.Assert.assertThat; 21 | import static org.junit.Assert.fail; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.util.Arrays; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 30 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 31 | import org.apache.hadoop.mapred.TextInputFormat; 32 | import org.apache.hadoop.mapred.TextOutputFormat; 33 | import org.junit.Before; 34 | import org.junit.Rule; 35 | import org.junit.Test; 36 | import org.junit.rules.TemporaryFolder; 37 | 38 | import com.m6d.filecrush.crush.CrushReducer; 39 | import com.m6d.filecrush.crush.KeyValuePreservingTextInputFormat; 40 | 41 | @SuppressWarnings("deprecation") 42 | public class CrushReducerTest { 43 | 44 | @Rule 45 | public final TemporaryFolder tmp = new TemporaryFolder(); 46 | 47 | private File outDir; 48 | 49 | private CrushReducer reducer; 50 | 51 | @Before 52 | public void setupReducer() { 53 | JobConf job = new JobConf(false); 54 | 55 | job.set("mapred.tip.id", "task_201011081200_014527_r_001234"); 56 | job.set("mapred.task.id", "attempt_201011081200_14527_r_001234_0"); 57 | 58 | outDir = tmp.newFolder("out"); 59 | tmp.newFolder("out/_temporary"); 60 | 61 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 62 | 63 | job.set("fs.default.name", "file:///"); 64 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 65 | 66 | job.setLong("crush.timestamp", 98765); 67 | 68 | job.setInt("crush.num.specs", 3); 69 | job.set("crush.0.regex", ".+/dir"); 70 | job.set("crush.0.regex.replacement", "firstregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 71 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 72 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 73 | 74 | job.set("crush.1.regex", ".+/dir/([^/]+/)*(.+)"); 75 | job.set("crush.1.regex.replacement", "secondregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 76 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 77 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 78 | 79 | job.set("crush.2.regex", ".+/other"); 80 | job.set("crush.2.regex.replacement", "${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail"); 81 | job.set("crush.2.input.format", TextInputFormat.class.getName()); 82 | job.set("crush.2.output.format", SequenceFileOutputFormat.class.getName()); 83 | 84 | reducer = new CrushReducer(); 85 | 86 | reducer.configure(job); 87 | } 88 | 89 | @Test 90 | public void taskNum() { 91 | assertThat("task_201011081200_14527_r_1234 => 1234", reducer.getTaskNum(), equalTo(1234)); 92 | } 93 | 94 | @Test 95 | public void timestamp() { 96 | assertThat(reducer.getTimestamp(), equalTo(98765L)); 97 | } 98 | 99 | @Test 100 | public void inputRegexList() { 101 | assertThat(reducer.getInputRegexList(), equalTo(asList(".+/dir", ".+/dir/([^/]+/)*(.+)", ".+/other"))); 102 | } 103 | 104 | @Test 105 | public void outputReplacementList() { 106 | /* 107 | * Job configuration already performs some token substitution. 
108 | */ 109 | assertThat(reducer.getOutputReplacementList(), equalTo(asList("firstregex-98765-${crush.task.num}-${crush.file.num}", 110 | "secondregex-$2-98765-${crush.task.num}-${crush.file.num}", 111 | "98765-${crush.task.num}-middle-${crush.file.num}-tail"))); 112 | } 113 | 114 | @Test 115 | public void inputFormatList() { 116 | assertThat(reducer.getInputFormatList(), equalTo(Arrays.> asList(SequenceFileInputFormat.class, 117 | KeyValuePreservingTextInputFormat.class, 118 | KeyValuePreservingTextInputFormat.class))); 119 | } 120 | 121 | @Test 122 | public void outputFormatList() { 123 | assertThat(reducer.getOutputFormatList(), equalTo(Arrays.> asList( TextOutputFormat.class, 124 | TextOutputFormat.class, 125 | SequenceFileOutputFormat.class))); 126 | } 127 | 128 | @Test 129 | public void calculateOutputfile() { 130 | assertThat(reducer.findMatcher("/path/to/a/dir"), equalTo(0)); 131 | assertThat(reducer.calculateOutputFile(0, "/path/to/a/dir"), equalTo("/path/to/a/dir/firstregex-98765-1234-0")); 132 | 133 | assertThat(reducer.findMatcher("/path/to/a/dir/foo/dir"), equalTo(0)); 134 | assertThat(reducer.calculateOutputFile(0, "/path/to/a/dir/foo/dir"), equalTo("/path/to/a/dir/foo/dir/firstregex-98765-1234-1")); 135 | 136 | assertThat(reducer.findMatcher("/path/to/a/dir/subdir"), equalTo(1)); 137 | assertThat(reducer.calculateOutputFile(1, "/path/to/a/dir/subdir"), equalTo("/path/to/a/dir/subdir/secondregex-subdir-98765-1234-2")); 138 | 139 | assertThat(reducer.findMatcher("/x/dir/foo/bar"), equalTo(1)); 140 | assertThat(reducer.calculateOutputFile(1, "/x/dir/foo/bar"), equalTo("/x/dir/foo/bar/secondregex-bar-98765-1234-3")); 141 | 142 | assertThat(reducer.findMatcher("/x/other"), equalTo(2)); 143 | assertThat(reducer.calculateOutputFile(2, "/x/other"), equalTo("/x/other/98765-1234-middle-4-tail")); 144 | 145 | assertThat(reducer.findMatcher("/x/foo/other"), equalTo(2)); 146 | assertThat(reducer.calculateOutputFile(2, "/x/foo/other"), equalTo("/x/foo/other/98765-1234-middle-5-tail")); 147 | } 148 | 149 | @Test 150 | public void fileNotFound() throws IOException { 151 | try { 152 | reducer.reduce(new Text("/path/to/a/dir-4"), asList(new Text("/file/does/not/exist")).iterator(), null, null); 153 | fail(); 154 | } catch (IOException e) { 155 | if (!e.getMessage().contains("/file/does/not/exist")) { 156 | throw e; 157 | } 158 | } 159 | } 160 | 161 | @Test(expected = IllegalArgumentException.class) 162 | public void noMatchingInputPattern() { 163 | reducer.findMatcher("nothing matches me"); 164 | } 165 | 166 | @Test 167 | public void missingInputRegex() { 168 | JobConf job = new JobConf(false); 169 | 170 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 171 | 172 | job.set("fs.default.name", "file:///"); 173 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 174 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 175 | 176 | job.setLong("crush.timestamp", 98765); 177 | 178 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 179 | 180 | job.setInt("crush.num.specs", 2); 181 | job.set("crush.0.regex", "foo"); 182 | job.set("crush.0.regex.replacement", "bar"); 183 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 184 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 185 | 186 | job.set("crush.1.regex.replacement", "bar"); 187 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 188 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 189 | 190 | reducer = new CrushReducer(); 
191 | 192 | try { 193 | reducer.configure(job); 194 | fail(); 195 | } catch (IllegalArgumentException e) { 196 | if (!"No input regex: crush.1.regex".equals(e.getMessage())) { 197 | throw e; 198 | } 199 | } 200 | } 201 | 202 | @Test 203 | public void missingOutputRegex() { 204 | JobConf job = new JobConf(false); 205 | 206 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 207 | 208 | job.set("fs.default.name", "file:///"); 209 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 210 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 211 | 212 | job.setLong("crush.timestamp", 98765); 213 | 214 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 215 | 216 | job.setInt("crush.num.specs", 2); 217 | job.set("crush.0.regex", "foo"); 218 | job.set("crush.0.regex.replacement", "bar"); 219 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 220 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 221 | 222 | job.set("crush.1.regex", "hello"); 223 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 224 | job.set("crush.1.output.format", TextOutputFormat.class.getName()); 225 | 226 | reducer = new CrushReducer(); 227 | 228 | try { 229 | reducer.configure(job); 230 | fail(); 231 | } catch (IllegalArgumentException e) { 232 | if (!"No output replacement: crush.1.regex.replacement".equals(e.getMessage())) { 233 | throw e; 234 | } 235 | } 236 | } 237 | 238 | @Test 239 | public void missingInputFormat() { 240 | JobConf job = new JobConf(false); 241 | 242 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 243 | 244 | job.set("fs.default.name", "file:///"); 245 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 246 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 247 | 248 | job.setLong("crush.timestamp", 98765); 249 | 250 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 251 | 252 | job.setInt("crush.num.specs", 2); 253 | job.set("crush.0.regex", "foo"); 254 | job.set("crush.0.regex.replacement", "bar"); 255 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 256 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 257 | 258 | job.set("crush.1.regex", "hello"); 259 | job.set("crush.1.regex.replacement", "hello"); 260 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 261 | 262 | reducer = new CrushReducer(); 263 | 264 | try { 265 | reducer.configure(job); 266 | fail(); 267 | } catch (IllegalArgumentException e) { 268 | if (!"No input format: crush.1.input.format".equals(e.getMessage())) { 269 | throw e; 270 | } 271 | } 272 | } 273 | 274 | @Test 275 | public void inputFormatWrongType() { 276 | JobConf job = new JobConf(false); 277 | 278 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 279 | 280 | job.set("fs.default.name", "file:///"); 281 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 282 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 283 | 284 | job.setLong("crush.timestamp", 98765); 285 | 286 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 287 | 288 | job.setInt("crush.num.specs", 2); 289 | job.set("crush.0.regex", "foo"); 290 | job.set("crush.0.regex.replacement", "bar"); 291 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 292 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 293 | 294 | job.set("crush.1.regex", "hello"); 295 | job.set("crush.1.regex.replacement", "hello"); 296 | 
job.set("crush.1.input.format", Object.class.getName()); 297 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 298 | 299 | reducer = new CrushReducer(); 300 | 301 | try { 302 | reducer.configure(job); 303 | fail(); 304 | } catch (IllegalArgumentException e) { 305 | if (!"Not a file input format: crush.1.input.format=java.lang.Object".equals(e.getMessage())) { 306 | throw e; 307 | } 308 | } 309 | } 310 | 311 | @Test 312 | public void missingOutputFormat() { 313 | JobConf job = new JobConf(false); 314 | 315 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 316 | 317 | job.set("fs.default.name", "file:///"); 318 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 319 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 320 | 321 | job.setLong("crush.timestamp", 98765); 322 | 323 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 324 | 325 | job.setInt("crush.num.specs", 2); 326 | job.set("crush.0.regex", "foo"); 327 | job.set("crush.0.regex.replacement", "bar"); 328 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 329 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 330 | 331 | job.set("crush.1.regex", "hello"); 332 | job.set("crush.1.regex.replacement", "hello"); 333 | job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); 334 | 335 | reducer = new CrushReducer(); 336 | 337 | try { 338 | reducer.configure(job); 339 | fail(); 340 | } catch (IllegalArgumentException e) { 341 | if (!"No output format: crush.1.output.format".equals(e.getMessage())) { 342 | throw e; 343 | } 344 | } 345 | } 346 | 347 | @Test 348 | public void outputFormatWrongType() { 349 | JobConf job = new JobConf(false); 350 | 351 | job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); 352 | 353 | job.set("fs.default.name", "file:///"); 354 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 355 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 356 | 357 | job.setLong("crush.timestamp", 98765); 358 | 359 | job.setLong("dfs.block.size", 1024 * 1024 * 64L); 360 | 361 | job.setInt("crush.num.specs", 2); 362 | job.set("crush.0.regex", "foo"); 363 | job.set("crush.0.regex.replacement", "bar"); 364 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 365 | job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName()); 366 | 367 | job.set("crush.1.regex", "hello"); 368 | job.set("crush.1.regex.replacement", "hello"); 369 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 370 | job.set("crush.1.output.format", Object.class.getName()); 371 | 372 | reducer = new CrushReducer(); 373 | 374 | try { 375 | reducer.configure(job); 376 | fail(); 377 | } catch (IllegalArgumentException e) { 378 | if (!"Not an output format: crush.1.output.format=java.lang.Object".equals(e.getMessage())) { 379 | throw e; 380 | } 381 | } 382 | } 383 | } 384 | -------------------------------------------------------------------------------- /src/main/java/com/m6d/filecrush/crush/CrushReducer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.HashMap; 23 | import java.util.Iterator; 24 | import java.util.List; 25 | import java.util.Map; 26 | import java.util.regex.Matcher; 27 | import java.util.regex.Pattern; 28 | 29 | import org.apache.commons.logging.Log; 30 | import org.apache.commons.logging.LogFactory; 31 | import org.apache.hadoop.fs.FileSystem; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapred.FileInputFormat; 35 | import org.apache.hadoop.mapred.InputSplit; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.mapred.JobConfigurable; 38 | import org.apache.hadoop.mapred.MapReduceBase; 39 | import org.apache.hadoop.mapred.OutputCollector; 40 | import org.apache.hadoop.mapred.OutputFormat; 41 | import org.apache.hadoop.mapred.RecordReader; 42 | import org.apache.hadoop.mapred.RecordWriter; 43 | import org.apache.hadoop.mapred.Reducer; 44 | import org.apache.hadoop.mapred.Reporter; 45 | import org.apache.hadoop.mapred.TextInputFormat; 46 | 47 | @SuppressWarnings("deprecation") 48 | public class CrushReducer extends MapReduceBase implements Reducer { 49 | 50 | private final Text valueOut = new Text(); 51 | 52 | /** 53 | * Internal counter for the number of input groups processed. Used to report status. 54 | */ 55 | private int fileNum; 56 | 57 | /** 58 | * The number of source files that have been crushed. 59 | */ 60 | private int recordNumber; 61 | 62 | /** 63 | * Report status when after processing this number of files. 64 | */ 65 | private int reportRecordNumber = 100; 66 | 67 | private int taskNum; 68 | 69 | private long timestamp; 70 | 71 | private JobConf job; 72 | 73 | private FileSystem fs; 74 | 75 | /** 76 | * Matched against dir names to calculate the crush output file name. 77 | */ 78 | private List inputRegexList; 79 | 80 | /** 81 | * Used with corresponding element in {@link #inputRegexList} to calculate the crush ouput file name. 82 | */ 83 | private List outputReplacementList; 84 | 85 | /** 86 | * Input formats that correspond with {@link #inputRegexList}. 87 | */ 88 | private List> inFormatClsList; 89 | 90 | /** 91 | * Output formats that correspond with {@link #inputRegexList}. 92 | */ 93 | private List> outFormatClsList; 94 | 95 | /** 96 | * Used to substitute values into placeholders. 97 | */ 98 | private Map placeHolderToValue = new HashMap(3); 99 | 100 | /** 101 | * Used to locate placeholders in the replacement strings. 102 | */ 103 | private Matcher placeholderMatcher = Pattern.compile("\\$\\{([a-zA-Z]([a-zA-Z\\.]*))\\}").matcher("dummy"); 104 | 105 | /** 106 | * Path to the output dir of the job. Used to compute the final output file names for the crush files, which are the values in 107 | * the reducer output. 
108 | */ 109 | private String outDirPath; 110 | 111 | @Override 112 | public void configure(JobConf job) { 113 | super.configure(job); 114 | 115 | this.job = job; 116 | 117 | taskNum = Integer.parseInt(job.get("mapred.tip.id").replaceFirst(".+_(\\d+)", "$1")); 118 | timestamp = Long.parseLong(job.get("crush.timestamp")); 119 | 120 | outDirPath = job.get("mapred.output.dir"); 121 | 122 | if (null == outDirPath || outDirPath.isEmpty()) { 123 | throw new IllegalArgumentException("mapred.output.dir has no value"); 124 | } 125 | 126 | /* 127 | * The files we write should be rooted in the "crush" subdir of the output directory to distinguish them from the files 128 | * created by the collector. 129 | */ 130 | outDirPath = new Path(outDirPath + "/crush").toUri().getPath(); 131 | 132 | /* 133 | * Configure the regular expressions and replacements we use to convert dir names to crush output file names. Also get the 134 | * directory data formats. 135 | */ 136 | int numSpecs = job.getInt("crush.num.specs", 0); 137 | 138 | if (numSpecs <= 0) { 139 | throw new IllegalArgumentException("Number of regular expressions must be zero or greater: " + numSpecs); 140 | } 141 | 142 | readCrushSpecs(numSpecs); 143 | 144 | placeHolderToValue.put("crush.task.num", Integer.toString(taskNum)); 145 | placeHolderToValue.put("crush.timestamp", job.get("crush.timestamp")); 146 | 147 | try { 148 | fs = FileSystem.get(job); 149 | } catch (RuntimeException e) { 150 | throw e; 151 | } catch (Exception e) { 152 | throw new RuntimeException(e); 153 | } 154 | } 155 | 156 | /** 157 | * Populates the following fields with non-default values from the configuration. 158 | * 159 | *
160 | *   <li>{@link #inputRegexList}</li>
161 | *   <li>{@link #outputReplacementList}</li>
162 | *   <li>{@link #inFormatClsList}</li>
163 | *   <li>{@link #outFormatClsList}</li>
164 | * </ul>
165 | */ 166 | private void readCrushSpecs(int numSpecs) { 167 | inputRegexList = new ArrayList(numSpecs); 168 | outputReplacementList = new ArrayList(numSpecs); 169 | inFormatClsList = new ArrayList>(numSpecs); 170 | outFormatClsList = new ArrayList>(numSpecs); 171 | 172 | for (int i = 0; i < numSpecs; i++) { 173 | String key; 174 | String value; 175 | 176 | /* 177 | * Regex. 178 | */ 179 | key = format("crush.%d.regex", i); 180 | value = job.get(key); 181 | 182 | if (null == value || value.isEmpty()) { 183 | throw new IllegalArgumentException("No input regex: " + key); 184 | } 185 | 186 | inputRegexList.add(Pattern.compile(value).matcher("dummy")); 187 | 188 | /* 189 | * Replacement for regex. 190 | */ 191 | key = format("crush.%d.regex.replacement", i); 192 | value = job.get(key); 193 | 194 | if (null == value || value.isEmpty()) { 195 | throw new IllegalArgumentException("No output replacement: " + key); 196 | } 197 | 198 | outputReplacementList.add(value); 199 | 200 | /* 201 | * Input format 202 | */ 203 | key = format("crush.%d.input.format", i); 204 | value = job.get(key); 205 | 206 | if (null == value || value.isEmpty()) { 207 | throw new IllegalArgumentException("No input format: " + key); 208 | } 209 | 210 | try { 211 | Class inFormatCls; 212 | 213 | if (value.equals(TextInputFormat.class.getName())) { 214 | inFormatCls = KeyValuePreservingTextInputFormat.class; 215 | } else { 216 | inFormatCls = Class.forName(value); 217 | 218 | if (!FileInputFormat.class.isAssignableFrom(inFormatCls)) { 219 | throw new IllegalArgumentException(format("Not a file input format: %s=%s", key, value)); 220 | } 221 | } 222 | 223 | inFormatClsList.add(inFormatCls); 224 | } catch (ClassNotFoundException e) { 225 | throw new IllegalArgumentException(format("Not a valid class: %s=%s", key, value)); 226 | } 227 | 228 | /* 229 | * Output format. 230 | */ 231 | key = format("crush.%d.output.format", i); 232 | value = job.get(key); 233 | 234 | if (null == value || value.isEmpty()) { 235 | throw new IllegalArgumentException("No output format: " + key); 236 | } 237 | 238 | try { 239 | Class outFormatCls = Class.forName(value); 240 | 241 | if (!OutputFormat.class.isAssignableFrom(outFormatCls)) { 242 | throw new IllegalArgumentException(format("Not an output format: %s=%s", key, value)); 243 | } 244 | 245 | outFormatClsList.add(outFormatCls); 246 | } catch (ClassNotFoundException e) { 247 | throw new IllegalArgumentException(format("Not a valid class: %s=%s", key, value)); 248 | } 249 | } 250 | } 251 | 252 | @Override 253 | public void reduce(Text bucketId, Iterator values, OutputCollector collector, Reporter reporter) throws IOException { 254 | String bucket = bucketId.toString(); 255 | 256 | String dirName = bucket.substring(0, bucket.lastIndexOf('-')); 257 | 258 | int idx = findMatcher(dirName); 259 | 260 | String outputFileName = calculateOutputFile(idx, dirName); 261 | 262 | /* 263 | * Don't need to separate the paths because the output file name is already absolute. 264 | */ 265 | valueOut.set(outDirPath + outputFileName); 266 | 267 | LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName)); 268 | 269 | /* 270 | * Strip the leading slash to make the path relative. the output format will relativize it to the task attempt work dir. 
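* Prepending "crush" to the absolute output file name does exactly that: for instance, a name like "/path/to/a/dir/firstregex-98765-1234-0" (the value exercised in the unit test) becomes the relative writer path "crush/path/to/a/dir/firstregex-98765-1234-0".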
271 | */ 272 | RecordWriter sink = null; 273 | Exception rootCause = null; 274 | 275 | Object key = null; 276 | Object value = null; 277 | 278 | try { 279 | while (null == rootCause && values.hasNext()) { 280 | Text srcFile = values.next(); 281 | Path inputPath = new Path(srcFile.toString()); 282 | 283 | RecordReader reader = createRecordReader(idx, inputPath, reporter); 284 | 285 | try { 286 | if (null == key) { 287 | key = reader.createKey(); 288 | value = reader.createValue(); 289 | 290 | /* 291 | * Set the key and value class in the conf, which the output format uses to get type information. 292 | */ 293 | job.setOutputKeyClass(key.getClass()); 294 | job.setOutputValueClass(value.getClass()); 295 | 296 | /* 297 | * Output file name is absolute so we can just add it to the crush prefix. 298 | */ 299 | sink = createRecordWriter(idx, "crush" + outputFileName); 300 | } else { 301 | 302 | Class other = reader.createKey().getClass(); 303 | 304 | if (!(key.getClass().equals(other))) { 305 | throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s !- %s", inputPath, key.getClass(), other)); 306 | } 307 | 308 | other = reader.createValue().getClass(); 309 | 310 | if (!value.getClass().equals(other)) { 311 | throw new IllegalArgumentException(format("Heterogeneous values detected in %s: %s !- %s", inputPath, value.getClass(), other)); 312 | } 313 | } 314 | 315 | while (reader.next(key, value)) { 316 | sink.write(key, value); 317 | reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1); 318 | } 319 | } catch (Exception e) { 320 | rootCause = e; 321 | } finally { 322 | try { 323 | reader.close(); 324 | } catch (Exception e) { 325 | if (null == rootCause) { 326 | rootCause = e; 327 | } else { 328 | LOG.debug("Swallowing exception on close of " + inputPath, e); 329 | } 330 | } 331 | } 332 | 333 | /* 334 | * Output of the reducer is the source file => crushed file (in the final output dir, no the task attempt work dir. 335 | */ 336 | collector.collect(srcFile, valueOut); 337 | reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1); 338 | 339 | recordNumber++; 340 | 341 | if (reportRecordNumber == recordNumber) { 342 | reportRecordNumber += reportRecordNumber; 343 | 344 | reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath)); 345 | } 346 | } 347 | } catch (Exception e) { 348 | rootCause = e; 349 | } finally { 350 | if (null != sink) { 351 | try { 352 | sink.close(reporter); 353 | } catch (Exception e) { 354 | if (null == rootCause) { 355 | rootCause = e; 356 | } else { 357 | LOG.error("Swallowing exception on close of " + outputFileName, e); 358 | } 359 | } 360 | } 361 | 362 | /* 363 | * Let the exception bubble up with a minimum of wrapping. 364 | */ 365 | if (null != rootCause) { 366 | if (rootCause instanceof RuntimeException) { 367 | throw (RuntimeException) rootCause; 368 | } 369 | 370 | if (rootCause instanceof IOException) { 371 | throw (IOException) rootCause; 372 | } 373 | 374 | throw new RuntimeException(rootCause); 375 | } 376 | } 377 | } 378 | 379 | /** 380 | * Returns a record writer that creates files in the task attempt work directory. Path must be relative! 
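* With the old mapred API, a file-based output format resolves a relative name against the task attempt's work output directory (mapred.work.output.dir in the tests), which is why reduce() hands this method "crush" + outputFileName instead of the absolute path.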
381 | */ 382 | @SuppressWarnings("unchecked") 383 | private RecordWriter createRecordWriter(int idx, String path) throws IOException { 384 | Class> cls = (Class>) outFormatClsList.get(idx); 385 | 386 | try { 387 | OutputFormat format = (OutputFormat) cls.newInstance(); 388 | 389 | return format.getRecordWriter(fs, job, path, null); 390 | } catch (RuntimeException e) { 391 | throw e; 392 | } catch (IOException e) { 393 | throw e; 394 | } catch (Exception e) { 395 | throw new RuntimeException(e); 396 | } 397 | } 398 | 399 | @SuppressWarnings("unchecked") 400 | private RecordReader createRecordReader(int idx, Path inputPath, Reporter reporter) throws IOException { 401 | 402 | LOG.info(format("Opening '%s'", inputPath)); 403 | 404 | Class> cls = (Class>) inFormatClsList.get(idx); 405 | 406 | try { 407 | FileInputFormat.setInputPaths(job, inputPath); 408 | 409 | FileInputFormat instance = cls.newInstance(); 410 | 411 | if (instance instanceof JobConfigurable) { 412 | ((JobConfigurable) instance).configure(job); 413 | } 414 | 415 | InputSplit[] splits = instance.getSplits(job, 1); 416 | 417 | if (1 != splits.length) { 418 | throw new IllegalArgumentException("Could not get input splits: " + inputPath); 419 | } 420 | 421 | return (RecordReader) instance.getRecordReader(splits[0], job, reporter); 422 | } catch (RuntimeException e) { 423 | throw e; 424 | } catch (IOException e) { 425 | throw e; 426 | } catch (Exception e) { 427 | throw new RuntimeException(e); 428 | } 429 | } 430 | 431 | /** 432 | * Converts the name of a directory to a path to the crush output file using the specs at the given index. The path will the 433 | * directory and file name separated by a slash /. Performs placeholder substitution on the corresponding replacement string in 434 | * {@link #outputReplacementList}. The final replacement string is then used to form the final path. 435 | */ 436 | String calculateOutputFile(int idx, String srcDir) { 437 | 438 | StringBuffer sb = new StringBuffer(srcDir); 439 | sb.append("/"); 440 | 441 | String replacement = outputReplacementList.get(idx); 442 | 443 | placeHolderToValue.put("crush.file.num", Integer.toString(fileNum++)); 444 | 445 | placeholderMatcher.reset(replacement); 446 | 447 | while (placeholderMatcher.find()) { 448 | String key = placeholderMatcher.group(1); 449 | 450 | String value = placeHolderToValue.get(key); 451 | 452 | if (null == value) { 453 | throw new IllegalArgumentException("No value for key: " + key); 454 | } 455 | 456 | placeholderMatcher.appendReplacement(sb, value); 457 | } 458 | 459 | placeholderMatcher.appendTail(sb); 460 | 461 | Matcher matcher = inputRegexList.get(idx); 462 | matcher.reset(srcDir); 463 | 464 | String finalOutputName = matcher.replaceAll(sb.toString()); 465 | 466 | return finalOutputName; 467 | } 468 | 469 | /** 470 | * Returns the index into {@link #inputRegexList} of first pattern that matches the argument. 
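* Specs are tried in order, so a directory such as "/path/to/a/dir/foo/dir" that satisfies both ".+/dir" and ".+/dir/([^/]+/)*(.+)" resolves to the earlier spec (index 0 in the unit test); an IllegalArgumentException is thrown when no pattern matches.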
471 | */ 472 | int findMatcher(String dir) { 473 | 474 | String outputNameWithPlaceholders = null; 475 | 476 | for (int i = 0; i < inputRegexList.size() && outputNameWithPlaceholders == null; i++) { 477 | Matcher matcher = inputRegexList.get(i); 478 | 479 | matcher.reset(dir); 480 | 481 | if (matcher.matches()) { 482 | return i; 483 | } 484 | } 485 | 486 | throw new IllegalArgumentException("No matching input regex: " + dir); 487 | } 488 | 489 | int getTaskNum() { 490 | return taskNum; 491 | } 492 | 493 | long getTimestamp() { 494 | return timestamp; 495 | } 496 | 497 | List getInputRegexList() { 498 | ArrayList list = new ArrayList(inputRegexList.size()); 499 | 500 | for (Matcher matcher : inputRegexList) { 501 | list.add(matcher.pattern().pattern()); 502 | } 503 | 504 | return list; 505 | } 506 | 507 | List getOutputReplacementList() { 508 | return new ArrayList(outputReplacementList); 509 | } 510 | 511 | List> getInputFormatList() { 512 | return new ArrayList>(inFormatClsList); 513 | } 514 | 515 | List> getOutputFormatList() { 516 | return new ArrayList>(outFormatClsList); 517 | } 518 | 519 | private static final Log LOG = LogFactory.getLog(CrushReducer.class); 520 | } 521 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushOptionParsingTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.System.currentTimeMillis; 19 | import static org.hamcrest.Matchers.equalTo; 20 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; 21 | import static org.hamcrest.Matchers.is; 22 | import static org.junit.Assert.assertThat; 23 | import static org.junit.Assert.fail; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.fs.FileSystem; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.junit.Before; 31 | import org.junit.Rule; 32 | import org.junit.Test; 33 | import org.junit.rules.TemporaryFolder; 34 | 35 | import com.m6d.filecrush.crush.Crush; 36 | 37 | @SuppressWarnings("deprecation") 38 | public class CrushOptionParsingTest { 39 | @Rule 40 | public final TemporaryFolder tmp = new TemporaryFolder(); 41 | 42 | private Crush crush; 43 | 44 | @Before 45 | public void before() throws IOException { 46 | crush = new Crush(); 47 | 48 | JobConf job = new JobConf(false); 49 | crush.setConf(job); 50 | 51 | job.set("fs.default.name", "file:///"); 52 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 53 | job.setInt("mapred.reduce.tasks", 20); 54 | job.setLong("dfs.block.size", 1024 * 1024 * 64); 55 | 56 | FileSystem fs = FileSystem.get(job); 57 | fs.setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath())); 58 | 59 | crush.setFileSystem(fs); 60 | } 61 | 62 | @Test 63 | public void unrecognizedOption() { 64 | try { 65 | crush.createJobConfAndParseArgs("-bad", "in", "out", "20101116123015"); 66 | fail(); 67 | } catch (Exception e) { 68 | } 69 | } 70 | 71 | @Test 72 | public void badRegexCount() throws Exception { 73 | try { 74 | crush.createJobConfAndParseArgs( 75 | "--regex", ".+/ads/.+", 76 | "--replacement", "foo", 77 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 78 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 79 | "--replacement", "bar", 80 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 81 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 82 | "in", "out", "20101116123015"); 83 | fail(); 84 | } catch (IllegalArgumentException e) { 85 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 86 | throw e; 87 | } 88 | } 89 | } 90 | 91 | @Test 92 | public void badCompressCodec() throws Exception { 93 | try { 94 | crush.createJobConfAndParseArgs( 95 | "--regex", ".+/ads/.+", 96 | "--replacement", "foo", 97 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 98 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 99 | "--compress", "java.lang.Object", 100 | "in", "out", "20101116123015"); 101 | fail(); 102 | } catch (IllegalArgumentException e) { 103 | if (!e.getMessage().contains("java.lang.Object")) { 104 | throw e; 105 | } 106 | } 107 | } 108 | 109 | @Test 110 | public void badCompressCodecNotAClass() throws Exception { 111 | try { 112 | crush.createJobConfAndParseArgs( 113 | "--regex", ".+/ads/.+", 114 | "--replacement", "foo", 115 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 116 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 117 | "--compress", "foo", 118 | "in", "out", "20101116123015"); 119 | fail(); 120 | } catch (IllegalArgumentException e) { 121 | if (!e.getMessage().contains("foo")) { 122 | throw e; 123 | } 124 | } 125 | } 126 | 127 | @Test 128 | public void badReplacementCount() throws Exception { 129 | try { 130 | 
crush.createJobConfAndParseArgs( 131 | "--regex", ".+/ads/.+", 132 | "--replacement", "foo", 133 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 134 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 135 | "--regex", ".+/act/.+", 136 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 137 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 138 | "in", "out", "20101116123015"); 139 | fail(); 140 | } catch (IllegalArgumentException e) { 141 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 142 | throw e; 143 | } 144 | } 145 | } 146 | 147 | @Test 148 | public void badInputFormatCount() throws Exception { 149 | try { 150 | crush.createJobConfAndParseArgs( 151 | "--regex", ".+/ads/.+", 152 | "--replacement", "foo", 153 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 154 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 155 | "--regex", ".+/act/.+", 156 | "--replacement", "bar", 157 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 158 | "in", "out", "20101116123015"); 159 | fail(); 160 | } catch (IllegalArgumentException e) { 161 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 162 | throw e; 163 | } 164 | } 165 | } 166 | 167 | @Test 168 | public void badOutputFormatCount() throws Exception { 169 | try { 170 | crush.createJobConfAndParseArgs( 171 | "--regex", ".+/ads/.+", 172 | "--replacement", "foo", 173 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 174 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 175 | "--regex", ".+/act/.+", 176 | "--replacement", "bar", 177 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 178 | "in", "out", "20101116123015"); 179 | fail(); 180 | } catch (IllegalArgumentException e) { 181 | if (!e.getMessage().equals("Must be an equal number of regex, replacement, in-format, and out-format options")) { 182 | throw e; 183 | } 184 | } 185 | } 186 | 187 | @Test 188 | public void badInputFormat() throws Exception { 189 | try { 190 | crush.createJobConfAndParseArgs( 191 | "--regex", ".+/ads/.+", 192 | "--replacement", "foo", 193 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 194 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 195 | "--regex", ".+/act/.+", 196 | "--replacement", "bar", 197 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 198 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 199 | "--regex", ".+/bid/.+", 200 | "--replacement", "hello", 201 | "--input-format", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 202 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 203 | "--threshold", "0.5", 204 | "--max-file-blocks", "100", 205 | "in", "out", "20101116123015"); 206 | fail(); 207 | } catch (IllegalArgumentException e) { 208 | if (!e.getMessage().contains("org.apache.hadoop.mapreduce.lib.input.TextInputFormat")) { 209 | throw e; 210 | } 211 | } 212 | } 213 | 214 | @Test 215 | public void badInputFormatNotAClass() throws Exception { 216 | try { 217 | crush.createJobConfAndParseArgs( 218 | "--regex", ".+/ads/.+", 219 | "--replacement", "foo", 220 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 221 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 222 | "--regex", ".+/act/.+", 223 | "--replacement", "bar", 224 | "--input-format", 
"org.apache.hadoop.mapred.TextInputFormat", 225 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 226 | "--regex", ".+/bid/.+", 227 | "--replacement", "hello", 228 | "--input-format", "foo", 229 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 230 | "--threshold", "0.5", 231 | "--max-file-blocks", "100", 232 | "in", "out", "20101116123015"); 233 | fail(); 234 | } catch (IllegalArgumentException e) { 235 | if (!e.getMessage().contains("foo")) { 236 | throw e; 237 | } 238 | } 239 | } 240 | 241 | @Test 242 | public void badOutputFormat() throws Exception { 243 | try { 244 | crush.createJobConfAndParseArgs( 245 | "--regex", ".+/ads/.+", 246 | "--replacement", "foo", 247 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 248 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 249 | "--regex", ".+/act/.+", 250 | "--replacement", "bar", 251 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 252 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 253 | "--regex", ".+/bid/.+", 254 | "--replacement", "hello", 255 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 256 | "--output-format", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", 257 | "--threshold", "0.5", 258 | "--max-file-blocks", "100", 259 | "in", "out", "20101116123015"); 260 | fail(); 261 | } catch (IllegalArgumentException e) { 262 | if (!e.getMessage().contains("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")) { 263 | throw e; 264 | } 265 | } 266 | } 267 | 268 | @Test 269 | public void badOutputFormatNotAClass() throws Exception { 270 | try { 271 | crush.createJobConfAndParseArgs( 272 | "--regex", ".+/ads/.+", 273 | "--replacement", "foo", 274 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 275 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 276 | "--regex", ".+/act/.+", 277 | "--replacement", "bar", 278 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 279 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 280 | "--regex", ".+/bid/.+", 281 | "--replacement", "hello", 282 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 283 | "--output-format", "foo", 284 | "--threshold", "0.5", 285 | "--max-file-blocks", "100", 286 | "in", "out", "20101116123015"); 287 | fail(); 288 | } catch (IllegalArgumentException e) { 289 | if (!e.getMessage().contains("foo")) { 290 | throw e; 291 | } 292 | } 293 | } 294 | 295 | @Test 296 | public void badSourceDir() throws Exception { 297 | try { 298 | crush.createJobConfAndParseArgs("does not exist", tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 299 | } catch (IOException e) { 300 | if (!e.getMessage().contains("does not exist")) { 301 | throw e; 302 | } 303 | } 304 | } 305 | 306 | @Test 307 | public void defaults() throws Exception { 308 | crush.createJobConfAndParseArgs(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 309 | 310 | JobConf job = crush.getJob(); 311 | 312 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 313 | assertThat(job.get("mapred.output.compress"), equalTo("true")); 314 | assertThat(job.get("mapred.output.compression.type"), equalTo("BLOCK")); 315 | assertThat(job.get("mapred.output.compression.codec"), equalTo("org.apache.hadoop.io.compress.DefaultCodec")); 316 | 317 | assertThat(crush.getMaxFileBlocks(), equalTo(8)); 318 | 319 | 
assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 320 | 321 | assertThat(job.get("crush.num.specs"), equalTo("1")); 322 | 323 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 324 | assertThat(job.get("crush.0.regex.replacement"), equalTo("crushed_file-20101116123015-${crush.task.num}-${crush.file.num}")); 325 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 326 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 327 | } 328 | 329 | @Test 330 | public void disableCompression() throws Exception { 331 | crush.createJobConfAndParseArgs( 332 | "--compress=none", 333 | tmp.newFolder("in").getAbsolutePath(), 334 | tmp.newFolder("out").getAbsolutePath(), 335 | "20101116123015"); 336 | 337 | JobConf job = crush.getJob(); 338 | 339 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 340 | assertThat(job.get("mapred.output.compress"), equalTo("false")); 341 | 342 | assertThat(crush.getMaxFileBlocks(), equalTo(8)); 343 | 344 | assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 345 | 346 | assertThat(job.get("crush.num.specs"), equalTo("1")); 347 | 348 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 349 | assertThat(job.get("crush.0.regex.replacement"), equalTo("crushed_file-20101116123015-${crush.task.num}-${crush.file.num}")); 350 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 351 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 352 | } 353 | 354 | @Test 355 | public void parse() throws Exception { 356 | crush.createJobConfAndParseArgs( 357 | "--regex", ".+/ads/.+", 358 | "--replacement", "foo", 359 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 360 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 361 | "--regex", ".+/act/.+", 362 | "--replacement", "bar", 363 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 364 | "--output-format", "org.apache.hadoop.mapred.SequenceFileOutputFormat", 365 | "--regex", ".+/bid/.+", 366 | "--replacement", "hello", 367 | "--input-format", "org.apache.hadoop.mapred.SequenceFileInputFormat", 368 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 369 | "--threshold", "0.5", 370 | "--max-file-blocks", "100", 371 | "--compress", "org.apache.hadoop.io.compress.DefaultCodec", 372 | 373 | tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "20101116123015"); 374 | 375 | JobConf job = crush.getJob(); 376 | 377 | assertThat(job.get("mapred.reduce.tasks"), equalTo("20")); 378 | assertThat(job.get("mapred.output.compress"), equalTo("true")); 379 | assertThat(job.get("mapred.output.compression.codec"), equalTo("org.apache.hadoop.io.compress.DefaultCodec")); 380 | 381 | assertThat(crush.getMaxFileBlocks(), equalTo(100)); 382 | 383 | assertThat(job.get("crush.timestamp"), equalTo("20101116123015")); 384 | 385 | assertThat(job.get("crush.num.specs"), equalTo("3")); 386 | 387 | assertThat(job.get("crush.0.regex"), equalTo(".+/ads/.+")); 388 | assertThat(job.get("crush.0.regex.replacement"), equalTo("foo")); 389 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 390 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 391 | 392 | assertThat(job.get("crush.1.regex"), equalTo(".+/act/.+")); 393 | 
assertThat(job.get("crush.1.regex.replacement"), equalTo("bar")); 394 | assertThat(job.get("crush.1.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 395 | assertThat(job.get("crush.1.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 396 | 397 | assertThat(job.get("crush.2.regex"), equalTo(".+/bid/.+")); 398 | assertThat(job.get("crush.2.regex.replacement"), equalTo("hello")); 399 | assertThat(job.get("crush.2.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 400 | assertThat(job.get("crush.2.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 401 | } 402 | 403 | @Test 404 | public void parseOldNoType() throws Exception { 405 | long millis = currentTimeMillis(); 406 | 407 | crush.createJobConfAndParseArgs( 408 | tmp.newFolder("in").getAbsolutePath(), 409 | tmp.newFolder("out").getAbsolutePath(), 410 | "80"); 411 | 412 | JobConf job = crush.getJob(); 413 | 414 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 415 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 416 | assertThat(job.get("crush.num.specs"), equalTo("1")); 417 | 418 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 419 | 420 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 421 | assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 422 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 423 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 424 | } 425 | 426 | @Test 427 | public void parseOldSequence() throws Exception { 428 | long millis = currentTimeMillis(); 429 | 430 | crush.createJobConfAndParseArgs( 431 | tmp.newFolder("in").getAbsolutePath(), 432 | tmp.newFolder("out").getAbsolutePath(), 433 | "80", 434 | "SEQUENCE"); 435 | 436 | JobConf job = crush.getJob(); 437 | 438 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 439 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 440 | assertThat(job.get("crush.num.specs"), equalTo("1")); 441 | 442 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 443 | 444 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 445 | assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 446 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.SequenceFileInputFormat")); 447 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.SequenceFileOutputFormat")); 448 | } 449 | 450 | @Test 451 | public void parseOldText() throws Exception { 452 | long millis = currentTimeMillis(); 453 | 454 | crush.createJobConfAndParseArgs( 455 | tmp.newFolder("in").getAbsolutePath(), 456 | tmp.newFolder("out").getAbsolutePath(), 457 | "80", 458 | "TEXT"); 459 | 460 | JobConf job = crush.getJob(); 461 | 462 | assertThat(job.get("mapred.reduce.tasks"), equalTo("80")); 463 | assertThat(Long.parseLong(job.get("crush.timestamp")), greaterThanOrEqualTo(millis)); 464 | assertThat(job.get("crush.num.specs"), equalTo("1")); 465 | 466 | assertThat(crush.getMaxFileBlocks(), equalTo(Integer.MAX_VALUE)); 467 | 468 | assertThat(job.get("crush.0.regex"), equalTo(".+")); 469 | 
assertThat(job.get("crush.0.regex.replacement").matches("crushed_file-\\d+-\\$\\{crush.task.num\\}-\\$\\{crush.file.num\\}"), is(true)); 470 | assertThat(job.get("crush.0.input.format"), equalTo("org.apache.hadoop.mapred.TextInputFormat")); 471 | assertThat(job.get("crush.0.output.format"), equalTo("org.apache.hadoop.mapred.TextOutputFormat")); 472 | } 473 | 474 | @Test 475 | public void parseOldBadType() throws Exception { 476 | try { 477 | crush.createJobConfAndParseArgs("in", 478 | "out", 479 | "80", 480 | "FOO"); 481 | fail(); 482 | } catch (IllegalArgumentException e) { 483 | if (!e.getMessage().contains("FOO")) { 484 | throw e; 485 | } 486 | } 487 | } 488 | } 489 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushReducerParameterizedTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.util.Arrays.asList; 20 | import static org.easymock.EasyMock.expectLastCall; 21 | import static org.easymock.EasyMock.isA; 22 | import static org.hamcrest.Matchers.equalTo; 23 | import static org.hamcrest.Matchers.is; 24 | import static org.hamcrest.Matchers.nullValue; 25 | import static org.junit.Assert.assertThat; 26 | import static org.junit.Assert.fail; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.File; 30 | import java.io.FileInputStream; 31 | import java.io.FileOutputStream; 32 | import java.io.IOException; 33 | import java.io.InputStreamReader; 34 | import java.io.PrintWriter; 35 | import java.util.ArrayList; 36 | import java.util.Collection; 37 | import java.util.LinkedHashMap; 38 | import java.util.List; 39 | import java.util.Map; 40 | import java.util.Map.Entry; 41 | 42 | import org.apache.hadoop.fs.FileSystem; 43 | import org.apache.hadoop.fs.Path; 44 | import org.apache.hadoop.io.LongWritable; 45 | import org.apache.hadoop.io.SequenceFile; 46 | import org.apache.hadoop.io.SequenceFile.CompressionType; 47 | import org.apache.hadoop.io.SequenceFile.Reader; 48 | import org.apache.hadoop.io.SequenceFile.Writer; 49 | import org.apache.hadoop.io.Text; 50 | import org.apache.hadoop.io.compress.DefaultCodec; 51 | import org.apache.hadoop.mapred.JobConf; 52 | import org.apache.hadoop.mapred.OutputCollector; 53 | import org.apache.hadoop.mapred.Reporter; 54 | import org.apache.hadoop.mapred.SequenceFileInputFormat; 55 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 56 | import org.apache.hadoop.mapred.TextInputFormat; 57 | import org.apache.hadoop.mapred.TextOutputFormat; 58 | import org.easymock.EasyMockSupport; 59 | import org.junit.Before; 60 | import org.junit.Rule; 61 | import org.junit.Test; 62 | import org.junit.rules.TemporaryFolder; 63 | import org.junit.runner.RunWith; 64 | import org.junit.runners.Parameterized; 65 | import 
org.junit.runners.Parameterized.Parameters; 66 | 67 | import com.m6d.filecrush.crush.CrushReducer; 68 | import com.m6d.filecrush.crush.ReducerCounter; 69 | 70 | @RunWith(Parameterized.class) 71 | @SuppressWarnings("deprecation") 72 | public class CrushReducerParameterizedTest extends EasyMockSupport { 73 | @Parameters 74 | public static Collection testCases() { 75 | List testCases = new ArrayList(); 76 | 77 | for (Object[] testCase : new Object[][] { new Object[] { CompressionType.NONE }, 78 | new Object[] { CompressionType.BLOCK }, 79 | new Object[] { CompressionType.RECORD }}) { 80 | testCases.add(testCase); 81 | } 82 | 83 | return testCases; 84 | } 85 | 86 | @Rule 87 | public final TemporaryFolder tmp = new TemporaryFolder(); 88 | 89 | private final CompressionType compressionType; 90 | 91 | private OutputCollector collector; 92 | 93 | private Reporter reporter; 94 | 95 | private CrushReducer reducer; 96 | 97 | private JobConf job; 98 | 99 | private FileSystem fs; 100 | 101 | /** 102 | * Simulates the task attempt work dir that is created by Hadoop. 103 | */ 104 | private File workDir; 105 | 106 | /** 107 | * Simulates the output dir to which the attempt's output will be copied. 108 | */ 109 | private File outDir; 110 | 111 | public CrushReducerParameterizedTest(CompressionType compressionType) { 112 | super(); 113 | 114 | this.compressionType = compressionType; 115 | } 116 | 117 | @Before 118 | public void setupReducer() throws IOException { 119 | job = new JobConf(false); 120 | 121 | job.set("mapred.tip.id", "task_201011081200_014527_r_001234"); 122 | job.set("mapred.task.id", "attempt_201011081200_14527_r_001234_0"); 123 | 124 | /* 125 | * This logic tree around compression simulates what the output formats do. 126 | */ 127 | if (CompressionType.NONE == compressionType) { 128 | job.setBoolean("mapred.output.compress", false); 129 | } else { 130 | job.setBoolean("mapred.output.compress", true); 131 | job.set("mapred.output.compression.type", compressionType.name()); 132 | job.set("mapred.output.compression.codec", CustomCompressionCodec.class.getName()); 133 | } 134 | 135 | outDir = tmp.newFolder("out"); 136 | tmp.newFolder("out/_temporary"); 137 | workDir = tmp.newFolder("out/_temporary/_" + job.get("mapred.task.id")); 138 | 139 | job.set("mapred.output.dir", outDir.getAbsolutePath()); 140 | job.set("mapred.work.output.dir", workDir.getAbsolutePath()); 141 | 142 | job.set("fs.default.name", "file:///"); 143 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 144 | 145 | job.setLong("crush.timestamp", 98765); 146 | 147 | job.setInt("crush.num.specs", 4); 148 | job.set("crush.0.regex", ".+/other"); 149 | job.set("crush.0.regex.replacement", "${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail"); 150 | job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); 151 | job.set("crush.0.output.format", TextOutputFormat.class.getName()); 152 | 153 | job.set("crush.1.regex", ".+/dir"); 154 | job.set("crush.1.regex.replacement", "secondregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 155 | job.set("crush.1.input.format", TextInputFormat.class.getName()); 156 | job.set("crush.1.output.format", SequenceFileOutputFormat.class.getName()); 157 | 158 | job.set("crush.2.regex", ".+/dir/([^/]+/)*(.+)"); 159 | job.set("crush.2.regex.replacement", "thirdregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); 160 | job.set("crush.2.input.format", SequenceFileInputFormat.class.getName()); 161 | job.set("crush.2.output.format", 
SequenceFileOutputFormat.class.getName()); 162 | 163 | job.set("crush.3.regex", ".+/text"); 164 | job.set("crush.3.regex.replacement", "fourthregex-${crush.task.num}-${crush.timestamp}-${crush.file.num}"); 165 | job.set("crush.3.input.format", TextInputFormat.class.getName()); 166 | job.set("crush.3.output.format", TextOutputFormat.class.getName()); 167 | 168 | reducer = new CrushReducer(); 169 | 170 | reducer.configure(job); 171 | 172 | fs = FileSystem.get(job); 173 | } 174 | 175 | @Before 176 | @SuppressWarnings("unchecked") 177 | public void setupMocks() { 178 | collector = createMock("collector", OutputCollector.class); 179 | reporter = createMock("reporter", Reporter.class); 180 | } 181 | 182 | @Test 183 | public void reduce() throws IOException { 184 | reporter.setStatus(isA(String.class)); 185 | expectLastCall().anyTimes(); 186 | 187 | /* 188 | * We setup a few directories to exercise regexes. In this comment, dirs are distinguished by a trailing slash. The 189 | * file name is followed by the bucket id. 190 | * 191 | * dir/ 192 | * file10 0 193 | * file11 0 194 | * file12 1 195 | * file13 1 196 | * subdir/ 197 | * file20 0 198 | * file21 0 199 | * file22 1 200 | * file23 1 201 | * file24 1 202 | * subsubdir/ 203 | * file30 0 204 | * file31 0 205 | * file32 0 206 | * file33 1 207 | * file34 1 208 | * other/ 209 | * file40 1 210 | * file41 1 211 | * file42 2 212 | * file43 2 213 | * other/ 214 | * file50 0 215 | * file51 0 216 | * file52 1 217 | * file53 1 218 | * file54 3 219 | * file55 3 220 | * text/ 221 | * file60 2 222 | * file61 2 223 | * file62 3 224 | * file63 3 225 | * 226 | * Now setup the dir so the reducer has some data to work with. 227 | */ 228 | 229 | Map> inputGroups = new LinkedHashMap>(); 230 | 231 | 232 | /* 233 | * dir/ 234 | * file10 0 235 | * file11 0 236 | * file12 1 237 | * file13 1 238 | * 239 | * These files match the first regex. 
240 | */ 241 | File dir = tmp.newFolder("dir"); 242 | 243 | inputGroups.put(new Text(dir.getAbsolutePath() + "-0"), asList( writeFile(dir, "file10", Format.TEXT), 244 | writeFile(dir, "file11", Format.TEXT))); 245 | 246 | inputGroups.put(new Text(dir.getAbsolutePath() + "-1"), asList( writeFile(dir, "file12", Format.TEXT), 247 | writeFile(dir, "file13", Format.TEXT))); 248 | 249 | recordCollectForFile(dir, "file10", "secondregex-98765-1234-0"); 250 | recordCollectForFile(dir, "file11", "secondregex-98765-1234-0"); 251 | recordCollectForFile(dir, "file12", "secondregex-98765-1234-1"); 252 | recordCollectForFile(dir, "file13", "secondregex-98765-1234-1"); 253 | 254 | 255 | /* 256 | * dir/ 257 | * subdir/ 258 | * file20 0 259 | * file21 0 260 | * file22 1 261 | * file23 1 262 | * file24 1 263 | */ 264 | File subdir = tmp.newFolder("dir/subdir"); 265 | 266 | inputGroups.put(new Text(subdir.getAbsolutePath() + "-0"), asList( writeFile(subdir, "file20", Format.SEQUENCE), 267 | writeFile(subdir, "file21", Format.SEQUENCE))); 268 | 269 | inputGroups.put(new Text(subdir.getAbsolutePath() + "-1"), asList( writeFile(subdir, "file22", Format.SEQUENCE), 270 | writeFile(subdir, "file23", Format.SEQUENCE), 271 | writeFile(subdir, "file24", Format.SEQUENCE))); 272 | 273 | recordCollectForFile(subdir, "file20", "thirdregex-subdir-98765-1234-2"); 274 | recordCollectForFile(subdir, "file21", "thirdregex-subdir-98765-1234-2"); 275 | recordCollectForFile(subdir, "file22", "thirdregex-subdir-98765-1234-3"); 276 | recordCollectForFile(subdir, "file23", "thirdregex-subdir-98765-1234-3"); 277 | recordCollectForFile(subdir, "file24", "thirdregex-subdir-98765-1234-3"); 278 | 279 | 280 | /* 281 | * dir/ 282 | * subdir/ 283 | * subsubdir/ 284 | * file30 0 285 | * file31 0 286 | * file32 0 287 | * file33 1 288 | * file34 1 289 | */ 290 | File subsubdir = tmp.newFolder("dir/subdir/subsubdir"); 291 | 292 | inputGroups.put(new Text(subsubdir.getAbsolutePath() + "-0"), asList( writeFile(subsubdir, "file30", Format.SEQUENCE), 293 | writeFile(subsubdir, "file31", Format.SEQUENCE), 294 | writeFile(subsubdir, "file32", Format.SEQUENCE))); 295 | 296 | inputGroups.put(new Text(subsubdir.getAbsolutePath() + "-1"), asList( writeFile(subsubdir, "file33", Format.SEQUENCE), 297 | writeFile(subsubdir, "file34", Format.SEQUENCE))); 298 | 299 | recordCollectForFile(subsubdir, "file30", "thirdregex-subsubdir-98765-1234-4"); 300 | recordCollectForFile(subsubdir, "file31", "thirdregex-subsubdir-98765-1234-4"); 301 | recordCollectForFile(subsubdir, "file32", "thirdregex-subsubdir-98765-1234-4"); 302 | recordCollectForFile(subsubdir, "file33", "thirdregex-subsubdir-98765-1234-5"); 303 | recordCollectForFile(subsubdir, "file34", "thirdregex-subsubdir-98765-1234-5"); 304 | 305 | 306 | /* 307 | * dir/ 308 | * subdir/ 309 | * other/ 310 | * file40 1 311 | * file41 1 312 | * file42 2 313 | * file43 2 314 | */ 315 | File other1 = tmp.newFolder("dir/subdir/other"); 316 | 317 | inputGroups.put(new Text(other1.getAbsolutePath() + "-1"), asList( writeFile(other1, "file40", Format.SEQUENCE), 318 | writeFile(other1, "file41", Format.SEQUENCE))); 319 | 320 | inputGroups.put(new Text(other1.getAbsolutePath() + "-2"), asList( writeFile(other1, "file42", Format.SEQUENCE), 321 | writeFile(other1, "file43", Format.SEQUENCE))); 322 | 323 | recordCollectForFile(other1, "file40", "98765-1234-middle-6-tail"); 324 | recordCollectForFile(other1, "file41", "98765-1234-middle-6-tail"); 325 | recordCollectForFile(other1, "file42", "98765-1234-middle-7-tail"); 326 | 
recordCollectForFile(other1, "file43", "98765-1234-middle-7-tail"); 327 | 328 | 329 | /* 330 | * dir/ 331 | * other/ 332 | * file50 0 333 | * file51 0 334 | * file52 1 335 | * file53 1 336 | * file54 3 337 | * file55 3 338 | */ 339 | File other2 = tmp.newFolder("dir/other"); 340 | 341 | inputGroups.put(new Text(other2.getAbsolutePath() + "-0"), asList( writeFile(other2, "file50", Format.SEQUENCE), 342 | writeFile(other2, "file51", Format.SEQUENCE))); 343 | 344 | inputGroups.put(new Text(other2.getAbsolutePath() + "-1"), asList( writeFile(other2, "file52", Format.SEQUENCE), 345 | writeFile(other2, "file53", Format.SEQUENCE))); 346 | 347 | inputGroups.put(new Text(other2.getAbsolutePath() + "-3"), asList( writeFile(other2, "file54", Format.SEQUENCE), 348 | writeFile(other2, "file55", Format.SEQUENCE))); 349 | 350 | recordCollectForFile(other2, "file50", "98765-1234-middle-8-tail"); 351 | recordCollectForFile(other2, "file51", "98765-1234-middle-8-tail"); 352 | recordCollectForFile(other2, "file52", "98765-1234-middle-9-tail"); 353 | recordCollectForFile(other2, "file53", "98765-1234-middle-9-tail"); 354 | recordCollectForFile(other2, "file54", "98765-1234-middle-10-tail"); 355 | recordCollectForFile(other2, "file55", "98765-1234-middle-10-tail"); 356 | 357 | /* 358 | * text/ 359 | * file60 2 360 | * file61 2 361 | * file62 3 362 | * file63 3 363 | */ 364 | File text = tmp.newFolder("text"); 365 | 366 | inputGroups.put(new Text(text.getAbsolutePath() + "-2"), asList(writeFile(text, "file60", Format.TEXT), 367 | writeFile(text, "file61", Format.TEXT))); 368 | 369 | inputGroups.put(new Text(text.getAbsolutePath() + "-3"), asList(writeFile(text, "file62", Format.TEXT), 370 | writeFile(text, "file63", Format.TEXT))); 371 | 372 | recordCollectForFile(text, "file60", "fourthregex-1234-98765-11"); 373 | recordCollectForFile(text, "file61", "fourthregex-1234-98765-11"); 374 | recordCollectForFile(text, "file62", "fourthregex-1234-98765-12"); 375 | recordCollectForFile(text, "file63", "fourthregex-1234-98765-12"); 376 | 377 | replayAll(); 378 | 379 | for (Entry> e : inputGroups.entrySet()) { 380 | reducer.reduce(e.getKey(), e.getValue().iterator(), collector, reporter); 381 | } 382 | 383 | verifyAll(); 384 | 385 | verifyWorkOutput(dir, "secondregex-98765-1234-0", Format.TEXT, Format.SEQUENCE, "file10", "file11"); 386 | verifyWorkOutput(dir, "secondregex-98765-1234-1", Format.TEXT, Format.SEQUENCE, "file12", "file13"); 387 | verifyWorkOutput(subdir, "thirdregex-subdir-98765-1234-2", Format.SEQUENCE, Format.SEQUENCE, "file20", "file21"); 388 | verifyWorkOutput(subdir, "thirdregex-subdir-98765-1234-3", Format.SEQUENCE, Format.SEQUENCE, "file22", "file23", "file24"); 389 | verifyWorkOutput(subsubdir, "thirdregex-subsubdir-98765-1234-4", Format.SEQUENCE, Format.SEQUENCE, "file30", "file31", "file32"); 390 | verifyWorkOutput(subsubdir, "thirdregex-subsubdir-98765-1234-5", Format.SEQUENCE, Format.SEQUENCE, "file33", "file34"); 391 | verifyWorkOutput(other1, "98765-1234-middle-6-tail", Format.SEQUENCE, Format.TEXT, "file40", "file41"); 392 | verifyWorkOutput(other1, "98765-1234-middle-7-tail", Format.SEQUENCE, Format.TEXT, "file42", "file43"); 393 | verifyWorkOutput(other2, "98765-1234-middle-8-tail", Format.SEQUENCE, Format.TEXT, "file50", "file51"); 394 | verifyWorkOutput(other2, "98765-1234-middle-9-tail", Format.SEQUENCE, Format.TEXT, "file52", "file53"); 395 | verifyWorkOutput(other2, "98765-1234-middle-10-tail", Format.SEQUENCE, Format.TEXT, "file54", "file55"); 396 | verifyWorkOutput(text, 
"fourthregex-1234-98765-11", Format.TEXT, Format.TEXT, "file60", "file61"); 397 | verifyWorkOutput(text, "fourthregex-1234-98765-12", Format.TEXT, Format.TEXT, "file62", "file63"); 398 | } 399 | 400 | /** 401 | * Verifies that the work dir has the expected output. 402 | */ 403 | private void verifyWorkOutput(File srcDir, String crushedOutFileName, Format inFmt, Format outFmt, String... fileNames) throws IOException { 404 | 405 | /* 406 | * Read format table 407 | * 408 | * \ out format 409 | * \ 410 | * in format \ seq | text 411 | * ---------------------------- 412 | * seq | Custom | ascii | 413 | * -------------------------- - 414 | * text | Text | ascii | 415 | * ---------------------------- 416 | */ 417 | File crushOutput = new File(workDir.getAbsolutePath() + "/crush" + srcDir.getAbsolutePath() + "/" + crushedOutFileName); 418 | 419 | if (Format.TEXT == outFmt) { 420 | /* 421 | * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually what we want. 422 | * We want to preserve the actualy keys and values in the files, just like SequenceFileInputFormat. So, either way, the 423 | * keys and values should be the text representations of what went in. 424 | */ 425 | BufferedReader reader; 426 | 427 | /* 428 | * Text output format appends the default extension of the codec, if there is one. 429 | */ 430 | if (CompressionType.NONE == compressionType) { 431 | reader = new BufferedReader(new InputStreamReader(new FileInputStream(crushOutput))); 432 | } else { 433 | CustomCompressionCodec codec = new CustomCompressionCodec(); 434 | codec.setConf(job); 435 | 436 | reader = new BufferedReader(new InputStreamReader(codec.createInputStream(new FileInputStream(crushOutput + ".custom")))); 437 | } 438 | 439 | String line = ""; 440 | 441 | for (String fileName : fileNames) { 442 | int max = Integer.parseInt(fileName.substring(4)); 443 | 444 | for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) { 445 | String expectedLine = format("%d\t%d", key, value); 446 | 447 | line = reader.readLine(); 448 | 449 | assertThat(line, equalTo(expectedLine)); 450 | } 451 | } 452 | 453 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.readLine(), nullValue()); 454 | 455 | reader.close(); 456 | } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) { 457 | /* 458 | * Record reader will produce keys that are custom writables and values that are custom writable. 
459 | */ 460 | Reader reader = new Reader(fs, new Path(crushOutput.getAbsolutePath()), job); 461 | 462 | assertThat(reader.isCompressed(), is(compressionType != CompressionType.NONE)); 463 | 464 | if (reader.isCompressed()) { 465 | assertThat(reader.isBlockCompressed(), is(compressionType == CompressionType.BLOCK)); 466 | assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) CustomCompressionCodec.class)); 467 | } 468 | 469 | CustomWritable key = new CustomWritable(); 470 | CustomWritable value = new CustomWritable(); 471 | 472 | for (String fileName : fileNames) { 473 | int max = Integer.parseInt(fileName.substring(4)); 474 | 475 | for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) { 476 | reader.next(key, value); 477 | 478 | assertThat(fileName, key.get(), equalTo((long) k)); 479 | assertThat(fileName, value.get(), equalTo((long) v)); 480 | } 481 | } 482 | 483 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.next(key, value), is(false)); 484 | 485 | reader.close(); 486 | } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) { 487 | 488 | Reader reader = new Reader(fs, new Path(crushOutput.getAbsolutePath()), job); 489 | 490 | assertThat(reader.isCompressed(), is(compressionType != CompressionType.NONE)); 491 | 492 | if (reader.isCompressed()) { 493 | assertThat(reader.isBlockCompressed(), is(compressionType == CompressionType.BLOCK)); 494 | assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) CustomCompressionCodec.class)); 495 | } 496 | 497 | Text key = new Text(); 498 | Text value = new Text(); 499 | 500 | for (String fileName : fileNames) { 501 | int max = Integer.parseInt(fileName.substring(4)); 502 | 503 | for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) { 504 | reader.next(key, value); 505 | 506 | assertThat(fileName, key.toString(), equalTo(Integer.toString(k))); 507 | assertThat(fileName, value.toString(), equalTo(Integer.toString(v))); 508 | } 509 | } 510 | 511 | assertThat("Should be at end of crush output file" + crushedOutFileName, reader.next(key, value), is(false)); 512 | 513 | reader.close(); 514 | } else { 515 | fail(); 516 | } 517 | } 518 | 519 | /** 520 | * Records an expectation that a file has been crushed. The key is the absolute path of the crush input file. The value is the 521 | * absolute path of the crush output file, which is rooted in the output dir/crush (not the attempt work dir). 522 | */ 523 | private void recordCollectForFile(File srcDir, String crushInput, String crushOutput) throws IOException { 524 | Text srcFileAbsPath = new Text(new File(srcDir, crushInput).getAbsolutePath()); 525 | Text fileInJobOutputDir = new Text(format("%s/crush%s", outDir.getAbsolutePath(), new File(srcDir, crushOutput).getAbsolutePath())); 526 | 527 | collector.collect(srcFileAbsPath, fileInJobOutputDir); 528 | reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1); 529 | 530 | reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1); 531 | expectLastCall().times(Integer.parseInt(crushInput.substring(4))); 532 | } 533 | 534 | /** 535 | * Every file in this unit test is named "file" followed by a number. This method will create a sequence file with as many lines 536 | * as the number in the file name. The keys in the file will count from one to the number. The values in the file will count 537 | * from 100n + 1 to 100n + n. This way each file will have distinct contents so long as no two files have the same name. 
538 | */ 539 | private Text writeFile(File srcDir, String fileName, Format format) throws IOException { 540 | 541 | int fileNum = Integer.parseInt(fileName.substring(4)); 542 | 543 | File file = new File(srcDir, fileName); 544 | 545 | if (Format.TEXT == format) { 546 | PrintWriter writer = new PrintWriter(new FileOutputStream(file)); 547 | 548 | for (int k = 1, v = 100 * fileNum + 1; k <= fileNum; k++, v++) { 549 | writer.printf("%d\t%d\n", k, v); 550 | } 551 | 552 | writer.close(); 553 | } else { 554 | CustomWritable key = new CustomWritable(); 555 | CustomWritable value = new CustomWritable(); 556 | 557 | DefaultCodec codec = new DefaultCodec(); 558 | codec.setConf(job); 559 | 560 | Writer writer = SequenceFile.createWriter(fs, job, new Path(file.getAbsolutePath()), CustomWritable.class, 561 | CustomWritable.class, compressionType, codec); 562 | 563 | for (int k = 1, v = 100 * fileNum + 1; k <= fileNum; k++, v++) { 564 | key.set(k); 565 | value.set(v); 566 | 567 | writer.append(key, value); 568 | } 569 | 570 | writer.close(); 571 | } 572 | 573 | return new Text(file.getAbsolutePath()); 574 | } 575 | 576 | private enum Format { 577 | TEXT, SEQUENCE 578 | } 579 | 580 | /** 581 | * This only exists to prove that the reducer can read and write custom writables about which it has no a priori knowledge. 582 | */ 583 | public static class CustomWritable extends LongWritable { 584 | } 585 | 586 | /** 587 | * This only exists to prove that the reducer can use custom codecs. 588 | */ 589 | public static class CustomCompressionCodec extends DefaultCodec { 590 | public CustomCompressionCodec() { 591 | super(); 592 | } 593 | 594 | @Override 595 | public String getDefaultExtension() { 596 | return ".custom"; 597 | } 598 | } 599 | } 600 | -------------------------------------------------------------------------------- /src/test/java/com/m6d/filecrush/crush/CrushTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011 m6d.com 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.m6d.filecrush.crush; 17 | 18 | import static java.lang.String.format; 19 | import static java.lang.System.currentTimeMillis; 20 | import static org.hamcrest.Matchers.equalTo; 21 | import static org.junit.Assert.assertThat; 22 | import static org.junit.Assert.fail; 23 | 24 | import java.io.DataInputStream; 25 | import java.io.File; 26 | import java.io.FileOutputStream; 27 | import java.io.IOException; 28 | import java.net.URI; 29 | import java.util.ArrayList; 30 | import java.util.Arrays; 31 | import java.util.Collections; 32 | import java.util.HashMap; 33 | import java.util.List; 34 | import java.util.Map; 35 | 36 | import org.apache.commons.cli.UnrecognizedOptionException; 37 | import org.apache.hadoop.conf.Configuration; 38 | import org.apache.hadoop.fs.BlockLocation; 39 | import org.apache.hadoop.fs.ContentSummary; 40 | import org.apache.hadoop.fs.FSDataInputStream; 41 | import org.apache.hadoop.fs.FSDataOutputStream; 42 | import org.apache.hadoop.fs.FileChecksum; 43 | import org.apache.hadoop.fs.FileStatus; 44 | import org.apache.hadoop.fs.FileSystem; 45 | import org.apache.hadoop.fs.Path; 46 | import org.apache.hadoop.fs.PathFilter; 47 | import org.apache.hadoop.fs.permission.FsPermission; 48 | import org.apache.hadoop.io.IntWritable; 49 | import org.apache.hadoop.io.SequenceFile.Reader; 50 | import org.apache.hadoop.io.Text; 51 | import org.apache.hadoop.mapred.Counters; 52 | import org.apache.hadoop.mapred.JobConf; 53 | import org.apache.hadoop.util.Progressable; 54 | import org.apache.hadoop.util.ToolRunner; 55 | import org.junit.After; 56 | import org.junit.Before; 57 | import org.junit.Rule; 58 | import org.junit.Test; 59 | import org.junit.rules.TemporaryFolder; 60 | 61 | import com.m6d.filecrush.crush.Crush; 62 | import com.m6d.filecrush.crush.MapperCounter; 63 | 64 | @SuppressWarnings("deprecation") 65 | public class CrushTest { 66 | @Rule 67 | public final TemporaryFolder tmp = new TemporaryFolder(); 68 | 69 | private JobConf job; 70 | 71 | private FileSystem fileSystem; 72 | 73 | private String javaIoTmpDir; 74 | 75 | @Before 76 | public void setupJob() throws IOException { 77 | job = new JobConf(false); 78 | 79 | job.set("fs.default.name", "file:///"); 80 | job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); 81 | job.setInt("mapred.reduce.tasks", 5); 82 | job.setLong("dfs.block.size", 50); 83 | 84 | FileSystem delegate = FileSystem.get(job); 85 | 86 | fileSystem = new SortingFileSystem(delegate); 87 | 88 | /* 89 | * Set the working directory so that all relative paths are rooted in the tmp dir. This will keep the file system clean of 90 | * temporary test files. 91 | */ 92 | FileSystem.get(job).setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath())); 93 | } 94 | 95 | @Before 96 | public void setJavaIoTmpDir() { 97 | javaIoTmpDir = System.setProperty("java.io.tmpdir", tmp.getRoot().getAbsolutePath()); 98 | } 99 | 100 | @After 101 | public void restoreJavaIoTmpDir() { 102 | System.setProperty("java.io.tmpdir", javaIoTmpDir); 103 | } 104 | 105 | private void run(String... 
args) throws Exception { 106 | ToolRunner.run(job, new Crush(), args); 107 | } 108 | 109 | @Test 110 | public void backwardsCompatibleInvocationBadSrcDir() throws Exception { 111 | try { 112 | run("does-not-exist", tmp.getRoot().getAbsolutePath(), "80"); 113 | fail(); 114 | } catch (IOException e) { 115 | if (!e.getMessage().contains("does-not-exist")) { 116 | throw e; 117 | } 118 | } 119 | } 120 | 121 | @Test 122 | public void backwardsCompatibleInvocationBadNumberOfTasks() throws Exception { 123 | try { 124 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "not a number"); 125 | fail(); 126 | } catch (NumberFormatException e) { 127 | if (!e.getMessage().contains("not a number")) { 128 | throw e; 129 | } 130 | } 131 | } 132 | 133 | @Test 134 | public void backwardsCompatibleInvocationNegativeTasks() throws Exception { 135 | try { 136 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "-1"); 137 | fail(); 138 | } catch (UnrecognizedOptionException e) { 139 | if (!e.getMessage().contains("-1")) { 140 | throw e; 141 | } 142 | } 143 | } 144 | 145 | @Test 146 | public void backwardsCompatibleInvocationZeroTasks() throws Exception { 147 | try { 148 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "0"); 149 | fail(); 150 | } catch (IllegalArgumentException e) { 151 | if (!e.getMessage().contains("0")) { 152 | throw e; 153 | } 154 | } 155 | } 156 | 157 | @Test 158 | public void backwardsCompatibleInvocationHugeTasks() throws Exception { 159 | try { 160 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "4001"); 161 | fail(); 162 | } catch (IllegalArgumentException e) { 163 | if (!e.getMessage().contains("4001")) { 164 | throw e; 165 | } 166 | } 167 | } 168 | 169 | @Test 170 | public void backwardsCompatibleInvocationBadSrcDirWithType() throws Exception { 171 | try { 172 | run("does-not-exist", tmp.getRoot().getAbsolutePath(), "80", "TEXT"); 173 | fail(); 174 | } catch (IOException e) { 175 | if (!e.getMessage().contains("does-not-exist")) { 176 | throw e; 177 | } 178 | } 179 | } 180 | 181 | @Test 182 | public void backwardsCompatibleInvocationBadNumberOfTasksWithType() throws Exception { 183 | try { 184 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "not a number", "TEXT"); 185 | fail(); 186 | } catch (NumberFormatException e) { 187 | if (!e.getMessage().contains("not a number")) { 188 | throw e; 189 | } 190 | } 191 | } 192 | 193 | @Test 194 | public void backwardsCompatibleInvocationNegativeTasksWithType() throws Exception { 195 | try { 196 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "-1", "TEXT"); 197 | fail(); 198 | } catch (UnrecognizedOptionException e) { 199 | if (!e.getMessage().contains("-1")) { 200 | throw e; 201 | } 202 | } 203 | } 204 | 205 | @Test 206 | public void backwardsCompatibleInvocationZeroTasksWithType() throws Exception { 207 | try { 208 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "0", "TEXT"); 209 | fail(); 210 | } catch (IllegalArgumentException e) { 211 | if (!e.getMessage().contains("0")) { 212 | throw e; 213 | } 214 | } 215 | } 216 | 217 | @Test 218 | public void backwardsCompatibleInvocationHugeHugeTasksWithType() throws Exception { 219 | try { 220 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "4001", "TEXT"); 221 | fail(); 222 | } catch (IllegalArgumentException e) { 223 | if 
(!e.getMessage().contains("4001")) { 224 | throw e; 225 | } 226 | } 227 | } 228 | 229 | @Test 230 | public void backwardsCompatibleInvocationBadType() throws Exception { 231 | try { 232 | run(tmp.newFolder("in").getAbsolutePath(), tmp.newFolder("out").getAbsolutePath(), "80", "NEITHER_TEXT_OR_SEQUENCE"); 233 | fail(); 234 | } catch (IllegalArgumentException e) { 235 | if (!e.getMessage().contains("NEITHER_TEXT_OR_SEQUENCE")) { 236 | throw e; 237 | } 238 | } 239 | } 240 | 241 | @Test 242 | public void invocationBadSrcDir() throws Exception { 243 | try { 244 | run("--threshold=0.9", "does-not-exist", tmp.getRoot().getAbsolutePath(), "20101116123015"); 245 | fail(); 246 | } catch (IOException e) { 247 | if (!e.getMessage().contains("does-not-exist")) { 248 | throw e; 249 | } 250 | } 251 | } 252 | 253 | @Test 254 | public void invocationBadTimestamp() throws Exception { 255 | try { 256 | run("--threshold=0.9", tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "not a number"); 257 | fail(); 258 | } catch (IllegalArgumentException e) { 259 | if (!e.getMessage().contains("not a number")) { 260 | throw e; 261 | } 262 | } 263 | } 264 | 265 | @Test 266 | public void invocationShortTimestamp() throws Exception { 267 | try { 268 | run(tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "2010111612301"); 269 | fail(); 270 | } catch (IllegalArgumentException e) { 271 | if (!e.getMessage().contains("2010111612301")) { 272 | throw e; 273 | } 274 | } 275 | } 276 | 277 | @Test 278 | public void invocationLongTimestamp() throws Exception { 279 | try { 280 | run("--threshold=0.5", tmp.newFolder("in").getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "201011161230150"); 281 | fail(); 282 | } catch (IllegalArgumentException e) { 283 | if (!e.getMessage().contains("201011161230150")) { 284 | throw e; 285 | } 286 | } 287 | } 288 | 289 | @Test 290 | public void dirWithNoMatchingRegex() throws Exception { 291 | /* 292 | * Create a non-empty directory. 293 | */ 294 | File src = tmp.newFolder("src"); 295 | tmp.newFolder("src/foo"); 296 | tmp.newFile("src/foo/file"); 297 | 298 | try { 299 | run("--regex", ".+/in", 300 | "--replacement", "foo", 301 | "--input-format", "org.apache.hadoop.mapred.TextInputFormat", 302 | "--output-format", "org.apache.hadoop.mapred.TextOutputFormat", 303 | "--threshold", "0.5", 304 | "--max-file-blocks", "100", 305 | src.getAbsolutePath(), "out", "20101116123015"); 306 | 307 | fail(); 308 | } catch (IllegalArgumentException e) { 309 | if (!e.getMessage().contains("src/foo")) { 310 | throw e; 311 | } 312 | } 313 | } 314 | 315 | @Test 316 | public void bucketing() throws Exception { 317 | File in = tmp.newFolder("in"); 318 | 319 | Counters expectedCounters = new Counters(); 320 | List expectedBucketFiles = new ArrayList(); 321 | 322 | /* 323 | * Create a hierarchy of directories. Directories are distinguished by a trailing slash in these comments. 
324 | * 325 | * 1/ 326 | * 1.1/ 327 | * file1 10 bytes 328 | * file2 20 bytes 329 | * file3 30 bytes 330 | * file4 41 bytes 331 | * file5 15 bytes 332 | * file6 30 bytes 333 | * file7 20 bytes 334 | * 1.2/ 335 | * file1 20 bytes 336 | * file2 10 bytes 337 | * 1.3/ 338 | * 2/ 339 | * file1 70 bytes 340 | * file2 30 bytes 341 | * file3 25 bytes 342 | * file4 30 bytes 343 | * file5 35 bytes 344 | * 2.1/ 345 | * file1 10 bytes 346 | * 2.2/ 347 | * file1 25 bytes 348 | * file2 15 bytes 349 | * file3 35 bytes 350 | * 2.3/ 351 | * file1 41 bytes 352 | * file2 10 bytes 353 | * 2.4/ 354 | * 2.4.1/ 355 | * file1 100 bytes 356 | * file2 30 bytes 357 | * 2.4.2/ 358 | * file1 20 bytes 359 | * file2 20 bytes 360 | * file3 10 bytes 361 | */ 362 | 363 | /* 364 | * in contains 2 dirs and no files so it is skipped. 365 | * 366 | * in/ 367 | * 1/ 368 | * 2/ 369 | */ 370 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 371 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 372 | 373 | tmp.newFolder("in/1"); 374 | File dir2 = tmp.newFolder("in/2"); 375 | 376 | 377 | /* 378 | * in/1 contains three dirs and no files so it is skipped. 379 | * 380 | * in/ 381 | * 1/ 382 | * 1.1/ 383 | * 1.2/ 384 | * 1.3/ 385 | */ 386 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 387 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 388 | 389 | File dir1_1 = tmp.newFolder("in/1/1.1"); 390 | File dir1_2 = tmp.newFolder("in/1/1.2"); 391 | tmp.newFolder("in/1/1.3"); 392 | 393 | 394 | /* 395 | * in/2 contains five files and four dirs. 396 | * 397 | * in/ 398 | * 2/ 399 | * file1 70 bytes 400 | * file2 30 bytes 401 | * file3 25 bytes 402 | * file4 30 bytes 403 | * file5 35 bytes 404 | * 2.1/ 405 | * 2.2/ 406 | * 2.3/ 407 | * 2.4/ 408 | * 409 | * 0 1 2 410 | * file5 35 file2 30 file4 30 411 | * file3 25 412 | * 413 | * Buckets 0 and 2 have a single file each so they are ignored. 414 | */ 415 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 416 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 417 | 418 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 5); 419 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 420 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 3); 421 | 422 | File dir2_1 = tmp.newFolder("in/2/2.1"); 423 | File dir2_2 = tmp.newFolder("in/2/2.2"); 424 | File dir2_3 = tmp.newFolder("in/2/2.3"); 425 | tmp.newFolder("in/2/2.4"); 426 | 427 | createFile(dir2, "file1", 70); 428 | createFile(dir2, "file2", 30); 429 | createFile(dir2, "file3", 25); 430 | createFile(dir2, "file4", 30); 431 | createFile(dir2, "file5", 35); 432 | 433 | expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file2").getAbsolutePath())); 434 | expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file3").getAbsolutePath())); 435 | 436 | 437 | /* 438 | * in/1/1.1 contains seven files and no dirs. 439 | * 440 | * in/ 441 | * 1/ 442 | * 1.1/ 443 | * file1 10 bytes 444 | * file2 20 bytes 445 | * file3 30 bytes 446 | * file4 41 bytes 447 | * file5 15 bytes 448 | * file6 30 bytes 449 | * file7 20 bytes 450 | * 451 | * 0 1 2 452 | * file3 30 file6 30 file2 20 453 | * file5 15 file1 10 file7 20 454 | * 455 | * file4 is > 50 * 0.8 so it is ignored. 
456 | */ 457 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 458 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 459 | 460 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 7); 461 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 6); 462 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 463 | 464 | createFile(dir1_1, "file1", 10); 465 | createFile(dir1_1, "file2", 20); 466 | createFile(dir1_1, "file3", 30); 467 | createFile(dir1_1, "file4", 41); 468 | createFile(dir1_1, "file5", 15); 469 | createFile(dir1_1, "file6", 30); 470 | createFile(dir1_1, "file7", 20); 471 | 472 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file3").getAbsolutePath())); 473 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file5").getAbsolutePath())); 474 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file6").getAbsolutePath())); 475 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file1").getAbsolutePath())); 476 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file2").getAbsolutePath())); 477 | expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file7").getAbsolutePath())); 478 | 479 | 480 | /* 481 | * in/1/1.2 contains two files. 482 | * 483 | * in/ 484 | * 1/ 485 | * 1.2/ 486 | * file1 20 bytes 487 | * file2 10 bytes 488 | */ 489 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 490 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 491 | 492 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 493 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 494 | 495 | createFile(dir1_2, "file1", 20); 496 | createFile(dir1_2, "file2", 10); 497 | 498 | expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file1").getAbsolutePath())); 499 | expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file2").getAbsolutePath())); 500 | 501 | 502 | /* 503 | * in/1/1.3 is empty. 504 | * 505 | * in/ 506 | * 1/ 507 | * 1.3/ 508 | */ 509 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 510 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 511 | 512 | tmp.newFolder("in/1/1.3"); 513 | 514 | 515 | /* 516 | * in/2/2.1 contains one file. 517 | * 518 | * in/ 519 | * 2/ 520 | * 2.1/ 521 | * file1 10 bytes 522 | * 523 | * Single file dirs are ignored. 524 | */ 525 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 526 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 527 | 528 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 1); 529 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 530 | 531 | createFile(dir2_1, "file1", 10); 532 | 533 | 534 | /* 535 | * in/2/2.2 contains three files. 536 | * 537 | * in/ 538 | * 2/ 539 | * 2.2/ 540 | * file1 25 bytes 541 | * file2 15 bytes 542 | * file3 35 bytes 543 | * 544 | * 0 1 545 | * file3 35 file1 25 546 | * file2 15 547 | * 548 | * Bucket 0 with a single file is ignored.
549 | */ 550 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 551 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 552 | 553 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); 554 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); 555 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); 556 | 557 | createFile(dir2_2, "file1", 25); 558 | createFile(dir2_2, "file2", 15); 559 | createFile(dir2_2, "file3", 35); 560 | 561 | expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file1").getAbsolutePath())); 562 | expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file2").getAbsolutePath())); 563 | 564 | 565 | /* 566 | * in/2/2.3 contains 2 files. 567 | * 568 | * in/ 569 | * 2/ 570 | * 2.3/ 571 | * file1 41 bytes 572 | * file2 10 bytes 573 | * 574 | * file1 is too big, leaving file2 as a single file, which is also ignored. 575 | */ 576 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 577 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 578 | 579 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 580 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); 581 | 582 | createFile(dir2_3, "file1", 41); 583 | createFile(dir2_3, "file2", 10); 584 | 585 | 586 | /* 587 | * in/2/2.4 contains two subdirectories and no files. 588 | * 589 | * in/ 590 | * 2/ 591 | * 2.4/ 592 | * 2.4.1/ 593 | * 2.4.2/ 594 | */ 595 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 596 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 597 | 598 | tmp.newFolder("in/2/2.4"); 599 | 600 | File dir2_4_1 = tmp.newFolder("in/2/2.4/2.4.1"); 601 | File dir2_4_2 = tmp.newFolder("in/2/2.4/2.4.2"); 602 | 603 | 604 | /* 605 | * in/ 606 | * 2/ 607 | * 2.4/ 608 | * 2.4.1/ 609 | * file1 100 bytes 610 | * file2 30 bytes 611 | */ 612 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 613 | expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); 614 | 615 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); 616 | expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); 617 | 618 | createFile(dir2_4_1, "file1", 100); 619 | createFile(dir2_4_1, "file2", 30); 620 | 621 | 622 | /* 623 | * in/ 624 | * 2/ 625 | * 2.4/ 626 | * 2.4.2/ 627 | * file1 20 bytes 628 | * file2 20 bytes 629 | * file3 10 bytes 630 | * 0 631 | * file1 20 632 | * file2 20 633 | * file3 10 634 | */ 635 | expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); 636 | expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); 637 | 638 | expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); 639 | expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 3); 640 | 641 | createFile(dir2_4_2, "file1", 20); 642 | createFile(dir2_4_2, "file2", 20); 643 | createFile(dir2_4_2, "file3", 10); 644 | 645 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file1").getAbsolutePath())); 646 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file2").getAbsolutePath())); 647 | expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file3").getAbsolutePath())); 648 | 649 | 650 | Crush crush = new Crush(); 651 | 652 | crush.setConf(job); 653 | crush.setFileSystem(fileSystem); 654 | 655 | /* 656 | * Call these in the same order that run() does.
657 | */ 658 | crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730"); 659 | crush.writeDirs(); 660 | 661 | 662 | /* 663 | * Verify bucket contents. 664 | */ 665 | 666 | List actualBucketFiles = new ArrayList(); 667 | 668 | Text key = new Text(); 669 | Text value = new Text(); 670 | 671 | Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job); 672 | 673 | while(reader.next(key, value)) { 674 | actualBucketFiles.add(format("%s\t%s", key, value)); 675 | } 676 | 677 | reader.close(); 678 | 679 | Collections.sort(expectedBucketFiles); 680 | Collections.sort(actualBucketFiles); 681 | 682 | assertThat(actualBucketFiles, equalTo(expectedBucketFiles)); 683 | 684 | /* 685 | * Verify the partition map. 686 | */ 687 | Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job); 688 | 689 | IntWritable partNum = new IntWritable(); 690 | 691 | Map actualPartitions = new HashMap(); 692 | 693 | while (partitionMapReader.next(key, partNum)) { 694 | actualPartitions.put(key.toString(), partNum.get()); 695 | } 696 | 697 | partitionMapReader.close(); 698 | 699 | /* 700 | * These crush files need to be allocated into 5 partitions: 701 | * 702 | * in/2-1 55 bytes 703 | * in/1/1.1-0 45 bytes 704 | * in/1/1.1-2 40 bytes 705 | * in/1/1.1-1 40 bytes 706 | * in/1/1.2-0 30 bytes 707 | * in/2/2.2-1 40 bytes 708 | * in/2/2.4/2.4.2-0 50 bytes 709 | * 710 | * 0 1 2 3 4 711 | * in/2-1 55 in/2/2.4/2.4.2-0 50 in/1/1.1-0 45 in/1/1.1-2 40 in/1/1.1-1 40 712 | * in/2/2.2-1 40 in/1/1.2-0 30 713 | */ 714 | Map expectedPartitions = new HashMap(); 715 | 716 | //TODO: this may not be deterministic due to jvm/hashmap/filesystem 717 | expectedPartitions.put(dir2.getAbsolutePath() + "-1", 0); 718 | expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0", 1); 719 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-0", 2); 720 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-2", 4); 721 | expectedPartitions.put(dir2_2.getAbsolutePath() + "-1", 3); 722 | expectedPartitions.put(dir1_1.getAbsolutePath() + "-1", 3); 723 | expectedPartitions.put(dir1_2.getAbsolutePath() + "-0", 4); 724 | 725 | assertThat(actualPartitions, equalTo(expectedPartitions)); 726 | 727 | 728 | /* 729 | * Verify counters. 730 | */ 731 | Counters actualCounters = new Counters(); 732 | 733 | DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters()); 734 | 735 | actualCounters.readFields(countersStream); 736 | 737 | countersStream.close(); 738 | 739 | assertThat(actualCounters, equalTo(expectedCounters)); 740 | } 741 | 742 | /** 743 | * Returns a qualified file status, just like {@link FileSystem#listStatus(Path)} does. 744 | */ 745 | private static FileStatus createFile(File dir, String fileName, int size) { 746 | File file = new File(dir, fileName); 747 | 748 | try { 749 | FileOutputStream os = new FileOutputStream(file); 750 | 751 | os.write(new byte[size]); 752 | 753 | os.close(); 754 | } catch (IOException e) { 755 | throw new RuntimeException(e); 756 | } 757 | 758 | return new FileStatus(size, false, 3, 1024 * 1024 * 60, currentTimeMillis(), new Path("hdfs://hostname.pvt:12345" + file.getAbsolutePath())); 759 | } 760 | 761 | /** 762 | * This exists only so we can impose a specific order on the files that are listed.
763 | */ 764 | private static class SortingFileSystem extends FileSystem { 765 | 766 | private final FileSystem delegate; 767 | 768 | public SortingFileSystem(FileSystem delegate) { 769 | super(); 770 | 771 | this.delegate = delegate; 772 | } 773 | 774 | @Override 775 | public FileStatus[] listStatus(Path arg0) throws IOException { 776 | FileStatus[] contents = delegate.listStatus(arg0); 777 | 778 | Arrays.sort(contents); 779 | 780 | return contents; 781 | } 782 | 783 | @Override 784 | public FSDataOutputStream append(Path arg0, int arg1, Progressable arg2) throws IOException { 785 | return delegate.append(arg0, arg1, arg2); 786 | } 787 | 788 | @Override 789 | public FSDataOutputStream append(Path f, int bufferSize) throws IOException { 790 | return delegate.append(f, bufferSize); 791 | } 792 | 793 | @Override 794 | public FSDataOutputStream append(Path f) throws IOException { 795 | return delegate.append(f); 796 | } 797 | 798 | @Override 799 | public void close() throws IOException { 800 | delegate.close(); 801 | } 802 | 803 | @Override 804 | public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { 805 | delegate.completeLocalOutput(fsOutputFile, tmpLocalFile); 806 | } 807 | 808 | @Override 809 | public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) throws IOException { 810 | delegate.copyFromLocalFile(delSrc, overwrite, src, dst); 811 | } 812 | 813 | @Override 814 | public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) throws IOException { 815 | delegate.copyFromLocalFile(delSrc, overwrite, srcs, dst); 816 | } 817 | 818 | @Override 819 | public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { 820 | delegate.copyFromLocalFile(delSrc, src, dst); 821 | } 822 | 823 | @Override 824 | public void copyFromLocalFile(Path src, Path dst) throws IOException { 825 | delegate.copyFromLocalFile(src, dst); 826 | } 827 | 828 | @Override 829 | public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { 830 | delegate.copyToLocalFile(delSrc, src, dst); 831 | } 832 | 833 | @Override 834 | public void copyToLocalFile(Path src, Path dst) throws IOException { 835 | delegate.copyToLocalFile(src, dst); 836 | } 837 | 838 | @Override 839 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress) throws IOException { 840 | return delegate.create(f, overwrite, bufferSize, progress); 841 | } 842 | 843 | @Override 844 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, 845 | Progressable progress) throws IOException { 846 | return delegate.create(f, overwrite, bufferSize, replication, blockSize, progress); 847 | } 848 | 849 | @Override 850 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) 851 | throws IOException { 852 | return delegate.create(f, overwrite, bufferSize, replication, blockSize); 853 | } 854 | 855 | @Override 856 | public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException { 857 | return delegate.create(f, overwrite, bufferSize); 858 | } 859 | 860 | @Override 861 | public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { 862 | return delegate.create(f, overwrite); 863 | } 864 | 865 | @Override 866 | public FSDataOutputStream create(Path arg0, FsPermission arg1, boolean arg2, int arg3, short arg4, long arg5, 867 | Progressable 
arg6) throws IOException { 868 | return delegate.create(arg0, arg1, arg2, arg3, arg4, arg5, arg6); 869 | } 870 | 871 | @Override 872 | public FSDataOutputStream create(Path f, Progressable progress) throws IOException { 873 | return delegate.create(f, progress); 874 | } 875 | 876 | @Override 877 | public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { 878 | return delegate.create(f, replication, progress); 879 | } 880 | 881 | @Override 882 | public FSDataOutputStream create(Path f, short replication) throws IOException { 883 | return delegate.create(f, replication); 884 | } 885 | 886 | @Override 887 | public FSDataOutputStream create(Path f) throws IOException { 888 | return delegate.create(f); 889 | } 890 | 891 | @Override 892 | public boolean createNewFile(Path f) throws IOException { 893 | return delegate.createNewFile(f); 894 | } 895 | 896 | @Override 897 | public boolean delete(Path arg0, boolean arg1) throws IOException { 898 | return delegate.delete(arg0, arg1); 899 | } 900 | 901 | @Override 902 | public boolean delete(Path arg0) throws IOException { 903 | return delegate.delete(arg0); 904 | } 905 | 906 | @Override 907 | public boolean deleteOnExit(Path f) throws IOException { 908 | return delegate.deleteOnExit(f); 909 | } 910 | 911 | @Override 912 | public boolean equals(Object obj) { 913 | return delegate.equals(obj); 914 | } 915 | 916 | @Override 917 | public boolean exists(Path arg0) throws IOException { 918 | return delegate.exists(arg0); 919 | } 920 | 921 | @Override 922 | public long getBlockSize(Path f) throws IOException { 923 | return delegate.getBlockSize(f); 924 | } 925 | 926 | @Override 927 | public Configuration getConf() { 928 | return delegate.getConf(); 929 | } 930 | 931 | @Override 932 | public ContentSummary getContentSummary(Path arg0) throws IOException { 933 | return delegate.getContentSummary(arg0); 934 | } 935 | 936 | @Override 937 | public long getDefaultBlockSize() { 938 | return delegate.getDefaultBlockSize(); 939 | } 940 | 941 | @Override 942 | public short getDefaultReplication() { 943 | return delegate.getDefaultReplication(); 944 | } 945 | 946 | @Override 947 | public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException { 948 | return delegate.getFileBlockLocations(file, start, len); 949 | } 950 | 951 | @Override 952 | public FileChecksum getFileChecksum(Path f) throws IOException { 953 | return delegate.getFileChecksum(f); 954 | } 955 | 956 | @Override 957 | public FileStatus getFileStatus(Path arg0) throws IOException { 958 | return delegate.getFileStatus(arg0); 959 | } 960 | 961 | @Override 962 | public Path getHomeDirectory() { 963 | return delegate.getHomeDirectory(); 964 | } 965 | 966 | @Override 967 | public long getLength(Path f) throws IOException { 968 | return delegate.getLength(f); 969 | } 970 | 971 | @Override 972 | public String getName() { 973 | return delegate.getName(); 974 | } 975 | 976 | @Override 977 | public short getReplication(Path src) throws IOException { 978 | return delegate.getReplication(src); 979 | } 980 | 981 | @Override 982 | public URI getUri() { 983 | return delegate.getUri(); 984 | } 985 | 986 | @Override 987 | public long getUsed() throws IOException { 988 | return delegate.getUsed(); 989 | } 990 | 991 | @Override 992 | public Path getWorkingDirectory() { 993 | return delegate.getWorkingDirectory(); 994 | } 995 | 996 | @Override 997 | public FileStatus[] globStatus(Path arg0, PathFilter arg1) throws IOException { 998 | return 
delegate.globStatus(arg0, arg1); 999 | } 1000 | 1001 | @Override 1002 | public FileStatus[] globStatus(Path pathPattern) throws IOException { 1003 | return delegate.globStatus(pathPattern); 1004 | } 1005 | 1006 | @Override 1007 | public int hashCode() { 1008 | return delegate.hashCode(); 1009 | } 1010 | 1011 | @Override 1012 | public void initialize(URI name, Configuration conf) throws IOException { 1013 | delegate.initialize(name, conf); 1014 | } 1015 | 1016 | @Override 1017 | public boolean isDirectory(Path arg0) throws IOException { 1018 | return delegate.isDirectory(arg0); 1019 | } 1020 | 1021 | @Override 1022 | public boolean isFile(Path arg0) throws IOException { 1023 | return delegate.isFile(arg0); 1024 | } 1025 | 1026 | @Override 1027 | public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException { 1028 | return delegate.listStatus(f, filter); 1029 | } 1030 | 1031 | @Override 1032 | public FileStatus[] listStatus(Path[] arg0, PathFilter arg1) throws IOException { 1033 | return delegate.listStatus(arg0, arg1); 1034 | } 1035 | 1036 | @Override 1037 | public FileStatus[] listStatus(Path[] files) throws IOException { 1038 | return delegate.listStatus(files); 1039 | } 1040 | 1041 | @Override 1042 | public Path makeQualified(Path path) { 1043 | return delegate.makeQualified(path); 1044 | } 1045 | 1046 | @Override 1047 | public boolean mkdirs(Path arg0, FsPermission arg1) throws IOException { 1048 | return delegate.mkdirs(arg0, arg1); 1049 | } 1050 | 1051 | @Override 1052 | public boolean mkdirs(Path f) throws IOException { 1053 | return delegate.mkdirs(f); 1054 | } 1055 | 1056 | @Override 1057 | public void moveFromLocalFile(Path src, Path dst) throws IOException { 1058 | delegate.moveFromLocalFile(src, dst); 1059 | } 1060 | 1061 | @Override 1062 | public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { 1063 | delegate.moveFromLocalFile(srcs, dst); 1064 | } 1065 | 1066 | @Override 1067 | public void moveToLocalFile(Path src, Path dst) throws IOException { 1068 | delegate.moveToLocalFile(src, dst); 1069 | } 1070 | 1071 | @Override 1072 | public FSDataInputStream open(Path arg0, int arg1) throws IOException { 1073 | return delegate.open(arg0, arg1); 1074 | } 1075 | 1076 | @Override 1077 | public FSDataInputStream open(Path f) throws IOException { 1078 | return delegate.open(f); 1079 | } 1080 | 1081 | @Override 1082 | public boolean rename(Path arg0, Path arg1) throws IOException { 1083 | return delegate.rename(arg0, arg1); 1084 | } 1085 | 1086 | @Override 1087 | public void setConf(Configuration conf) { 1088 | if (null != delegate) { 1089 | delegate.setConf(conf); 1090 | } 1091 | } 1092 | 1093 | @Override 1094 | public void setOwner(Path p, String username, String groupname) throws IOException { 1095 | delegate.setOwner(p, username, groupname); 1096 | } 1097 | 1098 | @Override 1099 | public void setPermission(Path p, FsPermission permission) throws IOException { 1100 | delegate.setPermission(p, permission); 1101 | } 1102 | 1103 | @Override 1104 | public boolean setReplication(Path src, short replication) throws IOException { 1105 | return delegate.setReplication(src, replication); 1106 | } 1107 | 1108 | @Override 1109 | public void setTimes(Path p, long mtime, long atime) throws IOException { 1110 | delegate.setTimes(p, mtime, atime); 1111 | } 1112 | 1113 | @Override 1114 | public void setVerifyChecksum(boolean verifyChecksum) { 1115 | delegate.setVerifyChecksum(verifyChecksum); 1116 | } 1117 | 1118 | @Override 1119 | public void setWorkingDirectory(Path 
arg0) { 1120 | delegate.setWorkingDirectory(arg0); 1121 | } 1122 | 1123 | @Override 1124 | public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException { 1125 | return delegate.startLocalOutput(fsOutputFile, tmpLocalFile); 1126 | } 1127 | 1128 | @Override 1129 | public String toString() { 1130 | return delegate.toString(); 1131 | } 1132 | 1133 | 1134 | } 1135 | } 1136 | --------------------------------------------------------------------------------
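
The tests above drive Crush through ToolRunner with options first, followed by the source directory, the output directory, and a yyyyMMddHHmmss timestamp. The snippet below is a minimal sketch of that same invocation pattern lifted out of CrushTest; the input/output paths, the option values, and the wrapper class name are illustrative assumptions, not part of the project.

// A minimal sketch, assuming the argument conventions exercised in CrushTest:
// options first, then source dir, output dir, and a yyyyMMddHHmmss timestamp.
// Paths and option values here are placeholders.
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;

import com.m6d.filecrush.crush.Crush;

public class CrushInvocationSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // ToolRunner parses generic Hadoop options and passes the rest to Crush.
    int exitCode = ToolRunner.run(job, new Crush(), new String[] {
        "--input-format", "org.apache.hadoop.mapred.TextInputFormat",
        "--output-format", "org.apache.hadoop.mapred.TextOutputFormat",
        "--threshold", "0.5",
        "--max-file-blocks", "100",
        "/user/example/in", "/user/example/out", "20101116123015"
    });

    System.exit(exitCode);
  }
}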