├── .classpath
├── .gitignore
├── .project
├── README.md
└── src
    └── com
        └── mindpin
            └── rsync
                ├── ChunkParser.java
                ├── PatchApply.java
                ├── PatchMaker.java
                ├── checksum
                │   ├── Adler32Util.java
                │   └── MD5Util.java
                ├── chunk
                │   └── Chunk.java
                └── patch
                    ├── Patch.java
                    ├── PatchPartChunk.java
                    ├── PatchPartData.java
                    └── i
                        └── PatchPart.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Package Files #
4 | *.jar
5 | *.war
6 | *.ear
7 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | java_binary_diff
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | java_binary_diff
2 | ================
3 |
4 | 基于java实现的,以rsync算法原理为基础的二进制文件差异比较处理
5 |
6 | 2012-7-8:
7 | 实现了基础的算法逻辑,主要包括以下三个逻辑:
8 | 根据 src_file 计算 chunk_map;
9 | 根据 chunk_map 和 target_file 生成 patch;
10 | 根据 src_file 和 patch 生成与 target_file 一样的 result_file;
11 |
12 | 实现了将 patch 通过 base64 编码保存到磁盘文本文件
13 |
14 | TODO:
15 | 将 patch 文件压缩保存
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/ChunkParser.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.RandomAccessFile;
6 | import java.security.NoSuchAlgorithmException;
7 | import java.util.ArrayList;
8 | import java.util.Date;
9 | import java.util.HashMap;
10 | import java.util.List;
11 |
12 | import com.mindpin.rsync.checksum.Adler32Util;
13 | import com.mindpin.rsync.checksum.MD5Util;
14 | import com.mindpin.rsync.chunk.Chunk;
15 |
16 |
17 | public class ChunkParser {
18 |
19 | final public static int CHUNK_LENGTH = 512;
20 | File in_file;
21 | HashMap> chunk_map;
22 |
23 | public ChunkParser(File in_file){
24 | this.in_file = in_file;
25 | this.chunk_map = new HashMap>();
26 | }
27 |
28 | public HashMap> parse() throws IOException, NoSuchAlgorithmException{
29 | long file_length = in_file.length();
30 |
31 | RandomAccessFile raf = new RandomAccessFile(in_file, "r");
32 | // FileInputStream fs = new FileInputStream(in_file);
33 |
34 | int sum = 0;
35 | int id = 0;
36 | long remained_length = file_length;
37 | int bytes_len;
38 |
39 | byte[] bytes;
40 | long cs32;
41 | byte[] md5;
42 |
43 | while(remained_length > 0){
44 | bytes_len = (remained_length < CHUNK_LENGTH) ? (int)remained_length : CHUNK_LENGTH;
45 |
46 | bytes = new byte[bytes_len];
47 | raf.read(bytes);
48 |
49 | cs32 = Adler32Util.checksum(bytes);
50 | md5 = MD5Util.get_md5(bytes);
51 | add(cs32, new Chunk(id, md5, bytes));
52 |
53 | remained_length -= bytes_len;
54 |
55 | sum += bytes_len;
56 | id ++;
57 | }
58 |
59 | raf.close();
60 |
61 | System.out.println("读取原始文件总字节数: " + sum);
62 | return chunk_map;
63 | }
64 |
65 | public void add(long cs32, Chunk chunk){
66 | if(chunk_map.containsKey(cs32)){
67 | chunk_map.get(cs32).add(chunk);
68 | }else{
69 | List list = new ArrayList();
70 | list.add(chunk);
71 | chunk_map.put(cs32, list);
72 | }
73 | }
74 |
75 |
76 | // 测试结果:
77 | //
78 | // 原始文件总字节数: 1049568
79 | // 耗时:0.093 秒
80 | // chunk数据字节数: 82000
81 | //
82 | // 原始文件总字节数: 200061022
83 | // 耗时:4.875 秒
84 | // chunk数据字节数: 15623920
85 | public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
86 | File file;
87 | long t1;
88 | long t2;
89 | HashMap> map;
90 |
91 | // 1MB 0.094 秒
92 | file = new File("d:/差分比较实验/差分比较实验1.rar");
93 | System.out.println("原始文件总字节数: " + file.length());
94 | t1 = new Date().getTime();
95 | map = new ChunkParser(file).parse();
96 | t2 = new Date().getTime();
97 | System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒");
98 | System.out.println("chunk数据字节数: " + map.size() * (8 + 32));
99 | // System.out.println(map);
100 |
101 | // 190MB 4.734 秒
102 | // file = new File("d:/差分比较实验/world1.zip");
103 | // System.out.println("原始文件总字节数: " + file.length());
104 | // t1 = new Date().getTime();
105 | // map = new ChunkParser(file).parse();
106 | // t2 = new Date().getTime();
107 | // System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒");
108 | // System.out.println("chunk数据字节数: " + map.size() * (8 + 32));
109 |
110 | // 1G 以上的文件会 out of memory
111 | // file = new File("D:/Download/乐可乐可2中文版@USP草帽.rar");
112 | // System.out.println("原始文件总字节数: " + file.length());
113 | // t1 = new Date().getTime();
114 | // map = new ChunkParser(file).parse();
115 | // t2 = new Date().getTime();
116 | // System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒");
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/PatchApply.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.RandomAccessFile;
6 | import java.security.NoSuchAlgorithmException;
7 | import java.util.HashMap;
8 | import java.util.List;
9 |
10 | import com.mindpin.rsync.chunk.Chunk;
11 | import com.mindpin.rsync.patch.Patch;
12 | import com.mindpin.rsync.patch.PatchPartChunk;
13 | import com.mindpin.rsync.patch.PatchPartData;
14 | import com.mindpin.rsync.patch.i.PatchPart;
15 |
16 | public class PatchApply {
17 |
18 | Patch patch;
19 | File src_file;
20 | File result_file;
21 |
22 | public PatchApply(File src_file, Patch patch, File result_file){
23 | this.patch = patch;
24 | this.src_file = src_file;
25 | this.result_file = result_file;
26 | }
27 |
28 | public void apply() throws IOException{
29 | result_file.delete();
30 |
31 | RandomAccessFile read_raf = new RandomAccessFile(src_file, "r");
32 | RandomAccessFile write_raf = new RandomAccessFile(result_file, "rw");
33 |
34 | long src_file_length = src_file.length();
35 |
36 | long sum = 0;
37 | for(PatchPart part : patch.parts){
38 | if(part instanceof PatchPartData){
39 | write_raf.write(((PatchPartData)part).bytes);
40 | sum += ((PatchPartData)part).bytes_size();
41 | }
42 |
43 | if(part instanceof PatchPartChunk){
44 | int off = ((PatchPartChunk)part).id * ChunkParser.CHUNK_LENGTH;
45 | long remained_length = src_file_length - off;
46 |
47 | int length = (remained_length < ChunkParser.CHUNK_LENGTH) ? (int)remained_length : ChunkParser.CHUNK_LENGTH;
48 |
49 | byte[] bytes = new byte[length];
50 |
51 | read_raf.seek(off);
52 | read_raf.read(bytes);
53 | write_raf.write(bytes);
54 |
55 | sum += length;
56 | }
57 | }
58 |
59 | write_raf.close();
60 | read_raf.close();
61 | System.out.println("写入结果文件字节数: " + sum);
62 | }
63 |
64 | public static void main(String[] args) throws NoSuchAlgorithmException, IOException {
65 | // File src_file = new File("d:/差分比较实验/差分比较实验1.rar");
66 | // File target_file = new File("d:/差分比较实验/差分比较实验2.rar");
67 | // File result_file = new File("d:/差分比较实验/差分比较实验result.rar");
68 |
69 | File src_file = new File("d:/差分比较实验/月球_a.bmp");
70 | File target_file = new File("d:/差分比较实验/月球_c.bmp");
71 | File result_file = new File("d:/差分比较实验/月球_c_result.bmp");
72 |
73 | HashMap> chunk_map = new ChunkParser(src_file).parse();
74 | Patch patch = new PatchMaker(target_file, chunk_map).make();
75 |
76 | new PatchApply(src_file, patch, result_file).apply();
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/PatchMaker.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.RandomAccessFile;
6 | import java.security.NoSuchAlgorithmException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.List;
10 |
11 | import com.mindpin.rsync.checksum.Adler32Util;
12 | import com.mindpin.rsync.chunk.Chunk;
13 | import com.mindpin.rsync.patch.Patch;
14 |
15 |
16 | public class PatchMaker {
17 |
18 | File target_file;
19 | HashMap> chunk_map;
20 |
21 | // src + patch = target
22 | // 根据 src 的 chunk_map
23 | // 扫描 target 一次
24 | // 就得到了如何 从 src -> target 的 patch 数据
25 |
26 | public PatchMaker(File target_file, HashMap> chunk_map){
27 | this.target_file = target_file;
28 | this.chunk_map = chunk_map;
29 | }
30 |
31 | public Patch make() throws IOException, NoSuchAlgorithmException{
32 | Patch patch = new Patch();
33 |
34 | long file_length = target_file.length();
35 | RandomAccessFile raf = new RandomAccessFile(target_file, "r");
36 | List diff_data = new ArrayList();
37 |
38 | byte[] bytes = {};
39 | long remained_length = file_length;
40 |
41 | int sum = 0;
42 | boolean next_block = true;
43 |
44 | Chunk chunk;
45 | while(remained_length > 0){
46 | if(next_block){
47 | bytes = read_next_block(raf, remained_length);
48 | remained_length -= bytes.length;
49 | }else{
50 | bytes = read_next_byte(raf, bytes);
51 | remained_length --;
52 | }
53 |
54 | chunk = match(chunk_map, bytes);
55 |
56 | if(chunk == null){
57 | // 未匹配,数组[0]放入 diff
58 | diff_data.add(bytes[0]);
59 |
60 | next_block = false;
61 | }else{
62 | // 匹配,将现在的 diff 加入 patch, 再将 chunk 加入 patch
63 | if(diff_data.size() > 0){
64 | patch.add(diff_data);
65 | sum += diff_data.size();
66 | diff_data = new ArrayList();
67 | }
68 |
69 | patch.add(chunk);
70 | sum += chunk.length;
71 |
72 | next_block = true;
73 | }
74 | }
75 |
76 | // 结束了,将此时 block_bytes 里剩下的内容(第一个字节除外,因为已经放过了),全部放入 diff,再加入 patch
77 | for(int i = 1; i < bytes.length; i++){
78 | diff_data.add(bytes[i]);
79 | }
80 |
81 | patch.add(diff_data);
82 | sum += diff_data.size();
83 |
84 | raf.close();
85 |
86 | System.out.println("扫描目标文件总字节数: " + sum);
87 |
88 | return patch;
89 | }
90 |
91 | // 向前读一字节,把新内容放入 block_bytes
92 | private byte[] read_next_byte(RandomAccessFile raf, byte[] block_bytes) throws IOException{
93 | byte[] next_byte = new byte[1];
94 | raf.read(next_byte);
95 |
96 | for(int i = 0; i < block_bytes.length - 1; i++){
97 | block_bytes[i] = block_bytes[i + 1];
98 | }
99 | block_bytes[block_bytes.length - 1] = next_byte[0];
100 |
101 | return block_bytes;
102 | }
103 |
104 | private byte[] read_next_block(RandomAccessFile raf, long remained_length) throws IOException{
105 | int bytes_len = (remained_length < ChunkParser.CHUNK_LENGTH) ? (int)remained_length : ChunkParser.CHUNK_LENGTH;
106 | byte[] block_bytes = new byte[bytes_len];
107 | raf.read(block_bytes);
108 |
109 | return block_bytes;
110 | }
111 |
112 | // 根据传入的字节数组,去尝试匹配,如果匹配到,返回匹配元数据,如果没匹配到,返回空
113 | private Chunk match(HashMap> chunk_map, byte[] bytes) throws NoSuchAlgorithmException{
114 | long cs32 = Adler32Util.checksum(bytes);
115 |
116 | if(chunk_map.containsKey(cs32)){
117 | for(Chunk chunk : chunk_map.get(cs32)){
118 | if(chunk.is_bytes_equal(bytes)){
119 | // 匹配了一个block
120 | return chunk;
121 | }
122 | }
123 | }
124 |
125 | return null;
126 | }
127 |
128 | public static void main(String[] args) throws NoSuchAlgorithmException, IOException {
129 | // File src_file = new File("d:/差分比较实验/差分比较实验1.rar");
130 | // File target_file = new File("d:/差分比较实验/差分比较实验2.rar");
131 |
132 | // File src_file = new File("d:/差分比较实验/差分比较实验2.rar");
133 | // File target_file = new File("d:/差分比较实验/差分比较实验1.rar");
134 |
135 | File src_file = new File("d:/差分比较实验/月球_a.bmp");
136 | File target_file = new File("d:/差分比较实验/月球_c.bmp");
137 |
138 | HashMap> chunk_map = new ChunkParser(src_file).parse();
139 | Patch patch = new PatchMaker(target_file, chunk_map).make();
140 | System.out.println(patch);
141 |
142 | patch.write_to_file(new File("d:/差分比较实验/patch"));
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/checksum/Adler32Util.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.checksum;
2 |
3 | import java.util.zip.Adler32;
4 |
/** Convenience wrapper around {@link Adler32} for whole byte arrays. */
public class Adler32Util {

    /**
     * Computes the Adler-32 checksum of a byte array.
     *
     * @param b the bytes to checksum
     * @return the checksum value reported by {@link Adler32#getValue()}
     */
    public static long checksum(byte[] b) {
        Adler32 digest = new Adler32();
        digest.update(b, 0, b.length);
        return digest.getValue();
    }
}
12 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/checksum/MD5Util.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.checksum;
2 |
3 | import java.security.MessageDigest;
4 | import java.security.NoSuchAlgorithmException;
5 |
/** MD5 helpers: digest computation and lower-case hexadecimal formatting. */
public class MD5Util {

    /** Lower-case hexadecimal digits, indexed by nibble value. */
    private static final String HEX_DIGITS = "0123456789abcdef";

    /**
     * Computes the MD5 digest of a byte array.
     *
     * @param b the bytes to digest
     * @return the digest bytes
     * @throws NoSuchAlgorithmException if the runtime provides no MD5 implementation
     */
    public static byte[] get_md5(byte[] b) throws NoSuchAlgorithmException {
        return MessageDigest.getInstance("MD5").digest(b);
    }

    /**
     * Formats bytes as a lower-case hexadecimal string, two characters per byte.
     *
     * @param b the bytes to format
     * @return the hexadecimal representation; empty for an empty array
     */
    public static String md5_bytes_to_string(byte[] b) {
        char[] hex = new char[2 * b.length];
        int pos = 0;
        for (int k = 0; k < b.length; k++) {
            int value = b[k] & 0xff; // view the signed byte as 0..255
            hex[pos++] = HEX_DIGITS.charAt(value >>> 4);
            hex[pos++] = HEX_DIGITS.charAt(value & 0x0f);
        }
        return new String(hex);
    }

    /** Prints the MD5 of "a", which is 0cc175b9c0f1b6a831c399e269772661. */
    public static void main(String[] args) throws NoSuchAlgorithmException {
        byte[] digest = MD5Util.get_md5("a".getBytes());
        System.out.println(MD5Util.md5_bytes_to_string(digest));
    }
}
41 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/chunk/Chunk.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.chunk;
2 |
3 | import java.security.NoSuchAlgorithmException;
4 | import java.util.Arrays;
5 |
6 | import com.mindpin.rsync.checksum.MD5Util;
7 |
8 | public class Chunk {
9 | public int id;
10 | public byte[] md5;
11 | public int length;
12 |
13 | public Chunk(int id, byte[] md5, byte[] bytes){
14 | this.id = id;
15 | this.md5 = md5;
16 | this.length = bytes.length;
17 | }
18 |
19 | public String toString(){
20 | return MD5Util.md5_bytes_to_string(md5) + "=>" + length;
21 | }
22 |
23 | public boolean is_bytes_equal(byte[] bytes) throws NoSuchAlgorithmException{
24 | return Arrays.equals(MD5Util.get_md5(bytes), md5);
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/patch/Patch.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.patch;
2 |
3 | import java.io.File;
4 | import java.io.FileWriter;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.List;
8 |
9 | import com.mindpin.rsync.chunk.Chunk;
10 | import com.mindpin.rsync.patch.i.PatchPart;
11 |
12 |
13 | public class Patch {
14 |
15 | public List parts;
16 |
17 | public Patch(){
18 | parts = new ArrayList();
19 | }
20 |
21 | public void add(List bytes){
22 | parts.add(new PatchPartData(bytes));
23 | }
24 |
25 | public void add(Chunk chunk){
26 | parts.add(new PatchPartChunk(chunk.id));
27 | }
28 |
29 | public int part_count(){
30 | return parts.size();
31 | }
32 |
33 | public long bytes_size(){
34 | long re = 0;
35 | for(PatchPart p : parts){
36 | re += p.bytes_size();
37 | }
38 | return re;
39 | }
40 |
41 | public void write_to_file(File file) throws IOException{
42 | file.delete();
43 |
44 | FileWriter fw = new FileWriter(file);
45 | for(PatchPart part : parts){
46 | fw.write(part.get_encode_str());
47 | }
48 |
49 | fw.close();
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/patch/PatchPartChunk.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.patch;
2 |
3 | import com.mindpin.rsync.patch.i.PatchPart;
4 |
5 |
6 | public class PatchPartChunk implements PatchPart {
7 |
8 | public int id;
9 |
10 | public PatchPartChunk(int id){
11 | this.id = id;
12 | }
13 |
14 | @Override
15 | public int bytes_size() {
16 | return 1;
17 | }
18 |
19 | @Override
20 | public String get_encode_str() {
21 | return "#" + id;
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/patch/PatchPartData.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.patch;
2 |
3 | import java.util.List;
4 |
5 | import sun.misc.BASE64Encoder;
6 |
7 | import com.mindpin.rsync.patch.i.PatchPart;
8 |
9 | public class PatchPartData implements PatchPart {
10 |
11 | public byte[] bytes;
12 |
13 | public PatchPartData(List bytes){
14 | int size = bytes.size();
15 |
16 | this.bytes = new byte[size];
17 |
18 | int i = 0;
19 | for(byte b : bytes){
20 | this.bytes[i] = b;
21 | i ++;
22 | }
23 | }
24 |
25 | @Override
26 | public int bytes_size() {
27 | return bytes.length;
28 | }
29 |
30 | @Override
31 | public String get_encode_str() {
32 | BASE64Encoder encoder = new BASE64Encoder();
33 | return "$" + encoder.encode(bytes);
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/com/mindpin/rsync/patch/i/PatchPart.java:
--------------------------------------------------------------------------------
1 | package com.mindpin.rsync.patch.i;
2 |
/** One element of a patch. */
public interface PatchPart {

    /** @return the size this part reports for itself */
    int bytes_size();

    /** @return the textual encoding of this part */
    String get_encode_str();
}
7 |
--------------------------------------------------------------------------------