├── .classpath ├── .gitignore ├── .project ├── README.md └── src └── com └── mindpin └── rsync ├── ChunkParser.java ├── PatchApply.java ├── PatchMaker.java ├── checksum ├── Adler32Util.java └── MD5Util.java ├── chunk └── Chunk.java └── patch ├── Patch.java ├── PatchPartChunk.java ├── PatchPartData.java └── i └── PatchPart.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | java_binary_diff 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | java_binary_diff 2 | ================ 3 | 4 | 基于java实现的,以rsync算法原理为基础的二进制文件差异比较处理 5 | 6 | 2012-7-8: 7 | 实现了基础的算法逻辑,主要包括以下三个逻辑: 8 | 根据 src_file 计算 chunk_map; 9 | 根据 chunk_map 和 target_file 生成 patch; 10 | 根据 src_file 和 patch 生成与 target_file 一样的 result_file; 11 | 12 | 实现了将 patch 通过 base64 编码保存到磁盘文本文件 13 | 14 | TODO: 15 | 将 patch 文件压缩保存 -------------------------------------------------------------------------------- /src/com/mindpin/rsync/ChunkParser.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | import java.security.NoSuchAlgorithmException; 7 | import java.util.ArrayList; 8 | import java.util.Date; 9 | import 
java.util.HashMap; 10 | import java.util.List; 11 | 12 | import com.mindpin.rsync.checksum.Adler32Util; 13 | import com.mindpin.rsync.checksum.MD5Util; 14 | import com.mindpin.rsync.chunk.Chunk; 15 | 16 | 17 | public class ChunkParser { 18 | 19 | final public static int CHUNK_LENGTH = 512; 20 | File in_file; 21 | HashMap> chunk_map; 22 | 23 | public ChunkParser(File in_file){ 24 | this.in_file = in_file; 25 | this.chunk_map = new HashMap>(); 26 | } 27 | 28 | public HashMap> parse() throws IOException, NoSuchAlgorithmException{ 29 | long file_length = in_file.length(); 30 | 31 | RandomAccessFile raf = new RandomAccessFile(in_file, "r"); 32 | // FileInputStream fs = new FileInputStream(in_file); 33 | 34 | int sum = 0; 35 | int id = 0; 36 | long remained_length = file_length; 37 | int bytes_len; 38 | 39 | byte[] bytes; 40 | long cs32; 41 | byte[] md5; 42 | 43 | while(remained_length > 0){ 44 | bytes_len = (remained_length < CHUNK_LENGTH) ? (int)remained_length : CHUNK_LENGTH; 45 | 46 | bytes = new byte[bytes_len]; 47 | raf.read(bytes); 48 | 49 | cs32 = Adler32Util.checksum(bytes); 50 | md5 = MD5Util.get_md5(bytes); 51 | add(cs32, new Chunk(id, md5, bytes)); 52 | 53 | remained_length -= bytes_len; 54 | 55 | sum += bytes_len; 56 | id ++; 57 | } 58 | 59 | raf.close(); 60 | 61 | System.out.println("读取原始文件总字节数: " + sum); 62 | return chunk_map; 63 | } 64 | 65 | public void add(long cs32, Chunk chunk){ 66 | if(chunk_map.containsKey(cs32)){ 67 | chunk_map.get(cs32).add(chunk); 68 | }else{ 69 | List list = new ArrayList(); 70 | list.add(chunk); 71 | chunk_map.put(cs32, list); 72 | } 73 | } 74 | 75 | 76 | // 测试结果: 77 | // 78 | // 原始文件总字节数: 1049568 79 | // 耗时:0.093 秒 80 | // chunk数据字节数: 82000 81 | // 82 | // 原始文件总字节数: 200061022 83 | // 耗时:4.875 秒 84 | // chunk数据字节数: 15623920 85 | public static void main(String[] args) throws IOException, NoSuchAlgorithmException { 86 | File file; 87 | long t1; 88 | long t2; 89 | HashMap> map; 90 | 91 | // 1MB 0.094 秒 92 | file = new 
File("d:/差分比较实验/差分比较实验1.rar"); 93 | System.out.println("原始文件总字节数: " + file.length()); 94 | t1 = new Date().getTime(); 95 | map = new ChunkParser(file).parse(); 96 | t2 = new Date().getTime(); 97 | System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒"); 98 | System.out.println("chunk数据字节数: " + map.size() * (8 + 32)); 99 | // System.out.println(map); 100 | 101 | // 190MB 4.734 秒 102 | // file = new File("d:/差分比较实验/world1.zip"); 103 | // System.out.println("原始文件总字节数: " + file.length()); 104 | // t1 = new Date().getTime(); 105 | // map = new ChunkParser(file).parse(); 106 | // t2 = new Date().getTime(); 107 | // System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒"); 108 | // System.out.println("chunk数据字节数: " + map.size() * (8 + 32)); 109 | 110 | // 1G 以上的文件会 out of memory 111 | // file = new File("D:/Download/乐可乐可2中文版@USP草帽.rar"); 112 | // System.out.println("原始文件总字节数: " + file.length()); 113 | // t1 = new Date().getTime(); 114 | // map = new ChunkParser(file).parse(); 115 | // t2 = new Date().getTime(); 116 | // System.out.println("耗时:" + (t2 - t1) / 1000.0 + " 秒"); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/PatchApply.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | import java.security.NoSuchAlgorithmException; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | 10 | import com.mindpin.rsync.chunk.Chunk; 11 | import com.mindpin.rsync.patch.Patch; 12 | import com.mindpin.rsync.patch.PatchPartChunk; 13 | import com.mindpin.rsync.patch.PatchPartData; 14 | import com.mindpin.rsync.patch.i.PatchPart; 15 | 16 | public class PatchApply { 17 | 18 | Patch patch; 19 | File src_file; 20 | File result_file; 21 | 22 | public PatchApply(File src_file, Patch patch, File result_file){ 23 | this.patch = patch; 24 | 
this.src_file = src_file; 25 | this.result_file = result_file; 26 | } 27 | 28 | public void apply() throws IOException{ 29 | result_file.delete(); 30 | 31 | RandomAccessFile read_raf = new RandomAccessFile(src_file, "r"); 32 | RandomAccessFile write_raf = new RandomAccessFile(result_file, "rw"); 33 | 34 | long src_file_length = src_file.length(); 35 | 36 | long sum = 0; 37 | for(PatchPart part : patch.parts){ 38 | if(part instanceof PatchPartData){ 39 | write_raf.write(((PatchPartData)part).bytes); 40 | sum += ((PatchPartData)part).bytes_size(); 41 | } 42 | 43 | if(part instanceof PatchPartChunk){ 44 | int off = ((PatchPartChunk)part).id * ChunkParser.CHUNK_LENGTH; 45 | long remained_length = src_file_length - off; 46 | 47 | int length = (remained_length < ChunkParser.CHUNK_LENGTH) ? (int)remained_length : ChunkParser.CHUNK_LENGTH; 48 | 49 | byte[] bytes = new byte[length]; 50 | 51 | read_raf.seek(off); 52 | read_raf.read(bytes); 53 | write_raf.write(bytes); 54 | 55 | sum += length; 56 | } 57 | } 58 | 59 | write_raf.close(); 60 | read_raf.close(); 61 | System.out.println("写入结果文件字节数: " + sum); 62 | } 63 | 64 | public static void main(String[] args) throws NoSuchAlgorithmException, IOException { 65 | // File src_file = new File("d:/差分比较实验/差分比较实验1.rar"); 66 | // File target_file = new File("d:/差分比较实验/差分比较实验2.rar"); 67 | // File result_file = new File("d:/差分比较实验/差分比较实验result.rar"); 68 | 69 | File src_file = new File("d:/差分比较实验/月球_a.bmp"); 70 | File target_file = new File("d:/差分比较实验/月球_c.bmp"); 71 | File result_file = new File("d:/差分比较实验/月球_c_result.bmp"); 72 | 73 | HashMap> chunk_map = new ChunkParser(src_file).parse(); 74 | Patch patch = new PatchMaker(target_file, chunk_map).make(); 75 | 76 | new PatchApply(src_file, patch, result_file).apply(); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/PatchMaker.java: -------------------------------------------------------------------------------- 1 | 
package com.mindpin.rsync; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | import java.security.NoSuchAlgorithmException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | 11 | import com.mindpin.rsync.checksum.Adler32Util; 12 | import com.mindpin.rsync.chunk.Chunk; 13 | import com.mindpin.rsync.patch.Patch; 14 | 15 | 16 | public class PatchMaker { 17 | 18 | File target_file; 19 | HashMap> chunk_map; 20 | 21 | // src + patch = target 22 | // 根据 src 的 chunk_map 23 | // 扫描 target 一次 24 | // 就得到了如何 从 src -> target 的 patch 数据 25 | 26 | public PatchMaker(File target_file, HashMap> chunk_map){ 27 | this.target_file = target_file; 28 | this.chunk_map = chunk_map; 29 | } 30 | 31 | public Patch make() throws IOException, NoSuchAlgorithmException{ 32 | Patch patch = new Patch(); 33 | 34 | long file_length = target_file.length(); 35 | RandomAccessFile raf = new RandomAccessFile(target_file, "r"); 36 | List diff_data = new ArrayList(); 37 | 38 | byte[] bytes = {}; 39 | long remained_length = file_length; 40 | 41 | int sum = 0; 42 | boolean next_block = true; 43 | 44 | Chunk chunk; 45 | while(remained_length > 0){ 46 | if(next_block){ 47 | bytes = read_next_block(raf, remained_length); 48 | remained_length -= bytes.length; 49 | }else{ 50 | bytes = read_next_byte(raf, bytes); 51 | remained_length --; 52 | } 53 | 54 | chunk = match(chunk_map, bytes); 55 | 56 | if(chunk == null){ 57 | // 未匹配,数组[0]放入 diff 58 | diff_data.add(bytes[0]); 59 | 60 | next_block = false; 61 | }else{ 62 | // 匹配,将现在的 diff 加入 patch, 再将 chunk 加入 patch 63 | if(diff_data.size() > 0){ 64 | patch.add(diff_data); 65 | sum += diff_data.size(); 66 | diff_data = new ArrayList(); 67 | } 68 | 69 | patch.add(chunk); 70 | sum += chunk.length; 71 | 72 | next_block = true; 73 | } 74 | } 75 | 76 | // 结束了,将此时 block_bytes 里剩下的内容(第一个字节除外,因为已经放过了),全部放入 diff,再加入 patch 77 | for(int i = 1; i < bytes.length; i++){ 78 | 
diff_data.add(bytes[i]); 79 | } 80 | 81 | patch.add(diff_data); 82 | sum += diff_data.size(); 83 | 84 | raf.close(); 85 | 86 | System.out.println("扫描目标文件总字节数: " + sum); 87 | 88 | return patch; 89 | } 90 | 91 | // 向前读一字节,把新内容放入 block_bytes 92 | private byte[] read_next_byte(RandomAccessFile raf, byte[] block_bytes) throws IOException{ 93 | byte[] next_byte = new byte[1]; 94 | raf.read(next_byte); 95 | 96 | for(int i = 0; i < block_bytes.length - 1; i++){ 97 | block_bytes[i] = block_bytes[i + 1]; 98 | } 99 | block_bytes[block_bytes.length - 1] = next_byte[0]; 100 | 101 | return block_bytes; 102 | } 103 | 104 | private byte[] read_next_block(RandomAccessFile raf, long remained_length) throws IOException{ 105 | int bytes_len = (remained_length < ChunkParser.CHUNK_LENGTH) ? (int)remained_length : ChunkParser.CHUNK_LENGTH; 106 | byte[] block_bytes = new byte[bytes_len]; 107 | raf.read(block_bytes); 108 | 109 | return block_bytes; 110 | } 111 | 112 | // 根据传入的字节数组,去尝试匹配,如果匹配到,返回匹配元数据,如果没匹配到,返回空 113 | private Chunk match(HashMap> chunk_map, byte[] bytes) throws NoSuchAlgorithmException{ 114 | long cs32 = Adler32Util.checksum(bytes); 115 | 116 | if(chunk_map.containsKey(cs32)){ 117 | for(Chunk chunk : chunk_map.get(cs32)){ 118 | if(chunk.is_bytes_equal(bytes)){ 119 | // 匹配了一个block 120 | return chunk; 121 | } 122 | } 123 | } 124 | 125 | return null; 126 | } 127 | 128 | public static void main(String[] args) throws NoSuchAlgorithmException, IOException { 129 | // File src_file = new File("d:/差分比较实验/差分比较实验1.rar"); 130 | // File target_file = new File("d:/差分比较实验/差分比较实验2.rar"); 131 | 132 | // File src_file = new File("d:/差分比较实验/差分比较实验2.rar"); 133 | // File target_file = new File("d:/差分比较实验/差分比较实验1.rar"); 134 | 135 | File src_file = new File("d:/差分比较实验/月球_a.bmp"); 136 | File target_file = new File("d:/差分比较实验/月球_c.bmp"); 137 | 138 | HashMap> chunk_map = new ChunkParser(src_file).parse(); 139 | Patch patch = new PatchMaker(target_file, chunk_map).make(); 140 | 
/**
 * MD5 helpers: computing a digest and rendering a byte array as lowercase hex.
 */
public class MD5Util {

    /** Lowercase hexadecimal digits, indexed by nibble value. */
    private static final char[] MD5_CHARS = {
        '0', '1', '2', '3',
        '4', '5', '6', '7',
        '8', '9', 'a', 'b',
        'c', 'd', 'e', 'f'
    };

    /**
     * Computes the MD5 digest of the given bytes.
     *
     * @param b the bytes to hash
     * @return the 16-byte MD5 digest
     * @throws NoSuchAlgorithmException if no MD5 implementation is installed
     */
    public static byte[] get_md5(byte[] b) throws NoSuchAlgorithmException {
        return MessageDigest.getInstance("MD5").digest(b);
    }

    /**
     * Renders every byte as two lowercase hexadecimal characters.
     *
     * @param b the bytes to render (typically an MD5 digest)
     * @return a string of length {@code 2 * b.length}
     */
    public static String md5_bytes_to_string(byte[] b) {
        // StringBuilder: the buffer never leaves this method, so the
        // synchronization of StringBuffer buys nothing.
        StringBuilder sb = new StringBuilder(2 * b.length);
        for (byte value : b) {
            append_hex_pair(value, sb);
        }
        return sb.toString();
    }

    /** Appends the high nibble, then the low nibble, of {@code b} as hex digits. */
    private static void append_hex_pair(byte b, StringBuilder sb) {
        // Masking before the shift discards the sign extension of negative bytes.
        sb.append(MD5_CHARS[(b & 0xf0) >> 4]).append(MD5_CHARS[b & 0xf]);
    }

    /** Prints the MD5 of "a", which is 0cc175b9c0f1b6a831c399e269772661. */
    public static void main(String[] args) throws NoSuchAlgorithmException {
        String md5_a = MD5Util.md5_bytes_to_string(MD5Util.get_md5("a".getBytes()));
        System.out.println(md5_a);
    }
}
-------------------------------------------------------------------------------- /src/com/mindpin/rsync/chunk/Chunk.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync.chunk; 2 | 3 | import java.security.NoSuchAlgorithmException; 4 | import java.util.Arrays; 5 | 6 | import com.mindpin.rsync.checksum.MD5Util; 7 | 8 | public class Chunk { 9 | public int id; 10 | public byte[] md5; 11 | public int length; 12 | 13 | public Chunk(int id, byte[] md5, byte[] bytes){ 14 | this.id = id; 15 | this.md5 = md5; 16 | this.length = bytes.length; 17 | } 18 | 19 | public String toString(){ 20 | return MD5Util.md5_bytes_to_string(md5) + "=>" + length; 21 | } 22 | 23 | public boolean is_bytes_equal(byte[] bytes) throws NoSuchAlgorithmException{ 24 | return Arrays.equals(MD5Util.get_md5(bytes), md5); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/patch/Patch.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync.patch; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import com.mindpin.rsync.chunk.Chunk; 10 | import com.mindpin.rsync.patch.i.PatchPart; 11 | 12 | 13 | public class Patch { 14 | 15 | public List parts; 16 | 17 | public Patch(){ 18 | parts = new ArrayList(); 19 | } 20 | 21 | public void add(List bytes){ 22 | parts.add(new PatchPartData(bytes)); 23 | } 24 | 25 | public void add(Chunk chunk){ 26 | parts.add(new PatchPartChunk(chunk.id)); 27 | } 28 | 29 | public int part_count(){ 30 | return parts.size(); 31 | } 32 | 33 | public long bytes_size(){ 34 | long re = 0; 35 | for(PatchPart p : parts){ 36 | re += p.bytes_size(); 37 | } 38 | return re; 39 | } 40 | 41 | public void write_to_file(File file) throws IOException{ 42 | file.delete(); 43 | 44 | FileWriter fw = new 
FileWriter(file); 45 | for(PatchPart part : parts){ 46 | fw.write(part.get_encode_str()); 47 | } 48 | 49 | fw.close(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/patch/PatchPartChunk.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync.patch; 2 | 3 | import com.mindpin.rsync.patch.i.PatchPart; 4 | 5 | 6 | public class PatchPartChunk implements PatchPart { 7 | 8 | public int id; 9 | 10 | public PatchPartChunk(int id){ 11 | this.id = id; 12 | } 13 | 14 | @Override 15 | public int bytes_size() { 16 | return 1; 17 | } 18 | 19 | @Override 20 | public String get_encode_str() { 21 | return "#" + id; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/patch/PatchPartData.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync.patch; 2 | 3 | import java.util.List; 4 | 5 | import sun.misc.BASE64Encoder; 6 | 7 | import com.mindpin.rsync.patch.i.PatchPart; 8 | 9 | public class PatchPartData implements PatchPart { 10 | 11 | public byte[] bytes; 12 | 13 | public PatchPartData(List bytes){ 14 | int size = bytes.size(); 15 | 16 | this.bytes = new byte[size]; 17 | 18 | int i = 0; 19 | for(byte b : bytes){ 20 | this.bytes[i] = b; 21 | i ++; 22 | } 23 | } 24 | 25 | @Override 26 | public int bytes_size() { 27 | return bytes.length; 28 | } 29 | 30 | @Override 31 | public String get_encode_str() { 32 | BASE64Encoder encoder = new BASE64Encoder(); 33 | return "$" + encoder.encode(bytes); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/com/mindpin/rsync/patch/i/PatchPart.java: -------------------------------------------------------------------------------- 1 | package com.mindpin.rsync.patch.i; 2 | 3 | public interface PatchPart { 4 | public int bytes_size(); 5 
| public String get_encode_str(); 6 | } 7 | --------------------------------------------------------------------------------