├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE.md
├── README.md
└── src
    ├── git_utils.rs
    └── main.rs

/.gitignore:
--------------------------------------------------------------------------------
/lucky-commit
.vscode
/target/
**/*.rs.bk
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"

[[package]]
name = "block-buffer"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [
 "generic-array",
]

[[package]]
name = "build_const"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "cpuid-bool"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"

[[package]]
name = "crc"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
dependencies = [
 "build_const",
]

[[package]]
name = "crc32fast"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a"
dependencies = [
 "cfg-if",
]

[[package]]
name = "digest"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [
 "generic-array",
]

[[package]]
name = "every-commit-shorthash"
version = "0.1.0"
dependencies = [
 "crc",
 "flate2",
 "sha-1",
]

[[package]]
name = "flate2"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0"
dependencies = [
 "cfg-if",
 "crc32fast",
 "libc",
 "miniz_oxide",
]

[[package]]
name = "generic-array"
version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817"
dependencies = [
 "typenum",
 "version_check",
]

[[package]]
name = "libc"
version = "0.2.90"
"registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "ba4aede83fc3617411dc6993bc8c70919750c1c257c6ca6a502aed6e0e2394ae" 105 | 106 | [[package]] 107 | name = "miniz_oxide" 108 | version = "0.4.4" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 111 | dependencies = [ 112 | "adler", 113 | "autocfg", 114 | ] 115 | 116 | [[package]] 117 | name = "opaque-debug" 118 | version = "0.3.0" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" 121 | 122 | [[package]] 123 | name = "sha-1" 124 | version = "0.9.4" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "dfebf75d25bd900fd1e7d11501efab59bc846dbc76196839663e6637bba9f25f" 127 | dependencies = [ 128 | "block-buffer", 129 | "cfg-if", 130 | "cpuid-bool", 131 | "digest", 132 | "opaque-debug", 133 | ] 134 | 135 | [[package]] 136 | name = "typenum" 137 | version = "1.13.0" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" 140 | 141 | [[package]] 142 | name = "version_check" 143 | version = "0.9.3" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 146 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "every-commit-shorthash" 3 | version = "0.1.0" 4 | authors = ["Teddy Katz "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | sha-1 = "0.9" 11 | flate2 = "^1.0" 12 | crc = "^1.8" 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | Copyright © 2021 Teddy Katz 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the “Software”), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# every-git-commit-shorthash

A git repository with a commit for *every* seven-character git commit shorthash (all 2^28 of them).

If you have a commit shorthash, or any seven-character hex string, you can find a commit for it here! It's like a dictionary, but much less useful. Also see [lucky-commit](https://github.com/not-an-aardvark/lucky-commit) if you'd like to generate commits with arbitrary shorthashes on the fly instead.

## FAQ

### Where can I see all the commits?

This repository contains code for generating a repository with every shorthash locally.

The repository has so many commits that `git push` hangs and runs out of memory, presumably because it tries to regenerate a packfile on the fly. As a result, there isn't a GitHub-hosted interactive demo. Sorry.

This problem might be tractable in theory. `git` uses the same packfile format for network transport as it does for storage, so it might be possible to convince it to use the packfile from the filesystem directly rather than generating a new one. However, I have some doubts that GitHub would accept the push without timing out or hitting a memory limit somewhere.

If you want to get the commits without running the tool yourself, you can download a pregenerated packfile and index by following the instructions on the [releases page](https://github.com/not-an-aardvark/every-git-commit-shorthash/releases).

### How much space does this take up?

The commits are stored in a 14.7 GB [git packfile](https://git-scm.com/docs/pack-format), as well as an associated 9.35 GB pack index file. This is the result of significant optimization to reduce the file sizes.

Specifically, two major strategies are used:

#### Almost all of the commits are stored as deltas

Git packfiles support a delta format, where a git object is stored as a diff from another git object, then reassembled at runtime. Normally, this delta format is used for file contents (i.e. blobs), so that git doesn't need to store two copies of a file for a one-line change. However, it's supported for all types of git objects, including commits themselves. Git commits contain a lot of metadata, so storing a commit as e.g. "the same as this other commit, plus one space at the end of the commit message" saves a lot of space over inlining the entire commit.
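To make the delta layout concrete, here is a minimal sketch of how the generator expresses each commit, condensed from `src/main.rs` using the `PackFileObject`/`PackFileDeltaInstruction` types in `src/git_utils.rs` (the standalone helper function and its parameters are illustrative, not part of the actual code):

```rust
use git_utils::{PackFileDeltaInstruction, PackFileObject, Sha1Oid};

// Illustrative helper: a commit stored as "the base commit, plus a few bytes
// of unique entropy appended to the end of the commit message".
fn deltified_commit(
    base_oid: Sha1Oid,
    base_data: &[u8],
    new_oid: Sha1Oid,
    entropy: &str,
) -> PackFileObject {
    PackFileObject::Deltified {
        base_oid,
        base_size: base_data.len(),
        delta: vec![
            // Copy the entire base commit verbatim...
            PackFileDeltaInstruction::CopyFromBaseObject {
                offset: 0,
                size: base_data.len(),
            },
            // ...then append the entropy that makes this commit's hash unique.
            PackFileDeltaInstruction::AddNewData(entropy.as_bytes().to_vec()),
        ],
        new_oid,
        new_size: base_data.len() + entropy.len(),
    }
}
```

In the real tool, the delta base is a single hardcoded commit, and the entropy is an 8- or 16-character hex counter.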
#### The commit graph is arranged to take advantage of compression

It's worth noting a design goal of the tool here: it should be possible to place all 2^28 commits in a single git branch, such that they won't be immediately garbage-collected by git.

This design goal requires the commit layout to use more space. Without this requirement, it would be possible to simply create 2^28 root commits (i.e. commits without parents). However, with this requirement, each non-branch-tip commit needs at least one child commit whose body contains `parent <40-character hex commit hash>` in the commit metadata. Since all of the 40-character commit hashes are different, this requires a minimum storage cost of 40 uncompressed bytes per commit, regardless of how the commits are arranged. There's also a flat cost of 20 bytes per commit, since the packfile format inlines each commit hash, along with a few bytes of overhead to specify the delta itself.

With that aside, there is still a significant amount of flexibility in how the commits can be arranged. For example, it would be possible to create a linear commit history of 2^28 commits, or to create 2^28 root commits and one massive merge commit with 2^28 parents, or anything in between.

Empirically, using a linear commit history resulted in an amortized size of 73 compressed bytes per commit in the packfile, whereas using big merge commits resulted in an amortized size of only 46 compressed bytes per commit. This is because giant merge commits are much more easily compressible, since they consist almost entirely of hex characters from the parent hashes.

I suspected git wouldn't really like having a single merge commit with 2^28 parents, so I compromised and created 2^14 merge commits, each with 2^14 root-commit parents, and one top-level merge commit with each of the 2^14 merge commits as parents. It's worth noting that while the merge commits account for only 0.0061% of the total number of commits, they account for about 46% of the total storage space. I think the total size is close to optimal given the requirement that all the commits be reachable, but removing that requirement would result in almost a 50% packfile size reduction.

### How long does it take to generate the commits?

About 5 hours on my laptop (a 2015 MacBook Pro).

The tool was designed for one-time use, so I haven't spent a lot of time optimizing its performance. There is some significant low-hanging fruit:

* Currently, it's single-threaded for simplicity. It could be sped up by a significant factor by using multiple cores or running on a GPU.
* Generating the index file currently involves a lot of cache thrashing, which could be fixed with only a bit of added complexity.

When the tool starts running, the main bottleneck is zlib compression (which is run on each commit, using the maximum compression ratio). This continues to be the main bottleneck until the tool reaches the last million commits or so, at which point SHA-1 throughput becomes the main bottleneck. (For each of the last few commits, the tool has to try a large number of commit possibilities in order to find a shorthash that hasn't already been used.)

Note that the tool is currently very memory-constrained: in order to generate the packfile index, the tool needs to keep track of a sorted index of all of the commit hashes generated so far. As a result of this and a few other pieces of metadata, it uses 11 GB of memory, which is just small enough to run on my laptop. Some plausible-seeming performance improvements would result in running out of memory, and some memory usage improvements (e.g. saving state to the filesystem) could result in slower performance.

### How many commits does the tool need to go through to find 2^28 unique shorthashes?

Due to the [coupon collector's problem](https://en.wikipedia.org/wiki/Coupon_collector%27s_problem), the expected number of commit attempts is 2^28 × (ln(2^28) + 0.577...), or about 5.4 billion.
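As a sanity check on that number (an editorial addition, not part of the tool), the expectation can be computed directly:

```rust
fn main() {
    let n = (1u64 << 28) as f64; // 2^28 possible seven-hex-character shorthashes
    // Coupon collector: E[attempts] = n * H_n ≈ n * (ln n + γ)
    let expected = n * (n.ln() + 0.5772156649);
    println!("expected commit attempts: {:.3e}", expected); // ≈ 5.365e9
}
```

The result (~5.4 × 10^9) matches the ~2^32.3 attempts noted in a comment in `src/main.rs`.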
### Does git actually work in such a big repository?

Sort of. You can view any particular commit and check out files at that commit. (You can run `git checkout` and keyboard-mash seven random hex characters, and it will go to that commit, which is neat.) Anything that requires stepping through history in order, such as `git log`, seems to stall and run out of memory.

### Why would someone want to use this?

¯\\\_(ツ)\_/¯

### No, really, is it useful for anything?

Probably not, but maybe.

I originally created [lucky-commit](https://github.com/not-an-aardvark/lucky-commit), the companion project to this one, as a practical joke. However, it turned out to be [unexpectedly useful for security research](https://blog.teddykatz.com/2019/11/12/github-actions-dos.html) as a way to generate targeted commit hash collisions.

In theory, this project could also be used to [generate commit hash collisions in bulk](https://blog.teddykatz.com/2019/11/12/github-actions-dos.html#:~:text=Making%20every%20shorthash%20collide). However, it seems like using `lucky-commit` to generate a targeted collision would be more useful in almost all circumstances, especially since `lucky-commit` allows you to amend your own commits rather than creating useless commits from scratch. This is particularly true because the branch with bulk commits from this project can't really be pushed anywhere.

### How do I run it?

First, [ensure you have `rustc` and `cargo` installed](https://www.rust-lang.org/tools/install).

Then run:

```bash
$ git clone https://github.com/not-an-aardvark/every-git-commit-shorthash.git
$ cd every-git-commit-shorthash
```

Optionally, update the hardcoded commit templates in `src/main.rs`, e.g. to update the author to yourself or change the commit message.

Then run:

```bash
$ cargo run --release
```
--------------------------------------------------------------------------------
/src/git_utils.rs:
--------------------------------------------------------------------------------
use crc::crc32;
use flate2::{write::ZlibEncoder, Compression};
use sha1::{Digest, Sha1};
use std::{
    collections::BTreeMap,
    fs::OpenOptions,
    io,
    io::{copy, BufWriter, Seek, SeekFrom, Write},
    num::NonZeroU8,
};

pub type Sha1Oid = [u8; 20];

#[derive(Clone, Debug)]
pub enum GitObjectType {
    Commit,
    Tree,
    #[allow(dead_code)]
    Blob,
}

#[derive(Clone, Debug)]
pub struct GitObject {
    pub data: Vec<u8>,
    pub object_type: GitObjectType,
}

impl GitObjectType {
    fn type_name(&self) -> &'static str {
        match self {
            GitObjectType::Commit { .. } => "commit",
            GitObjectType::Tree { .. } => "tree",
            GitObjectType::Blob { .. } => "blob",
        }
    }
}

impl GitObject {
    pub fn oid(&self) -> Sha1Oid {
        // A git object's OID is the SHA-1 of "{type} {length}\0" followed by the object data.
        Sha1::new()
            .chain(format!("{} {}\0", self.object_type.type_name(), self.data.len()).as_bytes())
            .chain(&self.data)
            .finalize()
            .into()
    }
}
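// Editorial note: this preimage matches git's loose-object hashing, e.g. for the
// empty tree, sha1(b"tree 0\0") = 4b825dc642cb6eb9a060e54bf8d69288fbee4904 — the
// well-known empty-tree OID that main.rs hardcodes in its commit templates.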
#[derive(Debug)]
pub enum PackFileDeltaInstruction {
    CopyFromBaseObject { offset: usize, size: usize },
    AddNewData(Vec<u8>),
}

#[derive(Debug)]
pub enum PackFileObject {
    Raw(GitObject),
    Deltified {
        base_oid: Sha1Oid,
        base_size: usize,
        delta: Vec<PackFileDeltaInstruction>,
        new_oid: Sha1Oid,
        new_size: usize,
    },
}

impl PackFileObject {
    pub fn oid(&self) -> Sha1Oid {
        match self {
            Self::Raw(git_object) => git_object.oid(),
            Self::Deltified { new_oid, .. } => *new_oid,
        }
    }
}

#[derive(Debug)]
pub struct PackFile {
    object_positions: BTreeMap<Sha1Oid, (usize, u32)>,
}

/// Generates a git packfile and index file containing the given git objects.
/// The git packfile format is mostly specified [here](https://git-scm.com/docs/pack-format). In a few places
/// noted below, the format documentation is underspecified; this generator is implemented based on a combination
/// of the documented behavior, testing with git itself, and reading the git source code to figure out what it
/// actually accepts.
pub fn stream_to_pack_file<T: IntoIterator<Item = PackFileObject>>(
    iter: T,
) -> io::Result<PackFile> {
    let mut pack = BufWriter::new(
        OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(".git/objects/pack/pack-every-shorthash.pack")?,
    );

    // --- Start of packfile header ---
    // 4-byte signature
    pack.write_all("PACK".as_bytes())?;

    // 4-byte version number
    pack.write_all(&2u32.to_be_bytes())?;

    // 4-byte number of objects (currently initialized to 0; will be filled in afterwards)
    pack.write_all(&[0, 0, 0, 0])?;

    // --- End of packfile header ---
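    // Editorial note: at this point the 12 header bytes on disk are
    //   50 41 43 4b | 00 00 00 02 | 00 00 00 00
    //   "PACK"        version 2     object-count placeholder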
    let mut current_position = 12;
    let mut object_positions = BTreeMap::new();
    let mut object_counts_by_first_byte = [0u32; 256];
    let mut current_object = Vec::new();

    for object in iter {
        let oid = object.oid();
        let current_object_position = current_position;

        // Object type, using the ID values [here](https://git-scm.com/docs/pack-format#:~:text=Object%20types)
        let object_type: u8 = match &object {
            PackFileObject::Raw(GitObject { object_type, .. }) => match object_type {
                GitObjectType::Commit => 1,
                GitObjectType::Tree => 2,
                GitObjectType::Blob => 3,
            },
            PackFileObject::Deltified { base_oid, .. } => {
                // 6 = "offset delta" (base referenced by position in this packfile),
                // 7 = "ref delta" (base referenced by OID)
                if object_positions.contains_key(base_oid) {
                    6
                } else {
                    7
                }
            }
        };

        let encoded_object = match &object {
            // Non-deltified objects have no packfile-specific encoding.
            PackFileObject::Raw(git_object) => git_object.data.clone(),
            PackFileObject::Deltified {
                base_size,
                delta,
                new_size,
                ..
            } => {
                // Deltified objects use the packfile-specific encoding described
                // [here](https://git-scm.com/docs/pack-format#_deltified_representation).
                let mut deltified_representation = Vec::new();
                append_variable_length_size(&mut deltified_representation, *base_size)?;
                append_variable_length_size(&mut deltified_representation, *new_size)?;
                for delta_instruction in delta {
                    match delta_instruction {
                        PackFileDeltaInstruction::CopyFromBaseObject { offset, size } => {
                            // The "copy from base object" instruction encoding, documented
                            // [here](https://git-scm.com/docs/pack-format#_instruction_to_copy_from_base_object)
                            let offset1 = NonZeroU8::new(*offset as u8);
                            let offset2 = NonZeroU8::new((*offset >> 8) as u8);
                            let offset3 = NonZeroU8::new((*offset >> 16) as u8);
                            let offset4 = NonZeroU8::new((*offset >> 24) as u8);
                            let size1 = NonZeroU8::new(*size as u8);
                            let size2 = NonZeroU8::new((*size >> 8) as u8);
                            let size3 = NonZeroU8::new((*size >> 16) as u8);
                            deltified_representation.push(
                                0b1000_0000
                                    | if size3.is_some() { 0b0100_0000 } else { 0 }
                                    | if size2.is_some() { 0b0010_0000 } else { 0 }
                                    | if size1.is_some() { 0b0001_0000 } else { 0 }
                                    | if offset4.is_some() { 0b0000_1000 } else { 0 }
                                    | if offset3.is_some() { 0b0000_0100 } else { 0 }
                                    | if offset2.is_some() { 0b0000_0010 } else { 0 }
                                    | if offset1.is_some() { 0b0000_0001 } else { 0 },
                            );
                            deltified_representation.extend(
                                vec![offset1, offset2, offset3, offset4, size1, size2, size3]
                                    .into_iter()
                                    .filter_map(|v| v)
                                    .map(NonZeroU8::get),
                            );
                        }
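                        // Editorial worked example: offset = 0, size = 200 encodes
                        // as [0x90, 0xc8]: the command byte sets only the size1 bit
                        // (0b1001_0000), and the single payload byte 0xc8 = 200.
                        // Zero bytes of offset/size are omitted entirely, which is
                        // why the NonZeroU8 wrappers above drive both the bitmask
                        // and the payload bytes.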
                        PackFileDeltaInstruction::AddNewData(new_data) => {
                            // The "add new data" instruction encoding, documented
                            // [here](https://git-scm.com/docs/pack-format#_instruction_to_add_new_data).
                            // FIXME: is the length limit for this instruction actually 127?
                            // It seems like it would be impossible to encode a length of more than 127 with
                            // the documented format, but that seems surprising. Maybe it's supposed to use the
                            // variable-length encoding described in other places?
                            // In any case, this tool only uses the instruction with sizes less than 127 anyway.
                            debug_assert!((1..=127).contains(&new_data.len()));
                            deltified_representation.push(new_data.len() as u8);
                            deltified_representation.extend(new_data);
                        }
                    }
                }
                deltified_representation
            }
        };

        // Append the object type and object size. The git pack format documentation specifies that this should
        // be "3-bit type, (n-1)*7+4-bit length", but it underspecifies how exactly these bits need to be arranged.
        // From viewing the git source code: the first byte always has a 1 as the most significant bit, followed
        // by the 3 bits of the object type, followed by the four least significant bits of the encoded object size
        // (measured before any compression is applied). Then the remaining bits of the encoded object size are
        // appended in the documented format for variable-length sizes.
        current_object.push(0x80 | (object_type << 4) | (encoded_object.len() & 0xf) as u8);
        append_variable_length_size(&mut current_object, encoded_object.len() >> 4)?;

        if let PackFileObject::Deltified { base_oid, .. } = object {
            if let Some((previous_position, _)) = object_positions.get(&base_oid) {
                // For "offset delta" objects, append the relative offset of the delta base.
                let offset = current_object_position - *previous_position;
                append_variable_length_size_with_continuation_increment(
                    &mut current_object,
                    offset,
                );
            } else {
                // For "ref delta" objects, append the OID of the delta base.
                current_object.extend(&base_oid);
            }
        }
        // Append the encoded object data, with maximum compression.
        let mut encoder = ZlibEncoder::new(&mut current_object, Compression::best());
        encoder.write_all(&encoded_object)?;
        encoder.finish()?;

        object_positions.insert(
            oid,
            (
                current_object_position,
                // The git pack format documentation specifies that the index file needs to include the CRC32
                // of each object, but doesn't specify which CRC32 table to use. Empirically, it seems like git
                // uses the IEEE CRC32 table.
                crc32::checksum_ieee(&current_object),
            ),
        );
        current_position += current_object.len();
        pack.write_all(&current_object)?;
        current_object.clear();

        object_counts_by_first_byte[oid[0] as usize] += 1;
    }

    let mut pack_file = pack.into_inner()?;

    // Now that all the objects have been added to the packfile, insert the correct object count into the header
    pack_file.seek(SeekFrom::Start(8))?;
    pack_file.write_all(&(object_positions.len() as u32).to_be_bytes())?;

    // Add the sha1 pack checksum to the end of the packfile
    pack_file.seek(SeekFrom::Start(0))?;
    let mut pack_hasher = Sha1::new();
    copy(&mut pack_file, &mut pack_hasher)?;
    let pack_checksum = pack_hasher.finalize();
    pack_file.write_all(&pack_checksum)?;

    pack_file.sync_all()?;
    drop(pack_file);

    // At this point, the packfile is complete and we're finished processing commits, but we still need to
    // generate an index file. Version-2 index files are needed because the packfile is generally bigger than
    // 2**32 bytes.
    let mut index = BufWriter::new(
        OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(".git/objects/pack/pack-every-shorthash.idx")?,
    );

    // --- Start of index file header ---

    // 4-byte "magic number"
    index.write_all(b"\xfftOc")?;

    // 4-byte version number
    index.write_all(&2u32.to_be_bytes())?;

    // --- End of index file header ---

    // 256-entry "fanout table": entry k is the cumulative number of objects in the packfile
    // whose OID's first byte is less than or equal to k.
    let mut num_objects: u32 = 0;
    for &count_with_first_byte_equal in object_counts_by_first_byte.iter() {
        num_objects += count_with_first_byte_equal;
        index.write_all(&num_objects.to_be_bytes()[..])?;
    }
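    // Editorial note: because the entries are cumulative, entry 255 equals the total
    // object count, and a reader can use adjacent entries to binary-search just the
    // slice of the sorted OID table that shares a given first byte.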
    // At this point, we need to iterate over the objects, in order of their OID, several times. Using a B-tree
    // is asymptotically optimal for this, but it results in pretty severe cache thrashing, which greatly slows
    // down generating the index file. There is a lot of room for improvement here.
    //
    // The reason we use a B-tree in the first place, rather than just accumulating a list of
    // (oid, position, checksum) tuples and sorting it afterwards, is that the current API allows an object
    // to be specified as a delta from any other object by OID, and we need to be able to fetch the position
    // of the delta base before we've obtained or sorted the whole list. This API is also intended to be generic
    // (in that it can generate packfiles of arbitrary objects, not just the objects generated in main.rs). But in
    // reality, all deltified commits that are passed to this API have the same delta base, so this issue could be
    // avoided by exposing a more specialized API.
    //
    // Another way to avoid the issue would be to only iterate over the B-tree once, and write to several different
    // parts of the file simultaneously using multiple file descriptors.

    // All of the OIDs, in lexicographic order
    for oid in object_positions.keys() {
        index.write_all(oid)?;
    }

    // CRC32 checksums of the packed object data
    for (_, checksum) in object_positions.values() {
        index.write_all(&checksum.to_be_bytes())?;
    }

    let mut num_big_offsets = 0u32;
    // Table of 4-byte object offsets
    for (position, _) in object_positions.values() {
        if *position < 0x80_00_00_00 {
            index.write_all(&(*position as u32).to_be_bytes())?;
        } else {
            // Offsets of 2 GiB or more don't fit in 31 bits; store a flagged index into
            // the table of 8-byte offsets below instead.
            index.write_all(&(0x80_00_00_00 | num_big_offsets).to_be_bytes())?;
            num_big_offsets += 1;
        }
    }

    // Table of 8-byte object offsets
    for (position, _) in object_positions.values() {
        // FIXME: might be faster to have two separate cursors writing to the file rather than iterating over
        // the B-tree twice
        if *position >= 0x80_00_00_00 {
            index.write_all(&(*position as u64).to_be_bytes())?;
        }
    }

    // Add a copy of the pack file checksum
    index.write_all(&pack_checksum)?;

    let mut index_file = index.into_inner()?;

    // Add the sha1 index checksum to the end of the index file
    index_file.seek(SeekFrom::Start(0))?;
    let mut index_hasher = Sha1::new();
    copy(&mut index_file, &mut index_hasher)?;
    index_file.write_all(&index_hasher.finalize())?;

    index_file.sync_all()?;

    // Deallocating the B-tree of object positions is very, very slow. It's a really big B-tree that has lots of
    // individual allocations. Deallocating the B-tree is also completely unnecessary if the process is about to
    // exit, serving only to add hours to the runtime for no reason. So the B-tree is included as a private
    // returned struct field, and the caller can explicitly leak the struct rather than dropping it if needed.
    Ok(PackFile { object_positions })
}

/// Appends a "size-encoded" non-negative integer to packfile data, using the
/// encoding format specified [here](https://git-scm.com/docs/pack-format#:~:text=Size%20encoding).
fn append_variable_length_size<T: Write>(mut data: T, mut size: usize) -> io::Result<()> {
    loop {
        let next_seven_bits = (size & 0x7f) as u8;
        size >>= 7;
        if size == 0 {
            data.write_all(&[next_seven_bits])?;
            break;
        } else {
            data.write_all(&[next_seven_bits | 0x80])?;
        }
    }
    Ok(())
}
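// Editorial worked example: size = 300 (binary 100101100) is emitted little-endian
// in 7-bit groups as [0xac, 0x02]:
//   0xac = 0x80 | 0b010_1100  (low seven bits, continuation bit set)
//   0x02 = 0b10               (remaining bits, no continuation bit)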
/// Packfiles use a slightly different variable-length size encoding for delta offsets
/// than they do for other values. This modified encoding is entirely undocumented, but it is
/// necessary to generate a packfile that git will understand.
/// [This blogpost](https://medium.com/@concertdaw/sneaky-git-number-encoding-ddcc5db5329f) contains
/// some more information.
fn append_variable_length_size_with_continuation_increment(data: &mut Vec<u8>, mut size: usize) {
    let initial_index = data.len();
    data.push((size & 0x7f) as u8);
    size >>= 7;
    while size > 0 {
        // The "increment": each higher 7-bit group is decremented by one before being
        // prepended, because the decoder adds one back for every continuation byte.
        size -= 1;
        data.insert(initial_index, 0x80 | (size as u8 & 0x7f));
        size >>= 7;
    }
}
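// Editorial worked example: offset 128 encodes as [0x80, 0x00] here, whereas the plain
// size encoding above would produce [0x80, 0x01]. Decoding [0x80, 0x00] as
// ((0 + 1) << 7) | 0 recovers 128, matching git's ofs-delta decoder.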
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
mod git_utils;

use git_utils::{
    stream_to_pack_file, GitObject, GitObjectType, PackFileDeltaInstruction, PackFileObject,
    Sha1Oid,
};
use sha1::{Digest, Sha1};
use std::{convert::TryInto, iter::FusedIterator, mem::forget};

#[repr(transparent)]
struct PackedBoolArray {
    data: Vec<u8>,
}

impl PackedBoolArray {
    const MASKS: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];

    #[inline(always)]
    fn offset(index: usize) -> usize {
        index / 8
    }

    #[inline(always)]
    fn mask(index: usize) -> u8 {
        Self::MASKS[index % 8]
    }

    fn get(&self, index: usize) -> bool {
        self.data[Self::offset(index)] & Self::mask(index) != 0
    }

    fn set(&mut self, index: usize, value: bool) {
        if value {
            self.data[Self::offset(index)] |= Self::mask(index);
        } else {
            self.data[Self::offset(index)] &= !Self::mask(index);
        }
    }

    // A shorthash is seven hex characters, i.e. the first 28 bits of the hash, so the
    // bitset index is the hash's first 32 bits shifted right by 4.
    #[inline(always)]
    fn hash_to_shorthash_index(hash: &Sha1Oid) -> usize {
        (u32::from_be_bytes(hash[0..4].try_into().unwrap()) >> 4) as usize
    }
}

impl Default for PackedBoolArray {
    fn default() -> Self {
        // 2^28 bits = 2^25 bytes (32 MiB), one bit per possible shorthash.
        Self {
            data: vec![0; 1 << 25],
        }
    }
}

fn main() -> std::io::Result<()> {
    let empty_tree = GitObject {
        object_type: GitObjectType::Tree,
        data: vec![],
    };
    let delta_base_commit = GitObject {
        object_type: GitObjectType::Commit,
        data: b"\
            tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
            author Teddy Katz 1616279625 -0400\n\
            committer Teddy Katz 1616279625 -0400\n\
            \n\
            Entropy value for this commit: "
            .to_vec(),
    };
    // SHA-1 processes 64-byte blocks and needs at least 9 bytes of padding; keeping the
    // template's final block at 47 bytes or fewer means it still fits in a single block
    // after the 8-character entropy suffix is appended (47 + 8 = 55 <= 55).
    let last_block_length = (delta_base_commit.data.len()
        + format!("commit {}\0", delta_base_commit.data.len()).len())
        % 64;
    assert!(
        (0..=47).contains(&last_block_length),
        "suboptimal commit length {}; hashing would be twice as slow",
        last_block_length
    );
    let mut found_shorthashes = PackedBoolArray::default();
    let delta_base_commit_oid = delta_base_commit.oid();
    found_shorthashes.set(
        PackedBoolArray::hash_to_shorthash_index(&delta_base_commit_oid),
        true,
    );

    let deltified_generator = DeltifiedCommitGenerator {
        delta_base_commit: delta_base_commit.clone(),
        delta_base_commit_oid,
        found_shorthashes,
        root_commit_oid_buffer: vec![delta_base_commit_oid],
        merge_commit_oid_buffer: vec![],
        delta_base_commit_extension_length: 8,
        delta_base_commit_intermediate_sha1_state: Sha1::new()
            .chain(format!("commit {}\0", delta_base_commit.data.len() + 8).as_bytes())
            .chain(&delta_base_commit.data),
        entropy_specifier: 0,
        commit_count_cap: usize::MAX,
        is_finished: false,
    };

    let pack_file = stream_to_pack_file(
        vec![
            PackFileObject::Raw(empty_tree),
            PackFileObject::Raw(delta_base_commit),
        ]
        .into_iter()
        .chain(deltified_generator),
    )?;

    // Avoid running the destructor for the metadata, since it takes a very long time to clean up and
    // we're about to exit the process anyway.
    forget(pack_file);

    Ok(())
}

struct DeltifiedCommitGenerator {
    delta_base_commit: GitObject,
    delta_base_commit_oid: Sha1Oid,
    found_shorthashes: PackedBoolArray,
    root_commit_oid_buffer: Vec<Sha1Oid>,
    merge_commit_oid_buffer: Vec<Sha1Oid>,
    delta_base_commit_extension_length: usize,
    delta_base_commit_intermediate_sha1_state: Sha1,

    // Due to https://en.wikipedia.org/wiki/Coupon_collector%27s_problem, we expect to need
    // 2**28 * (ln(2**28) + 0.577) = 2**32.3 total commits to find all 2**28 unique shorthashes,
    // which is over the threshold of 2**32 32-bit ints.
    entropy_specifier: u64,
    commit_count_cap: usize,
    is_finished: bool,
}

impl DeltifiedCommitGenerator {
    fn get_entropy(&self) -> String {
        if self.delta_base_commit_extension_length == 8 {
            format!("{:08x}", self.entropy_specifier)
        } else {
            format!("{:016x}", self.entropy_specifier)
        }
    }
}

fn create_merge_commit(parent_oids: &[Sha1Oid]) -> GitObject {
    GitObject {
        object_type: GitObjectType::Commit,
        data: format!(
            "\
            tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
            {}\
            author Teddy Katz 1616279625 -0400\n\
            committer Teddy Katz 1616279625 -0400\n\
            \n\
            Merge of {} commits\n",
            parent_oids
                .iter()
                .map(|oid| {
                    format!(
                        "parent {}\n",
                        oid.iter()
                            .map(|&byte| format!("{:02x}", byte))
                            .collect::<String>()
                    )
                })
                .collect::<String>(),
            parent_oids.len()
        )
        .as_bytes()
        .to_vec(),
    }
}
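// Editorial note: each parent line above is 48 bytes ("parent " + 40 hex characters
// + "\n"), so a merge of 2^14 parents has a body of roughly 786 KB of mostly hex
// text — which is why the merge commits compress so well in the README's numbers.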
impl Iterator for DeltifiedCommitGenerator {
    type Item = PackFileObject;
    fn next(&mut self) -> Option<Self::Item> {
        if self.is_finished
            || self.merge_commit_oid_buffer.len() * (1 << 14) + self.root_commit_oid_buffer.len()
                > self.commit_count_cap
        {
            return None;
        }

        if self.merge_commit_oid_buffer.len() >= 1 << 14 {
            self.is_finished = true;
            let final_merge = create_merge_commit(&self.merge_commit_oid_buffer);
            println!(
                "Top-level merge commit: {}",
                final_merge
                    .oid()
                    .iter()
                    .map(|&byte| format!("{:02x}", byte))
                    .collect::<String>()
            );
            println!("Your call is important to us.");
            println!("Please hold while an index file is generated. This will take a while");
            return Some(PackFileObject::Raw(final_merge));
        }

        if self.root_commit_oid_buffer.len() >= 1 << 14 {
            let merge = create_merge_commit(&self.root_commit_oid_buffer);
            self.root_commit_oid_buffer.clear();
            self.merge_commit_oid_buffer.push(merge.oid());
            println!(
                "created first-level merge commit {}/{}",
                self.merge_commit_oid_buffer.len(),
                1 << 14
            );
            return Some(PackFileObject::Raw(merge));
        }

        let new_oid = loop {
            // Once the 32-bit entropy space is exhausted, switch to 16-character entropy
            // strings and recompute the cached intermediate SHA-1 state for the longer
            // commit length.
            if self.entropy_specifier == (u32::MAX as u64) + 1 {
                self.delta_base_commit_extension_length = 16;
                self.delta_base_commit_intermediate_sha1_state = Sha1::new()
                    .chain(
                        format!(
                            "commit {}\0",
                            self.delta_base_commit.data.len()
                                + self.delta_base_commit_extension_length
                        )
                        .as_bytes(),
                    )
                    .chain(&self.delta_base_commit.data);
            }

            let oid = self
                .delta_base_commit_intermediate_sha1_state
                .clone()
                .chain(self.get_entropy().as_bytes())
                .finalize()
                .into();

            if !self
                .found_shorthashes
                .get(PackedBoolArray::hash_to_shorthash_index(&oid))
            {
                break oid;
            }

            self.entropy_specifier += 1;
            if self.entropy_specifier & 0xfffff == 0 {
                println!(
                    "number of commits attempted so far: {}",
                    self.entropy_specifier
                );
            }
        };

        let delta_instructions = vec![
            PackFileDeltaInstruction::CopyFromBaseObject {
                offset: 0,
                size: self.delta_base_commit.data.len(),
            },
            PackFileDeltaInstruction::AddNewData(self.get_entropy().as_bytes().to_vec()),
        ];

        self.found_shorthashes
            .set(PackedBoolArray::hash_to_shorthash_index(&new_oid), true);
        self.entropy_specifier += 1;
        if self.entropy_specifier & 0xfffff == 0 {
            println!(
                "number of commits attempted so far: {}",
                self.entropy_specifier
            );
        }
        self.root_commit_oid_buffer.push(new_oid);

        Some(PackFileObject::Deltified {
            base_oid: self.delta_base_commit_oid,
            base_size: self.delta_base_commit.data.len(),
            delta: delta_instructions,
            new_oid,
            new_size: self.delta_base_commit.data.len() + self.delta_base_commit_extension_length,
        })
    }
}

impl FusedIterator for DeltifiedCommitGenerator {}
--------------------------------------------------------------------------------