├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE.md
├── README.md
└── src
    ├── git_utils.rs
    └── main.rs

/.gitignore:
--------------------------------------------------------------------------------
/lucky-commit
.vscode
/target/
**/*.rs.bk
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"

[[package]]
name = "block-buffer"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [
 "generic-array",
]

[[package]]
name = "build_const"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "cpuid-bool"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"

[[package]]
name = "crc"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
dependencies = [
 "build_const",
]

[[package]]
name = "crc32fast"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a"
dependencies = [
 "cfg-if",
]

[[package]]
name = "digest"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [
 "generic-array",
]

[[package]]
name = "every-commit-shorthash"
version = "0.1.0"
dependencies = [
 "crc",
 "flate2",
 "sha-1",
]

[[package]]
name = "flate2"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0"
dependencies = [
 "cfg-if",
 "crc32fast",
 "libc",
 "miniz_oxide",
]

[[package]]
name = "generic-array"
version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817"
dependencies = [
 "typenum",
 "version_check",
]

[[package]]
name = "libc"
version = "0.2.90"
"registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "ba4aede83fc3617411dc6993bc8c70919750c1c257c6ca6a502aed6e0e2394ae" 105 | 106 | [[package]] 107 | name = "miniz_oxide" 108 | version = "0.4.4" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 111 | dependencies = [ 112 | "adler", 113 | "autocfg", 114 | ] 115 | 116 | [[package]] 117 | name = "opaque-debug" 118 | version = "0.3.0" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" 121 | 122 | [[package]] 123 | name = "sha-1" 124 | version = "0.9.4" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "dfebf75d25bd900fd1e7d11501efab59bc846dbc76196839663e6637bba9f25f" 127 | dependencies = [ 128 | "block-buffer", 129 | "cfg-if", 130 | "cpuid-bool", 131 | "digest", 132 | "opaque-debug", 133 | ] 134 | 135 | [[package]] 136 | name = "typenum" 137 | version = "1.13.0" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" 140 | 141 | [[package]] 142 | name = "version_check" 143 | version = "0.9.3" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 146 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "every-commit-shorthash" 3 | version = "0.1.0" 4 | authors = ["Teddy Katz "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | sha-1 = "0.9" 11 | flate2 = "^1.0" 12 | crc = "^1.8" 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | Copyright © 2021 Teddy Katz 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the “Software”), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# every-git-commit-shorthash

A git repository with a commit for *every* seven-character git commit shorthash (all 2^28 of them).

If you have a commit shorthash, or any seven-character hex string, you can find a commit for it here! It's like a dictionary, but much less useful. Also see [lucky-commit](https://github.com/not-an-aardvark/lucky-commit) if you'd like to generate commits with arbitrary shorthashes on the fly instead.

## FAQ

### Where can I see all the commits?

This repository contains code for generating a repository with every shorthash locally.

The repository has so many commits that `git push` hangs and runs out of memory, presumably because it tries to regenerate a packfile on the fly. As a result, there isn't a GitHub-hosted interactive demo. Sorry.

This problem might be tractable in theory. `git` uses the same packfile format for network transport as it does for storage, so it might be possible to convince it to use the packfile from the filesystem directly rather than generating a new one. However, I have some doubts that GitHub would accept the push without timing out or hitting a memory limit somewhere.

If you want to get the commits without running the tool yourself, you can download a pregenerated packfile and index by following the instructions on the [releases page](https://github.com/not-an-aardvark/every-git-commit-shorthash/releases).

### How much space does this take up?

The commits are stored in a 14.7 GB [git packfile](https://git-scm.com/docs/pack-format), as well as an associated 9.35 GB pack index file. This is the result of significant optimization to reduce the file sizes.

Specifically, two major strategies are used:

#### Almost all of the commits are stored as deltas

Git packfiles support a delta format, where a git object is stored as a diff from another git object, then reassembled at runtime. Normally, this delta format is used for file contents (i.e. blobs), so that git doesn't need to store two copies of a file for a one-line change. However, it's supported for all types of git objects, including commits themselves. Git commits contain a lot of metadata, so storing a commit as e.g. "the same as this other commit, plus one space at the end of the commit message" saves a lot of space over inlining the entire commit.
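To make the delta layout concrete, here is a minimal sketch of how the generator expresses each commit, condensed from `src/main.rs` using the `PackFileObject`/`PackFileDeltaInstruction` types in `src/git_utils.rs` (the standalone helper function and its parameters are illustrative, not part of the actual code):

```rust
use git_utils::{PackFileDeltaInstruction, PackFileObject, Sha1Oid};

// Illustrative helper: a commit stored as "the base commit, plus a few bytes
// of unique entropy appended to the end of the commit message".
fn deltified_commit(
    base_oid: Sha1Oid,
    base_data: &[u8],
    new_oid: Sha1Oid,
    entropy: &str,
) -> PackFileObject {
    PackFileObject::Deltified {
        base_oid,
        base_size: base_data.len(),
        delta: vec![
            // Copy the entire base commit verbatim...
            PackFileDeltaInstruction::CopyFromBaseObject {
                offset: 0,
                size: base_data.len(),
            },
            // ...then append the entropy that makes this commit's hash unique.
            PackFileDeltaInstruction::AddNewData(entropy.as_bytes().to_vec()),
        ],
        new_oid,
        new_size: base_data.len() + entropy.len(),
    }
}
```

In the real tool, the delta base is a single hardcoded commit, and the entropy is an 8- or 16-character hex counter.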
#### The commit graph is arranged to take advantage of compression

It's worth noting a design goal of the tool here: it should be possible to place all 2^28 commits in a single git branch, such that they won't be immediately garbage-collected by git.

This design goal requires the commit layout to use more space. Without this requirement, it would be possible to simply create 2^28 root commits (i.e. commits without parents). However, with this requirement, each non-branch-tip commit needs at least one child commit whose body contains `parent <40-character hex commit hash>` in the commit metadata. Since all of the 40-character commit hashes are different, this requires a minimum storage cost of 40 uncompressed bytes per commit, regardless of how the commits are arranged. There's also a flat cost of 20 bytes per commit, since the packfile format inlines each commit hash, along with a few bytes of overhead to specify the delta itself.

With that aside, there is still a significant amount of flexibility in how the commits can be arranged. For example, it would be possible to create a linear commit history of 2^28 commits, or to create 2^28 root commits and one massive merge commit with 2^28 parents, or anything in between.

Empirically, using a linear commit history resulted in an amortized size of 73 compressed bytes per commit in the packfile, whereas using big merge commits resulted in an amortized size of only 46 compressed bytes per commit. This is because giant merge commits are much more easily compressible, since they consist almost entirely of hex characters from the parent hashes.

I suspected git wouldn't really like having a single merge commit with 2^28 parents, so I compromised and created 2^14 merge commits, each with 2^14 root-commit parents, and one top-level merge commit with each of the 2^14 merge commits as parents. It's worth noting that while the merge commits account for only 0.0061% of the total number of commits, they account for about 46% of the total storage space. I think the total size is close to optimal given the requirement that all the commits be reachable, but removing that requirement would result in almost a 50% packfile size reduction.

### How long does it take to generate the commits?

About 5 hours on my laptop (a 2015 MacBook Pro).

The tool was designed for one-time use, so I haven't spent a lot of time optimizing its performance. There is some significant low-hanging fruit:

* Currently, it's single-threaded for simplicity. It could be sped up by a significant factor by using multiple cores or running on a GPU.
* Generating the index file currently involves a lot of cache thrashing, which could be fixed with only a bit of added complexity.

When the tool starts running, the main bottleneck is zlib compression (which is run on each commit, using the maximum compression ratio). This continues to be the main bottleneck until the tool reaches the last million commits or so, at which point SHA-1 throughput becomes the main bottleneck. (For each of the last few commits, the tool has to try a large number of commit possibilities in order to find a shorthash that hasn't already been used.)

Note that the tool is currently very memory-constrained: in order to generate the packfile index, the tool needs to keep track of a sorted index of all of the commit hashes generated so far. As a result of this and a few other pieces of metadata, it uses 11 GB of memory, which is just small enough to run on my laptop. Some plausible-seeming performance improvements would result in running out of memory, and some memory usage improvements (e.g. saving state to the filesystem) could result in slower performance.

### How many commits does the tool need to go through to find 2^28 unique shorthashes?

Due to the [coupon collector's problem](https://en.wikipedia.org/wiki/Coupon_collector%27s_problem), the expected number of commit attempts is 2^28 × (ln(2^28) + 0.577...), or about 5.4 billion.
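As a sanity check on that number (an editorial addition, not part of the tool), the expectation can be computed directly:

```rust
fn main() {
    let n = (1u64 << 28) as f64; // 2^28 possible seven-hex-character shorthashes
    // Coupon collector: E[attempts] = n * H_n ≈ n * (ln n + γ)
    let expected = n * (n.ln() + 0.5772156649);
    println!("expected commit attempts: {:.3e}", expected); // ≈ 5.365e9
}
```

The result (~5.4 × 10^9) matches the ~2^32.3 attempts noted in a comment in `src/main.rs`.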
### Does git actually work in such a big repository?

Sort of. You can view any particular commit and check out files at that commit. (You can run `git checkout` and keyboard-mash seven random hex characters, and it will go to that commit, which is neat.) Anything that requires stepping through history in order, such as `git log`, seems to stall and run out of memory.

### Why would someone want to use this?

¯\\\_(ツ)\_/¯

### No, really, is it useful for anything?

Probably not, but maybe.

I originally created [lucky-commit](https://github.com/not-an-aardvark/lucky-commit), the companion project to this one, as a practical joke. However, it turned out to be [unexpectedly useful for security research](https://blog.teddykatz.com/2019/11/12/github-actions-dos.html) as a way to generate targeted commit hash collisions.

In theory, this project could also be used to [generate commit hash collisions in bulk](https://blog.teddykatz.com/2019/11/12/github-actions-dos.html#:~:text=Making%20every%20shorthash%20collide). However, it seems like using `lucky-commit` to generate a targeted collision would be more useful in almost all circumstances, especially since `lucky-commit` allows you to amend your own commits rather than creating useless commits from scratch. This is particularly true because the branch with bulk commits from this project can't really be pushed anywhere.

### How do I run it?

First, [ensure you have `rustc` and `cargo` installed](https://www.rust-lang.org/tools/install).

Then run:

```bash
$ git clone https://github.com/not-an-aardvark/every-git-commit-shorthash.git
$ cd every-git-commit-shorthash
```

Optionally, update the hardcoded commit templates in `src/main.rs`, e.g. to update the author to yourself or change the commit message.

Then run:

```bash
$ cargo run --release
```
--------------------------------------------------------------------------------
/src/git_utils.rs:
--------------------------------------------------------------------------------
use crc::crc32;
use flate2::{write::ZlibEncoder, Compression};
use sha1::{Digest, Sha1};
use std::{
    collections::BTreeMap,
    fs::OpenOptions,
    io,
    io::{copy, BufWriter, Seek, SeekFrom, Write},
    num::NonZeroU8,
};

pub type Sha1Oid = [u8; 20];

#[derive(Clone, Debug)]
pub enum GitObjectType {
    Commit,
    Tree,
    #[allow(dead_code)]
    Blob,
}

#[derive(Clone, Debug)]
pub struct GitObject {
    pub data: Vec<u8>,
    pub object_type: GitObjectType,
}

impl GitObjectType {
    fn type_name(&self) -> &'static str {
        match self {
            GitObjectType::Commit { .. } => "commit",
            GitObjectType::Tree { .. } => "tree",
            GitObjectType::Blob { .. } => "blob",
        }
    }
}

impl GitObject {
    pub fn oid(&self) -> Sha1Oid {
        // A git object's OID is the SHA-1 of "{type} {length}\0" followed by the object data.
        Sha1::new()
            .chain(format!("{} {}\0", self.object_type.type_name(), self.data.len()).as_bytes())
            .chain(&self.data)
            .finalize()
            .into()
    }
}
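// Editorial note: this preimage matches git's loose-object hashing, e.g. for the
// empty tree, sha1(b"tree 0\0") = 4b825dc642cb6eb9a060e54bf8d69288fbee4904 — the
// well-known empty-tree OID that main.rs hardcodes in its commit templates.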
#[derive(Debug)]
pub enum PackFileDeltaInstruction {
    CopyFromBaseObject { offset: usize, size: usize },
    AddNewData(Vec<u8>),
}

#[derive(Debug)]
pub enum PackFileObject {
    Raw(GitObject),
    Deltified {
        base_oid: Sha1Oid,
        base_size: usize,
        delta: Vec<PackFileDeltaInstruction>,
        new_oid: Sha1Oid,
        new_size: usize,
    },
}

impl PackFileObject {
    pub fn oid(&self) -> Sha1Oid {
        match self {
            Self::Raw(git_object) => git_object.oid(),
            Self::Deltified { new_oid, .. } => *new_oid,
        }
    }
}

#[derive(Debug)]
pub struct PackFile {
    object_positions: BTreeMap<Sha1Oid, (usize, u32)>,
}

/// Generates a git packfile and index file containing the given git objects.
/// The git packfile format is mostly specified [here](https://git-scm.com/docs/pack-format). In a few places
/// noted below, the format documentation is underspecified; this generator is implemented based on a combination
/// of the documented behavior, testing with git itself, and reading the git source code to figure out what it
/// actually accepts.
pub fn stream_to_pack_file<T: IntoIterator<Item = PackFileObject>>(
    iter: T,
) -> io::Result<PackFile> {
    let mut pack = BufWriter::new(
        OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(".git/objects/pack/pack-every-shorthash.pack")?,
    );

    // --- Start of packfile header ---
    // 4-byte signature
    pack.write_all("PACK".as_bytes())?;

    // 4-byte version number
    pack.write_all(&2u32.to_be_bytes())?;

    // 4-byte number of objects (currently initialized to 0; will be filled in afterwards)
    pack.write_all(&[0, 0, 0, 0])?;

    // --- End of packfile header ---
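    // Editorial note: at this point the 12 header bytes on disk are
    //   50 41 43 4b | 00 00 00 02 | 00 00 00 00
    //   "PACK"        version 2     object-count placeholder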
    let mut current_position = 12;
    let mut object_positions = BTreeMap::new();
    let mut object_counts_by_first_byte = [0u32; 256];
    let mut current_object = Vec::new();

    for object in iter {
        let oid = object.oid();
        let current_object_position = current_position;

        // Object type, using the ID values [here](https://git-scm.com/docs/pack-format#:~:text=Object%20types)
        let object_type: u8 = match &object {
            PackFileObject::Raw(GitObject { object_type, .. }) => match object_type {
                GitObjectType::Commit => 1,
                GitObjectType::Tree => 2,
                GitObjectType::Blob => 3,
            },
            PackFileObject::Deltified { base_oid, .. } => {
                // 6 = "offset delta" (base referenced by position in this packfile),
                // 7 = "ref delta" (base referenced by OID)
                if object_positions.contains_key(base_oid) {
                    6
                } else {
                    7
                }
            }
        };

        let encoded_object = match &object {
            // Non-deltified objects have no packfile-specific encoding.
            PackFileObject::Raw(git_object) => git_object.data.clone(),
            PackFileObject::Deltified {
                base_size,
                delta,
                new_size,
                ..
            } => {
                // Deltified objects use the packfile-specific encoding described
                // [here](https://git-scm.com/docs/pack-format#_deltified_representation).
                let mut deltified_representation = Vec::new();
                append_variable_length_size(&mut deltified_representation, *base_size)?;
                append_variable_length_size(&mut deltified_representation, *new_size)?;
                for delta_instruction in delta {
                    match delta_instruction {
                        PackFileDeltaInstruction::CopyFromBaseObject { offset, size } => {
                            // The "copy from base object" instruction encoding, documented
                            // [here](https://git-scm.com/docs/pack-format#_instruction_to_copy_from_base_object)
                            let offset1 = NonZeroU8::new(*offset as u8);
                            let offset2 = NonZeroU8::new((*offset >> 8) as u8);
                            let offset3 = NonZeroU8::new((*offset >> 16) as u8);
                            let offset4 = NonZeroU8::new((*offset >> 24) as u8);
                            let size1 = NonZeroU8::new(*size as u8);
                            let size2 = NonZeroU8::new((*size >> 8) as u8);
                            let size3 = NonZeroU8::new((*size >> 16) as u8);
                            deltified_representation.push(
                                0b1000_0000
                                    | if size3.is_some() { 0b0100_0000 } else { 0 }
                                    | if size2.is_some() { 0b0010_0000 } else { 0 }
                                    | if size1.is_some() { 0b0001_0000 } else { 0 }
                                    | if offset4.is_some() { 0b0000_1000 } else { 0 }
                                    | if offset3.is_some() { 0b0000_0100 } else { 0 }
                                    | if offset2.is_some() { 0b0000_0010 } else { 0 }
                                    | if offset1.is_some() { 0b0000_0001 } else { 0 },
                            );
                            deltified_representation.extend(
                                vec![offset1, offset2, offset3, offset4, size1, size2, size3]
                                    .into_iter()
                                    .filter_map(|v| v)
                                    .map(NonZeroU8::get),
                            );
                        }
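                        // Editorial worked example: offset = 0, size = 200 encodes
                        // as [0x90, 0xc8]: the command byte sets only the size1 bit
                        // (0b1001_0000), and the single payload byte 0xc8 = 200.
                        // Zero bytes of offset/size are omitted entirely, which is
                        // why the NonZeroU8 wrappers above drive both the bitmask
                        // and the payload bytes.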
                        PackFileDeltaInstruction::AddNewData(new_data) => {
                            // The "add new data" instruction encoding, documented
                            // [here](https://git-scm.com/docs/pack-format#_instruction_to_add_new_data).
                            // FIXME: is the length limit for this instruction actually 127?
                            // It seems like it would be impossible to encode a length of more than 127 with
                            // the documented format, but that seems surprising. Maybe it's supposed to use the
                            // variable-length encoding described in other places?
                            // In any case, this tool only uses the instruction with sizes less than 127 anyway.
                            debug_assert!((1..=127).contains(&new_data.len()));
                            deltified_representation.push(new_data.len() as u8);
                            deltified_representation.extend(new_data);
                        }
                    }
                }
                deltified_representation
            }
        };

        // Append the object type and object size. The git pack format documentation specifies that this should
        // be "3-bit type, (n-1)*7+4-bit length", but it underspecifies how exactly these bits need to be arranged.
        // From viewing the git source code: the first byte always has a 1 as the most significant bit, followed
        // by the 3 bits of the object type, followed by the four least significant bits of the encoded object size
        // (measured before any compression is applied). Then the remaining bits of the encoded object size are
        // appended in the documented format for variable-length sizes.
        current_object.push(0x80 | (object_type << 4) | (encoded_object.len() & 0xf) as u8);
        append_variable_length_size(&mut current_object, encoded_object.len() >> 4)?;

        if let PackFileObject::Deltified { base_oid, .. } = object {
            if let Some((previous_position, _)) = object_positions.get(&base_oid) {
                // For "offset delta" objects, append the relative offset of the delta base.
                let offset = current_object_position - *previous_position;
                append_variable_length_size_with_continuation_increment(
                    &mut current_object,
                    offset,
                );
            } else {
                // For "ref delta" objects, append the OID of the delta base.
                current_object.extend(&base_oid);
            }
        }
        // Append the encoded object data, with maximum compression.
        let mut encoder = ZlibEncoder::new(&mut current_object, Compression::best());
        encoder.write_all(&encoded_object)?;
        encoder.finish()?;

        object_positions.insert(
            oid,
            (
                current_object_position,
                // The git pack format documentation specifies that the index file needs to include the CRC32
                // of each object, but doesn't specify which CRC32 table to use. Empirically, it seems like git
                // uses the IEEE CRC32 table.
                crc32::checksum_ieee(&current_object),
            ),
        );
        current_position += current_object.len();
        pack.write_all(&current_object)?;
        current_object.clear();

        object_counts_by_first_byte[oid[0] as usize] += 1;
    }

    let mut pack_file = pack.into_inner()?;

    // Now that all the objects have been added to the packfile, insert the correct object count into the header
    pack_file.seek(SeekFrom::Start(8))?;
    pack_file.write_all(&(object_positions.len() as u32).to_be_bytes())?;

    // Add the sha1 pack checksum to the end of the packfile
    pack_file.seek(SeekFrom::Start(0))?;
    let mut pack_hasher = Sha1::new();
    copy(&mut pack_file, &mut pack_hasher)?;
    let pack_checksum = pack_hasher.finalize();
    pack_file.write_all(&pack_checksum)?;

    pack_file.sync_all()?;
    drop(pack_file);

    // At this point, the packfile is complete and we're finished processing commits, but we still need to
    // generate an index file. Version-2 index files are needed because the packfile is generally bigger than
    // 2**32 bytes.
    let mut index = BufWriter::new(
        OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(".git/objects/pack/pack-every-shorthash.idx")?,
    );

    // --- Start of index file header ---

    // 4-byte "magic number"
    index.write_all(b"\xfftOc")?;

    // 4-byte version number
    index.write_all(&2u32.to_be_bytes())?;

    // --- End of index file header ---

    // 256-entry "fanout table": entry k is the cumulative number of objects in the packfile
    // whose OID's first byte is less than or equal to k.
    let mut num_objects: u32 = 0;
    for &count_with_first_byte_equal in object_counts_by_first_byte.iter() {
        num_objects += count_with_first_byte_equal;
        index.write_all(&num_objects.to_be_bytes()[..])?;
    }
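    // Editorial note: because the entries are cumulative, entry 255 equals the total
    // object count, and a reader can use adjacent entries to binary-search just the
    // slice of the sorted OID table that shares a given first byte.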
    // At this point, we need to iterate over the objects, in order of their OID, several times. Using a B-tree
    // is asymptotically optimal for this, but it results in pretty severe cache thrashing, which greatly slows
    // down generating the index file. There is a lot of room for improvement here.
    //
    // The reason we use a B-tree in the first place, rather than just accumulating a list of
    // (oid, position, checksum) tuples and sorting it afterwards, is that the current API allows an object
    // to be specified as a delta from any other object by OID, and we need to be able to fetch the position
    // of the delta base before we've obtained or sorted the whole list. This API is also intended to be generic
    // (in that it can generate packfiles of arbitrary objects, not just the objects generated in main.rs). But in
    // reality, all deltified commits that are passed to this API have the same delta base, so this issue could be
    // avoided by exposing a more specialized API.
    //
    // Another way to avoid the issue would be to only iterate over the B-tree once, and write to several different
    // parts of the file simultaneously using multiple file descriptors.

    // All of the OIDs, in lexicographic order
    for oid in object_positions.keys() {
        index.write_all(oid)?;
    }

    // CRC32 checksums of the packed object data
    for (_, checksum) in object_positions.values() {
        index.write_all(&checksum.to_be_bytes())?;
    }

    let mut num_big_offsets = 0u32;
    // Table of 4-byte object offsets
    for (position, _) in object_positions.values() {
        if *position < 0x80_00_00_00 {
            index.write_all(&(*position as u32).to_be_bytes())?;
        } else {
            // Offsets of 2 GiB or more don't fit in 31 bits; store a flagged index into
            // the table of 8-byte offsets below instead.
            index.write_all(&(0x80_00_00_00 | num_big_offsets).to_be_bytes())?;
            num_big_offsets += 1;
        }
    }

    // Table of 8-byte object offsets
    for (position, _) in object_positions.values() {
        // FIXME: might be faster to have two separate cursors writing to the file rather than iterating over
        // the B-tree twice
        if *position >= 0x80_00_00_00 {
            index.write_all(&(*position as u64).to_be_bytes())?;
        }
    }

    // Add a copy of the pack file checksum
    index.write_all(&pack_checksum)?;

    let mut index_file = index.into_inner()?;

    // Add the sha1 index checksum to the end of the index file
    index_file.seek(SeekFrom::Start(0))?;
    let mut index_hasher = Sha1::new();
    copy(&mut index_file, &mut index_hasher)?;
    index_file.write_all(&index_hasher.finalize())?;

    index_file.sync_all()?;

    // Deallocating the B-tree of object positions is very, very slow. It's a really big B-tree that has lots of
    // individual allocations. Deallocating the B-tree is also completely unnecessary if the process is about to
    // exit, serving only to add hours to the runtime for no reason. So the B-tree is included as a private
    // returned struct field, and the caller can explicitly leak the struct rather than dropping it if needed.
    Ok(PackFile { object_positions })
}

/// Appends a "size-encoded" non-negative integer to packfile data, using the
/// encoding format specified [here](https://git-scm.com/docs/pack-format#:~:text=Size%20encoding).
fn append_variable_length_size<T: Write>(mut data: T, mut size: usize) -> io::Result<()> {
    loop {
        let next_seven_bits = (size & 0x7f) as u8;
        size >>= 7;
        if size == 0 {
            data.write_all(&[next_seven_bits])?;
            break;
        } else {
            data.write_all(&[next_seven_bits | 0x80])?;
        }
    }
    Ok(())
}
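// Editorial worked example: size = 300 (binary 100101100) is emitted little-endian
// in 7-bit groups as [0xac, 0x02]:
//   0xac = 0x80 | 0b010_1100  (low seven bits, continuation bit set)
//   0x02 = 0b10               (remaining bits, no continuation bit)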
/// Packfiles use a slightly different variable-length size encoding for delta offsets
/// than they do for other values. This modified encoding is entirely undocumented, but it is
/// necessary to generate a packfile that git will understand.
/// [This blogpost](https://medium.com/@concertdaw/sneaky-git-number-encoding-ddcc5db5329f) contains
/// some more information.
fn append_variable_length_size_with_continuation_increment(data: &mut Vec<u8>, mut size: usize) {
    let initial_index = data.len();
    data.push((size & 0x7f) as u8);
    size >>= 7;
    while size > 0 {
        // The "increment": each higher 7-bit group is decremented by one before being
        // prepended, because the decoder adds one back for every continuation byte.
        size -= 1;
        data.insert(initial_index, 0x80 | (size as u8 & 0x7f));
        size >>= 7;
    }
}
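// Editorial worked example: offset 128 encodes as [0x80, 0x00] here, whereas the plain
// size encoding above would produce [0x80, 0x01]. Decoding [0x80, 0x00] as
// ((0 + 1) << 7) | 0 recovers 128, matching git's ofs-delta decoder.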
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
mod git_utils;

use git_utils::{
    stream_to_pack_file, GitObject, GitObjectType, PackFileDeltaInstruction, PackFileObject,
    Sha1Oid,
};
use sha1::{Digest, Sha1};
use std::{convert::TryInto, iter::FusedIterator, mem::forget};

#[repr(transparent)]
struct PackedBoolArray {
    data: Vec<u8>,
}

impl PackedBoolArray {
    const MASKS: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];

    #[inline(always)]
    fn offset(index: usize) -> usize {
        index / 8
    }

    #[inline(always)]
    fn mask(index: usize) -> u8 {
        Self::MASKS[index % 8]
    }

    fn get(&self, index: usize) -> bool {
        self.data[Self::offset(index)] & Self::mask(index) != 0
    }

    fn set(&mut self, index: usize, value: bool) {
        if value {
            self.data[Self::offset(index)] |= Self::mask(index);
        } else {
            self.data[Self::offset(index)] &= !Self::mask(index);
        }
    }

    // A shorthash is seven hex characters, i.e. the first 28 bits of the hash, so the
    // bitset index is the hash's first 32 bits shifted right by 4.
    #[inline(always)]
    fn hash_to_shorthash_index(hash: &Sha1Oid) -> usize {
        (u32::from_be_bytes(hash[0..4].try_into().unwrap()) >> 4) as usize
    }
}

impl Default for PackedBoolArray {
    fn default() -> Self {
        // 2^28 bits = 2^25 bytes (32 MiB), one bit per possible shorthash.
        Self {
            data: vec![0; 1 << 25],
        }
    }
}

fn main() -> std::io::Result<()> {
    let empty_tree = GitObject {
        object_type: GitObjectType::Tree,
        data: vec![],
    };
    let delta_base_commit = GitObject {
        object_type: GitObjectType::Commit,
        data: b"\
            tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
            author Teddy Katz 1616279625 -0400\n\
            committer Teddy Katz 1616279625 -0400\n\
            \n\
            Entropy value for this commit: "
            .to_vec(),
    };
    // SHA-1 processes 64-byte blocks and needs at least 9 bytes of padding; keeping the
    // template's final block at 47 bytes or fewer means it still fits in a single block
    // after the 8-character entropy suffix is appended (47 + 8 = 55 <= 55).
    let last_block_length = (delta_base_commit.data.len()
        + format!("commit {}\0", delta_base_commit.data.len()).len())
        % 64;
    assert!(
        (0..=47).contains(&last_block_length),
        "suboptimal commit length {}; hashing would be twice as slow",
        last_block_length
    );
    let mut found_shorthashes = PackedBoolArray::default();
    let delta_base_commit_oid = delta_base_commit.oid();
    found_shorthashes.set(
        PackedBoolArray::hash_to_shorthash_index(&delta_base_commit_oid),
        true,
    );

    let deltified_generator = DeltifiedCommitGenerator {
        delta_base_commit: delta_base_commit.clone(),
        delta_base_commit_oid,
        found_shorthashes,
        root_commit_oid_buffer: vec![delta_base_commit_oid],
        merge_commit_oid_buffer: vec![],
        delta_base_commit_extension_length: 8,
        delta_base_commit_intermediate_sha1_state: Sha1::new()
            .chain(format!("commit {}\0", delta_base_commit.data.len() + 8).as_bytes())
            .chain(&delta_base_commit.data),
        entropy_specifier: 0,
        commit_count_cap: usize::MAX,
        is_finished: false,
    };

    let pack_file = stream_to_pack_file(
        vec![
            PackFileObject::Raw(empty_tree),
            PackFileObject::Raw(delta_base_commit),
        ]
        .into_iter()
        .chain(deltified_generator),
    )?;

    // Avoid running the destructor for the metadata, since it takes a very long time to clean up and
    // we're about to exit the process anyway.
    forget(pack_file);

    Ok(())
}

struct DeltifiedCommitGenerator {
    delta_base_commit: GitObject,
    delta_base_commit_oid: Sha1Oid,
    found_shorthashes: PackedBoolArray,
    root_commit_oid_buffer: Vec<Sha1Oid>,
    merge_commit_oid_buffer: Vec<Sha1Oid>,
    delta_base_commit_extension_length: usize,
    delta_base_commit_intermediate_sha1_state: Sha1,

    // Due to https://en.wikipedia.org/wiki/Coupon_collector%27s_problem, we expect to need
    // 2**28 * (ln(2**28) + 0.577) = 2**32.3 total commits to find all 2**28 unique shorthashes,
    // which is over the threshold of 2**32 32-bit ints.
    entropy_specifier: u64,
    commit_count_cap: usize,
    is_finished: bool,
}

impl DeltifiedCommitGenerator {
    fn get_entropy(&self) -> String {
        if self.delta_base_commit_extension_length == 8 {
            format!("{:08x}", self.entropy_specifier)
        } else {
            format!("{:016x}", self.entropy_specifier)
        }
    }
}

fn create_merge_commit(parent_oids: &[Sha1Oid]) -> GitObject {
    GitObject {
        object_type: GitObjectType::Commit,
        data: format!(
            "\
            tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n\
            {}\
            author Teddy Katz 1616279625 -0400\n\
            committer Teddy Katz 1616279625 -0400\n\
            \n\
            Merge of {} commits\n",
            parent_oids
                .iter()
                .map(|oid| {
                    format!(
                        "parent {}\n",
                        oid.iter()
                            .map(|&byte| format!("{:02x}", byte))
                            .collect::<String>()
                    )
                })
                .collect::<String>(),
            parent_oids.len()
        )
        .as_bytes()
        .to_vec(),
    }
}
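// Editorial note: each parent line above is 48 bytes ("parent " + 40 hex characters
// + "\n"), so a merge of 2^14 parents has a body of roughly 786 KB of mostly hex
// text — which is why the merge commits compress so well in the README's numbers.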
impl Iterator for DeltifiedCommitGenerator {
    type Item = PackFileObject;
    fn next(&mut self) -> Option<Self::Item> {
        if self.is_finished
            || self.merge_commit_oid_buffer.len() * (1 << 14) + self.root_commit_oid_buffer.len()
                > self.commit_count_cap
        {
            return None;
        }

        if self.merge_commit_oid_buffer.len() >= 1 << 14 {
            self.is_finished = true;
            let final_merge = create_merge_commit(&self.merge_commit_oid_buffer);
            println!(
                "Top-level merge commit: {}",
                final_merge
                    .oid()
                    .iter()
                    .map(|&byte| format!("{:02x}", byte))
                    .collect::<String>()
            );
            println!("Your call is important to us.");
            println!("Please hold while an index file is generated. This will take a while");
            return Some(PackFileObject::Raw(final_merge));
        }

        if self.root_commit_oid_buffer.len() >= 1 << 14 {
            let merge = create_merge_commit(&self.root_commit_oid_buffer);
            self.root_commit_oid_buffer.clear();
            self.merge_commit_oid_buffer.push(merge.oid());
            println!(
                "created first-level merge commit {}/{}",
                self.merge_commit_oid_buffer.len(),
                1 << 14
            );
            return Some(PackFileObject::Raw(merge));
        }

        let new_oid = loop {
            // Once the 32-bit entropy space is exhausted, switch to 16-character entropy
            // strings and recompute the cached intermediate SHA-1 state for the longer
            // commit length.
            if self.entropy_specifier == (u32::MAX as u64) + 1 {
                self.delta_base_commit_extension_length = 16;
                self.delta_base_commit_intermediate_sha1_state = Sha1::new()
                    .chain(
                        format!(
                            "commit {}\0",
                            self.delta_base_commit.data.len()
                                + self.delta_base_commit_extension_length
                        )
                        .as_bytes(),
                    )
                    .chain(&self.delta_base_commit.data);
            }

            let oid = self
                .delta_base_commit_intermediate_sha1_state
                .clone()
                .chain(self.get_entropy().as_bytes())
                .finalize()
                .into();

            if !self
                .found_shorthashes
                .get(PackedBoolArray::hash_to_shorthash_index(&oid))
            {
                break oid;
            }

            self.entropy_specifier += 1;
            if self.entropy_specifier & 0xfffff == 0 {
                println!(
                    "number of commits attempted so far: {}",
                    self.entropy_specifier
                );
            }
        };

        let delta_instructions = vec![
            PackFileDeltaInstruction::CopyFromBaseObject {
                offset: 0,
                size: self.delta_base_commit.data.len(),
            },
            PackFileDeltaInstruction::AddNewData(self.get_entropy().as_bytes().to_vec()),
        ];

        self.found_shorthashes
            .set(PackedBoolArray::hash_to_shorthash_index(&new_oid), true);
        self.entropy_specifier += 1;
        if self.entropy_specifier & 0xfffff == 0 {
            println!(
                "number of commits attempted so far: {}",
                self.entropy_specifier
            );
        }
        self.root_commit_oid_buffer.push(new_oid);

        Some(PackFileObject::Deltified {
            base_oid: self.delta_base_commit_oid,
            base_size: self.delta_base_commit.data.len(),
            delta: delta_instructions,
            new_oid,
            new_size: self.delta_base_commit.data.len() + self.delta_base_commit_extension_length,
        })
    }
}

impl FusedIterator for DeltifiedCommitGenerator {}
--------------------------------------------------------------------------------