├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── src └── main.rs └── stallman.tex /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.aux 3 | *.log 4 | *.pdf 5 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "cfg-if" 13 | version = "1.0.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 16 | 17 | [[package]] 18 | name = "crc32fast" 19 | version = "1.3.2" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" 22 | dependencies = [ 23 | "cfg-if", 24 | ] 25 | 26 | [[package]] 27 | name = "flate2" 28 | version = "1.0.25" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" 31 | dependencies = [ 32 | "crc32fast", 33 | "miniz_oxide", 34 | ] 35 | 36 | [[package]] 37 | name = "miniz_oxide" 38 | version = "0.6.2" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" 41 | dependencies = [ 42 | "adler", 43 | ] 44 | 45 | [[package]] 46 | name = "rust-pdf-hacking" 47 | version = "0.1.0" 48 | dependencies = [ 49 | "flate2", 50 | ] 51 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust-pdf-hacking" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | flate2 = "1.0.25" 10 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Alexey Kutepov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rust PDF Hacking 2 | 3 | Just some experiments with parsing PDF using Rust with minimal dependencies. 4 | 5 | ## Quick Start 6 | 7 | ```console 8 | $ pdflatex stallman.tex 9 | $ cargo run ./stallman.pdf 10 | ``` 11 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::result; 2 | use std::fs::File; 3 | use std::io::Read; 4 | use flate2::read::ZlibDecoder; 5 | 6 | type Result = result::Result; 7 | 8 | #[derive(Debug)] 9 | enum Token<'a> { 10 | Number(i32), 11 | Symbol(&'a str), 12 | Dictionary(usize), 13 | Stream(&'a [u8]), 14 | } 15 | 16 | struct PdfParser<'a> { 17 | content: &'a [u8] 18 | } 19 | 20 | impl<'a> PdfParser<'a> { 21 | fn from_bytes(content: &'a [u8]) -> Self { 22 | Self { content } 23 | } 24 | 25 | fn trim_left_spaces(&mut self) { 26 | let mut index = 0; 27 | while index < self.content.len() && self.content[index].is_ascii_whitespace() { 28 | index += 1; 29 | } 30 | self.content = &self.content[index..]; 31 | } 32 | 33 | fn drop_line(&mut self) { 34 | let mut index = 0; 35 | while index < self.content.len() && self.content[index] != '\n' as u8 { 36 | index += 1; 37 | } 38 | if index < self.content.len() { 39 | self.content = &self.content[index + 1..] 40 | } else { 41 | self.content = &self.content[index..] 42 | } 43 | } 44 | 45 | fn trim_left_spaces_and_comments(&mut self) { 46 | loop { 47 | self.trim_left_spaces(); 48 | if self.content.len() > 0 && self.content[0] == '%' as u8 { 49 | self.drop_line(); 50 | continue; 51 | } else { 52 | break; 53 | } 54 | } 55 | } 56 | 57 | fn chop_brackets(&mut self, bra: &[u8], ket: &[u8]) -> &[u8] { 58 | self.content = &self.content[bra.len()..]; 59 | let mut index = 0; 60 | while index < self.content.len() && !self.content[index..].starts_with(ket) { 61 | index += 1; 62 | } 63 | let bytes = &self.content[0..index]; 64 | if self.content[index..].starts_with(ket) { 65 | self.content = &self.content[index+ket.len()..]; 66 | } else { 67 | self.content = &self.content[index..]; 68 | } 69 | bytes 70 | } 71 | 72 | fn next_token(&mut self) -> Option { 73 | self.trim_left_spaces_and_comments(); 74 | 75 | if self.content.len() == 0 { 76 | return None; 77 | } 78 | 79 | // Number 80 | if self.content[0].is_ascii_digit() { 81 | let mut index = 0; 82 | while index < self.content.len() && self.content[index].is_ascii_digit() { 83 | index += 1; 84 | } 85 | let number = std::str::from_utf8(&self.content[0..index]) 86 | .expect("sequence of ASCII digits to be a correct UTF-8 string") 87 | .parse() 88 | .expect("that the sequence will fit within the limits of i32, but we don't know for sure"); 89 | self.content = &self.content[index..]; 90 | return Some(Token::Number(number)); 91 | } 92 | 93 | // Dictionary 94 | if self.content.starts_with(b"<<") { 95 | return Some(Token::Dictionary(self.chop_brackets(b"<<", b">>").len())) 96 | } 97 | 98 | // Stream 99 | if self.content.starts_with(b"stream\n") { 100 | return Some(Token::Stream(self.chop_brackets(b"stream\n", b"\nendstream"))); 101 | } 102 | 103 | // Symbol 104 | if self.content[0].is_ascii_alphabetic() { 105 | let mut index = 0; 106 | while index < self.content.len() && self.content[index].is_ascii_alphanumeric() { 107 | index += 1; 108 | } 109 | let symbol = std::str::from_utf8(&self.content[0..index]) 110 | .expect("sequence of ASCII alphanumerics to be a correct UTF-8 string"); 111 | self.content = &self.content[index..]; 112 | return Some(Token::Symbol(symbol)); 113 | } 114 | 115 | unreachable!("Unknown object") 116 | } 117 | } 118 | 119 | fn main() -> Result<()> { 120 | let mut args = std::env::args(); 121 | let program = args.next().expect("Program is always provided"); 122 | let file_path = args.next().ok_or_else(|| { 123 | eprintln!("Usage: {program} "); 124 | eprintln!("ERROR: no input was provided"); 125 | })?; 126 | let mut content = Vec::new(); 127 | File::open(&file_path) 128 | .and_then(|mut file| file.read_to_end(&mut content)) 129 | .map_err(|err| { 130 | eprintln!("ERROR: could not read file {file_path}: {err}"); 131 | })?; 132 | let mut pdf_parser = PdfParser::from_bytes(&content); 133 | 134 | while let Some(token) = pdf_parser.next_token() { 135 | if let Token::Stream(bytes) = token { 136 | let mut d = ZlibDecoder::new(bytes); 137 | let mut s = String::new(); 138 | match d.read_to_string(&mut s) { 139 | Ok(_) => println!("{s}"), 140 | Err(err) => { 141 | eprintln!("{err}"); 142 | match std::str::from_utf8(&bytes[0..8]) { 143 | Ok(s) => println!("{s}"), 144 | Err(err) => eprintln!("{err}"), 145 | } 146 | } 147 | } 148 | println!("------------------------------"); 149 | } 150 | } 151 | 152 | Ok(()) 153 | } 154 | -------------------------------------------------------------------------------- /stallman.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \begin{document} 3 | \section{Stallman} 4 | 5 | I'd just like to interject for a moment. What you're refering to as 6 | Linux, is in fact, GNU/Linux, or as I've recently taken to calling it, 7 | GNU plus Linux. Linux is not an operating system unto itself, but 8 | rather another free component of a fully functioning GNU system made 9 | useful by the GNU corelibs, shell utilities and vital system 10 | components comprising a full OS as defined by POSIX. 11 | 12 | Many computer users run a modified version of the GNU system every 13 | day, without realizing it. Through a peculiar turn of events, the 14 | version of GNU which is widely used today is often called Linux, and 15 | many of its users are not aware that it is basically the GNU system, 16 | developed by the GNU Project. 17 | 18 | There really is a Linux, and these people are using it, but it is just 19 | a part of the system they use. Linux is the kernel: the program in the 20 | system that allocates the machine's resources to the other programs 21 | that you run. The kernel is an essential part of an operating system, 22 | but useless by itself; it can only function in the context of a 23 | complete operating system. Linux is normally used in combination with 24 | the GNU operating system: the whole system is basically GNU with Linux 25 | added, or GNU/Linux. All the so-called Linux distributions are really 26 | distributions of GNU/Linux! 27 | \end{document} --------------------------------------------------------------------------------