├── .gitignore ├── .goreleaser.yaml ├── LICENSE ├── README.md ├── go.mod ├── image-1.png └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | *.pdf 18 | generated/ -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # This is an example .goreleaser.yml file with some sensible defaults. 2 | # Make sure to check the documentation at https://goreleaser.com 3 | before: 4 | hooks: 5 | # You may remove this if you don't use go modules. 6 | - go mod tidy 7 | # you may remove this if you don't need go generate 8 | - go generate ./... 
9 | builds: 10 | - env: 11 | - CGO_ENABLED=0 12 | goos: 13 | - linux 14 | - windows 15 | - darwin 16 | archives: 17 | - replacements: 18 | darwin: Darwin 19 | linux: Linux 20 | windows: Windows 21 | 386: i386 22 | amd64: x86_64 23 | checksum: 24 | name_template: 'checksums.txt' 25 | snapshot: 26 | name_template: "{{ incpatch .Version }}-next" 27 | changelog: 28 | sort: asc 29 | filters: 30 | exclude: 31 | - '^docs:' 32 | - '^test:' 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 SerHack 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pdf-diff 2 | A tool for visualizing differences between two pdf files. It is mainly aimed at editors who spend many hours working on several pdf files. 3 | 4 | ![Example of output](https://raw.githubusercontent.com/serhack/pdf-diff/main/image-1.png) 5 | 6 | ## Foreword 7 | 8 | I use [Indesign](https://www.adobe.com/it/products/indesign.html) almost daily, and the pagination and convenient graphical interface make that product number 1 among desktop publishing programs. Indesign, like many other graphics programs, has one flaw: because it is not based on any versioning tool, it is difficult to compare two versions of the same file. 9 | 10 | I sometimes have to retouch the files I produce, be they resumes, books or technical manuals. However, while editing a resume is very easy, editing large volumes is much more difficult. Several times, when sharing the resulting pdfs with my team, we could not clearly visualize the differences between one version and another. This is compounded by human error: with more than 50-60 pages to review, it is impossible to keep track of all the changes between versions! 11 | 12 | Therefore, I used the powerful Go programming language to develop a new tool called `pdf-diff`. Pdf-diff allows you to create images that show exactly where the pdf has changed, thus displaying the changes from one version to another. 13 | 14 | ## How it works 15 | 16 | From a technical point of view, the tool is very simple. Pdf-diff uses pdftoppm to generate a series of images from the pdfs to be compared (one for each page). It then uses a very simple pixel comparison algorithm to draw some red rectangles that display the differences between one pdf and another.
The comparison is based on the RGBA values of each pixel, so it can compare basically any content. The Go program also uses Go's very powerful native image encoding/decoding packages (which I personally was not familiar with!). I was very impressed with what is possible to do with Go in just a few lines of code. 17 | 18 | The images generated from a pdf are stored in a folder named after the hash of the content of the pdf file. E.g. if the file has the hash `fc324..`, the images are in the `fc324..` folder. If a folder with that name already exists, pdf-diff will not create any images since it considers that the images were already generated. 19 | 20 | The code is not very clean and certainly can be optimized. I am asking anyone much more knowledgeable than me in graphics whether it is possible to create a simple algorithm that can apply a background color only locally, and not on the whole row where the pixel is changed. 21 | 22 | ## How to use 23 | 24 | The only requirement for running this tool is the `pdftoppm` program. Based on your operating system or distro, you might want to check the `poppler-utils` package. A command for installing that tool on Ubuntu/Debian distros might be: 25 | 26 | ``` 27 | apt install poppler-utils 28 | ``` 29 | 30 | For [Homebrew](https://brew.sh/) users: 31 | 32 | ```sh 33 | $ brew install pdf-diff 34 | ``` 35 | 36 | ```sh 37 | $ pdf-diff pdf-1.pdf pdf-2.pdf 38 | ``` 39 | 40 | Once run, the images are created in the folder `generated`. 41 | 42 | ### Contact 43 | 44 | If you wish to use this for your project, go ahead. If you have any issues or improvements, feel free to open a new [ISSUE](https://github.com/serhack/pdf-diff/issues). Lastly, if you have a good algorithm to implement or just want to discuss any other tools for editors, you can [email me](mailto:hi@serhack.me). 45 | 46 | #### Donation 47 | 48 | If you think my work contributed a little bit to your projects, goals or company, please let me know.
49 | 50 | Monero: `47VFueCo1yvc6nq688QsBt9UZSrg5z2JLFUwWFs4WtHBSwDsybDbnmLiydo46ybPeqSMxypnjmz5pdz87t4VjngfQfmMd4S` 51 | Bitcoin: `1Pt3YwkFoexAA3s9pV3saoJ2EAXzpqBmrp` 52 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/serhack/pdf-diff 2 | 3 | go 1.18 4 | -------------------------------------------------------------------------------- /image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serhack/pdf-diff/e4277cb22337157d7aa078f5b1957df95fefac5e/image-1.png -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "crypto/sha256" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "image" 9 | "image/color" 10 | "image/png" 11 | "io" 12 | "os" 13 | "os/exec" 14 | "strconv" 15 | ) 16 | 17 | // Pixel holds the 8-bit r, g, b, a channels of one image pixel; modified records whether the highlight colour has already been blended into it. 18 | type Pixel struct { 19 | r, g, b, a uint8 20 | modified bool 21 | } 22 | // rmaster, gmaster and bmaster hold the channels of the highlight colour; they are set by hexToRGB and read by drawSection. 23 | var rmaster, gmaster, bmaster float64 24 | // rgbaToPixel builds a Pixel from the 32-bit channel values returned by a colour's RGBA method, dropping the low 8 bits of each; modified starts as false. 25 | func rgbaToPixel(r uint32, g uint32, b uint32, a uint32) Pixel { 26 | return Pixel{uint8(r >> 8), uint8(g >> 8), uint8(b >> 8), uint8(a >> 8), false} 27 | } 28 | // CreatePNG runs pdftoppm to convert the pdf at PDFPath into PNG images stored in a folder named after the sha256 of the file. It returns without doing anything when that folder already exists and panics when the folder cannot be created. 29 | func CreatePNG(PDFPath string) { 30 | 31 | fmt.Println("Image generation for: " + PDFPath) 32 | 33 | // Computes the sha256 hash of the pdf content; it is used as the name of the folder that holds the generated images 34 | folderName := ComputeSha256(PDFPath) 35 | 36 | // Checks if a folder with the name sha256(file) already exists; if it does, the images are considered already generated 37 | if _, err := os.Stat(folderName); err == nil { 38 | return 39 | } 40 | 41 | // If not, probably we never met this pdf.
Create the folder 42 | err := os.Mkdir(folderName, os.ModePerm) 43 | if err != nil { 44 | panic(err) 45 | } 46 | 47 | // Create the images: pdftoppm writes them into the folder with the "png_gen" file name prefix. NOTE(review): the error returned by Output is discarded and the captured stdout is printed as a raw byte slice; if pdftoppm fails, the folder stays empty and the existence check above will skip generation on the next run - confirm whether this is intended. 48 | cmd, _ := exec.Command("pdftoppm", "-png", PDFPath, folderName+"/png_gen").Output() 49 | fmt.Println(cmd) 50 | } 51 | // RetrievePixel decodes the image stored in fileName and returns its pixels indexed as [y][x], followed by the width and height taken from bounds.Max. It panics when the file cannot be opened or decoded. Rows and columns located before bounds.Min, if any, are left at their zero value. 52 | func RetrievePixel(fileName string) ([][]Pixel, int, int) { 53 | infile, err := os.Open(fileName) 54 | if err != nil { 55 | panic(err) 56 | } 57 | defer infile.Close() 58 | 59 | img, _, err := image.Decode(infile) 60 | if err != nil { 61 | panic(err) 62 | } 63 | 64 | bounds := img.Bounds() 65 | width, height := bounds.Max.X, bounds.Max.Y 66 | pixels := make([][]Pixel, bounds.Max.Y) 67 | for y := bounds.Min.Y; y < height; y++ { 68 | row := make([]Pixel, bounds.Max.X) 69 | for x := bounds.Min.X; x < width; x++ { 70 | row[x] = rgbaToPixel(img.At(x, y).RGBA()) 71 | } 72 | pixels[y] = row 73 | } 74 | return pixels, width, height 75 | } 76 | // drawSection blends the highlight colour (rmaster, gmaster, bmaster) into every pixel of row that is not yet marked as modified, keeping 60% of the original channel and adding 40% of the highlight, then marks the pixel as modified. The alpha channel is left untouched. 77 | func drawSection(row []Pixel) { 78 | alpha := 0.6 79 | notalpha := float64(1 - alpha) 80 | 81 | for i := 0; i < len(row); i++ { 82 | 83 | if !row[i].modified { 84 | row[i].r = uint8(float64(row[i].r)*alpha + notalpha*rmaster) 85 | row[i].g = uint8(float64(row[i].g)*alpha + notalpha*gmaster) 86 | row[i].b = uint8(float64(row[i].b)*alpha + notalpha*bmaster) 87 | row[i].modified = true 88 | } 89 | 90 | } 91 | } 92 | // CompareSingleImage compares the page images path1 and path2 of page number i. Files with the same sha256 are only reported as identical; otherwise every row that contains at least one differing pixel is highlighted in the pixels of the second image and the result is written to generated/image-<i>.png. It panics on any read, create or encode error. NOTE(review): pixel_3 aliases pixel_2, so the second image's pixels are modified in place; the size warning is printed only when both the width and the height differ; the loops index the second image with the first image's dimensions, which looks like it can go out of range when the second image is smaller - confirm. 93 | func CompareSingleImage(path1 string, path2 string, i int) { 94 | 95 | sha1 := ComputeSha256(path1) 96 | sha2 := ComputeSha256(path2) 97 | 98 | // If the two images have the same hash, the two pages are the same.
99 | if sha1 == sha2 { 100 | fmt.Printf("The pages number %d are the same.\n", i) 101 | return 102 | } 103 | 104 | pixel_1, x_1, y_1 := RetrievePixel(path1) 105 | pixel_2, x_2, y_2 := RetrievePixel(path2) 106 | 107 | if x_1 != x_2 { 108 | if y_1 != y_2 { 109 | fmt.Println("Warning: comparing two pdfs that do not have the same dimensions might cause some problems.") 110 | } 111 | } 112 | 113 | pixel_3 := pixel_2 114 | 115 | for y := 0; y < len(pixel_1); y++ { 116 | for x := 0; x < len(pixel_1[y]); x++ { 117 | if !pixel_3[y][x].modified { 118 | result := compareSinglePixel(pixel_1[y][x], pixel_2[y][x]) 119 | if !result { 120 | drawSection(pixel_3[y]) 121 | } 122 | } 123 | } 124 | } 125 | 126 | img := image.NewRGBA(image.Rect(0, 0, x_1, y_1)) 127 | for y := 0; y < y_1; y++ { 128 | for x := 0; x < x_1; x++ { 129 | img.Set(x, y, color.RGBA{ 130 | R: pixel_3[y][x].r, 131 | G: pixel_3[y][x].g, 132 | B: pixel_3[y][x].b, 133 | A: pixel_3[y][x].a, 134 | }) 135 | } 136 | } 137 | 138 | // Create the output file under the "generated" folder; os.Create does not create the folder itself, Compare does that before calling this function 139 | f, err := os.Create("generated/image-" + strconv.Itoa(i) + ".png") 140 | if err != nil { 141 | panic(err) 142 | } 143 | 144 | // Encode the image as PNG; on failure the file is closed before panicking 145 | if err := png.Encode(f, img); err != nil { 146 | f.Close() 147 | panic(err) 148 | } 149 | 150 | if err := f.Close(); err != nil { 151 | panic(err) 152 | } 153 | 154 | } 155 | // compareSinglePixel reports whether image1 and image2 are the same pixel. 156 | func compareSinglePixel(image1 Pixel, image2 Pixel) bool { 157 | // Returns true when all four channels (r, g, b, a) are equal; the modified flag is not compared 158 | if image1.b == image2.b && image1.g == image2.g && image1.r == image2.r && image1.a == image2.a { 159 | return true 160 | } 161 | return false 162 | } 163 | // ComputeSha256 returns the sha256 digest of the content of filePath as a lowercase hexadecimal string. It panics when the file cannot be opened or read. 164 | func ComputeSha256(filePath string) string { 165 | // Computes the hash of any file by streaming its content into the hasher 166 | f, err := os.Open(filePath) 167 | if err != nil { 168 | panic(err) 169 | } 170 | defer f.Close() 171 | 172 | h := sha256.New() 173 | if _, err := io.Copy(h, f); err != nil { 174 | panic(err) 175 | } 176 | 177 | return fmt.Sprintf("%x", h.Sum(nil)) 178 | }
179 | 180 | func Compare(PDF1 string, PDF2 string) { 181 | // Compares the two files 182 | 183 | shaPDF1 := ComputeSha256(PDF1) 184 | shaPDF2 := ComputeSha256(PDF2) 185 | 186 | if _, err := os.Stat("generated"); errors.Is(err, os.ErrNotExist) { 187 | err := os.Mkdir("generated", os.ModePerm) 188 | if err != nil { 189 | panic(err) 190 | } 191 | } 192 | 193 | i := 1 194 | k := 1 195 | for { 196 | // pdftoppm creates pngs and the numbers are padded with a variable numbers of 0. 197 | // e.g. pdf contains <= 99 pages => 01.. 02.. 03.. 198 | // pdf contains <= 999 pages => 001.. 002.. 003 199 | 200 | o := fmt.Sprintf("%d", k) 201 | s := fmt.Sprintf("%0"+o+"d", i) 202 | 203 | s_pdf1 := shaPDF1 + "/png_gen-" + s + ".png" 204 | s_pdf2 := shaPDF2 + "/png_gen-" + s + ".png" 205 | 206 | if _, err := os.Stat(s_pdf1); errors.Is(err, os.ErrNotExist) { 207 | k++ 208 | if k == 12 { 209 | break 210 | } 211 | } else { 212 | CompareSingleImage(s_pdf1, s_pdf2, i) 213 | i++ 214 | } 215 | 216 | } 217 | } 218 | 219 | func hexToRGB(hexcolor string) { 220 | // converts a string to rgb values 221 | values, _ := strconv.ParseUint(hexcolor, 16, 32) 222 | rmaster = float64(values >> 16) 223 | gmaster = float64((values >> 8) & 0xff) 224 | bmaster = float64((values) & 0xff) 225 | 226 | fmt.Printf("Color chosen: %f %f %f \n", rmaster, gmaster, bmaster) 227 | 228 | } 229 | 230 | func main() { 231 | 232 | // flags 233 | 234 | color := flag.String("color", "ff2010", "hex value for the background color for highlighting") 235 | flag.Parse() 236 | 237 | arguments := flag.Args() 238 | 239 | if len(arguments) < 2 { 240 | fmt.Println("pdf-diff: highlights the differences between two pdf files.") 241 | fmt.Println("Usage: pdf-diff pdf-file-1 pdf-file-2 [-color] hex-color") 242 | fmt.Println() 243 | flag.PrintDefaults() 244 | os.Exit(1) 245 | } 246 | 247 | hexToRGB(*color) 248 | CreatePNG(arguments[0]) 249 | CreatePNG(arguments[1]) 250 | Compare(arguments[0], arguments[1]) 251 | 252 | } 253 | 
--------------------------------------------------------------------------------