├── Dockerfile ├── README.md └── waybackunifier.go /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.8-onbuild 2 | MAINTAINER Mohammed Diaa 3 | 4 | ENTRYPOINT ["app"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WaybackUnifier 2 | 3 | WaybackUnifier allows you to take a look at how a file has ever looked by aggregating all versions of this file, and creating a unified version that contains every line that has ever been in it. 4 | 5 | ### Installation 6 | Go is required. 7 | ``` 8 | go get github.com/mhmdiaa/waybackunifier 9 | cd $GOPATH/src/github.com/mhmdiaa/waybackunifier 10 | go install 11 | ``` 12 | 13 | ### Syntax 14 | ``` 15 | -concurrency int 16 | Number of requests to make in parallel (default 1) 17 | -output string 18 | File to save results in (default "output.txt") 19 | -sub string 20 | list of comma-separated substrings to look for in snapshots (snapshots will only be considered if they contain one of them) (default "Disallow,disallow") 21 | -url string 22 | URL to unify versions of (without protocol prefix) (default "site.com/robots.txt") 23 | ``` 24 | 25 | You can see that the settings are by default suitable for unifying robots.txt files. Feel free to change the value of `-sub` to anything else, or supply an empty string to get all versions of a file without filtering. 
26 | 27 | **Note:** Lines are saved *unordered* for performance reasons -------------------------------------------------------------------------------- /waybackunifier.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "net/http" 10 | "os" 11 | "strings" 12 | "sync" 13 | ) 14 | 15 | var results = struct { 16 | sync.RWMutex 17 | res map[string]struct{} 18 | }{res: make(map[string]struct{})} 19 | 20 | func main() { 21 | url := flag.String("url", "site.com/robots.txt", "URL to unify versions of (without protocol prefix)") 22 | output := flag.String("output", "output.txt", "File to save results in") 23 | concurrency := flag.Int("concurrency", 1, "Number of requests to make in prallel") 24 | substrings := flag.String("sub", "Disallow,disallow", "list of comma-separated substrings to look for in snapshots (snapshots will only be considered if they contnain one of them)") 25 | 26 | flag.Parse() 27 | var subs []string 28 | 29 | for _, sub := range strings.Split(*substrings, ",") { 30 | subs = append(subs, sub) 31 | } 32 | 33 | snapshots, err := getSnapshots(*url) 34 | if err != nil { 35 | log.Fatalf("couldn't get snapshots: %v", err) 36 | } 37 | fmt.Printf("[*] Found %d snapshots", len(snapshots)) 38 | 39 | lim := make(chan bool, *concurrency) 40 | for _, snapshot := range snapshots { 41 | lim <- true 42 | go func(snapshot []string) { 43 | defer func() { <-lim }() 44 | unifySnapshots(snapshot, subs) 45 | if err != nil { 46 | log.Printf("couldn't unify snapshots: %v", err) 47 | } 48 | }(snapshot) 49 | } 50 | 51 | for i := 0; i < cap(lim); i++ { 52 | lim <- true 53 | } 54 | 55 | r := "" 56 | for i := range results.res { 57 | r += i + "\n" 58 | } 59 | f, err := os.Create(*output) 60 | if err != nil { 61 | log.Fatalf("couldn't create output file: %v", err) 62 | } 63 | defer f.Close() 64 | 65 | f.Write([]byte(r)) 66 | } 67 | 68 | func 
unifySnapshots(snapshot []string, subs []string) { 69 | content, err := getContent(snapshot) 70 | if err != nil { 71 | log.Printf("couldn't fetch snapshot: %v", err) 72 | } 73 | if len(subs) > 0 { 74 | foundSub := false 75 | for _, sub := range subs { 76 | if strings.Contains(content, sub) { 77 | foundSub = true 78 | } 79 | } 80 | if !foundSub { 81 | log.Printf("snapshot %s/%s doesn't contain any substring", snapshot[0], snapshot[1]) 82 | } 83 | } 84 | c := strings.Split(content, "\n") 85 | for _, line := range c { 86 | results.Lock() 87 | if line != "" { 88 | results.res[line] = struct{}{} 89 | } 90 | results.Unlock() 91 | } 92 | } 93 | 94 | func getSnapshots(url string) ([][]string, error) { 95 | resp, err := http.Get("https://web.archive.org/cdx/search/cdx?url=" + url + "&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest") 96 | if err != nil { 97 | return nil, fmt.Errorf("coudln't load waybackmachine search results for %s: %v", url, err) 98 | } 99 | defer resp.Body.Close() 100 | 101 | var results [][]string 102 | body, err := ioutil.ReadAll(resp.Body) 103 | if err != nil { 104 | return nil, fmt.Errorf("couldn't read waybackmachine search results for %s: %v", url, err) 105 | } 106 | 107 | err = json.Unmarshal(body, &results) 108 | if err != nil { 109 | return nil, fmt.Errorf("coudln't deserialize JSON response from waybackmachine for %s: %v", url, err) 110 | } 111 | if len(results) == 0 { 112 | return make([][]string, 0), fmt.Errorf("") 113 | } 114 | return results[1:], nil 115 | } 116 | 117 | func getContent(snapshot []string) (string, error) { 118 | timestamp := snapshot[0] 119 | original := snapshot[1] 120 | url := "https://web.archive.org/web/" + timestamp + "if_" + "/" + original 121 | resp, err := http.Get(url) 122 | if err != nil { 123 | return "", fmt.Errorf("couldn't load snapshot for %s/%s: %v", timestamp, original, err) 124 | } 125 | defer resp.Body.Close() 126 | content, err := ioutil.ReadAll(resp.Body) 127 | if err != nil { 128 
// getContent fetches the raw body of a single snapshot. snapshot is a
// (timestamp, original URL) pair as returned by getSnapshots. The "if_"
// flag in the URL presumably requests the archived content without the
// Wayback Machine toolbar markup — confirm against the archive.org docs.
func getContent(snapshot []string) (string, error) {
	// Robustness fix: guard against malformed records instead of panicking
	// on the index expressions below.
	if len(snapshot) < 2 {
		return "", fmt.Errorf("malformed snapshot record: %v", snapshot)
	}
	timestamp := snapshot[0]
	original := snapshot[1]
	url := "https://web.archive.org/web/" + timestamp + "if_" + "/" + original
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("couldn't load snapshot for %s/%s: %v", timestamp, original, err)
	}
	defer resp.Body.Close()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("couldn't read snapshot content for %s/%s: %v", timestamp, original, err)
	}
	return string(content), nil
}