├── .gitignore ├── README.md ├── go.mod ├── go.sum └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | .idea 3 | out* 4 | dist/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Diff URLs 2 | 3 | Remove duplicate URLs by retaining only the unique combinations of hostname, path, and parameter names. 4 | 5 | ## Install 6 | 7 | ```bash 8 | go install github.com/j3ssie/durl@latest 9 | ``` 10 | 11 | ## Usage 12 | 13 | ```bash 14 | # basic usage 15 | cat wayback_urls.txt | durl | tee differ-urls.txt 16 | 17 | # with extra regex 18 | cat wayback_urls.txt | durl -e 'your-regex-here' | tee differ-urls.txt 19 | 20 | # only get the scope domain 21 | cat spider-urls.txt | durl -t 'target.com' | tee in-scope-url.txt 22 | 23 | # parse JSONL data 24 | cat large-jsonl-data.txt | durl -t 'target.com' -f url | tee in-scope-jsonl-data.txt 25 | ``` 26 | 27 | ## Covered cases 28 | 29 | The following examples illustrate the criteria used to ensure each URL is considered unique and listed only once: 30 | 31 | 1. URLs with the same hostname, path, and parameter names 32 | 33 | ``` 34 | http://sample.example.com/product.aspx?productID=123&type=customer 35 | http://sample.example.com/product.aspx?productID=456&type=admin 36 | ``` 37 | 38 | 2. Paths indicating static content like blog, news or calendar. 39 | 40 | ``` 41 | https://www.example.com/cn/news/all-news/public-1.html 42 | https://www.sample.com/de/about/business/countrysites.htm 43 | https://www.sample.com/de/about/business/very-long-string-here-that-exceed-100-char.htm 44 | https://www.sample.com/de/blog/2022/01/02/blog-title.htm 45 | ``` 46 | 47 | 3. URLs with numeric variations 48 | 49 | ``` 50 | https://www.example.com/data/0001.html 51 | https://www.example.com/data/0002.html 52 | ``` 53 | 54 | 4. 
Static file will be ignore like `http://example.com.com/cdn-cgi/style.css` 55 | 56 | 5. Select a url JSON field from the input then filtering with all of the cases above. -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/j3ssie/durl 2 | 3 | go 1.22.1 4 | 5 | require github.com/valyala/fastjson v1.6.4 // indirect 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ= 2 | github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY= 3 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "crypto/sha1" 6 | "flag" 7 | "fmt" 8 | "net/url" 9 | "os" 10 | "regexp" 11 | "sort" 12 | "strings" 13 | 14 | "github.com/valyala/fastjson" 15 | ) 16 | 17 | // Strip out similar URLs by unique hostname-path-paramName and some other noise pattern 18 | // cat urls.txt | durl 19 | // only grep url have parameter 20 | // cat urls.txt | durl -p 21 | 22 | var ( 23 | excludeStatic bool 24 | excludeNoise bool 25 | haveParam bool 26 | handleJson bool 27 | limit int 28 | ext string 29 | targetScope string 30 | jsonField string 31 | ) 32 | 33 | func main() { 34 | // cli aguments 35 | flag.BoolVar(&excludeStatic, "s", true, "Exclude static files extensions") 36 | flag.BoolVar(&excludeNoise, "n", true, "Exclude noise content pattern like blogspot, calender, etc") 37 | flag.BoolVar(&haveParam, "p", false, "Enable check if input have parameter") 38 | flag.IntVar(&limit, "l", 100, "Limit length of path item (default 100)") 39 | flag.StringVar(&ext, "e", "", 
"Blacklist regex string") 40 | flag.StringVar(&targetScope, "t", "", "Target scope") 41 | flag.StringVar(&jsonField, "f", "", "Field to select in JSON data (only apply for JSON input)") 42 | 43 | flag.Parse() 44 | var p fastjson.Parser 45 | if jsonField != "" { 46 | handleJson = true 47 | } 48 | 49 | data := make(map[string]string) 50 | hostMapping := make(map[string]string) 51 | sc := bufio.NewScanner(os.Stdin) 52 | for sc.Scan() { 53 | raw := strings.TrimSpace(sc.Text()) 54 | if sc.Err() != nil && raw == "" { 55 | continue 56 | } 57 | 58 | var original string 59 | 60 | // handle json 61 | // check if the input is JSON or not 62 | if jsonField != "" { 63 | v, err := p.Parse(raw) 64 | if err != nil { 65 | continue 66 | } 67 | original = raw 68 | raw = string(v.GetStringBytes(jsonField)) 69 | } 70 | 71 | if excludeStatic { 72 | if IsStaticPattern(raw) { 73 | continue 74 | } 75 | } 76 | 77 | // parsing the URL 78 | u, err := url.Parse(raw) 79 | if err != nil || u.Hostname() == "" { 80 | continue 81 | } 82 | 83 | // check if the url host is in scope or not 84 | if targetScope != "" { 85 | if !strings.Contains(u.Hostname(), targetScope) { 86 | continue 87 | } 88 | } 89 | 90 | hash := hashUrl(u) 91 | if hash == "" { 92 | continue 93 | } 94 | 95 | _, exist := data[hash] 96 | if !exist { 97 | if excludeNoise { 98 | if IsBlackList(raw) { 99 | _, notSeenYet := hostMapping[u.Hostname()] 100 | if !notSeenYet { 101 | hostMapping[u.Hostname()] = raw 102 | fmt.Println(raw) 103 | } 104 | continue 105 | } 106 | } 107 | 108 | if ext != "" { 109 | if !RegexCheck(ext, raw) { 110 | continue 111 | } 112 | } 113 | 114 | if handleJson { 115 | data[hash] = original 116 | fmt.Println(original) 117 | } else { 118 | data[hash] = raw 119 | fmt.Println(data[hash]) 120 | } 121 | 122 | } 123 | } 124 | } 125 | 126 | // IsBlackList check if url is blacklisted or not 127 | func IsBlackList(raw string) bool { 128 | calenderPattern := `(\d{2,4})(-|/)(\d{1,2})(-|/)(\d{1,2})` 129 | if 
RegexCheck(calenderPattern, raw) { 130 | return true 131 | } 132 | 133 | noiseContent := `/(articles|about|blog|event|events|shop|post|posts|product|products|docs|support|pages|media|careers|jobs|video|videos|resource|resources)/.*` 134 | if RegexCheck(noiseContent, raw) { 135 | return true 136 | } 137 | 138 | // e.g: /abc/1234 139 | idContentNoExt := `.*\/[0-9]+$` 140 | if RegexCheck(idContentNoExt, raw) { 141 | return true 142 | } 143 | // e.g: /abc/1234.html 144 | idContent := `.*\/[0-9]+\.[a-z]+` 145 | return RegexCheck(idContent, raw) 146 | } 147 | 148 | func RegexCheck(pattern string, raw string) bool { 149 | r, err := regexp.Compile(pattern) 150 | if err != nil { 151 | return false 152 | } 153 | return r.MatchString(raw) 154 | } 155 | 156 | // IsStaticPattern check if url is blacklisted or not 157 | func IsStaticPattern(raw string) bool { 158 | staticPattern := `(?i)\.(png|apng|bmp|gif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)` 159 | 160 | if RegexCheck(staticPattern, raw) { 161 | return true 162 | } 163 | 164 | // check if have param 165 | if haveParam { 166 | return !RegexCheck(`\?.*\=`, raw) 167 | } 168 | 169 | return false 170 | } 171 | 172 | // hashUrl gen unique hash base on url 173 | func hashUrl(u *url.URL) string { 174 | // length check for path element or seeing too much "-" 175 | if strings.Count(u.Path, "/") >= 1 { 176 | paths := strings.Split(u.Path, "/") 177 | for _, item := range paths { 178 | if len(item) > limit || strings.Count(item, "-") > 3 { 179 | return "" 180 | } 181 | } 182 | } 183 | 184 | var queries []string 185 | for k := range u.Query() { 186 | queries = append(queries, k) 187 | } 188 | sort.Strings(queries) 189 | query := strings.Join(queries, "-") 190 | 191 | data := fmt.Sprintf("%v-%v-%v", u.Hostname(), u.Path, query) 192 | return genHash(data) 193 | } 194 | 195 | // genHash gen SHA1 hash from string 196 | func 
genHash(text string) string { 197 | h := sha1.New() 198 | h.Write([]byte(text)) 199 | hashed := h.Sum(nil) 200 | return fmt.Sprintf("%v", hashed) 201 | } 202 | --------------------------------------------------------------------------------