// Reconstructed from a flattened dump of two files, urlx.go and urlx_test.go,
// both in package urlx.
//
// urlx.go imports:      html, net/url, strings, unicode
// urlx_test.go imports: strings, testing
//
// Package urlx extracts host-like tokens from plain text by scanning around
// every '.' character (no regular expressions are involved) and keeps the ones
// that belong to a given domain.

// ---------------------------------------------------------------------------
// urlx.go
// ---------------------------------------------------------------------------

// uriCleaner rewrites the separators and search-operator noise that commonly
// surround host names in scraped text. It is built once because a
// strings.Replacer is immutable and safe to reuse.
var uriCleaner = strings.NewReplacer(
	"u003d", " ",
	"/", " ",
	"\\", " ",
	"-site:", " ",
	"-www", "www",
)

// ExtractSubdomains returns the distinct hosts found in text that belong to
// domain, in order of first appearance.
//
// Candidates are collected by findAllUrls, normalised by handleURI and then
// filtered and de-duplicated by filterByDomain. The result is never nil.
func ExtractSubdomains(text, domain string) (urls []string) {
	var candidates []string
	for _, raw := range findAllUrls(text) {
		candidates = append(candidates, handleURI(raw)...)
	}

	return filterByDomain(candidates, domain)
}

// findAllUrls returns one candidate token per '.' in text: the dot itself,
// extended to the left while isValidRuneBack holds and to the right while
// isValidRuneForward holds.
//
// Tokens are sliced straight out of text. The trimming helpers decode whole
// UTF-8 code points, so non-ASCII letters are kept intact instead of being
// examined byte by byte.
func findAllUrls(text string) (urls []string) {
	// '.' is a single-byte ASCII character and can never occur inside a
	// multi-byte UTF-8 sequence, so a byte scan finds exactly the dots.
	for i := 0; i < len(text); i++ {
		if text[i] != '.' {
			continue
		}

		// Whatever survives the trim is the part that is NOT in the token.
		start := len(strings.TrimRightFunc(text[:i], isValidRuneBack))
		end := len(text) - len(strings.TrimLeftFunc(text[i+1:], isValidRuneForward))

		urls = append(urls, text[start:end])
	}

	return urls
}

// isValidRuneBack reports whether r may appear to the left of a dot inside a
// candidate token: any Unicode letter or number, or one of ":/_-%".
func isValidRuneBack(r rune) bool {
	switch r {
	case ':', '/', '_', '-', '%':
		return true
	}
	return unicode.IsNumber(r) || unicode.IsLetter(r)
}

// isValidRuneForward reports whether r may appear to the right of a dot inside
// a candidate token: everything isValidRuneBack accepts, plus further dots.
func isValidRuneForward(r rune) bool {
	return r == '.' || isValidRuneBack(r)
}

// handleURI turns one raw candidate into zero or more host-like strings.
//
// If u parses as a request URI with a non-empty host, that host (without any
// port) is the only result. Otherwise u is HTML- and query-unescaped, cleaned
// with uriCleaner and split on whitespace.
func handleURI(u string) []string {
	if parsed, err := url.ParseRequestURI(u); err == nil {
		// Hostname drops the ":port" suffix, which would otherwise defeat
		// the domain suffix match. A successful parse without a host (for
		// example a bare path) carries no information, so fall through.
		if host := parsed.Hostname(); host != "" {
			return []string{host}
		}
	}

	u = html.UnescapeString(u)

	// QueryUnescape returns "" on a malformed escape; keep the text we had
	// rather than throwing the whole candidate away.
	if unescaped, err := url.QueryUnescape(u); err == nil {
		u = unescaped
	}

	u = uriCleaner.Replace(u)
	u = suppressLeftChar(u)

	// Fields (rather than Split on " ") drops the empty tokens produced by
	// leading or repeated separators.
	return strings.Fields(u)
}

// suppressLeftChar strips a leading "-" in front of "www", a leading "-site:",
// or, failing both, everything before the first '/'.
func suppressLeftChar(s string) string {
	if strings.HasPrefix(s, "-www") {
		return strings.TrimPrefix(s, "-")
	}

	if strings.HasPrefix(s, "-site:") {
		return strings.TrimPrefix(s, "-site:")
	}

	if slash := strings.IndexByte(s, '/'); slash >= 0 {
		return s[slash:]
	}

	return s
}

// matchesDomain reports whether host is domain itself or a subdomain of it.
//
// The match is made on a label boundary, so "notexample.com" does not match
// "example.com". An empty domain, or one that already starts with '.', keeps
// plain suffix semantics.
func matchesDomain(host, domain string) bool {
	if domain == "" || strings.HasPrefix(domain, ".") {
		return strings.HasSuffix(host, domain)
	}
	return host == domain || strings.HasSuffix(host, "."+domain)
}

// filterByDomain keeps the entries of urls that belong to domain (see
// matchesDomain), removing duplicates and preserving first-seen order.
// The returned slice is empty but non-nil when nothing matches.
func filterByDomain(urls []string, domain string) []string {
	result := []string{}
	seen := make(map[string]struct{}, len(urls))

	for _, u := range urls {
		if !matchesDomain(u, domain) {
			continue
		}
		if _, dup := seen[u]; dup {
			continue
		}
		seen[u] = struct{}{}
		result = append(result, u)
	}

	return result
}

// ---------------------------------------------------------------------------
// urlx_test.go
// ---------------------------------------------------------------------------

// sampleInputs are the single-line inputs shared by the test and benchmarks.
var sampleInputs = []string{
	"http://aa.bb.cc",
	"aa.bb.cc",
	"ftp://aa.bb.cc",
	"aa.bb.cc",
	"aa-bb.cc.dd.cc",
	"1212-aa.bb.345-d.cc",
	"-www.aa.bb.cc",
	"aa.bb.cc.dd/dd.aa.cc",
	"-www.dd.aa.cc",
	"-site:www.dd.aa.cc -site:www.dd.aa.cc",
}

// sampleBlob is the multi-line input used by the BlobOfText benchmarks.
const sampleBlob = `
http://aa.bb.cc
aa.bb.cc
ftp://aa.bb.cc
aa.bb.cc
aa-bb.cc.dd.cc
1212-aa.bb.345-d.cc
-www.aa.bb.cc
aa.bb.cc.dd/dd.aa.cc
-www.dd.aa.cc
-site:www.dd.aa.cc -site:www.dd.aa.cc
`

// TestExtractSubdomains checks that every sample yields at least one "cc"
// host and that each reported host literally occurs in its input.
func TestExtractSubdomains(t *testing.T) {
	for _, test := range sampleInputs {
		t.Logf("Current test: %s", test)
		results := ExtractSubdomains(test, "cc")
		if len(results) == 0 {
			t.Errorf("No result found for test: %s", test)
		}
		for _, result := range results {
			t.Logf("Current result: %s", result)
			if !strings.Contains(test, result) {
				t.Errorf("Domain was incorrect, got: %s, want it contained in: %s.", result, test)
			}
		}
	}
}

// BenchmarkExtractSubdomainsWithCheck measures extraction over the samples
// while also verifying that each result occurs in its input.
func BenchmarkExtractSubdomainsWithCheck(b *testing.B) {
	for n := 0; n < b.N; n++ {
		for _, test := range sampleInputs {
			results := ExtractSubdomains(test, "cc")
			for _, result := range results {
				if !strings.Contains(test, result) {
					b.Errorf("Expected to find a 'cc' domain in '%v'", test)
				}
			}
		}
	}
}

// BenchmarkExtractSubdomains measures extraction over the samples and only
// checks that more than one host comes back per input.
func BenchmarkExtractSubdomains(b *testing.B) {
	for n := 0; n < b.N; n++ {
		for _, test := range sampleInputs {
			results := ExtractSubdomains(test, "cc")
			if !(len(results) > 1) {
				b.Errorf("expected to be more than one result, got '%v'", len(results))
			}
		}
	}
}

// benchmarkBlob measures extraction over sampleBlob doubled `doublings`
// times, i.e. 2^doublings concatenated copies. The input is built before the
// timer is reset so that setup cost is not measured.
func benchmarkBlob(b *testing.B, doublings int) {
	text := strings.Repeat(sampleBlob, 1<<doublings)
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		results := ExtractSubdomains(text, "cc")
		if !(len(results) > 1) {
			b.Errorf("expected to be more than one result, got '%v'", len(results))
		}
	}
}

// Level1 uses the blob as is; LevelN (N >= 2) doubles it N times.

func BenchmarkExtractSubdomainsFromBlobOfTextLevel1(b *testing.B) { benchmarkBlob(b, 0) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel2(b *testing.B) { benchmarkBlob(b, 2) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel3(b *testing.B) { benchmarkBlob(b, 3) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel4(b *testing.B) { benchmarkBlob(b, 4) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel5(b *testing.B) { benchmarkBlob(b, 5) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel6(b *testing.B) { benchmarkBlob(b, 6) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel7(b *testing.B) { benchmarkBlob(b, 7) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel8(b *testing.B) { benchmarkBlob(b, 8) }
func BenchmarkExtractSubdomainsFromBlobOfTextLevel9(b *testing.B) { benchmarkBlob(b, 9) }