├── .github
└── ISSUE_TEMPLATE
│ └── bug.md
├── .gitignore
├── LICENSE
├── README.md
├── banner.png
├── dsieve.go
├── dsieve.png
├── generate-urls.py
├── go.mod
└── go.sum
/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug
3 | about: Issue for bugs
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | #### Problem description
11 |
12 | #### Steps to reproduce
13 |
14 | #### Expected behaviour
15 |
16 | #### Screenshots if applicable
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | env
2 | venv
3 | .idea
4 |
5 | urls-*.txt
6 | dsieve
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Trickest
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
dsieve 
2 | Filter and enrich a list of subdomains by level
3 |
4 | 
5 |
6 | Take a single domain or read an input file and extract unique parent domains, enrich subdomains, filter subdomains by level, or find out which subdomains have the highest number of sub-subdomains (or sub-sub-subdomains or sub-sub-sub...). Dsieve supports any URL format, with or without protocol, port, path, or parameters.
7 |
8 | # Installation
9 | ## Binary
10 | Binaries are available in the [latest release](https://github.com/trickest/dsieve/releases/latest).
11 |
12 | ## Docker
13 | ```
14 | docker run quay.io/trickest/dsieve
15 | ```
16 |
17 | ## From source
18 | ```
19 | go install github.com/trickest/dsieve@latest
20 | ```
21 |
22 | # Usage
23 | ```
24 | -f string
25 | Filter domain level. Use python slice notation to select range.
26 | Example input: foo.bar.baz.tld
27 | -f 3 bar.baz.tld
28 | -f 3: bar.baz.tld, foo.bar.baz.tld
29 | -f 2:4 baz.tld, bar.baz.tld
30 | -f :3 tld, baz.tld
31 | -i string
32 | Input url or domain
33 | -if string
34 | Input file path, one url/domain per line.
35 | -o string
36 | Output file path, optional
37 | -top int
38 | Only consider top X subdomains of a certain level and return all their subdomains
39 | ```
40 |
41 | Domains can be passed through stdin as well.
42 | ```
43 | cat domains.txt | dsieve -f 2
44 | ```
45 |
46 | ### Example
47 | ##### test.txt
48 | ```
49 | a.foo.target.com
50 | b.foo.target.com
51 | c.foo.target.com
52 | a.bar.target.com
53 | b.bar.target.com
54 | a.baz.target.com
55 | ```
56 |
57 | ```shell script
58 | # All levels by default
59 | $ dsieve -if test.txt
60 | a.foo.target.com
61 | foo.target.com
62 | target.com
63 | b.foo.target.com
64 | c.foo.target.com
65 | a.bar.target.com
66 | bar.target.com
67 | b.bar.target.com
68 | a.baz.target.com
69 | baz.target.com
70 |
71 | # Level 2, the main domain
72 | $ dsieve -if test.txt -f 2
73 | target.com
74 |
75 | # Level 3, one level above the main domain
76 | $ dsieve -if test.txt -f 3
77 | foo.target.com
78 | bar.target.com
79 | baz.target.com
80 |
81 | # Levels 2 and above, main domain and all its subdomains
82 | $ dsieve -if test.txt -f 2:
83 | a.foo.target.com
84 | foo.target.com
85 | target.com
86 | b.foo.target.com
87 | c.foo.target.com
88 | a.bar.target.com
89 | bar.target.com
90 | b.bar.target.com
91 | a.baz.target.com
92 | baz.target.com
93 |
94 | # The top one level 3 subdomain with the highest number of sub-subdomains
95 | $ dsieve -if test.txt -f 3 -top 1
96 | foo.target.com
97 |
98 | # The top two level 3 subdomains with the highest number of sub-subdomains
99 | $ dsieve -if test.txt -f 3 -top 2
100 | foo.target.com
101 | bar.target.com
102 | ```
103 |
104 | # Report Bugs / Feedback
105 | We look forward to any feedback you want to share with us. If you're stuck with a problem, you can contact us at [support@trickest.com](mailto:support@trickest.com). You can also create an [Issue](https://github.com/trickest/dsieve/issues/new) or pull request on the GitHub repository.
106 |
107 | # Where does this fit in your methodology?
108 | Dsieve is an integral part of many workflows in the Trickest store. Sign up on [trickest.com](https://trickest.com) to get access to these workflows or build your own from scratch!
109 |
110 | [
](https://trickest.io/auth/register)
111 |
--------------------------------------------------------------------------------
/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trickest/dsieve/44eb7f52a03356406cbbabdb496b5b525ef2b2da/banner.png
--------------------------------------------------------------------------------
/dsieve.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "flag"
6 | "fmt"
7 | "net/url"
8 | "os"
9 | "sort"
10 | "strconv"
11 | "strings"
12 |
13 | "golang.org/x/net/publicsuffix"
14 | )
15 |
// Command-line options and shared state.
//
// The pointer-typed options are nil until main assigns them; the two maps
// are ready to use from program start.
var (
	// inputUrl holds the value of the -i flag (a single URL or domain).
	inputUrl *string
	// inputFilePath holds the value of the -if flag.
	inputFilePath *string
	// filterLevel holds the raw value of the -f flag (e.g. "3", "2:4").
	filterLevel *string
	// outputFilePath holds the value of the -o flag.
	outputFilePath *string
	// filterTLD is set to true in main; the flag that used to control it
	// is commented out.
	filterTLD *bool
	// top holds the value of the -top flag.
	top *int

	// topDomainsPerLevel maps a domain level to a map from domain to the
	// number of times parseUrl produced it; only filled when -top > 0.
	topDomainsPerLevel = map[int]*map[string]int{}
	// topDomainsPerLevelFiltered maps a domain level to the domains main
	// kept for that level after sorting by count and truncating to -top.
	topDomainsPerLevelFiltered = map[int][]string{}
)
26 |
// fail writes text followed by a newline to standard error and terminates
// the process with exit status 1. It never returns.
func fail(text string) {
	fmt.Fprintf(os.Stderr, "%s\n", text)
	os.Exit(1)
}
31 |
32 | func check(err error) {
33 | if err != nil {
34 | fail(err.Error())
35 | }
36 | }
37 |
38 | func parseFilter(filter string) (int, int) {
39 | if filter == "" {
40 | return -1, -1
41 | }
42 | vMin := -1
43 | vMax := -1
44 | var err error
45 | minMax := strings.Split(filter, ":")
46 | if len(minMax) == 1 {
47 | vMin, err = strconv.Atoi(minMax[0])
48 | vMax = vMin
49 | check(err)
50 | } else if len(minMax) == 2 {
51 | if minMax[0] != "" {
52 | vMin, err = strconv.Atoi(minMax[0])
53 | check(err)
54 | }
55 | if minMax[1] != "" {
56 | vMax, err = strconv.Atoi(minMax[1])
57 | check(err)
58 | }
59 | } else {
60 | fail("Invalid filter value: " + filter)
61 | }
62 | return vMin, vMax
63 | }
64 |
65 | func writeResults(domains *[]string) {
66 | _, err := os.Create(*outputFilePath)
67 | check(err)
68 |
69 | if len(*domains) > 0 {
70 | file, _ := os.OpenFile(*outputFilePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
71 | defer file.Close()
72 | writer := bufio.NewWriter(file)
73 | for _, domain := range *domains {
74 | _, _ = fmt.Fprintln(writer, domain)
75 | }
76 | _ = writer.Flush()
77 | }
78 | }
79 |
80 | func parseUrl(rawUrl string, lMin, lMax int) []string {
81 | domains := make([]string, 0)
82 | if !strings.HasPrefix(rawUrl, "http") {
83 | rawUrl = "http://" + rawUrl
84 | }
85 | u, err := url.Parse(rawUrl)
86 | if err != nil {
87 | return domains
88 | }
89 |
90 | domainLevels := strings.Split(u.Host, ".")
91 |
92 | suffixes := make([]string, 0)
93 | eTLD, icann := publicsuffix.PublicSuffix(u.Host)
94 | if icann {
95 | if eTLD != u.Host {
96 | suffixes = append(suffixes, eTLD)
97 | }
98 | }
99 |
100 | if len(suffixes) > 0 {
101 | sort.Slice(suffixes, func(i, j int) bool {
102 | return len(suffixes[i]) > len(suffixes[j])
103 | })
104 | tld := suffixes[0]
105 | tldLength := strings.Count(tld, ".") + 1
106 | domainLevels = domainLevels[:len(domainLevels)-tldLength]
107 | domainLevels = append(domainLevels, tld)
108 | }
109 |
110 | if lMin <= len(domainLevels) {
111 | if lMax == -1 || lMax > len(domainLevels) {
112 | lMax = len(domainLevels)
113 | }
114 | for i := lMax; i > 0 && i >= lMin; i-- {
115 | domain := strings.Join(domainLevels[len(domainLevels)-i:], ".")
116 | if domain != "" {
117 | domains = append(domains, domain)
118 | if *top > 0 {
119 | if topDomainsPerLevel[i] == nil {
120 | levelMap := make(map[string]int)
121 | topDomainsPerLevel[i] = &levelMap
122 | }
123 | levelMap := topDomainsPerLevel[i]
124 | (*levelMap)[domain] = (*levelMap)[domain] + 1
125 | }
126 | }
127 | }
128 | }
129 |
130 | return domains
131 | }
132 |
133 | func main() {
134 | inputUrl = flag.String("i", "", "Input url or domain")
135 | inputFilePath = flag.String("if", "", "Input file path, one url/domain per line.")
136 | filterLevel = flag.String("f", "", "Filter domain level. "+
137 | "Use python slice notation to select range. \nExample input: foo.bar.baz.tld \n"+
138 | " \033[3m-f 3 \033[0m bar.baz.tld \n"+
139 | " \033[3m-f 3: \033[0m bar.baz.tld, foo.bar.baz.tld\n"+
140 | " \033[3m-f 2:4\033[0m baz.tld, bar.baz.tld\n"+
141 | " \033[3m-f :3 \033[0m tld, baz.tld")
142 | outputFilePath = flag.String("o", "", "Output file path, optional")
143 | //filterTLD = flag.Bool("t", true, "Filter invalid domains according to Mozilla's publicsuffix list.")
144 | top = flag.Int("top", 0, "Only consider top X subdomains of a certain level and return all their subdomains")
145 |
146 | flag.Parse()
147 |
148 | // set filterTLD to true by default while removing the flag
149 | t := true
150 | filterTLD = &t
151 |
152 | inputUrls := make([]string, 0)
153 | if *inputUrl != "" {
154 | inputUrls = append(inputUrls, *inputUrl)
155 | }
156 | if *inputFilePath != "" {
157 | inputFile, err := os.Open(*inputFilePath)
158 | check(err)
159 | defer inputFile.Close()
160 | scanner := bufio.NewScanner(inputFile)
161 | for scanner.Scan() {
162 | inputUrls = append(inputUrls, scanner.Text())
163 | }
164 | }
165 |
166 | // TODO: Consider program main execution to be in goroutine.
167 | if *inputUrl == "" && *inputFilePath == "" {
168 | // Let's get input from Stdin
169 |
170 | // Check for stdin input
171 | stat, _ := os.Stdin.Stat()
172 | if (stat.Mode() & os.ModeCharDevice) != 0 {
173 | fmt.Fprintln(os.Stderr, "No domains detected. Hint: cat domains.txt | dsieve -f 2")
174 | os.Exit(1)
175 | }
176 |
177 | sc := bufio.NewScanner(os.Stdin)
178 |
179 | for sc.Scan() {
180 | inputUrls = append(inputUrls, sc.Text())
181 | }
182 |
183 | }
184 |
185 | // Leaving this in, for incase all/any input process gives un Zero URLs.
186 | if len(inputUrls) == 0 {
187 | flag.PrintDefaults()
188 | fmt.Fprintln(os.Stderr, "\nError: No input.")
189 | os.Exit(1)
190 | }
191 |
192 | lMin, lMax := parseFilter(*filterLevel)
193 | domainMap := make(map[string]bool)
194 | domains := make([]string, 0)
195 | for _, inputURL := range inputUrls {
196 | for _, domain := range parseUrl(inputURL, lMin, lMax) {
197 | if _, dup := domainMap[domain]; !dup {
198 | eTLD, icann := publicsuffix.PublicSuffix(domain)
199 | if *filterTLD {
200 | if icann {
201 | if eTLD != domain {
202 | if *top == 0 {
203 | fmt.Println(domain)
204 | }
205 | domainMap[domain] = true
206 | domains = append(domains, domain)
207 | }
208 | }
209 | } else {
210 | if *top == 0 {
211 | fmt.Println(domain)
212 | }
213 | domainMap[domain] = true
214 | domains = append(domains, domain)
215 | }
216 | }
217 | }
218 | }
219 |
220 | if *top > 0 {
221 | if lMax < 0 {
222 | for lvl := range topDomainsPerLevel {
223 | if lvl > lMax {
224 | lMax = lvl
225 | }
226 | }
227 | }
228 | for i := lMax; i > 0 && i >= lMin; i-- {
229 | domainsForLevel := make([]string, 0)
230 | levelMap := topDomainsPerLevel[i]
231 | if levelMap == nil {
232 | continue
233 | }
234 | for _, domain := range domains {
235 | if _, ok := (*levelMap)[domain]; ok {
236 | domainsForLevel = append(domainsForLevel, domain)
237 | }
238 | }
239 | if len(domainsForLevel) > 0 {
240 | sort.Slice(domainsForLevel, func(i, j int) bool {
241 | return (*levelMap)[domainsForLevel[i]] > (*levelMap)[domainsForLevel[j]]
242 | })
243 | if len(domainsForLevel) >= *top {
244 | domainsForLevel = domainsForLevel[:*top]
245 | }
246 | topDomainsPerLevelFiltered[i] = domainsForLevel
247 | }
248 | }
249 |
250 | maxLevel := 0
251 | for level := range topDomainsPerLevelFiltered {
252 | if level > maxLevel {
253 | maxLevel = level
254 | }
255 | }
256 |
257 | if strings.Contains(*filterLevel, ":") {
258 | if strings.HasSuffix(*filterLevel, ":") {
259 | lvl, err := strconv.Atoi(strings.TrimSuffix(*filterLevel, ":"))
260 | if err != nil {
261 | check(err)
262 | }
263 | maxLevel = lvl
264 | } else {
265 | split := strings.Split(*filterLevel, ":")
266 | lvl, err := strconv.Atoi(split[len(split)-1])
267 | if err != nil {
268 | check(err)
269 | }
270 | maxLevel = lvl - 1
271 | }
272 | filteredDomains := make([]string, 0)
273 | for _, inputURL := range inputUrls {
274 | for _, d := range topDomainsPerLevelFiltered[maxLevel] {
275 | if strings.HasSuffix(inputURL, d) {
276 | filteredDomains = append(filteredDomains, inputURL)
277 | fmt.Println(inputURL)
278 | }
279 | }
280 | }
281 | domains = filteredDomains
282 | } else {
283 | for _, d := range topDomainsPerLevelFiltered[maxLevel] {
284 | fmt.Println(d)
285 | }
286 | domains = topDomainsPerLevelFiltered[maxLevel]
287 | }
288 | }
289 |
290 | if *outputFilePath != "" && len(domains) > 0 {
291 | writeResults(&domains)
292 | }
293 | }
294 |
--------------------------------------------------------------------------------
/dsieve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trickest/dsieve/44eb7f52a03356406cbbabdb496b5b525ef2b2da/dsieve.png
--------------------------------------------------------------------------------
/generate-urls.py:
--------------------------------------------------------------------------------
1 | import random
2 | import sys
3 |
4 | CHARS = "abcdefghijklmnopqrstuvwxyz"
5 | NUMBERS = "0123456789"
6 | ALPHANUM = CHARS + NUMBERS
7 |
8 |
def rand_word(max_len=5):
    """Return a random word: one letter from CHARS followed by 1..max_len
    characters drawn from ALPHANUM."""
    first = random.choice(CHARS)
    extra = random.randint(1, max_len)
    tail = [random.choice(ALPHANUM) for _ in range(extra)]
    return first + "".join(tail)
14 |
15 |
def rand_protocol():
    """Return "http" or "https", chosen at random."""
    schemes = ["http", "https"]
    return random.choice(schemes)
18 |
19 |
def rand_port(none_chance=0.8):
    """Return a random port as a string, or "" when no port is chosen.

    A port is picked only when a uniform draw from [0, 1) exceeds
    none_chance.
    """
    if not random.random() > none_chance:
        return ""
    candidates = ["80", "8000", "8080", "5000", "443", "4200"]
    return random.choice(candidates)
24 |
25 |
def random_times(f, min_times=0, max_times=5, none_chance=0.0) -> list:
    """Call f a random number of times and collect the results.

    If a uniform draw from [0, 1) does not exceed none_chance, f is not
    called and an empty list is returned. Otherwise f is called between
    min_times and max_times times (inclusive).
    """
    if not random.random() > none_chance:
        return []
    return [f() for _ in range(random.randint(min_times, max_times))]
32 |
33 |
def random_url():
    """Build a random URL string.

    The URL always has a scheme and a host of 2-4 random labels plus a TLD
    from a fixed list; a port, a path and query parameters are appended
    only when the corresponding random draws produce them.
    """
    port = rand_port()
    labels = random_times(rand_word, 2, 4)
    labels.append(random.choice(["com", "net", "ch", "jp", "ru", "us", "uk"]))
    path_parts = random_times(rand_word, 1, 4, 0.4)
    param_names = random_times(rand_word, 0, 2)

    pieces = [rand_protocol(), "://", ".".join(labels)]
    if port:
        pieces.append(":" + port)
    if path_parts:
        pieces.append("/" + "/".join(path_parts))
    if param_names:
        # The query is introduced by "?" or, less often, by "/?".
        pieces.append("?" if random.random() > 0.3 else "/?")
        pairs = ["{}={}".format(name, random.randint(0, 10000)) for name in param_names]
        pieces.append("&".join(pairs))
    return "".join(pieces)
49 |
50 |
if __name__ == '__main__':
    # Number of URLs to print: the first CLI argument when it is a valid
    # integer, otherwise 100.
    max_size = 100
    if len(sys.argv) > 1:
        try:
            max_size = int(sys.argv[1])
        except Exception:
            max_size = 100

    for _ in range(max_size):
        print(random_url())
62 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/trickest/dsieve
2 |
3 | go 1.18
4 |
5 | require golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 h1:HVyaeDAYux4pnY+D/SiwmLOR36ewZ4iGQIIrtnuCjFA=
2 | golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
3 |
--------------------------------------------------------------------------------