├── .github └── workflows │ └── go.yml ├── LICENSE ├── README.md ├── go.mod ├── go.sum └── raven.go /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | name: Build and Test 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v2 17 | 18 | - name: Set up Go 19 | uses: actions/setup-go@v2 20 | with: 21 | go-version: 1.22.2 22 | 23 | - name: Build 24 | run: go build -v ./... 25 | 26 | - name: Test 27 | run: go test -v -race ./... 28 | 29 | release: 30 | name: Release Binary 31 | runs-on: ubuntu-latest 32 | needs: build 33 | 34 | steps: 35 | - name: Checkout code 36 | uses: actions/checkout@v2 37 | 38 | - name: Set up Go 39 | uses: actions/setup-go@v2 40 | with: 41 | go-version: 1.22.2 42 | 43 | - name: Build Release Binary 44 | run: go build -o raven 45 | 46 | - name: Compress Binary 47 | run: tar -czvf raven.tar.gz raven 48 | 49 | - name: Upload Release Binary 50 | uses: actions/upload-artifact@v2 51 | with: 52 | name: raven 53 | path: raven.tar.gz 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Yasin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Raven 2 | ![Raven](https://github.com/user-attachments/assets/14ecc6e5-02c4-4767-961c-bc993dd721a8) 3 | Raven is a powerful and customizable web crawler written in Go. It allows you to extract internal and external links from a given website with options for concurrent crawling, depth customization, and maximum URL limits. 4 | ## Features 5 | - Concurrent crawling to maximize efficiency. 6 | - Customizable depth and maximum URL limits to tailor the crawling process to your needs. 7 | - Extraction of both internal and external links for comprehensive analysis. 8 | - Colorful logging for easy debugging and tracking of crawling progress. 9 | - Error handling for fetching URLs to ensure robustness. 10 | 11 | ## Installation 12 | To install Raven, you have three options: 13 | 14 | ⚠️ **Ensure you have Go installed on your system. If not, you can download it from the official Go website.** ⚠️ 15 | 16 | 1. Compiled Version: 17 | [Click Here](https://github.com/Symbolexe/Raven/releases) 18 | 19 | 2. Clone the Raven repository: 20 | 21 | ```git clone https://github.com/Symbolexe/Raven.git``` 22 | 23 | - Navigate to the project directory: 24 | ```cd raven``` 25 | 26 | - Build the project: 27 | 28 | ```go build``` 29 | 30 | 3. 
To install Raven, use go get: 31 | 32 | ```go get github.com/Symbolexe/raven``` 33 | ## Usage 34 | 35 | ``` ./raven [options] <startURL> ``` 36 | 37 | ⚠️ startURL: The starting URL from which the crawling process begins. ⚠️ 38 | 39 | ## Options 40 | 1. -maxURLs : Maximum number of URLs to crawl (default: 100) 41 | 2. -maxDepth : Maximum depth of crawling (default: 3) 42 | 3. -concurrency : Number of concurrent requests (default: 10) 43 | 44 | ## Example 45 | 46 | ```./raven -maxURLs 500 -maxDepth 5 -concurrency 20 https://example.com``` 47 | 48 | This command will crawl the website https://example.com with a maximum of 500 URLs, a maximum depth of 5, and 20 concurrent requests. 49 | 50 | ## Dependencies 51 | 52 | 1. Raven depends on the following external packages: 53 | golang.org/x/net/html: Used for HTML parsing. 54 | 55 | 2. You can install these dependencies using the following command: 56 | ```go mod tidy``` 57 | 58 | ## License 59 | This project is licensed under the MIT License. See the LICENSE file for details. 
60 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module raven 2 | 3 | go 1.22.2 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.9.1 7 | github.com/fatih/color v1.16.0 8 | golang.org/x/sync v0.1.0 9 | ) 10 | 11 | require ( 12 | github.com/andybalholm/cascadia v1.3.2 // indirect 13 | github.com/mattn/go-colorable v0.1.13 // indirect 14 | github.com/mattn/go-isatty v0.0.20 // indirect 15 | golang.org/x/net v0.21.0 // indirect 16 | golang.org/x/sys v0.17.0 // indirect 17 | ) 18 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= 2 | github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= 3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 5 | github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= 6 | github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= 7 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 8 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 9 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 10 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 11 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 12 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 13 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 14 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 15 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 16 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 17 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 18 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 19 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 20 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 21 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 22 | golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= 23 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 24 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 25 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 26 | golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= 27 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 28 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 29 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 30 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 31 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 32 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 33 | 
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 34 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 35 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 36 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 37 | golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= 38 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 39 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 40 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 41 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 42 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 43 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 44 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 45 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 46 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 47 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 48 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 49 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 50 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 51 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 52 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 53 | -------------------------------------------------------------------------------- /raven.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | "net/url" 9 | "os" 10 | "strings" 11 | "sync" 12 | "time" 13 | 14 | "golang.org/x/net/html" 15 | ) 16 | 17 | var ( 18 | internalURLs = make(map[string]bool) 19 | externalURLs = make(map[string]bool) 20 | maxURLs int 21 | maxDepth int 22 | wg sync.WaitGroup 23 | mutex sync.Mutex 24 | concurrency int 25 | client = &http.Client{ 26 | Timeout: 10 * time.Second, 27 | } 28 | ) 29 | 30 | func isValidURL(u string) bool { 31 | parsed, err := url.Parse(u) 32 | if err != nil { 33 | return false 34 | } 35 | return parsed.Scheme != "" && parsed.Host != "" 36 | } 37 | 38 | func getAllWebsiteLinks(u string) []string { 39 | var urls []string 40 | resp, err := client.Get(u) 41 | if err != nil { 42 | log.Printf("[raven] Error fetching URL %s: %v\n", u, err) 43 | return urls 44 | } 45 | defer resp.Body.Close() 46 | 47 | tokenizer := html.NewTokenizer(resp.Body) 48 | for { 49 | tokenType := tokenizer.Next() 50 | if tokenType == html.ErrorToken { 51 | break 52 | } 53 | token := tokenizer.Token() 54 | if tokenType == html.StartTagToken && token.Data == "a" { 55 | for _, attr := range token.Attr { 56 | if attr.Key == "href" { 57 | href := attr.Val 58 | if href == "" { 59 | continue 60 | } 61 | if strings.HasPrefix(href, "#") || strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") { 62 | continue 63 | } 64 | if !isValidURL(href) { 65 | continue 66 | } 67 | urls = append(urls, href) 68 | } 69 | } 70 | } 71 | } 72 | return urls 73 | } 74 | 75 | func crawl(u string, depth int) { 76 | defer wg.Done() 77 | if depth > maxDepth { 78 | return 79 | } 80 | log.Printf("[raven] [*] Crawling: %s (Depth: %d)\n", u, depth) 81 | links := getAllWebsiteLinks(u) 82 | mutex.Lock() 83 | for _, link := range links { 84 | if len(internalURLs)+len(externalURLs) > maxURLs { 85 | break 86 | } 87 | _, isInternal := internalURLs[link] 88 
| _, isExternal := externalURLs[link] 89 | if isInternal || isExternal { 90 | continue 91 | } 92 | if strings.Contains(link, u) { 93 | log.Printf("[raven] [*] Internal link: %s\n", link) 94 | internalURLs[link] = true 95 | } else { 96 | log.Printf("[raven] [!] External link: %s\n", link) 97 | externalURLs[link] = true 98 | } 99 | wg.Add(1) 100 | go crawl(link, depth+1) 101 | } 102 | mutex.Unlock() 103 | } 104 | 105 | func main() { 106 | flag.IntVar(&maxURLs, "maxURLs", 100, "Maximum number of URLs to crawl") 107 | flag.IntVar(&maxDepth, "maxDepth", 3, "Maximum depth of crawling") 108 | flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent requests") 109 | flag.Parse() 110 | 111 | startURL := flag.Arg(0) 112 | if startURL == "" { 113 | fmt.Println("Usage: raven [options] [URL]") 114 | flag.PrintDefaults() 115 | os.Exit(1) 116 | } 117 | 118 | log.SetFlags(log.LstdFlags | log.Lmicroseconds) 119 | 120 | wg.Add(1) 121 | go crawl(startURL, 0) 122 | wg.Wait() 123 | 124 | fmt.Println("[+] Total Internal links:", len(internalURLs)) 125 | fmt.Println("[+] Total External links:", len(externalURLs)) 126 | fmt.Println("[+] Total URLs:", len(internalURLs)+len(externalURLs)) 127 | } 128 | --------------------------------------------------------------------------------