├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── finder.go ├── finder_test.go ├── helper.go ├── helper_test.go ├── interfaces ├── doc.go ├── fetcher.go ├── parser.go ├── registrar.go └── reporter.go ├── main.go ├── processed.go └── report.go /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.5 4 | - 1.6 5 | 6 | install: 7 | - go get -t -v ./... 8 | 9 | script: 10 | - go test -v ./... 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Konstantin Komelin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # insecRes - Insecure Resource Finder 2 | [![Build Status](https://travis-ci.org/kkomelin/insecres.svg)](https://travis-ci.org/kkomelin/insecres) 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/kkomelin/insecres)](https://goreportcard.com/report/github.com/kkomelin/insecres) 4 | [![GoDoc](https://godoc.org/github.com/kkomelin/insecres?status.png)](http://godoc.org/github.com/kkomelin/insecres) 5 | 6 | A console tool that finds insecure resources on HTTPS sites. 7 | It is written in the Go language and uses the power of "multi-threading" (goroutines) to crawl and parse site pages. 8 | 9 | ## The motivation 10 | 11 | Some time ago, I switched my site to HTTPS. _And you should too!_ 12 | All went well except for the fact that my pages contained images, embedded videos and other resources, 13 | which pointed to HTTP content and made browsers display warnings about the insecure content on the pages. 14 | After some research into existing tools, which did not fit my needs, I decided to create my own. 15 | 16 | ## Features 17 | 18 | - Crawls all site pages in parallel 19 | - Finds the following resources with absolute HTTP (insecure) urls: 20 | - IMG 21 | - IFRAME 22 | - OBJECT 23 | - AUDIO, VIDEO, SOURCE, TRACK 24 | - Uses a random delay between requests to prevent blacklisting 25 | - Prints results to a CSV file 26 | 27 | ## Installation 28 | 29 | First of all, [install Go](https://golang.org/doc/install). 
// ResourceAndLinkFinder encapsulates the logic that is used for finding page
// link urls and resource urls.
// It has no fields, so the zero value is ready to use.
type ResourceAndLinkFinder struct{}
17 | func (f ResourceAndLinkFinder) Fetch(url string) (responseBody io.ReadCloser, err error) { 18 | transport := &http.Transport{ 19 | TLSClientConfig: &tls.Config{ 20 | InsecureSkipVerify: true, 21 | }, 22 | } 23 | 24 | client := http.Client{Transport: transport} 25 | 26 | response, err := client.Get(url) 27 | if err != nil { 28 | return nil, err 29 | } 30 | //defer response.Body.Close() 31 | 32 | return response.Body, nil 33 | } 34 | 35 | // Parse takes a reader object and returns a slice of insecure resource urls 36 | // found in the HTML. 37 | // It does not close the reader. The reader should be closed from the outside. 38 | func (f ResourceAndLinkFinder) Parse(baseUrl string, httpBody io.Reader) (resourceUrls []string, linkUrls []string, err error) { 39 | 40 | resourceMap := make(map[string]bool) 41 | linkMap := make(map[string]bool) 42 | 43 | page := html.NewTokenizer(httpBody) 44 | for { 45 | tokenType := page.Next() 46 | if tokenType == html.ErrorToken { 47 | break 48 | } 49 | token := page.Token() 50 | 51 | switch { 52 | case f.isResourceToken(token): 53 | uris, err := f.processResourceToken(token) 54 | if err != nil { 55 | continue 56 | } 57 | 58 | for _, uri := range uris { 59 | resourceMap[uri] = true 60 | } 61 | case f.isLinkToken(token): 62 | uri, err := f.processLinkToken(token, baseUrl) 63 | if err != nil { 64 | continue 65 | } 66 | 67 | linkMap[uri] = true 68 | } 69 | } 70 | 71 | resourceUrls = make([]string, 0, len(resourceMap)) 72 | 73 | for k := range resourceMap { 74 | resourceUrls = append(resourceUrls, k) 75 | } 76 | 77 | linkUrls = make([]string, 0, len(linkMap)) 78 | 79 | for k := range linkMap { 80 | linkUrls = append(linkUrls, k) 81 | } 82 | 83 | return resourceUrls, linkUrls, nil 84 | } 85 | 86 | // Determine whether the token passed is a resource token. 
87 | func (f ResourceAndLinkFinder) isResourceToken(token html.Token) bool { 88 | 89 | switch { 90 | case token.Type == html.SelfClosingTagToken && token.Data == "img": 91 | return true 92 | case token.Type == html.StartTagToken: 93 | switch token.Data { 94 | case 95 | "iframe", 96 | "object", 97 | "video", 98 | "audio", 99 | "source", 100 | "track": 101 | return true 102 | } 103 | } 104 | return false 105 | } 106 | 107 | // Determine whether the token passed is a resource token. 108 | func (f ResourceAndLinkFinder) isTargetedResourceTokenAttribute(token html.Token, attribute html.Attribute) bool { 109 | 110 | if token.Data == "object" && attribute.Key == "data" { 111 | return true 112 | } 113 | 114 | if attribute.Key == "src" || attribute.Key == "poster" { 115 | return true 116 | } 117 | 118 | return false 119 | } 120 | 121 | // Process resource token in order to get urls of the resources (a few if it is video, for example). 122 | func (f ResourceAndLinkFinder) processResourceToken(token html.Token) (map[string]string, error) { 123 | 124 | result := make(map[string]string) 125 | 126 | // Loop for tag attributes. 127 | for _, attr := range token.Attr { 128 | 129 | if !f.isTargetedResourceTokenAttribute(token, attr) { 130 | continue 131 | } 132 | 133 | uri, err := url.Parse(attr.Val) 134 | if err != nil { 135 | continue 136 | } 137 | 138 | // Ignore relative and secure urls. 139 | if !uri.IsAbs() || uri.Scheme == "https" || (uri.Host != "" && strings.HasPrefix(uri.String(), "//")) { 140 | continue 141 | } 142 | 143 | result[attr.Key] = uri.String() 144 | } 145 | 146 | if len(result) == 0 { 147 | return nil, fmt.Errorf("Targeted attributes have not been found. Skipped.") 148 | } 149 | 150 | return result, nil 151 | } 152 | 153 | // Determine whether the token passed is a link token. 
154 | func (f ResourceAndLinkFinder) isLinkToken(token html.Token) bool { 155 | switch { 156 | case token.Type == html.StartTagToken && token.Data == "a": 157 | return true 158 | default: 159 | return false 160 | } 161 | } 162 | 163 | // Process token in order to get an absolute url of the link. 164 | func (f ResourceAndLinkFinder) processLinkToken(token html.Token, base string) (string, error) { 165 | 166 | // Loop for tag attributes. 167 | for _, attr := range token.Attr { 168 | if attr.Key != "href" { 169 | continue 170 | } 171 | 172 | // Ignore anchors. 173 | if strings.HasPrefix(attr.Val, "#") { 174 | return "", fmt.Errorf("Url is an anchor. Skipped.") 175 | } 176 | 177 | uri, err := url.Parse(attr.Val) 178 | if err != nil { 179 | return "", err 180 | } 181 | 182 | baseUrl, err := url.Parse(base) 183 | if err != nil { 184 | return "", err 185 | } 186 | 187 | // Return result if the uri is absolute. 188 | if uri.IsAbs() || (uri.Host != "" && strings.HasPrefix(uri.String(), "//")) { 189 | 190 | // Ignore external urls considering urls w/ WWW and w/o WWW as the same. 191 | if strings.TrimPrefix(uri.Host, "www.") != strings.TrimPrefix(baseUrl.Host, "www.") { 192 | return "", fmt.Errorf("Url is expernal. Skipped.") 193 | } 194 | 195 | return strings.TrimSuffix(uri.String(), "/"), nil 196 | } 197 | 198 | // Make it absolute if it's relative. 199 | absoluteUrl := f.convertToAbsolute(uri, baseUrl) 200 | 201 | return strings.TrimSuffix(absoluteUrl.String(), "/"), nil 202 | } 203 | 204 | return "", fmt.Errorf("Src has not been found. Skipped.") 205 | } 206 | 207 | // Convert a relative url to absolute. 
208 | func (f ResourceAndLinkFinder) convertToAbsolute(source, base *url.URL) *url.URL { 209 | return base.ResolveReference(source) 210 | } 211 | -------------------------------------------------------------------------------- /finder_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | // TestParse tests fetcher.Parse method and its results. 10 | func TestParse(t *testing.T) { 11 | 12 | fmt.Println("TestParse") 13 | 14 | reader := strings.NewReader(` 15 | 16 | 17 | 18 | 19 | 20 | 21 | Anchor (ignored) 22 | Relative link 23 | Absolute HTTP link 24 | Absolute HTTPS link 25 | Absolute HTTPS link 26 | External link 27 | Reproduces bug in Go url.isAbs() 28 | Ignoring trailing slash 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 45 | 50 | `) 51 | 52 | expectedResources := map[string]int{ 53 | // img[src] 54 | "http://example.com/images/test.png": 0, 55 | // iframe[src] 56 | "http://www.youtube.com/embed/0sRPY3WWSNc": 0, 57 | // object[data] 58 | "http://www.example.com/flash/insecure.swf": 0, 59 | // audio[src] 60 | "http://www.example.com/audio.ogg": 0, 61 | // audio track[src] 62 | "http://www.example.com/audio_track.vtt": 0, 63 | // audio source[src] 64 | "http://www.example.com/audio_in_source.ogg": 0, 65 | // video[src] 66 | "http://www.example.com/video.mp4": 0, 67 | // video[poster] 68 | "http://www.example.com/poster.jpg": 0, 69 | // video track[src] 70 | "http://www.example.com/video_track.vtt": 0, 71 | // video source[src] 72 | "http://www.example.com/video_in_source.mp4": 0, 73 | } 74 | 75 | expectedLinks := map[string]int{ 76 | "https://example.com/article/test1": 0, 77 | "http://example.com/test2": 0, 78 | "https://example.com/test3": 0, 79 | "http://www.example.com/test3": 0, 80 | "http://www.example.com/test4": 0, 81 | } 82 | 83 | resources, links, err := (ResourceAndLinkFinder{}).Parse("https://example.com/", reader) 84 
| if err != nil { 85 | t.Fatalf("fetcher.Parse has returned error: %s\n", err) 86 | } 87 | 88 | // Check resources. 89 | fmt.Printf("Resources: %q\n", resources) 90 | 91 | if len(resources) != len(expectedResources) { 92 | t.Errorf("Wrong number of resources. Found %d of %d", len(resources), len(expectedResources)) 93 | } else { 94 | for i := 0; i < len(resources); i++ { 95 | if _, ok := expectedResources[resources[i]]; !ok { 96 | t.Errorf("Resource url is not found in the expected values: %s", resources[i]) 97 | } 98 | } 99 | } 100 | 101 | // Check links. 102 | fmt.Printf("Links: %q\n", links) 103 | 104 | if len(links) != len(expectedLinks) { 105 | t.Errorf("Wrong number of links. Found %d of %d", len(links), len(expectedLinks)) 106 | 107 | } else { 108 | for i := 0; i < len(links); i++ { 109 | if _, ok := expectedLinks[links[i]]; !ok { 110 | t.Errorf("Link url is not found in the expected values: %s", links[i]) 111 | } 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /helper.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/kkomelin/insecres/interfaces" 6 | "math/rand" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // Goroutine callback, which fetches and parses the passed url 12 | // in order to find insecure resources and next urls to fetch from. 13 | func processPage(url string, queue chan string, registrar interfaces.Registrar, fetcher interfaces.Fetcher, parser interfaces.Parser, reporter interfaces.Reporter) { 14 | 15 | // Ignore processed urls. 16 | if !registrar.IsNew(url) { 17 | return 18 | } 19 | // Lock url so that no one other goroutine can process it. 
20 | registrar.Register(url) 21 | 22 | fmt.Print(".") 23 | 24 | responseBody, err := fetcher.Fetch(url) 25 | if err != nil { 26 | fmt.Printf("Error occured: %s\n", err) 27 | return 28 | } 29 | 30 | defer responseBody.Close() 31 | 32 | insecureResourceUrls, pageUrls, err := parser.Parse(url, responseBody) 33 | if err != nil { 34 | fmt.Printf("Error occured: %s\n", err) 35 | return 36 | } 37 | 38 | reportPageResources(url, insecureResourceUrls, reporter) 39 | 40 | for _, url := range pageUrls { 41 | // Random pause before sending to the main thread. 42 | delayBetweenRequests() 43 | queue <- url 44 | } 45 | } 46 | 47 | // Reports page resources. 48 | func reportPageResources(url string, resources []string, reporter interfaces.Reporter) error { 49 | if len(resources) == 0 { 50 | return nil 51 | } 52 | 53 | if !reporter.IsEmpty() { 54 | for i, insecureResourceUrl := range resources { 55 | resources[i] = url + ", " + insecureResourceUrl 56 | } 57 | 58 | return reporter.WriteLines(resources) 59 | } 60 | 61 | fmt.Printf("\n%s:\n", url) 62 | for _, insecureResourceUrl := range resources { 63 | fmt.Printf("- %s\n", insecureResourceUrl) 64 | } 65 | return nil 66 | } 67 | 68 | // Implement random pause before sending the next request to 69 | // (no more than beforeEngTimeout/ and no less than beforeEngTimeout/4 constant). 70 | // It is one of the measures to prevent banning by the server. 71 | func delayBetweenRequests() { 72 | randNum := randomInRange(MinDelayBetweenRequests, MaxDelayBetweenRequests) 73 | time.Sleep(time.Duration(randNum) * time.Millisecond) 74 | } 75 | 76 | // Returns a random number in a given range. 77 | // The idea has been borrowed from http://golangcookbook.blogspot.ru/2012/11/generate-random-number-in-given-range.html 78 | // and improved. 
79 | func randomInRange(min, max int) int { 80 | rand.Seed(time.Now().UTC().UnixNano()) 81 | 82 | if (min == 0 || max == 0) || (min > max) { 83 | return 0 84 | } 85 | 86 | if min == max { 87 | return min 88 | } 89 | 90 | return rand.Intn(max-min) + min 91 | } 92 | 93 | // Crawl pages starting from the passed url and find insecure resources. 94 | func Crawl(url, reportFile string) { 95 | 96 | url = strings.TrimSuffix(url, "/") 97 | 98 | report := &Report{} 99 | 100 | // Print results to the file. 101 | if reportFile != "" { 102 | err := report.Open(reportFile) 103 | if err != nil { 104 | fmt.Println(err) 105 | return 106 | } 107 | } else { // Print results to console. 108 | fmt.Println("-----") 109 | fmt.Println("Insecure resources (grouped by page):") 110 | fmt.Println("-----") 111 | } 112 | 113 | registry := &Processed{processed: make(map[string]int)} 114 | finder := &ResourceAndLinkFinder{} 115 | 116 | queue := make(chan string) 117 | 118 | go processPage(url, queue, registry, finder, finder, report) 119 | 120 | tick := time.Tick(time.Duration(BeforeEngDelay) * time.Millisecond) 121 | flag := false 122 | for { 123 | select { 124 | case url := <-queue: 125 | flag = false 126 | 127 | go processPage(url, queue, registry, finder, finder, report) 128 | case <-tick: 129 | if flag { 130 | // TODO: Implement a verbose mode when all crawled pages are also displayed. 131 | //if false { 132 | // fmt.Println("-----") 133 | // fmt.Println("Analized pages:") 134 | // fmt.Println("-----") 135 | // fmt.Println(registry) 136 | //} 137 | fmt.Println("") 138 | 139 | // Close report. 140 | report.Close() 141 | 142 | return 143 | } 144 | flag = true 145 | } 146 | } 147 | } 148 | 149 | func displayHelp() { 150 | fmt.Printf(`usage: insecres [-h|-f="path/to/report.csv"] 151 | ARGUMENTS 152 | url 153 | A url to start from, e.g. https://example.com" 154 | OPTIONS 155 | -h 156 | Show this help message. 157 | -f 158 | Define the location of the CSV file with the results. 
159 | If it is not set, results are printed to the console. 160 | `) 161 | } 162 | -------------------------------------------------------------------------------- /helper_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | // TestRandomInRange tests randomInRange function. 9 | func TestRandomInRange(t *testing.T) { 10 | 11 | fmt.Println("TestRandomInRange") 12 | 13 | tests := []struct { 14 | min int 15 | max int 16 | expectedResult int 17 | }{ 18 | // Edge cases. 19 | {0, 0, 0}, 20 | {1000, 1000, 1000}, 21 | // Normal case. 22 | {500, 1000, -1}, 23 | // Wrong parameters. 24 | {5000, 1000, 0}, 25 | {1000, 500, 0}, 26 | } 27 | 28 | for _, testData := range tests { 29 | givenResult := randomInRange(testData.min, testData.max) 30 | 31 | // Normal parameters, random results. 32 | if testData.expectedResult == -1 { 33 | if testData.min > givenResult || givenResult > testData.max { 34 | t.Errorf("randomInRange(%d, %d): Given: %d", testData.min, testData.max, givenResult) 35 | } 36 | continue 37 | } 38 | 39 | // Edge and wrong parameters, fixed result. 40 | if givenResult != testData.expectedResult { 41 | t.Errorf("randomInRange(%d, %d): Given: %d, Expected: %d", testData.min, testData.max, givenResult, testData.expectedResult) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /interfaces/doc.go: -------------------------------------------------------------------------------- 1 | // Package interfaces provides interfaces for the application, such as Fetcher, Parser, etc. 2 | package interfaces 3 | -------------------------------------------------------------------------------- /interfaces/fetcher.go: -------------------------------------------------------------------------------- 1 | package interfaces 2 | 3 | import "io" 4 | 5 | // Fetcher is the interface that wraps the Fetch method. 
// Fetcher is the interface that wraps the Fetch method.
type Fetcher interface {
	// Fetch fetches the page at url and returns the response body.
	// The body is an io.ReadCloser, so it has to be closed after reading.
	Fetch(url string) (responseBody io.ReadCloser, err error)
}

// Parser is the interface that wraps the Parse method.
type Parser interface {
	// Parse reads the passed response body and returns the urls of the
	// resources and of the page links found in it.
	// baseUrl is the url of the page that the body belongs to.
	Parse(baseUrl string, httpBody io.Reader) (resourceUrls []string, linkUrls []string, err error)
}

// Registrar is the interface that wraps methods necessary for storing processed urls.
// All methods of this interface should be thread-safe.
type Registrar interface {
	// Register adds a processed url to the registry.
	Register(url string)
	// IsNew reports whether the passed url has not been registered yet.
	IsNew(url string) bool
}
12 | IsEmpty() bool 13 | } 14 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | ) 6 | 7 | const ( 8 | // BeforeEngDelay defines time in milliseconds, which the program waits before exit 9 | // so that all goroutines can finish and return results. 10 | BeforeEngDelay int = 2000 11 | // MinDelayBetweenRequests is minimum time in milliseconds, 12 | // which the program waits before processing any new url. 13 | // We wait for some random time (between MinDelayBetweenRequests and MaxDelayBetweenRequests) 14 | // to prevent blacklisting by the server. 15 | MinDelayBetweenRequests int = 500 16 | // MaxDelayBetweenRequests is maximum time in milliseconds, 17 | // which the program waits before processing any new url. 18 | MaxDelayBetweenRequests int = 1000 19 | ) 20 | 21 | func main() { 22 | var ( 23 | helpFlag bool 24 | reportFlag string 25 | ) 26 | 27 | // Find options. 28 | flag.BoolVar(&helpFlag, "h", false, "") 29 | flag.StringVar(&reportFlag, "f", "", "") 30 | flag.Parse() 31 | 32 | // Find argument. 33 | args := flag.Args() 34 | if len(args) < 1 { 35 | displayHelp() 36 | return 37 | } 38 | 39 | // Display help. 40 | if helpFlag { 41 | displayHelp() 42 | return 43 | } 44 | 45 | // Run the crawler. 46 | Crawl(args[0], reportFlag) 47 | } 48 | -------------------------------------------------------------------------------- /processed.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | // Processed is a thread-safe storage for processed urls. 8 | type Processed struct { 9 | processed map[string]int 10 | mux sync.Mutex 11 | } 12 | 13 | // Register adds a processed url to the registry. 
14 | func (r *Processed) Register(url string) { 15 | r.mux.Lock() 16 | defer r.mux.Unlock() 17 | 18 | r.processed[url] = 1 19 | } 20 | 21 | // IsNew checks whether the url is new. 22 | func (r *Processed) IsNew(url string) bool { 23 | r.mux.Lock() 24 | defer r.mux.Unlock() 25 | 26 | if _, ok := r.processed[url]; ok { 27 | return false 28 | } 29 | 30 | return true 31 | } 32 | 33 | // String defines our own way to output the processed urls. 34 | // [url1]\n 35 | // [url2]\n 36 | func (r *Processed) String() string { 37 | r.mux.Lock() 38 | defer r.mux.Unlock() 39 | 40 | output := "" 41 | for url := range r.processed { 42 | output += url + "\n" 43 | } 44 | return output 45 | } 46 | -------------------------------------------------------------------------------- /report.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "sync" 7 | ) 8 | 9 | // Report is a thread-safe data reporting tool. 10 | type Report struct { 11 | file *os.File 12 | writer *bufio.Writer 13 | mux sync.Mutex 14 | } 15 | 16 | // Open opens or creates a file and initializes buffered writer. 17 | func (r *Report) Open(filePath string) error { 18 | var err error 19 | 20 | r.file, err = os.Create(filePath) 21 | if err != nil { 22 | return err 23 | } 24 | 25 | r.writer = bufio.NewWriter(r.file) 26 | 27 | return nil 28 | } 29 | 30 | // WriteLines dump slice of strings to the file. It also adds trailing endline marker to each string. 31 | func (r *Report) WriteLines(lines []string) error { 32 | r.mux.Lock() 33 | defer r.mux.Unlock() 34 | 35 | var err error 36 | 37 | for _, line := range lines { 38 | _, err = r.writer.WriteString(line + "\n") 39 | if err != nil { 40 | return err 41 | } 42 | } 43 | 44 | return r.writer.Flush() 45 | } 46 | 47 | // Close closes file handler in case it is not empty. 
48 | func (r *Report) Close() error { 49 | r.mux.Lock() 50 | defer r.mux.Unlock() 51 | 52 | if r.IsEmpty() { 53 | return nil 54 | } 55 | 56 | return r.file.Close() 57 | } 58 | 59 | // IsEmpty check whether the file handler is initialized or not. 60 | func (r *Report) IsEmpty() bool { 61 | return (r.file == nil) 62 | } 63 | --------------------------------------------------------------------------------