├── .gitignore
├── LICENSE
├── README.md
└── imagescraper.go

/.gitignore:
--------------------------------------------------------------------------------
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Lakshay Kalbhor

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ImageScraper

[![license](https://img.shields.io/github/license/mashape/apistatus.svg?style=flat-square)](LICENSE)

> ### A high-performance, concurrent image scraper.

## Installation

### Releases
[Latest release](https://github.com/kalbhor/Image-Scraper/releases)

### Source
```sh
$ git clone https://github.com/kalbhor/image-scraper
$ cd image-scraper
$ go build imagescraper.go
```

## Usage
```sh
$ ./imagescraper [-v|--verbose] [URLs]
```
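For example, to scrape every image from a page and log each file as it is saved (`example.com` stands in for any real site; the `http://` scheme is added automatically when missing):

```sh
$ ./imagescraper -v example.com
```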
## Contribute

Found an issue? Post it in the [issue tracker](https://github.com/kalbhor/Image-Scraper/issues).
Want to add another awesome feature? [Fork](https://github.com/kalbhor/Image-Scraper/fork) this repository, add your feature, then send a pull request.

##### Binaries for Linux are required.

## License
The MIT License (MIT)
Copyright (c) 2017 Lakshay Kalbhor
--------------------------------------------------------------------------------
/imagescraper.go:
--------------------------------------------------------------------------------
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)

// Sites holds the crawl state for a single seed URL.
type Sites struct {
	url    string
	images []string
	folder string
}

var crawlers sync.WaitGroup
var downloaders sync.WaitGroup
var verbose bool

// Crawl fetches a page, collects the src of every <img> tag, and fans
// the downloads out over a small pool of goroutines.
func (Site *Sites) Crawl() {
	defer crawlers.Done()

	resp, err := goquery.NewDocument(Site.url)
	if err != nil {
		fmt.Printf("ERROR: failed to crawl %q: %v\n\n", Site.url, err)
		return
	}

	// Select <img> tags directly instead of scanning every element,
	// which would count each image once per ancestor.
	resp.Find("img").Each(func(index int, item *goquery.Selection) {
		link, exists := item.Attr("src")
		if exists && link != "" {
			Site.images = append(Site.images, link)
		}
	})

	// Deduplicate once, before the slice is split across downloaders,
	// so the count below really is the number of unique images.
	Site.images = SliceUniq(Site.images)
	fmt.Printf("%s found %d unique images\n", Site.url, len(Site.images))
	if len(Site.images) == 0 {
		return
	}

	if err := os.MkdirAll(Site.folder, 0777); err != nil {
		fmt.Printf("ERROR: could not create folder %q: %v\n", Site.folder, err)
		return
	}

	// Roughly one downloader per three images, capped at ten and
	// floored at one (integer division could otherwise reach zero
	// and panic on the division below).
	pool := len(Site.images) / 3
	if pool > 10 {
		pool = 10
	}
	if pool < 1 {
		pool = 1
	}

	counter := len(Site.images) / pool
	l := 0
	for i := counter; i < len(Site.images); i += counter {
		downloaders.Add(1)
		go Site.DownloadImg(Site.images[l:i])
		l = i
	}
	// The final chunk picks up the remainder the loop leaves behind.
	if l < len(Site.images) {
		downloaders.Add(1)
		go Site.DownloadImg(Site.images[l:])
	}
}

// DownloadImg saves each image in its slice of URLs into the site's folder.
func (Site *Sites) DownloadImg(images []string) {
	defer downloaders.Done()

	base, err := url.Parse(Site.url)
	if err != nil {
		return
	}

	for _, link := range images {
		// Resolve relative and protocol-relative src attributes
		// against the page URL.
		ref, err := url.Parse(link)
		if err != nil {
			continue
		}
		src := base.ResolveReference(ref).String()

		parts := strings.Split(src, "/")
		name := parts[len(parts)-1]

		resp, err := http.Get(src)
		if err != nil {
			fmt.Printf("ERROR: could not fetch %s: %v\n", src, err)
			continue
		}
		file, err := os.Create(Site.folder + "/" + name)
		if err != nil {
			resp.Body.Close()
			fmt.Printf("ERROR: could not create %s: %v\n", name, err)
			continue
		}
		io.Copy(file, resp.Body)
		file.Close()
		resp.Body.Close()
		if verbose {
			fmt.Printf("Saving %s\n", Site.folder+"/"+name)
		}
	}
}
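// SliceUniq removes duplicate entries from s in place, keeping the
// first occurrence of each string, so {"a", "b", "a"} becomes
// {"a", "b"}. It is quadratic, which is fine at these sizes.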
func SliceUniq(s []string) []string {
	for i := 0; i < len(s); i++ {
		for i2 := i + 1; i2 < len(s); i2++ {
			if s[i] == s[i2] {
				// Delete the duplicate, then re-check the element
				// that just shifted into position i2.
				s = append(s[:i2], s[i2+1:]...)
				i2--
			}
		}
	}
	return s
}

func main() {
	var seedUrls []string

	if len(os.Args) < 2 {
		fmt.Println("ERROR: no URLs given\nUsage: imagescraper [-v|--verbose] [URLs]")
		os.Exit(3)
	}
	if os.Args[1] == "-v" || os.Args[1] == "--verbose" {
		verbose = true
		seedUrls = os.Args[2:]
	} else {
		seedUrls = os.Args[1:]
	}

	Site := make([]Sites, len(seedUrls))

	// Crawl every seed URL concurrently.
	for i, name := range seedUrls {
		if !strings.HasPrefix(name, "http") {
			name = "http://" + name
		}
		u, err := url.Parse(name)
		if err != nil {
			fmt.Printf("could not parse URL %s: %v\n", name, err)
			continue
		}
		Site[i].folder = u.Host
		Site[i].url = name
		crawlers.Add(1)
		go Site[i].Crawl()
	}

	crawlers.Wait()    // every page crawled, every download queued
	downloaders.Wait() // every download finished

	fmt.Printf("\n\nScraped successfully\n\n")
}
--------------------------------------------------------------------------------
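A quick sanity check for the dedup helper, as a sketch: this assumes a hypothetical `imagescraper_test.go` next to the main file and only exercises `SliceUniq`, which should drop duplicates while preserving first-occurrence order.

```go
package main

import (
	"reflect"
	"testing"
)

// TestSliceUniq checks that duplicates are dropped and that the
// first occurrence of each string keeps its position.
func TestSliceUniq(t *testing.T) {
	in := []string{"a.png", "b.png", "a.png", "c.png", "b.png"}
	want := []string{"a.png", "b.png", "c.png"}
	if got := SliceUniq(in); !reflect.DeepEqual(got, want) {
		t.Fatalf("SliceUniq() = %v, want %v", got, want)
	}
}
```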