├── .gitignore
├── LICENSE
├── README.md
└── imagescraper.go
/.gitignore:
--------------------------------------------------------------------------------
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Lakshay Kalbhor

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ImageScraper

[MIT License](LICENSE)

> ### A high-performance, concurrent image scraper.

## Installation

### Releases
[Latest release](https://github.com/kalbhor/Image-Scraper/releases)

### Source
```sh
$ git clone https://github.com/kalbhor/image-scraper
$ cd image-scraper
$ go build imagescraper.go
```
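
Building needs the [goquery](https://github.com/PuerkitoBio/goquery) package the scraper imports; on a pre-modules Go toolchain it can be fetched first with `go get`:
```sh
$ go get github.com/PuerkitoBio/goquery
```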

## Usage
```sh
$ ./imagescraper [URLs]
```
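
The scraper also accepts a `-v` (or `--verbose`) flag ahead of the URLs, which logs each image as it is saved (the URL below is a placeholder):
```sh
$ ./imagescraper -v https://example.com
```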

## Contribute

Found an issue? Post it in the [issue tracker](https://github.com/kalbhor/Image-Scraper/issues).
Want to add another awesome feature? [Fork](https://github.com/kalbhor/Image-Scraper/fork) this repository, add your feature, then send a pull request.

##### Binaries for Linux are still needed.
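
On a non-Linux machine, a Linux binary can be cross-compiled with Go's standard `GOOS`/`GOARCH` environment variables, for example:
```sh
$ GOOS=linux GOARCH=amd64 go build imagescraper.go
```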

## License
The MIT License (MIT)
Copyright (c) 2017 Lakshay Kalbhor
--------------------------------------------------------------------------------
/imagescraper.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
    "strings"
    "sync"

    "github.com/PuerkitoBio/goquery"
)

// Sites holds the scrape state for a single seed URL.
type Sites struct {
    url    string
    images []string
    folder string
}

var crawlers sync.WaitGroup
var downloaders sync.WaitGroup
var verbose = false
// Crawl fetches the page, collects its image URLs, and fans the
// downloads out over a small pool of goroutines.
func (site *Sites) Crawl() {
    defer crawlers.Done()

    doc, err := goquery.NewDocument(site.url)
    if err != nil {
        fmt.Printf("ERROR: failed to crawl %q: %v\n\n", site.url, err)
        return
    }
    // Select <img> tags directly; matching "*" and then searching every
    // element for images would record each image once per ancestor.
    doc.Find("img").Each(func(index int, item *goquery.Selection) {
        if link, ok := item.Attr("src"); ok && link != "" {
            site.images = append(site.images, link)
        }
    })

    // Deduplicate once, before the count is printed and before the
    // list is shared with the downloader goroutines.
    site.images = SliceUniq(site.images)
    fmt.Printf("%s found %d unique images\n", site.url, len(site.images))
    if len(site.images) == 0 {
        return
    }

    // At most 10 downloaders, each taking a contiguous chunk of the
    // list; the ceiling division keeps the final partial chunk.
    pool := len(site.images)/3 + 1
    if pool > 10 {
        pool = 10
    }
    chunk := (len(site.images) + pool - 1) / pool

    for l := 0; l < len(site.images); l += chunk {
        r := l + chunk
        if r > len(site.images) {
            r = len(site.images)
        }
        downloaders.Add(1)
        go site.DownloadImg(site.images[l:r])
    }

    downloaders.Wait()
}
// DownloadImg saves the given slice of image URLs into the site's folder.
func (site *Sites) DownloadImg(images []string) {
    defer downloaders.Done()

    os.MkdirAll(site.folder, 0777)

    base, err := url.Parse(site.url)
    if err != nil {
        return
    }

    for _, img := range images {
        // Resolve protocol-relative ("//cdn/...") and path-relative
        // ("/img/x.png") sources against the page URL.
        ref, err := url.Parse(img)
        if err != nil {
            continue
        }
        src := base.ResolveReference(ref).String()

        parts := strings.Split(src, "/")
        name := parts[len(parts)-1]

        resp, err := http.Get(src)
        if err != nil {
            fmt.Printf("ERROR: failed to fetch %q: %v\n", src, err)
            continue
        }
        file, err := os.Create(site.folder + "/" + name)
        if err != nil {
            resp.Body.Close()
            continue
        }
        io.Copy(file, resp.Body)
        file.Close()
        resp.Body.Close()
        if verbose {
            fmt.Printf("Saving %s\n", site.folder+"/"+name)
        }
    }
}
// SliceUniq returns s with duplicates removed, preserving order.
func SliceUniq(s []string) []string {
    seen := make(map[string]bool, len(s))
    out := s[:0]
    for _, v := range s {
        if !seen[v] {
            seen[v] = true
            out = append(out, v)
        }
    }
    return out
}
func main() {
    var seedUrls []string

    if len(os.Args) < 2 {
        fmt.Println("ERROR: no URLs given\nUsage: imagescraper [-v|--verbose] [URLs]")
        os.Exit(3)
    }
    if os.Args[1] == "-v" || os.Args[1] == "--verbose" {
        verbose = true
        seedUrls = os.Args[2:]
    } else {
        seedUrls = os.Args[1:]
    }

    sites := make([]Sites, len(seedUrls))

    // Crawl each seed URL concurrently.
    for i, name := range seedUrls {
        if !strings.HasPrefix(name, "http") {
            name = "http://" + name
        }
        u, err := url.Parse(name)
        if err != nil {
            fmt.Printf("could not parse %s: %v\n", name, err)
            continue
        }
        sites[i].folder = u.Host
        sites[i].url = name
        crawlers.Add(1)
        go sites[i].Crawl()
    }

    crawlers.Wait()

    fmt.Printf("\n\nScraped successfully\n\n")
}
--------------------------------------------------------------------------------