├── LICENSE
├── README.md
└── mailbot.go

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 AnikHasibul

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# mailbot
A crawler that scrapes email addresses from various sources, e.g. Pastebin, GitHub, paste.ubuntu.com, etc.

### TODO

* Random proxy support (a possible shape is sketched below)
* Bing crawler
* Github crawler
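For the random-proxy item, one possible shape (a sketch only: the proxy pool, the `newProxiedClient` name, and the selection strategy are placeholders, not part of this codebase) is to pick a proxy per request through `http.Transport`'s `Proxy` hook:

```go
package main

import (
	"math/rand"
	"net/http"
	"net/url"
	"time"
)

// newProxiedClient returns an *http.Client that routes each request
// through a randomly chosen proxy from the given pool. Sketch only:
// in practice the pool would come from a flag or a config file.
func newProxiedClient(proxies []string) *http.Client {
	return &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			Proxy: func(_ *http.Request) (*url.URL, error) {
				return url.Parse(proxies[rand.Intn(len(proxies))])
			},
		},
	}
}
```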
--------------------------------------------------------------------------------
/mailbot.go:
--------------------------------------------------------------------------------
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
)

// Default settings
var (
	DefaultFileName = "crawler" + strconv.FormatInt(time.Now().UnixNano(), 36) + ".log"
)

// Crawler holds the flags and locks
type Crawler struct {
	flags struct {
		filename      string
		printToStdout bool
		verbose       bool
		pastebin      bool
		debian        bool
		slexy         bool
	}
	file *os.File
}

var blacklist = []string{
	"formorer@debian.org",
	"user@user",
}

var c = new(Crawler)

func init() {
	var err error
	flag.StringVar(
		&c.flags.filename,
		"o",
		DefaultFileName,
		"File to save the collected mails",
	)
	flag.BoolVar(
		&c.flags.printToStdout,
		"stdout",
		true,
		"Print to stdout",
	)
	flag.BoolVar(
		&c.flags.verbose,
		"verbose",
		false,
		"Verbose mode",
	)
	flag.BoolVar(
		&c.flags.pastebin,
		"pastebin",
		true,
		"Crawl pastebin.com",
	)
	flag.BoolVar(
		&c.flags.debian,
		"debian",
		true,
		"Crawl paste.debian.net",
	)
	flag.BoolVar(
		&c.flags.slexy,
		"slexy",
		true,
		"Crawl slexy.org",
	)

	flag.Parse()

	c.file, err = os.OpenFile(
		c.flags.filename,
		os.O_APPEND|os.O_WRONLY|os.O_CREATE,
		0600,
	)
	if err != nil {
		// Without an output file the crawler cannot do useful work.
		report(err)
		os.Exit(1)
	}
}

func main() {
	c.Run()
}

// Run runs the crawler, crawling each enabled source once per round, forever
func (c *Crawler) Run() {
	var wg = &sync.WaitGroup{}
	for {
		if c.flags.pastebin {
			wg.Add(1)
			go c.Pastebin(wg)
		}
		if c.flags.debian {
			wg.Add(1)
			go c.Debian(wg)
		}
		if c.flags.slexy {
			wg.Add(1)
			go c.Slexy(wg)
		}
		wg.Wait()
	}
}

// GetMail extracts email addresses from text documents
func (c *Crawler) GetMail(page string) {
	r := regexp.MustCompile(`[\w]+@[\w.]+`)
	mails := r.FindAllString(page, -1)
	if mails == nil {
		if c.flags.verbose {
			report(
				errors.New("no mail found"),
			)
		}
		return
	}
	fresh := FreshFilter(mails)
	if len(fresh) == 0 {
		return
	}
	toWrite := strings.Join(fresh, "\n")
	if _, err := c.file.WriteString(toWrite + "\n"); err != nil {
		report(err)
	}
	if c.flags.printToStdout {
		fmt.Println(toWrite)
	}
	c.file.Sync()
}

// FetchPage fetches/scrapes pages from web URLs
func (c *Crawler) FetchPage(url string) (string, error) {
	if c.flags.verbose {
		fmt.Printf("Fetching: %s\n", url)
	}
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Get(url)
	if err != nil {
		report(err)
		return "", err
	}
	defer resp.Body.Close()
	b, err := ioutil.ReadAll(resp.Body)
	return string(b), err
}

// Pastebin collects emails from pastebin.com
func (c *Crawler) Pastebin(wg *sync.WaitGroup) {
	defer wg.Done()
	// Each archive row looks like `class="i_p0" alt="" /><a href="/XXXXXXXX">`;
	// the `<a href=...>` tail of this pattern was mangled in transit and is
	// reconstructed here from the split-on-`="`/parser[3] logic below.
	r := regexp.MustCompile(`class="i_p0" alt="" /><a href="/\w+">`)
	url := "https://pastebin.com/archive"
	page, err := c.FetchPage(url)
	if err != nil {
		report(err)
		return
	}
	raws := r.FindAllString(page, -1)
	if raws == nil {
		if c.flags.verbose {
			report(errors.New("no raw link"))
		}
		return
	}
	for _, v := range raws {
		parser := strings.Split(v, `="`)
		if len(parser) < 4 {
			report(errors.New("can't parse"))
			return
		}
		rawlink := "https://pastebin.com/raw" + strings.Replace(parser[3], `">`, "", -1)
		page, err := c.FetchPage(rawlink)
		if err != nil {
			report(err)
			return
		}
		c.GetMail(page)
	}
}
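// Gist is a hypothetical sketch of the README's "Github crawler" TODO,
// following the same fetch/parse/extract pattern as the functions above.
// It is not part of the original file: the discover-page markup and the
// link regex are untested assumptions, though public gists do serve their
// plain-text content when "/raw" is appended to the gist URL.
func (c *Crawler) Gist(wg *sync.WaitGroup) {
	defer wg.Done()
	r := regexp.MustCompile(`href="(/[\w-]+/[0-9a-f]+)"`)
	page, err := c.FetchPage("https://gist.github.com/discover")
	if err != nil {
		report(err)
		return
	}
	for _, m := range r.FindAllStringSubmatch(page, -1) {
		raw, err := c.FetchPage("https://gist.github.com" + m[1] + "/raw")
		if err != nil {
			report(err)
			return
		}
		c.GetMail(raw)
	}
}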
// Debian collects emails from paste.debian.net
func (c *Crawler) Debian(wg *sync.WaitGroup) {
	defer wg.Done()
	// The original pattern and parser literals were mangled in transit
	// (their HTML was rendered away); this is a plausible reconstruction
	// matching the recent-paste links on the front page.
	r := regexp.MustCompile(`<li><a href='//paste.debian.net/[0-9]+/'>`)
	url := "http://paste.debian.net"
	page, err := c.FetchPage(url)
	if err != nil {
		report(err)
		return
	}
	raws := r.FindAllString(page, -1)
	if raws == nil {
		if c.flags.verbose {
			report(errors.New("no raw link"))
		}
		return
	}
	for _, v := range raws {
		parser := strings.Split(v, `href='`)
		if len(parser) < 2 {
			report(errors.New("can't parse"))
			return
		}
		rawlink := "http:" + strings.Replace(parser[1], `'>`, "", -1)
		page, err := c.FetchPage(rawlink)
		if err != nil {
			report(err)
			return
		}
		c.GetMail(page)
	}
}
// Slexy collects emails from slexy.org
func (c *Crawler) Slexy(wg *sync.WaitGroup) {
	defer wg.Done()
	r := regexp.MustCompile(`/view(.*?)">`)
	url := "http://slexy.org/recent"
	page, err := c.FetchPage(url)
	if err != nil {
		report(err)
		return
	}
	raws := r.FindAllString(page, -1)
	if raws == nil {
		if c.flags.verbose {
			report(errors.New("no raw link"))
		}
		return
	}
	for _, v := range raws {
		parser := strings.Split(v, `/view`)
		if len(parser) < 2 {
			report(errors.New("can't parse"))
			return
		}
		rawlink := "http://slexy.org/raw" + strings.Replace(parser[1], `">`, "", -1)
		page, err := c.FetchPage(rawlink)
		if err != nil {
			report(err)
			return
		}
		c.GetMail(page)
	}
}

func report(err error) {
	fmt.Fprintln(os.Stderr, err)
}

// FreshFilter filters out blacklisted addresses and common false
// positives (image filenames and other non-address matches)
func FreshFilter(mails []string) []string {
	var fresh []string
	for _, mail := range mails {
		var blocked bool

		if strings.Contains(mail, ".png") {
			continue
		}
		if strings.Contains(mail, ".gif") {
			continue
		}
		if strings.Contains(mail, ".jpg") {
			continue
		}
		if strings.Contains(mail, "._") {
			continue
		}
		if strings.Contains(mail, "@.") {
			continue
		}
		if !strings.Contains(mail, ".") {
			continue
		}
		for _, black := range blacklist {
			if mail == black {
				blocked = true
				break
			}
		}
		if !blocked {
			fresh = append(fresh, mail)
		}
	}
	return fresh
}
--------------------------------------------------------------------------------
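A quick illustration of FreshFilter's behavior on typical regex matches (a hypothetical test file, not part of the repository): the image-asset false positive, the dot-less match, and the blacklisted address are all dropped, and only the plain address survives.

/freshfilter_example_test.go (hypothetical):
--------------------------------------------------------------------------------
// Hypothetical test in package main, so FreshFilter and blacklist
// from mailbot.go are in scope.
package main

import (
	"reflect"
	"testing"
)

func TestFreshFilter(t *testing.T) {
	in := []string{
		"alice@example.com",   // valid address, kept
		"icon@sprites.png",    // image-asset false positive, dropped
		"noreply@user",        // contains no dot, dropped
		"formorer@debian.org", // blacklisted, dropped
	}
	got := FreshFilter(in)
	want := []string{"alice@example.com"}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("FreshFilter(%v) = %v, want %v", in, got, want)
	}
}
--------------------------------------------------------------------------------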