├── example.py
├── README.md
└── CreepyCrawler.py


/example.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

"""
A simple Python script to crawl Google and collect emails.
Just couldn't find a normally functioning script, so there you go.
"""

import CreepyCrawler

if __name__ == '__main__':

    CripHandlr = CreepyCrawler.CreepyCrawler()

    appendix = "Digital Whisper"
    emails = CripHandlr.RunSearchOnQuery(appendix, 1, 20)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## So...

This tool is just a quick, ugly script to harvest emails from Google for a later phishing attack. It was written after I could not find any tool that would let me find emails by company through Google without an API, or without so many false positives that they were useless to sort through. It will still produce many false positives, but in a form that is easier to sort (it does not filter by domain, since many companies have several domains with variations on the domain name).

It is written horribly and will hog your memory (it saves none of the HTML sources to disk and only runs through them in memory, and even that it does not do one by one, but in bulk!). It's ugly, barely documented and requires you to read the code before running it. Probably no updates or maintenance will be made. You can submit your code changes to the [GitHub Page](https://www.github.com/ytisf/CreepyCrawler); feature ideas or blueprints will not be worked on, but if you wish you can upload your changes to the code and they will probably be merged.

Good luck.
--------------------------------------------------------------------------------
/CreepyCrawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import re
import sys
import time
import urllib
import urllib2
from progressbar import *
from BeautifulSoup import BeautifulSoup


class CreepyCrawler(object):
    def __init__(self):
        self._Search = ""
        self._ResPerPage = 100

    def get_links(self, query, pages=0, site=""):
        """
        Google for a query and get a list of links.
        :param query: Search query.
        :param pages: Number of pages to search.
        :param site: Restrict the search to this domain, if given.
        :return: List of result links from Google.
        """
        results = []

        pages *= self._ResPerPage  # Google works with result offsets instead of page numbers.

        if pages != 0:
            print "Starting search for '%s' over %s pages." % (query, pages / self._ResPerPage)
        else:
            print "Starting search for '%s' over 1 page." % query
            pages = self._ResPerPage  # Treat 0 as a single page so the loop below still runs.

        fh = open(self._Search + "all_links.txt", "w")

        for i in range(0, pages, self._ResPerPage):

            if i != 0:
                print "Getting page %s of %s." % (i / self._ResPerPage + 1, pages / self._ResPerPage)
            else:
                print "Getting first page."
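
            # "num" tells Google how many results to return per page and "start"
            # is the zero-based result offset, which is why the loop above steps
            # by self._ResPerPage rather than by page number.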

            if site == "":
                address = "http://www.google.com/search?q=%s&num=%s&hl=en&start=%s" % (urllib.quote_plus(query), self._ResPerPage, i)
            else:
                address = "http://www.google.com/search?q=%s%s&num=%s&hl=en&start=%s" % (urllib.quote_plus(query), "+site:" + str(site), self._ResPerPage, i)

            request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4_CrCrw) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
            urlfile = urllib2.urlopen(request)
            page = urlfile.read()
            soup = BeautifulSoup(page)

            # Each organic result sits in an <li class="g"> element; grab its first link.
            for li in soup.findAll('li', attrs={'class': 'g'}):
                slink = li.find('a')
                results.append(slink['href'])
                fh.write(slink['href'] + "\n")

        fh.close()
        return results

    def get_source(self, links):
        """
        Gets the HTML source for each link.
        :param links: Array of links.
        :return: Array of sources.
        """
        source_array = []
        failed = []

        print("Getting %s pages." % len(links))
        widgets = ['Getting HTML: ', Percentage(), ' ', Bar(marker='-', left='[', right=']'), ' ', ETA(), ' ', FileTransferSpeed()]

        pbar = ProgressBar(widgets=widgets, maxval=len(links))
        pbar.start()

        for i in range(len(links)):  # Start at 0 so the first link is not skipped.
            try:
                response = urllib2.urlopen(links[i])
                source_array.append(response.read())
            except Exception:
                failed.append(i)

            pbar.update(i)
            time.sleep(0.01)

        pbar.finish()
        print ""
        return source_array

    def extract_emails(self, sources):
        """
        Extract emails (regex) from an array of HTML sources.
        :param sources: An array of sources to search in.
        :return: An array of emails.
        """
        fh = open(self._Search + "all_emails.txt", "w")

        emails = []
        email_regex = r'([A-Za-z0-9\.\-\_]+@[A-Za-z0-9\.\-\_]+\.[A-Za-z]+)'

        for source in sources:
            a = re.findall(email_regex, source)
            for each in a:
                emails.append(each)
                fh.write(each + "\n")

        fh.close()

        # Drop duplicates while keeping the original order; the unique list goes
        # to filtered_emails.txt.
        s = []
        for i in emails:
            if i not in s:
                s.append(i)

        fh = open(self._Search + "filtered_emails.txt", "w")
        for mail in s:
            fh.write(mail + "\n")
        fh.close()

        return emails

    def RunSearchOnQuery(self, query, pages=0, search_results_per_page=100):
        """
        A convenience function that bundles it all together.
        :param query: Search query.
        :param pages: Number of pages.
        :param search_results_per_page: How many results per page. Default is 100.
        :return: List of emails.
        """

        self._Search = query
        self._ResPerPage = search_results_per_page

        links = self.get_links(query, pages)
        sources = self.get_source(links)
        return self.extract_emails(sources)


if __name__ == "__main__":
    print("This should not be run as a standalone.\nCall it as a method.")
    sys.exit(1)
--------------------------------------------------------------------------------