├── .gitignore ├── LICENCE ├── README.md ├── keywords.txt └── pastebin.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | 38 | # OS generated files # 39 | ###################### 40 | .DS_Store 41 | .DS_Store? 42 | ._* 43 | .Spotlight-V100 44 | .Trashes 45 | ehthumbs.db 46 | Thumbs.db 47 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) {{2014}} {{Ryan M. Clough}} 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pastebin-Scraper 2 | ================ 3 | 4 | Monitors pastebin.com for a specified set of keywords. 5 | 6 | Usage 7 | ===== 8 | 9 | Copy pastebin.py to the location of your choice. Make a file called 'keywords.txt' in the same directory. When a paste contains a word from the keywords.txt file, the paste will be saved in a 'Pastebin' folder within date-sorted folders. 10 | - pastebin.py 11 | - keywords.txt 12 | - Pastebin 13 | - Year 14 | - Month 15 | - Day 16 | - date_pasteID_matchedKeywords.txt 17 | - date_pasteID_matchedKeywords.txt 18 | - date_pasteID_matchedKeywords.txt 19 | 20 | Dependencies 21 | ============ 22 | 23 | pastebin.py was developed to run on Windows 7. The script requires that TOR be installed (https://www.torproject.org/download/download). The following Python modules are required: 24 | - Socks (http://socksipy.sourceforge.net/) 25 | - SocksiPyHandler (https://github.com/Anorov/PySocks) 26 | -------------------------------------------------------------------------------- /keywords.txt: -------------------------------------------------------------------------------- 1 | some 2 | newline 3 | separated 4 | keywords -------------------------------------------------------------------------------- /pastebin.py: -------------------------------------------------------------------------------- 1 | ''' 2 | pastebin.py 3 | Author: Ryan Clough 4 | Scrapes Pastebin in real time over the TOR network 5 | If there is a match on a keyword (located in keywords.txt) 6 | save the file locally.
7 | 8 | Developed for Windows 7 9 | ''' 10 | 11 | import re 12 | import os 13 | import sys 14 | import time 15 | import socks 16 | import urllib2 17 | from os import path as op 18 | from datetime import datetime 19 | 20 | # TOR Proxy 21 | from sockshandler import SocksiPyHandler 22 | opener = urllib2.build_opener(SocksiPyHandler( 23 | socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050)) 24 | tor_exe_path = op.join('C:/', 'Program Files (x86)', 'Tor', 'tor.exe') 25 | 26 | # A few variables 27 | _verbose = True 28 | dir = op.dirname(op.realpath(__file__)) 29 | pastebin_overload = 'Hey, it seems you are requesting a little bit too \ 30 | much from Pastebin. Please slow down!' 31 | timeout = 10 32 | 33 | 34 | def get_url(url): 35 | try: 36 | html = opener.open(url, timeout=timeout).read() 37 | except KeyboardInterrupt: 38 | # Kill program if Control-C is pressed 39 | sys.exit(0) 40 | except: 41 | # Restart TOR if theres a problem 42 | e = sys.exc_info()[0] 43 | if _verbose: 44 | print "ERROR: " + str(e) 45 | os.system('taskkill /im tor.exe') 46 | if _verbose: 47 | print "INFO: Restarting TOR" 48 | time.sleep(1) 49 | os.startfile(tor_exe_path) 50 | return '' 51 | 52 | # See if pastebin is complaining 53 | if html == pastebin_overload: 54 | if _verbose: 55 | print 'ERROR: Too many requests' 56 | time.sleep(5) 57 | return '' 58 | # If not, we are good 59 | elif html: 60 | return html 61 | # Tor download error, skip it and move on 62 | else: 63 | if _verbose: 64 | print 'ERROR: Download is empty - ' + url 65 | return '' 66 | 67 | 68 | # Get the most recent 200 pastes from pastebin.com/archive 69 | def get_recent_pastes(): 70 | html = get_url('http://pastebin.com/archive') 71 | pastes = re.findall(r' \/>(.+?)<\/a><\/td>', html) 72 | return pastes 73 | 74 | 75 | # Get raw, individual paste given a paste id 76 | def get_paste(paste_id): 77 | url = 'http://pastebin.com/raw.php?i=' + paste_id 78 | return get_url(url) 79 | 80 | 81 | def main(): 82 | # Insert the 200 latest pastes into 
recent pastes 83 | # so we don't start out way behind real time 84 | recent_pastes = [] 85 | for paste_id, paste_title in get_recent_pastes(): 86 | recent_pastes.append(paste_id) 87 | 88 | while True: 89 | # Once we have processed 65 pastes, reset recent pastes to 90 | # make sure we dont fall behind real time 91 | if len(recent_pastes) > 275: 92 | if _verbose: 93 | print 'INFO: Resetting recent_pastes' 94 | recent_pastes = [] 95 | for paste_id, paste_title in get_recent_pastes(): 96 | recent_pastes.append(paste_id) 97 | 98 | #Get the new pastes and continue if successful 99 | new_pastes = get_recent_pastes() 100 | if new_pastes: 101 | for paste_id, paste_title in new_pastes: 102 | if paste_id not in recent_pastes: 103 | 104 | # Get paste and check for errors 105 | paste_text = get_paste(paste_id) 106 | if paste_text == '': 107 | continue 108 | if _verbose: 109 | print 'INFO: Success ' + paste_id 110 | 111 | # Run paste text against keywords and record hits 112 | keywords = [line.strip() for line in open( 113 | op.join(dir, 'keywords.txt'), 'r') 114 | ] 115 | hits = [] 116 | for keyword in keywords: 117 | if keyword.lower() in paste_text.lower(): 118 | hits.append(keyword) 119 | 120 | # If there are hits, save to file system 121 | if hits: 122 | if _verbose: 123 | print 'INFO: Keyword Hit ' + paste_id + ' ' + ','.join(hits) 124 | year = datetime.now().strftime('%Y') 125 | month = datetime.now().strftime('%m') 126 | date = datetime.now().strftime('%d') 127 | yyyymmdd = datetime.now().strftime('%Y%m%d') 128 | 129 | if not op.exists(op.join(dir, 'Pastebin')): 130 | os.makedirs(op.join(dir, 'Pastebin')) 131 | os.chdir(op.join(dir, 'Pastebin')) 132 | if not op.exists(op.join(year, month, date)): 133 | os.makedirs(op.join(year, month, date)) 134 | 135 | os.chdir(op.join(year, month, date)) 136 | fname = yyyymmdd + '_' + paste_id 137 | for hit in hits: 138 | fname += '_' + ''.join(ch for ch in hit if ch.isalnum()) 139 | fname += '.txt' 140 | with open(fname, 'w') as f: 
141 | f.write('Title: ' + paste_title + '\n') 142 | f.write(paste_text) 143 | 144 | # Add current paste to recent paste list 145 | recent_pastes.append(paste_id) 146 | 147 | 148 | if __name__ == '__main__': 149 | main() 150 | --------------------------------------------------------------------------------