├── .gitignore ├── README.md ├── pastebin ├── README.md └── pastes.py ├── twitter ├── README.md ├── utils.py ├── images.py └── links.py └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # osint-tools 2 | A small collection of OSINT data collection tools. 3 | -------------------------------------------------------------------------------- /pastebin/README.md: -------------------------------------------------------------------------------- 1 | # pastebin 2 | 3 | Scrape recent paste metadata from pastebin in real-time. 4 | 5 | Streams newline-delimited JSON to stdout with the keys `id`, `title`, and `syntax` (if there is one). 6 | 7 | You can access the paste content using pastebin.com/raw/[id] but they have a habit of temporarily 8 | blocking IP addresses that do this too frequently so use at your own risk (or behind someone else's IP). 9 | -------------------------------------------------------------------------------- /twitter/README.md: -------------------------------------------------------------------------------- 1 | # twitter 2 | 3 | These tools require you to have [Twitter API keys](https://developer.twitter.com/apps) set as the following environment variables: 4 | 5 | - TWITTER_CONSUMER_KEY 6 | - TWITTER_CONSUMER_SECRET 7 | - TWITTER_ACCESS_TOKEN_KEY 8 | - TWITTER_ACCESS_TOKEN_SECRET 9 | 10 | ## links.py 11 | Streams real-time JSON metadata about Tweets that contain links to stdout. 12 | 13 | ## images.py 14 | Streams real-time JSON metadata about Tweets that contain images to stdout. 15 | 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 davy wybiral 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /twitter/utils.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | from time import sleep 3 | 4 | try: 5 | from twitter import Api 6 | except: 7 | print('Requires python-twitter: pip install python-twitter') 8 | exit(1) 9 | 10 | 11 | class Twitter: 12 | 13 | def __init__(self, **kwargs): 14 | # Instantiate Twitter API 15 | # Set env variables with values from https://developer.twitter.com/apps 16 | self.api = Api( 17 | consumer_key=environ['TWITTER_CONSUMER_KEY'], 18 | consumer_secret=environ['TWITTER_CONSUMER_SECRET'], 19 | access_token_key=environ['TWITTER_ACCESS_TOKEN_KEY'], 20 | access_token_secret=environ['TWITTER_ACCESS_TOKEN_SECRET'], 21 | ) 22 | 23 | def stream_sample(self): 24 | # backoff to avoid being put in timeout by Twitter if errors occur 25 | backoff = 1 26 | while True: 27 | try: 28 | for tweet in self.api.GetStreamSample(): 29 | # Reset backoff since request succeeded 30 | backoff = 1 31 | yield tweet 32 | except Exception as e: 33 | # Sometimes GetStreamSample connection fails 34 | sleep(backoff) 35 | # exponential backoff for repeated errors 36 | backoff *= 2 37 | -------------------------------------------------------------------------------- /twitter/images.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Stream newline-delimited JSON of objects of Tweets containing images. 3 | 4 | Structure: 5 | {"tweet_id": , "user_id": , "images": [list of url strings]} 6 | 7 | Data obtained from real-time Twitter API stream sample. 8 | ''' 9 | 10 | from json import dumps 11 | from utils import Twitter 12 | 13 | def main(): 14 | try: 15 | for tweet in twitter_images(): 16 | print(dumps(tweet), flush=True) 17 | except KeyboardInterrupt: 18 | return 19 | 20 | # Yields tweets with images from Twitter stream sample. 21 | # Each yielded objects contains tweet_id, user_id, images[] 22 | def twitter_images(**kwargs): 23 | twitter = Twitter(**kwargs) 24 | for tweet in twitter.stream_sample(): 25 | if 'entities' not in tweet: 26 | continue 27 | entities = tweet['entities'] 28 | if 'media' not in entities: 29 | continue 30 | images = set() 31 | for media in entities['media']: 32 | url = media['media_url_https'] 33 | if 'video_thumb' in url: 34 | # Ignore video thumbnails 35 | continue 36 | images.add(url) 37 | if not images: 38 | continue 39 | yield { 40 | 'tweet_id': tweet['id_str'], 41 | 'user_id': tweet['user']['id_str'], 42 | 'images': sorted(images), 43 | } 44 | 45 | # Start at main if executed as a program 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /twitter/links.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Stream newline-delimited JSON of objects of Tweets containing links. 3 | 4 | Structure: 5 | {"tweet_id": , "user_id": , "links": [list of url strings]} 6 | 7 | Data obtained from real-time Twitter API stream sample. 8 | ''' 9 | 10 | from json import dumps 11 | from utils import Twitter 12 | 13 | def main(): 14 | try: 15 | for tweet in twitter_links(): 16 | print(dumps(tweet), flush=True) 17 | except KeyboardInterrupt: 18 | return 19 | 20 | # Yields tweets with links from Twitter stream sample. 21 | # Each yielded objects contains tweet_id, user_id, links[] 22 | def twitter_links(**kwargs): 23 | twitter = Twitter(**kwargs) 24 | for tweet in twitter.stream_sample(): 25 | if 'entities' not in tweet: 26 | continue 27 | entities = tweet['entities'] 28 | if 'urls' not in entities: 29 | continue 30 | links = set() 31 | for url in entities['urls']: 32 | if 'unwound' in url: 33 | u = url['unwound']['url'] 34 | else: 35 | u = url['expanded_url'] 36 | # Ignore tweet links 37 | if u.startswith('https://twitter.com'): 38 | continue 39 | links.add(u) 40 | if not links: 41 | continue 42 | yield { 43 | 'tweet_id': tweet['id_str'], 44 | 'user_id': tweet['user']['id_str'], 45 | 'links': sorted(links), 46 | } 47 | 48 | # Start at main if executed as a program 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /pastebin/pastes.py: -------------------------------------------------------------------------------- 1 | from json import dumps 2 | from time import sleep 3 | 4 | try: 5 | from bs4 import BeautifulSoup 6 | except: 7 | print('Requires bs4: pip install bs4') 8 | exit(1) 9 | 10 | try: 11 | import requests 12 | except: 13 | print('Requires requests: pip install requests') 14 | exit(1) 15 | 16 | # Number of IDs to store in memory to avoid duplicates 17 | MAX_CACHE = 100 18 | # Seconds between crawls 19 | CRAWL_DELAY = 5.0 20 | 21 | def main(): 22 | try: 23 | for paste in paste_stream(): 24 | print(dumps(paste), flush=True) 25 | except KeyboardInterrupt: 26 | return 27 | 28 | def paste_stream(): 29 | url = 'https://pastebin.com/archive' 30 | cache = [] 31 | while True: 32 | r = requests.get(url) 33 | s = BeautifulSoup(r.content, 'html.parser') 34 | table = s.find('table', {'class': 'maintable'}) 35 | trs = table.find_all('tr') 36 | # Skip header row 37 | trs = trs[1:] 38 | for tr in trs: 39 | tds = tr.find_all('td') 40 | a = tds[0].find('a') 41 | paste_id = a['href'][1:] 42 | if paste_id in cache: 43 | # Ignore recently seen 44 | continue 45 | obj = {'id': paste_id, 'title': a.string} 46 | syntax = tds[2].string 47 | if syntax != '-': 48 | # Check for syntax 49 | obj['syntax'] = syntax 50 | yield obj 51 | cache.append(paste_id) 52 | if len(cache) > MAX_CACHE: 53 | cache = cache[-MAX_CACHE:] 54 | sleep(CRAWL_DELAY) 55 | 56 | # Start at main if executed as a program 57 | if __name__ == '__main__': 58 | main() 59 | --------------------------------------------------------------------------------