├── .gitignore
├── README.md
├── pastebin
    ├── README.md
    └── pastes.py
├── twitter
    ├── README.md
    ├── utils.py
    ├── images.py
    └── links.py
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # osint-tools
2 | A small collection of OSINT data collection tools.
3 | 


--------------------------------------------------------------------------------
/pastebin/README.md:
--------------------------------------------------------------------------------
1 | # pastebin
2 | 
3 | Scrape recent paste metadata from pastebin in real-time.
4 | 
5 | Streams newline-delimited JSON to stdout with the keys `id`, `title`, and `syntax` (if there is one).
6 | 
7 | You can access the paste content using pastebin.com/raw/[id] but they have a habit of temporarily
8 | blocking IP addresses that do this too frequently so use at your own risk (or behind someone else's IP).
9 | 


--------------------------------------------------------------------------------
/twitter/README.md:
--------------------------------------------------------------------------------
 1 | # twitter
 2 | 
 3 | These tools require you to have [Twitter API keys](https://developer.twitter.com/apps) set as the following environment variables:
 4 | 
 5 | - TWITTER_CONSUMER_KEY
 6 | - TWITTER_CONSUMER_SECRET
 7 | - TWITTER_ACCESS_TOKEN_KEY
 8 | - TWITTER_ACCESS_TOKEN_SECRET
 9 | 
10 | ## links.py
11 | Streams real-time JSON metadata about Tweets that contain links to stdout.
12 | 
13 | ## images.py
14 | Streams real-time JSON metadata about Tweets that contain images to stdout.
15 | 
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 davy wybiral
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/twitter/utils.py:
--------------------------------------------------------------------------------
 1 | from os import environ
 2 | from time import sleep
 3 | 
 4 | try:
 5 |     from twitter import Api
 6 | except:
 7 |     print('Requires python-twitter: pip install python-twitter')
 8 |     exit(1)
 9 | 
10 | 
11 | class Twitter:
12 | 
13 |     def __init__(self, **kwargs):
14 |         # Instantiate Twitter API
15 |         # Set env variables with values from https://developer.twitter.com/apps
16 |         self.api = Api(
17 |             consumer_key=environ['TWITTER_CONSUMER_KEY'],
18 |             consumer_secret=environ['TWITTER_CONSUMER_SECRET'],
19 |             access_token_key=environ['TWITTER_ACCESS_TOKEN_KEY'],
20 |             access_token_secret=environ['TWITTER_ACCESS_TOKEN_SECRET'],
21 |         )
22 | 
23 |     def stream_sample(self):
24 |         # backoff to avoid being put in timeout by Twitter if errors occur
25 |         backoff = 1
26 |         while True:
27 |             try:
28 |                 for tweet in self.api.GetStreamSample():
29 |                     # Reset backoff since request succeeded
30 |                     backoff = 1
31 |                     yield tweet
32 |             except Exception as e:
33 |                 # Sometimes GetStreamSample connection fails
34 |                 sleep(backoff)
35 |                 # exponential backoff for repeated errors
36 |                 backoff *= 2
37 | 


--------------------------------------------------------------------------------
/twitter/images.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Stream newline-delimited JSON of objects of Tweets containing images.
 3 | 
 4 | Structure:
 5 | {"tweet_id": <str>, "user_id": <str>, "images": [list of url strings]}
 6 | 
 7 | Data obtained from real-time Twitter API stream sample.
 8 | '''
 9 | 
10 | from json import dumps
11 | from utils import Twitter
12 | 
13 | def main():
14 |     try:
15 |         for tweet in twitter_images():
16 |             print(dumps(tweet), flush=True)
17 |     except KeyboardInterrupt:
18 |         return
19 | 
20 | # Yields tweets with images from Twitter stream sample.
21 | # Each yielded objects contains tweet_id, user_id, images[]
22 | def twitter_images(**kwargs):
23 |     twitter = Twitter(**kwargs)
24 |     for tweet in twitter.stream_sample():
25 |         if 'entities' not in tweet:
26 |             continue
27 |         entities = tweet['entities']
28 |         if 'media' not in entities:
29 |             continue
30 |         images = set()
31 |         for media in entities['media']:
32 |             url = media['media_url_https']
33 |             if 'video_thumb' in url:
34 |                 # Ignore video thumbnails
35 |                 continue
36 |             images.add(url)
37 |         if not images:
38 |             continue
39 |         yield {
40 |             'tweet_id': tweet['id_str'],
41 |             'user_id': tweet['user']['id_str'],
42 |             'images': sorted(images),
43 |         }
44 | 
45 | # Start at main if executed as a program
46 | if __name__ == '__main__':
47 |     main()
48 | 


--------------------------------------------------------------------------------
/twitter/links.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Stream newline-delimited JSON of objects of Tweets containing links.
 3 | 
 4 | Structure:
 5 | {"tweet_id": <str>, "user_id": <str>, "links": [list of url strings]}
 6 | 
 7 | Data obtained from real-time Twitter API stream sample.
 8 | '''
 9 | 
10 | from json import dumps
11 | from utils import Twitter
12 | 
13 | def main():
14 |     try:
15 |         for tweet in twitter_links():
16 |             print(dumps(tweet), flush=True)
17 |     except KeyboardInterrupt:
18 |         return
19 | 
20 | # Yields tweets with links from Twitter stream sample.
21 | # Each yielded objects contains tweet_id, user_id, links[]
22 | def twitter_links(**kwargs):
23 |     twitter = Twitter(**kwargs)
24 |     for tweet in twitter.stream_sample():
25 |         if 'entities' not in tweet:
26 |             continue
27 |         entities = tweet['entities']
28 |         if 'urls' not in entities:
29 |             continue
30 |         links = set()
31 |         for url in entities['urls']:
32 |             if 'unwound' in url:
33 |                 u = url['unwound']['url']
34 |             else:
35 |                 u = url['expanded_url']
36 |             # Ignore tweet links
37 |             if u.startswith('https://twitter.com'):
38 |                 continue
39 |             links.add(u)
40 |         if not links:
41 |             continue
42 |         yield {
43 |             'tweet_id': tweet['id_str'],
44 |             'user_id': tweet['user']['id_str'],
45 |             'links': sorted(links),
46 |         }
47 | 
48 | # Start at main if executed as a program
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/pastebin/pastes.py:
--------------------------------------------------------------------------------
 1 | from json import dumps
 2 | from time import sleep
 3 | 
 4 | try:
 5 |     from bs4 import BeautifulSoup
 6 | except:
 7 |     print('Requires bs4: pip install bs4')
 8 |     exit(1)
 9 | 
10 | try:
11 |     import requests
12 | except:
13 |     print('Requires requests: pip install requests')
14 |     exit(1)
15 | 
16 | # Number of IDs to store in memory to avoid duplicates
17 | MAX_CACHE = 100
18 | # Seconds between crawls
19 | CRAWL_DELAY = 5.0
20 | 
21 | def main():
22 |     try:
23 |         for paste in paste_stream():
24 |             print(dumps(paste), flush=True)
25 |     except KeyboardInterrupt:
26 |         return
27 | 
28 | def paste_stream():
29 |     url = 'https://pastebin.com/archive'
30 |     cache = []
31 |     while True:
32 |         r = requests.get(url)
33 |         s = BeautifulSoup(r.content, 'html.parser')
34 |         table = s.find('table', {'class': 'maintable'})
35 |         trs = table.find_all('tr')
36 |         # Skip header row
37 |         trs = trs[1:]
38 |         for tr in trs:
39 |             tds = tr.find_all('td')
40 |             a = tds[0].find('a')
41 |             paste_id = a['href'][1:]
42 |             if paste_id in cache:
43 |                 # Ignore recently seen
44 |                 continue
45 |             obj = {'id': paste_id, 'title': a.string}
46 |             syntax = tds[2].string
47 |             if syntax != '-':
48 |                 # Check for syntax
49 |                 obj['syntax'] = syntax
50 |             yield obj
51 |             cache.append(paste_id)
52 |             if len(cache) > MAX_CACHE:
53 |                 cache = cache[-MAX_CACHE:]
54 |         sleep(CRAWL_DELAY)
55 | 
56 | # Start at main if executed as a program
57 | if __name__ == '__main__':
58 |     main()
59 | 


--------------------------------------------------------------------------------