├── .gitignore ├── LICENSE ├── README.md ├── credentials.json.example ├── m-scraper.py ├── m_scraper └── rq │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── downloader.py │ ├── error.txt │ ├── instagramer.py │ ├── pixiver.py │ ├── redditer.py │ ├── stats.py │ ├── tiktoker.py │ ├── tumblrer.py │ └── utils │ ├── helpers.py │ └── instagram.py ├── mediascraper ├── general.py ├── instagram.py ├── interactive │ ├── instagram.py │ └── twitter.py └── twitter.py ├── mediascrapers.py ├── mediatypes ├── README.md ├── audio.csv ├── image.csv └── video.csv ├── requirements.txt └── util ├── file.py ├── instagram.py ├── seleniumdriver.py ├── twitter.py └── url.py
/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | download/ 4 | download_*/ 5 | webdriver/ 6 | build/ 7 | dist/ 8 | credentials.json 9 | *.log 10 | test.* 11 | example_sites.txt 12 | *.spec 13 | error.txt 14 | debug.txt 15 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Elvis Yu-Jing Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Media Scraper 2 | 3 | `media-scraper` scrapes all photos and videos on a web page. 4 | It supports general-purpose scraping as well as SNS-specific scraping. 5 | 6 | `media-scraper` uses a web driver to simulate a user browsing web pages. 7 | With the web driver, sessions and cookies can be handled easily, but scraping runs somewhat more slowly. 8 | On the other hand, I'm currently migrating another [repository](https://github.com/elvisyjlin/tumblrer), 9 | which crawls media by HTTP requests only, into this repository. See [here](https://github.com/elvisyjlin/media-scraper/tree/master/tumblrer). 10 | 11 | 12 | ##### General-purpose Scraping 13 | 14 | The general media scraper scrapes and downloads all photos and videos 15 | in all links `<a>`, images `<img>` and videos `<video>` of the given web page.
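For example, the general scraper can be driven from a few lines of Python; the snippet below mirrors `mediascraper/general.py` (shown later in this listing), with a placeholder URL:

import mediascrapers

scraper = mediascrapers.MediaScraper(mode='normal', debug=False)  # starts a PhantomJS web driver by default
tasks = scraper.scrape('https://www.example.com/gallery')         # collects media URLs from <a>, <img> and <video> tags
scraper.download(tasks=tasks, path='download/general')            # saves files grouped by page title under download/general

The SNS-specific scripts (`mediascraper/instagram.py`, `mediascraper/twitter.py` and their `interactive/` variants) follow the same pattern but take usernames instead of URLs.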
77 | 106 | 107 | 108 | 109 | 110 | Expecting value: line 1 column 1 (char 0) -------------------------------------------------------------------------------- /m_scraper/rq/instagramer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import time 8 | import os 9 | from .downloader import Downloader 10 | from .utils.instagram import * 11 | 12 | class Instagramer(Downloader): 13 | def __init__(self): 14 | super(Instagramer, self).__init__() 15 | self.description = 'Instagramer' 16 | self.identifier = 'instagram' 17 | self.keyword = 'username' 18 | self.save_path = './download_instagram' 19 | self.api = { 20 | 'base': 'https://www.instagram.com', 21 | 'posts': 'https://www.instagram.com/graphql/query/?query_hash={}&variables={{"id":"{}","first":{},"after":"{}"}}', 22 | 'query_hash': '9ca88e465c3f866a76f7adee3871bdd8', 23 | 'first': 12 24 | } 25 | 26 | def perform(self, tasks, username, early_stop=False): 27 | print('# of tasks:', len(tasks)) 28 | success = True 29 | for img_url, filename in tasks: 30 | success = False 31 | while not success: 32 | try: 33 | res = self.download(img_url, os.path.join(self.save_path, username, filename)) 34 | success = True 35 | except Exception as e: 36 | print(e) 37 | print('Sleep for 1 hour...') 38 | time.sleep(1 * 60 * 60) 39 | success = success and res 40 | if early_stop and not success: 41 | return res 42 | return res 43 | 44 | def crawl(self, username, early_stop=False): 45 | print('Instagramer Task:', username) 46 | tasks, end_cursor, has_next, length, user_id, rhx_gis, csrf_token = get_first_page(username) 47 | if tasks is None: 48 | return -1 49 | res = self.perform(tasks, username, early_stop=early_stop) 50 | if early_stop and res == 1: 51 | return 0 52 | while has_next: 53 | tasks, end_cursor, has_next, length = get_following_page(query_hash, user_id, end_cursor, rhx_gis, csrf_token) 54 | res = self.perform(tasks, username, early_stop=early_stop) 55 | if early_stop and res == 1: 56 | return 0 57 | 58 | if __name__ == '__main__': 59 | instagramer = Instagramer() 60 | instagramer.run() -------------------------------------------------------------------------------- /m_scraper/rq/pixiver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | from .downloader import Downloader 11 | from .utils.helpers import save_json 12 | 13 | class Pixiver(Downloader): 14 | def __init__(self): 15 | super(Pixiver, self).__init__() 16 | self.description = 'Pixiver' 17 | self.identifier = 'pixiv' 18 | self.keyword = 'userid' 19 | self.save_path = './download_pixiv' 20 | # Daily Ranking 'https://www.pixiv.net/ranking.php?mode=daily&p=1&format=json' 21 | 22 | def login(self, username, password): 23 | print('Logging in pixiv account...') 24 | res = self.sess.get('https://accounts.pixiv.net/login') 25 | if res.status_code == 200: 26 | soup = BeautifulSoup(res.text, 'html.parser') 27 | post_key = soup.select('input[name==post_key]')[0]['value'] 28 | else: 29 | print(res.text) 30 | raise Exception(res.status_code) 31 | 32 | form_data = { 33 | 'captcha': None, 34 | 
'g_recaptcha_response': None, 35 | 'pixiv_id': username, 36 | 'password': password, 37 | 'post_key': post_key, 38 | 'source': 'accounts', 39 | 'ref': None, 40 | 'return_to': 'https://www.pixiv.net/' 41 | } 42 | res = self.sess.post('https://accounts.pixiv.net/api/login', data=form_data) 43 | if res.status_code == 200: 44 | print(res.text) 45 | else: 46 | print(res.text) 47 | raise Exception(res.status_code) 48 | print('Logged in successfully.') 49 | 50 | def download_illust(self, id, path): 51 | data_file = os.path.join(path, '{:s}.json'.format(id)) 52 | headers = { 53 | 'Referer': 'http://www.pixiv.net/member_illust.php?mode=medium&illust_id={:s}'.format(id) 54 | } 55 | if os.path.exists(data_file): 56 | return True 57 | res = self.sess.get('https://www.pixiv.net/ajax/illust/{:s}'.format(id)) 58 | if res.status_code == 200: 59 | data = res.json() 60 | first_image_url = data['body']['urls']['original'] 61 | first_image_filename = os.path.join(path, os.path.basename(first_image_url).split('?', 1)[0]) 62 | page_count = data['body']['pageCount'] 63 | illust_type = data['body']['illustType'] # 0: single, 1: multiple, 2: animated 64 | if illust_type == 0: 65 | self.download(first_image_url, first_image_filename, headers=headers) 66 | elif illust_type == 1: 67 | res = self.sess.get('https://www.pixiv.net/ajax/illust/{:s}/pages'.format(id)) 68 | if res.status_code == 200: 69 | data = res.json() 70 | for page in data['body']: 71 | url = page['urls']['original'] 72 | filename = os.path.join(path, os.path.basename(url).split('?', 1)[0]) 73 | self.download(url, filename, headers=headers) 74 | else: 75 | print(res.text) 76 | raise Exception(res.status_code) 77 | elif illust_type == 2: 78 | self.download(first_image_url, first_image_filename, headers=headers) 79 | res = self.sess.get('https://www.pixiv.net/ajax/illust/{:s}/ugoira_meta'.format(id)) 80 | if res.status_code == 200: 81 | data = res.json() 82 | zip_url = data['body']['src'] 83 | zip_filename = os.path.join(path, os.path.basename(zip_url).split('?', 1)[0]) 84 | self.download(zip_url, zip_filename, headers=headers) 85 | zip_url = data['body']['originalSrc'] 86 | zip_filename = os.path.join(path, os.path.basename(zip_url).split('?', 1)[0]) 87 | self.download(zip_url, zip_filename, headers=headers) 88 | else: 89 | print(res.text) 90 | raise Exception(res.status_code) 91 | else: 92 | raise Exception('Invalid illustType: {}'.format(illust_type)) 93 | save_json(data, data_file) # Save JSON data once images are downloaded successfully 94 | else: 95 | print(res.text) 96 | raise Exception(res.status_code) 97 | return False 98 | 99 | def crawl(self, userid, early_stop=False): 100 | print('Pixiver Task:', userid) 101 | url = 'https://www.pixiv.net/ajax/user/{:s}/profile/all'.format(userid) 102 | res = self.sess.get(url) 103 | if res.status_code == 200: 104 | # 'manga', 'bookmarkCount', 'mangaSeries', 'illusts', 'novels', 'pickup', 'novelSeries' 105 | data = res.json() 106 | data_file = os.path.join(self.save_path, userid, 'all.json') 107 | if os.path.exists(data_file): 108 | i = 2 109 | while os.path.exists(data_file): 110 | data_file = os.path.join(self.save_path, userid, 'all-{:d}.json'.format(i)) 111 | i += 1 112 | save_json(data, data_file) 113 | for illust_id in tqdm(reversed(sorted(data['body']['illusts'].keys()))): 114 | # From the latest illust to the oldest one 115 | path = os.path.join(self.save_path, userid, 'illusts') 116 | done = self.download_illust(illust_id, path) 117 | if early_stop and done: 118 | break 119 | for manga_id in 
tqdm(reversed(sorted(data['body']['manga'].keys()))): 120 | # From the latest manga to the oldest one 121 | path = os.path.join(self.save_path, userid, 'manga') 122 | done = self.download_illust(manga_id, path) 123 | if early_stop and done: 124 | break 125 | else: 126 | print(res.text) 127 | raise Exception(res.status_code) 128 | 129 | if __name__ == '__main__': 130 | pixiver = Pixiver() 131 | pixiver.run() -------------------------------------------------------------------------------- /m_scraper/rq/redditer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import requests 8 | import os 9 | import time 10 | import json 11 | from .downloader import Downloader 12 | from .utils.helpers import get_imgur, get_gfycat, requests_get 13 | 14 | class Redditer(Downloader): 15 | def __init__(self): 16 | super(Redditer, self).__init__() 17 | self.description = 'Redditer' 18 | self.identifier = 'reddit' 19 | self.keyword = 'subreddit' 20 | self.save_path = './download_reddit' 21 | self.api = { 22 | 'base': 'https://www.reddit.com', 23 | 'posts': 'https://www.reddit.com/r/{}.json?after={}', 24 | 'search': 'https://www.reddit.com/subreddits/search.json?q={}&include_over_18=on' 25 | } 26 | 27 | def safe_download(self, subreddit, name, img_url): 28 | filename = os.path.join(self.save_path, subreddit, name + '.' + os.path.basename(img_url).split('?')[0]) 29 | success = False 30 | while not success: 31 | try: 32 | self.download(img_url, filename) 33 | success = True 34 | except requests.exceptions.ConnectionError as e: 35 | print(e) 36 | print(img_url, filename) 37 | print('Skip this.') 38 | break 39 | except Exception as e: 40 | print(e) 41 | print(img_url, filename) 42 | print('Sleep for 1 hour...') 43 | time.sleep(1*60*60) 44 | 45 | def crawl(self, subreddit, early_stop=False): 46 | print('Redditer Task:', subreddit) 47 | 48 | userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15' 49 | headers = { 50 | 'User-Agent': userAgent 51 | } 52 | 53 | after = '' 54 | done = False 55 | 56 | while not done: 57 | text = requests_get(self.api['posts'].format(subreddit, after), headers=headers) 58 | red = json.loads(text) 59 | print(len(red['data']['children'])) 60 | 61 | for child in red['data']['children']: 62 | name = child['data']['name'] 63 | img_url = child['data']['url'] 64 | if os.path.splitext(img_url)[1] == '': 65 | if 'imgur.com/' in img_url: 66 | img_url = get_imgur(img_url) 67 | if img_url is not None: 68 | self.safe_download(subreddit, name, img_url) 69 | elif 'gfycat.com/' in img_url: 70 | for vid_url in get_gfycat(img_url): 71 | self.safe_download(subreddit, name, vid_url) 72 | else: 73 | print('No media in [{}]. 
Skip it.'.format(img_url)) 74 | continue 75 | else: 76 | self.safe_download(subreddit, name, img_url) 77 | after = name 78 | 79 | if len(red['data']['children']) == 0: 80 | done = True 81 | 82 | if __name__ == '__main__': 83 | redditer = Redditer() 84 | redditer.run() -------------------------------------------------------------------------------- /m_scraper/rq/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import os 8 | 9 | if __name__ == '__main__': 10 | num_images = 0 11 | subfolders = os.listdir('./download') if os.path.exists('./download') else [] 12 | for subfolder in subfolders: 13 | num = len(os.listdir(os.path.join('./download', subfolder, 'photo'))) 14 | num_images += num 15 | print(subfolder, num) 16 | 17 | print('=== Tumblrer Stats ===') 18 | print('# of galleries:', len(subfolders)) 19 | print('# of photos:', num_images) 20 | print('======================') 21 | 22 | num_images = 0 23 | subfolders = os.listdir('./download_instagram') if os.path.exists('./download_instagram') else [] 24 | for subfolder in subfolders: 25 | num = len(os.listdir(os.path.join('./download_instagram', subfolder))) 26 | num_images += num 27 | print(subfolder, num) 28 | 29 | print('=== Instagramer Stats ===') 30 | print('# of galleries:', len(subfolders)) 31 | print('# of media:', num_images) 32 | print('======================') -------------------------------------------------------------------------------- /m_scraper/rq/tiktoker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import os 8 | from .downloader import Downloader 9 | from .utils.helpers import save_json 10 | 11 | class TikToker(Downloader): 12 | def __init__(self): 13 | super(TikToker, self).__init__() 14 | self.description = 'TikToker' 15 | self.identifier = 'tiktok' 16 | self.keyword = 'userid' 17 | self.save_path = './download_tiktok' 18 | 19 | def crawl(self, user_id, early_stop=False): 20 | path = os.path.join(self.save_path, user_id) 21 | max_cursor = '0' 22 | min_cursor = '0' 23 | extra_params = '' 24 | page_url = 'https://www.tiktok.com/share/user/{:s}'.format(user_id) 25 | headers = { 26 | 'Referer': page_url, 27 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' 28 | } 29 | data = None 30 | done = False 31 | while not done: 32 | data_url = 'https://www.tiktok.com/share/item/list?id={:s}&type=1&count=100&maxCursor={:s}&minCursor={:s}{:s}'.format(user_id, max_cursor, min_cursor, extra_params) 33 | res = self.sess.get(data_url, headers=headers) 34 | if res.status_code == 200: 35 | data = res.json() 36 | if 'body' not in data: 37 | print(data) 38 | raise Exception('body not found') 39 | itemListData = data['body']['itemListData'] 40 | max_cursor = data['body']['maxCursor'] 41 | done = not data['body']['hasMore'] 42 | for item in data['body']['itemListData']: 43 | video_id = item['itemInfos']['id'] 44 | cover_urls = item['itemInfos']['covers'] 45 | cover_origin_urls = item['itemInfos']['coversOrigin'] 46 | video_urls = item['itemInfos']['video']['urls'] 47 | assert len(cover_urls) == 1, 'Got {:d} cover urls.'.format(len(cover_urls)) 48 | assert 
len(cover_origin_urls) == 1, 'Got {:d} cover urls.'.format(len(cover_origin_urls)) 49 | assert len(video_urls) == 4, 'Got {:d} video urls.'.format(len(video_urls)) 50 | 51 | data_path = os.path.join(path, video_id + '.json') 52 | if early_stop and os.path.exists(data_path): 53 | done = True 54 | break 55 | 56 | video_url = video_urls[2] 57 | video_filname = os.path.join(path, video_id + '_watermark.mp4') 58 | self.download(video_url, video_filname, headers) 59 | video_no_watermark_url = video_urls[2].replace('watermark=1', 'watermark=0') 60 | video_no_watermar_filename = os.path.join(path, video_id + '.mp4') 61 | self.download(video_no_watermark_url, video_no_watermar_filename, headers) 62 | cover_url = cover_urls[0] 63 | cover_filename = os.path.join(path, video_id + '_cover.jpg') 64 | self.download(cover_url, cover_filename, headers) 65 | cover_origin_url = cover_origin_urls[0] 66 | cover_origin_filename = os.path.join(path, video_id + '_cover_origin.jpg') 67 | self.download(cover_origin_url, cover_origin_filename, headers) 68 | save_json(item, data_path) 69 | else: 70 | print(res.text) 71 | raise Exception(res.status_code) 72 | if data is not None: 73 | user_data = dict(data) 74 | del data['body']['itemListData'] 75 | del data['body']['hasMore'] 76 | del data['body']['maxCursor'] 77 | del data['body']['minCursor'] 78 | save_json(user_data, os.path.join(self.save_path, user_id + '.json')) 79 | 80 | if __name__ == '__main__': 81 | tiktoker = TikToker() 82 | tiktoker.run() -------------------------------------------------------------------------------- /m_scraper/rq/tumblrer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import json 8 | import time 9 | import os 10 | from .downloader import Downloader 11 | from .utils.helpers import requests_get 12 | 13 | # VAR = { 14 | # 'save_path': '.', 15 | # 'types': ['text', 'quote', 'photo', 'link', 'chat', 'video', 'audio'] 16 | # } 17 | 18 | class Tumblrer(Downloader): 19 | def __init__(self, target='media'): 20 | super(Tumblrer, self).__init__() 21 | self.description = 'Tumblrer' 22 | self.identifier = 'tumblr' 23 | self.keyword = 'sitename' 24 | self.save_path = './download_tumblr' 25 | self.api = { 26 | 'base': 'https://www.tumblr.com', 27 | 'v1': { 28 | 'read': '/api/read/json' 29 | } 30 | } 31 | 32 | if target == 'media': 33 | self.crawl = self.crawl_media 34 | elif target == 'article': 35 | self.crawl = self.crawl_article 36 | 37 | def get(self, sitename, start=0, num=50): 38 | url = 'https://' + sitename + self.api['v1']['read'] 39 | params = {'start': start, 'num': num} 40 | print('Get {} with {}'.format(url, params)) 41 | text = requests_get(url=url, params=params, verify=False) 42 | content = json.loads(text.replace('var tumblr_api_read = ', '')[:-2]) 43 | return content 44 | 45 | def crawl_media(self, sitename, early_stop=False, start=0, num=50): 46 | print('Tumblrer Task:', sitename) 47 | total = start + 1 48 | while start < total: 49 | content = self.get(sitename, start, num) 50 | if type(content) is not dict: 51 | start += 1 52 | continue 53 | with open('current.json', 'w') as f: 54 | f.write(json.dumps(content)) 55 | blog_name = content['tumblelog']['name'] 56 | start = content['posts-start'] 57 | total = content['posts-total'] 58 | posts = content['posts'] 59 | print('[{}/{}]'.format(start, total), '# of posts: 
{}'.format(len(posts))) 60 | for post in posts: 61 | if 'photo-url-1280' in post: 62 | img_url = post['photo-url-1280'] 63 | filename = os.path.join(self.save_path, blog_name, post['type'], 64 | str(post['id']) + '.' + img_url.rsplit('/', 1)[1]) 65 | success = False 66 | while not success: 67 | try: 68 | res = self.download(img_url, filename) 69 | success = True 70 | except Exception as e: 71 | print(e) 72 | print('Sleep for 1 minute...') 73 | time.sleep(1 * 60) 74 | if early_stop and res == 1: 75 | return 0 76 | else: 77 | print(post['id'], post['url'], post['type']) 78 | if 'photos' in post and post['photos'] != []: 79 | print('photos', len(post['photos'])) 80 | for photo in post['photos']: 81 | img_url = photo['photo-url-1280'] 82 | filename = os.path.join(self.save_path, blog_name, post['type'], 83 | str(post['id']) + '.' + img_url.rsplit('/', 1)[1]) 84 | success = False 85 | while not success: 86 | try: 87 | self.download(img_url, filename) 88 | success = True 89 | except Exception as e: 90 | print(e) 91 | print('Sleep for 1 minute...') 92 | time.sleep(1 * 60) 93 | start += num 94 | return 0 95 | 96 | def crawl_article(self, sitename, early_stop=False, start=0, num=50): 97 | print('Tumblrer Task:', sitename) 98 | total = start + 1 99 | total_posts = [] 100 | while start < total: 101 | content = self.get(sitename, start, num) 102 | if type(content) is not dict: 103 | start += 1 104 | continue 105 | with open('current.json', 'w') as f: 106 | f.write(json.dumps(content)) 107 | blog_name = content['tumblelog']['name'] 108 | start = content['posts-start'] 109 | total = content['posts-total'] 110 | posts = content['posts'] 111 | total_posts += posts 112 | print('[{}/{}]'.format(start, total), '# of posts: {}'.format(len(posts))) 113 | start += num 114 | with open('posts.json', 'w', encoding='utf-8') as f: 115 | f.write(json.dumps(total_posts)) 116 | return 0 117 | 118 | if __name__ == '__main__': 119 | tumblrer = Tumblrer(target="article") 120 | tumblrer.run() -------------------------------------------------------------------------------- /m_scraper/rq/utils/helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import json 8 | import requests 9 | import os 10 | import time 11 | 12 | def log(msg, file='log.txt'): 13 | print(msg) 14 | with open(file, 'a') as f: 15 | f.write(msg + '\n') 16 | 17 | def requests_get(url, fn=None, **kwarg): 18 | res = None 19 | data = None 20 | success = False 21 | while not success: 22 | try: 23 | res = requests.get(url, **kwarg) 24 | text = res.text 25 | data = text if fn is None else fn(text) 26 | success = True 27 | except Exception as e: 28 | print('Error when getting', url) 29 | f = open('error.txt', 'w', encoding='utf-8') 30 | f.write(url + '\n\n') 31 | if res is not None: 32 | f.write(text + '\n\n') 33 | f.write(str(e)) 34 | f.close() 35 | print('Error details are saved in error.txt') 36 | print('Sleep for 1 hour...') 37 | time.sleep(1 * 60 * 60) 38 | return data 39 | 40 | def get_imgur(url): 41 | ''' 42 | Returns an image url (jpg, png, gif, ...) of the given Imgur url. 
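For example, 'https://imgur.com/AbCdEf' would be resolved to something like 'https://i.imgur.com/AbCdEf.jpg', with the extension taken from the Content-Type of a HEAD request ('AbCdEf' is an illustrative id); None is returned if Imgur reports an empty image (Content-Length 0).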
43 | ''' 44 | 45 | assert 'imgur.com/' in url, 'Error occurs when parsing url [{}]'.format(url) 46 | 47 | IMGUR_URL = 'https://i.imgur.com/{}.{}' 48 | IMGUR_JPG = 'https://i.imgur.com/{}.jpg' 49 | 50 | img_id = url.rsplit('/', 1)[1] 51 | res = requests.head(IMGUR_JPG.format(img_id)) 52 | 53 | if int(res.headers['Content-Length']) == 0: 54 | return None 55 | 56 | try: 57 | content_type = res.headers['Content-Type'] 58 | except Exception as e: 59 | print(IMGUR_JPG.format(img_id)) 60 | print(res.headers) 61 | raise e 62 | 63 | ext = content_type.split('/')[1] 64 | ext = 'jpg' if ext == 'jpeg' else ext 65 | 66 | return IMGUR_URL.format(img_id, ext) 67 | 68 | 69 | def get_gfycat(url): 70 | ''' 71 | Returns an mp4 url and a webm url of the given Gfycat url. 72 | ''' 73 | 74 | assert 'gfycat.com/' in url, 'Error occurs when parsing url [{}]'.format(url) 75 | 76 | GFYCAT_MP4 = 'https://giant.gfycat.com/{}.mp4' 77 | GFYCAT_WEBM = 'https://giant.gfycat.com/{}.webm' 78 | 79 | name = url.rsplit('/', 1)[1] 80 | 81 | return [GFYCAT_MP4.format(name), 82 | GFYCAT_WEBM.format(name)] 83 | 84 | def save_json(data, filename): 85 | os.makedirs(os.path.dirname(filename), exist_ok=True) 86 | json.dump(data, open(filename, 'w', encoding='utf-8')) 87 | 88 | def url_basename(url): 89 | filename = os.path.basename(url) 90 | if '?' in filename: 91 | filename = filename.split('?', 1)[0] 92 | return filename -------------------------------------------------------------------------------- /m_scraper/rq/utils/instagram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import re 8 | import json 9 | import time 10 | from bs4 import BeautifulSoup 11 | from .helpers import requests_get, url_basename 12 | 13 | query_hash = '42323d64886122307be10013ad2dcc44' # query shorcode pages 14 | # query_hash = '9ca88e465c3f866a76f7adee3871bdd8' # query `{"data":{"viewer":null,"user":null},"status":"ok"}` 15 | userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15' 16 | 17 | def parse_share_data(text): 18 | soup = BeautifulSoup(text, 'html.parser') 19 | for script in soup.find_all('script'): 20 | p = re.compile('window._sharedData = (.*?);$') # $ specifies the end of the string 21 | m = p.match(script.string) 22 | if m is not None: 23 | break 24 | shared_data = json.loads(m.groups()[0], encoding='utf-8') 25 | return shared_data 26 | 27 | def get_shared_data(username): 28 | headers = { 29 | 'User-Agent': userAgent 30 | } 31 | url = 'https://www.instagram.com/' + username 32 | shared_data = requests_get( 33 | url, 34 | fn=parse_share_data, 35 | headers=headers, 36 | verify=False 37 | ) 38 | return shared_data 39 | 40 | def get_x_instagram_gis(rhx_gis, url): 41 | import hashlib 42 | vals = rhx_gis + ':' + url.split('variables=')[1] 43 | m = hashlib.md5() 44 | m.update(vals.encode()) 45 | return m.hexdigest() 46 | 47 | def get_first_page(username): 48 | shared_data = get_shared_data(username) 49 | if shared_data is None: 50 | return None, None, None, None, None, None, None 51 | csrf_token = shared_data['config']['csrf_token'] 52 | rhx_gis = shared_data['rhx_gis'] 53 | 54 | user = shared_data['entry_data']['ProfilePage'][0]['graphql']['user'] 55 | is_private = user['is_private'] 56 | user_id = user['id'] 57 | profile_pic_url = user['profile_pic_url'] 
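# Note: profile_pic_url and profile_pic_url_hd are extracted for reference only;
# get_first_page() does not return them and Instagramer does not download them.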
58 | profile_pic_url_hd = user['profile_pic_url_hd'] 59 | 60 | edge_owner_to_timeline_media = user['edge_owner_to_timeline_media'] 61 | end_cursor = edge_owner_to_timeline_media['page_info']['end_cursor'] 62 | has_next_page = edge_owner_to_timeline_media['page_info']['has_next_page'] 63 | count = edge_owner_to_timeline_media['count'] 64 | edges = edge_owner_to_timeline_media['edges'] 65 | 66 | # print('Searching edges...') 67 | tasks = [] 68 | for edge in edges: 69 | node = edge['node'] 70 | typename = node['__typename'] # 'GraphImage', 'GraphVideo', 'GraphSidecar' 71 | node_id = node['id'] 72 | display_url = node['display_url'] 73 | shortcode = node['shortcode'] 74 | # print(shortcode, typename, display_url) 75 | tasks.extend(parse_node(retrieve_node_from_shortcode(shortcode))) 76 | 77 | return tasks, end_cursor, has_next_page, len(edges), user_id, rhx_gis, csrf_token 78 | 79 | def get_following_page(query_hash, user_id, after, rhx_gis, csrf_token): 80 | first = 12 81 | url = 'https://www.instagram.com/graphql/query/?query_hash={}&variables={{"id":"{}","first":{},"after":"{}"}}'.format(query_hash, user_id, first, after) 82 | headers = { 83 | 'User-Agent': userAgent, 84 | 'X-Instagram-GIS': get_x_instagram_gis(rhx_gis, url) 85 | } 86 | cookies = { 87 | 'csrf_token': csrf_token 88 | } 89 | edge_owner_to_timeline_media = requests_get( 90 | url, 91 | fn=lambda text: json.loads(text, encoding='utf-8')['data']['user']['edge_owner_to_timeline_media'], 92 | headers=headers, cookies=cookies, verify=False 93 | ) 94 | 95 | count = edge_owner_to_timeline_media['count'] 96 | end_cursor = edge_owner_to_timeline_media['page_info']['end_cursor'] 97 | has_next_page = edge_owner_to_timeline_media['page_info']['has_next_page'] 98 | edges = edge_owner_to_timeline_media['edges'] 99 | 100 | tasks = [] 101 | for edge in edges: 102 | node = edge['node'] 103 | typename = node['__typename'] # 'GraphImage', 'GraphVideo', 'GraphSidecar' 104 | node_id = node['id'] 105 | display_url = node['display_url'] 106 | shortcode = node['shortcode'] 107 | # print(shortcode, typename, display_url) 108 | tasks.extend(parse_node(retrieve_node_from_shortcode(shortcode))) 109 | 110 | return tasks, end_cursor, has_next_page, len(edges) 111 | 112 | def largest_image_url(resources): 113 | return max(resources, key=lambda x: x['config_height']*x['config_width'])['src'] 114 | 115 | def node_name(node): 116 | return '{}.{}'.format(node['id'], node['shortcode']) 117 | 118 | def parse_node(node, name=''): 119 | tasks = [] 120 | 121 | if name == '': 122 | name = node_name(node) 123 | else: 124 | name += '.' + node_name(node) 125 | 126 | # print(node.keys()) 127 | # print(node['__typename']) 128 | # print(node['display_url']) 129 | # print(node['thumbnail_resources']) 130 | 131 | display_resources = node['display_resources'] 132 | # find the highest resolution image 133 | url = largest_image_url(display_resources) 134 | url_filename = url_basename(url) 135 | # download(url, path=save_path, rename=name, replace=False) 136 | tasks.append((url, name + '.' 
+ url_filename.rsplit('.', 1)[1])) 137 | 138 | typename = node['__typename'] 139 | if typename == 'GraphImage': 140 | pass 141 | elif typename == 'GraphSidecar': 142 | edges = node['edge_sidecar_to_children']['edges'] 143 | for edge in edges: 144 | # parse_node(edge['node'], name, save_path) 145 | tasks += parse_node(edge['node'], name) 146 | elif typename == 'GraphVideo': 147 | url = node['video_url'] 148 | url_filename = url_basename(url) 149 | # download(url, path=save_path, rename=name, replace=False) 150 | tasks.append((url, name + '.' + url_filename.rsplit('.', 1)[1])) 151 | else: 152 | print('Error: unsupported typename "{}".'.format(typename)) 153 | 154 | return tasks 155 | 156 | # Go into the page for a certain post and get the node information 157 | def retrieve_node_from_shortcode(shortcode): 158 | def _fn(text): 159 | try: 160 | data = json.loads(text, encoding='utf-8')['graphql']['shortcode_media'] 161 | except Exception as e: 162 | data = parse_share_data(text)['entry_data']['PostPage'][0]['graphql']['shortcode_media'] 163 | return data 164 | url = 'https://www.instagram.com/p/{}/?__a=1'.format(shortcode) 165 | headers = { 166 | 'User-Agent': userAgent 167 | } 168 | node = requests_get( 169 | url, 170 | fn=_fn, 171 | headers=headers, 172 | verify=False 173 | ) 174 | return node -------------------------------------------------------------------------------- /mediascraper/general.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import mediascrapers 8 | import sys 9 | 10 | if __name__ == '__main__': 11 | scraper = mediascrapers.MediaScraper(mode='normal', debug=False) 12 | for url in sys.argv[1:]: 13 | tasks = scraper.scrape(url) 14 | scraper.download(tasks=tasks, path='download/general') 15 | -------------------------------------------------------------------------------- /mediascraper/instagram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import mediascrapers 8 | import os 9 | import sys 10 | 11 | if __name__ == '__main__': 12 | scraper = mediascrapers.InstagramScraper(scroll_pause = 1.0, mode='normal', debug=False) 13 | if os.path.exists('credentials.json'): 14 | scraper.login('credentials.json') 15 | for username in sys.argv[1:]: 16 | tasks = scraper.scrape(username) 17 | scraper.download(tasks=tasks, path='download/instagram') -------------------------------------------------------------------------------- /mediascraper/interactive/instagram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import mediascrapers 8 | import os 9 | import sys 10 | 11 | def input_username(): 12 | return input('Enter a username (ENTER to exit): ').strip() 13 | 14 | if __name__ == '__main__': 15 | print('Starting InstagramScraper...') 16 | scraper = mediascrapers.InstagramScraper(scroll_pause = 1.0, mode='normal', debug=False) 17 | if os.path.exists('credentials.json'): 18 | scraper.login('credentials.json') 19 | username = 
input_username() 20 | while username != '': 21 | tasks = scraper.scrape(username) 22 | scraper.download(tasks=tasks, path='download/instagram') 23 | username = input_username() -------------------------------------------------------------------------------- /mediascraper/interactive/twitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import mediascrapers 8 | import os 9 | import sys 10 | 11 | def input_username(): 12 | return input('Enter a username (ENTER to exit): ').strip() 13 | 14 | if __name__ == '__main__': 15 | print('Starting TwitterScraper...') 16 | scraper = mediascrapers.TwitterScraper(scroll_pause = 1.0, mode='normal', debug=False) 17 | if os.path.exists('credentials.json'): 18 | scraper.login('credentials.json') 19 | username = input_username() 20 | while username != '': 21 | if username[0] == '@': 22 | username = username[1:] 23 | tasks = scraper.scrape(username) 24 | scraper.download(tasks=tasks, path='download/twitter') 25 | username = input_username() -------------------------------------------------------------------------------- /mediascraper/twitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import mediascrapers 8 | import os 9 | import sys 10 | 11 | if __name__ == '__main__': 12 | scraper = mediascrapers.TwitterScraper(scroll_pause = 1.0, mode='normal', debug=False) 13 | if os.path.exists('credentials.json'): 14 | scraper.login('credentials.json') 15 | for username in sys.argv[1:]: 16 | if username[0] == '@': 17 | username = username[1:] 18 | tasks = scraper.scrape(username) 19 | scraper.download(tasks=tasks, path='download/twitter') -------------------------------------------------------------------------------- /mediascrapers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import json 8 | import os 9 | import sys 10 | import time 11 | import re 12 | from abc import ABCMeta, abstractmethod 13 | from bs4 import BeautifulSoup as bs 14 | from selenium import webdriver 15 | from tqdm import tqdm 16 | from util import seleniumdriver 17 | from util.file import get_basename, get_extension, rename_file, safe_makedirs 18 | from util.instagram import parse_node 19 | from util.twitter import get_twitter_video_url 20 | from util.url import get_filename, complete_url, download, is_media 21 | 22 | class Scraper(metaclass=ABCMeta): 23 | 24 | def __init__(self, driver='phantomjs', scroll_pause=1.0, next_page_pause=1.0, mode='normal', debug=False): 25 | self._scroll_pause_time = scroll_pause 26 | self._next_page_pause_time = next_page_pause 27 | self._login_pause_time = 5.0 28 | self._mode = mode 29 | self._debug = debug 30 | self._name = 'scraper' 31 | 32 | if self._debug: 33 | driver = 'chrome' 34 | 35 | if driver == 'phantomjs': 36 | if self._mode != 'silent': 37 | print('Starting PhantomJS web driver...') 38 | self._driver = seleniumdriver.get('PhantomJS') 39 | elif driver == 'chrome': 40 | if self._mode == 
'verbose': 41 | print('Starting Chrome web driver...') 42 | self._driver = seleniumdriver.get('Chrome') 43 | else: 44 | raise Exception('Driver not found "{}".'.format(driver)) 45 | 46 | # self._driver.set_window_size(1920, 1080) 47 | 48 | def _connect(self, url): 49 | if self._debug: 50 | print('Connecting to "{}"...'.format(url)) 51 | self._driver.get(url) 52 | 53 | def source(self): 54 | return self._driver.page_source 55 | 56 | def print(self): 57 | print(self.source()) 58 | 59 | def save(self, file): 60 | with open(file, 'wb') as f: 61 | f.write(self.source().encode('utf-8')) 62 | print('Saved web page to {}.'.format(file)) 63 | 64 | def load_credentials(self, credentials_file): 65 | assert os.path.exists(credentials_file), 'Error: Credentials file "{}" does not exist.'.format(credentials_file) 66 | 67 | with open(credentials_file, 'r') as f: 68 | credentials = json.loads(f.read()) 69 | 70 | assert self._name in credentials, 'Error: "{}" does not support credentials.'.format(self._name) 71 | 72 | credentials = credentials[self._name] 73 | if self._mode == 'verbose': 74 | user = credentials.keys().remove('password') 75 | print('Logging in as "{}"...'.format(credentials[user])) 76 | return credentials 77 | 78 | def find_element_by_class_name(self, class_name): 79 | try: 80 | element = self._driver.find_element_by_class_name(class_name) 81 | return element 82 | except: 83 | return None 84 | 85 | def scrollToBottom(self, fn=None, times=-1): 86 | if times < 0: times = sys.maxsize 87 | last_height, new_height = self._driver.execute_script("return document.body.scrollHeight"), 0 88 | counter = 0 89 | while (new_height != last_height or fn is not None and fn()) and counter < times: 90 | self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight)") 91 | time.sleep(self._scroll_pause_time) 92 | last_height = new_height 93 | new_height = self._driver.execute_script("return document.body.scrollHeight") 94 | counter += 1 95 | return not (new_height != last_height or fn is not None and fn()) 96 | 97 | @abstractmethod 98 | def scrape(self): 99 | return None 100 | 101 | def download(self, tasks, path='.', force=False): 102 | if self._mode != 'silent': 103 | print('Downloading...') 104 | for url, folder, rename in tqdm(tasks): 105 | target_path = path 106 | if folder is not None: 107 | target_path = os.path.join(target_path, folder) 108 | download(url, path=target_path, rename=rename, replace=force) 109 | 110 | @abstractmethod 111 | def login(self): 112 | pass 113 | 114 | class MediaScraper(Scraper): 115 | 116 | def __init__(self, **kwargs): 117 | super().__init__(**kwargs) 118 | self._name = 'general' 119 | # self.abs_url_regex = '([a-z0-9]*:|.{0})\/\/[^"\s]+' 120 | # self.rel_url_regex = '\"[^\/]+\/[^\/].*$|^\/[^\/].*\"' 121 | # self.abs_url_regex = '/^([a-z0-9]*:|.{0})\/\/.*$/gmi' 122 | # self.rel_url_regex = '/^[^\/]+\/[^\/].*$|^\/[^\/].*$/gmi' 123 | 124 | def scrape(self, url): 125 | self._connect(url) 126 | self.scrollToBottom() 127 | 128 | if self._debug: 129 | self.save('test.html') 130 | 131 | source = self.source() 132 | 133 | # Parse links, images, and videos successively by BeautifulSoup parser. 
134 | 135 | media_urls = [] 136 | soup = bs(source, 'html.parser') 137 | title = soup.find('title').text 138 | for link in soup.find_all('a', href=True): 139 | if is_media(link['href']): 140 | media_urls.append(link['href']) 141 | if is_media(link.text): 142 | media_urls.append(link.text) 143 | for image in soup.find_all('img', src=True): 144 | if is_media(image['src']): 145 | media_urls.append(image['src']) 146 | for video in soup.find_all('video', src=True): 147 | if is_media(video['src']): 148 | media_urls.append(video['src']) 149 | 150 | if self._debug: 151 | print(media_urls) 152 | 153 | tasks = [(complete_url(media_url, self._driver.current_url), title, None) for media_url in media_urls] 154 | 155 | if self._debug: 156 | print(tasks) 157 | 158 | if self._mode != 'silent': 159 | print('{} media are found.'.format(len(media_urls))) 160 | 161 | return tasks 162 | 163 | 164 | # # Parse links, images, and videos successively by native regex matching. 165 | 166 | # urls = re.findall('http', source) 167 | # print('test urls:') 168 | # for url in urls: 169 | # print(url) 170 | 171 | # urls = re.findall(self.abs_url_regex, source) 172 | # print('abs urls:') 173 | # for url in urls: 174 | # print(url) 175 | 176 | # urls = re.findall(self.rel_url_regex, source) 177 | # print('rel urls:') 178 | # for url in urls: 179 | # print(url) 180 | 181 | def login(self, credentials_file): 182 | pass 183 | 184 | 185 | class InstagramScraper(Scraper): 186 | 187 | # NOTES: 188 | # 1. Naming rule of Instagram username: 189 | # (1) letters (a-zA-Z) 190 | # (2) digits (0-9) 191 | # (3) underline (_) 192 | # (4) dot (.) 193 | # 2. Shortcode: 194 | # not necessarily is a string of 11 characters 195 | # maybe a string of 38 (on private account) 196 | # 3. In a page, there are at most 30 rows of posts. 
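# (Illustrative only: a username following rule 1 would match the regex r'^[A-Za-z0-9._]+$'.)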
197 | 198 | def __init__(self, **kwargs): 199 | super().__init__(**kwargs) 200 | self._name = 'instagram' 201 | 202 | self.login_url = 'https://www.instagram.com/accounts/login/' 203 | self.json_data_url = 'https://www.instagram.com/{}/?__a=1' 204 | self.json_data_url_with_max_id = 'https://www.instagram.com/{}/?__a=1&max_id={}' 205 | self.new_json_data_url = 'https://www.instagram.com/graphql/query/?query_hash={}&variables={{"id":"{}","first":{},"after":"{}"}}' 206 | self.query_parameters = { 207 | 'query_hash': '472f257a40c653c64c666ce877d59d2b', 208 | 'first': 12 209 | } 210 | self.post_regex = '\/p\/[^\/]+\/' 211 | 212 | # def getJsonData(self, target, max_id=None): 213 | # if max_id is None: 214 | # self._connect(self.json_data_url.format(target)) 215 | # else: 216 | # self._connect(self.json_data_url_with_max_id.format(target, max_id)) 217 | # content = self._driver.find_element_by_tag_name('pre').text 218 | # data = json.loads(content) 219 | # return data 220 | 221 | def getJsonData(self, user_or_id, after=None): 222 | if after is None: # user_or_id should be username 223 | self._connect(self.json_data_url.format(user_or_id)) 224 | else: # user_or_id should be id 225 | self._connect(self.new_json_data_url.format( 226 | self.query_parameters['query_hash'], user_or_id, self.query_parameters['first'], after)) 227 | content = self._driver.find_element_by_tag_name('pre').text 228 | data = json.loads(content) 229 | return data 230 | 231 | def sharedData(self): 232 | return self._driver.execute_script("return window._sharedData") 233 | 234 | def scrape(self, username): 235 | if self._mode != 'silent': 236 | print('Crawling...') 237 | 238 | data = self.getJsonData(username) 239 | 240 | # user = data['user'] 241 | # media = user['media'] 242 | # nodes = media['nodes'] 243 | 244 | user = data['graphql']['user'] 245 | user_id = user['id'] 246 | media = user['edge_owner_to_timeline_media'] 247 | count = media['count'] 248 | # print('Count: {}'.format(count)) 249 | page_info = media['page_info'] 250 | edges = media['edges'] 251 | has_next_page = page_info['has_next_page'] 252 | end_cursor = page_info['end_cursor'] 253 | 254 | tasks = [] 255 | num_post = 0 256 | while len(edges) > 0: 257 | num_post += len(edges) 258 | for edge in edges: 259 | # post = self.getJsonData('p/'+node['code']) 260 | post = self.getJsonData('p/'+edge['node']['shortcode']) 261 | task = parse_node(post['graphql']['shortcode_media']) 262 | tasks += (task[0], username, task[1]) 263 | # nodes = data['user']['media']['nodes'] 264 | if has_next_page: 265 | # data = self.getJsonData(username, edges[-1]['node']['id']) 266 | data = self.getJsonData(user_id, end_cursor) 267 | try: 268 | edges = data['data']['user']['edge_owner_to_timeline_media']['edges'] 269 | except Exception as e: 270 | print(data) 271 | print(e) 272 | has_next_page = data['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page'] 273 | end_cursor = data['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor'] 274 | else: 275 | break 276 | 277 | if self._mode != 'silent': 278 | print('{} posts are found.'.format(num_post)) 279 | 280 | if self._mode != 'silent': 281 | print('{} media are found.'.format(len(tasks))) 282 | 283 | return tasks 284 | 285 | def scrapePage(self, username): 286 | self._connect('{}/{}/'.format(self.base_url, username)) 287 | 288 | if self._mode != 'silent': 289 | print('Crawling...') 290 | done = False 291 | codes = re.findall(self.post_regex, self.source()) 292 | while not done: 293 | done = 
self.scrollToBottom(fn=lambda: self.find_element_by_class_name('_o5uzb'), times=2) 294 | codes += re.findall(self.post_regex, self.source()) 295 | codes = list(set(codes)) 296 | codes = [code[3:-1] for code in codes] 297 | 298 | if self._mode != 'silent': 299 | print('{} posts are found.'.format(len(codes))) 300 | 301 | if self._debug: 302 | self.save('test.html') 303 | with open('shortcodes.txt', 'w') as f: 304 | f.write(json.dumps(codes)) 305 | 306 | if self._mode != 'silent': 307 | print('Scraping...') 308 | 309 | tasks = [] 310 | for code in tqdm(codes): 311 | self._connect('{}/p/{}/'.format(self.base_url, code)) 312 | data = self.sharedData() 313 | node = data['entry_data']['PostPage'][0]['graphql']['shortcode_media'] 314 | tasks += parse_node(node, node['owner']['username']) 315 | 316 | if self._mode != 'silent': 317 | print('{} media are found.'.format(len(tasks))) 318 | 319 | return tasks 320 | 321 | def scrapeSharedData(self): 322 | sharedData = self.sharedData() 323 | profilePage = sharedData['entry_data']['ProfilePage'] 324 | print('# of profilePage: {}.'.format(len(profilePage))) 325 | user = profilePage[0]['user'] 326 | print('# of following: {}.'.format(user['follows']['count'])) 327 | print('Url of profile picture: {}.'.format(user['profile_pic_url_hd'])) 328 | print('Full name: {}.'.format(user['full_name'])) 329 | print('# of followers: {}.'.format(user['followed_by'])) 330 | # user['media_collections'] 331 | media = user['media'] 332 | print('# of media: {}.'.format(media['count'])) 333 | nodes = media['nodes'] 334 | target = [] 335 | for node in nodes: 336 | # node['date'] 337 | # node['comments']['count'] 338 | # node['is_video'] 339 | # node['id'] 340 | # node['__typename'] 341 | target.append(node['code']) 342 | # node['likes']['count'] 343 | # node['caption'] 344 | # user['is_private'] 345 | # user['username'] 346 | 347 | with open('json.txt', 'w') as f: 348 | f.write(json.dumps(sharedData)) 349 | 350 | with open('ids_shared_data.txt', 'w') as f: 351 | f.write(json.dumps(target)) 352 | 353 | def login(self, credentials_file): 354 | credentials = self.load_credentials(credentials_file) 355 | return 356 | 357 | if credentials['username'] == '' or credentials['password'] == '': 358 | print('Either username or password is empty. 
Abort login.') 359 | 360 | if self._mode != 'silent': 361 | print('Logging in as "{}"...'.format(credentials['username'])) 362 | 363 | self._connect(self.login_url) 364 | time.sleep(self._login_pause_time) 365 | 366 | username, password = self._driver.find_elements_by_tag_name('input') 367 | button = self._driver.find_element_by_tag_name('button') 368 | 369 | username.send_keys(credentials['username']) 370 | password.send_keys(credentials['password']) 371 | button.click() 372 | time.sleep(self._login_pause_time) 373 | 374 | 375 | class TwitterScraper(Scraper): 376 | 377 | def __init__(self, **kwargs): 378 | super().__init__(**kwargs) 379 | self._name = 'twitter' 380 | 381 | self.base_url = 'https://twitter.com' 382 | self.login_url = 'https://twitter.com/login' 383 | # self.post_regex = '/p/[ -~]{11}/' 384 | self.scroll_pause = 3.0 385 | 386 | def scrape(self, username): 387 | self._connect('{}/{}/media'.format(self.base_url, username)) 388 | 389 | if self._mode != 'silent': 390 | print('Crawling...') 391 | 392 | done = self.scrollToBottom() 393 | 394 | source = self.source() 395 | soup = bs(source, 'html.parser') 396 | 397 | # title = soup.find('title') 398 | # name = title.get_text().replace('Media Tweets by ', '').replace(' | Twitter', '') 399 | 400 | # avatar_url = soup.find("a", { "class" : "ProfileCardMini-avatar" }).get('data-resolved-url-large') 401 | # background_url = soup.find("div", { "class" : "ProfileCanopy-headerBg" }).find('img').get('src') 402 | 403 | tasks = [] 404 | for li in soup.find_all('li', {'class': 'js-stream-item stream-item stream-item '}): 405 | photos = li.find_all('div', { "class" : "AdaptiveMedia-photoContainer" }) 406 | if photos == []: 407 | try: 408 | img_url, vid_url = get_twitter_video_url(li['data-item-id']) 409 | tasks.append((img_url+':large', username, get_basename(get_filename(img_url)))) 410 | tasks.append((vid_url, username, get_basename(get_filename(vid_url)))) 411 | except Exception as e: 412 | with open('error.txt', 'w', encoding='utf-8') as f: 413 | f.write(str(e) + '\n') 414 | f.write(str(li)) 415 | else: 416 | for photo in photos: 417 | url = photo['data-image-url'] 418 | tasks.append((url+':large', username, get_basename(get_filename(url)))) 419 | for div in soup.find_all('div', { "class" : "AdaptiveMedia-photoContainer" }): 420 | url = div['data-image-url'] 421 | tasks.append((url+':large', username, get_basename(get_filename(url)))) 422 | 423 | if self._mode != 'silent': 424 | print('{} media are found.'.format(len(tasks))) 425 | 426 | return tasks 427 | 428 | def login(self, credentials_file): 429 | credentials = self.load_credentials(credentials_file) 430 | 431 | if credentials['username'] == '' or credentials['password'] == '': 432 | print('Either username or password is empty. 
Abort login.') 433 | return 434 | 435 | if self._mode != 'silent': 436 | print('Logging in as "{}"...'.format(credentials['username'])) 437 | 438 | self._connect(self.login_url) 439 | time.sleep(self._login_pause_time) 440 | 441 | usernames = self._driver.find_elements_by_name('session[username_or_email]') 442 | passwords = self._driver.find_elements_by_name('session[password]') 443 | buttons = self._driver.find_elements_by_tag_name('button') 444 | username = [u for u in usernames if u.get_attribute('class') == 'js-username-field email-input js-initial-focus'][0] 445 | password = [p for p in passwords if p.get_attribute('class') == 'js-password-field'][0] 446 | button = [b for b in buttons if b.text != ''][0] 447 | self._driver.save_screenshot('test.png') 448 | self._driver.implicitly_wait(10) 449 | 450 | username.send_keys(credentials['username']) 451 | password.send_keys(credentials['password']) 452 | button.click() 453 | time.sleep(self._login_pause_time) 454 | 455 | 456 | class FacebookScraper(Scraper): 457 | 458 | def __init__(self, **kwargs): 459 | super().__init__(**kwargs) 460 | self._name = 'facebook' 461 | 462 | self.base_url = 'https://www.facebook.com' 463 | self.login_url = 'https://www.facebook.com/login' 464 | # self.post_regex = '/p/[ -~]{11}/' 465 | 466 | def scrape(self, username): 467 | self._connect('{}/{}/media'.format(self.base_url, username)) 468 | 469 | if self._mode != 'silent': 470 | print('Crawling...') 471 | 472 | done = self.scrollToBottom() 473 | 474 | source = self.source() 475 | soup = bs(source, 'html.parser') 476 | 477 | # title = soup.find('title') 478 | # name = title.get_text().replace('Media Tweets by ', '').replace(' | Twitter', '') 479 | 480 | # avatar_url = soup.find("a", { "class" : "ProfileCardMini-avatar" }).get('data-resolved-url-large') 481 | # background_url = soup.find("div", { "class" : "ProfileCanopy-headerBg" }).find('img').get('src') 482 | 483 | tasks = [] 484 | for div in soup.find_all('div', { "class" : "AdaptiveMedia-photoContainer" }): 485 | url = div.get('data-image-url') 486 | tasks.append((url+':large', username, get_filename(url))) 487 | 488 | if self._mode != 'silent': 489 | print('{} media are found.'.format(len(tasks))) 490 | 491 | return tasks 492 | 493 | def login(self, credentials_file): 494 | credentials = self.load_credentials(credentials_file) 495 | 496 | if credentials['email'] == '' or credentials['password'] == '': 497 | print('Either email or password is empty. 
Abort login.') 498 | return 499 | 500 | if self._mode != 'silent': 501 | print('Logging in as "{}"...'.format(credentials['email'])) 502 | 503 | self._connect(self.login_url) 504 | time.sleep(self._login_pause_time) 505 | 506 | email = self._driver.find_element_by_tag_name('email') 507 | password = self._driver.find_element_by_tag_name('pass') 508 | buttons = self._driver.find_element_by_tag_name('login') 509 | 510 | username.send_keys(credentials['email']) 511 | password.send_keys(credentials['password']) 512 | button.click() 513 | time.sleep(self._login_pause_time) 514 | 515 | 516 | class pixivScraper(Scraper): 517 | 518 | def __init__(self, **kwargs): 519 | super().__init__(**kwargs) 520 | self._name = 'pixiv' 521 | 522 | self.base_url = 'https://www.pixiv.net' 523 | self.post_url = 'https://www.pixiv.net/member_illust.php?id=' 524 | self.login_url = 'https://accounts.pixiv.net/login' 525 | # self.post_regex = '/p/[ -~]{11}/' 526 | 527 | def scrape(self, id, content_type='all'): 528 | self._connect(self.post_url + id) 529 | self.id = id 530 | self.type = content_type 531 | 532 | if self._mode != 'silent': 533 | print('Crawling...') 534 | 535 | # TODO 536 | 537 | # get page num 538 | pager_container = self._driver.get_element_by_class_name('page-list') 539 | last_pager = pager_container.get_element_by_tag_name('li')[-1] 540 | num_page = int(last_pager.get_element_by_tag_name('a').text) 541 | print('# of page: {}'.format(num_page)) 542 | 543 | # crawl each page 544 | for p in range(1, num_page+1): 545 | url = 'https://www.pixiv.net/member_illust.php?id={}&type={}&p={}'.format(self.id, self.type, p) 546 | self._driver._connect(url) 547 | time.sleep(self._next_page_pause_time) 548 | print(url) 549 | # scrape each post 550 | 551 | return 552 | 553 | done = self.scrollToBottom() 554 | 555 | source = self.source() 556 | soup = bs(source, 'html.parser') 557 | 558 | # title = soup.find('title') 559 | # name = title.get_text().replace('Media Tweets by ', '').replace(' | Twitter', '') 560 | 561 | # avatar_url = soup.find("a", { "class" : "ProfileCardMini-avatar" }).get('data-resolved-url-large') 562 | # background_url = soup.find("div", { "class" : "ProfileCanopy-headerBg" }).find('img').get('src') 563 | 564 | tasks = [] 565 | for div in soup.find_all('div', { "class" : "AdaptiveMedia-photoContainer" }): 566 | url = div.get('data-image-url') 567 | tasks.append((url+':large', get_filename(url))) 568 | 569 | if self._mode != 'silent': 570 | print('{} media are found.'.format(len(tasks))) 571 | 572 | return tasks 573 | 574 | def login(self, credentials_file): 575 | credentials = self.load_credentials(credentials_file) 576 | 577 | if credentials['username'] == '' or credentials['password'] == '': 578 | print('Either username or password is empty. 
Abort login.') 579 | return 580 | 581 | if self._mode != 'silent': 582 | print('Logging in as "{}"...'.format(credentials['username'])) 583 | 584 | self._connect(self.login_url) 585 | time.sleep(self._login_pause_time) 586 | 587 | container = self._driver.find_element_by_id('container-login') 588 | 589 | username, password = container.find_elements_by_tag_name('input') 590 | buttons = container.find_element_by_tag_name('button') 591 | 592 | username.send_keys(credentials['username']) 593 | password.send_keys(credentials['password']) 594 | button.click() 595 | time.sleep(self._login_pause_time) 596 | -------------------------------------------------------------------------------- /mediatypes/README.md: -------------------------------------------------------------------------------- 1 | # Media Type 2 | 3 | The tables of `audio.csv`, `image.csv` and `video.csv` are retrieved 4 | [here](https://www.iana.org/assignments/media-types/media-types.xhtml). -------------------------------------------------------------------------------- /mediatypes/audio.csv: -------------------------------------------------------------------------------- 1 | Name,Template,Reference 2 | 1d-interleaved-parityfec,audio/1d-interleaved-parityfec,[RFC6015] 3 | 32kadpcm,audio/32kadpcm,[RFC3802][RFC2421] 4 | 3gpp,audio/3gpp,[RFC3839][RFC6381] 5 | 3gpp2,audio/3gpp2,[RFC4393][RFC6381] 6 | ac3,audio/ac3,[RFC4184] 7 | AMR,audio/AMR,[RFC4867] 8 | AMR-WB,audio/AMR-WB,[RFC4867] 9 | amr-wb+,audio/amr-wb+,[RFC4352] 10 | aptx,audio/aptx,[RFC7310] 11 | asc,audio/asc,[RFC6295] 12 | ATRAC-ADVANCED-LOSSLESS,audio/ATRAC-ADVANCED-LOSSLESS,[RFC5584] 13 | ATRAC-X,audio/ATRAC-X,[RFC5584] 14 | ATRAC3,audio/ATRAC3,[RFC5584] 15 | basic,audio/basic,[RFC2045][RFC2046] 16 | BV16,audio/BV16,[RFC4298] 17 | BV32,audio/BV32,[RFC4298] 18 | clearmode,audio/clearmode,[RFC4040] 19 | CN,audio/CN,[RFC3389] 20 | DAT12,audio/DAT12,[RFC3190] 21 | dls,audio/dls,[RFC4613] 22 | dsr-es201108,audio/dsr-es201108,[RFC3557] 23 | dsr-es202050,audio/dsr-es202050,[RFC4060] 24 | dsr-es202211,audio/dsr-es202211,[RFC4060] 25 | dsr-es202212,audio/dsr-es202212,[RFC4060] 26 | DV,audio/DV,[RFC6469] 27 | DVI4,audio/DVI4,[RFC4856] 28 | eac3,audio/eac3,[RFC4598] 29 | encaprtp,audio/encaprtp,[RFC6849] 30 | EVRC,audio/EVRC,[RFC4788] 31 | EVRC-QCP,audio/EVRC-QCP,[RFC3625] 32 | EVRC0,audio/EVRC0,[RFC4788] 33 | EVRC1,audio/EVRC1,[RFC4788] 34 | EVRCB,audio/EVRCB,[RFC5188] 35 | EVRCB0,audio/EVRCB0,[RFC5188] 36 | EVRCB1,audio/EVRCB1,[RFC4788] 37 | EVRCNW,audio/EVRCNW,[RFC6884] 38 | EVRCNW0,audio/EVRCNW0,[RFC6884] 39 | EVRCNW1,audio/EVRCNW1,[RFC6884] 40 | EVRCWB,audio/EVRCWB,[RFC5188] 41 | EVRCWB0,audio/EVRCWB0,[RFC5188] 42 | EVRCWB1,audio/EVRCWB1,[RFC5188] 43 | EVS,audio/EVS,[_3GPP][Kyunghun_Jung] 44 | example,audio/example,[RFC4735] 45 | fwdred,audio/fwdred,[RFC6354] 46 | G711-0,audio/G711-0,[RFC7655] 47 | G719,audio/G719,[RFC5404][RFC Errata 3245] 48 | G7221,audio/G7221,[RFC5577] 49 | G722,audio/G722,[RFC4856] 50 | G723,audio/G723,[RFC4856] 51 | G726-16,audio/G726-16,[RFC4856] 52 | G726-24,audio/G726-24,[RFC4856] 53 | G726-32,audio/G726-32,[RFC4856] 54 | G726-40,audio/G726-40,[RFC4856] 55 | G728,audio/G728,[RFC4856] 56 | G729,audio/G729,[RFC4856] 57 | G7291,,[RFC4749][RFC5459] 58 | G729D,audio/G729D,[RFC4856] 59 | G729E,audio/G729E,[RFC4856] 60 | GSM,audio/GSM,[RFC4856] 61 | GSM-EFR,audio/GSM-EFR,[RFC4856] 62 | GSM-HR-08,audio/GSM-HR-08,[RFC5993] 63 | iLBC,audio/iLBC,[RFC3952] 64 | ip-mr_v2.5,audio/ip-mr_v2.5,[RFC6262] 65 | L8,audio/L8,[RFC4856] 66 | L16,audio/L16,[RFC4856] 67 | 
L20,audio/L20,[RFC3190] 68 | L24,audio/L24,[RFC3190] 69 | LPC,audio/LPC,[RFC4856] 70 | MELP,audio/MELP,[RFC8130] 71 | MELP600,audio/MELP600,[RFC8130] 72 | MELP1200,audio/MELP1200,[RFC8130] 73 | MELP2400,audio/MELP2400,[RFC8130] 74 | mobile-xmf,audio/mobile-xmf,[RFC4723] 75 | MPA,audio/MPA,[RFC3555] 76 | mp4,audio/mp4,[RFC4337][RFC6381] 77 | MP4A-LATM,audio/MP4A-LATM,[RFC6416] 78 | mpa-robust,audio/mpa-robust,[RFC5219] 79 | mpeg,audio/mpeg,[RFC3003] 80 | mpeg4-generic,audio/mpeg4-generic,[RFC3640][RFC5691][RFC6295] 81 | ogg,audio/ogg,[RFC5334][RFC7845] 82 | opus,audio/opus,[RFC7587] 83 | parityfec,,[RFC5109] 84 | PCMA,audio/PCMA,[RFC4856] 85 | PCMA-WB,audio/PCMA-WB,[RFC5391] 86 | PCMU,audio/PCMU,[RFC4856] 87 | PCMU-WB,audio/PCMU-WB,[RFC5391] 88 | prs.sid,audio/prs.sid,[Linus_Walleij] 89 | QCELP,,[RFC3555][RFC3625] 90 | raptorfec,audio/raptorfec,[RFC6682] 91 | RED,audio/RED,[RFC3555] 92 | rtp-enc-aescm128,audio/rtp-enc-aescm128,[_3GPP] 93 | rtploopback,audio/rtploopback,[RFC6849] 94 | rtp-midi,audio/rtp-midi,[RFC6295] 95 | rtx,audio/rtx,[RFC4588] 96 | SMV,audio/SMV,[RFC3558] 97 | SMV0,audio/SMV0,[RFC3558] 98 | SMV-QCP,audio/SMV-QCP,[RFC3625] 99 | sp-midi,audio/sp-midi,[Timo_Kosonen][Tom_White] 100 | speex,audio/speex,[RFC5574] 101 | t140c,audio/t140c,[RFC4351] 102 | t38,audio/t38,[RFC4612] 103 | telephone-event,audio/telephone-event,[RFC4733] 104 | tone,audio/tone,[RFC4733] 105 | UEMCLIP,audio/UEMCLIP,[RFC5686] 106 | ulpfec,audio/ulpfec,[RFC5109] 107 | VDVI,audio/VDVI,[RFC4856] 108 | VMR-WB,audio/VMR-WB,[RFC4348][RFC4424] 109 | vnd.3gpp.iufp,audio/vnd.3gpp.iufp,[Thomas_Belling] 110 | vnd.4SB,audio/vnd.4SB,[Serge_De_Jaham] 111 | vnd.audiokoz,audio/vnd.audiokoz,[Vicki_DeBarros] 112 | vnd.CELP,audio/vnd.CELP,[Serge_De_Jaham] 113 | vnd.cisco.nse,audio/vnd.cisco.nse,[Rajesh_Kumar] 114 | vnd.cmles.radio-events,audio/vnd.cmles.radio-events,[Jean-Philippe_Goulet] 115 | vnd.cns.anp1,audio/vnd.cns.anp1,[Ann_McLaughlin] 116 | vnd.cns.inf1,audio/vnd.cns.inf1,[Ann_McLaughlin] 117 | vnd.dece.audio,audio/vnd.dece.audio,[Michael_A_Dolan] 118 | vnd.digital-winds,audio/vnd.digital-winds,[Armands_Strazds] 119 | vnd.dlna.adts,audio/vnd.dlna.adts,[Edwin_Heredia] 120 | vnd.dolby.heaac.1,audio/vnd.dolby.heaac.1,[Steve_Hattersley] 121 | vnd.dolby.heaac.2,audio/vnd.dolby.heaac.2,[Steve_Hattersley] 122 | vnd.dolby.mlp,audio/vnd.dolby.mlp,[Mike_Ward] 123 | vnd.dolby.mps,audio/vnd.dolby.mps,[Steve_Hattersley] 124 | vnd.dolby.pl2,audio/vnd.dolby.pl2,[Steve_Hattersley] 125 | vnd.dolby.pl2x,audio/vnd.dolby.pl2x,[Steve_Hattersley] 126 | vnd.dolby.pl2z,audio/vnd.dolby.pl2z,[Steve_Hattersley] 127 | vnd.dolby.pulse.1,audio/vnd.dolby.pulse.1,[Steve_Hattersley] 128 | vnd.dra,audio/vnd.dra,[Jiang_Tian] 129 | vnd.dts,audio/vnd.dts,[William_Zou] 130 | vnd.dts.hd,audio/vnd.dts.hd,[William_Zou] 131 | vnd.dvb.file,audio/vnd.dvb.file,[Peter_Siebert] 132 | vnd.everad.plj,audio/vnd.everad.plj,[Shay_Cicelsky] 133 | vnd.hns.audio,audio/vnd.hns.audio,[Swaminathan] 134 | vnd.lucent.voice,audio/vnd.lucent.voice,[Greg_Vaudreuil] 135 | vnd.ms-playready.media.pya,audio/vnd.ms-playready.media.pya,[Steve_DiAcetis] 136 | vnd.nokia.mobile-xmf,audio/vnd.nokia.mobile-xmf,[Nokia] 137 | vnd.nortel.vbk,audio/vnd.nortel.vbk,[Glenn_Parsons] 138 | vnd.nuera.ecelp4800,audio/vnd.nuera.ecelp4800,[Michael_Fox] 139 | vnd.nuera.ecelp7470,audio/vnd.nuera.ecelp7470,[Michael_Fox] 140 | vnd.nuera.ecelp9600,audio/vnd.nuera.ecelp9600,[Michael_Fox] 141 | vnd.octel.sbc,audio/vnd.octel.sbc,[Greg_Vaudreuil] 142 | 
vnd.presonus.multitrack,audio/vnd.presonus.multitrack,[Matthias_Juwan] 143 | vnd.qcelp - DEPRECATED in favor of audio/qcelp,audio/vnd.qcelp,[RFC3625] 144 | vnd.rhetorex.32kadpcm,audio/vnd.rhetorex.32kadpcm,[Greg_Vaudreuil] 145 | vnd.rip,audio/vnd.rip,[Martin_Dawe] 146 | vnd.sealedmedia.softseal.mpeg,audio/vnd.sealedmedia.softseal-mpeg,[David_Petersen] 147 | vnd.vmx.cvsd,audio/vnd.vmx.cvsd,[Greg_Vaudreuil] 148 | vorbis,audio/vorbis,[RFC5215] 149 | vorbis-config,audio/vorbis-config,[RFC5215] 150 | -------------------------------------------------------------------------------- /mediatypes/image.csv: -------------------------------------------------------------------------------- 1 | Name,Template,Description,Reference 2 | aces,image/aces,,[SMPTE][Howard_Lukk] 3 | bmp,image/bmp,,[RFC7903] 4 | cgm,image/cgm,Computer Graphics Metafile,[Alan_Francis] 5 | dicom-rle,image/dicom-rle,,[DICOM_Standards_Committee][David_Clunie] 6 | emf,image/emf,,[RFC7903] 7 | example,image/example,,[RFC4735] 8 | fits,image/fits,,[RFC4047] 9 | g3fax,image/g3fax,,[RFC1494] 10 | gif,,,[RFC2045][RFC2046] 11 | ief,,Image Exchange Format,[RFC1314] 12 | jls,image/jls,,[DICOM_Standards_Committee][David_Clunie] 13 | jp2,image/jp2,,[RFC3745] 14 | jpeg,,,[RFC2045][RFC2046] 15 | jpm,image/jpm,,[RFC3745] 16 | jpx,image/jpx,,[RFC3745] 17 | ktx,,,[Khronos][Mark_Callow][http://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/#mimeregistration] 18 | naplps,image/naplps,,[Ilya_Ferber] 19 | png,image/png,,[Glenn_Randers-Pehrson] 20 | prs.btif,image/prs.btif,,[Ben_Simon] 21 | prs.pti,image/prs.pti,,[Juern_Laun] 22 | pwg-raster,image/pwg-raster,,[Michael_Sweet] 23 | svg+xml,,,[W3C][http://www.w3.org/TR/SVG/mimereg.html] 24 | t38,image/t38,,[RFC3362] 25 | tiff,image/tiff,Tag Image File Format,[RFC3302] 26 | tiff-fx,image/tiff-fx,Tag Image File Format Fax eXtended,[RFC3950] 27 | vnd.adobe.photoshop,image/vnd.adobe.photoshop,,[Kim_Scarborough] 28 | vnd.airzip.accelerator.azv,image/vnd.airzip.accelerator.azv,,[Gary_Clueit] 29 | vnd.cns.inf2,image/vnd.cns.inf2,,[Ann_McLaughlin] 30 | vnd.dece.graphic,image/vnd.dece.graphic,,[Michael_A_Dolan] 31 | vnd.djvu,image/vnd-djvu,,[Leon_Bottou] 32 | vnd.dwg,image/vnd.dwg,,[Jodi_Moline] 33 | vnd.dxf,image/vnd.dxf,,[Jodi_Moline] 34 | vnd.dvb.subtitle,image/vnd.dvb.subtitle,,[Peter_Siebert][Michael_Lagally] 35 | vnd.fastbidsheet,image/vnd.fastbidsheet,,[Scott_Becker] 36 | vnd.fpx,image/vnd.fpx,,[Marc_Douglas_Spencer] 37 | vnd.fst,image/vnd.fst,,[Arild_Fuldseth] 38 | vnd.fujixerox.edmics-mmr,image/vnd.fujixerox.edmics-mmr,,[Masanori_Onda] 39 | vnd.fujixerox.edmics-rlc,image/vnd.fujixerox.edmics-rlc,,[Masanori_Onda] 40 | vnd.globalgraphics.pgb,image/vnd.globalgraphics.pgb,,[Martin_Bailey] 41 | vnd.microsoft.icon,image/vnd.microsoft.icon,,[Simon_Butcher] 42 | vnd.mix,image/vnd.mix,,[Saveen_Reddy] 43 | vnd.ms-modi,image/vnd.ms-modi,,[Gregory_Vaughan] 44 | vnd.mozilla.apng,image/vnd.mozilla.apng,,[Stuart_Parmenter] 45 | vnd.net-fpx,image/vnd.net-fpx,,[Marc_Douglas_Spencer] 46 | vnd.radiance,image/vnd.radiance,,[Randolph_Fritz][Greg_Ward] 47 | vnd.sealed.png,image/vnd.sealed-png,,[David_Petersen] 48 | vnd.sealedmedia.softseal.gif,image/vnd.sealedmedia.softseal-gif,,[David_Petersen] 49 | vnd.sealedmedia.softseal.jpg,image/vnd.sealedmedia.softseal-jpg,,[David_Petersen] 50 | vnd.svf,image/vnd-svf,,[Jodi_Moline] 51 | vnd.tencent.tap,image/vnd.tencent.tap,,[Ni_Hui] 52 | vnd.valve.source.texture,image/vnd.valve.source.texture,,[Henrik_Andersson] 53 | vnd.wap.wbmp,image/vnd-wap-wbmp,,[Peter_Stark] 54 | 
vnd.xiff,image/vnd.xiff,,[Steven_Martin] 55 | vnd.zbrush.pcx,image/vnd.zbrush.pcx,,[Chris_Charabaruk] 56 | wmf,image/wmf,,[RFC7903] 57 | x-emf - DEPRECATED in favor of image/emf,image/emf,,[RFC7903] 58 | x-wmf - DEPRECATED in favor of image/wmf,image/wmf,,[RFC7903] 59 | -------------------------------------------------------------------------------- /mediatypes/video.csv: -------------------------------------------------------------------------------- 1 | Name,Template,Reference 2 | 1d-interleaved-parityfec,video/1d-interleaved-parityfec,[RFC6015] 3 | 3gpp,video/3gpp,[RFC3839][RFC6381] 4 | 3gpp2,video/3gpp2,[RFC4393][RFC6381] 5 | 3gpp-tt,video/3gpp-tt,[RFC4396] 6 | BMPEG,video/BMPEG,[RFC3555] 7 | BT656,video/BT656,[RFC3555] 8 | CelB,video/CelB,[RFC3555] 9 | DV,video/DV,[RFC6469] 10 | encaprtp,video/encaprtp,[RFC6849] 11 | example,video/example,[RFC4735] 12 | H261,video/H261,[RFC4587] 13 | H263,video/H263,[RFC3555] 14 | H263-1998,video/H263-1998,[RFC4629] 15 | H263-2000,video/H263-2000,[RFC4629] 16 | H264,video/H264,[RFC6184] 17 | H264-RCDO,video/H264-RCDO,[RFC6185] 18 | H264-SVC,video/H264-SVC,[RFC6190] 19 | H265,video/H265,[RFC7798] 20 | iso.segment,video/iso.segment,[David_Singer][ISO-IEC_JTC1] 21 | JPEG,video/JPEG,[RFC3555] 22 | jpeg2000,video/jpeg2000,[RFC5371][RFC5372] 23 | mj2,video/mj2,[RFC3745] 24 | MP1S,video/MP1S,[RFC3555] 25 | MP2P,video/MP2P,[RFC3555] 26 | MP2T,video/MP2T,[RFC3555] 27 | mp4,video/mp4,[RFC4337][RFC6381] 28 | MP4V-ES,video/MP4V-ES,[RFC6416] 29 | MPV,video/MPV,[RFC3555] 30 | mpeg,,[RFC2045][RFC2046] 31 | mpeg4-generic,video/mpeg4-generic,[RFC3640] 32 | nv,video/nv,[RFC4856] 33 | ogg,video/ogg,[RFC5334][RFC7845] 34 | parityfec,,[RFC5109] 35 | pointer,video/pointer,[RFC2862] 36 | quicktime,video/quicktime,[RFC6381][Paul_Lindner] 37 | raptorfec,video/raptorfec,[RFC6682] 38 | raw,,[RFC4175] 39 | rtp-enc-aescm128,video/rtp-enc-aescm128,[_3GPP] 40 | rtploopback,video/rtploopback,[RFC6849] 41 | rtx,video/rtx,[RFC4588] 42 | smpte291,video/smpte291,[RFC8331] 43 | SMPTE292M,video/SMPTE292M,[RFC3497] 44 | ulpfec,video/ulpfec,[RFC5109] 45 | vc1,video/vc1,[RFC4425] 46 | vnd.CCTV,video/vnd.CCTV,[Frank_Rottmann] 47 | vnd.dece.hd,video/vnd.dece.hd,[Michael_A_Dolan] 48 | vnd.dece.mobile,video/vnd.dece.mobile,[Michael_A_Dolan] 49 | vnd.dece.mp4,video/vnd.dece-mp4,[Michael_A_Dolan] 50 | vnd.dece.pd,video/vnd.dece.pd,[Michael_A_Dolan] 51 | vnd.dece.sd,video/vnd.dece.sd,[Michael_A_Dolan] 52 | vnd.dece.video,video/vnd.dece.video,[Michael_A_Dolan] 53 | vnd.directv.mpeg,video/vnd.directv-mpeg,[Nathan_Zerbe] 54 | vnd.directv.mpeg-tts,video/vnd.directv.mpeg-tts,[Nathan_Zerbe] 55 | vnd.dlna.mpeg-tts,video/vnd.dlna.mpeg-tts,[Edwin_Heredia] 56 | vnd.dvb.file,video/vnd.dvb.file,[Peter_Siebert][Kevin_Murray] 57 | vnd.fvt,video/vnd.fvt,[Arild_Fuldseth] 58 | vnd.hns.video,video/vnd.hns.video,[Swaminathan] 59 | vnd.iptvforum.1dparityfec-1010,video/vnd.iptvforum.1dparityfec-1010,[Shuji_Nakamura] 60 | vnd.iptvforum.1dparityfec-2005,video/vnd.iptvforum.1dparityfec-2005,[Shuji_Nakamura] 61 | vnd.iptvforum.2dparityfec-1010,video/vnd.iptvforum.2dparityfec-1010,[Shuji_Nakamura] 62 | vnd.iptvforum.2dparityfec-2005,video/vnd.iptvforum.2dparityfec-2005,[Shuji_Nakamura] 63 | vnd.iptvforum.ttsavc,video/vnd.iptvforum.ttsavc,[Shuji_Nakamura] 64 | vnd.iptvforum.ttsmpeg2,video/vnd.iptvforum.ttsmpeg2,[Shuji_Nakamura] 65 | vnd.motorola.video,video/vnd.motorola.video,[Tom_McGinty] 66 | vnd.motorola.videop,video/vnd.motorola.videop,[Tom_McGinty] 67 | vnd.mpegurl,video/vnd-mpegurl,[Heiko_Recktenwald] 68 | 
vnd.ms-playready.media.pyv,video/vnd.ms-playready.media.pyv,[Steve_DiAcetis] 69 | vnd.nokia.interleaved-multimedia,video/vnd.nokia.interleaved-multimedia,[Petteri_Kangaslampi] 70 | vnd.nokia.mp4vr,video/vnd.nokia.mp4vr,[Miska_M._Hannuksela] 71 | vnd.nokia.videovoip,video/vnd.nokia.videovoip,[Nokia] 72 | vnd.objectvideo,video/vnd.objectvideo,[John_Clark] 73 | vnd.radgamettools.bink,video/vnd.radgamettools.bink,[Henrik_Andersson] 74 | vnd.radgamettools.smacker,video/vnd.radgamettools.smacker,[Henrik_Andersson] 75 | vnd.sealed.mpeg1,video/vnd.sealed.mpeg1,[David_Petersen] 76 | vnd.sealed.mpeg4,video/vnd.sealed.mpeg4,[David_Petersen] 77 | vnd.sealed.swf,video/vnd.sealed-swf,[David_Petersen] 78 | vnd.sealedmedia.softseal.mov,video/vnd.sealedmedia.softseal-mov,[David_Petersen] 79 | vnd.uvvu.mp4,video/vnd.uvvu-mp4,[Michael_A_Dolan] 80 | vnd.vivo,video/vnd-vivo,[John_Wolfe] 81 | VP8,video/VP8,[RFC7741] 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | requests 3 | selenium 4 | tqdm -------------------------------------------------------------------------------- /util/file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import os 8 | 9 | def get_basename(filename): 10 | return filename.rsplit('.', 1)[0] 11 | 12 | def get_extension(filename): 13 | return filename.rsplit('.', 1)[1] 14 | 15 | def rename_file(filename, name): 16 | return '{}.{}'.format(name, get_extension(filename)) 17 | 18 | def safe_makedirs(path): 19 | if not os.path.exists(path): 20 | os.makedirs(path) -------------------------------------------------------------------------------- /util/instagram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | def largest_image_url(resources): 8 | return max(resources, key=lambda x: x['config_height']*x['config_width'])['src'] 9 | 10 | def node_name(node): 11 | return '{},{}'.format(node['id'], node['shortcode']) 12 | 13 | def parse_node(node, name=''): 14 | tasks = [] 15 | 16 | if name == '': 17 | name = node_name(node) 18 | else: 19 | name += ' ' + node_name(node) 20 | 21 | display_resources = node['display_resources'] 22 | # find the highest resolution image 23 | url = largest_image_url(display_resources) 24 | # download(url, path=save_path, rename=name, replace=False) 25 | tasks.append((url, name)) 26 | 27 | typename = node['__typename'] 28 | if typename == 'GraphImage': 29 | pass 30 | elif typename == 'GraphSidecar': 31 | edges = node['edge_sidecar_to_children']['edges'] 32 | for edge in edges: 33 | # parse_node(edge['node'], name, save_path) 34 | tasks += parse_node(edge['node'], name) 35 | elif typename == 'GraphVideo': 36 | url = node['video_url'] 37 | # download(url, path=save_path, rename=name, replace=False) 38 | tasks.append((url, name)) 39 | else: 40 | print('Error: unsupported typename "{}".'.format(typename)) 41 | 42 | return tasks -------------------------------------------------------------------------------- /util/seleniumdriver.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import platform 8 | from os import chmod, makedirs, stat 9 | from os.path import dirname, exists, join 10 | from selenium import webdriver 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 13 | 14 | SRC_URL_DICT = { 15 | 'webdriver/phantomjsdriver_2.1.1_win32/phantomjs.exe': 'https://www.dropbox.com/s/y1sc5ujzhdqb9f4/phantomjs.exe?dl=1', 16 | 'webdriver/phantomjsdriver_2.1.1_mac64/phantomjs': 'https://www.dropbox.com/s/b6hmitsz1u4wc5w/phantomjs?dl=1', 17 | 'webdriver/phantomjsdriver_2.1.1_linux32/phantomjs': 'https://www.dropbox.com/s/xxka7isoskg53tr/phantomjs?dl=1', 18 | 'webdriver/phantomjsdriver_2.1.1_linux64/phantomjs': 'https://www.dropbox.com/s/dhuw71d9l5umk5m/phantomjs?dl=1', 19 | 'webdriver/chromedriver_2.39_win32/chromedriver.exe': 'https://www.dropbox.com/s/k8dibiirz35zjf0/chromedriver.exe?dl=1', 20 | 'webdriver/chromedriver_2.39_mac64/chromedriver': 'https://www.dropbox.com/s/jatcb8n8lqijat9/chromedriver?dl=1', 21 | 'webdriver/chromedriver_2.39_linux64/chromedriver': 'https://www.dropbox.com/s/vgyik5zsngpkck4/chromedriver?dl=1', 22 | 'webdriver/geckodriver_0.19.1_win32/geckodriver.exe': 'https://www.dropbox.com/s/s10tyhwc8z9nikg/geckodriver.exe?dl=1', 23 | 'webdriver/geckodriver_0.19.1_win64/geckodriver.exe': 'https://www.dropbox.com/s/r9zt6l9c7cn1pc8/geckodriver.exe?dl=1', 24 | 'webdriver/geckodriver_0.19.1_macos/geckodriver': 'https://www.dropbox.com/s/la2bfgdsdk2mrhj/geckodriver?dl=1', 25 | 'webdriver/geckodriver_0.19.1_linux32/geckodriver': 'https://www.dropbox.com/s/8qjr5n1i9jhmkmb/geckodriver?dl=1', 26 | 'webdriver/geckodriver_0.19.1_linux64/geckodriver': 'https://www.dropbox.com/s/b966sm5v98nmd5g/geckodriver?dl=1', 27 | 'webdriver/geckodriver_0.19.1_arm7hf/geckodriver': 'https://www.dropbox.com/s/k8dibiirz35zjf0/chromedriver.exe?dl=1' 28 | } 29 | 30 | def get(driverType, localDriver=True, path='.'): 31 | driverType = str(driverType) 32 | if driverType == 'PhantomJS': 33 | # phantomjs_options.add_argument("--disable-web-security") 34 | if localDriver: 35 | source = get_source(driverType, path) 36 | driver = webdriver.PhantomJS(executable_path=source, service_log_path=join(path, 'phantomjs.log'), service_args=["--remote-debugger-port=9000", "--web-security=false"]) 37 | # driver = webdriver.PhantomJS(executable_path=source, service_args=["--remote-debugger-port=9000", "--web-security=false"]) 38 | else: 39 | driver = webdriver.PhantomJS(service_log_path=join(path, 'phantomjs.log'), service_args=["--remote-debugger-port=9000", "--web-security=false"]) 40 | # driver = webdriver.PhantomJS(service_args=["--remote-debugger-port=9000", "--web-security=false"]) 41 | elif driverType == 'Chrome': 42 | desired = DesiredCapabilities.CHROME 43 | desired['loggingPrefs'] = {'browser': 'ALL'} 44 | chrome_options = Options() 45 | chrome_options.add_argument("--start-maximized") 46 | chrome_options.add_argument("--disable-infobars") 47 | chrome_options.add_argument("--disable-web-security") 48 | # chrome_options.add_argument("--window-size=800,600") 49 | # chrome_options.add_argument("--headless") # will not show the Chrome browser window 50 | if localDriver: 51 | source = get_source(driverType, path) 52 | driver = 
webdriver.Chrome(executable_path=source, service_log_path=join(path, 'chromedriver.log'), desired_capabilities=desired, chrome_options=chrome_options) 53 | else: 54 | driver = webdriver.Chrome(service_log_path=join(path, 'chromedriver.log'), desired_capabilities=desired, chrome_options=chrome_options) 55 | elif driverType == 'Firefox': 56 | # desired = DesiredCapabilities.FIREFOX 57 | # desired['loggingPrefs'] = {'browser': 'ALL'} 58 | firefox_options = Options() 59 | firefox_options.add_argument("--start-maximized") 60 | firefox_options.add_argument("--disable-infobars") 61 | if localDriver: 62 | source = get_source(driverType, path) 63 | driver = webdriver.Firefox(executable_path=source, service_log_path=join(path, 'geckodriver.log'), firefox_options=firefox_options) 64 | else: 65 | driver = webdriver.Firefox(service_log_path=join(path, 'geckodriver.log'), firefox_options=firefox_options) 66 | return driver 67 | 68 | def get_source(driverType, path='.'): 69 | driverType = str(driverType) 70 | os = platform.system() 71 | bits = platform.architecture()[0] 72 | source = None 73 | if driverType == 'PhantomJS': 74 | if os == 'Windows': 75 | source = join(path, 'webdriver/phantomjsdriver_2.1.1_win32/phantomjs.exe') 76 | elif os == 'Darwin': 77 | source = join(path, 'webdriver/phantomjsdriver_2.1.1_mac64/phantomjs') 78 | elif os == 'Linux' and bits == '32bit': 79 | source = join(path, 'webdriver/phantomjsdriver_2.1.1_linux32/phantomjs') 80 | elif os == 'Linux' and bits == '64bit': 81 | source = join(path, 'webdriver/phantomjsdriver_2.1.1_linux64/phantomjs') 82 | else: 83 | raise Exception('Failed to recognize your OS [%s / %s].' % (os, bits)) 84 | elif driverType == 'Chrome': 85 | if os == 'Windows': 86 | source = join(path, 'webdriver/chromedriver_2.39_win32/chromedriver.exe') 87 | elif os == 'Darwin': 88 | source = join(path, 'webdriver/chromedriver_2.39_mac64/chromedriver') 89 | elif os == 'Linux': 90 | source = join(path, 'webdriver/chromedriver_2.39_linux64/chromedriver') 91 | else: 92 | raise Exception('Failed to recognize your OS [%s / %s].' % (os, bits)) 93 | elif driverType == 'Firefox': 94 | if os == 'Windows' and bits == '32bit': 95 | source = join(path, 'webdriver/geckodriver_0.19.1_win32/geckodriver.exe') 96 | elif os == 'Windows' and bits == '64bit': 97 | source = join(path, 'webdriver/geckodriver_0.19.1_win64/geckodriver.exe') 98 | elif os == 'Darwin': 99 | source = join(path, 'webdriver/geckodriver_0.19.1_macos/geckodriver') 100 | elif os == 'Linux' and bits == '32bit': 101 | source = join(path, 'webdriver/geckodriver_0.19.1_linux32/geckodriver') 102 | elif os == 'Linux' and bits == '64bit': 103 | source = join(path, 'webdriver/geckodriver_0.19.1_linux64/geckodriver') 104 | else: 105 | raise Exception('Failed to recognize your OS [%s / %s].' % (os, bits)) 106 | else: 107 | raise Exception('Not supported driver type [%s].' % driverType) 108 | if not exists(source): 109 | print('Web driver "%s" not found.' 
% source) 110 | global SRC_URL_DICT 111 | for (src, url) in SRC_URL_DICT.items(): 112 | if src in source: 113 | print('Start downloading the web driver...') 114 | makedirs(dirname(source), exist_ok=True) 115 | # import urllib.request 116 | import requests 117 | import shutil 118 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 119 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 120 | # u = urllib.request.urlopen(url) 121 | # data = u.read() 122 | # u.close() 123 | res = requests.get(url, stream=True) 124 | with open(source, "wb") as f: 125 | # f.write(data) 126 | shutil.copyfileobj(res.raw, f) 127 | st = stat(source) 128 | chmod(source, st.st_mode | 0o111) # make it executable 129 | print('Web driver "%s" has been downloaded successfully.' % source) 130 | print(source) 131 | return source 132 | -------------------------------------------------------------------------------- /util/twitter.py: -------------------------------------------------------------------------------- 1 | import requests as r 2 | import json as j 3 | import re 4 | 5 | def get_twitter_video_url(id): 6 | headers = { 7 | 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' 8 | } 9 | 10 | url = 'https://api.twitter.com/1.1/videos/tweet/config/{}.json'.format(id) 11 | res = r.get(url, headers=headers) 12 | post = j.loads(res.text) 13 | return post['posterImage'], post['track']['playbackUrl'].rsplit('?', 1)[0] 14 | 15 | def twitter_m3u8(url, file): 16 | res = r.get(url) 17 | p = re.compile('/(\d+x\d+)/') 18 | max_resol = 0 19 | link = '' 20 | for line in res.text.split('\n'): 21 | if not line.startswith('#') and line != '': 22 | m = p.search(line) 23 | w, h = map(int, m.group(1).split('x')) 24 | resol = w * h 25 | if resol > max_resol: 26 | link = line 27 | max_resol = resol 28 | return download_m3u8('https://video.twimg.com' + link, file) 29 | 30 | def download_m3u8(url, file): 31 | res = r.get(url) 32 | for line in res.text.split('\n'): 33 | if not line.startswith('#') and line != '': 34 | link = 'https://video.twimg.com' + line 35 | stream = r.get(link, stream=True) 36 | with open(file, 'ab') as f: 37 | for chunk in stream.iter_content(chunk_size=1024): 38 | if chunk: 39 | f.write(chunk) 40 | return True -------------------------------------------------------------------------------- /util/url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2018 Elvis Yu-Jing Lin 5 | # Licensed under the MIT License - https://opensource.org/licenses/MIT 6 | 7 | import os 8 | import urllib 9 | import requests 10 | from util.file import rename_file, safe_makedirs 11 | from util.twitter import twitter_m3u8 12 | 13 | def get_filename(url): 14 | return url.rsplit('/')[-1].split(':')[0] 15 | 16 | def complete_url(url, current_url): 17 | if ':' not in url: 18 | if url.startswith('//'): 19 | url = 'http:' + url 20 | elif url.startswith('/'): 21 | if url.endswith('/'): 22 | base_url = '/'.join(current_url.split('/')[:3]) 23 | url = base_url + url 24 | else: 25 | dir_url = current_url.rsplit('/', 1)[0] 26 | url = dir_url + url 27 | else: 28 | dir_url = current_url.rsplit('/', 1)[0] 29 | url = dir_url + '/' + url 30 | return url 31 | 32 | def download(url, path='.', rename=None, replace=True): 33 | if rename is None: 34 | filename = urllib.parse.unquote(get_filename(url)) 35 | else: 36 | filename = 
rename_file(get_filename(url), rename) 37 | 38 | file = os.path.join(path, filename) 39 | 40 | if not replace and os.path.exists(file): 41 | print('The file {} exists. Skip it.'.format(file)) 42 | return 43 | 44 | if '.m3u8' in url: 45 | twitter_m3u8(url, file.replace('.m3u8', '.ts')) 46 | else: 47 | r = requests.get(url, stream=True) 48 | if r.status_code == 200: 49 | safe_makedirs(path) 50 | with open(file, 'wb') as f: 51 | for chunk in r: 52 | f.write(chunk) 53 | else: 54 | print('Error: status code of {} "{}" is {}.'.format(filename, url, r.status_code)) 55 | 56 | # Guess the type of url by its mime format. 57 | # 58 | # IANA - MIME 59 | # Media Types: http://www.iana.org/assignments/media-types/media-types.xhtml 60 | 61 | import mimetypes 62 | 63 | def get_mimetype(url): 64 | return mimetypes.guess_type(url, strict=False)[0] 65 | 66 | def is_image(url): 67 | mimetype = get_mimetype(url) 68 | return None if mimetype is None else mimetype.split('/')[0] == 'image' 69 | 70 | def is_video(url): 71 | mimetype = get_mimetype(url) 72 | return None if mimetype is None else mimetype.split('/')[0] == 'video' 73 | 74 | def is_media(url): 75 | mimetype = get_mimetype(url) 76 | return False if mimetype is None else mimetype.split('/')[0] == 'image' or mimetype.split('/')[0] == 'video' --------------------------------------------------------------------------------
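
A minimal usage sketch (not part of the repository) showing how the helpers in `util/url.py` above could fit together; the page URL and the link list below are hypothetical placeholders.

```python
# Hypothetical usage sketch for util/url.py (assumes the repository root is on PYTHONPATH).
from util.url import complete_url, download, is_media

current_page = 'https://example.com/gallery/index.html'   # hypothetical page being scraped
hrefs = ['photo1.jpg', '/static/clip.mp4', 'notes.txt']    # hypothetical links found on that page

for href in hrefs:
    url = complete_url(href, current_page)   # resolve relative links against the current page URL
    if is_media(url):                        # keep only URLs whose guessed MIME type is image/* or video/*
        download(url, path='./download', replace=False)   # replace=False skips files that already exist
```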