├── requirements.txt ├── test.py ├── info_handler.py ├── scraper_handler.py ├── info_collectors ├── _init_anidb.py └── anidb.py ├── .gitignore ├── download_handler.py ├── downloaders ├── mp4.py ├── mycloud.py └── vidstreaming.py ├── templates └── module_search.py ├── scrapers ├── anime9.py ├── gogoanime.py ├── animeheaven.py └── masteranime.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | cfscrape 3 | requests 4 | furl 5 | html5lib 6 | lxml 7 | demjson 8 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | fileLocation = os.path.realpath(__file__) 3 | directory = os.path.dirname(fileLocation) 4 | print(os.path.join(directory, "..")) 5 | -------------------------------------------------------------------------------- /info_handler.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | from .templates.module_search import ModuleSearch 4 | 5 | 6 | class InfoHandler(ModuleSearch): 7 | 8 | def __init__(self): 9 | self._get_modules('info_collectors') 10 | 11 | def _search_module(self, query, strict, module): 12 | return module.search(query, strict) 13 | 14 | def search(self, query, strict=False): 15 | return [ 16 | self._search_module(query, strict, x) 17 | for x in self.modules 18 | ] 19 | 20 | def _details_module(self, id, module): 21 | return module.getDetailedInfo(id) 22 | 23 | def getDetailedInfo(self, id): 24 | return [ 25 | self._details_module(id, x) for x in self.modules 26 | ] 27 | 28 | 29 | info_handler = InfoHandler() 30 | -------------------------------------------------------------------------------- /scraper_handler.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import imp 3 | import logging 4 | import os 5 | import re 6 | 7 | from difflib import SequenceMatcher 8 | from .templates.module_search import ModuleSearch 9 | 10 | 11 | class ScraperHandler(ModuleSearch): 12 | # Deals with resolving the scraping of links 13 | # Automatically resolves with modules in 14 | # the scrapers folder. 15 | def __init__(self): 16 | self._get_modules('scrapers') 17 | 18 | def _search_module(self, query, module): 19 | return module.search(query) 20 | 21 | # Searches using scraper modules based on query 22 | def search(self, query, limited_modules=None): 23 | logging.debug("Starting a search for '%s'." % (query,)) 24 | return [ 25 | self._search_module(query, x) 26 | for x in self.modules 27 | if limited_modules is None or x.site_name in limited_modules 28 | ] 29 | 30 | # Resolves a URL and returns data from 31 | # proper module and function 32 | def resolve(self, link): 33 | logging.debug( 34 | "Starting a resolution for '%s'" 35 | "under scraper_handler." 
% (link,) 36 | ) 37 | for module in self.modules: 38 | functions = self._try_match_module(link, module) 39 | if len(functions) > 0: 40 | return functions[0](link) 41 | return None 42 | 43 | 44 | def score_similarity(stringA, stringB): 45 | return SequenceMatcher(None, stringA, stringB).ratio() 46 | 47 | scraper_handler = ScraperHandler() 48 | -------------------------------------------------------------------------------- /info_collectors/_init_anidb.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import requests 3 | import os 4 | 5 | 6 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 7 | INFO_FILE = os.path.join(BASE_PATH, "last_download.txt") 8 | DOWNLOAD_URL = "http://anidb.net/api/anime-titles.xml.gz" 9 | DOWNLOAD_FILE = os.path.join(BASE_PATH, "anime-titles.xml") 10 | 11 | 12 | class DownloadList: 13 | 14 | def __init__(self): 15 | self.need_download = self.need_to_download() 16 | 17 | def need_to_download(self): 18 | try: 19 | with open(INFO_FILE, "r") as f: 20 | data = f.readline() 21 | if len(data) > 0: 22 | last_download = date.fromordinal(int(data)) 23 | time_delta = date.today() - last_download 24 | if time_delta.days > 7: 25 | return True 26 | else: 27 | return False 28 | else: 29 | return True 30 | except FileNotFoundError: 31 | return True 32 | return False 33 | 34 | def write_today_ordinal(self): 35 | with open(INFO_FILE, "w") as f: 36 | f.write(str(date.today().toordinal()) + "\n") 37 | 38 | def download_list(self): 39 | request = requests.get(DOWNLOAD_URL, stream=True) 40 | with open(DOWNLOAD_FILE, "wb") as f: 41 | for chunk in request.iter_content(chunk_size=1024): 42 | if chunk: 43 | f.write(chunk) 44 | 45 | def get_file(self): 46 | if self.need_to_download(): 47 | self.write_today_ordinal() 48 | self.download_list() 49 | 50 | 51 | download_list = DownloadList() 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # Environments 82 | .env 83 | .venv 84 | env/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # My files 99 | anime-titles.xml.gz 100 | anime-titles.xml 101 | last_download.txt -------------------------------------------------------------------------------- /download_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .templates.module_search import ModuleSearch 4 | 5 | 6 | class DownloadHandler(ModuleSearch): 7 | # Deals with resolving downloading of files 8 | def __init__(self): 9 | self._get_modules('downloaders') 10 | 11 | def single_download(self, link, abs_path): 12 | """ 13 | Download a single episode. 14 | 'link' is the full link of it (get it with scraper_handler). 15 | 'abs_path' is full path + filename of downloaded file, example - 16 | "/home/User/MyDownloadedEpisode.mp4" 17 | """ 18 | for module in self.modules: 19 | if self._try_match_module(link, module): 20 | if module.download(link, abs_path): 21 | return True 22 | return False 23 | return False 24 | 25 | def resolve(self, data): 26 | logging.info( 27 | "Trying to resolve '%s'" 28 | % (data['epNum']) 29 | ) 30 | for module in self.modules: 31 | for source in data['sources']: 32 | logging.info( 33 | "Trying to resolve '%s' source." 34 | % (source['link']) 35 | ) 36 | if self._try_match_module(source['link'], module): 37 | logging.info( 38 | "Found a matching module for '%s'." 39 | % (source,) 40 | ) 41 | # PEP8 Too long 42 | fileName = "%s.mp4" % (data['epNum'],) if 'epNum' in data else source 43 | if module.download(source['link'], fileName): 44 | break 45 | 46 | download_handler = DownloadHandler() 47 | -------------------------------------------------------------------------------- /downloaders/mp4.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import logging 5 | 6 | import requests 7 | 8 | 9 | class Timer: 10 | def restart(self, request): 11 | self.length = int(request.headers.get('content-length')) 12 | self.start = time.clock() 13 | 14 | self.current = 0 15 | 16 | def __init__(self, request): 17 | self.restart(request) 18 | 19 | def tick(self, chunk_size): 20 | self.current += chunk_size 21 | speed = round(self.current // (time.clock() - self.start) / 1000000, 2) 22 | percentComplete = round((self.current / self.length) * 100, 1) 23 | sys.stdout.write( 24 | "\r %s Mbps | %r Percent Complete" 25 | % (speed, percentComplete) 26 | ) 27 | 28 | 29 | def download(link, filename): 30 | logging.info("Starting download for %s." 
% (link,)) 31 | tempName = "%s.tmp" % (filename,) 32 | 33 | with open(tempName, 'wb') as f: 34 | request = requests.get(link, stream=True) 35 | 36 | # timer = Timer(request) 37 | 38 | for chunk in request.iter_content(chunk_size=1024): 39 | # timer.tick(len(chunk)) 40 | 41 | if chunk: 42 | f.write(chunk) 43 | else: 44 | logging.error("Failed to a chunk for '%s'." % (link,)) 45 | logging.info("Finished downloading '%s'." % (link,)) 46 | os.rename(tempName, filename) 47 | return True 48 | 49 | matching_urls = [ 50 | { 51 | 'urls': [ 52 | r'http://(.*).animeheaven.eu/video/(.*).mp4(.*)', 53 | r'http://(.*).animeheaven.eu/[0-9]+pi/(.*).mp4(.*)', 54 | r'https://[0-9]+.bp.blogspot.com(.*)', 55 | ], 56 | 'function': download, 57 | }, 58 | ] 59 | -------------------------------------------------------------------------------- /templates/module_search.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import imp 3 | import logging 4 | import os 5 | import re 6 | 7 | 8 | class ModuleSearch(object): 9 | def _load_single_module(self, f): 10 | return imp.load_source(f[:-3], f) 11 | 12 | def _load_modules(self): 13 | return [self._load_single_module(x) for x in self.modules] 14 | 15 | def _try_match_url(self, link, matchingURL): 16 | return True if re.match(matchingURL, link) is not None else False 17 | 18 | def _try_match_module_section(self, link, section): 19 | urls = section['urls'] 20 | matches = [ 21 | section['function'] for x in urls 22 | if self._try_match_url(link, x) is not False 23 | ] 24 | return True if len(matches) > 0 else False 25 | 26 | def _try_match_module(self, link, module): 27 | sections = module.matching_urls 28 | return [x['function'] for x in sections 29 | if self._try_match_module_section(link, x) is not False] 30 | 31 | def __is_underscore(self, f): 32 | if f[f.rfind('/') + 1] == "_": 33 | return True 34 | return False 35 | 36 | def _get_modules(self, location): 37 | fileLocation = os.path.realpath(__file__) 38 | directory = os.path.dirname(fileLocation) 39 | self.module_location = os.path.join(directory, '..', location) 40 | self.modules = glob.glob("%s/*.py" % (self.module_location)) 41 | self.modules = [ 42 | module for module in self.modules 43 | if not self.__is_underscore(module) 44 | ] 45 | ''' 46 | for i in range(len(self.modules)): # Delete modules beginning with '_' 47 | module = self.modules[i] 48 | if module[module.rfind("/") + 1] == "_": 49 | del self.modules[i] 50 | ''' 51 | self.modules = self._load_modules() 52 | -------------------------------------------------------------------------------- /downloaders/mycloud.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import re 4 | import subprocess 5 | 6 | import requests 7 | 8 | from bs4 import BeautifulSoup 9 | 10 | types = ['iframe'] 11 | 12 | MY_CLOUD_PAT = re.compile(' MIN_SIMILARITY_RATIO: 44 | if ratio > highest_ratio: 45 | highest_ratio = ratio 46 | if not highest_ratio: 47 | continue 48 | ratio_list.append(highest_ratio) 49 | id = int(anime['aid']) 50 | titles = [title.string for title in 51 | anime.findAll("title", attrs={"type": ["main", "official"]})] 52 | results.append({"id": id, "titles": titles}) 53 | return [x for (y, x) in 54 | sorted(list(zip(ratio_list, results)), 55 | key=lambda pair: pair[0], reverse=True)] 56 | 57 | 58 | def getDetailedInfo(id): 59 | ''' 60 | Gets a detailed info from the ID provided. A dict is returned with 61 | the following keys. 
The type of the value is also mentioned. 62 | 63 | id: int, type: str, start_date: str, end_date: str, other_names: str, 64 | creators: [{str: str}], permanent_rating: float, image_url: str, 65 | description: str, recommendations: [{str: str}] 66 | ''' 67 | request = requests.get(BASE_URL, params={ 68 | "request": "anime", 69 | "aid": str(id), 70 | "protover": "1", 71 | "client": CLIENT, 72 | "clientver": str(CLIENT_VERSION) 73 | }) 74 | request.raise_for_status() 75 | result_page = BeautifulSoup(request.text, "xml") 76 | 77 | results = { 78 | "id": id, 79 | "type": result_page.find("type").string, 80 | "episode_count": result_page.find("episodecount").string, 81 | "start_date": result_page.find("startdate").string, 82 | "end_date": result_page.find("enddate").string, 83 | "other_names": [title.string for title in 84 | result_page.find("titles").findAll("title")], 85 | "creators": [{name['type']: name.string} 86 | for name in result_page.find("creators").findAll("name")], 87 | "permanent_rating": float(result_page.find("ratings") 88 | .find("permanent").string), 89 | "image_url": IMAGE_URL + result_page.find("picture").string, 90 | "description": result_page.find("description").string 91 | } 92 | return results 93 | 94 | 95 | matching_urls = [ 96 | { 97 | 'urls': [], 98 | 'function': search, 99 | }, 100 | { 101 | 'urls': [], 102 | 'function': getDetailedInfo, 103 | }, 104 | ] 105 | -------------------------------------------------------------------------------- /scrapers/anime9.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | 5 | from bs4 import BeautifulSoup as bs 6 | 7 | site_name = "9anime.is" 8 | 9 | BASE_URL = 'https://9anime.is' 10 | SEARCH_URL = '%s/search' % (BASE_URL,) 11 | INFO_API_URL = "%s/ajax/episode/info" % (BASE_URL,) 12 | 13 | 14 | def _parse_search_single(data): 15 | img = data.find("img") 16 | nameAnchor = data.find("a", {"class": "name"}) 17 | lang = data.find('div', {'class': 'lang'}) 18 | lang = lang.text if lang is not None else 'sub' 19 | 20 | return { 21 | 'title': nameAnchor.text, 22 | 'link': nameAnchor['href'], 23 | 'language': lang.lower(), 24 | 'host': site_name, 25 | 'poster': img['src'] 26 | } 27 | 28 | 29 | def _parse_search_multi(data): 30 | return [ 31 | _parse_search_single(x) 32 | for x in data.findAll("div", {"class": "item"}) 33 | ] 34 | 35 | 36 | def search(query): 37 | params = { 38 | 'keyword': query, 39 | } 40 | data = bs(requests.get(SEARCH_URL, params=params).content) 41 | 42 | return _parse_search_multi(data) 43 | 44 | 45 | def _scrape_episode_source(data): 46 | return { 47 | 'link': data['file'], 48 | 'type': data['type'], 49 | 'quality': data['label'], 50 | } 51 | 52 | 53 | def _scrape_episode_sources(data): 54 | request = requests.get(data['grabber'], params=data['params']).json() 55 | return [_scrape_episode_source(x) for x in request['data']] 56 | 57 | 58 | def _scrape_episode_info(id): 59 | logging.debug("'%s' is performing a info grab for '%s'" % (site_name, id,)) 60 | params = {'id': id} 61 | data = requests.get(INFO_API_URL, params=params) 62 | if data.status_code == 200: 63 | data = data.json() 64 | if data.get('target') == '' or data.get('type') == 'direct': 65 | return _scrape_episode_sources(data) 66 | else: 67 | return { 68 | 'link': data.get('target'), 69 | 'type': data.get('type'), 70 | } 71 | 72 | 73 | def _parse_server_single_episode(data): 74 | anchor = data.find("a") 75 | id = anchor['data-id'] 76 | output = { 77 | 'data-id': id, 78 | 'epNum': 
anchor.text, 79 | 'sources': _scrape_episode_info(id), 80 | } 81 | return output if output['sources'] is not None else None 82 | 83 | 84 | def _parse_server_episodes(data): 85 | episodes = data.findAll("li") 86 | sources = [_parse_server_single_episode(x) for x in episodes] 87 | if len(sources) > 0: 88 | return list(filter(None, sources)) 89 | 90 | 91 | def _scrape_all_servers(data): 92 | servers = data.findAll("ul", {"class": "episodes range active"}) 93 | sourcedServers = [_parse_server_episodes(x) for x in servers] 94 | return list(filter(None, sourcedServers)) 95 | 96 | 97 | def format_combine_multi(unformatedOutput): 98 | output = [] 99 | for ep in unformatedOutput: 100 | output.append({ 101 | 'epNum': str(int(ep)), # remove leading 0s 102 | 'sources': unformatedOutput[ep] 103 | }) 104 | return output 105 | 106 | 107 | def combine_multi(servers): 108 | unformatedOutput = {} 109 | print(servers) 110 | for server in servers: 111 | for ep in server: 112 | if ep['epNum'] not in unformatedOutput: 113 | unformatedOutput[ep['epNum']] = [ep['sources']] 114 | else: 115 | unformatedOutput[ep['epNum']] += [ep['sources']] 116 | 117 | return format_combine_multi(unformatedOutput) 118 | 119 | 120 | def _scrape_title(data): 121 | return data.find('h1', {'class': 'title'}).text 122 | 123 | 124 | def scrape_all_show_sources(link): 125 | logging.info( 126 | "A request for '%s' was made under %s scraper." % 127 | (link, site_name) 128 | ) 129 | data = bs(requests.get(link).content, 'html.parser') 130 | body = data.find('body') 131 | servers = _scrape_all_servers(data) 132 | return { 133 | 'episodes': combine_multi(servers), 134 | 'title': _scrape_title(data), 135 | 'host': site_name, 136 | } 137 | 138 | matching_urls = [ 139 | { 140 | 'urls': [ 141 | r'https://9anime.to/watch/(.*).(.*)', 142 | r'https://9anime.is/watch/(.*).(.*)' 143 | ], 144 | 'function': scrape_all_show_sources, 145 | }, 146 | ] 147 | -------------------------------------------------------------------------------- /downloaders/vidstreaming.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import logging 5 | 6 | import requests 7 | 8 | from furl import furl 9 | from bs4 import BeautifulSoup 10 | 11 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 12 | sys.path.append(BASE_PATH) 13 | import mp4 14 | 15 | site_name = 'vidstream' 16 | 17 | BASE_URL = "https://vidstreaming.io" 18 | DOWNLOAD_URL = "https://vidstream.co/download" 19 | 20 | qualities = ['1080', '720', '480', '360'] 21 | 22 | STREAMING_PAT = '\?id=([a-zA-Z0-9]+?)(?:=|$)' 23 | 24 | 25 | def _try_match_url(link, matchingURL): 26 | return True if re.match(matchingURL, link) is not None else False 27 | 28 | 29 | def _try_match_module_section(link, section): 30 | urls = section['urls'] 31 | matches = [ 32 | section['function'] for x in urls 33 | if _try_match_url(link, x) is not False 34 | ] 35 | return True if len(matches) > 0 else False 36 | 37 | 38 | def resolve(link): 39 | for section in internal_matching_urls: 40 | if _try_match_module_section(link, section): 41 | logging.info("Found a match for %s" % (link,)) 42 | return section['function'](link) 43 | return None 44 | 45 | 46 | def download(link, fname): 47 | logging.info("Starting download for '%s' under vidstreaming." 
% (link,)) 48 | sources = resolve(link)['sources'] 49 | logging.info("Recieved %i sources" % (len(sources))) 50 | if len(sources) > 0: 51 | source = sources[0]['link'] 52 | else: 53 | logging.critical("Can't find sources on vidstreaming!") 54 | return False 55 | if source is not None: 56 | if mp4.download(source, fname): 57 | return True 58 | return False 59 | 60 | 61 | def _parse_quality(title): 62 | for q in qualities: 63 | if q in title: 64 | return q 65 | return None 66 | 67 | 68 | def _parse_list_single(data): 69 | return { 70 | 'link': data['href'], 71 | 'type': 'mp4', 72 | 'quality': _parse_quality(data.text), 73 | } 74 | 75 | 76 | def _parse_list_multi(data): 77 | box = data.find("div", {"class": "mirror_link"}) 78 | sources = box.findAll("a") 79 | if len(sources) == 0: 80 | logging.critical("Can't find sources on vidstreaming!") 81 | return [_parse_list_single(x) for x in sources] 82 | 83 | 84 | def _scrape_video_sources_id(id): 85 | params = { 86 | 'id': id, 87 | } 88 | request = requests.get(DOWNLOAD_URL, params=params).content 89 | data = BeautifulSoup(request, 'html.parser') 90 | return { 91 | 'sources': _parse_list_multi(data), 92 | } 93 | 94 | 95 | def _scrape_video_sources(link): 96 | id = furl(link).args['id'] 97 | logging.info("Found id %s from '%s'" % (id, link,)) 98 | return _scrape_video_sources_id(id) 99 | 100 | 101 | def _parse_list_embed_single(data): 102 | return { 103 | 'link': data['src'], 104 | 'type': 'mp4', 105 | 'quality': data['label'], 106 | } 107 | 108 | 109 | def _parse_list_embed_multi(data): 110 | sources = data.findAll("source", {"type": "video/mp4"}) 111 | return [_parse_list_embed_single(x) for x in sources] 112 | 113 | 114 | def _scrape_video_embed(link): 115 | data = BeautifulSoup(requests.get(link).content, 'html.parser') 116 | result = { 117 | 'sources': _parse_list_embed_multi(data), 118 | } 119 | if len(result['sources']) == 0: 120 | logging.info('Falling back to legacy downloader for %s' % (link,)) 121 | result['sources'] = _scrape_video_sources(link) 122 | return result 123 | 124 | def _fix_link(link): 125 | fixed_link = "http:" + link 126 | return _scrape_video_embed(fixed_link) 127 | 128 | def _scrape_streaming(link): 129 | id = re.search(STREAMING_PAT, link) 130 | id = id.group(1) if id is not None else None 131 | 132 | if id: 133 | return _scrape_video_sources_id(id) 134 | 135 | return None 136 | 137 | matching_urls = [ 138 | { 139 | 'urls': [ 140 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)', 141 | r'https://vidstream.co/embed.php\?(.*)', 142 | r'https://vidstreaming.io/embed.php\?id=(.*)', 143 | ], 144 | 'function': download, 145 | } 146 | ] 147 | 148 | internal_matching_urls = [ 149 | { 150 | 'urls': [ 151 | r'https://vidstream.co/download\?id=(.*)', 152 | ], 153 | 'function': _scrape_video_sources, 154 | }, 155 | { 156 | 'urls': [ 157 | r'https://vidstream.co/embed.php\?(.*)', 158 | r'https://vidstreaming.io/embed.php\?id=(.*)', 159 | ], 160 | 'function': _scrape_video_embed, 161 | }, 162 | { 163 | 'urls': [ 164 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)', 165 | ], 166 | 'function': _scrape_streaming, 167 | } 168 | ] 169 | -------------------------------------------------------------------------------- /scrapers/gogoanime.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | import cfscrape as cf 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | site_name = 'gogoanime' 9 | 10 | BASE_URL = "https://gogoanime.io" 11 | SEARCH_URL = 
"%s/search.html" % (BASE_URL,) 12 | EPISODE_LIST_URL = "%s//load-list-episode" % (BASE_URL,) 13 | SHOW_URL = "%s/category/" % (BASE_URL,) 14 | 15 | id_pat = re.compile("var id = (.*?);") 16 | streaming_name_pat = re.compile('"(.*?)"') 17 | epnum_pat = re.compile('episode-(.*?)$') 18 | released_pat = re.compile("Released: ([0-9]+)") 19 | 20 | cfscrape = cf.create_scraper() 21 | 22 | 23 | def _combine_link(url): 24 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '') 25 | 26 | 27 | def _parse_released_date(data): 28 | fullString = str(data.find("p", {"class": "released"})) 29 | output = re.findall(released_pat, fullString) 30 | return output[0] if len(output) > 0 else None 31 | 32 | 33 | def _extract_single_search(data): 34 | name = data.find('p', {'class': 'name'}).find('a') 35 | return { 36 | 'link': _combine_link(name['href']), 37 | 'title': name.text, 38 | 'language': 'dub' if 'dub' in name.text.lower() else 'sub', 39 | 'released': _parse_released_date(data), 40 | 'host': site_name, 41 | } 42 | 43 | 44 | def _extract_multiple_search(data): 45 | entries = data.find('ul', {'class': 'items'}).findAll("li") 46 | return [_extract_single_search(x) for x in entries] 47 | 48 | 49 | def search(query): 50 | ''' 51 | Returns all search results based on a query 52 | [ 53 | { 54 | 'link': 'link to show on gogoanime', 55 | 'title': 'the full title of the show', 56 | 'language': 'either subbed or dubbed', 57 | } 58 | ] 59 | ''' 60 | params = { 61 | 'keyword': query, 62 | 'id': -1, 63 | } 64 | 65 | data = cfscrape.get(SEARCH_URL, params=params).content 66 | data = BeautifulSoup(data, 'html.parser') 67 | 68 | return _extract_multiple_search(data) 69 | 70 | 71 | def _parse_list_single(data): 72 | return { 73 | 'name': data.find("div", {"class": "name"}).text, 74 | 'link': _combine_link(data['href']), 75 | 'language': data.find("div", {"class": "cate"}).text.lower(), 76 | 'type': 'iframe', 77 | } 78 | 79 | 80 | def _parse_list_multi(data): 81 | episodes = data.findAll("a") 82 | return [_parse_list_single(x) for x in episodes] 83 | 84 | 85 | def _load_list_episode(id): 86 | params = { 87 | 'ep_start': 0, 88 | 'ep_end': 9999999, 89 | 'id': id, 90 | 'default_ep': 0, 91 | } 92 | data = cfscrape.get(EPISODE_LIST_URL, params=params).content 93 | data = BeautifulSoup(data, 'html.parser') 94 | return _parse_list_multi(data) 95 | 96 | 97 | def _scrape_show_id(data): 98 | return re.findall(id_pat, str(data)) 99 | 100 | 101 | def _scrape_title(data): 102 | return data.find("div", {"class": "anime_info_body_bg"}).find('h1').text 103 | 104 | 105 | def _scrape_status(data): 106 | return data.findAll('p', {'class': 'type'})[4].text.replace('Status: ', '') 107 | 108 | 109 | def _scrape_released(data): 110 | text = data.findAll('p', {'class': 'type'})[3].text 111 | return text.replace('Released: ', '') 112 | 113 | 114 | def _scrape_epNum(url): 115 | epNum = re.findall(epnum_pat, url) 116 | return epNum[0] if len(epNum) > 0 else '0' 117 | 118 | 119 | def _scrape_single_video_source(data): 120 | return { 121 | 'link': data['data-video'], 122 | 'type': 'iframe' 123 | } 124 | 125 | 126 | def _scrape_video_sources(link): 127 | data = cfscrape.get(link).content 128 | soupedData = BeautifulSoup(data, 'html.parser') 129 | sources = soupedData.find("div", {"class", "anime_muti_link"}) 130 | sources = sources.findAll("a") 131 | 132 | return { 133 | 'epNum': _scrape_epNum(link), 134 | 'sources': list(map( 135 | lambda x: _scrape_single_video_source(x), 136 | sources) 137 | ), 138 | } 139 | 140 | 141 | def 
scrape_all_show_sources(link): 142 | data = cfscrape.get(link).content 143 | id = _scrape_show_id(data) 144 | data = BeautifulSoup(data, 'html.parser') 145 | episodes = _load_list_episode(id) 146 | 147 | return { 148 | 'episodes': [_scrape_video_sources(x['link']) for x in episodes], 149 | 'title': _scrape_title(data), 150 | 'status': _scrape_status(data), 151 | 'host': 'gogoanime', 152 | 'released': _scrape_released(data), 153 | } 154 | 155 | matching_urls = [ 156 | { 157 | 'urls': [r'https://(.*)gogoanime.io/category/(.*)'], 158 | 'function': scrape_all_show_sources, 159 | }, 160 | { 161 | 'urls': [r'https://(.*)gogoanime.io//search.html?keyword=(.*)'], 162 | 'function': search, 163 | }, 164 | { 165 | 'urls': [r'https://(.*)gogoanime.io/(.*)-episode-([0-9]+)'], 166 | 'function': _scrape_video_sources, 167 | } 168 | ] 169 | -------------------------------------------------------------------------------- /scrapers/animeheaven.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | import requests 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | site_name = 'animeheaven' 9 | 10 | BASE_URL = "http://animeheaven.eu" 11 | SEARCH_URL = "%s/search.php" % (BASE_URL,) 12 | 13 | # source_pat = re.compile("Status:
(.*?)
') 17 | released_pat = re.compile('
Year:
(.*)
') 18 | 19 | 20 | def _combine_link(url): 21 | # Combines the relative url with the base url 22 | return ("%s/%s" % (BASE_URL, url,)).replace(' ', '%20') 23 | 24 | 25 | def _extract_single_search(data): 26 | # Takes in bs4 data of a single search result 27 | # and returns a formated dict 28 | anchor = data.find("a") 29 | img = anchor.find("img") 30 | name = img['alt'] 31 | return { 32 | 'link': _combine_link(anchor['href']), 33 | 'title': name, 34 | 'language': 'dub' if 'dub' in name.lower() else 'sub', 35 | 'host': site_name, 36 | 'poster': _combine_link(img['src']), 37 | } 38 | 39 | 40 | def _extract_multiple_search(data): 41 | # Takes in search result page 42 | # and returns list of formated results 43 | entries = data.findAll('div', {'class': 'iep'}) 44 | return [_extract_single_search(x) for x in entries] 45 | 46 | 47 | def search(query): 48 | ''' 49 | Returns all search results based on a query 50 | [ 51 | { 52 | 'link': 'link to show on gogoanime', 53 | 'title': 'the full title of the show', 54 | 'language': 'either subbed or dubbed', 55 | } 56 | ] 57 | ''' 58 | logging.info("A query for %s was made under animeheaven" % (query,)) 59 | params = {'q': query} 60 | data = requests.get(SEARCH_URL, params=params).content 61 | data = BeautifulSoup(data, 'html.parser') 62 | 63 | return _extract_multiple_search(data) 64 | 65 | 66 | def _parse_list_single(data): 67 | return { 68 | 'name': data.find("div", {"class": "infoept2"}), 69 | 'link': _combine_link(data['href']), 70 | } 71 | 72 | 73 | def _parse_list_multi(data): 74 | box = data.find("div", {"class": "infoepbox"}) 75 | episodes = box.findAll("a") 76 | return [_parse_list_single(x) for x in episodes] 77 | 78 | 79 | def _hex_source_to_str(source_url): 80 | return bytes(source_url, 'utf-8').decode('unicode_escape') 81 | 82 | 83 | def _scrape_single_video_source(data): 84 | source_url = re.findall(source_pat, str(data)) 85 | return { 86 | 'link': _hex_source_to_str(source_url[0]) if len(source_url) > 0 else None, 87 | 'type': 'mp4', 88 | } 89 | 90 | 91 | def _scrape_epNum(url): 92 | epNum = re.search(epnum_pat, url) 93 | return epNum.group().replace('e=', '') if epNum is not None else None 94 | 95 | def _parse_multi_video_sources(data): 96 | return [_scrape_video_sources(x) for x in data] 97 | 98 | 99 | def _scrape_video_sources(link): 100 | # Scrapes details on a specific 101 | # episode of a show based on link 102 | data = BeautifulSoup(requests.get(link).content) 103 | logging.info("Scraping video sources for %s under animeheaven" % (link,)) # test = data.findall("div", {'class': 'centerf2'}) 104 | sources = data.find("div", {'class': 'centerf2'}).findAll('script') 105 | 106 | return { 107 | 'epNum': _scrape_epNum(link), 108 | 'sources': [_scrape_single_video_source(x) for x in sources], 109 | } 110 | 111 | def _scrape_title(data): 112 | # Takes in bs4 show page 113 | # and returns the title of 114 | # the show 115 | return data.find("div", {"class": "infodes"}).text 116 | 117 | 118 | def _scrape_released(data): 119 | # Takes in bs4 show page and 120 | # returns released year as string 121 | box = data.findAll("div", {"class": 'infodes2'}) 122 | if len(box) < 1: return None 123 | box = box[1] 124 | released_date = re.search(released_pat, str(box)) 125 | return released_date.group() if released_date is not None else Noneß 126 | 127 | 128 | def _scrape_status(data): 129 | # Takes in bs4 show page and 130 | # return status of the show 131 | box = data.findAll("div", {"class": "infodes2"}) 132 | if len(box) < 1: return Noneß 133 | box = 
box[1]
134 |     status = re.search(status_pat, str(box))
135 |     return status.group() if status is not None else None
136 | 
137 | 
138 | def scrape_all_show_sources(link):
139 |     # Returns all of a show's sources and details
140 |     # based on the link of the show.
141 |     logging.info(
142 |         "A request for '%s' was made to animeheaven scraper."
143 |         % (link,)
144 |     )
145 |     data = BeautifulSoup(requests.get(link).content, 'html.parser')
146 |     episodes = _parse_list_multi(data)
147 |     logging.debug("Got %i links for %s" % (len(episodes), link,))
148 | 
149 |     return {
150 |         'episodes': [_scrape_video_sources(x['link']) for x in episodes],
151 |         'title': _scrape_title(data),
152 |         'status': _scrape_status(data),
153 |         'host': 'animeheaven',
154 |         'released': _scrape_released(data),
155 |     }
156 | 
157 | matching_urls = [
158 |     {
159 |         'urls': [r'http://animeheaven.eu/i.php\?a=(.*)'],
160 |         'function': scrape_all_show_sources,
161 |     },
162 |     {
163 |         'urls': [r'http://animeheaven.eu/search.php\?q=(.*)'],
164 |         'function': search,
165 |     },
166 |     {
167 |         'urls': [r'http://animeheaven.eu/watch.php\?a=(.*)&e=([0-9]+)'],
168 |         'function': _scrape_video_sources,
169 |     }
170 | ]
171 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # anime-scrapers
2 | 
3 | Anime scrapers is a collection of anime site scrapers unified behind a single, consistent interface.
4 | 
5 | ## Table of Contents
6 | - [Installation](#installation)
7 | - [Mac / OSX](#mac-osx)
8 | - [General Installation](#general-installation)
9 | - [Usage](#usage)
10 | - [Functions](#functions)
11 | - [Handlers](#handlers)
12 | - [Scraper Handler](#scraper_handler)
13 | - [Download Handler](#download_handler)
14 | - [Info Handler](#info_handler)
15 | - [Individual Scraper Functions](#individual-scraper-functions)
16 | - [Contributing](#contributing)
17 | - [URL Handling (matching_urls)](#url-handling-matching_urls)
18 | - [Adding a Scraper](#adding-a-scraper)
19 | - [Adding a Downloader](#adding-a-downloader)
20 | - [Adding an Info Collector](#adding-an-information-collector)
21 | - [Credits](#credits)
22 | 
23 | ## Installation
24 | 
25 | ### Mac (OSX)
26 | - Install [Brew](https://brew.sh/)
27 | - Install python3
28 |   - `brew install python3`
29 | - Continue to general installation
30 | 
31 | ### General Installation
32 | - Clone the repository
33 |   - `git clone https://github.com/jQwotos/anime_scrapers`
34 | - Navigate into the repository
35 |   - `cd anime_scrapers`
36 | - Install the required Python packages
37 |   - `pip install -r requirements.txt`
38 | 
39 | ## Usage
40 | 
41 | anime_scrapers is a backend library meant to be driven by other applications. You can also use it directly from a Python shell, but pairing it with a frontend application is usually more convenient.
42 | 
43 | ### Functions
44 | 
45 | #### Handlers
46 | 
47 | Handlers are all classes, but each handler module also exposes a premade instance, so you don't need to create a new object each time.
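For instance, the premade instances can be driven straight from Python. This is only a minimal sketch; it assumes the cloned repository is importable as a package named `anime_scrapers` from its parent directory, which is not something the README prescribes.

```
# Minimal usage sketch -- the package/import path is an assumption based on
# the repository layout.
from anime_scrapers.scraper_handler import scraper_handler
from anime_scrapers.info_handler import info_handler

# Query every scraper module; each module returns its own list of results.
results = scraper_handler.search("one punch man")

# Resolve a show link from one of those results into episodes and sources.
show = scraper_handler.resolve(results[0][0]['link'])

# Look up show metadata through the info collectors.
info = info_handler.search("one punch man")
```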
48 | 
49 | Every handler module follows this pattern. For example, `scraper_handler.py` defines
50 | 
51 | `class ScraperHandler:`
52 | 
53 | and exposes the ready-made instance
54 | 
55 | `scraper_handler`
56 | 
57 | ##### scraper_handler
58 | - `search(query, limited_modules=None)`
59 |   - Searches all scraper modules (or only the ones named in `limited_modules`)
60 | - `resolve(link)`
61 |   - Finds the matching function in the right module and returns its response
62 | 
63 | ##### download_handler
64 | - `resolve(data)`
65 |   - Takes in download data (typically obtained from a `scraper_handler` resolve) in this format:
66 | ```
67 | {
68 |     'epNum': 'name of file',
69 |     'sources': [{
70 |         'link': 'link',
71 |         'type': 'typically mp4 or iframe',
72 |     }]
73 | }
74 | ```
75 | 
76 | ##### info_handler
77 | 
78 | For information gathering, use `info_handler.py`. Its functions are:
79 | 
80 | ```
81 | # strict is a boolean; if True, only exact matches for the query are returned.
82 | search(query, strict):
83 |     return [
84 |         {
85 |             'id': 'id of the show (int)',
86 |             'titles': 'other names of the show (list of str)',
87 |         }
88 |     ]
89 | ```
90 | 
91 | ```
92 | getDetailedInfo(id):
93 |     return [
94 |         {
95 |             'id': 'the id passed as the parameter (int)',
96 |             'other-show-stuff': 'other info related to the show;
97 |                 see anidb.py in info_collectors for an example',
98 |             ...
99 |         }
100 |     ]
101 | ```
102 | 
103 | #### Individual Scraper Functions
104 | 
105 | ```
106 | scrape_all_show_sources(link):
107 |     return {
108 |         'episodes': [
109 |             {
110 |                 'epNum': 'number as a string',
111 |                 'sources': sourceType
112 |             }
113 |         ],
114 |         'title': 'title of the show',
115 |         'status': 'status of the show',
116 |         'host': 'host such as gogoanime',
117 |         'released': 'year released as a string',
118 |     }
119 | ```
120 | 
121 | ```
122 | search(query):
123 |     return [
124 |         {
125 |             'link': 'link to the show',
126 |             'title': 'title of the show',
127 |             'language': 'sub or dub',
128 |         },
129 |     ]
130 | ```
131 | 
132 | ```
133 | _scrape_video_sources(link):
134 |     return {
135 |         'epNum': 'episode number as a string',
136 |         'sources': sourceType
137 |     }
138 | ```
139 | 
140 | sourceType entries are in the following format:
141 | ```
142 | [
143 |     {
144 |         'link': 'link to the mp4 or iframe embed',
145 |         'type': 'mp4 or iframe',
146 |     }
147 | ]
148 | ```
149 | 
150 | ## Contributing
151 | 
152 | Want to add a downloader, scraper, or info collector?
153 | Each module must provide
154 | - URL handling via a `matching_urls` variable, described below.
155 | 
156 | ### URL Handling (matching_urls)
157 | The handlers try each module's URL patterns until one matches the given link. Each module therefore contains the following variable, which the handlers use to identify the correct module and function when resolving links.
158 | ```
159 | matching_urls = [
160 |     {
161 |         'urls': ['regex match expression'],
162 |         'function': function that should be called,
163 |     },
164 | ]
165 | ```
166 | 
167 | ### Adding a Scraper
168 | Scrapers handle search queries, scraping episodes from hosts, and scraping sources from those episodes.
169 | 
170 | Refer to [Functions](#functions) for data formatting.
171 | 
172 | Scrapers should have a couple of functions:
173 | - `search`
174 | - `scrape_all_show_sources`
175 |   - scrapes all show details and episodes along with their direct sources
176 | 
177 | Optionally there can also be
178 | - `_scrape_episode_source`
179 |   - scrapes a single episode's source
180 | 
181 | 
182 | Scrapers should be put into the `scrapers` folder.
183 | 
184 | ### Adding a Downloader
185 | Downloaders extract the direct link to the video file and download it to the given filename; a minimal module skeleton is sketched below.
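As a rough illustration, a downloader module could be laid out like this. It is only a sketch: the host name, URL pattern, and the use of `requests` here are illustrative assumptions, not part of the project.

```
import logging

import requests

site_name = 'examplehost'  # hypothetical host, for illustration only


def download(link, filename):
    # Stream the remote file to disk; report success with a boolean.
    request = requests.get(link, stream=True)
    if request.status_code != 200:
        logging.error("Failed to fetch '%s'." % (link,))
        return False
    with open(filename, 'wb') as f:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return True


matching_urls = [
    {
        'urls': [r'https://examplehost.example/video/(.*).mp4'],
        'function': download,
    },
]
```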
186 | 187 | Downloaders need these functions. 188 | - `download(link, filename)` 189 | - returns True when download is successful or false if failed. 190 | 191 | Downloaders should be put into the `downloaders` folder 192 | 193 | ### Adding an information collector 194 | Information collectors collect various information about a particular anime series/movie. 195 | 196 | They need these functions, which are mentioned in details above. 197 | - search 198 | - getDetailedInfo 199 | 200 | Info collectors should also have the following variables 201 | - matching_urls 202 | 203 | - Put them in the `info_collectors` folder 204 | 205 | ## Credits 206 | - jQwotos 207 | - FadedCoder 208 | - DxCx (NineAnimeUrlExtender) 209 | -------------------------------------------------------------------------------- /scrapers/masteranime.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | # import requests 5 | # Not working, using below instead 6 | import cfscrape 7 | 8 | import demjson 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | site_name = 'masteranime' 13 | requests = cfscrape.create_scraper() 14 | 15 | BASE_URL = "https://www.masterani.me" 16 | SEARCH_URL = "%s/api/anime/search" % (BASE_URL,) 17 | SHOW_URL = "%s/anime/info/" % (BASE_URL,) 18 | EPISODE_LIST_URL = "%s/api/anime/{ID}/detailed" % (BASE_URL,) 19 | POSTER_URL = ("%s/poster/3/" % BASE_URL).replace("www", "cdn") 20 | 21 | showid_pat = re.compile("%s([0-9]+)-" % (SHOW_URL,)) 22 | sources_pat = re.compile('mirrors:(.*?), auto_update: \[1') 23 | # sources_pat_2 = re.compile('\[(.*)\]') 24 | multi_source_pat = [ 25 | { 26 | 'pat': sources_pat, 27 | 'secondary': False, 28 | }, 29 | { 30 | 'pat': re.compile("var videos = (\[.*?\])"), 31 | 'secondary': True, 32 | } 33 | ] 34 | 35 | ''' 36 | { 37 | 'pat': sources_pat_2, 38 | 'secondary': True, 39 | }, 40 | ''' 41 | 42 | 43 | def _combine_link(url): 44 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '') 45 | 46 | 47 | def _merge_slug(location, slug): 48 | return _combine_link("/anime/%s/%s" % (location, slug,)) 49 | 50 | 51 | def _merge_poster(poster_url): 52 | return "%s%s" % (POSTER_URL, poster_url,) 53 | 54 | 55 | def _extract_single_search(data): 56 | return { 57 | 'link': _merge_slug("info", data['slug']), 58 | 'title': data['title'], 59 | 'id': data['id'], 60 | 'language': 'sub', # masteranime only has subs 61 | 'host': site_name, 62 | 'poster': _merge_poster(data['poster']['file']), 63 | } 64 | 65 | 66 | def _extract_multiple_search(data): 67 | return [_extract_single_search(x) for x in data] 68 | 69 | 70 | # Masteranime has a hidden api 71 | # that we can abuse, this makes it easier 72 | # so that we don't need to webscrape as much. 
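# (Concretely, the search() below just issues a GET request to
# https://www.masterani.me/api/anime/search with 'search' and 'sb'
# query parameters and parses the JSON response -- no HTML parsing needed.)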
73 | def search(query): 74 | params = { 75 | 'search': query, 76 | 'sb': 'true', 77 | } 78 | data = requests.get(SEARCH_URL, params=params).json() 79 | 80 | return _extract_multiple_search(data) 81 | 82 | 83 | def _scrape_show_id(link): 84 | return re.findall(showid_pat, link)[0] 85 | 86 | 87 | def _scrape_single_video_source(data, **kwargs): 88 | if 'secondary' in kwargs and kwargs['secondary'] is True: 89 | return { 90 | 'link': data['src'], 91 | 'quality': data['res'], 92 | 'type': data['type'], 93 | } 94 | 95 | combined = '%s%s' % (data['host']['embed_prefix'], data['embed_id']) 96 | if data['host']['embed_suffix'] is not None: 97 | combined = "%s%s" % (combined, data['host']['embed_suffix']) 98 | return { 99 | 'link': combined, 100 | 'type': '', 101 | 'quality': data['quality'], 102 | 'id': data['id'], 103 | } 104 | 105 | ''' 106 | def _scrape_video_sources(link): 107 | logging.info("Scraping sources for %s under masteranime." % (link,)) 108 | data = BeautifulSoup(requests.get(link).content, 'html.parser') 109 | scripts = data.findAll("script") 110 | sources = str(scripts[3]) 111 | 112 | encoded_sources = re.findall(sources_pat, sources) 113 | 114 | # If the sources are located in the first primary script location 115 | if len(encoded_sources) > 0: 116 | sources = demjson.decode(encoded_sources[0]) 117 | return [_scrape_single_video_source(x) for x in sources] 118 | # If the sources are in the second location 119 | else: 120 | script = str(scripts[2]) 121 | encoded_sources = re.findall(sources_pat_2, script) 122 | encoded_sources = "[%s]" % (encoded_sources[0],) 123 | print(encoded_sources) 124 | sources = demjson.decode(encoded_sources) 125 | return [_scrape_single_video_source(x, secondary=True) for x in sources] 126 | ''' 127 | 128 | 129 | def _scrape_video_sources(link): 130 | logging.info("Scraping sources for %s under masteanime." 
% (link,)) 131 | data = BeautifulSoup(requests.get(link).content, 'html.parser') 132 | scripts = data.findAll('script') 133 | scripts = scripts[2:] 134 | for script in scripts: 135 | for reSource in multi_source_pat: 136 | encoded_sources = re.findall(reSource.get('pat'), str(script)) 137 | if len(encoded_sources) > 0: 138 | sources = demjson.decode(encoded_sources[0]) 139 | return [ 140 | _scrape_single_video_source(x, secondary=reSource.get('secondary')) 141 | for x in sources 142 | ] 143 | 144 | 145 | def _parse_list_single(data, link): 146 | data = data['info'] 147 | link = "%s/%s" % (link, data['episode']) 148 | return { 149 | 'epNum': data['episode'], 150 | 'sources': _scrape_video_sources(link), 151 | } 152 | 153 | 154 | def _parse_list_multi(data): 155 | logging.info( 156 | "A request for scraping all sources from %s under masteranime" 157 | % (data['link'],) 158 | ) 159 | return [_parse_list_single(x, data['link']) for x in data['episodes']] 160 | 161 | 162 | def _load_list_episodes(data): 163 | slug = data.get('info').get('slug') 164 | link = _merge_slug("watch", slug) 165 | data['link'] = link 166 | return _parse_list_multi(data) 167 | 168 | 169 | def _parse_status(status): 170 | statuses = ['completed', 'airing'] 171 | return statuses[status] 172 | 173 | 174 | def scrape_all_show_sources(link): 175 | id = _scrape_show_id(link) 176 | updatedLink = EPISODE_LIST_URL.replace('{ID}', id) 177 | data = requests.get(updatedLink).json() 178 | episodes = _load_list_episodes(data) 179 | data = data['info'] 180 | data.update({ 181 | 'episodes': episodes, 182 | 'status': _parse_status(data['status']), 183 | }) 184 | return data 185 | 186 | 187 | matching_urls = [ 188 | { 189 | 'urls': [r'https://www.masterani.me/anime/info/(.*)'], 190 | 'function': scrape_all_show_sources, 191 | }, 192 | { 193 | 'urls': [], 194 | 'function': search, 195 | }, 196 | { 197 | 'urls': [r'https://www.masterani.me/anime/watch/(.*)/([0-9]+)'], 198 | 'function': _scrape_video_sources, 199 | } 200 | ] 201 | --------------------------------------------------------------------------------