├── requirements.txt
├── test.py
├── info_handler.py
├── scraper_handler.py
├── info_collectors
│   ├── _init_anidb.py
│   └── anidb.py
├── .gitignore
├── download_handler.py
├── downloaders
│   ├── mp4.py
│   ├── mycloud.py
│   └── vidstreaming.py
├── templates
│   └── module_search.py
├── scrapers
│   ├── anime9.py
│   ├── gogoanime.py
│   ├── animeheaven.py
│   └── masteranime.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | cfscrape
3 | requests
4 | furl
5 | html5lib
6 | lxml
7 | demjson
8 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | fileLocation = os.path.realpath(__file__)
3 | directory = os.path.dirname(fileLocation)
4 | print(os.path.join(directory, ".."))
5 |
--------------------------------------------------------------------------------
/info_handler.py:
--------------------------------------------------------------------------------
1 | import glob
2 |
3 | from .templates.module_search import ModuleSearch
4 |
5 |
6 | class InfoHandler(ModuleSearch):
7 |
8 | def __init__(self):
9 | self._get_modules('info_collectors')
10 |
11 | def _search_module(self, query, strict, module):
12 | return module.search(query, strict)
13 |
14 | def search(self, query, strict=False):
15 | return [
16 | self._search_module(query, strict, x)
17 | for x in self.modules
18 | ]
19 |
20 | def _details_module(self, id, module):
21 | return module.getDetailedInfo(id)
22 |
23 | def getDetailedInfo(self, id):
24 | return [
25 | self._details_module(id, x) for x in self.modules
26 | ]
27 |
28 |
29 | info_handler = InfoHandler()
30 |
--------------------------------------------------------------------------------
/scraper_handler.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import imp
3 | import logging
4 | import os
5 | import re
6 |
7 | from difflib import SequenceMatcher
8 | from .templates.module_search import ModuleSearch
9 |
10 |
11 | class ScraperHandler(ModuleSearch):
12 | # Deals with resolving the scraping of links
13 | # Automatically resolves with modules in
14 | # the scrapers folder.
15 | def __init__(self):
16 | self._get_modules('scrapers')
17 |
18 | def _search_module(self, query, module):
19 | return module.search(query)
20 |
21 | # Searches using scraper modules based on query
22 | def search(self, query, limited_modules=None):
23 | logging.debug("Starting a search for '%s'." % (query,))
24 | return [
25 | self._search_module(query, x)
26 | for x in self.modules
27 | if limited_modules is None or x.site_name in limited_modules
28 | ]
29 |
30 | # Resolves a URL and returns data from
31 | # proper module and function
32 | def resolve(self, link):
33 | logging.debug(
 34 |             "Starting a resolution for '%s' "
 35 |             "under scraper_handler." % (link,)
36 | )
37 | for module in self.modules:
38 | functions = self._try_match_module(link, module)
39 | if len(functions) > 0:
40 | return functions[0](link)
41 | return None
42 |
43 |
44 | def score_similarity(stringA, stringB):
45 | return SequenceMatcher(None, stringA, stringB).ratio()
46 |
47 | scraper_handler = ScraperHandler()
48 |
--------------------------------------------------------------------------------
/info_collectors/_init_anidb.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | import requests
3 | import os
4 |
5 |
6 | BASE_PATH = os.path.dirname(os.path.realpath(__file__))
7 | INFO_FILE = os.path.join(BASE_PATH, "last_download.txt")
8 | DOWNLOAD_URL = "http://anidb.net/api/anime-titles.xml.gz"
9 | DOWNLOAD_FILE = os.path.join(BASE_PATH, "anime-titles.xml")
10 |
11 |
12 | class DownloadList:
13 |
14 | def __init__(self):
15 | self.need_download = self.need_to_download()
16 |
17 | def need_to_download(self):
18 | try:
19 | with open(INFO_FILE, "r") as f:
20 | data = f.readline()
21 | if len(data) > 0:
22 | last_download = date.fromordinal(int(data))
23 | time_delta = date.today() - last_download
24 | if time_delta.days > 7:
25 | return True
26 | else:
27 | return False
28 | else:
29 | return True
30 | except FileNotFoundError:
31 | return True
32 | return False
33 |
34 | def write_today_ordinal(self):
35 | with open(INFO_FILE, "w") as f:
36 | f.write(str(date.today().toordinal()) + "\n")
37 |
38 | def download_list(self):
39 | request = requests.get(DOWNLOAD_URL, stream=True)
40 | with open(DOWNLOAD_FILE, "wb") as f:
41 | for chunk in request.iter_content(chunk_size=1024):
42 | if chunk:
43 | f.write(chunk)
44 |
45 | def get_file(self):
46 | if self.need_to_download():
47 | self.write_today_ordinal()
48 | self.download_list()
49 |
50 |
51 | download_list = DownloadList()
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # Jupyter Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # SageMath parsed files
79 | *.sage.py
80 |
81 | # Environments
82 | .env
83 | .venv
84 | env/
85 | venv/
86 | ENV/
87 |
88 | # Spyder project settings
89 | .spyderproject
90 | .spyproject
91 |
92 | # Rope project settings
93 | .ropeproject
94 |
95 | # mkdocs documentation
96 | /site
97 |
98 | # My files
99 | anime-titles.xml.gz
100 | anime-titles.xml
101 | last_download.txt
--------------------------------------------------------------------------------
/download_handler.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from .templates.module_search import ModuleSearch
4 |
5 |
6 | class DownloadHandler(ModuleSearch):
7 | # Deals with resolving downloading of files
8 | def __init__(self):
9 | self._get_modules('downloaders')
10 |
11 | def single_download(self, link, abs_path):
12 | """
13 | Download a single episode.
14 | 'link' is the full link of it (get it with scraper_handler).
15 | 'abs_path' is full path + filename of downloaded file, example -
16 | "/home/User/MyDownloadedEpisode.mp4"
17 | """
18 | for module in self.modules:
19 | if self._try_match_module(link, module):
20 | if module.download(link, abs_path):
21 | return True
22 | return False
23 | return False
24 |
25 | def resolve(self, data):
26 | logging.info(
27 | "Trying to resolve '%s'"
28 | % (data['epNum'])
29 | )
30 | for module in self.modules:
31 | for source in data['sources']:
32 | logging.info(
33 | "Trying to resolve '%s' source."
34 | % (source['link'])
35 | )
36 | if self._try_match_module(source['link'], module):
37 | logging.info(
38 | "Found a matching module for '%s'."
39 | % (source,)
40 | )
 41 |                     fileName = ("%s.mp4" % (data['epNum'],)
 42 |                                 if 'epNum' in data else source)
43 | if module.download(source['link'], fileName):
44 | break
45 |
46 | download_handler = DownloadHandler()
47 |
--------------------------------------------------------------------------------
/downloaders/mp4.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import logging
5 |
6 | import requests
7 |
8 |
9 | class Timer:
10 | def restart(self, request):
11 | self.length = int(request.headers.get('content-length'))
 12 |         self.start = time.monotonic()
13 |
14 | self.current = 0
15 |
16 | def __init__(self, request):
17 | self.restart(request)
18 |
19 | def tick(self, chunk_size):
20 | self.current += chunk_size
 21 |         speed = round(self.current // (time.monotonic() - self.start) / 1000000, 2)
22 | percentComplete = round((self.current / self.length) * 100, 1)
23 | sys.stdout.write(
24 | "\r %s Mbps | %r Percent Complete"
25 | % (speed, percentComplete)
26 | )
27 |
28 |
29 | def download(link, filename):
30 | logging.info("Starting download for %s." % (link,))
31 | tempName = "%s.tmp" % (filename,)
32 |
33 | with open(tempName, 'wb') as f:
34 | request = requests.get(link, stream=True)
35 |
36 | # timer = Timer(request)
37 |
38 | for chunk in request.iter_content(chunk_size=1024):
39 | # timer.tick(len(chunk))
40 |
41 | if chunk:
42 | f.write(chunk)
43 | else:
 44 |                 logging.error("Failed to get a chunk for '%s'." % (link,))
45 | logging.info("Finished downloading '%s'." % (link,))
46 | os.rename(tempName, filename)
47 | return True
48 |
49 | matching_urls = [
50 | {
51 | 'urls': [
52 | r'http://(.*).animeheaven.eu/video/(.*).mp4(.*)',
53 | r'http://(.*).animeheaven.eu/[0-9]+pi/(.*).mp4(.*)',
54 | r'https://[0-9]+.bp.blogspot.com(.*)',
55 | ],
56 | 'function': download,
57 | },
58 | ]
59 |
--------------------------------------------------------------------------------
/templates/module_search.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import imp
3 | import logging
4 | import os
5 | import re
6 |
7 |
8 | class ModuleSearch(object):
9 | def _load_single_module(self, f):
10 | return imp.load_source(f[:-3], f)
11 |
12 | def _load_modules(self):
13 | return [self._load_single_module(x) for x in self.modules]
14 |
15 | def _try_match_url(self, link, matchingURL):
16 | return True if re.match(matchingURL, link) is not None else False
17 |
18 | def _try_match_module_section(self, link, section):
19 | urls = section['urls']
20 | matches = [
21 | section['function'] for x in urls
22 | if self._try_match_url(link, x) is not False
23 | ]
24 | return True if len(matches) > 0 else False
25 |
26 | def _try_match_module(self, link, module):
27 | sections = module.matching_urls
28 | return [x['function'] for x in sections
29 | if self._try_match_module_section(link, x) is not False]
30 |
31 | def __is_underscore(self, f):
32 | if f[f.rfind('/') + 1] == "_":
33 | return True
34 | return False
35 |
36 | def _get_modules(self, location):
37 | fileLocation = os.path.realpath(__file__)
38 | directory = os.path.dirname(fileLocation)
39 | self.module_location = os.path.join(directory, '..', location)
40 | self.modules = glob.glob("%s/*.py" % (self.module_location))
41 | self.modules = [
42 | module for module in self.modules
43 | if not self.__is_underscore(module)
44 | ]
45 | '''
46 | for i in range(len(self.modules)): # Delete modules beginning with '_'
47 | module = self.modules[i]
48 | if module[module.rfind("/") + 1] == "_":
49 | del self.modules[i]
50 | '''
51 | self.modules = self._load_modules()
52 |
--------------------------------------------------------------------------------
/downloaders/mycloud.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import re
4 | import subprocess
5 |
6 | import requests
7 |
8 | from bs4 import BeautifulSoup
9 |
10 | types = ['iframe']
11 |
 12 | MY_CLOUD_PAT = re.compile(...)  # NOTE: the pattern literal and the remainder of mycloud.py were lost from this dump
--------------------------------------------------------------------------------
/info_collectors/anidb.py:
--------------------------------------------------------------------------------
(NOTE: lines 1-43 of anidb.py -- the imports, BASE_URL, IMAGE_URL, CLIENT,
CLIENT_VERSION, MIN_SIMILARITY_RATIO, and the beginning of search(), which
compares each title's similarity ratio against MIN_SIMILARITY_RATIO -- were
lost from this dump.)
44 | if ratio > highest_ratio:
45 | highest_ratio = ratio
46 | if not highest_ratio:
47 | continue
48 | ratio_list.append(highest_ratio)
49 | id = int(anime['aid'])
50 | titles = [title.string for title in
51 | anime.findAll("title", attrs={"type": ["main", "official"]})]
52 | results.append({"id": id, "titles": titles})
53 | return [x for (y, x) in
54 | sorted(list(zip(ratio_list, results)),
55 | key=lambda pair: pair[0], reverse=True)]
56 |
57 |
58 | def getDetailedInfo(id):
59 | '''
60 | Gets a detailed info from the ID provided. A dict is returned with
61 | the following keys. The type of the value is also mentioned.
62 |
 63 |     id: int, type: str, episode_count: str, start_date: str, end_date: str,
 64 |     other_names: [str], creators: [{str: str}], permanent_rating: float,
 65 |     image_url: str, description: str
66 | '''
67 | request = requests.get(BASE_URL, params={
68 | "request": "anime",
69 | "aid": str(id),
70 | "protover": "1",
71 | "client": CLIENT,
72 | "clientver": str(CLIENT_VERSION)
73 | })
74 | request.raise_for_status()
75 | result_page = BeautifulSoup(request.text, "xml")
76 |
77 | results = {
78 | "id": id,
79 | "type": result_page.find("type").string,
80 | "episode_count": result_page.find("episodecount").string,
81 | "start_date": result_page.find("startdate").string,
82 | "end_date": result_page.find("enddate").string,
83 | "other_names": [title.string for title in
84 | result_page.find("titles").findAll("title")],
85 | "creators": [{name['type']: name.string}
86 | for name in result_page.find("creators").findAll("name")],
87 | "permanent_rating": float(result_page.find("ratings")
88 | .find("permanent").string),
89 | "image_url": IMAGE_URL + result_page.find("picture").string,
90 | "description": result_page.find("description").string
91 | }
92 | return results
93 |
94 |
95 | matching_urls = [
96 | {
97 | 'urls': [],
98 | 'function': search,
99 | },
100 | {
101 | 'urls': [],
102 | 'function': getDetailedInfo,
103 | },
104 | ]
105 |
--------------------------------------------------------------------------------
/scrapers/anime9.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import requests
4 |
5 | from bs4 import BeautifulSoup as bs
6 |
7 | site_name = "9anime.is"
8 |
9 | BASE_URL = 'https://9anime.is'
10 | SEARCH_URL = '%s/search' % (BASE_URL,)
11 | INFO_API_URL = "%s/ajax/episode/info" % (BASE_URL,)
12 |
13 |
14 | def _parse_search_single(data):
15 | img = data.find("img")
16 | nameAnchor = data.find("a", {"class": "name"})
17 | lang = data.find('div', {'class': 'lang'})
18 | lang = lang.text if lang is not None else 'sub'
19 |
20 | return {
21 | 'title': nameAnchor.text,
22 | 'link': nameAnchor['href'],
23 | 'language': lang.lower(),
24 | 'host': site_name,
25 | 'poster': img['src']
26 | }
27 |
28 |
29 | def _parse_search_multi(data):
30 | return [
31 | _parse_search_single(x)
32 | for x in data.findAll("div", {"class": "item"})
33 | ]
34 |
35 |
36 | def search(query):
37 | params = {
38 | 'keyword': query,
39 | }
 40 |     data = bs(requests.get(SEARCH_URL, params=params).content, 'html.parser')
41 |
42 | return _parse_search_multi(data)
43 |
44 |
45 | def _scrape_episode_source(data):
46 | return {
47 | 'link': data['file'],
48 | 'type': data['type'],
49 | 'quality': data['label'],
50 | }
51 |
52 |
53 | def _scrape_episode_sources(data):
54 | request = requests.get(data['grabber'], params=data['params']).json()
55 | return [_scrape_episode_source(x) for x in request['data']]
56 |
57 |
58 | def _scrape_episode_info(id):
 59 |     logging.debug("'%s' is performing an info grab for '%s'" % (site_name, id,))
60 | params = {'id': id}
61 | data = requests.get(INFO_API_URL, params=params)
62 | if data.status_code == 200:
63 | data = data.json()
64 | if data.get('target') == '' or data.get('type') == 'direct':
65 | return _scrape_episode_sources(data)
66 | else:
67 | return {
68 | 'link': data.get('target'),
69 | 'type': data.get('type'),
70 | }
71 |
72 |
73 | def _parse_server_single_episode(data):
74 | anchor = data.find("a")
75 | id = anchor['data-id']
76 | output = {
77 | 'data-id': id,
78 | 'epNum': anchor.text,
79 | 'sources': _scrape_episode_info(id),
80 | }
81 | return output if output['sources'] is not None else None
82 |
83 |
84 | def _parse_server_episodes(data):
85 | episodes = data.findAll("li")
86 | sources = [_parse_server_single_episode(x) for x in episodes]
87 | if len(sources) > 0:
88 | return list(filter(None, sources))
89 |
90 |
91 | def _scrape_all_servers(data):
92 | servers = data.findAll("ul", {"class": "episodes range active"})
93 | sourcedServers = [_parse_server_episodes(x) for x in servers]
94 | return list(filter(None, sourcedServers))
95 |
96 |
97 | def format_combine_multi(unformatedOutput):
98 | output = []
99 | for ep in unformatedOutput:
100 | output.append({
101 | 'epNum': str(int(ep)), # remove leading 0s
102 | 'sources': unformatedOutput[ep]
103 | })
104 | return output
105 |
106 |
107 | def combine_multi(servers):
108 | unformatedOutput = {}
109 |     logging.debug("Combining episodes from %d servers." % (len(servers),))
110 | for server in servers:
111 | for ep in server:
112 | if ep['epNum'] not in unformatedOutput:
113 | unformatedOutput[ep['epNum']] = [ep['sources']]
114 | else:
115 | unformatedOutput[ep['epNum']] += [ep['sources']]
116 |
117 | return format_combine_multi(unformatedOutput)
118 |
119 |
120 | def _scrape_title(data):
121 | return data.find('h1', {'class': 'title'}).text
122 |
123 |
124 | def scrape_all_show_sources(link):
125 | logging.info(
126 | "A request for '%s' was made under %s scraper." %
127 | (link, site_name)
128 | )
129 | data = bs(requests.get(link).content, 'html.parser')
130 | body = data.find('body')
131 | servers = _scrape_all_servers(data)
132 | return {
133 | 'episodes': combine_multi(servers),
134 | 'title': _scrape_title(data),
135 | 'host': site_name,
136 | }
137 |
138 | matching_urls = [
139 | {
140 | 'urls': [
141 | r'https://9anime.to/watch/(.*).(.*)',
142 | r'https://9anime.is/watch/(.*).(.*)'
143 | ],
144 | 'function': scrape_all_show_sources,
145 | },
146 | ]
147 |
--------------------------------------------------------------------------------
/downloaders/vidstreaming.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import sys
4 | import logging
5 |
6 | import requests
7 |
8 | from furl import furl
9 | from bs4 import BeautifulSoup
10 |
11 | BASE_PATH = os.path.dirname(os.path.realpath(__file__))
12 | sys.path.append(BASE_PATH)
13 | import mp4
14 |
15 | site_name = 'vidstream'
16 |
17 | BASE_URL = "https://vidstreaming.io"
18 | DOWNLOAD_URL = "https://vidstream.co/download"
19 |
20 | qualities = ['1080', '720', '480', '360']
21 |
 22 | STREAMING_PAT = r'\?id=([a-zA-Z0-9]+?)(?:=|$)'
23 |
24 |
25 | def _try_match_url(link, matchingURL):
26 | return True if re.match(matchingURL, link) is not None else False
27 |
28 |
29 | def _try_match_module_section(link, section):
30 | urls = section['urls']
31 | matches = [
32 | section['function'] for x in urls
33 | if _try_match_url(link, x) is not False
34 | ]
35 | return True if len(matches) > 0 else False
36 |
37 |
38 | def resolve(link):
39 | for section in internal_matching_urls:
40 | if _try_match_module_section(link, section):
41 | logging.info("Found a match for %s" % (link,))
42 | return section['function'](link)
43 | return None
44 |
45 |
46 | def download(link, fname):
47 | logging.info("Starting download for '%s' under vidstreaming." % (link,))
48 | sources = resolve(link)['sources']
 49 |     logging.info("Received %i sources" % (len(sources)))
50 | if len(sources) > 0:
51 | source = sources[0]['link']
52 | else:
53 | logging.critical("Can't find sources on vidstreaming!")
54 | return False
55 | if source is not None:
56 | if mp4.download(source, fname):
57 | return True
58 | return False
59 |
60 |
61 | def _parse_quality(title):
62 | for q in qualities:
63 | if q in title:
64 | return q
65 | return None
66 |
67 |
68 | def _parse_list_single(data):
69 | return {
70 | 'link': data['href'],
71 | 'type': 'mp4',
72 | 'quality': _parse_quality(data.text),
73 | }
74 |
75 |
76 | def _parse_list_multi(data):
77 | box = data.find("div", {"class": "mirror_link"})
78 | sources = box.findAll("a")
79 | if len(sources) == 0:
80 | logging.critical("Can't find sources on vidstreaming!")
81 | return [_parse_list_single(x) for x in sources]
82 |
83 |
84 | def _scrape_video_sources_id(id):
85 | params = {
86 | 'id': id,
87 | }
88 | request = requests.get(DOWNLOAD_URL, params=params).content
89 | data = BeautifulSoup(request, 'html.parser')
90 | return {
91 | 'sources': _parse_list_multi(data),
92 | }
93 |
94 |
95 | def _scrape_video_sources(link):
96 | id = furl(link).args['id']
97 | logging.info("Found id %s from '%s'" % (id, link,))
98 | return _scrape_video_sources_id(id)
99 |
100 |
101 | def _parse_list_embed_single(data):
102 | return {
103 | 'link': data['src'],
104 | 'type': 'mp4',
105 | 'quality': data['label'],
106 | }
107 |
108 |
109 | def _parse_list_embed_multi(data):
110 | sources = data.findAll("source", {"type": "video/mp4"})
111 | return [_parse_list_embed_single(x) for x in sources]
112 |
113 |
114 | def _scrape_video_embed(link):
115 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
116 | result = {
117 | 'sources': _parse_list_embed_multi(data),
118 | }
119 | if len(result['sources']) == 0:
120 | logging.info('Falling back to legacy downloader for %s' % (link,))
121 |         result['sources'] = _scrape_video_sources(link)['sources']
122 | return result
123 |
124 | def _fix_link(link):
125 | fixed_link = "http:" + link
126 | return _scrape_video_embed(fixed_link)
127 |
128 | def _scrape_streaming(link):
129 | id = re.search(STREAMING_PAT, link)
130 | id = id.group(1) if id is not None else None
131 |
132 | if id:
133 | return _scrape_video_sources_id(id)
134 |
135 | return None
136 |
137 | matching_urls = [
138 | {
139 | 'urls': [
140 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)',
141 | r'https://vidstream.co/embed.php\?(.*)',
142 | r'https://vidstreaming.io/embed.php\?id=(.*)',
143 | ],
144 | 'function': download,
145 | }
146 | ]
147 |
148 | internal_matching_urls = [
149 | {
150 | 'urls': [
151 | r'https://vidstream.co/download\?id=(.*)',
152 | ],
153 | 'function': _scrape_video_sources,
154 | },
155 | {
156 | 'urls': [
157 | r'https://vidstream.co/embed.php\?(.*)',
158 | r'https://vidstreaming.io/embed.php\?id=(.*)',
159 | ],
160 | 'function': _scrape_video_embed,
161 | },
162 | {
163 | 'urls': [
164 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)',
165 | ],
166 | 'function': _scrape_streaming,
167 | }
168 | ]
169 |
--------------------------------------------------------------------------------
/scrapers/gogoanime.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | import cfscrape as cf
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | site_name = 'gogoanime'
9 |
10 | BASE_URL = "https://gogoanime.io"
11 | SEARCH_URL = "%s/search.html" % (BASE_URL,)
12 | EPISODE_LIST_URL = "%s//load-list-episode" % (BASE_URL,)
13 | SHOW_URL = "%s/category/" % (BASE_URL,)
14 |
15 | id_pat = re.compile("var id = (.*?);")
16 | streaming_name_pat = re.compile('"(.*?)"')
17 | epnum_pat = re.compile('episode-(.*?)$')
18 | released_pat = re.compile("Released: ([0-9]+)")
19 |
20 | cfscrape = cf.create_scraper()
21 |
22 |
23 | def _combine_link(url):
24 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '')
25 |
26 |
27 | def _parse_released_date(data):
28 | fullString = str(data.find("p", {"class": "released"}))
29 | output = re.findall(released_pat, fullString)
30 | return output[0] if len(output) > 0 else None
31 |
32 |
33 | def _extract_single_search(data):
34 | name = data.find('p', {'class': 'name'}).find('a')
35 | return {
36 | 'link': _combine_link(name['href']),
37 | 'title': name.text,
38 | 'language': 'dub' if 'dub' in name.text.lower() else 'sub',
39 | 'released': _parse_released_date(data),
40 | 'host': site_name,
41 | }
42 |
43 |
44 | def _extract_multiple_search(data):
45 | entries = data.find('ul', {'class': 'items'}).findAll("li")
46 | return [_extract_single_search(x) for x in entries]
47 |
48 |
49 | def search(query):
50 | '''
51 | Returns all search results based on a query
52 | [
53 | {
54 | 'link': 'link to show on gogoanime',
55 | 'title': 'the full title of the show',
56 | 'language': 'either subbed or dubbed',
57 | }
58 | ]
59 | '''
60 | params = {
61 | 'keyword': query,
62 | 'id': -1,
63 | }
64 |
65 | data = cfscrape.get(SEARCH_URL, params=params).content
66 | data = BeautifulSoup(data, 'html.parser')
67 |
68 | return _extract_multiple_search(data)
69 |
70 |
71 | def _parse_list_single(data):
72 | return {
73 | 'name': data.find("div", {"class": "name"}).text,
74 | 'link': _combine_link(data['href']),
75 | 'language': data.find("div", {"class": "cate"}).text.lower(),
76 | 'type': 'iframe',
77 | }
78 |
79 |
80 | def _parse_list_multi(data):
81 | episodes = data.findAll("a")
82 | return [_parse_list_single(x) for x in episodes]
83 |
84 |
85 | def _load_list_episode(id):
86 | params = {
87 | 'ep_start': 0,
88 | 'ep_end': 9999999,
89 | 'id': id,
90 | 'default_ep': 0,
91 | }
92 | data = cfscrape.get(EPISODE_LIST_URL, params=params).content
93 | data = BeautifulSoup(data, 'html.parser')
94 | return _parse_list_multi(data)
95 |
96 |
97 | def _scrape_show_id(data):
98 | return re.findall(id_pat, str(data))
99 |
100 |
101 | def _scrape_title(data):
102 | return data.find("div", {"class": "anime_info_body_bg"}).find('h1').text
103 |
104 |
105 | def _scrape_status(data):
106 | return data.findAll('p', {'class': 'type'})[4].text.replace('Status: ', '')
107 |
108 |
109 | def _scrape_released(data):
110 | text = data.findAll('p', {'class': 'type'})[3].text
111 | return text.replace('Released: ', '')
112 |
113 |
114 | def _scrape_epNum(url):
115 | epNum = re.findall(epnum_pat, url)
116 | return epNum[0] if len(epNum) > 0 else '0'
117 |
118 |
119 | def _scrape_single_video_source(data):
120 | return {
121 | 'link': data['data-video'],
122 | 'type': 'iframe'
123 | }
124 |
125 |
126 | def _scrape_video_sources(link):
127 | data = cfscrape.get(link).content
128 | soupedData = BeautifulSoup(data, 'html.parser')
129 |     sources = soupedData.find("div", {"class": "anime_muti_link"})
130 | sources = sources.findAll("a")
131 |
132 | return {
133 | 'epNum': _scrape_epNum(link),
134 | 'sources': list(map(
135 | lambda x: _scrape_single_video_source(x),
136 | sources)
137 | ),
138 | }
139 |
140 |
141 | def scrape_all_show_sources(link):
142 | data = cfscrape.get(link).content
143 | id = _scrape_show_id(data)
144 | data = BeautifulSoup(data, 'html.parser')
145 | episodes = _load_list_episode(id)
146 |
147 | return {
148 | 'episodes': [_scrape_video_sources(x['link']) for x in episodes],
149 | 'title': _scrape_title(data),
150 | 'status': _scrape_status(data),
151 | 'host': 'gogoanime',
152 | 'released': _scrape_released(data),
153 | }
154 |
155 | matching_urls = [
156 | {
157 | 'urls': [r'https://(.*)gogoanime.io/category/(.*)'],
158 | 'function': scrape_all_show_sources,
159 | },
160 | {
161 |         'urls': [r'https://(.*)gogoanime.io//search.html\?keyword=(.*)'],
162 | 'function': search,
163 | },
164 | {
165 | 'urls': [r'https://(.*)gogoanime.io/(.*)-episode-([0-9]+)'],
166 | 'function': _scrape_video_sources,
167 | }
168 | ]
169 |
--------------------------------------------------------------------------------
/scrapers/animeheaven.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | import requests
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | site_name = 'animeheaven'
9 |
10 | BASE_URL = "http://animeheaven.eu"
11 | SEARCH_URL = "%s/search.php" % (BASE_URL,)
12 |
 13 | # NOTE: pattern literals were mangled in this dump; minimal reconstructions from their usage below.
 14 | source_pat = re.compile(r"'(\\x.*?)'")  # hex-escaped source URL inside a <script> tag
 15 | epnum_pat = re.compile(r'e=[0-9]+')
 16 | status_pat = re.compile(r'Status:(.*)')
 17 | released_pat = re.compile(r'Year:(.*)')
18 |
19 |
20 | def _combine_link(url):
21 | # Combines the relative url with the base url
22 | return ("%s/%s" % (BASE_URL, url,)).replace(' ', '%20')
23 |
24 |
25 | def _extract_single_search(data):
26 | # Takes in bs4 data of a single search result
27 | # and returns a formated dict
28 | anchor = data.find("a")
29 | img = anchor.find("img")
30 | name = img['alt']
31 | return {
32 | 'link': _combine_link(anchor['href']),
33 | 'title': name,
34 | 'language': 'dub' if 'dub' in name.lower() else 'sub',
35 | 'host': site_name,
36 | 'poster': _combine_link(img['src']),
37 | }
38 |
39 |
40 | def _extract_multiple_search(data):
41 | # Takes in search result page
42 | # and returns list of formated results
43 | entries = data.findAll('div', {'class': 'iep'})
44 | return [_extract_single_search(x) for x in entries]
45 |
46 |
47 | def search(query):
48 | '''
49 | Returns all search results based on a query
50 | [
51 | {
52 | 'link': 'link to show on gogoanime',
53 | 'title': 'the full title of the show',
54 | 'language': 'either subbed or dubbed',
55 | }
56 | ]
57 | '''
58 | logging.info("A query for %s was made under animeheaven" % (query,))
59 | params = {'q': query}
60 | data = requests.get(SEARCH_URL, params=params).content
61 | data = BeautifulSoup(data, 'html.parser')
62 |
63 | return _extract_multiple_search(data)
64 |
65 |
66 | def _parse_list_single(data):
67 | return {
68 | 'name': data.find("div", {"class": "infoept2"}),
69 | 'link': _combine_link(data['href']),
70 | }
71 |
72 |
73 | def _parse_list_multi(data):
74 | box = data.find("div", {"class": "infoepbox"})
75 | episodes = box.findAll("a")
76 | return [_parse_list_single(x) for x in episodes]
77 |
78 |
79 | def _hex_source_to_str(source_url):
80 | return bytes(source_url, 'utf-8').decode('unicode_escape')
81 |
82 |
83 | def _scrape_single_video_source(data):
84 | source_url = re.findall(source_pat, str(data))
85 | return {
86 | 'link': _hex_source_to_str(source_url[0]) if len(source_url) > 0 else None,
87 | 'type': 'mp4',
88 | }
89 |
90 |
91 | def _scrape_epNum(url):
92 | epNum = re.search(epnum_pat, url)
93 | return epNum.group().replace('e=', '') if epNum is not None else None
94 |
95 | def _parse_multi_video_sources(data):
96 | return [_scrape_video_sources(x) for x in data]
97 |
98 |
99 | def _scrape_video_sources(link):
100 | # Scrapes details on a specific
101 | # episode of a show based on link
102 |     data = BeautifulSoup(requests.get(link).content, 'html.parser')
103 |     logging.info("Scraping video sources for %s under animeheaven" % (link,))
104 | sources = data.find("div", {'class': 'centerf2'}).findAll('script')
105 |
106 | return {
107 | 'epNum': _scrape_epNum(link),
108 | 'sources': [_scrape_single_video_source(x) for x in sources],
109 | }
110 |
111 | def _scrape_title(data):
112 | # Takes in bs4 show page
113 | # and returns the title of
114 | # the show
115 | return data.find("div", {"class": "infodes"}).text
116 |
117 |
118 | def _scrape_released(data):
119 | # Takes in bs4 show page and
120 | # returns released year as string
121 | box = data.findAll("div", {"class": 'infodes2'})
122 | if len(box) < 1: return None
123 | box = box[1]
124 | released_date = re.search(released_pat, str(box))
125 |     return released_date.group() if released_date is not None else None
126 |
127 |
128 | def _scrape_status(data):
129 | # Takes in bs4 show page and
130 | # return status of the show
131 | box = data.findAll("div", {"class": "infodes2"})
132 |     if len(box) < 1: return None
133 | box = box[1]
134 | status = re.search(status_pat, str(box))
135 | return status.group() if status is not None else None
136 |
137 |
138 | def scrape_all_show_sources(link):
139 | # Returns all show's sources and details
140 | # based on the link of the show.
141 | logging.info(
142 | "A request for '%s' was made to animeheaven scraper."
143 | % (link,)
144 | )
145 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
146 | episodes = _parse_list_multi(data)
147 | logging.debug("Got %i links for %s" % (len(episodes), link,))
148 |
149 | return {
150 | 'episodes': [_scrape_video_sources(x['link']) for x in episodes],
151 | 'title': _scrape_title(data),
152 | 'status': _scrape_status(data),
153 | 'host': 'animeheaven',
154 | 'released': _scrape_released(data),
155 | }
156 |
157 | matching_urls = [
158 | {
159 | 'urls': [r'http://animeheaven.eu/i.php\?a=(.*)'],
160 | 'function': scrape_all_show_sources,
161 | },
162 | {
163 | 'urls': [r'http://animeheaven.eu/search.php\?q=(.*)'],
164 | 'function': search,
165 | },
166 | {
167 | 'urls': [r'http://animeheaven.eu/watch.php\?a=(.*)&e=([0-9]+)'],
168 | 'function': _scrape_video_sources,
169 | }
170 | ]
171 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # anime-scrapers
2 |
  3 | Anime scrapers is a collection of anime site scrapers unified behind a common interface.
4 |
5 | ## Table of Contents
6 | - [Installation](#installation)
7 | - [Mac / OSX](#mac-osx)
8 | - [General Installation](#general-installation)
9 | - [Usage](#usage)
10 | - [Functions](#functions)
11 | - [Handlers](#handlers)
12 | - [Scraper Handler](#scraper_handler)
13 | - [Download Handler](#download_handler)
14 | - [Info Handler](#info_handler)
15 | - [Individual Scraper Functions](#individual-scraper-functions)
16 | - [Contributing](#contributing)
17 | - [URL Handling (matching_urls)](#url-handling-matching_urls)
18 | - [Adding a Scraper](#adding-a-scraper)
19 | - [Adding a Downloader](#adding-a-downloader)
20 | - [Adding an Info Collector](#adding-an-information-collector)
21 | - [Credits](#credits)
22 |
23 | ## Installation
24 |
25 | ### Mac (OSX)
26 | - Install [Brew](https://brew.sh/)
27 | - Install python3
28 | - `brew install python3`
29 | - Continue to general installation
30 |
31 | ### General Installation
32 | - Clone repository
33 | - `git clone https://github.com/jQwotos/anime_scrapers`
34 | - Nav into repo
35 | - `cd anime_scrapers`
36 | - Install required python packages
37 | - `pip install -r requirements.txt`
38 |
39 | ## Usage
40 |
 41 | anime_scrapers is a backend library meant to be used by other applications. You can also use it directly from a Python shell, but a front-end application is usually more convenient.
42 |
43 | ### Functions
44 |
45 | #### Handlers
46 |
 47 | Handlers are all classes; however, each handler module also exposes a premade instance, so you don't need to create a new object each time.
 48 |
 49 | For example, `scraper_handler.py` defines the class
 50 |
 51 | `class ScraperHandler(ModuleSearch):`
 52 |
 53 | and exposes the ready-made instance
 54 |
 55 | `scraper_handler`
56 |
57 | ##### scraper_handler
 58 | - `search(query, limited_modules=None)`
 59 |   - Searches all scraper modules, or only those whose `site_name` is listed in `limited_modules`; see the sketch below
60 | - `resolve(link)`
61 | - Finds matching function in module and returns proper response
62 |
63 | ##### download_handler
64 | - `resolve(link)`
 65 |   - Takes in download data (typically obtained from a `scraper_handler.resolve` call) in the following format:
66 | ```
67 | {
68 | 'epNum': 'name of file',
69 | 'sources': [
 70 |         {'link': 'link',
 71 |          'type': 'typically mp4 or iframe'},
72 | ]
73 | }
74 | ```
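
For example, feeding episodes from a `scraper_handler.resolve` call into the download handler might look like this (the show URL is made up; package import path as above):

```
from anime_scrapers.scraper_handler import scraper_handler
from anime_scrapers.download_handler import download_handler

show = scraper_handler.resolve("https://gogoanime.io/category/some-show")
for episode in show['episodes']:
    # saves '<epNum>.mp4' into the current working directory
    download_handler.resolve(episode)
```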
75 |
76 | ##### info_handler
77 |
 78 | For information gathering, use `info_handler.py`. Its functions are:
79 |
80 | ```
81 | # strict is a boolean, which if True, searches for exact query only.
82 | search(query, strict):
83 | return [
84 | {
85 | 'id': 'id of the show (int)',
86 | 'titles': 'Other names of the show (str)',
87 | }
88 | ]
89 | ```
90 |
91 | ```
92 | getDetailedInfo(id):
93 | return [
94 | {
95 | 'id': 'return the id from the parameter (int)',
96 | 'other-show-stuff': 'Other info related to show.
97 | See anidb.py in info_collectors for example',
98 | ...
99 | }
100 | ]
101 | ```
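
A minimal sketch of both calls (package import path as above; the query is illustrative):

```
from anime_scrapers.info_handler import info_handler

# one result list per info collector
hits = info_handler.search("cowboy bebop")
show_id = hits[0][0]['id']

# one detail dict per info collector
details = info_handler.getDetailedInfo(show_id)
print(details[0]['permanent_rating'])
```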
102 |
103 | #### Individual Scraper Functions
104 |
105 | ```
106 | scrape_all_show_sources(link):
107 | return {
108 | 'episodes': [
109 | {
110 |             'epNum': 'number as a string',
111 |             'sources': sourceType
112 | }
113 | ],
114 | 'title': 'title of the show',
115 | 'status': 'status of the show',
116 | 'host': 'host such as gogoanime',
117 | 'released': 'year released as a string',
118 | }
119 | ```
120 |
121 | ```
122 | search(query):
123 | return [
124 | {
125 | 'link': 'link to the show',
126 | 'title': 'title of the show',
127 | 'language': 'sub or dub',
128 | },
129 | ]
130 | ```
131 |
132 | ```
133 | _scrape_video_sources(link):
134 | return {
135 | 'epNum': 'episode number as a string',
136 | 'sources': sourceType
137 | }
138 | ```
139 |
140 | SourceTypes are in the following format.
141 | ```
142 | [
143 | {
144 | 'link': 'link to the mp4 or iframe embed',
145 | 'type': 'mp4 or iframe',
146 | }
147 | ]
148 | ```
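
Since scraper modules are plain Python files (the handlers load them with `imp.load_source`), a single scraper can also be used on its own. A sketch with gogoanime, assuming the relative path below points into your checkout:

```
import imp

gogoanime = imp.load_source('gogoanime', 'scrapers/gogoanime.py')

results = gogoanime.search('naruto')  # list of search dicts as described above
show = gogoanime.scrape_all_show_sources(results[0]['link'])
print(show['title'], show['released'])
```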
149 |
150 | ## Contributing
151 |
152 | Want to add a downloader, scraper, or info collector?
153 | Each module must have
154 | - URL handling: a `matching_urls` variable that looks like the one shown below.
155 |
156 | ### URL Handling (matching_urls)
157 | Handlers try each module's URL patterns until one matches the given link. Each scraper contains the following variable, which the handler uses to identify the correct module and function when resolving a link.
158 | ```
159 | matching_urls = [
160 | {
161 | 'urls': ['regex match expression'],
162 | 'function': function that should be called,
163 | },
164 | ]
165 | ```
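
In other words, a handler tries each entry's regular expressions against the link and calls the first function whose pattern matches, roughly like this (mirroring `templates/module_search.py`; the link is made up):

```
import re

link = 'https://gogoanime.io/category/made-up-show'
for section in matching_urls:
    if any(re.match(pattern, link) for pattern in section['urls']):
        result = section['function'](link)
        break
```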
166 |
167 | ### Adding a Scraper
168 | Scrapers handle search queries, scraping episodes from hosts and scraping sources from those episodes.
169 |
170 | Refer to [Functions](#functions) for the data formats.
171 |
172 | Scrapers should have a couple of functions
173 | - `search`
174 | - `scrape_all_show_sources`
175 | - scrapes all show details and episodes along with their direct sources
176 |
177 | Optionally there can also be
178 | - `_scrape_episode_source`
179 | - scrapes a single episode's source
180 |
181 |
182 | Scrapers should be put into the `scrapers` folder
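
A bare-bones skeleton of a new scraper module might look like the following (the site name and URL pattern are placeholders; only the function names and the `matching_urls` shape follow the existing scrapers):

```
site_name = 'examplesite'


def search(query):
    # return a list of {'link', 'title', 'language', 'host'} dicts
    return []


def scrape_all_show_sources(link):
    # return the show dict described under "Individual Scraper Functions"
    return {'episodes': [], 'title': '', 'status': '',
            'host': site_name, 'released': ''}


matching_urls = [
    {
        'urls': [r'https://examplesite.example/show/(.*)'],
        'function': scrape_all_show_sources,
    },
]
```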
183 |
184 | ### Adding a downloader
185 | Downloaders extract the direct link to the video file and download it to the given filename.
186 |
187 | Downloaders need these functions.
188 | - `download(link, filename)`
189 |   - returns `True` when the download is successful or `False` if it failed.
190 |
191 | Downloaders should be put into the `downloaders` folder
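
A skeleton downloader, modelled on `downloaders/mp4.py` (the host URL pattern is a placeholder):

```
import requests


def download(link, filename):
    request = requests.get(link, stream=True)
    with open(filename, 'wb') as f:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return True


matching_urls = [
    {
        'urls': [r'https://examplehost.example/(.*).mp4'],
        'function': download,
    },
]
```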
192 |
193 | ### Adding an information collector
194 | Information collectors collect various information about a particular anime series/movie.
195 |
196 | They need these functions, which are described in detail above.
197 | - search
198 | - getDetailedInfo
199 |
200 | Info collectors should also have the following variables
201 | - matching_urls
202 |
203 | - Put them in the `info_collectors` folder (see the skeleton below)
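
A skeleton info collector (the return shapes follow the `info_handler` section above; the values are placeholders):

```
def search(query, strict=False):
    return [{'id': 0, 'titles': [query]}]


def getDetailedInfo(id):
    return {'id': id, 'description': ''}


matching_urls = [
    {'urls': [], 'function': search},
    {'urls': [], 'function': getDetailedInfo},
]
```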
204 |
205 | ## Credits
206 | - jQwotos
207 | - FadedCoder
208 | - DxCx (NineAnimeUrlExtender)
209 |
--------------------------------------------------------------------------------
/scrapers/masteranime.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | # import requests
5 | # Not working, using below instead
6 | import cfscrape
7 |
8 | import demjson
9 |
10 | from bs4 import BeautifulSoup
11 |
12 | site_name = 'masteranime'
13 | requests = cfscrape.create_scraper()
14 |
15 | BASE_URL = "https://www.masterani.me"
16 | SEARCH_URL = "%s/api/anime/search" % (BASE_URL,)
17 | SHOW_URL = "%s/anime/info/" % (BASE_URL,)
18 | EPISODE_LIST_URL = "%s/api/anime/{ID}/detailed" % (BASE_URL,)
19 | POSTER_URL = ("%s/poster/3/" % BASE_URL).replace("www", "cdn")
20 |
21 | showid_pat = re.compile("%s([0-9]+)-" % (SHOW_URL,))
22 | sources_pat = re.compile('mirrors:(.*?), auto_update: \[1')
23 | # sources_pat_2 = re.compile('\[(.*)\]')
24 | multi_source_pat = [
25 | {
26 | 'pat': sources_pat,
27 | 'secondary': False,
28 | },
29 | {
30 | 'pat': re.compile("var videos = (\[.*?\])"),
31 | 'secondary': True,
32 | }
33 | ]
34 |
35 | '''
36 | {
37 | 'pat': sources_pat_2,
38 | 'secondary': True,
39 | },
40 | '''
41 |
42 |
43 | def _combine_link(url):
44 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '')
45 |
46 |
47 | def _merge_slug(location, slug):
48 | return _combine_link("/anime/%s/%s" % (location, slug,))
49 |
50 |
51 | def _merge_poster(poster_url):
52 | return "%s%s" % (POSTER_URL, poster_url,)
53 |
54 |
55 | def _extract_single_search(data):
56 | return {
57 | 'link': _merge_slug("info", data['slug']),
58 | 'title': data['title'],
59 | 'id': data['id'],
60 | 'language': 'sub', # masteranime only has subs
61 | 'host': site_name,
62 | 'poster': _merge_poster(data['poster']['file']),
63 | }
64 |
65 |
66 | def _extract_multiple_search(data):
67 | return [_extract_single_search(x) for x in data]
68 |
69 |
70 | # Masteranime has a hidden api
71 | # that we can abuse, this makes it easier
72 | # so that we don't need to webscrape as much.
73 | def search(query):
74 | params = {
75 | 'search': query,
76 | 'sb': 'true',
77 | }
78 | data = requests.get(SEARCH_URL, params=params).json()
79 |
80 | return _extract_multiple_search(data)
81 |
82 |
83 | def _scrape_show_id(link):
84 | return re.findall(showid_pat, link)[0]
85 |
86 |
87 | def _scrape_single_video_source(data, **kwargs):
88 | if 'secondary' in kwargs and kwargs['secondary'] is True:
89 | return {
90 | 'link': data['src'],
91 | 'quality': data['res'],
92 | 'type': data['type'],
93 | }
94 |
95 | combined = '%s%s' % (data['host']['embed_prefix'], data['embed_id'])
96 | if data['host']['embed_suffix'] is not None:
97 | combined = "%s%s" % (combined, data['host']['embed_suffix'])
98 | return {
99 | 'link': combined,
100 | 'type': '',
101 | 'quality': data['quality'],
102 | 'id': data['id'],
103 | }
104 |
105 | '''
106 | def _scrape_video_sources(link):
107 | logging.info("Scraping sources for %s under masteranime." % (link,))
108 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
109 | scripts = data.findAll("script")
110 | sources = str(scripts[3])
111 |
112 | encoded_sources = re.findall(sources_pat, sources)
113 |
114 | # If the sources are located in the first primary script location
115 | if len(encoded_sources) > 0:
116 | sources = demjson.decode(encoded_sources[0])
117 | return [_scrape_single_video_source(x) for x in sources]
118 | # If the sources are in the second location
119 | else:
120 | script = str(scripts[2])
121 | encoded_sources = re.findall(sources_pat_2, script)
122 | encoded_sources = "[%s]" % (encoded_sources[0],)
123 | print(encoded_sources)
124 | sources = demjson.decode(encoded_sources)
125 | return [_scrape_single_video_source(x, secondary=True) for x in sources]
126 | '''
127 |
128 |
129 | def _scrape_video_sources(link):
130 |     logging.info("Scraping sources for %s under masteranime." % (link,))
131 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
132 | scripts = data.findAll('script')
133 | scripts = scripts[2:]
134 | for script in scripts:
135 | for reSource in multi_source_pat:
136 | encoded_sources = re.findall(reSource.get('pat'), str(script))
137 | if len(encoded_sources) > 0:
138 | sources = demjson.decode(encoded_sources[0])
139 | return [
140 | _scrape_single_video_source(x, secondary=reSource.get('secondary'))
141 | for x in sources
142 | ]
143 |
144 |
145 | def _parse_list_single(data, link):
146 | data = data['info']
147 | link = "%s/%s" % (link, data['episode'])
148 | return {
149 | 'epNum': data['episode'],
150 | 'sources': _scrape_video_sources(link),
151 | }
152 |
153 |
154 | def _parse_list_multi(data):
155 | logging.info(
156 | "A request for scraping all sources from %s under masteranime"
157 | % (data['link'],)
158 | )
159 | return [_parse_list_single(x, data['link']) for x in data['episodes']]
160 |
161 |
162 | def _load_list_episodes(data):
163 | slug = data.get('info').get('slug')
164 | link = _merge_slug("watch", slug)
165 | data['link'] = link
166 | return _parse_list_multi(data)
167 |
168 |
169 | def _parse_status(status):
170 | statuses = ['completed', 'airing']
171 | return statuses[status]
172 |
173 |
174 | def scrape_all_show_sources(link):
175 | id = _scrape_show_id(link)
176 | updatedLink = EPISODE_LIST_URL.replace('{ID}', id)
177 | data = requests.get(updatedLink).json()
178 | episodes = _load_list_episodes(data)
179 | data = data['info']
180 | data.update({
181 | 'episodes': episodes,
182 | 'status': _parse_status(data['status']),
183 | })
184 | return data
185 |
186 |
187 | matching_urls = [
188 | {
189 | 'urls': [r'https://www.masterani.me/anime/info/(.*)'],
190 | 'function': scrape_all_show_sources,
191 | },
192 | {
193 | 'urls': [],
194 | 'function': search,
195 | },
196 | {
197 | 'urls': [r'https://www.masterani.me/anime/watch/(.*)/([0-9]+)'],
198 | 'function': _scrape_video_sources,
199 | }
200 | ]
201 |
--------------------------------------------------------------------------------