├── requirements.txt
├── test.py
├── info_handler.py
├── scraper_handler.py
├── info_collectors
│   ├── _init_anidb.py
│   └── anidb.py
├── .gitignore
├── download_handler.py
├── downloaders
│   ├── mp4.py
│   ├── mycloud.py
│   └── vidstreaming.py
├── templates
│   └── module_search.py
├── scrapers
│   ├── anime9.py
│   ├── gogoanime.py
│   ├── animeheaven.py
│   └── masteranime.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | cfscrape
3 | requests
4 | furl
5 | html5lib
6 | lxml
7 | demjson
8 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | fileLocation = os.path.realpath(__file__)
3 | directory = os.path.dirname(fileLocation)
4 | print(os.path.join(directory, ".."))
5 |
--------------------------------------------------------------------------------
/info_handler.py:
--------------------------------------------------------------------------------
1 | import glob
2 |
3 | from .templates.module_search import ModuleSearch
4 |
5 |
6 | class InfoHandler(ModuleSearch):
7 |
8 | def __init__(self):
9 | self._get_modules('info_collectors')
10 |
11 | def _search_module(self, query, strict, module):
12 | return module.search(query, strict)
13 |
14 | def search(self, query, strict=False):
15 | return [
16 | self._search_module(query, strict, x)
17 | for x in self.modules
18 | ]
19 |
20 | def _details_module(self, id, module):
21 | return module.getDetailedInfo(id)
22 |
23 | def getDetailedInfo(self, id):
24 | return [
25 | self._details_module(id, x) for x in self.modules
26 | ]
27 |
28 |
29 | info_handler = InfoHandler()
30 |
--------------------------------------------------------------------------------
/scraper_handler.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import imp
3 | import logging
4 | import os
5 | import re
6 |
7 | from difflib import SequenceMatcher
8 | from .templates.module_search import ModuleSearch
9 |
10 |
11 | class ScraperHandler(ModuleSearch):
12 | # Deals with resolving the scraping of links
13 | # Automatically resolves with modules in
14 | # the scrapers folder.
15 | def __init__(self):
16 | self._get_modules('scrapers')
17 |
18 | def _search_module(self, query, module):
19 | return module.search(query)
20 |
21 | # Searches using scraper modules based on query
22 | def search(self, query, limited_modules=None):
23 | logging.debug("Starting a search for '%s'." % (query,))
24 | return [
25 | self._search_module(query, x)
26 | for x in self.modules
27 | if limited_modules is None or x.site_name in limited_modules
28 | ]
29 |
30 | # Resolves a URL and returns data from
31 | # proper module and function
32 | def resolve(self, link):
33 | logging.debug(
 34 |             "Starting a resolution for '%s' "
 35 |             "under scraper_handler." % (link,)
36 | )
37 | for module in self.modules:
38 | functions = self._try_match_module(link, module)
39 | if len(functions) > 0:
40 | return functions[0](link)
41 | return None
42 |
43 |
44 | def score_similarity(stringA, stringB):
45 | return SequenceMatcher(None, stringA, stringB).ratio()
46 |
47 | scraper_handler = ScraperHandler()
48 |
--------------------------------------------------------------------------------
/info_collectors/_init_anidb.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | import requests
3 | import os
4 |
5 |
6 | BASE_PATH = os.path.dirname(os.path.realpath(__file__))
7 | INFO_FILE = os.path.join(BASE_PATH, "last_download.txt")
8 | DOWNLOAD_URL = "http://anidb.net/api/anime-titles.xml.gz"
9 | DOWNLOAD_FILE = os.path.join(BASE_PATH, "anime-titles.xml")
10 |
11 |
12 | class DownloadList:
13 |
14 | def __init__(self):
15 | self.need_download = self.need_to_download()
16 |
17 | def need_to_download(self):
18 | try:
19 | with open(INFO_FILE, "r") as f:
20 | data = f.readline()
21 | if len(data) > 0:
22 | last_download = date.fromordinal(int(data))
23 | time_delta = date.today() - last_download
24 | if time_delta.days > 7:
25 | return True
26 | else:
27 | return False
28 | else:
29 | return True
30 | except FileNotFoundError:
31 | return True
32 | return False
33 |
34 | def write_today_ordinal(self):
35 | with open(INFO_FILE, "w") as f:
36 | f.write(str(date.today().toordinal()) + "\n")
37 |
38 | def download_list(self):
39 | request = requests.get(DOWNLOAD_URL, stream=True)
40 | with open(DOWNLOAD_FILE, "wb") as f:
41 | for chunk in request.iter_content(chunk_size=1024):
42 | if chunk:
43 | f.write(chunk)
44 |
45 | def get_file(self):
46 | if self.need_to_download():
47 | self.write_today_ordinal()
48 | self.download_list()
49 |
50 |
51 | download_list = DownloadList()
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # Jupyter Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # SageMath parsed files
79 | *.sage.py
80 |
81 | # Environments
82 | .env
83 | .venv
84 | env/
85 | venv/
86 | ENV/
87 |
88 | # Spyder project settings
89 | .spyderproject
90 | .spyproject
91 |
92 | # Rope project settings
93 | .ropeproject
94 |
95 | # mkdocs documentation
96 | /site
97 |
98 | # My files
99 | anime-titles.xml.gz
100 | anime-titles.xml
101 | last_download.txt
--------------------------------------------------------------------------------
/download_handler.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from .templates.module_search import ModuleSearch
4 |
5 |
6 | class DownloadHandler(ModuleSearch):
7 | # Deals with resolving downloading of files
8 | def __init__(self):
9 | self._get_modules('downloaders')
10 |
11 | def single_download(self, link, abs_path):
12 | """
13 | Download a single episode.
14 | 'link' is the full link of it (get it with scraper_handler).
15 | 'abs_path' is full path + filename of downloaded file, example -
16 | "/home/User/MyDownloadedEpisode.mp4"
17 | """
18 | for module in self.modules:
19 | if self._try_match_module(link, module):
20 | if module.download(link, abs_path):
21 | return True
22 | return False
23 | return False
24 |
25 | def resolve(self, data):
26 | logging.info(
27 | "Trying to resolve '%s'"
28 | % (data['epNum'])
29 | )
30 | for module in self.modules:
31 | for source in data['sources']:
32 | logging.info(
33 | "Trying to resolve '%s' source."
34 | % (source['link'])
35 | )
36 | if self._try_match_module(source['link'], module):
37 | logging.info(
38 | "Found a matching module for '%s'."
39 | % (source,)
40 | )
 41 |                     fileName = ("%s.mp4" % (data['epNum'],)
 42 |                                 if 'epNum' in data else source)
43 | if module.download(source['link'], fileName):
44 | break
45 |
46 | download_handler = DownloadHandler()
47 |
--------------------------------------------------------------------------------
/downloaders/mp4.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import logging
5 |
6 | import requests
7 |
8 |
9 | class Timer:
10 | def restart(self, request):
11 | self.length = int(request.headers.get('content-length'))
 12 |         self.start = time.monotonic()
13 |
14 | self.current = 0
15 |
16 | def __init__(self, request):
17 | self.restart(request)
18 |
19 | def tick(self, chunk_size):
20 | self.current += chunk_size
 21 |         speed = round(self.current // (time.monotonic() - self.start) / 1000000, 2)
22 | percentComplete = round((self.current / self.length) * 100, 1)
23 | sys.stdout.write(
24 | "\r %s Mbps | %r Percent Complete"
25 | % (speed, percentComplete)
26 | )
27 |
28 |
29 | def download(link, filename):
30 | logging.info("Starting download for %s." % (link,))
31 | tempName = "%s.tmp" % (filename,)
32 |
33 | with open(tempName, 'wb') as f:
34 | request = requests.get(link, stream=True)
35 |
36 | # timer = Timer(request)
37 |
38 | for chunk in request.iter_content(chunk_size=1024):
39 | # timer.tick(len(chunk))
40 |
41 | if chunk:
42 | f.write(chunk)
43 | else:
 44 |                 logging.error("Failed to get a chunk for '%s'." % (link,))
45 | logging.info("Finished downloading '%s'." % (link,))
46 | os.rename(tempName, filename)
47 | return True
48 |
49 | matching_urls = [
50 | {
51 | 'urls': [
52 | r'http://(.*).animeheaven.eu/video/(.*).mp4(.*)',
53 | r'http://(.*).animeheaven.eu/[0-9]+pi/(.*).mp4(.*)',
54 | r'https://[0-9]+.bp.blogspot.com(.*)',
55 | ],
56 | 'function': download,
57 | },
58 | ]
59 |
--------------------------------------------------------------------------------
/templates/module_search.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import imp
3 | import logging
4 | import os
5 | import re
6 |
7 |
8 | class ModuleSearch(object):
9 | def _load_single_module(self, f):
10 | return imp.load_source(f[:-3], f)
11 |
12 | def _load_modules(self):
13 | return [self._load_single_module(x) for x in self.modules]
14 |
15 | def _try_match_url(self, link, matchingURL):
16 | return True if re.match(matchingURL, link) is not None else False
17 |
18 | def _try_match_module_section(self, link, section):
19 | urls = section['urls']
20 | matches = [
21 | section['function'] for x in urls
22 | if self._try_match_url(link, x) is not False
23 | ]
24 | return True if len(matches) > 0 else False
25 |
26 | def _try_match_module(self, link, module):
27 | sections = module.matching_urls
28 | return [x['function'] for x in sections
29 | if self._try_match_module_section(link, x) is not False]
30 |
31 | def __is_underscore(self, f):
32 | if f[f.rfind('/') + 1] == "_":
33 | return True
34 | return False
35 |
36 | def _get_modules(self, location):
37 | fileLocation = os.path.realpath(__file__)
38 | directory = os.path.dirname(fileLocation)
39 | self.module_location = os.path.join(directory, '..', location)
40 | self.modules = glob.glob("%s/*.py" % (self.module_location))
41 | self.modules = [
42 | module for module in self.modules
43 | if not self.__is_underscore(module)
44 | ]
45 | '''
46 | for i in range(len(self.modules)): # Delete modules beginning with '_'
47 | module = self.modules[i]
48 | if module[module.rfind("/") + 1] == "_":
49 | del self.modules[i]
50 | '''
51 | self.modules = self._load_modules()
52 |
--------------------------------------------------------------------------------
/downloaders/mycloud.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import re
4 | import subprocess
5 |
6 | import requests
7 |
8 | from bs4 import BeautifulSoup
9 |
10 | types = ['iframe']
11 |
 12 | MY_CLOUD_PAT = re.compile(...)  # NOTE: the pattern literal and the remainder of mycloud.py were lost from this dump
--------------------------------------------------------------------------------
/info_collectors/anidb.py:
--------------------------------------------------------------------------------
(NOTE: lines 1-43 of anidb.py -- the imports, BASE_URL, IMAGE_URL, CLIENT,
CLIENT_VERSION, MIN_SIMILARITY_RATIO, and the beginning of search(), which
compares each title's similarity ratio against MIN_SIMILARITY_RATIO -- were
lost from this dump.)
44 | if ratio > highest_ratio:
45 | highest_ratio = ratio
46 | if not highest_ratio:
47 | continue
48 | ratio_list.append(highest_ratio)
49 | id = int(anime['aid'])
50 | titles = [title.string for title in
51 | anime.findAll("title", attrs={"type": ["main", "official"]})]
52 | results.append({"id": id, "titles": titles})
53 | return [x for (y, x) in
54 | sorted(list(zip(ratio_list, results)),
55 | key=lambda pair: pair[0], reverse=True)]
56 |
57 |
58 | def getDetailedInfo(id):
59 | '''
60 | Gets a detailed info from the ID provided. A dict is returned with
61 | the following keys. The type of the value is also mentioned.
62 |
 63 |     id: int, type: str, episode_count: str, start_date: str, end_date: str,
 64 |     other_names: [str], creators: [{str: str}], permanent_rating: float,
 65 |     image_url: str, description: str
66 | '''
67 | request = requests.get(BASE_URL, params={
68 | "request": "anime",
69 | "aid": str(id),
70 | "protover": "1",
71 | "client": CLIENT,
72 | "clientver": str(CLIENT_VERSION)
73 | })
74 | request.raise_for_status()
75 | result_page = BeautifulSoup(request.text, "xml")
76 |
77 | results = {
78 | "id": id,
79 | "type": result_page.find("type").string,
80 | "episode_count": result_page.find("episodecount").string,
81 | "start_date": result_page.find("startdate").string,
82 | "end_date": result_page.find("enddate").string,
83 | "other_names": [title.string for title in
84 | result_page.find("titles").findAll("title")],
85 | "creators": [{name['type']: name.string}
86 | for name in result_page.find("creators").findAll("name")],
87 | "permanent_rating": float(result_page.find("ratings")
88 | .find("permanent").string),
89 | "image_url": IMAGE_URL + result_page.find("picture").string,
90 | "description": result_page.find("description").string
91 | }
92 | return results
93 |
94 |
95 | matching_urls = [
96 | {
97 | 'urls': [],
98 | 'function': search,
99 | },
100 | {
101 | 'urls': [],
102 | 'function': getDetailedInfo,
103 | },
104 | ]
105 |
--------------------------------------------------------------------------------
/scrapers/anime9.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import requests
4 |
5 | from bs4 import BeautifulSoup as bs
6 |
7 | site_name = "9anime.is"
8 |
9 | BASE_URL = 'https://9anime.is'
10 | SEARCH_URL = '%s/search' % (BASE_URL,)
11 | INFO_API_URL = "%s/ajax/episode/info" % (BASE_URL,)
12 |
13 |
14 | def _parse_search_single(data):
15 | img = data.find("img")
16 | nameAnchor = data.find("a", {"class": "name"})
17 | lang = data.find('div', {'class': 'lang'})
18 | lang = lang.text if lang is not None else 'sub'
19 |
20 | return {
21 | 'title': nameAnchor.text,
22 | 'link': nameAnchor['href'],
23 | 'language': lang.lower(),
24 | 'host': site_name,
25 | 'poster': img['src']
26 | }
27 |
28 |
29 | def _parse_search_multi(data):
30 | return [
31 | _parse_search_single(x)
32 | for x in data.findAll("div", {"class": "item"})
33 | ]
34 |
35 |
36 | def search(query):
37 | params = {
38 | 'keyword': query,
39 | }
 40 |     data = bs(requests.get(SEARCH_URL, params=params).content, 'html.parser')
41 |
42 | return _parse_search_multi(data)
43 |
44 |
45 | def _scrape_episode_source(data):
46 | return {
47 | 'link': data['file'],
48 | 'type': data['type'],
49 | 'quality': data['label'],
50 | }
51 |
52 |
53 | def _scrape_episode_sources(data):
54 | request = requests.get(data['grabber'], params=data['params']).json()
55 | return [_scrape_episode_source(x) for x in request['data']]
56 |
57 |
58 | def _scrape_episode_info(id):
 59 |     logging.debug("'%s' is performing an info grab for '%s'" % (site_name, id,))
60 | params = {'id': id}
61 | data = requests.get(INFO_API_URL, params=params)
62 | if data.status_code == 200:
63 | data = data.json()
64 | if data.get('target') == '' or data.get('type') == 'direct':
65 | return _scrape_episode_sources(data)
66 | else:
67 | return {
68 | 'link': data.get('target'),
69 | 'type': data.get('type'),
70 | }
71 |
72 |
73 | def _parse_server_single_episode(data):
74 | anchor = data.find("a")
75 | id = anchor['data-id']
76 | output = {
77 | 'data-id': id,
78 | 'epNum': anchor.text,
79 | 'sources': _scrape_episode_info(id),
80 | }
81 | return output if output['sources'] is not None else None
82 |
83 |
84 | def _parse_server_episodes(data):
85 | episodes = data.findAll("li")
86 | sources = [_parse_server_single_episode(x) for x in episodes]
87 | if len(sources) > 0:
88 | return list(filter(None, sources))
89 |
90 |
91 | def _scrape_all_servers(data):
92 | servers = data.findAll("ul", {"class": "episodes range active"})
93 | sourcedServers = [_parse_server_episodes(x) for x in servers]
94 | return list(filter(None, sourcedServers))
95 |
96 |
97 | def format_combine_multi(unformatedOutput):
98 | output = []
99 | for ep in unformatedOutput:
100 | output.append({
101 | 'epNum': str(int(ep)), # remove leading 0s
102 | 'sources': unformatedOutput[ep]
103 | })
104 | return output
105 |
106 |
107 | def combine_multi(servers):
108 | unformatedOutput = {}
109 |     logging.debug("Combining episodes from %d servers." % (len(servers),))
110 | for server in servers:
111 | for ep in server:
112 | if ep['epNum'] not in unformatedOutput:
113 | unformatedOutput[ep['epNum']] = [ep['sources']]
114 | else:
115 | unformatedOutput[ep['epNum']] += [ep['sources']]
116 |
117 | return format_combine_multi(unformatedOutput)
118 |
119 |
120 | def _scrape_title(data):
121 | return data.find('h1', {'class': 'title'}).text
122 |
123 |
124 | def scrape_all_show_sources(link):
125 | logging.info(
126 | "A request for '%s' was made under %s scraper." %
127 | (link, site_name)
128 | )
129 | data = bs(requests.get(link).content, 'html.parser')
130 | body = data.find('body')
131 | servers = _scrape_all_servers(data)
132 | return {
133 | 'episodes': combine_multi(servers),
134 | 'title': _scrape_title(data),
135 | 'host': site_name,
136 | }
137 |
138 | matching_urls = [
139 | {
140 | 'urls': [
141 | r'https://9anime.to/watch/(.*).(.*)',
142 | r'https://9anime.is/watch/(.*).(.*)'
143 | ],
144 | 'function': scrape_all_show_sources,
145 | },
146 | ]
147 |
--------------------------------------------------------------------------------
/downloaders/vidstreaming.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import sys
4 | import logging
5 |
6 | import requests
7 |
8 | from furl import furl
9 | from bs4 import BeautifulSoup
10 |
11 | BASE_PATH = os.path.dirname(os.path.realpath(__file__))
12 | sys.path.append(BASE_PATH)
13 | import mp4
14 |
15 | site_name = 'vidstream'
16 |
17 | BASE_URL = "https://vidstreaming.io"
18 | DOWNLOAD_URL = "https://vidstream.co/download"
19 |
20 | qualities = ['1080', '720', '480', '360']
21 |
 22 | STREAMING_PAT = r'\?id=([a-zA-Z0-9]+?)(?:=|$)'
23 |
24 |
25 | def _try_match_url(link, matchingURL):
26 | return True if re.match(matchingURL, link) is not None else False
27 |
28 |
29 | def _try_match_module_section(link, section):
30 | urls = section['urls']
31 | matches = [
32 | section['function'] for x in urls
33 | if _try_match_url(link, x) is not False
34 | ]
35 | return True if len(matches) > 0 else False
36 |
37 |
38 | def resolve(link):
39 | for section in internal_matching_urls:
40 | if _try_match_module_section(link, section):
41 | logging.info("Found a match for %s" % (link,))
42 | return section['function'](link)
43 | return None
44 |
45 |
46 | def download(link, fname):
47 | logging.info("Starting download for '%s' under vidstreaming." % (link,))
48 | sources = resolve(link)['sources']
 49 |     logging.info("Received %i sources" % (len(sources)))
50 | if len(sources) > 0:
51 | source = sources[0]['link']
52 | else:
53 | logging.critical("Can't find sources on vidstreaming!")
54 | return False
55 | if source is not None:
56 | if mp4.download(source, fname):
57 | return True
58 | return False
59 |
60 |
61 | def _parse_quality(title):
62 | for q in qualities:
63 | if q in title:
64 | return q
65 | return None
66 |
67 |
68 | def _parse_list_single(data):
69 | return {
70 | 'link': data['href'],
71 | 'type': 'mp4',
72 | 'quality': _parse_quality(data.text),
73 | }
74 |
75 |
76 | def _parse_list_multi(data):
77 | box = data.find("div", {"class": "mirror_link"})
78 | sources = box.findAll("a")
79 | if len(sources) == 0:
80 | logging.critical("Can't find sources on vidstreaming!")
81 | return [_parse_list_single(x) for x in sources]
82 |
83 |
84 | def _scrape_video_sources_id(id):
85 | params = {
86 | 'id': id,
87 | }
88 | request = requests.get(DOWNLOAD_URL, params=params).content
89 | data = BeautifulSoup(request, 'html.parser')
90 | return {
91 | 'sources': _parse_list_multi(data),
92 | }
93 |
94 |
95 | def _scrape_video_sources(link):
96 | id = furl(link).args['id']
97 | logging.info("Found id %s from '%s'" % (id, link,))
98 | return _scrape_video_sources_id(id)
99 |
100 |
101 | def _parse_list_embed_single(data):
102 | return {
103 | 'link': data['src'],
104 | 'type': 'mp4',
105 | 'quality': data['label'],
106 | }
107 |
108 |
109 | def _parse_list_embed_multi(data):
110 | sources = data.findAll("source", {"type": "video/mp4"})
111 | return [_parse_list_embed_single(x) for x in sources]
112 |
113 |
114 | def _scrape_video_embed(link):
115 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
116 | result = {
117 | 'sources': _parse_list_embed_multi(data),
118 | }
119 | if len(result['sources']) == 0:
120 | logging.info('Falling back to legacy downloader for %s' % (link,))
121 |         result['sources'] = _scrape_video_sources(link)['sources']
122 | return result
123 |
124 | def _fix_link(link):
125 | fixed_link = "http:" + link
126 | return _scrape_video_embed(fixed_link)
127 |
128 | def _scrape_streaming(link):
129 | id = re.search(STREAMING_PAT, link)
130 | id = id.group(1) if id is not None else None
131 |
132 | if id:
133 | return _scrape_video_sources_id(id)
134 |
135 | return None
136 |
137 | matching_urls = [
138 | {
139 | 'urls': [
140 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)',
141 | r'https://vidstream.co/embed.php\?(.*)',
142 | r'https://vidstreaming.io/embed.php\?id=(.*)',
143 | ],
144 | 'function': download,
145 | }
146 | ]
147 |
148 | internal_matching_urls = [
149 | {
150 | 'urls': [
151 | r'https://vidstream.co/download\?id=(.*)',
152 | ],
153 | 'function': _scrape_video_sources,
154 | },
155 | {
156 | 'urls': [
157 | r'https://vidstream.co/embed.php\?(.*)',
158 | r'https://vidstreaming.io/embed.php\?id=(.*)',
159 | ],
160 | 'function': _scrape_video_embed,
161 | },
162 | {
163 | 'urls': [
164 | r'//vidstreaming.io/streaming.php\?id=(.*)&title=(.*)',
165 | ],
166 | 'function': _scrape_streaming,
167 | }
168 | ]
169 |
--------------------------------------------------------------------------------
/scrapers/gogoanime.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | import cfscrape as cf
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | site_name = 'gogoanime'
9 |
10 | BASE_URL = "https://gogoanime.io"
11 | SEARCH_URL = "%s/search.html" % (BASE_URL,)
12 | EPISODE_LIST_URL = "%s//load-list-episode" % (BASE_URL,)
13 | SHOW_URL = "%s/category/" % (BASE_URL,)
14 |
15 | id_pat = re.compile("var id = (.*?);")
16 | streaming_name_pat = re.compile('"(.*?)"')
17 | epnum_pat = re.compile('episode-(.*?)$')
18 | released_pat = re.compile("Released: ([0-9]+)")
19 |
20 | cfscrape = cf.create_scraper()
21 |
22 |
23 | def _combine_link(url):
24 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '')
25 |
26 |
27 | def _parse_released_date(data):
28 | fullString = str(data.find("p", {"class": "released"}))
29 | output = re.findall(released_pat, fullString)
30 | return output[0] if len(output) > 0 else None
31 |
32 |
33 | def _extract_single_search(data):
34 | name = data.find('p', {'class': 'name'}).find('a')
35 | return {
36 | 'link': _combine_link(name['href']),
37 | 'title': name.text,
38 | 'language': 'dub' if 'dub' in name.text.lower() else 'sub',
39 | 'released': _parse_released_date(data),
40 | 'host': site_name,
41 | }
42 |
43 |
44 | def _extract_multiple_search(data):
45 | entries = data.find('ul', {'class': 'items'}).findAll("li")
46 | return [_extract_single_search(x) for x in entries]
47 |
48 |
49 | def search(query):
50 | '''
51 | Returns all search results based on a query
52 | [
53 | {
54 | 'link': 'link to show on gogoanime',
55 | 'title': 'the full title of the show',
56 | 'language': 'either subbed or dubbed',
57 | }
58 | ]
59 | '''
60 | params = {
61 | 'keyword': query,
62 | 'id': -1,
63 | }
64 |
65 | data = cfscrape.get(SEARCH_URL, params=params).content
66 | data = BeautifulSoup(data, 'html.parser')
67 |
68 | return _extract_multiple_search(data)
69 |
70 |
71 | def _parse_list_single(data):
72 | return {
73 | 'name': data.find("div", {"class": "name"}).text,
74 | 'link': _combine_link(data['href']),
75 | 'language': data.find("div", {"class": "cate"}).text.lower(),
76 | 'type': 'iframe',
77 | }
78 |
79 |
80 | def _parse_list_multi(data):
81 | episodes = data.findAll("a")
82 | return [_parse_list_single(x) for x in episodes]
83 |
84 |
85 | def _load_list_episode(id):
86 | params = {
87 | 'ep_start': 0,
88 | 'ep_end': 9999999,
89 | 'id': id,
90 | 'default_ep': 0,
91 | }
92 | data = cfscrape.get(EPISODE_LIST_URL, params=params).content
93 | data = BeautifulSoup(data, 'html.parser')
94 | return _parse_list_multi(data)
95 |
96 |
97 | def _scrape_show_id(data):
98 | return re.findall(id_pat, str(data))
99 |
100 |
101 | def _scrape_title(data):
102 | return data.find("div", {"class": "anime_info_body_bg"}).find('h1').text
103 |
104 |
105 | def _scrape_status(data):
106 | return data.findAll('p', {'class': 'type'})[4].text.replace('Status: ', '')
107 |
108 |
109 | def _scrape_released(data):
110 | text = data.findAll('p', {'class': 'type'})[3].text
111 | return text.replace('Released: ', '')
112 |
113 |
114 | def _scrape_epNum(url):
115 | epNum = re.findall(epnum_pat, url)
116 | return epNum[0] if len(epNum) > 0 else '0'
117 |
118 |
119 | def _scrape_single_video_source(data):
120 | return {
121 | 'link': data['data-video'],
122 | 'type': 'iframe'
123 | }
124 |
125 |
126 | def _scrape_video_sources(link):
127 | data = cfscrape.get(link).content
128 | soupedData = BeautifulSoup(data, 'html.parser')
129 |     sources = soupedData.find("div", {"class": "anime_muti_link"})
130 | sources = sources.findAll("a")
131 |
132 | return {
133 | 'epNum': _scrape_epNum(link),
134 | 'sources': list(map(
135 | lambda x: _scrape_single_video_source(x),
136 | sources)
137 | ),
138 | }
139 |
140 |
141 | def scrape_all_show_sources(link):
142 | data = cfscrape.get(link).content
143 | id = _scrape_show_id(data)
144 | data = BeautifulSoup(data, 'html.parser')
145 | episodes = _load_list_episode(id)
146 |
147 | return {
148 | 'episodes': [_scrape_video_sources(x['link']) for x in episodes],
149 | 'title': _scrape_title(data),
150 | 'status': _scrape_status(data),
151 | 'host': 'gogoanime',
152 | 'released': _scrape_released(data),
153 | }
154 |
155 | matching_urls = [
156 | {
157 | 'urls': [r'https://(.*)gogoanime.io/category/(.*)'],
158 | 'function': scrape_all_show_sources,
159 | },
160 | {
161 |         'urls': [r'https://(.*)gogoanime.io//search.html\?keyword=(.*)'],
162 | 'function': search,
163 | },
164 | {
165 | 'urls': [r'https://(.*)gogoanime.io/(.*)-episode-([0-9]+)'],
166 | 'function': _scrape_video_sources,
167 | }
168 | ]
169 |
--------------------------------------------------------------------------------
/scrapers/animeheaven.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | import requests
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | site_name = 'animeheaven'
9 |
10 | BASE_URL = "http://animeheaven.eu"
11 | SEARCH_URL = "%s/search.php" % (BASE_URL,)
12 |
 13 | # NOTE: pattern literals were mangled in this dump; minimal reconstructions from their usage below.
 14 | source_pat = re.compile(r"'(\\x.*?)'")  # hex-escaped source URL inside a <script> tag
 15 | epnum_pat = re.compile(r'e=[0-9]+')
 16 | status_pat = re.compile(r'Status:(.*)')
 17 | released_pat = re.compile(r'Year:(.*)')
18 |
19 |
20 | def _combine_link(url):
21 | # Combines the relative url with the base url
22 | return ("%s/%s" % (BASE_URL, url,)).replace(' ', '%20')
23 |
24 |
25 | def _extract_single_search(data):
26 | # Takes in bs4 data of a single search result
27 | # and returns a formated dict
28 | anchor = data.find("a")
29 | img = anchor.find("img")
30 | name = img['alt']
31 | return {
32 | 'link': _combine_link(anchor['href']),
33 | 'title': name,
34 | 'language': 'dub' if 'dub' in name.lower() else 'sub',
35 | 'host': site_name,
36 | 'poster': _combine_link(img['src']),
37 | }
38 |
39 |
40 | def _extract_multiple_search(data):
41 | # Takes in search result page
42 | # and returns list of formated results
43 | entries = data.findAll('div', {'class': 'iep'})
44 | return [_extract_single_search(x) for x in entries]
45 |
46 |
47 | def search(query):
48 | '''
49 | Returns all search results based on a query
50 | [
51 | {
52 | 'link': 'link to show on gogoanime',
53 | 'title': 'the full title of the show',
54 | 'language': 'either subbed or dubbed',
55 | }
56 | ]
57 | '''
58 | logging.info("A query for %s was made under animeheaven" % (query,))
59 | params = {'q': query}
60 | data = requests.get(SEARCH_URL, params=params).content
61 | data = BeautifulSoup(data, 'html.parser')
62 |
63 | return _extract_multiple_search(data)
64 |
65 |
66 | def _parse_list_single(data):
67 | return {
68 | 'name': data.find("div", {"class": "infoept2"}),
69 | 'link': _combine_link(data['href']),
70 | }
71 |
72 |
73 | def _parse_list_multi(data):
74 | box = data.find("div", {"class": "infoepbox"})
75 | episodes = box.findAll("a")
76 | return [_parse_list_single(x) for x in episodes]
77 |
78 |
79 | def _hex_source_to_str(source_url):
80 | return bytes(source_url, 'utf-8').decode('unicode_escape')
81 |
82 |
83 | def _scrape_single_video_source(data):
84 | source_url = re.findall(source_pat, str(data))
85 | return {
86 | 'link': _hex_source_to_str(source_url[0]) if len(source_url) > 0 else None,
87 | 'type': 'mp4',
88 | }
89 |
90 |
91 | def _scrape_epNum(url):
92 | epNum = re.search(epnum_pat, url)
93 | return epNum.group().replace('e=', '') if epNum is not None else None
94 |
95 | def _parse_multi_video_sources(data):
96 | return [_scrape_video_sources(x) for x in data]
97 |
98 |
99 | def _scrape_video_sources(link):
100 | # Scrapes details on a specific
101 | # episode of a show based on link
102 |     data = BeautifulSoup(requests.get(link).content, 'html.parser')
103 |     logging.info("Scraping video sources for %s under animeheaven" % (link,))
104 | sources = data.find("div", {'class': 'centerf2'}).findAll('script')
105 |
106 | return {
107 | 'epNum': _scrape_epNum(link),
108 | 'sources': [_scrape_single_video_source(x) for x in sources],
109 | }
110 |
111 | def _scrape_title(data):
112 | # Takes in bs4 show page
113 | # and returns the title of
114 | # the show
115 | return data.find("div", {"class": "infodes"}).text
116 |
117 |
118 | def _scrape_released(data):
119 | # Takes in bs4 show page and
120 | # returns released year as string
121 | box = data.findAll("div", {"class": 'infodes2'})
122 | if len(box) < 1: return None
123 | box = box[1]
124 | released_date = re.search(released_pat, str(box))
125 |     return released_date.group() if released_date is not None else None
126 |
127 |
128 | def _scrape_status(data):
129 | # Takes in bs4 show page and
130 | # return status of the show
131 | box = data.findAll("div", {"class": "infodes2"})
132 |     if len(box) < 1: return None
133 | box = box[1]
134 | status = re.search(status_pat, str(box))
135 | return status.group() if status is not None else None
136 |
137 |
138 | def scrape_all_show_sources(link):
139 | # Returns all show's sources and details
140 | # based on the link of the show.
141 | logging.info(
142 | "A request for '%s' was made to animeheaven scraper."
143 | % (link,)
144 | )
145 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
146 | episodes = _parse_list_multi(data)
147 | logging.debug("Got %i links for %s" % (len(episodes), link,))
148 |
149 | return {
150 | 'episodes': [_scrape_video_sources(x['link']) for x in episodes],
151 | 'title': _scrape_title(data),
152 | 'status': _scrape_status(data),
153 | 'host': 'animeheaven',
154 | 'released': _scrape_released(data),
155 | }
156 |
157 | matching_urls = [
158 | {
159 | 'urls': [r'http://animeheaven.eu/i.php\?a=(.*)'],
160 | 'function': scrape_all_show_sources,
161 | },
162 | {
163 | 'urls': [r'http://animeheaven.eu/search.php\?q=(.*)'],
164 | 'function': search,
165 | },
166 | {
167 | 'urls': [r'http://animeheaven.eu/watch.php\?a=(.*)&e=([0-9]+)'],
168 | 'function': _scrape_video_sources,
169 | }
170 | ]
171 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # anime-scrapers
2 |
  3 | Anime scrapers is a collection of anime site scrapers unified behind a common interface.
4 |
5 | ## Table of Contents
6 | - [Installation](#installation)
7 | - [Mac / OSX](#mac-osx)
8 | - [General Installation](#general-installation)
9 | - [Usage](#usage)
10 | - [Functions](#functions)
11 | - [Handlers](#handlers)
12 | - [Scraper Handler](#scraper_handler)
13 | - [Download Handler](#download_handler)
14 | - [Info Handler](#info_handler)
15 | - [Individual Scraper Functions](#individual-scraper-functions)
16 | - [Contributing](#contributing)
17 | - [URL Handling (matching_urls)](#url-handling-matching_urls)
18 | - [Adding a Scraper](#adding-a-scraper)
19 | - [Adding a Downloader](#adding-a-downloader)
20 | - [Adding an Info Collector](#adding-an-information-collector)
21 | - [Credits](#credits)
22 |
23 | ## Installation
24 |
25 | ### Mac (OSX)
26 | - Install [Brew](https://brew.sh/)
27 | - Install python3
28 | - `brew install python3`
29 | - Continue to general installation
30 |
31 | ### General Installation
32 | - Clone repository
33 | - `git clone https://github.com/jQwotos/anime_scrapers`
34 | - Nav into repo
35 | - `cd anime_scrapers`
36 | - Install required python packages
37 | - `pip install -r requirements.txt`
38 |
39 | ## Usage
40 |
 41 | anime_scrapers is a backend library meant to be used by other applications. You can also use it directly from a Python shell, but a front-end application is usually more convenient.
42 |
43 | ### Functions
44 |
45 | #### Handlers
46 |
 47 | Handlers are all classes; however, each handler module also exposes a premade instance, so you don't need to create a new object each time.
 48 |
 49 | For example, `scraper_handler.py` defines the class
 50 |
 51 | `class ScraperHandler(ModuleSearch):`
 52 |
 53 | and exposes the ready-made instance
 54 |
 55 | `scraper_handler`
56 |
57 | ##### scraper_handler
 58 | - `search(query, limited_modules=None)`
 59 |   - Searches all scraper modules, or only those whose `site_name` is listed in `limited_modules`; see the sketch below
60 | - `resolve(link)`
61 | - Finds matching function in module and returns proper response
62 |
63 | ##### download_handler
64 | - `resolve(link)`
 65 |   - Takes in download data (typically obtained from a `scraper_handler.resolve` call) in the following format:
66 | ```
67 | {
68 | 'epNum': 'name of file',
69 | 'sources': [
 70 |         {'link': 'link',
 71 |          'type': 'typically mp4 or iframe'},
72 | ]
73 | }
74 | ```
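
For example, feeding episodes from a `scraper_handler.resolve` call into the download handler might look like this (the show URL is made up; package import path as above):

```
from anime_scrapers.scraper_handler import scraper_handler
from anime_scrapers.download_handler import download_handler

show = scraper_handler.resolve("https://gogoanime.io/category/some-show")
for episode in show['episodes']:
    # saves '<epNum>.mp4' into the current working directory
    download_handler.resolve(episode)
```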
75 |
76 | ##### info_handler
77 |
 78 | For information gathering, use `info_handler.py`. Its functions are:
79 |
80 | ```
81 | # strict is a boolean, which if True, searches for exact query only.
82 | search(query, strict):
83 | return [
84 | {
85 | 'id': 'id of the show (int)',
86 | 'titles': 'Other names of the show (str)',
87 | }
88 | ]
89 | ```
90 |
91 | ```
92 | getDetailedInfo(id):
93 | return [
94 | {
95 | 'id': 'return the id from the parameter (int)',
96 | 'other-show-stuff': 'Other info related to show.
97 | See anidb.py in info_collectors for example',
98 | ...
99 | }
100 | ]
101 | ```
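
A minimal sketch of both calls (package import path as above; the query is illustrative):

```
from anime_scrapers.info_handler import info_handler

# one result list per info collector
hits = info_handler.search("cowboy bebop")
show_id = hits[0][0]['id']

# one detail dict per info collector
details = info_handler.getDetailedInfo(show_id)
print(details[0]['permanent_rating'])
```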
102 |
103 | #### Individual Scraper Functions
104 |
105 | ```
106 | scrape_all_show_sources(link):
107 | return {
108 | 'episodes': [
109 | {
110 |             'epNum': 'number as a string',
111 |             'sources': sourceType
112 | }
113 | ],
114 | 'title': 'title of the show',
115 | 'status': 'status of the show',
116 | 'host': 'host such as gogoanime',
117 | 'released': 'year released as a string',
118 | }
119 | ```
120 |
121 | ```
122 | search(query):
123 | return [
124 | {
125 | 'link': 'link to the show',
126 | 'title': 'title of the show',
127 | 'language': 'sub or dub',
128 | },
129 | ]
130 | ```
131 |
132 | ```
133 | _scrape_video_sources(link):
134 | return {
135 | 'epNum': 'episode number as a string',
136 | 'sources': sourceType
137 | }
138 | ```
139 |
140 | SourceTypes are in the following format.
141 | ```
142 | [
143 | {
144 | 'link': 'link to the mp4 or iframe embed',
145 | 'type': 'mp4 or iframe',
146 | }
147 | ]
148 | ```
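
Since scraper modules are plain Python files (the handlers load them with `imp.load_source`), a single scraper can also be used on its own. A sketch with gogoanime, assuming the relative path below points into your checkout:

```
import imp

gogoanime = imp.load_source('gogoanime', 'scrapers/gogoanime.py')

results = gogoanime.search('naruto')  # list of search dicts as described above
show = gogoanime.scrape_all_show_sources(results[0]['link'])
print(show['title'], show['released'])
```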
149 |
150 | ## Contributing
151 |
152 | Want to add a downloader, scraper, or info collector?
153 | Each module must have
154 | - URL handling: a `matching_urls` variable that looks like the one shown below.
155 |
156 | ### URL Handling (matching_urls)
157 | Handlers try each module's URL patterns until one matches the given link. Each scraper contains the following variable, which the handler uses to identify the correct module and function when resolving a link.
158 | ```
159 | matching_urls = [
160 | {
161 | 'urls': ['regex match expression'],
162 | 'function': function that should be called,
163 | },
164 | ]
165 | ```
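
In other words, a handler tries each entry's regular expressions against the link and calls the first function whose pattern matches, roughly like this (mirroring `templates/module_search.py`; the link is made up):

```
import re

link = 'https://gogoanime.io/category/made-up-show'
for section in matching_urls:
    if any(re.match(pattern, link) for pattern in section['urls']):
        result = section['function'](link)
        break
```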
166 |
167 | ### Adding a Scraper
168 | Scrapers handle search queries, scraping episodes from hosts and scraping sources from those episodes.
169 |
170 | Refer to [Functions](#functions) for the data formats.
171 |
172 | Scrapers should have a couple of functions
173 | - `search`
174 | - `scrape_all_show_sources`
175 | - scrapes all show details and episodes along with their direct sources
176 |
177 | Optionally there can also be
178 | - `_scrape_episode_source`
179 | - scrapes a single episode's source
180 |
181 |
182 | Scrapers should be put into the `scrapers` folder
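
A bare-bones skeleton of a new scraper module might look like the following (the site name and URL pattern are placeholders; only the function names and the `matching_urls` shape follow the existing scrapers):

```
site_name = 'examplesite'


def search(query):
    # return a list of {'link', 'title', 'language', 'host'} dicts
    return []


def scrape_all_show_sources(link):
    # return the show dict described under "Individual Scraper Functions"
    return {'episodes': [], 'title': '', 'status': '',
            'host': site_name, 'released': ''}


matching_urls = [
    {
        'urls': [r'https://examplesite.example/show/(.*)'],
        'function': scrape_all_show_sources,
    },
]
```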
183 |
184 | ### Adding a downloader
185 | Downloaders extract the direct link to the video file and download it to the given filename.
186 |
187 | Downloaders need these functions.
188 | - `download(link, filename)`
189 |   - returns `True` when the download is successful or `False` if it failed.
190 |
191 | Downloaders should be put into the `downloaders` folder
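
A skeleton downloader, modelled on `downloaders/mp4.py` (the host URL pattern is a placeholder):

```
import requests


def download(link, filename):
    request = requests.get(link, stream=True)
    with open(filename, 'wb') as f:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return True


matching_urls = [
    {
        'urls': [r'https://examplehost.example/(.*).mp4'],
        'function': download,
    },
]
```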
192 |
193 | ### Adding an information collector
194 | Information collectors collect various information about a particular anime series/movie.
195 |
196 | They need these functions, which are described in detail above.
197 | - search
198 | - getDetailedInfo
199 |
200 | Info collectors should also have the following variables
201 | - matching_urls
202 |
203 | - Put them in the `info_collectors` folder (see the skeleton below)
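
A skeleton info collector (the return shapes follow the `info_handler` section above; the values are placeholders):

```
def search(query, strict=False):
    return [{'id': 0, 'titles': [query]}]


def getDetailedInfo(id):
    return {'id': id, 'description': ''}


matching_urls = [
    {'urls': [], 'function': search},
    {'urls': [], 'function': getDetailedInfo},
]
```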
204 |
205 | ## Credits
206 | - jQwotos
207 | - FadedCoder
208 | - DxCx (NineAnimeUrlExtender)
209 |
--------------------------------------------------------------------------------
/scrapers/masteranime.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 | # import requests
5 | # Not working, using below instead
6 | import cfscrape
7 |
8 | import demjson
9 |
10 | from bs4 import BeautifulSoup
11 |
12 | site_name = 'masteranime'
13 | requests = cfscrape.create_scraper()
14 |
15 | BASE_URL = "https://www.masterani.me"
16 | SEARCH_URL = "%s/api/anime/search" % (BASE_URL,)
17 | SHOW_URL = "%s/anime/info/" % (BASE_URL,)
18 | EPISODE_LIST_URL = "%s/api/anime/{ID}/detailed" % (BASE_URL,)
19 | POSTER_URL = ("%s/poster/3/" % BASE_URL).replace("www", "cdn")
20 |
21 | showid_pat = re.compile("%s([0-9]+)-" % (SHOW_URL,))
22 | sources_pat = re.compile('mirrors:(.*?), auto_update: \[1')
23 | # sources_pat_2 = re.compile('\[(.*)\]')
24 | multi_source_pat = [
25 | {
26 | 'pat': sources_pat,
27 | 'secondary': False,
28 | },
29 | {
30 | 'pat': re.compile("var videos = (\[.*?\])"),
31 | 'secondary': True,
32 | }
33 | ]
34 |
35 | '''
36 | {
37 | 'pat': sources_pat_2,
38 | 'secondary': True,
39 | },
40 | '''
41 |
42 |
43 | def _combine_link(url):
44 | return ("%s%s" % (BASE_URL, url,)).replace(' ', '')
45 |
46 |
47 | def _merge_slug(location, slug):
48 | return _combine_link("/anime/%s/%s" % (location, slug,))
49 |
50 |
51 | def _merge_poster(poster_url):
52 | return "%s%s" % (POSTER_URL, poster_url,)
53 |
54 |
55 | def _extract_single_search(data):
56 | return {
57 | 'link': _merge_slug("info", data['slug']),
58 | 'title': data['title'],
59 | 'id': data['id'],
60 | 'language': 'sub', # masteranime only has subs
61 | 'host': site_name,
62 | 'poster': _merge_poster(data['poster']['file']),
63 | }
64 |
65 |
66 | def _extract_multiple_search(data):
67 | return [_extract_single_search(x) for x in data]
68 |
69 |
70 | # Masteranime has a hidden api
71 | # that we can abuse, this makes it easier
72 | # so that we don't need to webscrape as much.
73 | def search(query):
74 | params = {
75 | 'search': query,
76 | 'sb': 'true',
77 | }
78 | data = requests.get(SEARCH_URL, params=params).json()
79 |
80 | return _extract_multiple_search(data)
81 |
82 |
83 | def _scrape_show_id(link):
84 | return re.findall(showid_pat, link)[0]
85 |
86 |
87 | def _scrape_single_video_source(data, **kwargs):
88 | if 'secondary' in kwargs and kwargs['secondary'] is True:
89 | return {
90 | 'link': data['src'],
91 | 'quality': data['res'],
92 | 'type': data['type'],
93 | }
94 |
95 | combined = '%s%s' % (data['host']['embed_prefix'], data['embed_id'])
96 | if data['host']['embed_suffix'] is not None:
97 | combined = "%s%s" % (combined, data['host']['embed_suffix'])
98 | return {
99 | 'link': combined,
100 | 'type': '',
101 | 'quality': data['quality'],
102 | 'id': data['id'],
103 | }
104 |
105 | '''
106 | def _scrape_video_sources(link):
107 | logging.info("Scraping sources for %s under masteranime." % (link,))
108 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
109 | scripts = data.findAll("script")
110 | sources = str(scripts[3])
111 |
112 | encoded_sources = re.findall(sources_pat, sources)
113 |
114 | # If the sources are located in the first primary script location
115 | if len(encoded_sources) > 0:
116 | sources = demjson.decode(encoded_sources[0])
117 | return [_scrape_single_video_source(x) for x in sources]
118 | # If the sources are in the second location
119 | else:
120 | script = str(scripts[2])
121 | encoded_sources = re.findall(sources_pat_2, script)
122 | encoded_sources = "[%s]" % (encoded_sources[0],)
123 | print(encoded_sources)
124 | sources = demjson.decode(encoded_sources)
125 | return [_scrape_single_video_source(x, secondary=True) for x in sources]
126 | '''
127 |
128 |
129 | def _scrape_video_sources(link):
130 |     logging.info("Scraping sources for %s under masteranime." % (link,))
131 | data = BeautifulSoup(requests.get(link).content, 'html.parser')
132 | scripts = data.findAll('script')
133 | scripts = scripts[2:]
134 | for script in scripts:
135 | for reSource in multi_source_pat:
136 | encoded_sources = re.findall(reSource.get('pat'), str(script))
137 | if len(encoded_sources) > 0:
138 | sources = demjson.decode(encoded_sources[0])
139 | return [
140 | _scrape_single_video_source(x, secondary=reSource.get('secondary'))
141 | for x in sources
142 | ]
143 |
144 |
145 | def _parse_list_single(data, link):
146 | data = data['info']
147 | link = "%s/%s" % (link, data['episode'])
148 | return {
149 | 'epNum': data['episode'],
150 | 'sources': _scrape_video_sources(link),
151 | }
152 |
153 |
154 | def _parse_list_multi(data):
155 | logging.info(
156 | "A request for scraping all sources from %s under masteranime"
157 | % (data['link'],)
158 | )
159 | return [_parse_list_single(x, data['link']) for x in data['episodes']]
160 |
161 |
162 | def _load_list_episodes(data):
163 | slug = data.get('info').get('slug')
164 | link = _merge_slug("watch", slug)
165 | data['link'] = link
166 | return _parse_list_multi(data)
167 |
168 |
169 | def _parse_status(status):
170 | statuses = ['completed', 'airing']
171 | return statuses[status]
172 |
173 |
174 | def scrape_all_show_sources(link):
175 | id = _scrape_show_id(link)
176 | updatedLink = EPISODE_LIST_URL.replace('{ID}', id)
177 | data = requests.get(updatedLink).json()
178 | episodes = _load_list_episodes(data)
179 | data = data['info']
180 | data.update({
181 | 'episodes': episodes,
182 | 'status': _parse_status(data['status']),
183 | })
184 | return data
185 |
186 |
187 | matching_urls = [
188 | {
189 | 'urls': [r'https://www.masterani.me/anime/info/(.*)'],
190 | 'function': scrape_all_show_sources,
191 | },
192 | {
193 | 'urls': [],
194 | 'function': search,
195 | },
196 | {
197 | 'urls': [r'https://www.masterani.me/anime/watch/(.*)/([0-9]+)'],
198 | 'function': _scrape_video_sources,
199 | }
200 | ]
201 |
--------------------------------------------------------------------------------