├── cheggscraper ├── __init__.py ├── conf.json ├── Exceptions.py ├── Downloader.py ├── chapter_type_frame.html ├── CheggScraper.py └── template.html ├── requirements.txt ├── Downloader.py ├── .deepsource.toml ├── setup.py ├── LICENSE ├── NOOB.md ├── .gitignore └── README.md /cheggscraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | lxml 3 | requests 4 | setuptools 5 | Jinja2 -------------------------------------------------------------------------------- /Downloader.py: -------------------------------------------------------------------------------- 1 | from cheggscraper import Downloader 2 | 3 | Downloader.main() 4 | -------------------------------------------------------------------------------- /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "python" 5 | enabled = true 6 | 7 | [analyzers.meta] 8 | runtime_version = "3.x.x" 9 | -------------------------------------------------------------------------------- /cheggscraper/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_save_file_format": "{heading}.html", 3 | "default_cookie_file_path": "cookie.txt", 4 | "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 5 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='cheggscraper', 5 | version='1.3', 6 | description='Convert Chegg url to complete html', 7 | packages=['cheggscraper'], 8 | 
install_requires=[ 9 | 'beautifulsoup4', 10 | 'lxml', 11 | 'requests', 12 | 'jinja2', 13 | ], 14 | package_data={ 15 | '': ['conf.json', 'template.html', 'chapter_type_frame.html'], 16 | }, 17 | include_package_data=True 18 | ) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /cheggscraper/Exceptions.py: -------------------------------------------------------------------------------- 1 | class FailedToParse(Exception): 2 | def __init__(self): 3 | self.message = 'Failed to parse data' 4 | 5 | 6 | class UnableToParseUUID(FailedToParse): 7 | def __init__(self): 8 | self.message = 'Unable to get question uuid' 9 | 10 | 11 | class UnexpectedStatusCode(Exception): 12 | def __init__(self, status_code: int): 13 | self.message = 'Unexpected status code: {}'.format(status_code) 14 | 15 | 16 | class UnableToGetLegacyQuestionID(FailedToParse): 17 | def __init__(self): 18 | self.message = 'Unable to get question legacy id' 19 | 20 | 21 | class FailedToParseAnswer(FailedToParse): 22 | def __init__(self): 23 | self.message = 'Failed to parse answer' 24 | 25 | 26 | class JsonParseError(Exception): 27 | ... 28 | 29 | 30 | class UnableToGetToken(FailedToParse): 31 | def __init__(self): 32 | self.message = 'Unable to get token' 33 | 34 | 35 | class UrlNotSupported(ValueError): 36 | def __init__(self, url): 37 | self.message = f'URL NOT SUPPORTED: {url}' 38 | 39 | 40 | class DeviceAllowedQuotaExceeded(Exception): 41 | def __init__(self): 42 | self.message = 'Device allowed quota exceeded' 43 | 44 | 45 | class CookieFileDoesNotExist(FileNotFoundError): 46 | def __init__(self, path): 47 | self.message = f'File does not exist: {path}' 48 | 49 | 50 | class BotFlagError(Exception): 51 | def __init__(self): 52 | self.message = 'The account is flagged as bot, open you chegg account with same browser where you get the cookies and fill the captcha' 53 | -------------------------------------------------------------------------------- /cheggscraper/Downloader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from importlib.resources import 
read_text 5 | 6 | from .CheggScraper import CheggScraper 7 | 8 | 9 | def main(): 10 | """ 11 | User Friendly Downloader for chegg homework help pages 12 | 13 | :return: Nothing 14 | :rtype: None 15 | """ 16 | conf = json.loads(read_text('cheggscraper', 'conf.json')) 17 | 18 | default_save_file_format = conf.get('default_save_file_format') 19 | default_cookie_file_path = conf.get('default_cookie_file_path') 20 | 21 | ap = argparse.ArgumentParser() 22 | ap.add_argument('-c', '--cookie', default=default_cookie_file_path, 23 | help='path of cookie life', dest='cookie_file') 24 | ap.add_argument('-u', '--url', help='url of chegg homework-help, put inside " "', 25 | type=str, dest='url') 26 | # FIXME: DIFF TAGS FOR FILE FORMAT AND BASE PATH 27 | ap.add_argument('-s', '--save', 28 | help='file path, where you want to save, put inside " " eg: test.html or' 29 | ' D:\\myFolder\\test.html or /home/test.html', 30 | type=str, default=default_save_file_format, dest='file_format') 31 | args = vars(ap.parse_args()) 32 | 33 | if not os.path.exists(path=args['cookie_file']): 34 | raise Exception(f'{args["cookie_file"]} does not exists') 35 | 36 | if not args.get('url'): 37 | args.update({'url': input('Enter url of the homework-help: ')}) 38 | 39 | Chegg = CheggScraper(cookie_path=args['cookie_file']) 40 | print(Chegg.url_to_html(args['url'], file_name_format=args['file_format'])) 41 | -------------------------------------------------------------------------------- /NOOB.md: -------------------------------------------------------------------------------- 1 | # Chegg-Scraper 2 | 3 | This scrapper can scrape through [Chegg.com](https://www.chegg.com) and create an html file to save the content locally. 4 | The repository can be used in Chegg-Scraping Bots or for downloading the webpage. 5 | 6 | ### How To Use ?? 
7 | First, download the latest release from [here](https://github.com/ThreeGiantNoobs/chegg-scraper/releases/latest) 8 | 9 | After unzipping, install the requirements by using 10 | 11 | pip install -r requirements.txt 12 | or 13 | 14 | pip3 install -r requirements.txt 15 | 16 | whichever works, depending on your system 17 | 18 | The above step is a one-time process; once done, move on to the next step. 19 | 20 | Now, in order to run the file, use: 21 | 22 | python Downloader.py 23 | 24 | There are 2 optional arguments 25 | 26 | --url or -u --> To enter the page url 27 | --cookie or -c --> To enter the cookies file path 28 | 29 | Cookies are supported in ``cookie.txt`` format. 30 | Copy the cookie from `document.cookie` in the Browser console, or you may use a browser extension like [EditThisCookie](https://chrome.google.com/webstore/detail/editthiscookie/fngmhnnpilhplaeedifhccceomclgfbg) 31 | 32 | Once the cookie file is made, you can add the cookie file to the project folder to avoid the `-c` argument, or use the `-c` argument to provide the cookie path to the downloader before each run.
33 | 34 | #### Example for Usage 35 | 36 | python Downloader.py -c path/to/the/cookie/file.txt -u "https://chegg.com/using-chegg-scraper" 37 | If the cookie is saved as cookie.txt or cookie.json in the project folder you can run the following as: 38 | 39 | python Downloader.py -u "https://chegg.com/using-chegg-scraper" 40 | 41 | For Help, use: 42 | 43 | python Downloader.py -h 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !conf.json 3 | !setup.py 4 | !*/ 5 | !cheggscraper/* 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # 138 | cookie.txt 139 | .idea 140 | cookie.json 141 | *.pdf 142 | *.html 143 | 144 | 145 | !cheggscraper/template.html 146 | !cheggscraper/chapter_type_frame.html 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NOTE 2 | ### The Original Developers are no longer in a position to maintain this project. But we would still like to keep the project alive, thus any open source contribution from the community is more than welcome. 3 | 4 | 5 |
6 | 7 | # Chegg-Scraper 8 | 9 | 10 | 11 | Download Chegg homework-help questions to html files; these html files are self-sufficient — you don't need account access to load them 12 | 13 |
14 | Details 15 |
  • 16 | All files are saved to html document. 17 |
  • 18 |
  • 19 | You will not need your chegg account to open these files later. 20 |
  • 21 |
    22 | 23 | 24 | 25 | ## USE-CASES 26 | 27 | 28 | *
    29 | In Bots 30 | 31 | You can share your Chegg subscription with your friends, e.g. by making a Discord bot 32 | 33 |
    34 | 35 | * Saving Chegg Questions Locally 36 | 37 | 38 | ## Setup: 39 | 40 | * Download [latest release](https://github.com/ThreeGiantNoobs/chegg-scraper/releases/latest) 41 | 42 | * Install requirements 43 | `pip install -r requirements.txt` 44 | 45 | * Save your cookie in file cookie.txt (preferably) 46 | 47 |
    48 | Using Browser Console 49 | 52 | 56 | 59 |
    60 | 61 | ​ Or 62 | 63 |
    64 | Using Chrome Extenstion 65 | 68 | 71 | 74 |
    75 | 76 | * You may also need to change user-agent 77 | 78 | * Open conf.json and edit user_agent 79 | 80 | * Find your browser user agent 81 | 82 | * Open [What's My User Agent](https://whatmyuseragent.com/) 83 | 84 | Or 85 | 86 | * Open Browser console and run 87 | 88 | ``console.log(navigator.userAgent)`` 89 | 90 | 91 | 92 | ## Usage: 93 | 94 | * If you are new to python go [here](NOOB.md) 95 | 96 | * Run the `Downloader.py` Script 97 | 98 | ```console 99 | $ python Downloader.py 100 | 101 | Enter url of the homework-help: 102 | ``` 103 | 104 | * Arguments 105 | 106 | ``` 107 | ALL ARGUMENTS ARE OPTIONAL 108 | -u or --url > URL of Chegg 109 | -c or --cookie > Path of Cookie file (Defualt: cookie.txt) 110 | -s or --save > file path, where you want to save, put inside " " 111 | ``` 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /cheggscraper/chapter_type_frame.html: -------------------------------------------------------------------------------- 1 |
    2 | 3 |
    4 |
    5 | 6 | 8 |

    Chapter {{chapterName}}, Problem {{problemName}}

    9 |
    10 |
    12 | 14 |
    15 | 17 |
    18 |
    19 |
    20 |
    21 |
    22 | 24 |

    Problem

    25 |
    26 |
    27 |
    28 | {{problemHtml}} 29 |
    30 |
    31 |
    32 |
    33 |

    Step-by-step solution

    34 |
      35 | {% for step in steps %} 36 |
    1. 37 |
      38 |
      39 |

      43 | Step {{loop.index}} of {{totalSteps}}

      44 |
      45 |
      46 | {{step.html}} 47 |
      48 |
      49 |
      50 |
    2. 51 | {% endfor %} 52 |
    53 | 54 |
    55 |
    56 | 59 |
    60 | Loading content 62 |
    63 |
    -------------------------------------------------------------------------------- /cheggscraper/CheggScraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import random 5 | import re 6 | import string 7 | from typing import Union, Optional 8 | 9 | import unicodedata 10 | from importlib.resources import read_text 11 | 12 | import requests 13 | from requests import Response 14 | from bs4 import BeautifulSoup 15 | from bs4.element import Tag 16 | from jinja2 import Environment, BaseLoader 17 | 18 | from .Exceptions import * 19 | 20 | logging.basicConfig(filename='scraper.log', filemode='w', level=logging.DEBUG) 21 | 22 | main_template = Environment(loader=BaseLoader).from_string(read_text('cheggscraper', 'template.html')) 23 | chapter_type_template = Environment(loader=BaseLoader).from_string(read_text('cheggscraper', 'chapter_type_frame.html')) 24 | 25 | 26 | class CheggScraper: 27 | """ 28 | Scrape html from chegg.com and store them in a way so you don't need cookie to view the file 29 | """ 30 | 31 | def __init__(self, cookie: str = None, cookie_path: str = None, user_agent: str = None, base_path: str = None, 32 | save_file_format: str = None, config: dict = None, template_path: str = None, 33 | extra_header_tag: str = None): 34 | 35 | self.base_path = base_path 36 | 37 | self.save_file_format = save_file_format 38 | 39 | if self.base_path: 40 | if not os.path.exists(self.base_path): 41 | os.makedirs(self.base_path) 42 | 43 | if not self.base_path: 44 | self.base_path = '' 45 | 46 | self.extra_header_tag = extra_header_tag 47 | 48 | if cookie: 49 | self.cookie = cookie 50 | else: 51 | self.cookie = self.parse_cookie(cookie_path) 52 | 53 | self.cookie_dict = self.cookie_str_to_dict(self.cookie) 54 | 55 | self.template_path = template_path 56 | 57 | if not config: 58 | config = json.loads(read_text('cheggscraper', 'conf.json')) 59 | 60 | if not user_agent: 61 | user_agent = 
config.get('user_agent') 62 | if not user_agent: 63 | raise Exception('user_agent not defined') 64 | 65 | logging.debug(msg=f'user_agent: {user_agent}') 66 | 67 | self.user_agent = user_agent 68 | 69 | self.headers = { 70 | 'authority': 'www.chegg.com', 71 | # 'cache-control': 'max-age=0', 72 | "Accept-Encoding": "gzip, deflate, br", 73 | 'accept-language': 'en-US,en;q=0.9', 74 | 'cookie': self.cookie, 75 | 'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"', 76 | 'sec-ch-ua-mobile': '?0', 77 | 'upgrade-insecure-requests': '1', 78 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 79 | 'sec-fetch-site': 'cross-site', 80 | 'sec-fetch-mode': 'navigate', 81 | 'sec-fetch-user': '?1', 82 | 'sec-fetch-dest': 'document', 83 | 'user-agent': self.user_agent, 84 | } 85 | 86 | self.ajax_url = 'https://www.chegg.com/study/_ajax/enhancedcontent?token={token}&questionUuid={question_uuid}&showOnboarding=&templateName=ENHANCED_CONTENT_V2&deviceFingerPrintId={deviceFingerPrintId}' 87 | 88 | logging.debug(f'self.cookie = {self.cookie}') 89 | 90 | self.deviceFingerPrintId = self.cookie_dict.get('DFID') 91 | 92 | @staticmethod 93 | def slugify(value: str, allow_unicode: bool = False) -> str: 94 | """ 95 | slugify the names of files 96 | 97 | :param value: string to be slugify 98 | :type value: str 99 | :param allow_unicode: allow unicode 100 | :type allow_unicode: bool 101 | :return: string after slugify 102 | :rtype: str 103 | """ 104 | value = str(value) 105 | if allow_unicode: 106 | value = unicodedata.normalize('NFKC', value) 107 | else: 108 | value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') 109 | value = re.sub(r'[^\w\s-]', '', value.lower()) 110 | return re.sub(r'[-\s]+', '-', value).strip('-_') 111 | 112 | @staticmethod 113 | def render_chapter_type_html(data: dict) -> str: 114 | """ 115 | Render chapter type 
answers using data 116 | 117 | :param data: response from graphql url 118 | :type data: dict 119 | :return: rendered html code 120 | :rtype: str 121 | """ 122 | chapter_d = data['data']['textbook_solution']['chapter'][0] 123 | problem_d = chapter_d['problems'][0] 124 | solutionV2 = problem_d['solutionV2'][0] 125 | 126 | _data = { 127 | 'chapterName': chapter_d['chapterName'], 128 | 'problemName': problem_d['problemName'], 129 | 'problemHtml': problem_d['problemHtml'], 130 | 'totalSteps': solutionV2['totalSteps'], 131 | 'steps': solutionV2['steps'], 132 | } 133 | 134 | return chapter_type_template.render(**_data) 135 | 136 | @staticmethod 137 | def replace_src_links(html_text: str) -> str: 138 | """ 139 | Replace relative links from page, so even you are opening file without any host, still can see all contents, 140 | still some css and js won't load 141 | 142 | :param html_text: html code of page 143 | :type html_text: str 144 | :return: html code after modify all relative links 145 | :rtype: str 146 | """ 147 | return re.sub(r'src=\s*?"//(.*)?"', r'src="https://\1"', html_text) 148 | 149 | @staticmethod 150 | def cookie_str_to_dict(cookie_str: str) -> dict: 151 | """ 152 | Convert cookie str to dict of key, value pairs 153 | 154 | :param cookie_str: cookie in format of string [key=value; key=value] 155 | :type cookie_str: str 156 | :return: dictionary of key value pairs of key value pairs 157 | :rtype: dict 158 | """ 159 | ret = {} 160 | cookie_pairs = cookie_str.split(';') 161 | for pair in cookie_pairs: 162 | key, value = pair.split('=', 1) 163 | key = key.strip() 164 | value = value.strip() 165 | ret.update({key: value}) 166 | return ret 167 | 168 | @staticmethod 169 | def parse_json(json_string: str) -> dict: 170 | """ 171 | just parse json 172 | 173 | :param json_string: json data in format of string 174 | :type json_string: str 175 | :return: dict 176 | :rtype: dict 177 | """ 178 | try: 179 | data = json.loads(json_string) 180 | return data 181 | except 
Exception as e: 182 | logging.debug(msg=f'::while parsing json: {e}') 183 | raise JsonParseError 184 | 185 | @staticmethod 186 | def dict_to_cookie_str(cookie_dict: dict) -> str: 187 | """ 188 | Convert dict to cookie string 189 | 190 | :param cookie_dict: dictionary of cookie, key value pairs 191 | :type cookie_dict: dict 192 | :return: cookie in string format 193 | :rtype: str 194 | """ 195 | cookie_str = '' 196 | first_flag = True 197 | for cookie in cookie_dict: 198 | if not first_flag: 199 | cookie_str += '; ' 200 | cookie_str += '{name}={value}'.format(**cookie) 201 | first_flag = False 202 | return cookie_str 203 | 204 | @staticmethod 205 | def parse_cookie(cookie_path: str) -> str: 206 | """ 207 | Parse cookie from cookie_path 208 | 209 | :param cookie_path: path of cookie file 210 | :type cookie_path: str 211 | :return: string cookie 212 | :rtype: str 213 | """ 214 | if os.path.exists(cookie_path): 215 | if os.path.isfile(cookie_path): 216 | with open(cookie_path, 'r') as f: 217 | cookie_text = f.read() 218 | try: 219 | json_result = CheggScraper.parse_json(cookie_text) 220 | logging.debug(f'::cookie_path: {cookie_path} is json file') 221 | return CheggScraper.dict_to_cookie_str(json_result).strip() 222 | except JsonParseError: 223 | logging.debug(f'::cookie_path: {cookie_path} is not json file') 224 | return cookie_text.strip() 225 | else: 226 | logging.error(msg=f"{cookie_path} is not a file") 227 | else: 228 | logging.error(msg=f"{cookie_path} don't exist") 229 | raise CookieFileDoesNotExist(cookie_path) 230 | 231 | @staticmethod 232 | def clean_url(url: str) -> (bool, Optional[int], str): 233 | """ 234 | Cleans the url, So no track id goes to url 235 | """ 236 | # https://www.chegg.com/homework-help/questions-and-answers/question--choose-random-questions-answer-possible-least-5-questions--thank-q8125333 237 | chapter_type = False 238 | q_id = None 239 | match = re.search(r'chegg\.com/homework-help/questions-and-answers/([^ ?/\n]+)-q(\d+)', url) 240 | 
if not match: 241 | chapter_type = True 242 | match = re.search(r'chegg\.com/homework-help/[^?/]+', url) 243 | if not match: 244 | logging.error(f'THIS URL NOT SUPPORTED\nurl: {url}') 245 | raise UrlNotSupported(url) 246 | else: 247 | q_id = int(match.group(2)) 248 | 249 | return chapter_type, q_id, 'https://www.' + match.group(0) 250 | 251 | @staticmethod 252 | def final_touch(html_text: str) -> str: 253 | """ 254 | Final changes to final html code, like changing class of some divs 255 | """ 256 | soup = BeautifulSoup(html_text, 'lxml') 257 | if soup.find('div', {'id': 'show-more'}): 258 | soup.find('div', {'id': 'show-more'}).decompose() 259 | if soup.find('section', {'id': 'general-guidance'}): 260 | soup.find('section', {'id': 'general-guidance'})['class'] = 'viewable visible' 261 | 262 | return str(soup) 263 | 264 | def _web_response(self, url: str, headers: dict = None, extra_headers: dict = None, expected_status: tuple = (200,), 265 | note: str = None, error_note: str = "Error in request", post: bool = False, data: dict = None, 266 | _json=None, raise_exception=False) -> Response: 267 | """ 268 | Returns response from web 269 | """ 270 | 271 | if not headers: 272 | headers = self.headers 273 | if extra_headers: 274 | headers.update(extra_headers) 275 | if post: 276 | response = requests.post( 277 | url=url, 278 | headers=headers, 279 | json=_json, 280 | data=data 281 | ) 282 | else: 283 | response = requests.get( 284 | url=url, 285 | headers=headers) 286 | 287 | if response.status_code not in expected_status: 288 | logging.error(msg=f'Expected status codes {expected_status} but got {response.status_code}\n{error_note}') 289 | if raise_exception: 290 | raise UnexpectedStatusCode(response.status_code) 291 | return response 292 | if note: 293 | logging.info(msg=note) 294 | return response 295 | 296 | def _get_response_text(self, url: str, headers: dict = None, extra_headers: dict = None, 297 | expected_status: tuple = (200,), note: str = None, 298 | error_note: 
str = "Error in request", raise_exception=False) -> str: 299 | """ 300 | text response from web 301 | 302 | :return: Text response from web 303 | :rtype: str 304 | """ 305 | logging.debug(msg=f'::getting response from url: {url}') 306 | response = self._web_response(url=url, headers=headers, extra_headers=extra_headers, 307 | expected_status=expected_status, note=note, 308 | error_note=error_note, raise_exception=raise_exception) 309 | logging.info(msg=f'::response status code: {response.status_code}') 310 | if response.status_code not in expected_status: 311 | raise Exception(f'Expected status code {expected_status} but got {response.status_code}\n{error_note}') 312 | return response.text 313 | 314 | def _get_response_dict(self, url: str, headers: dict = None, extra_headers: dict = None, 315 | expected_status: tuple = (200,), note: str = None, error_note: str = "Error in request", 316 | post: bool = False, data: dict = None, _json=None, raise_exception=False) -> dict: 317 | """ 318 | dict response from web 319 | 320 | :return: json response from web 321 | :rtype: dict 322 | """ 323 | logging.info(msg=f'::getting response from url: {url}') 324 | response = self._web_response(url=url, headers=headers, extra_headers=extra_headers, 325 | expected_status=expected_status, note=note, error_note=error_note, post=post, 326 | data=data, _json=_json, raise_exception=raise_exception) 327 | logging.info(msg=f'::response status code: {response.status_code}') 328 | logging.debug(msg=f'::response text: {response.text}') 329 | return self.parse_json(response.text) 330 | 331 | @staticmethod 332 | def _parse_heading(soup: BeautifulSoup) -> str: 333 | """ 334 | Parse heading from html 335 | 336 | @param soup: BeautifulSoup from chegg_html 337 | @type soup: BeautifulSoup 338 | @return: heading of the question page 339 | @rtype: str 340 | """ 341 | heading = None 342 | heading_data = soup.find('script', id='__NEXT_DATA__') 343 | if heading_data: 344 | heading_data = heading_data.text 
345 | heading = json.loads(heading_data)['query']['qnaSlug'] 346 | if not heading: 347 | title = soup.find('title') 348 | if title: 349 | heading = title.text 350 | 351 | if not heading: 352 | logging.error(msg="can't able to get heading") 353 | else: 354 | logging.info(msg=f"Heading: {heading}") 355 | return str(heading) 356 | 357 | def _get_non_chapter_type_data(self, legacy_id: int, auth_token: str) -> dict: 358 | """ 359 | Get non chapter type quetion and answer data from chegg api 360 | """ 361 | logging.info(msg="Getting non chapter type data, legacy_id: {}".format(legacy_id)) 362 | 363 | query = { 364 | "operationName": "QnaPageQuestionByLegacyId", 365 | "variables": { 366 | "id": legacy_id 367 | }, 368 | "extensions": { 369 | "persistedQuery": { 370 | "version": 1, 371 | "sha256Hash": "26efed323ef07d1759f67adadd2832ac85d7046b7eca681fe224d7824bab0928" 372 | } 373 | } 374 | } 375 | graphql_url = 'https://gateway.chegg.com/one-graph/graphql' 376 | 377 | extra_headers = { 378 | 'authorization': f'Basic {auth_token}', 379 | 'content-type': 'application/json', 380 | 'apollographql-client-name': 'chegg-web', 381 | 'apollographql-client-version': 'main-127d14c8-2503803178' 382 | } 383 | 384 | data = self._get_response_dict(url=graphql_url, post=True, _json=query, extra_headers=extra_headers) 385 | try: 386 | if data['errors']: 387 | logging.error(msg=f"Error in getting non chapter type data, legacy_id: {legacy_id}") 388 | logging.error(msg=f"Error: {data['errors']['message']}") 389 | if (restrictions := data['errors']['message'].get('extensions', {}).get('metadata', {}).get( 390 | 'accessRestrictions')) and 'DEVICE_ALLOWED_QUOTA_EXCEEDED' in restrictions: 391 | raise DeviceAllowedQuotaExceeded 392 | except KeyError: 393 | # No errors found 394 | pass 395 | 396 | return data 397 | 398 | def _get_chapter_type_data(self, token: str, html_text: str) -> dict: 399 | chapter_id = str(re.search(r'\?id=(\d+).*?isbn', html_text).group(1)) 400 | isbn13 = 
str(re.search(r'"isbn13":"(\d+)"', html_text).group(1)) 401 | problemId = str(re.search(r'"problemId":"(\d+)"', html_text).group(1)) 402 | 403 | query = { 404 | "query": { 405 | "operationName": "getSolutionDetails", 406 | "variables": { 407 | "isbn13": isbn13, 408 | "chapterId": chapter_id, 409 | "problemId": problemId 410 | } 411 | }, 412 | "token": token 413 | } 414 | graphql_url = 'https://www.chegg.com/study/_ajax/persistquerygraphql' 415 | res_data = self._get_response_dict(url=graphql_url, post=True, _json=query) 416 | return res_data 417 | 418 | def _parse_question_answer(self, legacy_id: Optional[int], html_text: str, chapter_type: bool, token: Optional[str], 419 | auth_token: str): 420 | """ 421 | Parse Question and Answers 422 | """ 423 | if not chapter_type: 424 | data = self._get_non_chapter_type_data(legacy_id=legacy_id, auth_token=auth_token) 425 | question_div = data['data']['questionByLegacyId']['content']['body'] 426 | answer_divs = [f"
    {answers_['answerData']['html']}
    " for 427 | answers_ in 428 | data['data']['questionByLegacyId']['htmlAnswers']] 429 | return question_div, '" 430 | else: 431 | return '
    ', self.render_chapter_type_html( 432 | self._get_chapter_type_data(token=token, html_text=html_text) 433 | ) 434 | 435 | def _parse(self, html_text: str, token: Optional[str], q_id: Optional[int], auth_token: str, 436 | chapter_type: bool = None) -> (str, str, str, str): 437 | html_text = self.replace_src_links(html_text) 438 | soup = BeautifulSoup(html_text, 'lxml') 439 | logging.debug("HTML\n\n" + html_text + "HTML\n\n") 440 | 441 | if soup.find('div', id='px-captcha'): 442 | raise BotFlagError 443 | 444 | """Parse headers""" 445 | headers = soup.find('head') 446 | 447 | """Parse heading""" 448 | heading = self._parse_heading(soup) 449 | 450 | """Parse Question""" 451 | if not chapter_type: 452 | if not q_id: 453 | raise UnableToGetLegacyQuestionID 454 | 455 | question_div, answers_div = self._parse_question_answer( 456 | legacy_id=q_id, html_text=html_text, chapter_type=chapter_type, token=token, auth_token=auth_token 457 | ) 458 | 459 | return str(headers), heading, self.replace_src_links(question_div), self.replace_src_links(answers_div) 460 | 461 | def _save_html_file(self, rendered_html: str, heading: str = None, question_uuid: str = None, 462 | file_name_format: str = None): 463 | heading = self.slugify(heading.strip('.').strip()) 464 | if not file_name_format: 465 | file_name_format = self.save_file_format 466 | if not file_name_format: 467 | file_name_format = heading + '.html' 468 | 469 | file_path = os.path.join( 470 | self.base_path, 471 | file_name_format) 472 | 473 | file_path = file_path.format(**{ 474 | 'random_u_str_int': ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)), 475 | 'random_u_str': ''.join(random.choices(string.ascii_uppercase, k=10)), 476 | 'random_str': ''.join(random.choices(string.ascii_letters, k=10)), 477 | 'random_int': ''.join(random.choices(string.digits, k=10)), 478 | 'heading': heading, 479 | 'title': heading, 480 | 'question_uuid': question_uuid 481 | }) 482 | 483 | # if self.save_file_format: 484 | 
# file_path = os.path.join( 485 | # file_path, 486 | # self.save_file_format) 487 | 488 | with open(file_path, 'w', encoding='utf-8') as f: 489 | f.write(rendered_html) 490 | 491 | return file_path 492 | 493 | def _render_html(self, url, headers, heading, question_div, answers__): 494 | html_rendered_text = main_template.render( 495 | url=url, 496 | headers=headers, 497 | title=heading, 498 | heading=heading, 499 | question_body=question_div, 500 | answers_wrap=answers__, 501 | extra_header_tag=self.extra_header_tag, 502 | ) 503 | 504 | return self.final_touch(html_text=html_rendered_text) 505 | 506 | def url_to_html(self, url: str, file_name_format: str = None, get_dict_info: bool = False): 507 | """ 508 | Chegg url to html file, saves the file and return file path 509 | 510 | @param url: chegg url 511 | @type url: str 512 | @param get_dict_info: 513 | @type get_dict_info: 514 | @param file_name_format: File path to save file 515 | @type file_name_format: str 516 | @return: file_path 517 | @rtype: 518 | """ 519 | chapter_type, q_id, url = self.clean_url(url) 520 | 521 | html_res_text = self._get_response_text(url=url) 522 | try: 523 | token = re.search(r'"token":"(.+?)"', html_res_text).group(1) 524 | except AttributeError: 525 | token = None 526 | 527 | if chapter_type and not token: 528 | raise UnableToGetToken 529 | 530 | # static 531 | auth_token = "TnNZS3dJMGxMdVhBQWQwenFTMHFlak5UVXAwb1l1WDY6R09JZVdFRnVvNndRRFZ4Ug==" 532 | 533 | headers, heading, question_div, answers__ = self._parse( 534 | html_text=html_res_text, 535 | q_id=q_id, 536 | chapter_type=chapter_type, 537 | token=token, 538 | auth_token=auth_token 539 | ) 540 | 541 | rendered_html = self._render_html(url, headers, heading, question_div, answers__) 542 | 543 | file_path = self._save_html_file(rendered_html, heading, None, file_name_format) 544 | 545 | if get_dict_info: 546 | return file_path, url, headers, heading, question_div, answers__ 547 | return file_path 548 | 
-------------------------------------------------------------------------------- /cheggscraper/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{heading}} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
    26 |
    27 |
    28 |
    29 | 30 | 31 |
    32 | Home 33 |
    34 | Home 36 | Books 37 |
    38 |
    39 |
    STUDY
    40 | Textbook 41 | Solutions 42 | Q&A 44 |
    45 |
    46 |
    47 |
    CAREER CENTER
    48 | Careers 50 | Internships 52 |
    53 |
    54 | Colleges 55 | Scholarships 57 |
    58 |
    59 |
    60 |
    61 | 62 |
    63 |
    64 | 65 |
    67 |
    68 | 75 |
    76 |
    77 |
    78 | {{question_body}} 79 |
    80 | 81 |
    82 |
    83 |
    Comments
    84 |
      85 |
      86 |
      87 |
      88 | 89 |
      90 |
      91 |
      92 |

      Expert Answer

      93 | 94 | 95 |
      96 |
      97 | {{answers_wrap}} 98 |
      99 |
      100 | 101 |
      102 |
      103 | 104 |
      105 |
      106 |
      107 | 108 | 109 |
      110 |
      111 |
      112 |
      113 |
      114 |
      115 | 116 | 117 | --------------------------------------------------------------------------------