├── src ├── __init__.py ├── objects.py ├── io.py ├── parsing.py ├── http.py └── core.py ├── .gitignore ├── requirements.txt ├── .idea ├── vcs.xml ├── .gitignore ├── inspectionProfiles │ ├── profiles_settings.xml │ └── Project_Default.xml ├── discord.xml ├── fit-vut-wis-project-dumper.iml ├── modules.xml └── misc.xml ├── Pipfile ├── README.md ├── main.py ├── LICENSE └── Pipfile.lock /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/.DS_Store 3 | wis_projects 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | bs4 3 | html5lib 4 | unidecode 5 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/discord.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = "*" 8 | bs4 = "*" 9 | html5lib = "*" 10 | unidecode = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /.idea/fit-vut-wis-project-dumper.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FIT VUT WIS Project Dumper 2 | 3 | Requirements: 4 | - `Python >=3.9` 5 | - (Optional) `pipenv` 6 | 7 | ### Installation 8 | ``` 9 | pipenv install 10 | ``` 11 | 12 | or 13 | 14 | ```sh 15 | pip3 install -r requirements.txt 16 | ``` 17 | 18 | ### Usage 19 | ``` 20 | pipenv run python main.py [-v[v]] [-o|--output OUTPUT_DIR] 21 | ``` 22 | 23 | or 24 | 25 | ```sh 26 | python3.9 main.py [-v[v]] [-o|--output OUTPUT_DIR] 27 | ``` 28 | -------------------------------------------------------------------------------- /src/objects.py: -------------------------------------------------------------------------------- 1 | 2 | class Course: 3 | 4 | def __init__(self, course_abbr: str, course_link: str): 5 | self.abbr = course_abbr 6 | self.link = course_link 7 | 8 | 9 | class CourseTask: 10 | 11 | def __init__(self, task_name: str, task_link: str, course: Course): 12 | self.name = task_name 13 | self.link = task_link 14 | self.course = course 15 | 16 | 17 | class TaskFile: 18 | 19 | def __init__(self, file_name: str, file_year: str, file_link: str, task: CourseTask): 20 | self.name = file_name 21 | self.year = file_year 22 | self.link = file_link 23 | self.task = task 24 | -------------------------------------------------------------------------------- /src/io.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import sys 4 | from getpass import getpass 5 | from gettext import gettext 6 | 7 | log = logging.getLogger("io") 8 | 9 | 10 | def get_user_credentials() -> (str, str): 11 | log.debug("loading user credentials") 12 | 13 | username, password = "", "" 14 | while re.fullmatch(r"^x[a-z]{5}[a-z\d]{2}$", username) is None: 15 | username = input("WIS username: ") 16 | 17 | while len(password) < 10: 18 | password = getpass("WIS password: ", sys.stdout) 19 | 20 | log.debug("credentials successfully obtained: username=%s, password=%s", username, password) 21 | return username, password 22 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os.path 4 | from pathlib import Path 5 | 6 | from src.core import Downloader 7 | 8 | log = logging.getLogger("main") 9 | 10 | 11 | def main(output_dir: Path): 12 | Downloader(output_dir).run() 13 | 14 | 15 | if __name__ == '__main__': 16 | default_dir = f"{os.getcwd()}/wis_projects" 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-o", "--output", action="store", dest="output_dir", metavar="OUTPUT_DIR", default=default_dir) 20 | parser.add_argument("-v", action="count", dest="version", default=0) 21 | args = parser.parse_args() 22 | 23 | logging.basicConfig( 24 | format="%(asctime)s %(levelname)-8s [%(name)s]: %(message)s", 25 | level=logging.WARNING - args.version * 10, 26 | ) 27 | 28 | output_directory = Path(args.output_dir) 29 | main(output_directory) 30 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Daniel Dolejška 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/parsing.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Optional 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | 6 | class Parser: 7 | 8 | def __init__(self, page_content: str): 9 | self.content = BeautifulSoup(page_content, "html5lib") 10 | 11 | 12 | class StudyParser(Parser): 13 | 14 | def get_course_names_and_links(self) -> Generator[tuple[str, str], None, None]: 15 | course_entries = self.content.select(".content > .table-holder tr[align='center'][valign='top']") 16 | for entry in course_entries: 17 | yield entry.find("th").text, entry.select_one("a.bar")["href"] 18 | 19 | 20 | class CourseParser(Parser): 21 | 22 | def get_task_names_and_links(self) -> Generator[tuple[str, str], None, None]: 23 | task_links = self.content.select(".content > form > .table-holder a.bar") 24 | for link in task_links: 25 | yield link.text, link["href"] 26 | 27 | 28 | class TaskParser(Parser): 29 | 30 | def try_get_files_link(self) -> Optional[str]: 31 | task_links = self.content.select(".content > p > a") 32 | for link in task_links: 33 | link_target = link["href"] 34 | if "course-sf.php" in link_target: 35 | return link_target 36 | 37 | return None 38 | 39 | 40 | class TaskFilesParser(Parser): 41 | 42 | def get_file_names_and_links(self) -> Generator[tuple[str, str, str], None, None]: 43 | year = self.content.find("h1").text.rsplit("/", maxsplit=1)[-1] 44 | file_links = self.content.select(".content > form > table tr[valign='middle'] > td > a") 45 | for link in file_links: 46 | yield link.text, year, link["href"] 47 | 48 | return None 49 | -------------------------------------------------------------------------------- /src/http.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import shutil 3 | from pathlib import Path 4 | 5 | from requests import Session as HTTPSession, Response 6 | from requests.auth import HTTPBasicAuth 7 | 8 | log = logging.getLogger("http") 9 | 10 | 11 | class Connection: 12 | 13 | def __init__(self, username: str, password: str): 14 | self.session = HTTPSession() 15 | self.session.auth = HTTPBasicAuth(username, password) 16 | 17 | def _get_url(self, url: str) -> str: 18 | if not url.startswith("http"): 19 | if url.startswith("/FIT/st/"): 20 | return f"https://wis.fit.vutbr.cz{url}" 21 | elif url.startswith("/st/"): 22 | return f"https://wis.fit.vutbr.cz/FIT{url}" 23 | else: 24 | return f"https://wis.fit.vutbr.cz/FIT/st/{url}" 25 | 26 | return url 27 | 28 | def _get(self, url: str, params: dict[str, str], stream: bool = False) -> Response: 29 | """ Makes an HTTP GET request to the given URL and ensures the response is valid. """ 30 | url = self._get_url(url) 31 | response = self.session.get(url, params=params, stream=stream) 32 | response.raise_for_status() 33 | 34 | return response 35 | 36 | def get_content(self, url: str, **params: [str, int]) -> str: 37 | """ Loads and returns contents of the given webpage. """ 38 | with self._get(url, params) as response: 39 | return response.content.decode("iso-8859-2") 40 | 41 | def get_studies_page(self) -> str: 42 | return self.get_content("study-s.php.cs") 43 | 44 | def get_courses_page(self, study: int = 1) -> str: 45 | assert study > 0 46 | return self.get_content("study-a.php.cs", cist=study) 47 | 48 | def download_file(self, link: str, filepath: Path): 49 | with self._get(link, {}, stream=True) as response: 50 | log.debug("downloading %s -> %s", link, filepath.as_posix()) 51 | with filepath.open("wb") as file: 52 | for chunk in response.iter_content(chunk_size=128 * 1024): 53 | if chunk: 54 | file.write(chunk) 55 | 56 | log.info("successfully downloaded %s", filepath.as_posix()) 57 | 58 | def close(self): 59 | self.session.close() 60 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "247ae7f4e1b509c78c58bf44d4a6a8d817a1ef13721d2262af66ba600db82585" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.9" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "beautifulsoup4": { 20 | "hashes": [ 21 | "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", 22 | "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" 23 | ], 24 | "markers": "python_full_version >= '3.6.0'", 25 | "version": "==4.11.1" 26 | }, 27 | "bs4": { 28 | "hashes": [ 29 | "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" 30 | ], 31 | "index": "pypi", 32 | "version": "==0.0.1" 33 | }, 34 | "certifi": { 35 | "hashes": [ 36 | "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", 37 | "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" 38 | ], 39 | "markers": "python_full_version >= '3.6.0'", 40 | "version": "==2022.5.18.1" 41 | }, 42 | "charset-normalizer": { 43 | "hashes": [ 44 | "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", 45 | "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" 46 | ], 47 | "markers": "python_version >= '3'", 48 | "version": "==2.0.12" 49 | }, 50 | "html5lib": { 51 | "hashes": [ 52 | "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d", 53 | "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f" 54 | ], 55 | "index": "pypi", 56 | "version": "==1.1" 57 | }, 58 | "idna": { 59 | "hashes": [ 60 | "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", 61 | "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" 62 | ], 63 | "markers": "python_version >= '3'", 64 | "version": "==3.3" 65 | }, 66 | "requests": { 67 | "hashes": [ 68 | "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", 69 | "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" 70 | ], 71 | "index": "pypi", 72 | "version": "==2.27.1" 73 | }, 74 | "six": { 75 | "hashes": [ 76 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 77 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 78 | ], 79 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 80 | "version": "==1.16.0" 81 | }, 82 | "soupsieve": { 83 | "hashes": [ 84 | "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", 85 | "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" 86 | ], 87 | "markers": "python_full_version >= '3.6.0'", 88 | "version": "==2.3.2.post1" 89 | }, 90 | "urllib3": { 91 | "hashes": [ 92 | "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", 93 | "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" 94 | ], 95 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", 96 | "version": "==1.26.9" 97 | }, 98 | "webencodings": { 99 | "hashes": [ 100 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 101 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 102 | ], 103 | "version": "==0.5.1" 104 | } 105 | }, 106 | "develop": {} 107 | } 108 | -------------------------------------------------------------------------------- /src/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from typing import Generator, Optional 5 | from unidecode import unidecode 6 | 7 | from requests import HTTPError 8 | 9 | from .http import Connection 10 | from .io import get_user_credentials 11 | from .objects import Course, CourseTask, TaskFile 12 | from .parsing import StudyParser, CourseParser, TaskParser, TaskFilesParser 13 | 14 | log = logging.getLogger("core") 15 | 16 | 17 | class Downloader: 18 | 19 | connection: Connection = None 20 | 21 | def __init__(self, output_dir: Path): 22 | self.output_dir = output_dir 23 | 24 | def _get_courses(self, study_id: int) -> Generator[Course, None, None]: 25 | assert self.connection is not None 26 | page_content = self.connection.get_courses_page(study_id) 27 | for course_abbr, course_link in StudyParser(page_content).get_course_names_and_links(): 28 | yield Course(course_abbr, course_link) 29 | 30 | def _get_course_tasks(self, course: Course) -> Generator[CourseTask, None, None]: 31 | assert self.connection is not None 32 | page_content = self.connection.get_content(course.link) 33 | for task_name, task_link in CourseParser(page_content).get_task_names_and_links(): 34 | yield CourseTask(task_name, task_link, course) 35 | 36 | def _try_get_course_task_files_link(self, course_task_link: str) -> Optional[str]: 37 | assert self.connection is not None 38 | page_content = self.connection.get_content(course_task_link) 39 | return TaskParser(page_content).try_get_files_link() 40 | 41 | def _get_course_task_files(self, course_files_link: str, task: CourseTask) -> Generator[TaskFile, None, None]: 42 | assert self.connection is not None 43 | page_content = self.connection.get_content(course_files_link) 44 | for file_name, file_year, file_link in TaskFilesParser(page_content).get_file_names_and_links(): 45 | yield TaskFile(file_name, file_year, file_link, task) 46 | 47 | def _explore_course(self, course: Course): 48 | log.info("exploring course %s", course.abbr) 49 | files_downloaded = 0 50 | for task in self._get_course_tasks(course): 51 | files_downloaded += self._download_files_from_course_task(task) 52 | 53 | if not files_downloaded: 54 | log.info("found no project files in %s", course.abbr) 55 | else: 56 | log.debug("found and downloaded %d file(s) from %s", files_downloaded, course.abbr) 57 | 58 | def _download_files_from_course_task(self, task: CourseTask) -> int: 59 | if (course_files_link := self._try_get_course_task_files_link(task.link)) is None: 60 | log.debug("course task '%s' in %s does not contain any downloadable files", task.name, task.course.abbr) 61 | return 0 62 | 63 | files_found = 0 64 | task_name = unidecode(re.sub(r'[<>:\?\*\\/]', '', task.name)) 65 | destination_dir = self.output_dir.joinpath(f'{unidecode(task.course.abbr)}/{task_name}') 66 | for file in self._get_course_task_files(course_files_link, task): 67 | log.debug("found file '%s', downloading...", file.name) 68 | file_destination_dir = destination_dir.joinpath(file.year) 69 | file_destination_dir.mkdir(parents=True, exist_ok=True) 70 | destination_path = file_destination_dir.joinpath(file.name) 71 | self.connection.download_file(file.link, destination_path) 72 | files_found += 1 73 | 74 | if not files_found: 75 | log.info("found no project files in %s/%s, none submitted maybe? :(", task.course.abbr, task.name) 76 | 77 | return files_found 78 | 79 | def _prepare_output(self): 80 | try: 81 | self.output_dir.mkdir() 82 | except FileExistsError: 83 | log.critical("The output directory '%s' already exists.", self.output_dir.as_posix()) 84 | exit(1) 85 | 86 | def _setup_connection(self): 87 | while True: 88 | credentials = get_user_credentials() 89 | self.connection = Connection(*credentials) 90 | 91 | try: 92 | self.connection.get_studies_page() 93 | 94 | except HTTPError as ex: 95 | if ex.response.status_code == 401: 96 | log.error("authentication has failed - the provided credentials were not correct, wanna try again?") 97 | continue 98 | 99 | log.error("connection/authentication to WIS failed", exc_info=ex) 100 | exit(2) 101 | 102 | break 103 | 104 | def run(self): 105 | log.info("starting the mighty project downloader") 106 | 107 | self._setup_connection() 108 | self._prepare_output() 109 | self._explore_studies() 110 | 111 | self.connection.close() 112 | 113 | def _explore_studies(self): 114 | previous_had_courses = True 115 | for study_id in range(1, 6): 116 | log.info("exploring courses in study %d", study_id) 117 | 118 | has_courses = False 119 | for course in self._get_courses(study_id): 120 | self._explore_course(course) 121 | has_courses = True 122 | 123 | if not has_courses and not previous_had_courses: 124 | log.info("two subsequent studies (%d and %d) contained no courses, finishing study exploration", 125 | study_id - 1, study_id) 126 | break 127 | 128 | previous_had_courses = has_courses 129 | --------------------------------------------------------------------------------