├── src
├── __init__.py
├── objects.py
├── io.py
├── parsing.py
├── http.py
└── core.py
├── .gitignore
├── requirements.txt
├── .idea
├── vcs.xml
├── .gitignore
├── inspectionProfiles
│ ├── profiles_settings.xml
│ └── Project_Default.xml
├── discord.xml
├── fit-vut-wis-project-dumper.iml
├── modules.xml
└── misc.xml
├── Pipfile
├── README.md
├── main.py
├── LICENSE
└── Pipfile.lock
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | **/.DS_Store
3 | wis_projects
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | bs4
3 | html5lib
4 | unidecode
5 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/discord.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [packages]
7 | requests = "*"
8 | bs4 = "*"
9 | html5lib = "*"
10 | unidecode = "*"
11 |
12 | [dev-packages]
13 |
14 | [requires]
15 | python_version = "3.9"
16 |
--------------------------------------------------------------------------------
/.idea/fit-vut-wis-project-dumper.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FIT VUT WIS Project Dumper
2 |
3 | Requirements:
4 | - `Python >=3.9`
5 | - (Optional) `pipenv`
6 |
7 | ### Installation
8 | ```
9 | pipenv install
10 | ```
11 |
12 | or
13 |
14 | ```sh
15 | pip3 install -r requirements.txt
16 | ```
17 |
18 | ### Usage
19 | ```
20 | pipenv run python main.py [-v[v]] [-o|--output OUTPUT_DIR]
21 | ```
22 |
23 | or
24 |
25 | ```sh
26 | python3.9 main.py [-v[v]] [-o|--output OUTPUT_DIR]
27 | ```
28 |
--------------------------------------------------------------------------------
/src/objects.py:
--------------------------------------------------------------------------------
1 |
class Course:
    """Course abbreviation paired with the link to the course page."""

    def __init__(self, course_abbr: str, course_link: str):
        """Store the abbreviation and the link exactly as given."""
        self.abbr, self.link = course_abbr, course_link
7 |
8 |
class CourseTask:
    """A named task of a course together with the link to the task page."""

    def __init__(self, task_name: str, task_link: str, course: Course):
        """Store the task name, its link and the course it belongs to."""
        self.name, self.link = task_name, task_link
        self.course = course
15 |
16 |
class TaskFile:
    """A downloadable file attached to a course task."""

    def __init__(self, file_name: str, file_year: str, file_link: str, task: CourseTask):
        """Store the file name, its year label, the download link and the owning task."""
        self.name, self.year = file_name, file_year
        self.link = file_link
        self.task = task
24 |
--------------------------------------------------------------------------------
/src/io.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import sys
4 | from getpass import getpass
5 | from gettext import gettext
6 |
7 | log = logging.getLogger("io")
8 |
9 |
def get_user_credentials() -> tuple[str, str]:
    """Interactively ask for the WIS username and password.

    The username prompt repeats until the input is an ``x`` followed by five
    lowercase letters and two lowercase letters or digits. The password prompt
    repeats until at least 10 characters were entered.

    Returns:
        The ``(username, password)`` pair as entered by the user.
    """
    log.debug("loading user credentials")

    username, password = "", ""
    while re.fullmatch(r"^x[a-z]{5}[a-z\d]{2}$", username) is None:
        username = input("WIS username: ")

    while len(password) < 10:
        password = getpass("WIS password: ", sys.stdout)

    # The password must never reach the log output, not even at debug level.
    log.debug("credentials successfully obtained: username=%s, password=<hidden>", username)
    return username, password
22 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os.path
4 | from pathlib import Path
5 |
6 | from src.core import Downloader
7 |
8 | log = logging.getLogger("main")
9 |
10 |
def main(output_dir: Path):
    """Create a downloader for *output_dir* and run it."""
    downloader = Downloader(output_dir)
    downloader.run()
13 |
14 |
if __name__ == '__main__':
    # Command line: an optional output directory and a repeatable -v flag.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-o", "--output",
        action="store",
        dest="output_dir",
        metavar="OUTPUT_DIR",
        default=f"{os.getcwd()}/wis_projects",
    )
    cli.add_argument("-v", action="count", dest="version", default=0)
    options = cli.parse_args()

    # Every -v lowers the logging threshold by one level (10 points).
    log_level = logging.WARNING - options.version * 10
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s [%(name)s]: %(message)s",
        level=log_level,
    )

    main(Path(options.output_dir))
30 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
14 |
15 |
16 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Daniel Dolejška
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/parsing.py:
--------------------------------------------------------------------------------
1 | from typing import Generator, Optional
2 |
3 | from bs4 import BeautifulSoup
4 |
5 |
class Parser:
    """Base class that keeps a page parsed into a BeautifulSoup tree."""

    def __init__(self, page_content: str):
        """Parse *page_content* using the ``html5lib`` parser."""
        soup = BeautifulSoup(page_content, "html5lib")
        self.content = soup
10 |
11 |
class StudyParser(Parser):
    """Parser for the page listing the courses of a study."""

    def get_course_names_and_links(self) -> Generator[tuple[str, str], None, None]:
        """Yield ``(header cell text, link target)`` for every matching course row."""
        row_selector = ".content > .table-holder tr[align='center'][valign='top']"
        for row in self.content.select(row_selector):
            header_cell = row.find("th")
            course_anchor = row.select_one("a.bar")
            yield header_cell.text, course_anchor["href"]
18 |
19 |
class CourseParser(Parser):
    """Parser for a course page listing its tasks."""

    def get_task_names_and_links(self) -> Generator[tuple[str, str], None, None]:
        """Yield ``(link text, link target)`` for every task link on the page."""
        anchors = self.content.select(".content > form > .table-holder a.bar")
        yield from ((anchor.text, anchor["href"]) for anchor in anchors)
26 |
27 |
class TaskParser(Parser):
    """Parser for a single task page."""

    def try_get_files_link(self) -> Optional[str]:
        """Return the first paragraph link pointing at ``course-sf.php``.

        Returns:
            The link target, or ``None`` when the page has no such link.
        """
        for anchor in self.content.select(".content > p > a"):
            # An <a> element is not guaranteed to carry an href (e.g. named
            # anchors); indexing would raise KeyError, so look it up safely.
            target = anchor.get("href")
            if target and "course-sf.php" in target:
                return target

        return None
38 |
39 |
class TaskFilesParser(Parser):
    """Parser for the page listing the files attached to a task."""

    def get_file_names_and_links(self) -> Generator[tuple[str, str, str], None, None]:
        """Yield ``(link text, year, link target)`` for every listed file.

        The year is whatever follows the last ``/`` in the page's first
        ``<h1>`` heading (the whole heading when it contains no slash).
        """
        heading = self.content.find("h1").text
        *_, year = heading.rsplit("/", maxsplit=1)
        anchors = self.content.select(".content > form > table tr[valign='middle'] > td > a")
        for anchor in anchors:
            yield anchor.text, year, anchor["href"]
49 |
--------------------------------------------------------------------------------
/src/http.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import shutil
3 | from pathlib import Path
4 |
5 | from requests import Session as HTTPSession, Response
6 | from requests.auth import HTTPBasicAuth
7 |
8 | log = logging.getLogger("http")
9 |
10 |
class Connection:
    """HTTP session towards WIS that authenticates with HTTP basic auth."""

    # (connect, read) timeouts in seconds. Without a timeout ``requests``
    # may wait forever on an unresponsive server.
    REQUEST_TIMEOUT = (10, 60)

    def __init__(self, username: str, password: str):
        """Open a session that sends the given credentials with every request."""
        self.session = HTTPSession()
        self.session.auth = HTTPBasicAuth(username, password)

    def _get_url(self, url: str) -> str:
        """Turn a link found on a WIS page into an absolute URL.

        Links starting with ``http`` are returned untouched; ``/FIT/st/...``
        and ``/st/...`` are anchored at the WIS host; anything else is treated
        as relative to ``/FIT/st/``.
        """
        if url.startswith("http"):
            return url
        if url.startswith("/FIT/st/"):
            return f"https://wis.fit.vutbr.cz{url}"
        if url.startswith("/st/"):
            return f"https://wis.fit.vutbr.cz/FIT{url}"
        return f"https://wis.fit.vutbr.cz/FIT/st/{url}"

    def _get(self, url: str, params: dict[str, str], stream: bool = False) -> "Response":
        """ Makes an HTTP GET request to the given URL and ensures the response is valid.

        Raises whatever ``raise_for_status`` raises for an error status; the
        response is closed first so its connection is not leaked.
        """
        url = self._get_url(url)
        response = self.session.get(url, params=params, stream=stream, timeout=self.REQUEST_TIMEOUT)
        try:
            response.raise_for_status()
        except Exception:
            # Nobody else will ever see this response object, so release it
            # here (matters especially for streamed requests).
            response.close()
            raise

        return response

    def get_content(self, url: str, **params: "str | int") -> str:
        """ Loads and returns contents of the given webpage.

        Keyword arguments are sent as query parameters; the body is decoded
        as ISO-8859-2.
        """
        with self._get(url, params) as response:
            return response.content.decode("iso-8859-2")

    def get_studies_page(self) -> str:
        """Return the content of ``study-s.php.cs``."""
        return self.get_content("study-s.php.cs")

    def get_courses_page(self, study: int = 1) -> str:
        """Return the content of ``study-a.php.cs`` for the given study number.

        Raises:
            ValueError: when *study* is not a positive number.
        """
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if study <= 0:
            raise ValueError(f"study must be a positive number, got {study!r}")
        return self.get_content("study-a.php.cs", cist=study)

    def download_file(self, link: str, filepath: Path):
        """Stream the resource behind *link* into *filepath* in 128 KiB chunks."""
        with self._get(link, {}, stream=True) as response:
            log.debug("downloading %s -> %s", link, filepath.as_posix())
            with filepath.open("wb") as file:
                for chunk in response.iter_content(chunk_size=128 * 1024):
                    if chunk:
                        file.write(chunk)

        log.info("successfully downloaded %s", filepath.as_posix())

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()
60 |
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "hash": {
4 | "sha256": "247ae7f4e1b509c78c58bf44d4a6a8d817a1ef13721d2262af66ba600db82585"
5 | },
6 | "pipfile-spec": 6,
7 | "requires": {
8 | "python_version": "3.9"
9 | },
10 | "sources": [
11 | {
12 | "name": "pypi",
13 | "url": "https://pypi.org/simple",
14 | "verify_ssl": true
15 | }
16 | ]
17 | },
18 | "default": {
19 | "beautifulsoup4": {
20 | "hashes": [
21 | "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
22 | "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
23 | ],
24 | "markers": "python_full_version >= '3.6.0'",
25 | "version": "==4.11.1"
26 | },
27 | "bs4": {
28 | "hashes": [
29 | "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
30 | ],
31 | "index": "pypi",
32 | "version": "==0.0.1"
33 | },
34 | "certifi": {
35 | "hashes": [
36 | "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
37 | "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
38 | ],
39 | "markers": "python_full_version >= '3.6.0'",
40 | "version": "==2022.5.18.1"
41 | },
42 | "charset-normalizer": {
43 | "hashes": [
44 | "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
45 | "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
46 | ],
47 | "markers": "python_version >= '3'",
48 | "version": "==2.0.12"
49 | },
50 | "html5lib": {
51 | "hashes": [
52 | "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d",
53 | "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"
54 | ],
55 | "index": "pypi",
56 | "version": "==1.1"
57 | },
58 | "idna": {
59 | "hashes": [
60 | "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
61 | "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
62 | ],
63 | "markers": "python_version >= '3'",
64 | "version": "==3.3"
65 | },
66 | "requests": {
67 | "hashes": [
68 | "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
69 | "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
70 | ],
71 | "index": "pypi",
72 | "version": "==2.27.1"
73 | },
74 | "six": {
75 | "hashes": [
76 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
77 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
78 | ],
79 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
80 | "version": "==1.16.0"
81 | },
82 | "soupsieve": {
83 | "hashes": [
84 | "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
85 | "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
86 | ],
87 | "markers": "python_full_version >= '3.6.0'",
88 | "version": "==2.3.2.post1"
89 | },
90 | "urllib3": {
91 | "hashes": [
92 | "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
93 | "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
94 | ],
95 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
96 | "version": "==1.26.9"
97 | },
98 | "webencodings": {
99 | "hashes": [
100 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
101 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
102 | ],
103 | "version": "==0.5.1"
104 | }
105 | },
106 | "develop": {}
107 | }
108 |
--------------------------------------------------------------------------------
/src/core.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from pathlib import Path
4 | from typing import Generator, Optional
5 | from unidecode import unidecode
6 |
7 | from requests import HTTPError
8 |
9 | from .http import Connection
10 | from .io import get_user_credentials
11 | from .objects import Course, CourseTask, TaskFile
12 | from .parsing import StudyParser, CourseParser, TaskParser, TaskFilesParser
13 |
14 | log = logging.getLogger("core")
15 |
16 |
class Downloader:
    """Walks studies, courses and tasks in WIS and downloads the task files.

    Files end up in ``<output_dir>/<course>/<task>/<year>/<file name>``.
    """

    # Set by _setup_connection(); None until the user has authenticated.
    connection: Optional["Connection"] = None

    def __init__(self, output_dir: Path):
        """Remember the directory the downloaded files will be stored in."""
        self.output_dir = output_dir

    @staticmethod
    def _safe_path_component(name: str) -> str:
        """Make a remotely supplied name safe to use as ONE path component.

        Path separators are removed and the special names ``""``, ``.`` and
        ``..`` are replaced by ``_`` so that the result can never point
        outside of the directory it is joined to.
        """
        cleaned = re.sub(r'[\\/]', '', name)
        return "_" if cleaned in ("", ".", "..") else cleaned

    def _get_courses(self, study_id: int) -> Generator["Course", None, None]:
        """Yield the courses listed on the courses page of the given study."""
        assert self.connection is not None
        page_content = self.connection.get_courses_page(study_id)
        for course_abbr, course_link in StudyParser(page_content).get_course_names_and_links():
            yield Course(course_abbr, course_link)

    def _get_course_tasks(self, course: "Course") -> Generator["CourseTask", None, None]:
        """Yield the tasks listed on the page of the given course."""
        assert self.connection is not None
        page_content = self.connection.get_content(course.link)
        for task_name, task_link in CourseParser(page_content).get_task_names_and_links():
            yield CourseTask(task_name, task_link, course)

    def _try_get_course_task_files_link(self, course_task_link: str) -> Optional[str]:
        """Return the link to the task's file listing, or None if it has none."""
        assert self.connection is not None
        page_content = self.connection.get_content(course_task_link)
        return TaskParser(page_content).try_get_files_link()

    def _get_course_task_files(self, course_files_link: str, task: "CourseTask") -> Generator["TaskFile", None, None]:
        """Yield the files listed on the given file-listing page of a task."""
        assert self.connection is not None
        page_content = self.connection.get_content(course_files_link)
        for file_name, file_year, file_link in TaskFilesParser(page_content).get_file_names_and_links():
            yield TaskFile(file_name, file_year, file_link, task)

    def _explore_course(self, course: "Course"):
        """Download the files of every task of the given course."""
        log.info("exploring course %s", course.abbr)
        files_downloaded = sum(
            self._download_files_from_course_task(task)
            for task in self._get_course_tasks(course)
        )

        if not files_downloaded:
            log.info("found no project files in %s", course.abbr)
        else:
            log.debug("found and downloaded %d file(s) from %s", files_downloaded, course.abbr)

    def _download_files_from_course_task(self, task: "CourseTask") -> int:
        """Download all files of a task and return how many were downloaded."""
        if (course_files_link := self._try_get_course_task_files_link(task.link)) is None:
            log.debug("course task '%s' in %s does not contain any downloadable files", task.name, task.course.abbr)
            return 0

        files_found = 0
        # Every name below originates from a remote page. Sanitising happens
        # AFTER unidecode, because transliteration can itself produce '/'.
        course_dir = self._safe_path_component(unidecode(task.course.abbr))
        task_dir = self._safe_path_component(unidecode(re.sub(r'[<>:\?\*\\/]', '', task.name)))
        destination_dir = self.output_dir.joinpath(course_dir, task_dir)
        for file in self._get_course_task_files(course_files_link, task):
            log.debug("found file '%s', downloading...", file.name)
            file_destination_dir = destination_dir.joinpath(self._safe_path_component(file.year))
            file_destination_dir.mkdir(parents=True, exist_ok=True)
            destination_path = file_destination_dir.joinpath(self._safe_path_component(file.name))
            self.connection.download_file(file.link, destination_path)
            files_found += 1

        if not files_found:
            log.info("found no project files in %s/%s, none submitted maybe? :(", task.course.abbr, task.name)

        return files_found

    def _prepare_output(self):
        """Create the output directory; exit with status 1 if it already exists."""
        try:
            # parents=True lets the user pass a nested, not yet existing path.
            self.output_dir.mkdir(parents=True)
        except FileExistsError:
            log.critical("The output directory '%s' already exists.", self.output_dir.as_posix())
            # SystemExit instead of the site-provided exit() helper, which is
            # not guaranteed to exist (e.g. ``python -S``).
            raise SystemExit(1)

    def _setup_connection(self):
        """Ask for credentials until WIS accepts them.

        A 401 response triggers another prompt; any other HTTP error is
        logged and terminates the program with status 2.
        """
        while True:
            credentials = get_user_credentials()
            self.connection = Connection(*credentials)

            try:
                self.connection.get_studies_page()

            except HTTPError as ex:
                # The rejected session is useless; do not leave it open.
                self.connection.close()
                self.connection = None

                if ex.response.status_code == 401:
                    log.error("authentication has failed - the provided credentials were not correct, wanna try again?")
                    continue

                log.error("connection/authentication to WIS failed", exc_info=ex)
                raise SystemExit(2)

            break

    def run(self):
        """Authenticate, create the output directory and download everything."""
        log.info("starting the mighty project downloader")

        self._setup_connection()
        try:
            self._prepare_output()
            self._explore_studies()
        finally:
            # Close the session even when the run fails or exits early.
            self.connection.close()

    def _explore_studies(self):
        """Explore studies 1 to 5, stopping after two empty studies in a row."""
        # Starts as True so that a single empty study never stops the search.
        previous_had_courses = True
        for study_id in range(1, 6):
            log.info("exploring courses in study %d", study_id)

            has_courses = False
            for course in self._get_courses(study_id):
                self._explore_course(course)
                has_courses = True

            if not has_courses and not previous_had_courses:
                log.info("two subsequent studies (%d and %d) contained no courses, finishing study exploration",
                         study_id - 1, study_id)
                break

            previous_had_courses = has_courses
129 |
--------------------------------------------------------------------------------