├── src
    ├── __init__.py
    ├── local.sqlite3
    ├── log.py
    ├── schemas.py
    ├── models.py
    ├── fetch_proxies.py
    ├── data_fetcher.py
    ├── scraper.py
    ├── server.py
    └── data_parser.py
├── .dockerignore
├── .gitattributes
├── .env.example
├── .pre-commit-config.yaml
├── docker-compose.yml
├── .github
    └── workflows
    │   ├── ci-dev-pr.yml
    │   ├── lint.yml
    │   ├── scraper.yml
    │   └── production.yml
├── pyproject.toml
├── Dockerfile
├── scraper.Dockerfile
├── LICENSE
├── README.md
└── .gitignore


/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .github


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 | 


--------------------------------------------------------------------------------
/src/local.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/compsci-adl/courses-api/HEAD/src/local.sqlite3


--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | DEFAULT_LOGGING_LEVEL = DEBUG # Options: 'DEBUG' or 'ERROR'
2 | YEAR = 2025
3 | 
4 | DB_TYPE=local  # Options: 'libsql', 'dev', or 'local'
5 | TURSO_DATABASE_URL=https://your-turso-database-url
6 | TURSO_AUTH_TOKEN=your-turso-auth-token
7 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/astral-sh/ruff-pre-commit
 3 |   # Ruff version
 4 |   rev: v0.7.3
 5 |   hooks:
 6 |     # Run the linter
 7 |     - id: ruff
 8 |       args: [ --fix ]
 9 |     # Run the formatter
10 |     - id: ruff-format
11 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   courses-api:
 3 |     image: courses-api:latest
 4 |     container_name: courses-api
 5 |     environment:
 6 |       - PUID=1000
 7 |       - PGID=1000
 8 |       - PORT=8000
 9 |     ports:
10 |       - 8000:8000
11 |     volumes:
12 |       - ./local.sqlite3:/app/src/local.sqlite3
13 |     networks:
14 |       - csclub
15 | 
16 | networks:
17 |   csclub:
18 |     external: true
19 | 


--------------------------------------------------------------------------------
/.github/workflows/ci-dev-pr.yml:
--------------------------------------------------------------------------------
 1 | name: Development - Pull Request
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |       - '**'
 6 | 
 7 | jobs:
 8 |   lint-format:
 9 |     name: Linting Checks
10 |     uses: ./.github/workflows/lint.yml
11 | 
12 |   build:
13 |     needs: lint-format
14 |     name: Build
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Checkout repository
18 |         uses: actions/checkout@v4
19 | 
20 |       - name: Build Docker container
21 |         run: |
22 |           docker buildx build \
23 |             --file=Dockerfile -t courses-api .
24 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: Linting Checks
 2 | on:
 3 |   workflow_call:
 4 | 
 5 | jobs:
 6 |   lint_and_format:
 7 |     name: Lint and Format
 8 |     runs-on: ubuntu-latest
 9 | 
10 |     steps:
11 |       - name: Checkout code
12 |         uses: actions/checkout@v4
13 | 
14 |       - name: Install uv
15 |         uses: astral-sh/setup-uv@v3
16 |         with:
17 |             enable-cache: true
18 |             cache-dependency-glob: "uv.lock"
19 | 
20 |       - name: Set up Python
21 |         uses: actions/setup-python@v5
22 |         with:
23 |           python-version: '3.12'
24 | 
25 |       - name: Install dependencies
26 |         run: |
27 |           uv sync
28 | 
29 |       - name: Check for code errors
30 |         run: |
31 |           uv run ruff check 
32 | 
33 |       - name: Check formatting
34 |         run: |
35 |           uv run ruff format --check
36 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "courses-api"
 3 | version = "2.0.0"
 4 | description = "API for getting Adelaide University course information"
 5 | authors = [
 6 |     { name = "CS Club Open Source Team", email = "dev@csclub.org.au" }
 7 | ]
 8 | license = "MIT"
 9 | readme = "README.md"
10 | requires-python = ">=3.12"
11 | dependencies = [
12 |     "beautifulsoup4>=4.14.3",
13 |     "fastapi[standard]>=0.115.5",
14 |     "json-repair>=0.30.1",
15 |     "nanoid>=2.0.0",
16 |     "pydantic>=2.9.2",
17 |     "requests>=2.32.3",
18 |     "rich>=13.9.4",
19 |     "ruff>=0.7.3",
20 |     "sqlalchemy-libsql>=0.1.0",
21 |     "sqlalchemy>=2.0.36",
22 | ]
23 | 
24 | [tool.ruff]
25 | lint.select = ['E', 'F', 'W', 'A', 'PLC', 'PLE', 'PLW', 'I']
26 | lint.ignore = ["E501"] 
27 | lint.fixable = ["ALL"]
28 | 
29 | [dependency-groups]
30 | dev = [
31 |     "pre-commit>=4.0.1",
32 |     "ruff>=0.7.3",
33 | ]
34 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use a Python image with uv pre-installed
 2 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
 3 | 
 4 | # Install the project into `/app`
 5 | WORKDIR /app
 6 | 
 7 | # Enable bytecode compilation
 8 | ENV UV_COMPILE_BYTECODE=1
 9 | 
10 | # Copy from the cache instead of linking since it's a mounted volume
11 | ENV UV_LINK_MODE=copy
12 | 
13 | # Install the project's dependencies using the lockfile and settings
14 | RUN --mount=type=cache,target=/root/.cache/uv \
15 |     --mount=type=bind,source=uv.lock,target=uv.lock \
16 |     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
17 |     uv sync --frozen --no-install-project --no-dev
18 | 
19 | # Then, add the rest of the project source code and install it
20 | # Installing separately from its dependencies allows optimal layer caching
21 | ADD . /app
22 | RUN --mount=type=cache,target=/root/.cache/uv \
23 |     uv sync --frozen --no-dev
24 | 
25 | # Place executables in the environment at the front of the path
26 | ENV PATH="/app/.venv/bin:$PATH"
27 | 
28 | EXPOSE 8000
29 | 
30 | ENTRYPOINT ["fastapi", "run", "src/server.py"]
31 | 


--------------------------------------------------------------------------------
/scraper.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use a Python image with uv pre-installed
 2 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
 3 | 
 4 | # Install the project into `/app`
 5 | WORKDIR /app
 6 | 
 7 | # Enable bytecode compilation
 8 | ENV UV_COMPILE_BYTECODE=1
 9 | 
10 | # Copy from the cache instead of linking since it's a mounted volume
11 | ENV UV_LINK_MODE=copy
12 | 
13 | # Install the project's dependencies using the lockfile and settings
14 | RUN --mount=type=cache,target=/root/.cache/uv \
15 |     --mount=type=bind,source=uv.lock,target=uv.lock \
16 |     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
17 |     uv sync --frozen --no-install-project --no-dev
18 | 
19 | # Then, add the rest of the project source code and install it
20 | # Installing separately from its dependencies allows optimal layer caching
21 | ADD . /app
22 | RUN --mount=type=cache,target=/root/.cache/uv \
23 |     uv sync --frozen --no-dev
24 | 
25 | # Place executables in the environment at the front of the path
26 | ENV PATH="/app/.venv/bin:$PATH"
27 | 
28 | EXPOSE 8000
29 | 
30 | ENTRYPOINT ["python3", "src/scraper.py"]
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024-present Adelaide University Computer Science Club
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/log.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from datetime import datetime
 3 | from pathlib import Path
 4 | 
 5 | from dotenv import dotenv_values
 6 | 
 7 | 
 8 | def setup_logger() -> logging.Logger:
 9 |     """
10 |     Sets up a logger that writes logs to a file in the form {timestamp}.log
11 |     The level of logging that is written to the file depends on the environment
12 |     variable "DEFAULT_LOGGING_LEVEL".
13 |     Returns:
14 |         logging.Logger = a customised logger object
15 |     """
16 | 
17 |     # Initialise logger
18 |     logger = logging.getLogger("courseAPICallLogger")
19 |     default_logging_level = dotenv_values().get("DEFAULT_LOGGING_LEVEL")
20 |     logger.setLevel(default_logging_level)
21 | 
22 |     if not logger.hasHandlers():
23 |         # Initialise log dir path
24 |         logs_dir = Path(__file__).resolve().parent.parent / "logs"
25 |         logs_dir.mkdir(parents=True, exist_ok=True)
26 | 
27 |         # Each error log sent into separate file
28 |         timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
29 | 
30 |         # Log gile path
31 |         log_file_path = logs_dir / f"{timestamp}.log"
32 | 
33 |         # Setup log file handler
34 |         log_file_handler = logging.FileHandler(log_file_path, mode="w", delay=True)
35 | 
36 |         # Set level of file handler
37 |         log_file_handler.setLevel(default_logging_level)
38 | 
39 |         # Setup file formatter
40 |         file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
41 |         log_file_handler.setFormatter(file_formatter)
42 | 
43 |         # Add handler to logger
44 |         logger.addHandler(log_file_handler)
45 | 
46 |     return logger
47 | 
48 | 
49 | logger = setup_logger()
50 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Courses API
 2 | Courses API is a tool to scrape course information from the Adelaide University website and provide course data to other CS Club Open Source Team projects via an API endpoint.
 3 | 
 4 | ## Getting Started
 5 | 
 6 | To get started, please follow these steps:
 7 | 
 8 | 1. Install `uv` if not already installed:
 9 | 
10 |     Linux, macOS, Windows (WSL)
11 |     ```bash
12 |     curl -LsSf https://astral.sh/uv/install.sh | sh
13 |     ```
14 |     Windows (Powershell)
15 |     ```powershell
16 |     powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
17 |     ```
18 | 
19 | 2. Install dependencies:
20 | 
21 |     ```sh
22 |     uv sync
23 |     pre-commit install
24 |     ```
25 | 
26 | 3. Make a `.env` file and copy `.env.example` into it
27 | 
28 | ### Running the API Server
29 | 
30 | 1. Start the FastAPI server:
31 | 
32 |     ```sh
33 |     uv run fastapi dev src/server.py
34 |     ```
35 | 
36 | 2. Open [http://localhost:8000/docs](http://localhost:8000/docs) with your browser to see the API documentation and to test the available endpoints.
37 | 
38 | ### Running the scraper
39 | 
40 | Start the scraper (Note: Scraping all the courses may take over an hour):
41 | 
42 | ```sh
43 | uv run python3 src/scraper.py
44 | ```
45 | 
46 | #### Debugging
47 | The output level of the logger can be configured in the `.env`. Set `DEFAULT_LOGGING_LEVEL` to your desires level such as `DEBUG` and `ERROR`. `DEBUG` outputs all logs into a file, including errors. `ERROR` only logs errors into a log file.
48 | 
49 | ## Contributing
50 | 
51 | We welcome contributions to enhance Courses API! If you find any issues, have suggestions, or want to request a feature, please follow our [Contributing Guidelines](https://github.com/compsci-adl/.github/blob/main/CONTRIBUTING.md).
52 | 
53 | ## License
54 | 
55 | This project is licensed under the MIT License.
56 | See [LICENSE](LICENSE) for details.


--------------------------------------------------------------------------------
/src/schemas.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Literal, Optional
 2 | 
 3 | from pydantic import BaseModel, Field, root_validator
 4 | 
 5 | 
 6 | class NameSchema(BaseModel):
 7 |     subject: str
 8 |     code: str
 9 |     title: str
10 | 
11 | 
12 | DateField = Field(pattern=r"\d{2}-\d{2}")
13 | TimeField = Field(pattern=r"\d{2}:\d{2}")
14 | 
15 | 
16 | class DateRageSchema(BaseModel):
17 |     start: str = DateField
18 |     end: str = DateField
19 | 
20 | 
21 | class TimeRageSchema(BaseModel):
22 |     start: str = TimeField
23 |     end: str = TimeField
24 | 
25 | 
26 | class MeetingSchema(BaseModel):
27 |     day: Literal[
28 |         "Monday",
29 |         "Tuesday",
30 |         "Wednesday",
31 |         "Thursday",
32 |         "Friday",
33 |         "Saturday",
34 |         "Sunday",
35 |     ]
36 |     location: str
37 |     date: DateRageSchema
38 |     time: TimeRageSchema
39 | 
40 | 
41 | class ClassSchema(BaseModel):
42 |     number: str
43 |     section: str  # Return class section
44 |     available_seats: str
45 |     meetings: List[MeetingSchema]
46 | 
47 | 
48 | class ClassTypeSchema(BaseModel):
49 |     id: str
50 |     category: Optional[Literal["enrolment", "related", "unknown"]] = "unknown"
51 |     type: Optional[str] = None
52 |     component: Optional[str] = None
53 |     classes: List[ClassSchema]
54 | 
55 |     @root_validator(pre=True)
56 |     def ensure_component_or_type(cls, values):
57 |         if not values.get("component") and values.get("type"):
58 |             values["component"] = values.get("type")
59 |         if not values.get("type") and values.get("component"):
60 |             values["type"] = values.get("component")
61 |         return values
62 | 
63 | 
64 | class RequirementsSchema(BaseModel):
65 |     prerequisites: Optional[List[str]] = None
66 |     corequisites: Optional[List[str]] = None
67 |     antirequisites: Optional[List[str]] = None
68 | 
69 | 
70 | class CourseSchema(BaseModel):
71 |     id: str
72 |     course_id: int
73 |     name: NameSchema
74 |     class_number: Optional[int] = None
75 |     year: str
76 |     term: str
77 |     campus: str
78 |     units: int
79 |     requirements: RequirementsSchema
80 |     class_list: List[ClassTypeSchema]
81 | 


--------------------------------------------------------------------------------
/.github/workflows/scraper.yml:
--------------------------------------------------------------------------------
 1 | name: Scraper
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: '30 */12 * * *'
 6 |   workflow_dispatch:
 7 | 
 8 | env:
 9 |   AWS_REGION: ap-southeast-2
10 | 
11 | jobs:
12 |   run-scraper:
13 |     name: Run Scraper
14 |     runs-on: ubuntu-latest
15 |     environment: Scraper
16 | 
17 |     env:
18 |       DEFAULT_LOGGING_LEVEL: ${{ secrets.DEFAULT_LOGGING_LEVEL }}
19 |       YEAR: ${{ secrets.YEAR }}
20 |     
21 |     permissions:
22 |       id-token: write
23 |       contents: read
24 | 
25 |     steps:
26 |       - name: Checkout repository
27 |         uses: actions/checkout@v4
28 | 
29 |       - name: Configure AWS credentials
30 |         uses: aws-actions/configure-aws-credentials@v4
31 |         with:
32 |           role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
33 |           role-session-name: ${{ secrets.AWS_ROLE_SESSION_NAME }}
34 |           aws-region: ${{ env.AWS_REGION }}
35 | 
36 |       - name: Set up Docker Buildx
37 |         uses: docker/setup-buildx-action@v3
38 | 
39 |       - name: Create .env file
40 |         run: |
41 |           echo "DEFAULT_LOGGING_LEVEL=${{ env.DEFAULT_LOGGING_LEVEL }}" > src/.env
42 |           echo "YEAR=${{ env.YEAR }}" >> src/.env
43 | 
44 |       - name: Build Docker image
45 |         run: docker build -f scraper.Dockerfile -t courses-api-scraper:latest .
46 | 
47 |       - name: Run scraper
48 |         run: |
49 |           docker run --rm \
50 |             -v ${{ github.workspace }}/src:/app/src \
51 |             -e DEFAULT_LOGGING_LEVEL=${{ env.DEFAULT_LOGGING_LEVEL }} \
52 |             -e YEAR=${{ env.YEAR }} \
53 |             courses-api-scraper:latest
54 | 
55 |       - name: Rename SQLite DB to local.sqlite3
56 |         run: mv src/dev.sqlite3 src/local.sqlite3
57 | 
58 |       - name: Upload DB to S3
59 |         run: |
60 |           aws s3 cp src/local.sqlite3 s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/
61 | 
62 |       - name: Download DB and restart courses-api container on EC2
63 |         env:
64 |           KEY: ${{ secrets.SSH_EC2_KEY }}
65 |           HOSTNAME: ${{ secrets.SSH_EC2_HOSTNAME }}
66 |           USER: ${{ secrets.SSH_EC2_USER }}
67 |         run: |
68 |           echo "$KEY" > private_key && chmod 600 private_key
69 |           ssh -v -o StrictHostKeyChecking=no -i private_key ${USER}@${HOSTNAME} '
70 |             cd ~/courses-api
71 |             aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/local.sqlite3 .
72 |             docker restart courses-api
73 |           '
74 | 


--------------------------------------------------------------------------------
/src/models.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import (
 2 |     Column,
 3 |     ForeignKey,
 4 |     Integer,
 5 |     String,
 6 | )
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | from sqlalchemy.orm import relationship
 9 | 
10 | Base = declarative_base()
11 | 
12 | 
13 | class Subject(Base):
14 |     __tablename__ = "subjects"
15 |     id = Column(String, primary_key=True)
16 |     name = Column(String, unique=True, nullable=False)
17 |     courses = relationship("Course", backref="subject_ref")
18 | 
19 | 
20 | class Course(Base):
21 |     __tablename__ = "courses"
22 |     id = Column(String, primary_key=True)
23 |     course_id = Column(Integer, unique=True, nullable=False)
24 |     year = Column(String, nullable=False)
25 |     terms = Column(String, nullable=False)
26 |     subject = Column(String, ForeignKey("subjects.name"), nullable=False)
27 |     course_code = Column(String, nullable=False)
28 |     title = Column(String, nullable=False)
29 |     campus = Column(String, nullable=False)
30 |     level_of_study = Column(String, nullable=True)
31 |     units = Column(Integer, nullable=False)
32 |     course_coordinator = Column(String, nullable=True)
33 |     course_level = Column(String, nullable=False)
34 |     course_overview = Column(String, nullable=True)
35 |     prerequisites = Column(String, nullable=False)
36 |     corequisites = Column(String, nullable=False)
37 |     antirequisites = Column(String, nullable=False)
38 |     url = Column(String, nullable=False)
39 |     course_classes = relationship("CourseClass", backref="course")
40 | 
41 | 
42 | class Meetings(Base):
43 |     __tablename__ = "meetings"
44 |     id = Column(String, primary_key=True)
45 |     dates = Column(String, nullable=False)
46 |     days = Column(String, nullable=False)
47 |     start_time = Column(String, nullable=False)
48 |     end_time = Column(String, nullable=False)
49 |     campus = Column(String, nullable=False)
50 |     location = Column(String, nullable=False)
51 |     course_class_id = Column(String, ForeignKey("course_classes.id"), nullable=False)
52 | 
53 | 
54 | class CourseClass(Base):
55 |     __tablename__ = "course_classes"
56 |     id = Column(String, primary_key=True)
57 |     class_nbr = Column(Integer, nullable=False)
58 |     section = Column(String, nullable=False)
59 |     size = Column(Integer, nullable=False)
60 |     available = Column(Integer, nullable=False)
61 |     component = Column(String, nullable=False)
62 |     meetings = relationship("Meetings", backref="course_class")
63 |     course_id = Column(String, ForeignKey("courses.id"), nullable=False)
64 | 


--------------------------------------------------------------------------------
/.github/workflows/production.yml:
--------------------------------------------------------------------------------
 1 | name: Production
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 | 
 7 | env:
 8 |   AWS_REGION: ap-southeast-2
 9 | 
10 | jobs:
11 |   build:
12 |     name: Build
13 |     runs-on: ubuntu-24.04-arm
14 |     environment: Production
15 |     permissions:
16 |       id-token: write
17 |       contents: read
18 |     steps:
19 |       - name: Checkout repository
20 |         uses: actions/checkout@v4
21 | 
22 |       - name: Configure AWS credentials
23 |         uses: aws-actions/configure-aws-credentials@v4
24 |         with:
25 |           role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
26 |           role-session-name: ${{ secrets.AWS_ROLE_SESSION_NAME }}
27 |           aws-region: ${{ env.AWS_REGION }}
28 | 
29 |       - name: Set up Docker Buildx
30 |         uses: docker/setup-buildx-action@v3
31 | 
32 |       - name: Cache Docker layers
33 |         uses: actions/cache@v4
34 |         with:
35 |           path: /tmp/.buildx-cache
36 |           key: ${{ runner.os }}-buildx-${{ github.sha }}
37 |           restore-keys: |
38 |             ${{ runner.os }}-buildx-
39 | 
40 |       - name: Build Docker container
41 |         env:
42 |           PRODUCTION_BUILD: 'true'
43 |         run: |
44 |           docker buildx build \
45 |             --cache-from=type=local,src=/tmp/.buildx-cache \
46 |             --cache-to=type=local,dest=/tmp/.buildx-cache-new,mode=max \
47 |             --output type=docker,dest=courses-api.tar \
48 |             --platform=linux/arm64 --file=Dockerfile -t courses-api .
49 |           gzip courses-api.tar
50 | 
51 |       - name: Save Docker cache
52 |         if: success()
53 |         run: |
54 |           rsync -a --delete /tmp/.buildx-cache-new/ /tmp/.buildx-cache/
55 | 
56 |       - name: Copy image and compose file to S3
57 |         run: |
58 |           aws s3 cp ./courses-api.tar.gz s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/
59 |           aws s3 cp ./docker-compose.yml s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/
60 | 
61 |   deploy:
62 |     needs: build
63 |     name: Deploy
64 |     runs-on: ubuntu-latest
65 |     environment: Production
66 |     steps:
67 |       - name: Deploy on EC2
68 |         env:
69 |           KEY: ${{ secrets.SSH_EC2_KEY }}
70 |           HOSTNAME: ${{ secrets.SSH_EC2_HOSTNAME }}
71 |           USER: ${{ secrets.SSH_EC2_USER }}
72 |           YEAR: ${{ secrets.YEAR }}
73 |           DB_TYPE: ${{ secrets.DB_TYPE }}
74 |           TURSO_DATABASE_URL: ${{ secrets.TURSO_DATABASE_URL }}
75 |           TURSO_AUTH_TOKEN: ${{ secrets.TURSO_AUTH_TOKEN }}
76 |         run: |
77 |           echo "$KEY" > private_key && chmod 600 private_key
78 |           ssh -v -o StrictHostKeyChecking=no -i private_key ${USER}@${HOSTNAME} '
79 |             cd ~/courses-api
80 |             aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/courses-api.tar.gz .
81 |             aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/docker-compose.yml .
82 |             echo YEAR=${{ secrets.YEAR }} > .env
83 |             echo DB_TYPE=${{ secrets.DB_TYPE }} >> .env
84 |             echo TURSO_DATABASE_URL=${{ secrets.TURSO_DATABASE_URL }} >> .env
85 |             echo TURSO_AUTH_TOKEN=${{ secrets.TURSO_AUTH_TOKEN }} >> .env
86 |             docker load -i courses-api.tar.gz
87 |             docker compose up -d
88 |             docker restart courses-api
89 |           '
90 | 


--------------------------------------------------------------------------------
/src/fetch_proxies.py:
--------------------------------------------------------------------------------
 1 | from concurrent.futures import ThreadPoolExecutor, as_completed
 2 | 
 3 | import requests
 4 | from requests.exceptions import RequestException, Timeout
 5 | from rich.progress import (
 6 |     BarColumn,
 7 |     Progress,
 8 |     TextColumn,
 9 |     TimeRemainingColumn,
10 | )
11 | 
12 | 
13 | def fetch_proxies(url):
14 |     """Fetch the list of proxies from the URL."""
15 |     response = requests.get(url)
16 |     if response.status_code == 200:
17 |         # Split the response content by newline to get each proxy
18 |         return response.text.splitlines()
19 |     else:
20 |         print("Failed to retrieve proxies.")
21 |         return []
22 | 
23 | 
24 | def test_proxy(
25 |     proxy,
26 |     test_url="https://uosa-search.funnelback.squiz.cloud/s/search.html?collection=uosa~sp-aem-prod&form=json&num_ranks=1",
27 |     timeout=5,
28 |     retries=2,  # Number of retries
29 | ):
30 |     """Test if the given proxy is working by making a request."""
31 |     proxies = {
32 |         "http": f"http://{proxy}",
33 |         "https": f"http://{proxy}",
34 |     }
35 |     for attempt in range(retries + 1):  # Retry logic
36 |         try:
37 |             response = requests.get(test_url, proxies=proxies, timeout=timeout)
38 |             if response.status_code == 200:
39 |                 return proxy  # Return the working proxy
40 |         except (RequestException, Timeout):
41 |             if attempt < retries:
42 |                 continue  # Retry on failure
43 |             else:
44 |                 return None  # Skip the proxy if all retries fail
45 | 
46 | 
47 | def save_working_proxies(proxies, filename="src/working_proxies.txt"):
48 |     """Save working proxies to a text file."""
49 |     with open(filename, "w") as file:
50 |         for proxy in proxies:
51 |             file.write(f"{proxy}\n")
52 | 
53 | 
54 | def main():
55 |     proxy_url = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/http.txt"
56 |     proxies = fetch_proxies(proxy_url)
57 | 
58 |     working_proxies = []
59 | 
60 |     # Rich Progress Bar
61 |     with Progress(
62 |         TextColumn("[progress.description]{task.description}"),
63 |         BarColumn(),
64 |         TimeRemainingColumn(),
65 |     ) as progress:
66 |         task = progress.add_task("Testing Proxies...", total=len(proxies))
67 | 
68 |         # Use ThreadPoolExecutor for concurrency
69 |         with ThreadPoolExecutor(max_workers=1000) as executor:
70 |             future_to_proxy = {
71 |                 executor.submit(test_proxy, proxy): proxy for proxy in proxies
72 |             }
73 |             for future in as_completed(future_to_proxy):
74 |                 progress.update(task, advance=1)
75 |                 try:
76 |                     result = future.result()
77 |                     if result:
78 |                         working_proxies.append(result)
79 |                 except Exception:
80 |                     pass  # Handle or log specific proxy testing errors if needed
81 | 
82 |     # Save working proxies to file
83 |     if working_proxies:
84 |         save_working_proxies(working_proxies)
85 |         print(
86 |             f"\n[+] Saved {len(working_proxies)} working proxies to 'working_proxies.txt'."
87 |         )
88 |     else:
89 |         print("\n[-] No working proxies found.")
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     main()
94 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | #   in version control.
111 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 
166 | .python-version
167 | 
168 | dev.sqlite3
169 | dev.sqlite3-journal
170 | working_proxies.txt
171 | 


--------------------------------------------------------------------------------
/src/data_fetcher.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import re
  3 | import time
  4 | 
  5 | import json_repair
  6 | import requests
  7 | from bs4 import BeautifulSoup
  8 | 
  9 | from log import logger
 10 | 
 11 | 
 12 | class DataFetcher:
 13 |     """Fetch data from a Funnelback search host or from published course content pages.
 14 | 
 15 |     By default, the DataFetcher uses `BASE_URL` (Funnelback search) and the endpoint
 16 |     is expected to be a query string that starts with `?`.
 17 | 
 18 |     Pass `use_class_url=True` to use `BASE_INFO_URL` instead and treat the endpoint
 19 |     as a path under the course content base URL.
 20 |     """
 21 | 
 22 |     BASE_URL = "https://uosa-search.funnelback.squiz.cloud/s/search.html"
 23 |     BASE_INFO_URL = "https://adelaideuni.edu.au"
 24 |     PROXY_FILE = "src/working_proxies.txt"
 25 | 
 26 |     def __init__(self, endpoint: str, use_class_url: bool = False) -> None:
 27 |         self.endpoint = endpoint
 28 |         self.use_class_url = use_class_url
 29 |         if self.use_class_url:
 30 |             # Build a full URL for course page content. Ensure endpoint is a path.
 31 |             path = (
 32 |                 self.endpoint if self.endpoint.startswith("/") else f"/{self.endpoint}"
 33 |             )
 34 |             self.url = self.BASE_INFO_URL.rstrip("/") + path
 35 |         else:
 36 |             self.url = self.BASE_URL + endpoint
 37 |         self.data = None
 38 |         self.last_response = None
 39 |         self.proxies = self.load_proxies()
 40 | 
 41 |     def load_proxies(self) -> list:
 42 |         """Load proxies from the file."""
 43 |         try:
 44 |             with open(self.PROXY_FILE, "r") as file:
 45 |                 proxies = file.read().splitlines()
 46 |                 logger.debug(f"Loaded {len(proxies)} proxies from {self.PROXY_FILE}.")
 47 |                 return proxies
 48 |         except FileNotFoundError:
 49 |             logger.error(f"Proxy file {self.PROXY_FILE} not found.")
 50 |             return []
 51 | 
 52 |     def get_random_proxy(self) -> dict:
 53 |         """Get a random proxy from the loaded list."""
 54 |         if not self.proxies:
 55 |             logger.warning("No proxies available. Proceeding without a proxy.")
 56 |             return None
 57 |         proxy = random.choice(self.proxies)
 58 |         return {
 59 |             "http": f"http://{proxy}",
 60 |             "https": f"http://{proxy}",
 61 |         }
 62 | 
 63 |     def get(self) -> dict:
 64 |         """Fetch data from the API, handling retries and rate-limiting."""
 65 |         logger.debug(f"Fetching {self.endpoint}...")
 66 |         if self.data is not None:
 67 |             return self.data
 68 | 
 69 |         if not self.url:
 70 |             logger.error("Error: No URL provided.")
 71 |             return {}
 72 | 
 73 |         max_retries = 50  # Maximum number of retries
 74 |         retries = 0
 75 |         # Clear previous last_response to avoid stale values in callers
 76 |         self.last_response = None
 77 | 
 78 |         # Exponential backoff base, increase gently, capped to avoid huge sleeps.
 79 |         backoff_base = 1.5
 80 |         while retries < max_retries:
 81 |             proxy = self.get_random_proxy()
 82 |             try:
 83 |                 logger.debug(f"Using proxy: {proxy}")
 84 |                 response = requests.get(self.url, proxies=proxy, timeout=10)
 85 |                 self.last_response = response
 86 | 
 87 |                 if response.status_code == 429:
 88 |                     # Handle rate limiting properly, use Retry-After if available
 89 |                     logger.warning("HTTP 429 - Too Many Requests.")
 90 |                     retry_after = response.headers.get("Retry-After")
 91 |                     if retry_after:
 92 |                         try:
 93 |                             wait_seconds = int(retry_after)
 94 |                         except ValueError:
 95 |                             # Retry-After may be a HTTP-date; fall back to default
 96 |                             wait_seconds = min(60, int(backoff_base**retries))
 97 |                     else:
 98 |                         wait_seconds = min(60, int(backoff_base**retries))
 99 | 
100 |                     logger.warning(
101 |                         f"Sleeping for {wait_seconds} seconds due to 429 response"
102 |                     )
103 |                     time.sleep(wait_seconds)
104 |                     # Try another proxy for the next attempt
105 |                     proxy = self.get_random_proxy()
106 |                     retries += 1
107 |                     continue
108 | 
109 |                 if response.status_code != 200:
110 |                     logger.error(f"HTTP {response.status_code} - {response.text}")
111 |                     # Small backoff for other HTTP errors
112 |                     wait_seconds = min(10, int(backoff_base**retries))
113 |                     logger.debug(f"Waiting for {wait_seconds}s before retrying")
114 |                     time.sleep(wait_seconds)
115 |                     retries += 1
116 |                     continue
117 | 
118 |                 # If using Funnelback (search), parse as JSON and return the response dict.
119 |                 if not self.use_class_url:
120 |                     resp = json_repair.loads(response.text)
121 |                     if not resp.get("response", {}).get("resultPacket"):
122 |                         logger.error(
123 |                             f"Funnelback API Error: {resp.get('error', 'Unknown error')}"
124 |                         )
125 |                         retries += 1
126 |                         continue
127 |                     self.data = resp.get("response", {})
128 |                     return self.data
129 | 
130 |                 # If fetching a class/course content page, just return the HTML text as {'data': <text>}.
131 |                 if self.use_class_url:
132 |                     soup = BeautifulSoup(response.content, "html.parser")
133 |                     # Get main content
134 |                     main_tag = soup.find("main")
135 |                     if main_tag:
136 |                         text = main_tag.get_text()
137 |                     else:
138 |                         text = soup.get_text()
139 |                     # Grab H1 text if present as a separate field to help parsers
140 |                     h1_tag = soup.find("h1")
141 |                     h1_text = h1_tag.get_text().strip() if h1_tag else ""
142 |                     self.data = {"h1": h1_text, "data": re.sub(r"\n+", "\n", text)}
143 |                     return self.data
144 | 
145 |             except requests.exceptions.ProxyError:
146 |                 logger.error(f"Proxy error with proxy: {proxy}")
147 |                 retries += 1
148 |                 # Reduce retry flurry by sleeping a moment
149 |                 time.sleep(min(3, backoff_base**retries))
150 |             except requests.exceptions.RequestException as e:
151 |                 logger.error(f"Request failed: {e}")
152 |                 retries += 1
153 |                 time.sleep(min(3, backoff_base**retries))
154 |             except Exception as e:
155 |                 logger.error(f"Unexpected error: {e}")
156 |                 retries += 1
157 |                 time.sleep(min(3, backoff_base**retries))
158 | 
159 |         logger.error(
160 |             f"Failed to fetch data from {self.url} after {max_retries} retries."
161 |         )
162 |         return {}
163 | 


--------------------------------------------------------------------------------
/src/scraper.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from concurrent.futures import ThreadPoolExecutor, as_completed
  4 | from hashlib import shake_256
  5 | from queue import Queue
  6 | from threading import Lock, Thread
  7 | 
  8 | from dotenv import dotenv_values
  9 | from rich.progress import Progress
 10 | from sqlalchemy import create_engine
 11 | from sqlalchemy.orm import sessionmaker
 12 | 
 13 | import data_parser
 14 | import fetch_proxies
 15 | from log import logger
 16 | from models import Base, Course, CourseClass, Meetings, Subject
 17 | 
 18 | # Session and write queue for DB writer thread
 19 | Session = sessionmaker()
 20 | write_queue = Queue()
 21 | 
 22 | 
 23 | def get_short_hash(content: str, even_length=12) -> str:
 24 |     """Generates a short hash from the given content using the shake_256 algorithm."""
 25 |     return shake_256(content.encode("utf8")).hexdigest(even_length // 2)
 26 | 
 27 | 
 28 | def db_writer(engine):
 29 |     """Dedicated DB writer thread to serialize all DB operations and prevent locking."""
 30 |     while True:
 31 |         obj = write_queue.get()
 32 |         if obj is None:
 33 |             break  # Stop signal
 34 | 
 35 |         session = Session(bind=engine)
 36 | 
 37 |         try:
 38 |             session.merge(obj)
 39 |             session.commit()
 40 |         except Exception as e:
 41 |             session.rollback()
 42 |             print(f"[DB ERROR] {e} on {obj}")
 43 |         finally:
 44 |             session.close()
 45 | 
 46 | 
 47 | def join_str_if_iterable(value):
 48 |     """Return a comma-separated string if value is a list/tuple, otherwise return the value as str or empty string for None."""
 49 |     if isinstance(value, (list, tuple)):
 50 |         return ",".join([str(x) for x in value])
 51 |     if value is None:
 52 |         return ""
 53 |     return str(value)
 54 | 
 55 | 
 56 | def process_course(course, year, subject, engine, progress, subject_task, lock):
 57 |     """Process a single course and insert data into the database."""
 58 |     try:
 59 |         logger.debug(f"Processing course {course['code']}...")
 60 |         course_code = course.get("code")
 61 |         if not course_code:
 62 |             print(f"Skipping course with missing code: {course}")
 63 |             progress.update(subject_task, advance=1)
 64 |             return
 65 |         course_details = data_parser.get_course_details(course_code)
 66 | 
 67 |         name = subject["subject"]
 68 |         title = course_details.get("title", "")
 69 |         terms = course.get("terms")
 70 |         campus = course_details["campus"]
 71 | 
 72 |         # Course Custom ID
 73 |         course_cid = get_short_hash(f"{name}{course_code}{title}{year}{terms}{campus}")
 74 | 
 75 |         # Encode course code to match URL format
 76 |         code_str = (
 77 |             course_code[0] if isinstance(course_code, (list, tuple)) else course_code
 78 |         )
 79 |         encoded_course_code = re.sub(
 80 |             r"([a-zA-Z]+)([0-9]+)", r"\1-\2", str(code_str)
 81 |         ).lower()
 82 | 
 83 |         try:
 84 |             db_course = Course(
 85 |                 id=course_cid,
 86 |                 course_id=course_details.get("course_id", 0),
 87 |                 year=year,
 88 |                 terms=join_str_if_iterable(terms),
 89 |                 subject=name,
 90 |                 course_code=course_code[0]
 91 |                 if isinstance(course_code, (list, tuple))
 92 |                 else course_code,
 93 |                 title=title,
 94 |                 campus=join_str_if_iterable(campus),
 95 |                 level_of_study=course_details.get("level_of_study", "N/A"),
 96 |                 units=int(course_details.get("unit_value", "6")),
 97 |                 course_coordinator=course_details.get("course_coordinator", "N/A"),
 98 |                 course_level=course_details.get("course_level", "N/A"),
 99 |                 course_overview=course_details.get("course_overview", "N/A"),
100 |                 prerequisites=course_details.get("prerequisites", "N/A"),
101 |                 corequisites=course_details.get("corequisites", "N/A"),
102 |                 antirequisites=course_details.get("antirequisites", "N/A"),
103 |                 url="https://adelaideuni.edu.au/study/courses/" + encoded_course_code,
104 |             )
105 |             write_queue.put(db_course)
106 |         except Exception as e:
107 |             print(f"Error inserting course {course_code}: {e}")
108 |             progress.update(subject_task, advance=1)
109 |             return
110 | 
111 |         if terms:
112 |             class_list = data_parser.get_course_class_list(course_code)
113 |             class_items = (
114 |                 class_list.get("classes", []) if isinstance(class_list, dict) else []
115 |             )
116 | 
117 |             for individual_class in class_items:
118 |                 class_type = individual_class.get("component")
119 |                 class_nbr = individual_class.get("class_number")
120 |                 section = individual_class.get("section")
121 |                 class_cid = get_short_hash(
122 |                     f"{course_cid}{class_type}{class_nbr}{section}"
123 |                 )
124 |                 try:
125 |                     db_course_class = CourseClass(
126 |                         id=class_cid,
127 |                         class_nbr=class_nbr,
128 |                         section=section,
129 |                         size=int(individual_class.get("size", 0)),
130 |                         available=int(individual_class.get("available", 0)),
131 |                         component=class_type,
132 |                         course_id=course_cid,
133 |                     )
134 |                     write_queue.put(db_course_class)
135 |                 except Exception as e:
136 |                     print(f"Error inserting class for course {course_code}: {e}")
137 |                     print(individual_class)
138 | 
139 |                 meetings = individual_class.get("meetings", [])
140 |                 for meeting in meetings:
141 |                     try:
142 |                         meeting_cid = get_short_hash(
143 |                             f"{class_cid}{meeting.get('dates')}{meeting.get('days')}{meeting.get('time')}{meeting.get('campus')}{meeting.get('location')}"
144 |                         )
145 |                         # Extract start and end time from time string
146 |                         time_str = meeting.get("time", "")
147 |                         start_time = (
148 |                             time_str.split("-")[0].strip() if "-" in time_str else "N/A"
149 |                         )
150 |                         end_time = (
151 |                             time_str.split("-")[1].strip() if "-" in time_str else "N/A"
152 |                         )
153 |                         db_meeting = Meetings(
154 |                             id=meeting_cid,
155 |                             dates=meeting.get("dates", "N/A"),
156 |                             days=meeting.get("days", "N/A"),
157 |                             start_time=start_time,
158 |                             end_time=end_time,
159 |                             campus=meeting.get("campus", "N/A"),
160 |                             location=meeting.get("location", "N/A"),
161 |                             course_class_id=class_cid,
162 |                         )
163 |                         write_queue.put(db_meeting)
164 |                     except Exception as e:
165 |                         print(
166 |                             f"Error inserting meeting for class {class_nbr} of course {course_code}: {e}"
167 |                         )
168 | 
169 |         progress.update(subject_task, advance=1)
170 | 
171 |     except Exception as e:
172 |         print(f"Error processing course {course['code']}: {e}")
173 | 
174 | 
175 | def process_subject(subject, year, engine, progress, all_task, lock):
176 |     """Process a single subject and insert data into the database."""
177 |     try:
178 |         name = subject["subject"]
179 |         subject_task = progress.add_task(f"[cyan]{name}", total=None)
180 | 
181 |         # Subject Custom ID
182 |         subject_cid = get_short_hash(f"{name}")
183 | 
184 |         # Open a new session just before inserting data
185 |         session = Session(bind=engine)
186 | 
187 |         # Insert subject into the queue
188 |         db_subject = Subject(id=subject_cid, name=name)
189 |         session.close()
190 |         write_queue.put(db_subject)
191 | 
192 |         # Encode & in subject name
193 |         encoded_name = name.replace("&", "%26")
194 |         courses = data_parser.get_course_codes(encoded_name, year)
195 |         course_list = courses.get("courses", []) if isinstance(courses, dict) else []
196 |         progress.update(subject_task, total=len(course_list))
197 | 
198 |         # Process each course concurrently
199 |         with ThreadPoolExecutor(max_workers=50) as executor:
200 |             futures = []
201 |             for course in course_list:
202 |                 future = executor.submit(
203 |                     process_course,
204 |                     course,
205 |                     year,
206 |                     subject,
207 |                     engine,
208 |                     progress,
209 |                     subject_task,
210 |                     lock,
211 |                 )
212 |                 futures.append(future)
213 | 
214 |             # Wait for all threads to complete
215 |             for future in as_completed(futures):
216 |                 future.result()
217 | 
218 |         progress.update(subject_task, advance=1)
219 |         progress.update(all_task, advance=1)
220 | 
221 |     except Exception as e:
222 |         print(f"Error processing subject {subject['subject']}: {e}")
223 | 
224 | 
225 | def main():
226 |     """Scrape data from the API and store it in a local database"""
227 | 
228 |     # Run proxy fetching and testing
229 |     fetch_proxies.main()
230 | 
231 |     # If db already exists, delete it
232 |     if os.path.exists("src/dev.sqlite3"):
233 |         os.remove("src/dev.sqlite3")
234 | 
235 |     engine = create_engine(
236 |         "sqlite:///src/dev.sqlite3",
237 |         pool_size=1000,  # Increase the pool size to allow for more connections
238 |         max_overflow=1000,  # Allow overflow connections
239 |         pool_timeout=30,  # Set the pool timeout to 30 seconds
240 |     )
241 |     Base.metadata.create_all(engine)
242 |     Session.configure(bind=engine)
243 | 
244 |     year_str = dotenv_values().get("YEAR")
245 |     if year_str is None:
246 |         raise ValueError("YEAR environment variable is not set")
247 |     year = int(year_str)
248 | 
249 |     # Create lock for thread-safe operations
250 |     lock = Lock()
251 | 
252 |     # Start DB writer thread
253 |     writer_thread = Thread(target=db_writer, args=(engine,))
254 |     writer_thread.start()
255 | 
256 |     with Progress() as progress:
257 |         subjects = data_parser.get_subjects(year)
258 | 
259 |         all_task = progress.add_task(
260 |             "[cyan bold]All Courses", total=len(subjects["subjects"])
261 |         )
262 | 
263 |         # Create a thread pool with multiple threads
264 |         with ThreadPoolExecutor(max_workers=50) as executor:
265 |             futures = []
266 |             for subject in subjects["subjects"]:
267 |                 future = executor.submit(
268 |                     process_subject, subject, year, engine, progress, all_task, lock
269 |                 )
270 |                 futures.append(future)
271 | 
272 |             # Wait for all threads to complete
273 |             for future in as_completed(futures):
274 |                 future.result()
275 | 
276 |     # Signal DB writer to stop and wait
277 |     write_queue.put(None)
278 |     writer_thread.join()
279 | 
280 | 
281 | if __name__ == "__main__":
282 |     main()
283 | 


--------------------------------------------------------------------------------
/src/server.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import sys
  3 | from datetime import datetime
  4 | from typing import Dict, List, Union
  5 | 
  6 | from dotenv import dotenv_values
  7 | from fastapi import Depends, FastAPI, HTTPException
  8 | from fastapi.middleware.cors import CORSMiddleware
  9 | from pydantic import ValidationError
 10 | from sqlalchemy import create_engine
 11 | from sqlalchemy.orm import Session, sessionmaker
 12 | 
 13 | from .models import Base, Course, CourseClass, Subject
 14 | from .schemas import CourseSchema
 15 | 
 16 | # Check if the application is running in development mode
 17 | is_dev_mode = "dev" in sys.argv
 18 | 
 19 | # Configure FastAPI based on the mode
 20 | app = FastAPI(
 21 |     docs_url="/docs" if is_dev_mode else None,
 22 |     redoc_url="/redoc" if is_dev_mode else None,
 23 | )
 24 | 
 25 | # Determine the database type
 26 | DB_TYPE = dotenv_values().get("DB_TYPE")
 27 | 
 28 | 
 29 | if DB_TYPE == "libsql":
 30 |     # Use LibSQL
 31 |     TURSO_DATABASE_URL = dotenv_values().get("TURSO_DATABASE_URL")
 32 |     TURSO_AUTH_TOKEN = dotenv_values().get("TURSO_AUTH_TOKEN")
 33 |     DATABASE_URL = (
 34 |         f"sqlite+{TURSO_DATABASE_URL}/?authToken={TURSO_AUTH_TOKEN}&secure=true"
 35 |     )
 36 |     engine = create_engine(
 37 |         DATABASE_URL, connect_args={"check_same_thread": False}, echo=True
 38 |     )
 39 | elif DB_TYPE == "dev":
 40 |     # Use dev db
 41 |     DATABASE_URL = "sqlite:///src/dev.sqlite3"
 42 |     engine = create_engine(DATABASE_URL)
 43 | else:
 44 |     # Use completed courses db
 45 |     DATABASE_URL = "sqlite:///src/local.sqlite3"
 46 |     engine = create_engine(DATABASE_URL)
 47 | 
 48 | print("DB_TYPE:", DB_TYPE)
 49 | print("DATABASE_URL:", DATABASE_URL)
 50 | 
 51 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 52 | Base.metadata.create_all(bind=engine)
 53 | 
 54 | # Configure CORS for local development and production
 55 | origins = [
 56 |     "http://localhost:5173",
 57 |     "http://localhost:8000",
 58 |     "https://mytimetable.csclub.org.au",
 59 | ]
 60 | 
 61 | app.add_middleware(
 62 |     CORSMiddleware,
 63 |     allow_origins=origins,
 64 |     allow_credentials=True,
 65 |     allow_methods=["*"],
 66 |     allow_headers=["*"],
 67 | )
 68 | 
 69 | 
 70 | def get_db():
 71 |     """Get a database session."""
 72 |     db = SessionLocal()
 73 |     try:
 74 |         yield db
 75 |     finally:
 76 |         db.close()
 77 | 
 78 | 
 79 | def current_year() -> int:
 80 |     """Gets the current year."""
 81 |     year_str = dotenv_values().get("YEAR")
 82 |     if year_str is None:
 83 |         return datetime.now().year
 84 |     return int(year_str)
 85 | 
 86 | 
 87 | def current_sem() -> str:
 88 |     """Gets the current semester."""
 89 |     return "Semester 1" if datetime.now().month <= 6 else "Semester 2"
 90 | 
 91 | 
 92 | def get_term_number(db, year: int, term: str) -> str:
 93 |     """Gets the term number from the local database."""
 94 | 
 95 |     # Convert aliases
 96 |     term = convert_term_alias(term)
 97 |     courses = db.query(Course).filter(Course.year == year).all()
 98 | 
 99 |     if not courses:
100 |         raise HTTPException(
101 |             status_code=404, detail=f"No courses found for year: {year}"
102 |         )
103 | 
104 |     for course in courses:
105 |         if term in (course.terms or ""):
106 |             return term
107 | 
108 |     raise HTTPException(
109 |         status_code=404, detail=f"Invalid term: {term} for year: {year}"
110 |     )
111 | 
112 | 
113 | def meeting_date_convert(raw_date: str) -> dict[str, str]:
114 |     """Converts the date format given in the meetings to "MM-DD"
115 |     Args:
116 |         raw_date (str): The given meeting date in the format of "DD {3-char weekday}
117 |         - DD {3-char weekday}"
118 |     Returns:
119 |         formatted_date (dict[str]): The formatted meeting date in the format of "MM-DD"
120 |     """
121 |     months = [
122 |         "Jan",
123 |         "Feb",
124 |         "Mar",
125 |         "Apr",
126 |         "May",
127 |         "Jun",
128 |         "Jul",
129 |         "Aug",
130 |         "Sep",
131 |         "Oct",
132 |         "Nov",
133 |         "Dec",
134 |     ]
135 |     start, end = raw_date.split(" - ")
136 | 
137 |     start_d, start_m = start.split()
138 |     start_m = str(months.index(start_m) + 1).zfill(2)
139 | 
140 |     end_d, end_m = end.split()
141 |     end_m = str(months.index(end_m) + 1).zfill(2)
142 | 
143 |     return {
144 |         "start": f"{start_m}-{start_d.zfill(2)}",
145 |         "end": f"{end_m}-{end_d.zfill(2)}",
146 |     }
147 | 
148 | 
149 | def meeting_time_convert(raw_time: str) -> str:
150 |     """Converts the time given in meetings to "HH:mm"
151 |     Args:
152 |         raw_time (str): The given meeting time in the format of "H{am/pm}"
153 |     Returns:
154 |         formatted_time (str): The formatted meeting time in the format of "HH:mm"
155 |     """
156 |     if ":" in raw_time:
157 |         time_part, period = raw_time[:-2], raw_time[-2:].lower()
158 |         hour, minute = map(int, time_part.split(":"))
159 |     else:
160 |         period = raw_time[-2:].lower()
161 |         hour = int(raw_time[:-2])
162 |         minute = 0
163 | 
164 |     if period == "pm" and hour != 12:
165 |         hour += 12
166 |     elif period == "am" and hour == 12:
167 |         hour = 0
168 | 
169 |     return f"{str(hour).zfill(2)}:{str(minute).zfill(2)}"
170 | 
171 | 
172 | def parse_requisites(raw_requisites: str) -> Union[list[str], None]:
173 |     """Takes in a string of -requisites and returns a list of the parsed-out subjects
174 |     Args:
175 |         raw_requisites (str): The raw string containing a list of -requisites, usually
176 |         in the format of "COMP SCI 1103, COMP SCI 2202, COMP SCI 2202B" as an example
177 |     Returns:
178 |         parsed_requisites (Union[list[str], None]): A list of the parsed -requisites,
179 |         or None if raw_requisites is None
180 |     """
181 | 
182 |     if not raw_requisites:
183 |         return None
184 | 
185 |     # Regex pattern to match subjects and course numbers
186 |     pattern = r"\b([A-Z]+(?:\s+[A-Z]+)*)\s+(\d{4}\w*)\b"
187 |     matched_subjects = [
188 |         " ".join(match) for match in re.findall(pattern, raw_requisites)
189 |     ]
190 | 
191 |     return matched_subjects if matched_subjects else None
192 | 
193 | 
194 | def convert_term_alias(term_alias: str) -> str:
195 |     """Takes in a term alias and returns the CoursePlanner API name for said term
196 |     Args:
197 |         term_alias (str): The unconverted term, this doesn't have to be an alias
198 |         in which case no conversion will be done
199 |     Returns:
200 |         str: The converted or original term depending on if a conversion was made
201 |     """
202 | 
203 |     terms_without_digits = ("summer", "winter")
204 |     aliases = {
205 |         "sem": "Semester",
206 |         "summer": "summer",
207 |         "winter": "Winter",
208 |         "online": "Online Term",
209 |         "term": "Term",
210 |         "uao": "UAO Teaching Period",
211 |     }
212 | 
213 |     # Convert the alias, append its digit to the end if the term needs a digit at the end
214 |     converted_alias = aliases.get(
215 |         term_alias[:-1] if term_alias[-1].isdigit() else term_alias, term_alias
216 |     )
217 |     if (
218 |         term_alias not in terms_without_digits
219 |         and term_alias[-1].isdigit()
220 |         and converted_alias != term_alias
221 |     ):
222 |         converted_alias += " " + term_alias[-1]
223 | 
224 |     return converted_alias
225 | 
226 | 
227 | @app.get("/subjects", response_model=List[str])
228 | def get_subjects(
229 |     year: int = current_year(), term: str = current_sem(), db: Session = Depends(get_db)
230 | ):
231 |     """Get all possible subjects for a given year and term, sorted alphabetically.
232 | 
233 |     Args:
234 |         year (int, optional): The year to search for courses. Defaults to current year.
235 |         term (str, optional): The term to search for courses. Defaults to current semester.
236 | 
237 |     Returns:
238 |         dict: A dictionary containing a list of subjects.
239 |     """
240 |     term_number = get_term_number(db, year, term)
241 | 
242 |     results = (
243 |         db.query(Course)
244 |         .filter(Course.year == year, Course.terms.contains(term_number))
245 |         .all()
246 |     )
247 | 
248 |     if not results:
249 |         raise HTTPException(
250 |             status_code=404, detail="No courses found for the specified year and term"
251 |         )
252 | 
253 |     # Extract unique subject codes from the results
254 |     subjects = db.query(Subject).all()
255 |     unique_names = set()
256 | 
257 |     subjects: list[str] = []
258 | 
259 |     # Collect unique subject codes from course results
260 |     for entry in results:
261 |         name = entry.subject
262 |         if name:  # Skip empty names
263 |             unique_names.add(name)
264 | 
265 |     # Add subject name for each unique name
266 |     for name in unique_names:
267 |         subjects.append(name)
268 | 
269 |     # Sort the subjects alphabetically
270 |     subjects.sort()
271 |     return subjects
272 | 
273 | 
274 | @app.get("/courses", response_model=Union[Dict, List])
275 | def get_subject_courses(
276 |     subject: str,
277 |     year: int = current_year(),
278 |     term: str = current_sem(),
279 |     db: Session = Depends(get_db),
280 | ):
281 |     """Gets a list of courses given a subject (and optionally a year and term).
282 | 
283 |     Args:
284 |         subject (str, required): The subject code to search for.
285 |         year (int, optional): The year of the courses from 2006 to
286 |         the current year. Defaults to current year.
287 |         term (str, optional): The term of the courses. Defaults to current semester.
288 | 
289 |     Returns:
290 |         list[dict]: A list of courses as dictionaries.
291 |     """
292 |     term_number = get_term_number(db, year, term)
293 |     results = (
294 |         db.query(Course)
295 |         .filter(
296 |             Course.subject == subject,
297 |             Course.year == year,
298 |             Course.terms.contains(term_number),
299 |         )
300 |         .order_by(Course.course_code)
301 |         .all()
302 |     )
303 | 
304 |     if not results:
305 |         raise HTTPException(
306 |             status_code=404, detail="No courses found for the specified year and term"
307 |         )
308 | 
309 |     transformed_courses = {"courses": []}
310 | 
311 |     # Extract necessary information from the results
312 |     for entry in results:
313 |         transformed_courses["courses"].append(
314 |             {
315 |                 "id": entry.id,
316 |                 "name": {
317 |                     "subject": entry.subject,
318 |                     "code": entry.course_code,
319 |                     "title": entry.title,
320 |                 },
321 |             }
322 |         )
323 | 
324 |     # Sort courses by course code alphabetically
325 |     transformed_courses["courses"].sort(
326 |         key=lambda x: x["name"]["code"].lower() if x["name"]["code"] else ""
327 |     )
328 |     return transformed_courses
329 | 
330 | 
331 | def split_class_type_category(original_type: str):
332 |     CATEGORIES = {"enrolment", "related"}
333 |     full_category, class_type = original_type.split(": ")
334 |     class_category = "unknown"
335 |     for category in CATEGORIES:
336 |         if category in full_category.lower():
337 |             class_category = category
338 |             break
339 |     return {"category": class_category, "type": class_type}
340 | 
341 | 
342 | @app.get("/courses/{course_cid}", response_model=Union[Dict, List])
343 | def get_course(course_cid: str, db: Session = Depends(get_db)):
344 |     """Course details route, takes in an id returns the courses' info and classes.
345 | 
346 |     Args:
347 |         course_cid (string, required): The id to search for.
348 | 
349 |     Returns:
350 |         dict: A dictionary containing the course information and classes.
351 |     """
352 |     course = db.query(Course).filter(Course.id == course_cid).first()
353 | 
354 |     if not course:
355 |         raise HTTPException(status_code=404, detail="Course not found")
356 | 
357 |     course_id = course.course_id
358 | 
359 |     course_details = db.query(Course).filter(Course.course_id == course_id).first()
360 | 
361 |     # Extract necessary information from details
362 |     if course_details:
363 |         name = {
364 |             "subject": course.subject,
365 |             "code": course.course_code,
366 |             "title": course.title,
367 |         }
368 |         requirements = {
369 |             "prerequisites": parse_requisites(course_details.prerequisites),
370 |             "corequisites": parse_requisites(course_details.corequisites),
371 |             "antirequisites": parse_requisites(course_details.antirequisites),
372 |         }
373 |     else:
374 |         name = {"subject": "", "code": "", "title": ""}
375 |         requirements = {}
376 | 
377 |     # Construct the response
378 |     response = {
379 |         "id": course_cid,
380 |         "course_id": course.course_id,
381 |         "name": name,
382 |         "year": course.year,
383 |         "term": course.terms,
384 |         "campus": course.campus,
385 |         "units": course.units,
386 |         "requirements": requirements,
387 |         "class_list": [],
388 |     }
389 | 
390 |     # Fetch classes info and process to match the required structure
391 |     classes = db.query(CourseClass).filter(CourseClass.course_id == course_cid).all()
392 |     if classes:
393 |         class_groups = {}
394 |         for class_group in classes:
395 |             class_type = split_class_type_category(class_group.component)["type"]
396 |             if class_type not in class_groups:
397 |                 class_groups[class_type] = {
398 |                     **split_class_type_category(class_group.component),
399 |                     "id": class_group.id,
400 |                     "classes": [],
401 |                 }
402 |             class_list_entry = class_groups[class_type]
403 |             class_entry = {
404 |                 "number": str(class_group.class_nbr),
405 |                 "section": class_group.section,  # Returns class section
406 |                 "size": str(class_group.size),
407 |                 "available_seats": str(class_group.available),
408 |                 "meetings": [],
409 |             }
410 |             for meeting in class_group.meetings:
411 |                 # Split the meeting days by commas, and handle multiple same-day entries
412 |                 meeting_days = [
413 |                     day.strip() for day in meeting.days.split(",") if day.strip()
414 |                 ]
415 | 
416 |                 # Flatten the list if the days appear multiple times (e.g., "Monday, Monday")
417 |                 flattened_meeting_days = []
418 |                 for day in meeting_days:
419 |                     # Append each day individually
420 |                     flattened_meeting_days.append(day)
421 | 
422 |                 # Skip weekend meetings
423 |                 if any(day in flattened_meeting_days for day in ["Saturday", "Sunday"]):
424 |                     continue
425 | 
426 |                 # Create meeting entry
427 |                 for day in flattened_meeting_days:
428 |                     meeting_entry = {
429 |                         "day": day,
430 |                         "location": meeting.location,
431 |                         "campus": meeting.campus,
432 |                         "date": meeting_date_convert(meeting.dates),
433 |                         "time": {
434 |                             "start": meeting_time_convert(meeting.start_time),
435 |                             "end": meeting_time_convert(meeting.end_time),
436 |                         },
437 |                     }
438 |                     class_entry["meetings"].append(meeting_entry)
439 | 
440 |             class_list_entry["classes"].append(class_entry)
441 | 
442 |         response["class_list"] = list(class_groups.values())
443 | 
444 |     try:
445 |         CourseSchema.model_validate(response)
446 |     except ValidationError as e:
447 |         raise HTTPException(status_code=501, detail=e.errors())
448 | 
449 |     return response
450 | 


--------------------------------------------------------------------------------
/src/data_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | from bs4 import BeautifulSoup
  4 | 
  5 | import data_fetcher
  6 | from log import logger
  7 | 
  8 | 
  9 | def get_subjects(year: int) -> dict[str, list[dict[str, str]]]:
 10 |     """Return a list of subjects for a given year."""
 11 |     subjects = data_fetcher.DataFetcher(
 12 |         f"?f.Tabs|type=Degrees+%26+Courses&form=json&num_ranks=10&profile=site-search&query=&f.Year|year={year}&collection=uosa~sp-aem-prod&f.Study+type|studyType=Course&start_rank=1"
 13 |     )
 14 | 
 15 |     try:
 16 |         data = subjects.get()
 17 |         if (
 18 |             subjects.last_response is None
 19 |             or subjects.last_response.status_code != 200
 20 |             or data is None
 21 |         ):
 22 |             status = (
 23 |                 subjects.last_response.status_code
 24 |                 if subjects.last_response
 25 |                 else "NO_RESPONSE"
 26 |             )
 27 |             print(f"Error: {status} - {data}")
 28 |             return {"subjects": []}
 29 | 
 30 |         subject_list = []
 31 | 
 32 |         data = data.get("facets")[5].get("allValues", [])
 33 |         for subject in data:
 34 |             subj = subject.get("data")
 35 |             subject_list.append({"subject": subj})
 36 |         logger.debug(f"Subjects: {subject_list}")
 37 |         return {"subjects": subject_list}
 38 | 
 39 |     except Exception as e:
 40 |         print(f"An error occurred while fetching subjects: {e}")
 41 |         return {"subjects": []}
 42 | 
 43 | 
 44 | def get_course_codes(subject: str, year: int):
 45 |     """Return a list of course codes for a given subject code and year."""
 46 |     courses = data_fetcher.DataFetcher(
 47 |         f"?f.Tabs%7Ctype=Degrees+%26+Courses&form=json&f.Year%7Cyear={year}&num_ranks=1000&profile=site-search&query=&f.Area+of+study%7CstudyArea={subject}&collection=uosa%7Esp-aem-prod&f.Study+type%7CstudyType=Course"
 48 |     )
 49 | 
 50 |     try:
 51 |         data = courses.get()
 52 |         logger.debug(f"Course data: {data}")
 53 |         if (
 54 |             courses.last_response is None
 55 |             or courses.last_response.status_code != 200
 56 |             or data is None
 57 |         ):
 58 |             status = (
 59 |                 courses.last_response.status_code
 60 |                 if courses.last_response
 61 |                 else "NO_RESPONSE"
 62 |             )
 63 |             print(f"Error: {status} - {data}")
 64 |             return {"courses": []}
 65 |         results = data.get("resultPacket", []).get("results", [])
 66 |         logger.debug(f"Number of courses found: {len(results)}")
 67 | 
 68 |         if not results:
 69 |             logger.debug("No results found in course codes.")
 70 |             return {}
 71 | 
 72 |         course_codes = [
 73 |             {
 74 |                 "code": course.get("listMetadata", {}).get("courseCode"),
 75 |                 "terms": course.get("listMetadata", {}).get("term"),
 76 |             }
 77 |             for course in results
 78 |         ]
 79 |         logger.debug("Course codes extracted successfully.")
 80 |         return {"courses": course_codes}
 81 | 
 82 |     except Exception as e:
 83 |         print(f"An error occurred while fetching course codes: {e}")
 84 |         return {"courses": []}
 85 | 
 86 | 
 87 | def get_course_details(course_code: str, max_retries=3):
 88 |     """Return the details for a given course."""
 89 |     logger.debug(f"Fetching details for course {course_code}")
 90 |     for _ in range(max_retries):
 91 |         # Encode course code to match URL format
 92 |         code_str = (
 93 |             course_code[0] if isinstance(course_code, (list, tuple)) else course_code
 94 |         )
 95 |         encoded_course_code = re.sub(
 96 |             r"([a-zA-Z]+)([0-9]+)", r"\1-\2", str(code_str)
 97 |         ).lower()
 98 | 
 99 |         course_details = data_fetcher.DataFetcher(
100 |             f"/study/courses/{encoded_course_code}/", use_class_url=True
101 |         )
102 |         try:
103 |             data = course_details.get()
104 |             if course_details.last_response is None:
105 |                 logger.error(
106 |                     f"No HTTP response available for {course_code}. Data: {data}"
107 |                 )
108 |                 # Make sure to return a dictionary with expected keys so caller won't crash
109 |                 return {
110 |                     "code": code_str,
111 |                     "title": data.get("h1", "") if isinstance(data, dict) else "",
112 |                     "course_id": None,
113 |                 }
114 |             if course_details.last_response.status_code != 200:
115 |                 print(
116 |                     f"Error: {course_details.last_response.status_code} - "
117 |                     f"{course_details.last_response.text}"
118 |                 )
119 |                 return {}
120 |             # Return plain text string without extra newlines
121 |             text = data.get("data", "")
122 | 
123 |             # Strip HTML tags
124 |             soup = BeautifulSoup(text, "html.parser")
125 |             body_text = soup.get_text() if soup else text
126 | 
127 |             # Parse the plain-body text for label/value pairs
128 |             parsed = parse_course_text(body_text)
129 | 
130 |             # Return a dict with the parsed fields and the canonical code string
131 |             course_details = {
132 |                 "code": code_str,
133 |                 "title": data.get("h1", ""),
134 |                 "course_id": parsed.get("course_id"),
135 |                 "campus": parsed.get("campus"),
136 |                 "level_of_study": parsed.get("level_of_study"),
137 |                 "units": parsed.get("units"),
138 |                 "course_coordinator": parsed.get("course_coordinator"),
139 |                 "course_level": parsed.get("course_level"),
140 |                 "course_overview": parsed.get("course_overview"),
141 |                 "prerequisites": parsed.get("prerequisites"),
142 |                 "corequisites": parsed.get("corequisites"),
143 |                 "antirequisites": parsed.get("antirequisites"),
144 |             }
145 | 
146 |             logger.debug("Course details extracted successfully.")
147 |             return course_details
148 | 
149 |         except Exception as e:
150 |             print(f"An error occurred while fetching course details: {e}")
151 |             return {}
152 | 
153 |     print(
154 |         f"Failed to retrieve course details for course {course_code} after {max_retries} attempts."
155 |     )
156 |     return {}
157 | 
158 | 
159 | def parse_course_text(text: str) -> dict:
160 |     """Parse a course details plain text and return a dict of fields."""
161 |     if not isinstance(text, str):
162 |         return {}
163 | 
164 |     # Ensure text is plain and normalised
165 |     lines = [line.strip() for line in text.splitlines() if line.strip()]
166 |     labels = {
167 |         "course id": "course_id",
168 |         "campus": "campus",
169 |         "level of study": "level_of_study",
170 |         "unit value": "units",
171 |         "course coordinator": "course_coordinator",
172 |         "course level": "course_level",
173 |         "course overview": "course_overview",
174 |         "prerequisite(s)": "prerequisites",
175 |         "corequisite(s)": "corequisites",
176 |         "antirequisite(s)": "antirequisites",
177 |     }
178 | 
179 |     parsed = {v: None for v in labels.values()}
180 |     i = 0
181 |     # Update the parse_course_text function to handle the case where campus is "Location"
182 |     while i < len(lines):
183 |         key = lines[i].lower()
184 |         if key in labels and i + 1 < len(lines):
185 |             parsed_key = labels[key]
186 |             value = lines[i + 1].strip()
187 |             # Skip if the value for campus is "Location"
188 |             if parsed_key == "campus" and value.lower() == "location":
189 |                 i += 2
190 |                 continue
191 |             parsed[parsed_key] = value
192 |             i += 2
193 |             continue
194 |         i += 1
195 |     return parsed
196 | 
197 | 
198 | def get_course_class_list(course_code: int):
199 |     """Return the class list of a course for a given course code."""
200 | 
201 |     # Encode course code to match URL format
202 |     code_str = course_code[0] if isinstance(course_code, (list, tuple)) else course_code
203 |     encoded_course_code = re.sub(
204 |         r"([a-zA-Z]+)([0-9]+)", r"\1-\2", str(code_str)
205 |     ).lower()
206 | 
207 |     course_details = data_fetcher.DataFetcher(
208 |         f"/study/courses/{encoded_course_code}/", use_class_url=True
209 |     )
210 | 
211 |     try:
212 |         data = course_details.get()
213 |         if (
214 |             course_details.last_response is None
215 |             or course_details.last_response.status_code != 200
216 |         ):
217 |             status = (
218 |                 course_details.last_response.status_code
219 |                 if course_details.last_response
220 |                 else "NO_RESPONSE"
221 |             )
222 |             text = (
223 |                 course_details.last_response.text
224 |                 if course_details.last_response
225 |                 else ""
226 |             )
227 |             print(f"Error: {status} - {text}")
228 |             # Return a minimal dict so callers don't KeyError when accessing title/course_id
229 |             return {
230 |                 "code": code_str,
231 |                 "title": data.get("h1", "") if isinstance(data, dict) else "",
232 |                 "course_id": None,
233 |             }
234 |         # Return plain text string without extra newlines
235 |         text = data.get("data", "")
236 | 
237 |         # Parse the plain-body text for class list details
238 |         parsed_classes = parse_course_class_list(text)
239 |         return {"classes": parsed_classes}
240 | 
241 |     except Exception as e:
242 |         print(f"An error occurred while fetching course class list: {e}")
243 |         return {}
244 | 
245 | 
246 | def parse_course_class_list(text: str) -> list[dict]:
247 |     """Parse course class list details from the given text."""
248 |     if not isinstance(text, str):
249 |         return []
250 | 
251 |     lines = [line.strip() for line in text.splitlines() if line.strip()]
252 |     parsed_classes = []
253 |     current_class = None
254 |     # Current component context for classes within a "Class details" block.
255 |     current_component = "unknown"
256 |     i = 0
257 | 
258 |     def _is_date_line(s: str) -> bool:
259 |         # matches things like '3 Aug - 21 Sep' or '12 Oct - 9 Nov'
260 |         return bool(re.search(r"^\d{1,2} [A-Za-z]+\s*-\s*\d{1,2} [A-Za-z]+$", s))
261 | 
262 |     while i < len(lines):
263 |         line = lines[i]
264 | 
265 |         # Look for "Availability" and skip until "Class details" – content prior to class details is not needed
266 |         if "Availability" in line:
267 |             while i < len(lines) and not lines[i].startswith("Class details"):
268 |                 i += 1
269 |             continue
270 | 
271 |         # Start parsing class details: reset the context for a new set of classes and components
272 |         if line.startswith("Class details"):
273 |             # Commit any prior orphaned class (if it has a class number)
274 |             if current_class and current_class.get("class_number"):
275 |                 parsed_classes.append(current_class)
276 |             current_class = None
277 |             # Reset the current component and attempt a lookahead for the first component
278 |             current_component = "unknown"
279 |             lookahead = 1
280 |             max_look = 24
281 |             while i + lookahead < len(lines) and lookahead <= max_look:
282 |                 candidate = lines[i + lookahead].strip()
283 |                 if candidate.lower().startswith(
284 |                     "enrolment class"
285 |                 ) or candidate.lower().startswith("related class"):
286 |                     parts = candidate.split(":", 1)
287 |                     if len(parts) > 1 and parts[1].strip():
288 |                         current_component = parts[0].strip() + ": " + parts[1].strip()
289 |                     else:
290 |                         # If value is next line and not a class number, use it
291 |                         next_val = (
292 |                             lines[i + lookahead + 1].strip()
293 |                             if i + lookahead + 1 < len(lines)
294 |                             else ""
295 |                         )
296 |                         if next_val and not next_val.lower().startswith("class number"):
297 |                             current_component = parts[0].strip() + ": " + next_val
298 |                     break
299 |                 # If the block restarts, stop searching
300 |                 if candidate.startswith("Class details"):
301 |                     break
302 |                 lookahead += 1
303 |             i += 1
304 |             continue
305 | 
306 |         # Parse other class attributes
307 |         # Detect inline component labels anywhere in the Class details block
308 |         if line.lower().startswith("enrolment class") or line.lower().startswith(
309 |             "related class"
310 |         ):
311 |             parts = line.split(":", 1)
312 |             if len(parts) > 1 and parts[1].strip():
313 |                 current_component = parts[0].strip() + ": " + parts[1].strip()
314 |             else:
315 |                 next_val = lines[i + 1].strip() if i + 1 < len(lines) else ""
316 |                 if next_val and not next_val.lower().startswith("class number"):
317 |                     current_component = parts[0].strip() + ": " + next_val
318 |                     i += 1
319 |             if current_class and not current_class.get("class_number"):
320 |                 current_class["component"] = current_component
321 |             i += 1
322 |             continue
323 | 
324 |         if line.startswith("Class number"):
325 |             # If we already have a class with a class_number, start a new class and append the old one
326 |             if current_class and current_class.get("class_number"):
327 |                 parsed_classes.append(current_class)
328 |                 # Carry over campus/header values
329 |                 current_class = {
330 |                     "meetings": [],
331 |                     "campus": current_class.get("campus"),
332 |                     "component": current_component or current_class.get("component"),
333 |                 }
334 |             elif not current_class:
335 |                 current_class = {"meetings": []}
336 |             current_class["class_number"] = line.split("Class number")[-1].strip()
337 |             # Ensure the class has a component set (inherit from block context)
338 |             if "component" not in current_class or not current_class.get("component"):
339 |                 current_class["component"] = current_component or "unknown"
340 |             i += 1
341 |             continue
342 | 
343 |         if line.startswith("Section"):
344 |             if not current_class:
345 |                 current_class = {"meetings": []}
346 |                 current_class["component"] = current_component or "unknown"
347 |             current_class["section"] = line.split("Section")[-1].strip()
348 |             i += 1
349 |             continue
350 |         if line.startswith("Size"):
351 |             if not current_class:
352 |                 current_class = {"meetings": []}
353 |                 current_class["component"] = current_component or "unknown"
354 |             current_class["size"] = line.split("Size")[-1].strip()
355 |             i += 1
356 |             continue
357 |         if line.startswith("Available"):
358 |             if not current_class:
359 |                 current_class = {"meetings": []}
360 |                 current_class["component"] = current_component or "unknown"
361 |             if line.split("Available")[-1].strip().isdigit():
362 |                 current_class["available"] = line.split("Available")[-1].strip()
363 |             i += 1
364 |             continue
365 | 
366 |         # Detect meeting table header
367 |         headers = ["Dates", "Days", "Time", "Campus", "Location", "Instructor"]
368 |         if all(
369 |             i + j < len(lines) and lines[i + j] == headers[j]
370 |             for j in range(len(headers))
371 |         ):
372 |             # Skip header row
373 |             i += len(headers)
374 |             # Read meeting rows until next class starts
375 |             while (
376 |                 i < len(lines)
377 |                 and not lines[i].startswith("Class number")
378 |                 and not lines[i].startswith("Class details")
379 |             ):
380 |                 # Need at least a date, day, time, campus, location, instructor to be a valid row
381 |                 if not _is_date_line(lines[i]):
382 |                     break
383 |                 # Attempt to parse row segments
384 |                 dates_val = lines[i]
385 |                 days_val = lines[i + 1] if i + 1 < len(lines) else ""
386 |                 time_val = lines[i + 2] if i + 2 < len(lines) else ""
387 |                 campus_val = lines[i + 3] if i + 3 < len(lines) else ""
388 |                 location_val = lines[i + 4] if i + 4 < len(lines) else ""
389 |                 next_i = i + 5
390 | 
391 |                 meeting = {
392 |                     "dates": dates_val,
393 |                     "days": days_val,
394 |                     "time": time_val,
395 |                     "campus": campus_val,
396 |                     "location": location_val,
397 |                 }
398 |                 if not current_class:
399 |                     current_class = {"meetings": []}
400 |                     current_class["component"] = current_component or "unknown"
401 |                 current_class.setdefault("meetings", []).append(meeting)
402 |                 i = next_i
403 |             continue
404 | 
405 |         # Fallback: advance
406 |         i += 1
407 | 
408 |     # Append the last class if it contains a class number
409 |     if current_class and current_class.get("class_number"):
410 |         parsed_classes.append(current_class)
411 |     return parsed_classes
412 | 


--------------------------------------------------------------------------------