├── src
│   ├── __init__.py
│   ├── local.sqlite3
│   ├── log.py
│   ├── schemas.py
│   ├── models.py
│   ├── fetch_proxies.py
│   ├── data_fetcher.py
│   ├── scraper.py
│   ├── server.py
│   └── data_parser.py
├── .dockerignore
├── .gitattributes
├── .env.example
├── .pre-commit-config.yaml
├── docker-compose.yml
├── .github
│   └── workflows
│       ├── ci-dev-pr.yml
│       ├── lint.yml
│       ├── scraper.yml
│       └── production.yml
├── pyproject.toml
├── Dockerfile
├── scraper.Dockerfile
├── LICENSE
├── README.md
└── .gitignore

/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
.git
.github
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
* text=auto eol=lf
--------------------------------------------------------------------------------
/src/local.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/compsci-adl/courses-api/HEAD/src/local.sqlite3
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
DEFAULT_LOGGING_LEVEL=DEBUG # Options: 'DEBUG' or 'ERROR'
YEAR=2025

DB_TYPE=local # Options: 'libsql', 'dev', or 'local'
TURSO_DATABASE_URL=https://your-turso-database-url
TURSO_AUTH_TOKEN=your-turso-auth-token
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version
    rev: v0.7.3
    hooks:
      # Run the linter
      - id: ruff
        args: [ --fix ]
      # Run the formatter
      - id: ruff-format
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
services:
  courses-api:
    image: courses-api:latest
    container_name: courses-api
    environment:
      - PUID=1000
      - PGID=1000
      - PORT=8000
    ports:
      - 8000:8000
    volumes:
      - ./local.sqlite3:/app/src/local.sqlite3
    networks:
      - csclub

networks:
  csclub:
    external: true
--------------------------------------------------------------------------------
/.github/workflows/ci-dev-pr.yml:
--------------------------------------------------------------------------------
name: Development - Pull Request
on:
  pull_request:
    branches:
      - '**'

jobs:
  lint-format:
    name: Linting Checks
    uses: ./.github/workflows/lint.yml

  build:
    needs: lint-format
    name: Build
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Build Docker container
        run: |
          docker buildx build \
            --file=Dockerfile -t courses-api .
24 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Linting Checks 2 | on: 3 | workflow_call: 4 | 5 | jobs: 6 | lint_and_format: 7 | name: Lint and Format 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | uses: astral-sh/setup-uv@v3 16 | with: 17 | enable-cache: true 18 | cache-dependency-glob: "uv.lock" 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.12' 24 | 25 | - name: Install dependencies 26 | run: | 27 | uv sync 28 | 29 | - name: Check for code errors 30 | run: | 31 | uv run ruff check 32 | 33 | - name: Check formatting 34 | run: | 35 | uv run ruff format --check 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "courses-api" 3 | version = "2.0.0" 4 | description = "API for getting Adelaide University course information" 5 | authors = [ 6 | { name = "CS Club Open Source Team", email = "dev@csclub.org.au" } 7 | ] 8 | license = "MIT" 9 | readme = "README.md" 10 | requires-python = ">=3.12" 11 | dependencies = [ 12 | "beautifulsoup4>=4.14.3", 13 | "fastapi[standard]>=0.115.5", 14 | "json-repair>=0.30.1", 15 | "nanoid>=2.0.0", 16 | "pydantic>=2.9.2", 17 | "requests>=2.32.3", 18 | "rich>=13.9.4", 19 | "ruff>=0.7.3", 20 | "sqlalchemy-libsql>=0.1.0", 21 | "sqlalchemy>=2.0.36", 22 | ] 23 | 24 | [tool.ruff] 25 | lint.select = ['E', 'F', 'W', 'A', 'PLC', 'PLE', 'PLW', 'I'] 26 | lint.ignore = ["E501"] 27 | lint.fixable = ["ALL"] 28 | 29 | [dependency-groups] 30 | dev = [ 31 | "pre-commit>=4.0.1", 32 | "ruff>=0.7.3", 33 | ] 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a Python image with uv pre-installed 2 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim 3 | 4 | # Install the project into `/app` 5 | WORKDIR /app 6 | 7 | # Enable bytecode compilation 8 | ENV UV_COMPILE_BYTECODE=1 9 | 10 | # Copy from the cache instead of linking since it's a mounted volume 11 | ENV UV_LINK_MODE=copy 12 | 13 | # Install the project's dependencies using the lockfile and settings 14 | RUN --mount=type=cache,target=/root/.cache/uv \ 15 | --mount=type=bind,source=uv.lock,target=uv.lock \ 16 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 17 | uv sync --frozen --no-install-project --no-dev 18 | 19 | # Then, add the rest of the project source code and install it 20 | # Installing separately from its dependencies allows optimal layer caching 21 | ADD . 
/app 22 | RUN --mount=type=cache,target=/root/.cache/uv \ 23 | uv sync --frozen --no-dev 24 | 25 | # Place executables in the environment at the front of the path 26 | ENV PATH="/app/.venv/bin:$PATH" 27 | 28 | EXPOSE 8000 29 | 30 | ENTRYPOINT ["fastapi", "run", "src/server.py"] 31 | -------------------------------------------------------------------------------- /scraper.Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a Python image with uv pre-installed 2 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim 3 | 4 | # Install the project into `/app` 5 | WORKDIR /app 6 | 7 | # Enable bytecode compilation 8 | ENV UV_COMPILE_BYTECODE=1 9 | 10 | # Copy from the cache instead of linking since it's a mounted volume 11 | ENV UV_LINK_MODE=copy 12 | 13 | # Install the project's dependencies using the lockfile and settings 14 | RUN --mount=type=cache,target=/root/.cache/uv \ 15 | --mount=type=bind,source=uv.lock,target=uv.lock \ 16 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 17 | uv sync --frozen --no-install-project --no-dev 18 | 19 | # Then, add the rest of the project source code and install it 20 | # Installing separately from its dependencies allows optimal layer caching 21 | ADD . /app 22 | RUN --mount=type=cache,target=/root/.cache/uv \ 23 | uv sync --frozen --no-dev 24 | 25 | # Place executables in the environment at the front of the path 26 | ENV PATH="/app/.venv/bin:$PATH" 27 | 28 | EXPOSE 8000 29 | 30 | ENTRYPOINT ["python3", "src/scraper.py"] 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-present Adelaide University Computer Science Club 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | from dotenv import dotenv_values 6 | 7 | 8 | def setup_logger() -> logging.Logger: 9 | """ 10 | Sets up a logger that writes logs to a file in the form {timestamp}.log 11 | The level of logging that is written to the file depends on the environment 12 | variable "DEFAULT_LOGGING_LEVEL". 
    Returns:
        logging.Logger: a customised logger object
    """

    # Initialise logger
    logger = logging.getLogger("courseAPICallLogger")
    # Fall back to DEBUG if the environment variable is unset
    default_logging_level = dotenv_values().get("DEFAULT_LOGGING_LEVEL") or "DEBUG"
    logger.setLevel(default_logging_level)

    if not logger.hasHandlers():
        # Initialise log dir path
        logs_dir = Path(__file__).resolve().parent.parent / "logs"
        logs_dir.mkdir(parents=True, exist_ok=True)

        # Each error log sent into separate file
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        # Log file path
        log_file_path = logs_dir / f"{timestamp}.log"

        # Setup log file handler
        log_file_handler = logging.FileHandler(log_file_path, mode="w", delay=True)

        # Set level of file handler
        log_file_handler.setLevel(default_logging_level)

        # Setup file formatter
        file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        log_file_handler.setFormatter(file_formatter)

        # Add handler to logger
        logger.addHandler(log_file_handler)

    return logger


logger = setup_logger()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Courses API
Courses API is a tool to scrape course information from the Adelaide University website and provide course data to other CS Club Open Source Team projects via an API endpoint.

## Getting Started

To get started, please follow these steps:

1. Install `uv` if not already installed:

Linux, macOS, Windows (WSL)
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
Windows (Powershell)
```powershell
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```

2. Install dependencies:

```sh
uv sync
pre-commit install
```

3. Create a `.env` file by copying `.env.example` into it

### Running the API Server

1. Start the FastAPI server:

```sh
uv run fastapi dev src/server.py
```

2. Open [http://localhost:8000/docs](http://localhost:8000/docs) with your browser to see the API documentation and to test the available endpoints. An example `curl` request is shown at the end of this README.

### Running the scraper

Start the scraper (Note: Scraping all the courses may take over an hour):

```sh
uv run python3 src/scraper.py
```

#### Debugging
The output level of the logger can be configured in the `.env` file. Set `DEFAULT_LOGGING_LEVEL` to your desired level: `DEBUG` writes all logs to a file, including errors, while `ERROR` logs only errors.

## Contributing

We welcome contributions to enhance Courses API! If you find any issues, have suggestions, or want to request a feature, please follow our [Contributing Guidelines](https://github.com/compsci-adl/.github/blob/main/CONTRIBUTING.md).

## License

This project is licensed under the MIT License.
See [LICENSE](LICENSE) for details.
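
### Example request

Once the server is running and the database has been populated, the endpoints can be queried directly. A hypothetical example using `curl` (the subject name is illustrative; the exact values returned depend on the scraped data):

```sh
# List subjects offered in Semester 1 (the "sem1" alias is expanded to "Semester 1")
curl "http://localhost:8000/subjects?year=2025&term=sem1"

# List the courses for one of the returned subjects
curl "http://localhost:8000/courses?subject=Computer%20Science&year=2025&term=sem1"
```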
--------------------------------------------------------------------------------
/src/schemas.py:
--------------------------------------------------------------------------------
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, model_validator


class NameSchema(BaseModel):
    subject: str
    code: str
    title: str


DateField = Field(pattern=r"\d{2}-\d{2}")
TimeField = Field(pattern=r"\d{2}:\d{2}")


class DateRangeSchema(BaseModel):
    start: str = DateField
    end: str = DateField


class TimeRangeSchema(BaseModel):
    start: str = TimeField
    end: str = TimeField


class MeetingSchema(BaseModel):
    day: Literal[
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    location: str
    date: DateRangeSchema
    time: TimeRangeSchema


class ClassSchema(BaseModel):
    number: str
    section: str  # The class section identifier
    available_seats: str
    meetings: List[MeetingSchema]


class ClassTypeSchema(BaseModel):
    id: str
    category: Optional[Literal["enrolment", "related", "unknown"]] = "unknown"
    type: Optional[str] = None
    component: Optional[str] = None
    classes: List[ClassSchema]

    @model_validator(mode="before")
    @classmethod
    def ensure_component_or_type(cls, values):
        if not values.get("component") and values.get("type"):
            values["component"] = values.get("type")
        if not values.get("type") and values.get("component"):
            values["type"] = values.get("component")
        return values


class RequirementsSchema(BaseModel):
    prerequisites: Optional[List[str]] = None
    corequisites: Optional[List[str]] = None
    antirequisites: Optional[List[str]] = None


class CourseSchema(BaseModel):
    id: str
    course_id: int
    name: NameSchema
    class_number: Optional[int] = None
    year: str
    term: str
    campus: str
    units: int
    requirements: RequirementsSchema
    class_list: List[ClassTypeSchema]
--------------------------------------------------------------------------------
/.github/workflows/scraper.yml:
--------------------------------------------------------------------------------
name: Scraper

on:
  schedule:
    - cron: '30 */12 * * *'
  workflow_dispatch:

env:
  AWS_REGION: ap-southeast-2

jobs:
  run-scraper:
    name: Run Scraper
    runs-on: ubuntu-latest
    environment: Scraper

    env:
      DEFAULT_LOGGING_LEVEL: ${{ secrets.DEFAULT_LOGGING_LEVEL }}
      YEAR: ${{ secrets.YEAR }}

    permissions:
      id-token: write
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          role-session-name: ${{ secrets.AWS_ROLE_SESSION_NAME }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Create .env file
        run: |
          echo "DEFAULT_LOGGING_LEVEL=${{ env.DEFAULT_LOGGING_LEVEL }}" > src/.env
          echo "YEAR=${{ env.YEAR }}" >> src/.env

      - name: Build Docker image
        run: docker build -f scraper.Dockerfile -t courses-api-scraper:latest .
46 | 47 | - name: Run scraper 48 | run: | 49 | docker run --rm \ 50 | -v ${{ github.workspace }}/src:/app/src \ 51 | -e DEFAULT_LOGGING_LEVEL=${{ env.DEFAULT_LOGGING_LEVEL }} \ 52 | -e YEAR=${{ env.YEAR }} \ 53 | courses-api-scraper:latest 54 | 55 | - name: Rename SQLite DB to local.sqlite3 56 | run: mv src/dev.sqlite3 src/local.sqlite3 57 | 58 | - name: Upload DB to S3 59 | run: | 60 | aws s3 cp src/local.sqlite3 s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/ 61 | 62 | - name: Download DB and restart courses-api container on EC2 63 | env: 64 | KEY: ${{ secrets.SSH_EC2_KEY }} 65 | HOSTNAME: ${{ secrets.SSH_EC2_HOSTNAME }} 66 | USER: ${{ secrets.SSH_EC2_USER }} 67 | run: | 68 | echo "$KEY" > private_key && chmod 600 private_key 69 | ssh -v -o StrictHostKeyChecking=no -i private_key ${USER}@${HOSTNAME} ' 70 | cd ~/courses-api 71 | aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/local.sqlite3 . 72 | docker restart courses-api 73 | ' 74 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import ( 2 | Column, 3 | ForeignKey, 4 | Integer, 5 | String, 6 | ) 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy.orm import relationship 9 | 10 | Base = declarative_base() 11 | 12 | 13 | class Subject(Base): 14 | __tablename__ = "subjects" 15 | id = Column(String, primary_key=True) 16 | name = Column(String, unique=True, nullable=False) 17 | courses = relationship("Course", backref="subject_ref") 18 | 19 | 20 | class Course(Base): 21 | __tablename__ = "courses" 22 | id = Column(String, primary_key=True) 23 | course_id = Column(Integer, unique=True, nullable=False) 24 | year = Column(String, nullable=False) 25 | terms = Column(String, nullable=False) 26 | subject = Column(String, ForeignKey("subjects.name"), nullable=False) 27 | course_code = Column(String, nullable=False) 28 | title = Column(String, nullable=False) 29 | campus = Column(String, nullable=False) 30 | level_of_study = Column(String, nullable=True) 31 | units = Column(Integer, nullable=False) 32 | course_coordinator = Column(String, nullable=True) 33 | course_level = Column(String, nullable=False) 34 | course_overview = Column(String, nullable=True) 35 | prerequisites = Column(String, nullable=False) 36 | corequisites = Column(String, nullable=False) 37 | antirequisites = Column(String, nullable=False) 38 | url = Column(String, nullable=False) 39 | course_classes = relationship("CourseClass", backref="course") 40 | 41 | 42 | class Meetings(Base): 43 | __tablename__ = "meetings" 44 | id = Column(String, primary_key=True) 45 | dates = Column(String, nullable=False) 46 | days = Column(String, nullable=False) 47 | start_time = Column(String, nullable=False) 48 | end_time = Column(String, nullable=False) 49 | campus = Column(String, nullable=False) 50 | location = Column(String, nullable=False) 51 | course_class_id = Column(String, ForeignKey("course_classes.id"), nullable=False) 52 | 53 | 54 | class CourseClass(Base): 55 | __tablename__ = "course_classes" 56 | id = Column(String, primary_key=True) 57 | class_nbr = Column(Integer, nullable=False) 58 | section = Column(String, nullable=False) 59 | size = Column(Integer, nullable=False) 60 | available = Column(Integer, nullable=False) 61 | component = Column(String, nullable=False) 62 | meetings = relationship("Meetings", backref="course_class") 63 | course_id = Column(String, ForeignKey("courses.id"), nullable=False) 
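
# Editor's note (illustrative, not part of the original file): with the backrefs
# declared above, a session can walk these models in either direction, e.g.
# assuming `session` is a SQLAlchemy session bound to the populated database:
#
#     course = session.query(Course).first()
#     course.course_classes               # -> list of CourseClass rows
#     course.course_classes[0].meetings   # -> list of Meetings rows for that class
#     course.course_classes[0].course     # -> back to the parent Course (backref)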
64 | -------------------------------------------------------------------------------- /.github/workflows/production.yml: -------------------------------------------------------------------------------- 1 | name: Production 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | env: 8 | AWS_REGION: ap-southeast-2 9 | 10 | jobs: 11 | build: 12 | name: Build 13 | runs-on: ubuntu-24.04-arm 14 | environment: Production 15 | permissions: 16 | id-token: write 17 | contents: read 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Configure AWS credentials 23 | uses: aws-actions/configure-aws-credentials@v4 24 | with: 25 | role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} 26 | role-session-name: ${{ secrets.AWS_ROLE_SESSION_NAME }} 27 | aws-region: ${{ env.AWS_REGION }} 28 | 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v3 31 | 32 | - name: Cache Docker layers 33 | uses: actions/cache@v4 34 | with: 35 | path: /tmp/.buildx-cache 36 | key: ${{ runner.os }}-buildx-${{ github.sha }} 37 | restore-keys: | 38 | ${{ runner.os }}-buildx- 39 | 40 | - name: Build Docker container 41 | env: 42 | PRODUCTION_BUILD: 'true' 43 | run: | 44 | docker buildx build \ 45 | --cache-from=type=local,src=/tmp/.buildx-cache \ 46 | --cache-to=type=local,dest=/tmp/.buildx-cache-new,mode=max \ 47 | --output type=docker,dest=courses-api.tar \ 48 | --platform=linux/arm64 --file=Dockerfile -t courses-api . 49 | gzip courses-api.tar 50 | 51 | - name: Save Docker cache 52 | if: success() 53 | run: | 54 | rsync -a --delete /tmp/.buildx-cache-new/ /tmp/.buildx-cache/ 55 | 56 | - name: Copy image and compose file to S3 57 | run: | 58 | aws s3 cp ./courses-api.tar.gz s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/ 59 | aws s3 cp ./docker-compose.yml s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/ 60 | 61 | deploy: 62 | needs: build 63 | name: Deploy 64 | runs-on: ubuntu-latest 65 | environment: Production 66 | steps: 67 | - name: Deploy on EC2 68 | env: 69 | KEY: ${{ secrets.SSH_EC2_KEY }} 70 | HOSTNAME: ${{ secrets.SSH_EC2_HOSTNAME }} 71 | USER: ${{ secrets.SSH_EC2_USER }} 72 | YEAR: ${{ secrets.YEAR }} 73 | DB_TYPE: ${{ secrets.DB_TYPE }} 74 | TURSO_DATABASE_URL: ${{ secrets.TURSO_DATABASE_URL }} 75 | TURSO_AUTH_TOKEN: ${{ secrets.TURSO_AUTH_TOKEN }} 76 | run: | 77 | echo "$KEY" > private_key && chmod 600 private_key 78 | ssh -v -o StrictHostKeyChecking=no -i private_key ${USER}@${HOSTNAME} ' 79 | cd ~/courses-api 80 | aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/courses-api.tar.gz . 81 | aws s3 cp s3://${{ secrets.AWS_S3_BUCKET }}/courses-api/docker-compose.yml . 
82 | echo YEAR=${{ secrets.YEAR }} > .env 83 | echo DB_TYPE=${{ secrets.DB_TYPE }} >> .env 84 | echo TURSO_DATABASE_URL=${{ secrets.TURSO_DATABASE_URL }} >> .env 85 | echo TURSO_AUTH_TOKEN=${{ secrets.TURSO_AUTH_TOKEN }} >> .env 86 | docker load -i courses-api.tar.gz 87 | docker compose up -d 88 | docker restart courses-api 89 | ' 90 | -------------------------------------------------------------------------------- /src/fetch_proxies.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | 3 | import requests 4 | from requests.exceptions import RequestException, Timeout 5 | from rich.progress import ( 6 | BarColumn, 7 | Progress, 8 | TextColumn, 9 | TimeRemainingColumn, 10 | ) 11 | 12 | 13 | def fetch_proxies(url): 14 | """Fetch the list of proxies from the URL.""" 15 | response = requests.get(url) 16 | if response.status_code == 200: 17 | # Split the response content by newline to get each proxy 18 | return response.text.splitlines() 19 | else: 20 | print("Failed to retrieve proxies.") 21 | return [] 22 | 23 | 24 | def test_proxy( 25 | proxy, 26 | test_url="https://uosa-search.funnelback.squiz.cloud/s/search.html?collection=uosa~sp-aem-prod&form=json&num_ranks=1", 27 | timeout=5, 28 | retries=2, # Number of retries 29 | ): 30 | """Test if the given proxy is working by making a request.""" 31 | proxies = { 32 | "http": f"http://{proxy}", 33 | "https": f"http://{proxy}", 34 | } 35 | for attempt in range(retries + 1): # Retry logic 36 | try: 37 | response = requests.get(test_url, proxies=proxies, timeout=timeout) 38 | if response.status_code == 200: 39 | return proxy # Return the working proxy 40 | except (RequestException, Timeout): 41 | if attempt < retries: 42 | continue # Retry on failure 43 | else: 44 | return None # Skip the proxy if all retries fail 45 | 46 | 47 | def save_working_proxies(proxies, filename="src/working_proxies.txt"): 48 | """Save working proxies to a text file.""" 49 | with open(filename, "w") as file: 50 | for proxy in proxies: 51 | file.write(f"{proxy}\n") 52 | 53 | 54 | def main(): 55 | proxy_url = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/http.txt" 56 | proxies = fetch_proxies(proxy_url) 57 | 58 | working_proxies = [] 59 | 60 | # Rich Progress Bar 61 | with Progress( 62 | TextColumn("[progress.description]{task.description}"), 63 | BarColumn(), 64 | TimeRemainingColumn(), 65 | ) as progress: 66 | task = progress.add_task("Testing Proxies...", total=len(proxies)) 67 | 68 | # Use ThreadPoolExecutor for concurrency 69 | with ThreadPoolExecutor(max_workers=1000) as executor: 70 | future_to_proxy = { 71 | executor.submit(test_proxy, proxy): proxy for proxy in proxies 72 | } 73 | for future in as_completed(future_to_proxy): 74 | progress.update(task, advance=1) 75 | try: 76 | result = future.result() 77 | if result: 78 | working_proxies.append(result) 79 | except Exception: 80 | pass # Handle or log specific proxy testing errors if needed 81 | 82 | # Save working proxies to file 83 | if working_proxies: 84 | save_working_proxies(working_proxies) 85 | print( 86 | f"\n[+] Saved {len(working_proxies)} working proxies to 'working_proxies.txt'." 
87 | ) 88 | else: 89 | print("\n[-] No working proxies found.") 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | .python-version 167 | 168 | dev.sqlite3 169 | dev.sqlite3-journal 170 | working_proxies.txt 171 | -------------------------------------------------------------------------------- /src/data_fetcher.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import time 4 | 5 | import json_repair 6 | import requests 7 | from bs4 import BeautifulSoup 8 | 9 | from log import logger 10 | 11 | 12 | class DataFetcher: 13 | """Fetch data from a Funnelback search host or from published course content pages. 14 | 15 | By default, the DataFetcher uses `BASE_URL` (Funnelback search) and the endpoint 16 | is expected to be a query string that starts with `?`. 17 | 18 | Pass `use_class_url=True` to use `BASE_INFO_URL` instead and treat the endpoint 19 | as a path under the course content base URL. 20 | """ 21 | 22 | BASE_URL = "https://uosa-search.funnelback.squiz.cloud/s/search.html" 23 | BASE_INFO_URL = "https://adelaideuni.edu.au" 24 | PROXY_FILE = "src/working_proxies.txt" 25 | 26 | def __init__(self, endpoint: str, use_class_url: bool = False) -> None: 27 | self.endpoint = endpoint 28 | self.use_class_url = use_class_url 29 | if self.use_class_url: 30 | # Build a full URL for course page content. Ensure endpoint is a path. 31 | path = ( 32 | self.endpoint if self.endpoint.startswith("/") else f"/{self.endpoint}" 33 | ) 34 | self.url = self.BASE_INFO_URL.rstrip("/") + path 35 | else: 36 | self.url = self.BASE_URL + endpoint 37 | self.data = None 38 | self.last_response = None 39 | self.proxies = self.load_proxies() 40 | 41 | def load_proxies(self) -> list: 42 | """Load proxies from the file.""" 43 | try: 44 | with open(self.PROXY_FILE, "r") as file: 45 | proxies = file.read().splitlines() 46 | logger.debug(f"Loaded {len(proxies)} proxies from {self.PROXY_FILE}.") 47 | return proxies 48 | except FileNotFoundError: 49 | logger.error(f"Proxy file {self.PROXY_FILE} not found.") 50 | return [] 51 | 52 | def get_random_proxy(self) -> dict: 53 | """Get a random proxy from the loaded list.""" 54 | if not self.proxies: 55 | logger.warning("No proxies available. 
Proceeding without a proxy.") 56 | return None 57 | proxy = random.choice(self.proxies) 58 | return { 59 | "http": f"http://{proxy}", 60 | "https": f"http://{proxy}", 61 | } 62 | 63 | def get(self) -> dict: 64 | """Fetch data from the API, handling retries and rate-limiting.""" 65 | logger.debug(f"Fetching {self.endpoint}...") 66 | if self.data is not None: 67 | return self.data 68 | 69 | if not self.url: 70 | logger.error("Error: No URL provided.") 71 | return {} 72 | 73 | max_retries = 50 # Maximum number of retries 74 | retries = 0 75 | # Clear previous last_response to avoid stale values in callers 76 | self.last_response = None 77 | 78 | # Exponential backoff base, increase gently, capped to avoid huge sleeps. 79 | backoff_base = 1.5 80 | while retries < max_retries: 81 | proxy = self.get_random_proxy() 82 | try: 83 | logger.debug(f"Using proxy: {proxy}") 84 | response = requests.get(self.url, proxies=proxy, timeout=10) 85 | self.last_response = response 86 | 87 | if response.status_code == 429: 88 | # Handle rate limiting properly, use Retry-After if available 89 | logger.warning("HTTP 429 - Too Many Requests.") 90 | retry_after = response.headers.get("Retry-After") 91 | if retry_after: 92 | try: 93 | wait_seconds = int(retry_after) 94 | except ValueError: 95 | # Retry-After may be a HTTP-date; fall back to default 96 | wait_seconds = min(60, int(backoff_base**retries)) 97 | else: 98 | wait_seconds = min(60, int(backoff_base**retries)) 99 | 100 | logger.warning( 101 | f"Sleeping for {wait_seconds} seconds due to 429 response" 102 | ) 103 | time.sleep(wait_seconds) 104 | # Try another proxy for the next attempt 105 | proxy = self.get_random_proxy() 106 | retries += 1 107 | continue 108 | 109 | if response.status_code != 200: 110 | logger.error(f"HTTP {response.status_code} - {response.text}") 111 | # Small backoff for other HTTP errors 112 | wait_seconds = min(10, int(backoff_base**retries)) 113 | logger.debug(f"Waiting for {wait_seconds}s before retrying") 114 | time.sleep(wait_seconds) 115 | retries += 1 116 | continue 117 | 118 | # If using Funnelback (search), parse as JSON and return the response dict. 119 | if not self.use_class_url: 120 | resp = json_repair.loads(response.text) 121 | if not resp.get("response", {}).get("resultPacket"): 122 | logger.error( 123 | f"Funnelback API Error: {resp.get('error', 'Unknown error')}" 124 | ) 125 | retries += 1 126 | continue 127 | self.data = resp.get("response", {}) 128 | return self.data 129 | 130 | # If fetching a class/course content page, just return the HTML text as {'data': }. 
131 | if self.use_class_url: 132 | soup = BeautifulSoup(response.content, "html.parser") 133 | # Get main content 134 | main_tag = soup.find("main") 135 | if main_tag: 136 | text = main_tag.get_text() 137 | else: 138 | text = soup.get_text() 139 | # Grab H1 text if present as a separate field to help parsers 140 | h1_tag = soup.find("h1") 141 | h1_text = h1_tag.get_text().strip() if h1_tag else "" 142 | self.data = {"h1": h1_text, "data": re.sub(r"\n+", "\n", text)} 143 | return self.data 144 | 145 | except requests.exceptions.ProxyError: 146 | logger.error(f"Proxy error with proxy: {proxy}") 147 | retries += 1 148 | # Reduce retry flurry by sleeping a moment 149 | time.sleep(min(3, backoff_base**retries)) 150 | except requests.exceptions.RequestException as e: 151 | logger.error(f"Request failed: {e}") 152 | retries += 1 153 | time.sleep(min(3, backoff_base**retries)) 154 | except Exception as e: 155 | logger.error(f"Unexpected error: {e}") 156 | retries += 1 157 | time.sleep(min(3, backoff_base**retries)) 158 | 159 | logger.error( 160 | f"Failed to fetch data from {self.url} after {max_retries} retries." 161 | ) 162 | return {} 163 | -------------------------------------------------------------------------------- /src/scraper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from hashlib import shake_256 5 | from queue import Queue 6 | from threading import Lock, Thread 7 | 8 | from dotenv import dotenv_values 9 | from rich.progress import Progress 10 | from sqlalchemy import create_engine 11 | from sqlalchemy.orm import sessionmaker 12 | 13 | import data_parser 14 | import fetch_proxies 15 | from log import logger 16 | from models import Base, Course, CourseClass, Meetings, Subject 17 | 18 | # Session and write queue for DB writer thread 19 | Session = sessionmaker() 20 | write_queue = Queue() 21 | 22 | 23 | def get_short_hash(content: str, even_length=12) -> str: 24 | """Generates a short hash from the given content using the shake_256 algorithm.""" 25 | return shake_256(content.encode("utf8")).hexdigest(even_length // 2) 26 | 27 | 28 | def db_writer(engine): 29 | """Dedicated DB writer thread to serialize all DB operations and prevent locking.""" 30 | while True: 31 | obj = write_queue.get() 32 | if obj is None: 33 | break # Stop signal 34 | 35 | session = Session(bind=engine) 36 | 37 | try: 38 | session.merge(obj) 39 | session.commit() 40 | except Exception as e: 41 | session.rollback() 42 | print(f"[DB ERROR] {e} on {obj}") 43 | finally: 44 | session.close() 45 | 46 | 47 | def join_str_if_iterable(value): 48 | """Return a comma-separated string if value is a list/tuple, otherwise return the value as str or empty string for None.""" 49 | if isinstance(value, (list, tuple)): 50 | return ",".join([str(x) for x in value]) 51 | if value is None: 52 | return "" 53 | return str(value) 54 | 55 | 56 | def process_course(course, year, subject, engine, progress, subject_task, lock): 57 | """Process a single course and insert data into the database.""" 58 | try: 59 | logger.debug(f"Processing course {course['code']}...") 60 | course_code = course.get("code") 61 | if not course_code: 62 | print(f"Skipping course with missing code: {course}") 63 | progress.update(subject_task, advance=1) 64 | return 65 | course_details = data_parser.get_course_details(course_code) 66 | 67 | name = subject["subject"] 68 | title = course_details.get("title", "") 69 | terms = 
course.get("terms")
        campus = course_details.get("campus", "")

        # Course Custom ID
        course_cid = get_short_hash(f"{name}{course_code}{title}{year}{terms}{campus}")

        # Encode course code to match URL format
        code_str = (
            course_code[0] if isinstance(course_code, (list, tuple)) else course_code
        )
        encoded_course_code = re.sub(
            r"([a-zA-Z]+)([0-9]+)", r"\1-\2", str(code_str)
        ).lower()

        try:
            db_course = Course(
                id=course_cid,
                course_id=course_details.get("course_id") or 0,
                year=year,
                terms=join_str_if_iterable(terms),
                subject=name,
                course_code=course_code[0]
                if isinstance(course_code, (list, tuple))
                else course_code,
                title=title,
                campus=join_str_if_iterable(campus),
                level_of_study=course_details.get("level_of_study", "N/A"),
                units=int(course_details.get("units") or 6),
                course_coordinator=course_details.get("course_coordinator", "N/A"),
                course_level=course_details.get("course_level", "N/A"),
                course_overview=course_details.get("course_overview", "N/A"),
                prerequisites=course_details.get("prerequisites", "N/A"),
                corequisites=course_details.get("corequisites", "N/A"),
                antirequisites=course_details.get("antirequisites", "N/A"),
                url="https://adelaideuni.edu.au/study/courses/" + encoded_course_code,
            )
            write_queue.put(db_course)
        except Exception as e:
            print(f"Error inserting course {course_code}: {e}")
            progress.update(subject_task, advance=1)
            return

        if terms:
            class_list = data_parser.get_course_class_list(course_code)
            class_items = (
                class_list.get("classes", []) if isinstance(class_list, dict) else []
            )

            for individual_class in class_items:
                class_type = individual_class.get("component")
                class_nbr = individual_class.get("class_number")
                section = individual_class.get("section")
                class_cid = get_short_hash(
                    f"{course_cid}{class_type}{class_nbr}{section}"
                )
                try:
                    db_course_class = CourseClass(
                        id=class_cid,
                        class_nbr=class_nbr,
                        section=section,
                        size=int(individual_class.get("size", 0)),
                        available=int(individual_class.get("available", 0)),
                        component=class_type,
                        course_id=course_cid,
                    )
                    write_queue.put(db_course_class)
                except Exception as e:
                    print(f"Error inserting class for course {course_code}: {e}")
                    print(individual_class)

                meetings = individual_class.get("meetings", [])
                for meeting in meetings:
                    try:
                        meeting_cid = get_short_hash(
                            f"{class_cid}{meeting.get('dates')}{meeting.get('days')}{meeting.get('time')}{meeting.get('campus')}{meeting.get('location')}"
                        )
                        # Extract start and end time from time string
                        time_str = meeting.get("time", "")
                        start_time = (
                            time_str.split("-")[0].strip() if "-" in time_str else "N/A"
                        )
                        end_time = (
                            time_str.split("-")[1].strip() if "-" in time_str else "N/A"
                        )
                        db_meeting = Meetings(
                            id=meeting_cid,
                            dates=meeting.get("dates", "N/A"),
                            days=meeting.get("days", "N/A"),
                            start_time=start_time,
                            end_time=end_time,
                            campus=meeting.get("campus", "N/A"),
                            location=meeting.get("location", "N/A"),
                            course_class_id=class_cid,
                        )
                        write_queue.put(db_meeting)
                    except Exception as e:
                        print(
                            f"Error inserting meeting for class {class_nbr} of course {course_code}: {e}"
                        )

        progress.update(subject_task, advance=1)

    except Exception as e:
        print(f"Error processing course {course['code']}: {e}")
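
# Editor's note (illustrative, not part of the original file): process_course()
# never writes to the database directly; every ORM object is put on write_queue
# and persisted by the single db_writer() thread defined above. SQLite permits
# only one writer at a time, so funnelling all session.merge() calls through one
# thread avoids "database is locked" errors while the thread pools stay fully
# concurrent on the network-bound scraping side.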

def process_subject(subject, year, engine, progress, all_task, lock):
    """Process a single subject and insert data into the database."""
    try:
        name = subject["subject"]
        subject_task = progress.add_task(f"[cyan]{name}", total=None)

        # Subject Custom ID
        subject_cid = get_short_hash(f"{name}")

        # Queue the subject insert for the writer thread
        db_subject = Subject(id=subject_cid, name=name)
        write_queue.put(db_subject)

        # Encode & in subject name
        encoded_name = name.replace("&", "%26")
        courses = data_parser.get_course_codes(encoded_name, year)
        course_list = courses.get("courses", []) if isinstance(courses, dict) else []
        progress.update(subject_task, total=len(course_list))

        # Process each course concurrently
        with ThreadPoolExecutor(max_workers=50) as executor:
            futures = []
            for course in course_list:
                future = executor.submit(
                    process_course,
                    course,
                    year,
                    subject,
                    engine,
                    progress,
                    subject_task,
                    lock,
                )
                futures.append(future)

            # Wait for all threads to complete
            for future in as_completed(futures):
                future.result()

        progress.update(subject_task, advance=1)
        progress.update(all_task, advance=1)

    except Exception as e:
        print(f"Error processing subject {subject['subject']}: {e}")


def main():
    """Scrape data from the API and store it in a local database"""

    # Run proxy fetching and testing
    fetch_proxies.main()

    # If db already exists, delete it
    if os.path.exists("src/dev.sqlite3"):
        os.remove("src/dev.sqlite3")

    engine = create_engine(
        "sqlite:///src/dev.sqlite3",
        pool_size=1000,  # Increase the pool size to allow for more connections
        max_overflow=1000,  # Allow overflow connections
        pool_timeout=30,  # Set the pool timeout to 30 seconds
    )
    Base.metadata.create_all(engine)
    Session.configure(bind=engine)

    year_str = dotenv_values().get("YEAR")
    if year_str is None:
        raise ValueError("YEAR environment variable is not set")
    year = int(year_str)

    # Create lock for thread-safe operations
    lock = Lock()

    # Start DB writer thread
    writer_thread = Thread(target=db_writer, args=(engine,))
    writer_thread.start()

    with Progress() as progress:
        subjects = data_parser.get_subjects(year)

        all_task = progress.add_task(
            "[cyan bold]All Courses", total=len(subjects["subjects"])
        )

        # Create a thread pool with multiple threads
        with ThreadPoolExecutor(max_workers=50) as executor:
            futures = []
            for subject in subjects["subjects"]:
                future = executor.submit(
                    process_subject, subject, year, engine, progress, all_task, lock
                )
                futures.append(future)

            # Wait for all threads to complete
            for future in as_completed(futures):
                future.result()

    # Signal DB writer to stop and wait
    write_queue.put(None)
    writer_thread.join()


if __name__ == "__main__":
    main()
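
# Editor's note (illustrative): the ID scheme is deterministic. get_short_hash()
# derives every primary key from the row's own content, so re-running the scraper
# regenerates identical IDs for unchanged rows and session.merge() in db_writer()
# updates existing rows instead of inserting duplicates. For example:
#
#     get_short_hash("some content") == get_short_hash("some content")  # True
#     len(get_short_hash("some content")) == 12  # 12 hex characters by default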
-------------------------------------------------------------------------------- /src/server.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from datetime import datetime 4 | from typing import Dict, List, Union 5 | 6 | from dotenv import dotenv_values 7 | from fastapi import Depends, FastAPI, HTTPException 8 | from fastapi.middleware.cors import CORSMiddleware 9 | from pydantic import ValidationError 10 | from sqlalchemy import create_engine 11 | from sqlalchemy.orm import Session, sessionmaker 12 | 13 | from .models import Base, Course, CourseClass, Subject 14 | from .schemas import CourseSchema 15 | 16 | # Check if the application is running in development mode 17 | is_dev_mode = "dev" in sys.argv 18 | 19 | # Configure FastAPI based on the mode 20 | app = FastAPI( 21 | docs_url="/docs" if is_dev_mode else None, 22 | redoc_url="/redoc" if is_dev_mode else None, 23 | ) 24 | 25 | # Determine the database type 26 | DB_TYPE = dotenv_values().get("DB_TYPE") 27 | 28 | 29 | if DB_TYPE == "libsql": 30 | # Use LibSQL 31 | TURSO_DATABASE_URL = dotenv_values().get("TURSO_DATABASE_URL") 32 | TURSO_AUTH_TOKEN = dotenv_values().get("TURSO_AUTH_TOKEN") 33 | DATABASE_URL = ( 34 | f"sqlite+{TURSO_DATABASE_URL}/?authToken={TURSO_AUTH_TOKEN}&secure=true" 35 | ) 36 | engine = create_engine( 37 | DATABASE_URL, connect_args={"check_same_thread": False}, echo=True 38 | ) 39 | elif DB_TYPE == "dev": 40 | # Use dev db 41 | DATABASE_URL = "sqlite:///src/dev.sqlite3" 42 | engine = create_engine(DATABASE_URL) 43 | else: 44 | # Use completed courses db 45 | DATABASE_URL = "sqlite:///src/local.sqlite3" 46 | engine = create_engine(DATABASE_URL) 47 | 48 | print("DB_TYPE:", DB_TYPE) 49 | print("DATABASE_URL:", DATABASE_URL) 50 | 51 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 52 | Base.metadata.create_all(bind=engine) 53 | 54 | # Configure CORS for local development and production 55 | origins = [ 56 | "http://localhost:5173", 57 | "http://localhost:8000", 58 | "https://mytimetable.csclub.org.au", 59 | ] 60 | 61 | app.add_middleware( 62 | CORSMiddleware, 63 | allow_origins=origins, 64 | allow_credentials=True, 65 | allow_methods=["*"], 66 | allow_headers=["*"], 67 | ) 68 | 69 | 70 | def get_db(): 71 | """Get a database session.""" 72 | db = SessionLocal() 73 | try: 74 | yield db 75 | finally: 76 | db.close() 77 | 78 | 79 | def current_year() -> int: 80 | """Gets the current year.""" 81 | year_str = dotenv_values().get("YEAR") 82 | if year_str is None: 83 | return datetime.now().year 84 | return int(year_str) 85 | 86 | 87 | def current_sem() -> str: 88 | """Gets the current semester.""" 89 | return "Semester 1" if datetime.now().month <= 6 else "Semester 2" 90 | 91 | 92 | def get_term_number(db, year: int, term: str) -> str: 93 | """Gets the term number from the local database.""" 94 | 95 | # Convert aliases 96 | term = convert_term_alias(term) 97 | courses = db.query(Course).filter(Course.year == year).all() 98 | 99 | if not courses: 100 | raise HTTPException( 101 | status_code=404, detail=f"No courses found for year: {year}" 102 | ) 103 | 104 | for course in courses: 105 | if term in (course.terms or ""): 106 | return term 107 | 108 | raise HTTPException( 109 | status_code=404, detail=f"Invalid term: {term} for year: {year}" 110 | ) 111 | 112 | 113 | def meeting_date_convert(raw_date: str) -> dict[str, str]: 114 | """Converts the date format given in the meetings to "MM-DD" 115 | Args: 116 | raw_date (str): The given 
meeting date in the format of "DD {3-char month} - DD {3-char month}"
    Returns:
        formatted_date (dict[str, str]): The formatted meeting date in the format of "MM-DD"
    """
    months = [
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    ]
    start, end = raw_date.split(" - ")

    start_d, start_m = start.split()
    start_m = str(months.index(start_m) + 1).zfill(2)

    end_d, end_m = end.split()
    end_m = str(months.index(end_m) + 1).zfill(2)

    return {
        "start": f"{start_m}-{start_d.zfill(2)}",
        "end": f"{end_m}-{end_d.zfill(2)}",
    }


def meeting_time_convert(raw_time: str) -> str:
    """Converts the time given in meetings to "HH:mm"
    Args:
        raw_time (str): The given meeting time in the format of "H{am/pm}" or "H:MM{am/pm}"
    Returns:
        formatted_time (str): The formatted meeting time in the format of "HH:mm"
    """
    if ":" in raw_time:
        time_part, period = raw_time[:-2], raw_time[-2:].lower()
        hour, minute = map(int, time_part.split(":"))
    else:
        period = raw_time[-2:].lower()
        hour = int(raw_time[:-2])
        minute = 0

    if period == "pm" and hour != 12:
        hour += 12
    elif period == "am" and hour == 12:
        hour = 0

    return f"{str(hour).zfill(2)}:{str(minute).zfill(2)}"


def parse_requisites(raw_requisites: str) -> Union[list[str], None]:
    """Takes in a string of -requisites and returns a list of the parsed-out subjects
    Args:
        raw_requisites (str): The raw string containing a list of -requisites, usually
        in the format of "COMP SCI 1103, COMP SCI 2202, COMP SCI 2202B" as an example
    Returns:
        parsed_requisites (Union[list[str], None]): A list of the parsed -requisites,
        or None if raw_requisites is None
    """

    if not raw_requisites:
        return None

    # Regex pattern to match subjects and course numbers
    pattern = r"\b([A-Z]+(?:\s+[A-Z]+)*)\s+(\d{4}\w*)\b"
    matched_subjects = [
        " ".join(match) for match in re.findall(pattern, raw_requisites)
    ]

    return matched_subjects if matched_subjects else None


def convert_term_alias(term_alias: str) -> str:
    """Takes in a term alias and returns the CoursePlanner API name for said term
    Args:
        term_alias (str): The unconverted term, this doesn't have to be an alias
        in which case no conversion will be done
    Returns:
        str: The converted or original term depending on if a conversion was made
    """

    terms_without_digits = ("summer", "winter")
    aliases = {
        "sem": "Semester",
        "summer": "summer",
        "winter": "Winter",
        "online": "Online Term",
        "term": "Term",
        "uao": "UAO Teaching Period",
    }

    # Convert the alias, append its digit to the end if the term needs a digit at the end
    converted_alias = aliases.get(
        term_alias[:-1] if term_alias[-1].isdigit() else term_alias, term_alias
    )
    if (
        term_alias not in terms_without_digits
        and term_alias[-1].isdigit()
        and converted_alias != term_alias
    ):
        converted_alias += " " + term_alias[-1]

    return converted_alias
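
# Editor's note (illustrative): example conversions for the helpers above,
# assuming inputs in the formats the scraper stores:
#
#     meeting_date_convert("24 Feb - 30 May")  -> {"start": "02-24", "end": "05-30"}
#     meeting_time_convert("2pm")              -> "14:00"
#     meeting_time_convert("10:30am")          -> "10:30"
#     parse_requisites("COMP SCI 1103, COMP SCI 2202B")
#                                              -> ["COMP SCI 1103", "COMP SCI 2202B"]
#     convert_term_alias("sem1")               -> "Semester 1"
#     convert_term_alias("term2")              -> "Term 2"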

@app.get("/subjects", response_model=List[str])
def get_subjects(
    year: int = current_year(), term: str = current_sem(), db: Session = Depends(get_db)
):
    """Get all possible subjects for a given year and term, sorted alphabetically.

    Args:
        year (int, optional): The year to search for courses. Defaults to current year.
        term (str, optional): The term to search for courses. Defaults to current semester.

    Returns:
        list[str]: A sorted list of subject names.
    """
    term_number = get_term_number(db, year, term)

    results = (
        db.query(Course)
        .filter(Course.year == year, Course.terms.contains(term_number))
        .all()
    )

    if not results:
        raise HTTPException(
            status_code=404, detail="No courses found for the specified year and term"
        )

    # Collect unique subject names from the course results
    unique_names = set()
    for entry in results:
        name = entry.subject
        if name:  # Skip empty names
            unique_names.add(name)

    # Sort the subject names alphabetically
    subjects = sorted(unique_names)
    return subjects


@app.get("/courses", response_model=Union[Dict, List])
def get_subject_courses(
    subject: str,
    year: int = current_year(),
    term: str = current_sem(),
    db: 
291 | """ 292 | term_number = get_term_number(db, year, term) 293 | results = ( 294 | db.query(Course) 295 | .filter( 296 | Course.subject == subject, 297 | Course.year == year, 298 | Course.terms.contains(term_number), 299 | ) 300 | .order_by(Course.course_code) 301 | .all() 302 | ) 303 | 304 | if not results: 305 | raise HTTPException( 306 | status_code=404, detail="No courses found for the specified year and term" 307 | ) 308 | 309 | transformed_courses = {"courses": []} 310 | 311 | # Extract necessary information from the results 312 | for entry in results: 313 | transformed_courses["courses"].append( 314 | { 315 | "id": entry.id, 316 | "name": { 317 | "subject": entry.subject, 318 | "code": entry.course_code, 319 | "title": entry.title, 320 | }, 321 | } 322 | ) 323 | 324 | # Sort courses by course code alphabetically 325 | transformed_courses["courses"].sort( 326 | key=lambda x: x["name"]["code"].lower() if x["name"]["code"] else "" 327 | ) 328 | return transformed_courses 329 | 330 | 331 | def split_class_type_category(original_type: str): 332 | CATEGORIES = {"enrolment", "related"} 333 | full_category, class_type = original_type.split(": ") 334 | class_category = "unknown" 335 | for category in CATEGORIES: 336 | if category in full_category.lower(): 337 | class_category = category 338 | break 339 | return {"category": class_category, "type": class_type} 340 | 341 | 342 | @app.get("/courses/{course_cid}", response_model=Union[Dict, List]) 343 | def get_course(course_cid: str, db: Session = Depends(get_db)): 344 | """Course details route, takes in an id returns the courses' info and classes. 345 | 346 | Args: 347 | course_cid (string, required): The id to search for. 348 | 349 | Returns: 350 | dict: A dictionary containing the course information and classes. 
"""
    course = db.query(Course).filter(Course.id == course_cid).first()

    if not course:
        raise HTTPException(status_code=404, detail="Course not found")

    course_id = course.course_id

    course_details = db.query(Course).filter(Course.course_id == course_id).first()

    # Extract necessary information from details
    if course_details:
        name = {
            "subject": course.subject,
            "code": course.course_code,
            "title": course.title,
        }
        requirements = {
            "prerequisites": parse_requisites(course_details.prerequisites),
            "corequisites": parse_requisites(course_details.corequisites),
            "antirequisites": parse_requisites(course_details.antirequisites),
        }
    else:
        name = {"subject": "", "code": "", "title": ""}
        requirements = {}

    # Construct the response
    response = {
        "id": course_cid,
        "course_id": course.course_id,
        "name": name,
        "year": course.year,
        "term": course.terms,
        "campus": course.campus,
        "units": course.units,
        "requirements": requirements,
        "class_list": [],
    }

    # Fetch classes info and process to match the required structure
    classes = db.query(CourseClass).filter(CourseClass.course_id == course_cid).all()
    if classes:
        class_groups = {}
        for class_group in classes:
            class_type = split_class_type_category(class_group.component)["type"]
            if class_type not in class_groups:
                class_groups[class_type] = {
                    **split_class_type_category(class_group.component),
                    "id": class_group.id,
                    "classes": [],
                }
            class_list_entry = class_groups[class_type]
            class_entry = {
                "number": str(class_group.class_nbr),
                "section": class_group.section,  # Class section identifier
                "size": str(class_group.size),
                "available_seats": str(class_group.available),
                "meetings": [],
            }
            for meeting in class_group.meetings:
                # Split the meeting days by commas (a day may be listed more than once)
                meeting_days = [
                    day.strip() for day in meeting.days.split(",") if day.strip()
                ]

                # Skip weekend meetings
                if any(day in meeting_days for day in ["Saturday", "Sunday"]):
                    continue

                # Create a meeting entry for each listed day
                for day in meeting_days:
                    meeting_entry = {
                        "day": day,
                        "location": meeting.location,
                        "campus": meeting.campus,
                        "date": meeting_date_convert(meeting.dates),
                        "time": {
                            "start": meeting_time_convert(meeting.start_time),
                            "end": meeting_time_convert(meeting.end_time),
                        },
                    }
                    class_entry["meetings"].append(meeting_entry)

            class_list_entry["classes"].append(class_entry)

        response["class_list"] = list(class_groups.values())

    try:
        CourseSchema.model_validate(response)
    except ValidationError as e:
        raise HTTPException(status_code=500, detail=e.errors())

    return response
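
# Editor's note (illustrative): a successful GET /courses/{course_cid} response
# is validated against CourseSchema and looks roughly like (values are made up):
#
#     {
#         "id": "a1b2c3d4e5f6",
#         "course_id": 110249,
#         "name": {"subject": "Computer Science", "code": "COMP SCI 1102", "title": "..."},
#         "year": "2025", "term": "Semester 1", "campus": "...", "units": 3,
#         "requirements": {"prerequisites": None, "corequisites": None, "antirequisites": None},
#         "class_list": [
#             {"id": "...", "category": "enrolment", "type": "Lecture", "component": "...",
#              "classes": [{"number": "...", "section": "...", "available_seats": "...",
#                           "meetings": [{"day": "Monday", "location": "...",
#                                         "date": {"start": "02-24", "end": "05-30"},
#                                         "time": {"start": "14:00", "end": "16:00"}}]}]}
#         ]
#     }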
478 | 
--------------------------------------------------------------------------------
/src/data_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from bs4 import BeautifulSoup
4 | 
5 | import data_fetcher
6 | from log import logger
7 | 
8 | 
9 | def get_subjects(year: int) -> dict[str, list[dict[str, str]]]:
10 |     """Return a list of subjects for a given year."""
11 |     subjects = data_fetcher.DataFetcher(
12 |         f"?f.Tabs|type=Degrees+%26+Courses&form=json&num_ranks=10&profile=site-search&query=&f.Year|year={year}&collection=uosa~sp-aem-prod&f.Study+type|studyType=Course&start_rank=1"
13 |     )
14 | 
15 |     try:
16 |         data = subjects.get()
17 |         if (
18 |             subjects.last_response is None
19 |             or subjects.last_response.status_code != 200
20 |             or data is None
21 |         ):
22 |             status = (
23 |                 subjects.last_response.status_code
24 |                 if subjects.last_response
25 |                 else "NO_RESPONSE"
26 |             )
27 |             logger.error(f"Error: {status} - {data}")
28 |             return {"subjects": []}
29 | 
30 |         subject_list = []
31 | 
32 |         # The subject facet is expected to be the sixth entry; guard against
33 |         # responses that do not carry it
34 |         facets = data.get("facets") or []
35 |         if len(facets) < 6:
36 |             logger.error("Unexpected response shape: subject facet missing")
37 |             return {"subjects": []}
38 |         for subject in facets[5].get("allValues", []):
39 |             subject_list.append({"subject": subject.get("data")})
40 |         logger.debug(f"Subjects: {subject_list}")
41 |         return {"subjects": subject_list}
42 | 
43 |     except Exception as e:
44 |         logger.error(f"An error occurred while fetching subjects: {e}")
45 |         return {"subjects": []}
46 | 
47 | 
48 | def get_course_codes(subject: str, year: int):
49 |     """Return a list of course codes for a given subject code and year."""
50 |     courses = data_fetcher.DataFetcher(
51 |         f"?f.Tabs%7Ctype=Degrees+%26+Courses&form=json&f.Year%7Cyear={year}&num_ranks=1000&profile=site-search&query=&f.Area+of+study%7CstudyArea={subject}&collection=uosa%7Esp-aem-prod&f.Study+type%7CstudyType=Course"
52 |     )
53 | 
54 |     try:
55 |         data = courses.get()
56 |         logger.debug(f"Course data: {data}")
57 |         if (
58 |             courses.last_response is None
59 |             or courses.last_response.status_code != 200
60 |             or data is None
61 |         ):
62 |             status = (
63 |                 courses.last_response.status_code
64 |                 if courses.last_response
65 |                 else "NO_RESPONSE"
66 |             )
67 |             logger.error(f"Error: {status} - {data}")
68 |             return {"courses": []}
69 | 
70 |         # resultPacket must default to a dict, not a list, or .get() would fail
71 |         results = data.get("resultPacket", {}).get("results", [])
72 |         logger.debug(f"Number of courses found: {len(results)}")
73 | 
74 |         if not results:
75 |             logger.debug("No results found in course codes.")
76 |             return {"courses": []}
77 | 
78 |         course_codes = [
79 |             {
80 |                 "code": course.get("listMetadata", {}).get("courseCode"),
81 |                 "terms": course.get("listMetadata", {}).get("term"),
82 |             }
83 |             for course in results
84 |         ]
85 |         logger.debug("Course codes extracted successfully.")
86 |         return {"courses": course_codes}
87 | 
88 |     except Exception as e:
89 |         logger.error(f"An error occurred while fetching course codes: {e}")
90 |         return {"courses": []}
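91 | 
92 | 
93 | # Assumed (abridged) shape of the search responses consumed above; the live
94 | # payloads carry many more fields:
95 | # {
96 | #     "facets": [..., {"allValues": [{"data": "COMP SCI"}, ...]}, ...],
97 | #     "resultPacket": {
98 | #         "results": [
99 | #             {"listMetadata": {"courseCode": [...], "term": [...]}}
100 | #         ]
101 | #     },
102 | # }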
Data: {data}" 107 | ) 108 | # Make sure to return a dictionary with expected keys so caller won't crash 109 | return { 110 | "code": code_str, 111 | "title": data.get("h1", "") if isinstance(data, dict) else "", 112 | "course_id": None, 113 | } 114 | if course_details.last_response.status_code != 200: 115 | print( 116 | f"Error: {course_details.last_response.status_code} - " 117 | f"{course_details.last_response.text}" 118 | ) 119 | return {} 120 | # Return plain text string without extra newlines 121 | text = data.get("data", "") 122 | 123 | # Strip HTML tags 124 | soup = BeautifulSoup(text, "html.parser") 125 | body_text = soup.get_text() if soup else text 126 | 127 | # Parse the plain-body text for label/value pairs 128 | parsed = parse_course_text(body_text) 129 | 130 | # Return a dict with the parsed fields and the canonical code string 131 | course_details = { 132 | "code": code_str, 133 | "title": data.get("h1", ""), 134 | "course_id": parsed.get("course_id"), 135 | "campus": parsed.get("campus"), 136 | "level_of_study": parsed.get("level_of_study"), 137 | "units": parsed.get("units"), 138 | "course_coordinator": parsed.get("course_coordinator"), 139 | "course_level": parsed.get("course_level"), 140 | "course_overview": parsed.get("course_overview"), 141 | "prerequisites": parsed.get("prerequisites"), 142 | "corequisites": parsed.get("corequisites"), 143 | "antirequisites": parsed.get("antirequisites"), 144 | } 145 | 146 | logger.debug("Course details extracted successfully.") 147 | return course_details 148 | 149 | except Exception as e: 150 | print(f"An error occurred while fetching course details: {e}") 151 | return {} 152 | 153 | print( 154 | f"Failed to retrieve course details for course {course_code} after {max_retries} attempts." 155 | ) 156 | return {} 157 | 158 | 159 | def parse_course_text(text: str) -> dict: 160 | """Parse a course details plain text and return a dict of fields.""" 161 | if not isinstance(text, str): 162 | return {} 163 | 164 | # Ensure text is plain and normalised 165 | lines = [line.strip() for line in text.splitlines() if line.strip()] 166 | labels = { 167 | "course id": "course_id", 168 | "campus": "campus", 169 | "level of study": "level_of_study", 170 | "unit value": "units", 171 | "course coordinator": "course_coordinator", 172 | "course level": "course_level", 173 | "course overview": "course_overview", 174 | "prerequisite(s)": "prerequisites", 175 | "corequisite(s)": "corequisites", 176 | "antirequisite(s)": "antirequisites", 177 | } 178 | 179 | parsed = {v: None for v in labels.values()} 180 | i = 0 181 | # Update the parse_course_text function to handle the case where campus is "Location" 182 | while i < len(lines): 183 | key = lines[i].lower() 184 | if key in labels and i + 1 < len(lines): 185 | parsed_key = labels[key] 186 | value = lines[i + 1].strip() 187 | # Skip if the value for campus is "Location" 188 | if parsed_key == "campus" and value.lower() == "location": 189 | i += 2 190 | continue 191 | parsed[parsed_key] = value 192 | i += 2 193 | continue 194 | i += 1 195 | return parsed 196 | 197 | 198 | def get_course_class_list(course_code: int): 199 | """Return the class list of a course for a given course code.""" 200 | 201 | # Encode course code to match URL format 202 | code_str = course_code[0] if isinstance(course_code, (list, tuple)) else course_code 203 | encoded_course_code = re.sub( 204 | r"([a-zA-Z]+)([0-9]+)", r"\1-\2", str(code_str) 205 | ).lower() 206 | 207 | course_details = data_fetcher.DataFetcher( 208 | 
f"/study/courses/{encoded_course_code}/", use_class_url=True 209 | ) 210 | 211 | try: 212 | data = course_details.get() 213 | if ( 214 | course_details.last_response is None 215 | or course_details.last_response.status_code != 200 216 | ): 217 | status = ( 218 | course_details.last_response.status_code 219 | if course_details.last_response 220 | else "NO_RESPONSE" 221 | ) 222 | text = ( 223 | course_details.last_response.text 224 | if course_details.last_response 225 | else "" 226 | ) 227 | print(f"Error: {status} - {text}") 228 | # Return a minimal dict so callers don't KeyError when accessing title/course_id 229 | return { 230 | "code": code_str, 231 | "title": data.get("h1", "") if isinstance(data, dict) else "", 232 | "course_id": None, 233 | } 234 | # Return plain text string without extra newlines 235 | text = data.get("data", "") 236 | 237 | # Parse the plain-body text for class list details 238 | parsed_classes = parse_course_class_list(text) 239 | return {"classes": parsed_classes} 240 | 241 | except Exception as e: 242 | print(f"An error occurred while fetching course class list: {e}") 243 | return {} 244 | 245 | 246 | def parse_course_class_list(text: str) -> list[dict]: 247 | """Parse course class list details from the given text.""" 248 | if not isinstance(text, str): 249 | return [] 250 | 251 | lines = [line.strip() for line in text.splitlines() if line.strip()] 252 | parsed_classes = [] 253 | current_class = None 254 | # Current component context for classes within a "Class details" block. 255 | current_component = "unknown" 256 | i = 0 257 | 258 | def _is_date_line(s: str) -> bool: 259 | # matches things like '3 Aug - 21 Sep' or '12 Oct - 9 Nov' 260 | return bool(re.search(r"^\d{1,2} [A-Za-z]+\s*-\s*\d{1,2} [A-Za-z]+$", s)) 261 | 262 | while i < len(lines): 263 | line = lines[i] 264 | 265 | # Look for "Availability" and skip until "Class details" – content prior to class details is not needed 266 | if "Availability" in line: 267 | while i < len(lines) and not lines[i].startswith("Class details"): 268 | i += 1 269 | continue 270 | 271 | # Start parsing class details: reset the context for a new set of classes and components 272 | if line.startswith("Class details"): 273 | # Commit any prior orphaned class (if it has a class number) 274 | if current_class and current_class.get("class_number"): 275 | parsed_classes.append(current_class) 276 | current_class = None 277 | # Reset the current component and attempt a lookahead for the first component 278 | current_component = "unknown" 279 | lookahead = 1 280 | max_look = 24 281 | while i + lookahead < len(lines) and lookahead <= max_look: 282 | candidate = lines[i + lookahead].strip() 283 | if candidate.lower().startswith( 284 | "enrolment class" 285 | ) or candidate.lower().startswith("related class"): 286 | parts = candidate.split(":", 1) 287 | if len(parts) > 1 and parts[1].strip(): 288 | current_component = parts[0].strip() + ": " + parts[1].strip() 289 | else: 290 | # If value is next line and not a class number, use it 291 | next_val = ( 292 | lines[i + lookahead + 1].strip() 293 | if i + lookahead + 1 < len(lines) 294 | else "" 295 | ) 296 | if next_val and not next_val.lower().startswith("class number"): 297 | current_component = parts[0].strip() + ": " + next_val 298 | break 299 | # If the block restarts, stop searching 300 | if candidate.startswith("Class details"): 301 | break 302 | lookahead += 1 303 | i += 1 304 | continue 305 | 306 | # Parse other class attributes 307 | # Detect inline component labels anywhere in 
274 | 
275 | 
276 | def parse_course_class_list(text: str) -> list[dict]:
277 |     """Parse course class list details from the given text."""
278 |     if not isinstance(text, str):
279 |         return []
280 | 
281 |     lines = [line.strip() for line in text.splitlines() if line.strip()]
282 |     parsed_classes = []
283 |     current_class = None
284 |     # Current component context for classes within a "Class details" block
285 |     current_component = "unknown"
286 |     i = 0
287 | 
288 |     def _is_date_line(s: str) -> bool:
289 |         # Matches things like '3 Aug - 21 Sep' or '12 Oct - 9 Nov'
290 |         return bool(re.search(r"^\d{1,2} [A-Za-z]+\s*-\s*\d{1,2} [A-Za-z]+$", s))
291 | 
292 |     while i < len(lines):
293 |         line = lines[i]
294 | 
295 |         # Skip from "Availability" to "Class details"; the content before the
296 |         # class details is not needed
297 |         if "Availability" in line:
298 |             while i < len(lines) and not lines[i].startswith("Class details"):
299 |                 i += 1
300 |             continue
301 | 
302 |         # Start parsing class details: reset the context for a new set of
303 |         # classes and components
304 |         if line.startswith("Class details"):
305 |             # Commit any prior orphaned class (if it has a class number)
306 |             if current_class and current_class.get("class_number"):
307 |                 parsed_classes.append(current_class)
308 |             current_class = None
309 |             # Reset the component and look ahead for the block's first one
310 |             current_component = "unknown"
311 |             lookahead = 1
312 |             max_look = 24
313 |             while i + lookahead < len(lines) and lookahead <= max_look:
314 |                 candidate = lines[i + lookahead].strip()
315 |                 if candidate.lower().startswith(
316 |                     "enrolment class"
317 |                 ) or candidate.lower().startswith("related class"):
318 |                     parts = candidate.split(":", 1)
319 |                     if len(parts) > 1 and parts[1].strip():
320 |                         current_component = parts[0].strip() + ": " + parts[1].strip()
321 |                     else:
322 |                         # If the value sits on the next line and is not a
323 |                         # class number, use it
324 |                         next_val = (
325 |                             lines[i + lookahead + 1].strip()
326 |                             if i + lookahead + 1 < len(lines)
327 |                             else ""
328 |                         )
329 |                         if next_val and not next_val.lower().startswith("class number"):
330 |                             current_component = parts[0].strip() + ": " + next_val
331 |                     break
332 |                 # If the block restarts, stop searching
333 |                 if candidate.startswith("Class details"):
334 |                     break
335 |                 lookahead += 1
336 |             i += 1
337 |             continue
338 | 
339 |         # Parse other class attributes
340 |         # Detect inline component labels anywhere in the Class details block
341 |         if line.lower().startswith("enrolment class") or line.lower().startswith(
342 |             "related class"
343 |         ):
344 |             parts = line.split(":", 1)
345 |             if len(parts) > 1 and parts[1].strip():
346 |                 current_component = parts[0].strip() + ": " + parts[1].strip()
347 |             else:
348 |                 next_val = lines[i + 1].strip() if i + 1 < len(lines) else ""
349 |                 if next_val and not next_val.lower().startswith("class number"):
350 |                     current_component = parts[0].strip() + ": " + next_val
351 |                     i += 1
352 |             if current_class and not current_class.get("class_number"):
353 |                 current_class["component"] = current_component
354 |             i += 1
355 |             continue
356 | 
357 |         if line.startswith("Class number"):
358 |             # If we already have a class with a class_number, append it and
359 |             # start a new one
360 |             if current_class and current_class.get("class_number"):
361 |                 parsed_classes.append(current_class)
362 |                 # Carry over the campus/header values
363 |                 current_class = {
364 |                     "meetings": [],
365 |                     "campus": current_class.get("campus"),
366 |                     "component": current_component or current_class.get("component"),
367 |                 }
368 |             elif not current_class:
369 |                 current_class = {"meetings": []}
370 |             current_class["class_number"] = line.split("Class number")[-1].strip()
371 |             # Ensure the class has a component (inherit from the block context)
372 |             if not current_class.get("component"):
373 |                 current_class["component"] = current_component or "unknown"
374 |             i += 1
375 |             continue
376 | 
377 |         if line.startswith("Section"):
378 |             if not current_class:
379 |                 current_class = {"meetings": []}
380 |                 current_class["component"] = current_component or "unknown"
381 |             current_class["section"] = line.split("Section")[-1].strip()
382 |             i += 1
383 |             continue
384 |         if line.startswith("Size"):
385 |             if not current_class:
386 |                 current_class = {"meetings": []}
387 |                 current_class["component"] = current_component or "unknown"
388 |             current_class["size"] = line.split("Size")[-1].strip()
389 |             i += 1
390 |             continue
391 |         if line.startswith("Available"):
392 |             if not current_class:
393 |                 current_class = {"meetings": []}
394 |                 current_class["component"] = current_component or "unknown"
395 |             if line.split("Available")[-1].strip().isdigit():
396 |                 current_class["available"] = line.split("Available")[-1].strip()
397 |             i += 1
398 |             continue
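399 | 
400 |         # The meeting table arrives flattened to one cell per line: the six
401 |         # header labels first, then rows such as (illustrative values)
402 |         #   "3 Aug - 21 Sep", "Monday", "10am - 12pm", "North Terrace", ...
403 |         # so the code below matches the header sequence, then reads rows five
404 |         # fields at a time, leaving the Instructor column unread.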
current_class["component"] = current_component or "unknown" 401 | current_class.setdefault("meetings", []).append(meeting) 402 | i = next_i 403 | continue 404 | 405 | # Fallback: advance 406 | i += 1 407 | 408 | # Append the last class if it contains a class number 409 | if current_class and current_class.get("class_number"): 410 | parsed_classes.append(current_class) 411 | return parsed_classes 412 | --------------------------------------------------------------------------------