├── requirements.txt ├── plots ├── pic_2.png ├── plot_2025-04-26 18-34-44_2.png └── plot_2025-04-26 18-34-44_3.png ├── README.md ├── system_class.py ├── main.py └── analysis.py /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | pandas 4 | -------------------------------------------------------------------------------- /plots/pic_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/pic_2.png -------------------------------------------------------------------------------- /plots/plot_2025-04-26 18-34-44_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/plot_2025-04-26 18-34-44_2.png -------------------------------------------------------------------------------- /plots/plot_2025-04-26 18-34-44_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/plot_2025-04-26 18-34-44_3.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # UTM Department Data Analysis (Demo Only) 3 | 4 | This project demonstrates how to analyze and visualize rating data using Bayesian averaging, based on data that was collected from publicly available pages on [RateMyProfessors](https://www.ratemyprofessors.com/). 5 | Only analysis logic and example plots are retained for educational purposes. 6 | 7 | --- 8 | 9 | ## Project Structure 10 | 11 | - `system_class.py`: 12 | Defines core classes including `University`, `Department`, `ProfData`, and `CourseUnderProf`. 
13 | 14 | - `analysis.py`: 15 | Contains statistical functions and Bayesian scoring logic, as well as plotting utilities. 16 | 17 | - `main.py`: 18 | Main script to run analysis and generate plots (no longer includes any scraping or data fetching). 19 | 20 | --- 21 | 22 | ## Current Status 23 | 24 | - Code for analysis, Bayesian ranking, and visualization 25 | - Sample output plots generated from previously available data (shown below) 26 | 27 | --- 28 | 29 | ## Sample Visualizations 30 | 31 | These plots are **examples** only, based on previously generated data that has now been removed: 32 | 33 | ![Difficulty Ranking Chart](https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/pic_2.png) 34 | 35 | --- 36 | 37 | ## Usage (With Your Own Data) 38 | 39 | If you have your own JSON-formatted professor data (not provided here), you can place it in the `data/` directory: 40 | 41 | - The expected filename is `all_prof_data.json` 42 | - Run the main script: 43 | 44 | First, set up the environment: 45 | 46 | ```bash 47 | pip install -r requirements.txt 48 | ``` 49 | 50 | Then run the analysis: 51 | 52 | ```bash 53 | python main.py 54 | ``` 55 | -------------------------------------------------------------------------------- /system_class.py: -------------------------------------------------------------------------------- 1 | class CourseUnderProf: 2 | """ 3 | A course class by a single prof.
4 | """ 5 | course_code: str 6 | rating: int 7 | difficulty: int 8 | size: int 9 | 10 | def __init__(self, course_code) -> None: 11 | """ 12 | Initializer 13 | """ 14 | self.course_code = course_code 15 | self.rating = 0 16 | self.difficulty = 0 17 | self.size = 0 18 | 19 | def update(self, rate, difficulty): 20 | """ 21 | Update by a single data(comment) 22 | """ 23 | self.size += 1 24 | self.rating += rate 25 | self.difficulty += difficulty 26 | 27 | def get_avg_rate(self) -> tuple[float, float]: 28 | return (self.rating / self.size, 29 | self.difficulty / self.size) 30 | 31 | 32 | class ProfData: 33 | """ 34 | A prof data, including id, name, department... 35 | """ 36 | course_map: dict[str, CourseUnderProf] 37 | rating: int 38 | difficulty: int 39 | sample_size: int 40 | name: str 41 | department: str 42 | comments: list 43 | 44 | def __init__(self, data): 45 | self.id = data['id'] 46 | self.name = data['name'] 47 | self.department = data['department'] 48 | self.sample_size = len(data['comments']) 49 | self.rating = 0 50 | self.difficulty = 0 51 | self.comments = [] 52 | self.course_map = {} 53 | # self.raw_comments = data['comments'] 54 | 55 | for single_rate in data['comments']: 56 | clarity = single_rate["clarityRating"] 57 | difficulty = single_rate["difficultyRating"] 58 | self.rating += clarity 59 | self.difficulty += difficulty 60 | self.comments.append(single_rate["comment"]) 61 | 62 | course_name = single_rate["class"][:7] 63 | if course_name not in self.course_map: 64 | self.course_map[course_name] = CourseUnderProf(course_name) 65 | self.course_map[course_name].update(clarity, difficulty) 66 | 67 | self.course = list(self.course_map.values()) 68 | 69 | def get_avg_rate(self): 70 | if self.sample_size == 0: 71 | return 0.0, 0.0 72 | return self.rating / self.sample_size, self.difficulty / self.sample_size 73 | 74 | 75 | class Department: 76 | """ 77 | A department containing profs. 
78 | """ 79 | name: str 80 | profs: list[ProfData] 81 | def __init__(self, name): 82 | self.name = name 83 | self.profs = [] 84 | 85 | def update_prof(self, prof_data): 86 | self.profs.append(ProfData(prof_data)) 87 | 88 | 89 | class University: 90 | """ 91 | A university containing all departments. 92 | """ 93 | name: str 94 | departments: dict[str, Department] 95 | 96 | def __init__(self, name, data): 97 | self.name = name 98 | self.departments = {} 99 | 100 | for prof in data: 101 | dept_name = prof['department'] 102 | if dept_name not in self.departments: 103 | self.departments[dept_name] = Department(dept_name) 104 | self.departments[dept_name].update_prof(prof) 105 | 106 | def get_all_departments(self): 107 | return list(self.departments.values()) 108 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from analysis import * 3 | from update import * 4 | from system_class import * 5 | import os 6 | import shutil 7 | 8 | MIN_REVIEW = 6 9 | 10 | while True: 11 | 12 | update_ = input("Would you like to update? 
(y/n) ") 13 | if update_ == 'y' or update_ == 'yes': 14 | 15 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | source_file = os.path.join(BASE_DIR, 'data', 'all_prof_data.json') 18 | backup_dir = os.path.join(BASE_DIR, 'data', 'data_copy') 19 | backup_file = os.path.join(backup_dir, 'all_prof_data.json') 20 | shutil.copy(source_file, backup_file) 21 | 22 | elif update_ == 'n' or update_ == 'no': 23 | break 24 | 25 | else: 26 | print("Invalid input, please try again") 27 | 28 | # read all data 29 | try: 30 | with open("data/all_prof_data.json", "r", encoding="utf-8") as f: 31 | prof_data = json.load(f) 32 | except: 33 | print("Failed to load local data, trying again...") 34 | try: 35 | with open("data/all_prof_data.json", "r", encoding="utf-8") as f: 36 | prof_data = json.load(f) 37 | except: 38 | print("Error occurs, please report this") 39 | 40 | # initialize 41 | utm = University('utm', prof_data) 42 | all_profs = [prof for dept in utm.departments.values() \ 43 | for prof in dept.profs] 44 | 45 | valid_profs = [prof for prof in all_profs \ 46 | if prof.sample_size >= MIN_REVIEW] 47 | 48 | # avg 49 | C_rating = sum(p.get_avg_rate()[0] for p in valid_profs) / len(valid_profs) 50 | C_difficulty = sum(p.get_avg_rate()[1] for p in valid_profs) / len(valid_profs) 51 | m = sum(p.sample_size for p in valid_profs) / len(valid_profs) 52 | 53 | # sorting: Bayesian Rating & Difficulty 54 | sorted_by_rating = sorted( 55 | valid_profs, 56 | key=lambda p: bayesian_score(p.get_avg_rate()[0], p.sample_size, C_rating, m), 57 | reverse=False 58 | ) 59 | sorted_by_difficulty = sorted( 60 | valid_profs, 61 | key=lambda p: bayesian_score(p.get_avg_rate()[1], p.sample_size, C_difficulty, m), 62 | reverse=True 63 | ) 64 | 65 | # show <= 50 profs 66 | top_n = 50 67 | rating_names = [p.name for p in sorted_by_rating[:top_n]] 68 | rating_scores = [ 69 | bayesian_score(p.get_avg_rate()[0], p.sample_size, C_rating, m) 70 | for p in sorted_by_rating[:top_n] 71 | ] 72 | 73 | 
difficulty_names = [p.name for p in sorted_by_difficulty[:top_n]] 74 | difficulty_scores = [ 75 | bayesian_score(p.get_avg_rate()[1], p.sample_size, C_difficulty, m) 76 | for p in sorted_by_difficulty[:top_n] 77 | ] 78 | 79 | # plot 80 | plot_prof_ranking( 81 | rating_names, 82 | rating_scores, 83 | f"Top worst {top_n} Professors by Bayesian Rating(accurate 2025-4) review > 5", 84 | "Bayesian Rating (0-5)" 85 | ) 86 | 87 | plot_prof_ranking( 88 | difficulty_names, 89 | difficulty_scores, 90 | f"Top {top_n} Professors by Bayesian Difficulty(accurate 2025-4) review > 5", 91 | "Bayesian Difficulty (0-5)" 92 | ) 93 | 94 | # department same logic 95 | dept_stats = [] 96 | for dept in utm.departments.values(): 97 | if dept.profs: 98 | avg_rating = sum(p.get_avg_rate()[0] for p in dept.profs) / len(dept.profs) 99 | avg_difficulty = sum(p.get_avg_rate()[1] for p in dept.profs) / len(dept.profs) 100 | dept_stats.append((dept.name, avg_rating, avg_difficulty)) 101 | 102 | # generate plots using data 103 | sorted_by_rating = sorted(dept_stats, key=lambda x: x[1], reverse=True) 104 | dept_names = [d[0] for d in sorted_by_rating] 105 | dept_rating_avgs = [d[1] for d in sorted_by_rating] 106 | 107 | plot_prof_ranking( 108 | dept_names, 109 | dept_rating_avgs, 110 | "Department Average Ratings", 111 | "Rating (0-5)" 112 | ) 113 | 114 | sorted_by_difficulty = sorted(dept_stats, key=lambda x: x[2], reverse=True) 115 | dept_names = [d[0] for d in sorted_by_difficulty] 116 | dept_difficulty_avgs = [d[2] for d in sorted_by_difficulty] 117 | 118 | plot_prof_ranking( 119 | dept_names, 120 | dept_difficulty_avgs, 121 | "Department Average Difficulty", 122 | "Difficulty (0-5)" 123 | ) 124 | 125 | dept_input = input("Enter your department name:").strip() 126 | plot_dept_professors_by_metric(dept_input, utm, metric="rating") 127 | plot_dept_professors_by_metric(dept_input, utm, metric="difficulty") 128 | -------------------------------------------------------------------------------- 
/analysis.py: -------------------------------------------------------------------------------- 1 | from system_class import * 2 | from update import * 3 | import json 4 | import difflib 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | # bayesian method 9 | def bayesian_score(avg, n, C, m): 10 | return (n / (n + m)) * avg + (m / (n + m)) * C 11 | 12 | 13 | # ploting 14 | def plot_prof_ranking(names, scores, title, xlabel): 15 | bar_height = 0.35 16 | height = max(6, bar_height * len(names)) 17 | plt.figure(figsize=(12, height)) 18 | plt.barh(names[:], scores[:]) 19 | plt.title(title) 20 | plt.xlabel(xlabel) 21 | plt.xlim(0, 5) 22 | plt.tight_layout() 23 | plt.show() 24 | 25 | 26 | def get_department_by_fuzzy_name(utm, query): 27 | """Match department""" 28 | all_names = list(utm.departments.keys()) 29 | matches = difflib.get_close_matches(query.lower(), 30 | [name.lower() for name in all_names], 31 | n=1, 32 | cutoff=0.4) 33 | 34 | # find the name 35 | for name in all_names: 36 | if name.lower() == matches[0]: 37 | return utm.departments[name] 38 | return None 39 | 40 | def plot_dept_professors_by_difficulty(dept_name, utm, min_reviews=6): 41 | dept = get_department_by_fuzzy_name(utm, dept_name) 42 | if not dept: 43 | return 44 | 45 | profs_in_dept = [p for p in dept.profs if p.sample_size >= min_reviews] 46 | if not profs_in_dept: 47 | return 48 | 49 | C_difficulty = sum(p.get_avg_rate()[1] for p in profs_in_dept) / len(profs_in_dept) 50 | m = sum(p.sample_size for p in profs_in_dept) / len(profs_in_dept) 51 | 52 | # sorting 53 | sorted_profs = sorted( 54 | profs_in_dept, 55 | key=lambda p: bayesian_score(p.get_avg_rate()[1], 56 | p.sample_size, C_difficulty, m), 57 | reverse=True 58 | ) 59 | names = [p.name for p in sorted_profs] 60 | scores = [bayesian_score(p.get_avg_rate()[1], p.sample_size, 61 | C_difficulty, m) for p in sorted_profs] 62 | 63 | # plot 64 | plot_prof_ranking( 65 | names, 66 | scores, 67 | f"{dept.name} - Professors by Bayesian Difficulty 
(≥{min_reviews} reviews)", 68 | "Bayesian Difficulty (0-5)" 69 | ) 70 | 71 | 72 | def plot_dept_professors_by_metric(dept_name, utm, metric="difficulty", 73 | min_reviews=6): 74 | """ 75 | metric: "difficulty" or "rating" 76 | """ 77 | 78 | # Matches department 79 | all_names = list(utm.departments.keys()) 80 | matches = difflib.get_close_matches(dept_name.lower(), [name.lower() for name in all_names], n=1, cutoff=0.4) 81 | if not matches: 82 | return 83 | 84 | matched_name = next(name for name in all_names if name.lower() == matches[0]) 85 | dept = utm.departments[matched_name] 86 | 87 | # filtering 88 | profs_in_dept = [p for p in dept.profs if p.sample_size >= min_reviews] 89 | if not profs_in_dept: 90 | return 91 | 92 | if metric == "difficulty": 93 | C = sum(p.get_avg_rate()[1] for p in profs_in_dept) / len(profs_in_dept) 94 | key_fn = lambda p: p.get_avg_rate()[1] 95 | label = "Bayesian Difficulty (0-5)" 96 | elif metric == "rating": 97 | C = sum(p.get_avg_rate()[0] for p in profs_in_dept) / len(profs_in_dept) 98 | key_fn = lambda p: p.get_avg_rate()[0] 99 | label = "Bayesian Rating (0-5)" 100 | else: 101 | raise ValueError("metric peremetre can only be 'rating' or 'difficulty'") 102 | 103 | m = sum(p.sample_size for p in profs_in_dept) / len(profs_in_dept) 104 | if metric == "rating": 105 | 106 | sorted_profs = sorted( 107 | profs_in_dept, 108 | key=lambda p: bayesian_score(key_fn(p), p.sample_size, C, m), 109 | reverse=False 110 | ) 111 | else: 112 | sorted_profs = sorted( 113 | profs_in_dept, 114 | key=lambda p: bayesian_score(key_fn(p), p.sample_size, C, m), 115 | reverse=True 116 | ) 117 | names = [p.name for p in sorted_profs] 118 | scores = [bayesian_score(key_fn(p), p.sample_size, C, m) for p in sorted_profs] 119 | 120 | # plots 121 | plot_prof_ranking( 122 | names, 123 | scores, 124 | f"{matched_name} - Professors by Bayesian {metric.title()} (≥{min_reviews} reviews)", 125 | label 126 | ) 127 | 
--------------------------------------------------------------------------------