├── requirements.txt ├── plots ├── pic_2.png ├── plot_2025-04-26 18-34-44_2.png └── plot_2025-04-26 18-34-44_3.png ├── README.md ├── system_class.py ├── main.py └── analysis.py /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | pandas 4 | -------------------------------------------------------------------------------- /plots/pic_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/pic_2.png -------------------------------------------------------------------------------- /plots/plot_2025-04-26 18-34-44_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/plot_2025-04-26 18-34-44_2.png -------------------------------------------------------------------------------- /plots/plot_2025-04-26 18-34-44_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/plot_2025-04-26 18-34-44_3.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # UTM Department Data Analysis (Demo Only) 3 | 4 | This project demonstrates how to analyze and visualize rating data using Bayesian averaging, based on data that was collected from publicly available pages on [RateMyProfessors](https://www.ratemyprofessors.com/). 5 | Only analysis logic and example plots are retained for educational purposes. 6 | 7 | --- 8 | 9 | ## Project Structure 10 | 11 | - `system_class.py`: 12 | Defines core classes including `University`, `Department`, `ProfData`, and `CourseUnderProf`. 
13 | 14 | - `analysis.py`: 15 | Contains statistical functions and Bayesian scoring logic, as well as plotting utilities. 16 | 17 | - `main.py`: 18 | Main script to run analysis and generate plots (no longer includes any scraping or data fetching). 19 | 20 | --- 21 | 22 | ## Current Status 23 | 24 | - Code for analysis, Bayesian ranking, and visualization 25 | - Sample output plots generated from previously available data (shown below) 26 | 27 | --- 28 | 29 | ## Sample Visualizations 30 | 31 | These plots are **examples** only, based on previously generated data that has now been removed: 32 | 33 | ![Difficulty Ranking Chart](https://raw.githubusercontent.com/Jackymn25/utm-department-analysis/HEAD/plots/pic_2.png) 34 | 35 | --- 36 | 37 | ## Usage (With Your Own Data) 38 | 39 | If you have your own JSON-formatted professor data (not provided here), you can place it in the `data/` directory: 40 | 41 | - The expected filename is `all_prof_data.json` 42 | - Run the main script: 43 | 44 | First, set up the environment: 45 | 46 | ```bash 47 | pip install -r requirements.txt 48 | ``` 49 | 50 | Then run the analysis: 51 | 52 | ```bash 53 | python main.py 54 | ``` 55 | -------------------------------------------------------------------------------- /system_class.py: -------------------------------------------------------------------------------- 1 | class CourseUnderProf: 2 | """ 3 | A course class by a single prof.
4 | """ 5 | course_code: str 6 | rating: int 7 | difficulty: int 8 | size: int 9 | 10 | def __init__(self, course_code) -> None: 11 | """ 12 | Initializer 13 | """ 14 | self.course_code = course_code 15 | self.rating = 0 16 | self.difficulty = 0 17 | self.size = 0 18 | 19 | def update(self, rate, difficulty): 20 | """ 21 | Update by a single data(comment) 22 | """ 23 | self.size += 1 24 | self.rating += rate 25 | self.difficulty += difficulty 26 | 27 | def get_avg_rate(self) -> tuple[float, float]: 28 | return (self.rating / self.size, 29 | self.difficulty / self.size) 30 | 31 | 32 | class ProfData: 33 | """ 34 | A prof data, including id, name, department... 35 | """ 36 | course_map: dict[str, CourseUnderProf] 37 | rating: int 38 | difficulty: int 39 | sample_size: int 40 | name: str 41 | department: str 42 | comments: list 43 | 44 | def __init__(self, data): 45 | self.id = data['id'] 46 | self.name = data['name'] 47 | self.department = data['department'] 48 | self.sample_size = len(data['comments']) 49 | self.rating = 0 50 | self.difficulty = 0 51 | self.comments = [] 52 | self.course_map = {} 53 | # self.raw_comments = data['comments'] 54 | 55 | for single_rate in data['comments']: 56 | clarity = single_rate["clarityRating"] 57 | difficulty = single_rate["difficultyRating"] 58 | self.rating += clarity 59 | self.difficulty += difficulty 60 | self.comments.append(single_rate["comment"]) 61 | 62 | course_name = single_rate["class"][:7] 63 | if course_name not in self.course_map: 64 | self.course_map[course_name] = CourseUnderProf(course_name) 65 | self.course_map[course_name].update(clarity, difficulty) 66 | 67 | self.course = list(self.course_map.values()) 68 | 69 | def get_avg_rate(self): 70 | if self.sample_size == 0: 71 | return 0.0, 0.0 72 | return self.rating / self.sample_size, self.difficulty / self.sample_size 73 | 74 | 75 | class Department: 76 | """ 77 | A department containing profs. 
78 | """ 79 | name: str 80 | profs: list[ProfData] 81 | def __init__(self, name): 82 | self.name = name 83 | self.profs = [] 84 | 85 | def update_prof(self, prof_data): 86 | self.profs.append(ProfData(prof_data)) 87 | 88 | 89 | class University: 90 | """ 91 | A university containing all departments. 92 | """ 93 | name: str 94 | departments: dict[str, Department] 95 | 96 | def __init__(self, name, data): 97 | self.name = name 98 | self.departments = {} 99 | 100 | for prof in data: 101 | dept_name = prof['department'] 102 | if dept_name not in self.departments: 103 | self.departments[dept_name] = Department(dept_name) 104 | self.departments[dept_name].update_prof(prof) 105 | 106 | def get_all_departments(self): 107 | return list(self.departments.values()) 108 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from analysis import * 3 | from update import * 4 | from system_class import * 5 | import os 6 | import shutil 7 | 8 | MIN_REVIEW = 6 9 | 10 | while True: 11 | 12 | update_ = input("Would you like to update? 
(y/n) ") 13 | if update_ == 'y' or update_ == 'yes': 14 | 15 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | source_file = os.path.join(BASE_DIR, 'data', 'all_prof_data.json') 18 | backup_dir = os.path.join(BASE_DIR, 'data', 'data_copy') 19 | backup_file = os.path.join(backup_dir, 'all_prof_data.json') 20 | shutil.copy(source_file, backup_file) 21 | 22 | elif update_ == 'n' or update_ == 'no': 23 | break 24 | 25 | else: 26 | print("Invalid input, please try again") 27 | 28 | # read all data 29 | try: 30 | with open("data/all_prof_data.json", "r", encoding="utf-8") as f: 31 | prof_data = json.load(f) 32 | except: 33 | print("Failed to load local data, trying again...") 34 | try: 35 | with open("data/all_prof_data.json", "r", encoding="utf-8") as f: 36 | prof_data = json.load(f) 37 | except: 38 | print("Error occurs, please report this") 39 | 40 | # initialize 41 | utm = University('utm', prof_data) 42 | all_profs = [prof for dept in utm.departments.values() \ 43 | for prof in dept.profs] 44 | 45 | valid_profs = [prof for prof in all_profs \ 46 | if prof.sample_size >= MIN_REVIEW] 47 | 48 | # avg 49 | C_rating = sum(p.get_avg_rate()[0] for p in valid_profs) / len(valid_profs) 50 | C_difficulty = sum(p.get_avg_rate()[1] for p in valid_profs) / len(valid_profs) 51 | m = sum(p.sample_size for p in valid_profs) / len(valid_profs) 52 | 53 | # sorting: Bayesian Rating & Difficulty 54 | sorted_by_rating = sorted( 55 | valid_profs, 56 | key=lambda p: bayesian_score(p.get_avg_rate()[0], p.sample_size, C_rating, m), 57 | reverse=False 58 | ) 59 | sorted_by_difficulty = sorted( 60 | valid_profs, 61 | key=lambda p: bayesian_score(p.get_avg_rate()[1], p.sample_size, C_difficulty, m), 62 | reverse=True 63 | ) 64 | 65 | # show <= 50 profs 66 | top_n = 50 67 | rating_names = [p.name for p in sorted_by_rating[:top_n]] 68 | rating_scores = [ 69 | bayesian_score(p.get_avg_rate()[0], p.sample_size, C_rating, m) 70 | for p in sorted_by_rating[:top_n] 71 | ] 72 | 73 | 
difficulty_names = [p.name for p in sorted_by_difficulty[:top_n]] 74 | difficulty_scores = [ 75 | bayesian_score(p.get_avg_rate()[1], p.sample_size, C_difficulty, m) 76 | for p in sorted_by_difficulty[:top_n] 77 | ] 78 | 79 | # plot 80 | plot_prof_ranking( 81 | rating_names, 82 | rating_scores, 83 | f"Top worst {top_n} Professors by Bayesian Rating(accurate 2025-4) review > 5", 84 | "Bayesian Rating (0-5)" 85 | ) 86 | 87 | plot_prof_ranking( 88 | difficulty_names, 89 | difficulty_scores, 90 | f"Top {top_n} Professors by Bayesian Difficulty(accurate 2025-4) review > 5", 91 | "Bayesian Difficulty (0-5)" 92 | ) 93 | 94 | # department same logic 95 | dept_stats = [] 96 | for dept in utm.departments.values(): 97 | if dept.profs: 98 | avg_rating = sum(p.get_avg_rate()[0] for p in dept.profs) / len(dept.profs) 99 | avg_difficulty = sum(p.get_avg_rate()[1] for p in dept.profs) / len(dept.profs) 100 | dept_stats.append((dept.name, avg_rating, avg_difficulty)) 101 | 102 | # generate plots using data 103 | sorted_by_rating = sorted(dept_stats, key=lambda x: x[1], reverse=True) 104 | dept_names = [d[0] for d in sorted_by_rating] 105 | dept_rating_avgs = [d[1] for d in sorted_by_rating] 106 | 107 | plot_prof_ranking( 108 | dept_names, 109 | dept_rating_avgs, 110 | "Department Average Ratings", 111 | "Rating (0-5)" 112 | ) 113 | 114 | sorted_by_difficulty = sorted(dept_stats, key=lambda x: x[2], reverse=True) 115 | dept_names = [d[0] for d in sorted_by_difficulty] 116 | dept_difficulty_avgs = [d[2] for d in sorted_by_difficulty] 117 | 118 | plot_prof_ranking( 119 | dept_names, 120 | dept_difficulty_avgs, 121 | "Department Average Difficulty", 122 | "Difficulty (0-5)" 123 | ) 124 | 125 | dept_input = input("Enter your department name:").strip() 126 | plot_dept_professors_by_metric(dept_input, utm, metric="rating") 127 | plot_dept_professors_by_metric(dept_input, utm, metric="difficulty") 128 | -------------------------------------------------------------------------------- 
/analysis.py: -------------------------------------------------------------------------------- 1 | from system_class import * 2 | from update import * 3 | import json 4 | import difflib 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | # bayesian method 9 | def bayesian_score(avg, n, C, m): 10 | return (n / (n + m)) * avg + (m / (n + m)) * C 11 | 12 | 13 | # ploting 14 | def plot_prof_ranking(names, scores, title, xlabel): 15 | bar_height = 0.35 16 | height = max(6, bar_height * len(names)) 17 | plt.figure(figsize=(12, height)) 18 | plt.barh(names[:], scores[:]) 19 | plt.title(title) 20 | plt.xlabel(xlabel) 21 | plt.xlim(0, 5) 22 | plt.tight_layout() 23 | plt.show() 24 | 25 | 26 | def get_department_by_fuzzy_name(utm, query): 27 | """Match department""" 28 | all_names = list(utm.departments.keys()) 29 | matches = difflib.get_close_matches(query.lower(), 30 | [name.lower() for name in all_names], 31 | n=1, 32 | cutoff=0.4) 33 | 34 | # find the name 35 | for name in all_names: 36 | if name.lower() == matches[0]: 37 | return utm.departments[name] 38 | return None 39 | 40 | def plot_dept_professors_by_difficulty(dept_name, utm, min_reviews=6): 41 | dept = get_department_by_fuzzy_name(utm, dept_name) 42 | if not dept: 43 | return 44 | 45 | profs_in_dept = [p for p in dept.profs if p.sample_size >= min_reviews] 46 | if not profs_in_dept: 47 | return 48 | 49 | C_difficulty = sum(p.get_avg_rate()[1] for p in profs_in_dept) / len(profs_in_dept) 50 | m = sum(p.sample_size for p in profs_in_dept) / len(profs_in_dept) 51 | 52 | # sorting 53 | sorted_profs = sorted( 54 | profs_in_dept, 55 | key=lambda p: bayesian_score(p.get_avg_rate()[1], 56 | p.sample_size, C_difficulty, m), 57 | reverse=True 58 | ) 59 | names = [p.name for p in sorted_profs] 60 | scores = [bayesian_score(p.get_avg_rate()[1], p.sample_size, 61 | C_difficulty, m) for p in sorted_profs] 62 | 63 | # plot 64 | plot_prof_ranking( 65 | names, 66 | scores, 67 | f"{dept.name} - Professors by Bayesian Difficulty 
(≥{min_reviews} reviews)", 68 | "Bayesian Difficulty (0-5)" 69 | ) 70 | 71 | 72 | def plot_dept_professors_by_metric(dept_name, utm, metric="difficulty", 73 | min_reviews=6): 74 | """ 75 | metric: "difficulty" or "rating" 76 | """ 77 | 78 | # Matches department 79 | all_names = list(utm.departments.keys()) 80 | matches = difflib.get_close_matches(dept_name.lower(), [name.lower() for name in all_names], n=1, cutoff=0.4) 81 | if not matches: 82 | return 83 | 84 | matched_name = next(name for name in all_names if name.lower() == matches[0]) 85 | dept = utm.departments[matched_name] 86 | 87 | # filtering 88 | profs_in_dept = [p for p in dept.profs if p.sample_size >= min_reviews] 89 | if not profs_in_dept: 90 | return 91 | 92 | if metric == "difficulty": 93 | C = sum(p.get_avg_rate()[1] for p in profs_in_dept) / len(profs_in_dept) 94 | key_fn = lambda p: p.get_avg_rate()[1] 95 | label = "Bayesian Difficulty (0-5)" 96 | elif metric == "rating": 97 | C = sum(p.get_avg_rate()[0] for p in profs_in_dept) / len(profs_in_dept) 98 | key_fn = lambda p: p.get_avg_rate()[0] 99 | label = "Bayesian Rating (0-5)" 100 | else: 101 | raise ValueError("metric peremetre can only be 'rating' or 'difficulty'") 102 | 103 | m = sum(p.sample_size for p in profs_in_dept) / len(profs_in_dept) 104 | if metric == "rating": 105 | 106 | sorted_profs = sorted( 107 | profs_in_dept, 108 | key=lambda p: bayesian_score(key_fn(p), p.sample_size, C, m), 109 | reverse=False 110 | ) 111 | else: 112 | sorted_profs = sorted( 113 | profs_in_dept, 114 | key=lambda p: bayesian_score(key_fn(p), p.sample_size, C, m), 115 | reverse=True 116 | ) 117 | names = [p.name for p in sorted_profs] 118 | scores = [bayesian_score(key_fn(p), p.sample_size, C, m) for p in sorted_profs] 119 | 120 | # plots 121 | plot_prof_ranking( 122 | names, 123 | scores, 124 | f"{matched_name} - Professors by Bayesian {metric.title()} (≥{min_reviews} reviews)", 125 | label 126 | ) 127 | 
--------------------------------------------------------------------------------