├── mock_ranked_resumes_cultural_fit.xlsx
├── README.md
├── resume code
└── Resume code.py

/mock_ranked_resumes_cultural_fit.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Okes2024/AI-based-Resume-Screening-for-Cultural-Fit/HEAD/mock_ranked_resumes_cultural_fit.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
📌 Overview
This project uses AI and NLP techniques to screen resumes for cultural fit, helping recruiters identify candidates who align with an organization’s values, work environment, and team dynamics. Each resume is compared to a company culture statement using SBERT sentence embeddings and cosine similarity, combined with counts of culture-related keywords, to produce a ranked cultural-fit score.

📂 Features
- Resume text extraction from PDF and TXT files (pdfplumber)
- NLP preprocessing with NLTK stopword removal
- Semantic similarity between each resume and a company culture statement (SBERT + cosine similarity)
- Keyword scoring across culture categories (collaboration, innovation, transparency, inclusivity, growth)
- Ranked results exported to Excel, plus a bar chart of the top 5 candidates
- Scalable to large recruitment datasets

🛠️ Technologies Used
- Python
- Pandas
- Scikit-learn
- NLTK
- Sentence-Transformers (SBERT)
- pdfplumber
- Matplotlib

📊 Dataset
Synthetic or anonymized resumes placed in a local resumes/ folder, processed for semantic similarity to the culture statement and for culture-related keywords. A mock sample of the ranked output is included as mock_ranked_resumes_cultural_fit.xlsx.

🚀 How to Run
Clone the repository:

```bash
git clone https://github.com/Okes2024/AI-based-Resume-Screening-for-Cultural-Fit.git
```

Navigate to the project folder:

```bash
cd AI-based-Resume-Screening-for-Cultural-Fit
```

Install dependencies (a sketch of requirements.txt is given at the end of this README):

```bash
pip install -r requirements.txt
```

Place the resumes to screen (.pdf or .txt files) in a resumes/ folder inside the project, then run the script:

```bash
python "Resume code.py"
```

📜 License
This project is licensed under the MIT License – see the LICENSE file for details.

👨‍💻 Author
Okes Imoni
GitHub: github.com/Okes2024
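🧮 Scoring
Both scripts compute final_score as 0.6 × semantic similarity (SBERT cosine similarity between the resume and the culture statement) plus 0.4 × the keyword score normalized by the highest keyword count in the batch. A small worked example with hypothetical numbers:

```python
# Hypothetical values, for illustration only
similarity_score = 0.42             # cosine similarity to the culture statement
keyword_score, max_keyword = 7, 10  # this resume's keyword count vs. the batch maximum
final_score = similarity_score * 0.6 + (keyword_score / max_keyword) * 0.4
print(round(final_score, 3))        # 0.532
```

📦 Requirements (sketch)
The repository does not ship a requirements.txt, so the install step above assumes you create one. A minimal sketch, inferred from the imports in Resume code.py; openpyxl is an assumption, added because pandas needs an engine such as openpyxl to write .xlsx files:

```text
pandas
scikit-learn
nltk
sentence-transformers
pdfplumber
matplotlib
openpyxl
```

Pin versions as needed for your environment.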
--------------------------------------------------------------------------------
/resume code:
--------------------------------------------------------------------------------
import os
import pdfplumber
import nltk
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# Load SBERT model for embedding resumes and culture statements
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define path to resume folder
RESUME_DIR = 'resumes/'  # folder should contain .txt or .pdf files

# Define company culture statement
company_culture = """
At our company, we value collaboration, innovation, transparency, inclusivity, and a growth mindset.
We encourage continuous learning, diversity of thought, and respect for all individuals.
"""

# Define culture-related keyword categories
culture_keywords = {
    'collaboration': ['teamwork', 'collaborate', 'partner', 'group work', 'cross-functional'],
    'innovation': ['innovative', 'creative', 'ideas', 'disruptive', 'design thinking'],
    'transparency': ['honest', 'clear', 'open', 'direct', 'authentic'],
    'inclusivity': ['inclusive', 'diverse', 'equality', 'respectful', 'belonging'],
    'growth': ['learning', 'adapt', 'grow', 'resilient', 'curious']
}

# Function to extract text from PDF or TXT
def extract_text(filepath):
    text = ""
    if filepath.endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
    elif filepath.endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
    return text

# Function to clean and preprocess text
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(tokens)

# Function to count keyword matches
def keyword_score(text, keyword_dict):
    scores = {}
    for key, keywords in keyword_dict.items():
        count = sum(text.lower().count(kw) for kw in keywords)
        scores[key] = count
    return scores

# Embed the company culture
culture_embedding = model.encode([company_culture])[0]

# Store all scores
results = []

# Process resumes
for file in os.listdir(RESUME_DIR):
    if file.endswith('.pdf') or file.endswith('.txt'):
        path = os.path.join(RESUME_DIR, file)
        raw_text = extract_text(path)
        clean_text = preprocess(raw_text)

        # Semantic similarity
        resume_embedding = model.encode([clean_text])[0]
        similarity = cosine_similarity([culture_embedding], [resume_embedding])[0][0]

        # Keyword analysis
        scores = keyword_score(raw_text, culture_keywords)
        total_keyword_score = sum(scores.values())

        # Record results
        result = {
            'filename': file,
            'similarity_score': round(similarity, 3),
            'keyword_score': total_keyword_score,
            **scores
        }
        results.append(result)

# Create DataFrame
df = pd.DataFrame(results)

# Normalize the keyword score; guard against division by zero when no keywords match
max_kw = df['keyword_score'].max()
kw_norm = (df['keyword_score'] / max_kw) if max_kw > 0 else 0
df['final_score'] = df['similarity_score'] * 0.6 + kw_norm * 0.4
df_sorted = df.sort_values(by='final_score', ascending=False)

# Export results to Excel
df_sorted.to_excel('ranked_resumes_cultural_fit.xlsx', index=False)

# Plot top 5 candidates
top5 = df_sorted.head(5)
plt.figure(figsize=(10, 6))
plt.bar(top5['filename'], top5['final_score'], color='green')
plt.title("Top 5 Resume Matches to Company Culture")
plt.ylabel("Cultural Fit Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("top5_resume_fit.png")
plt.show()
--------------------------------------------------------------------------------
/Resume code.py:
--------------------------------------------------------------------------------
import os
import pdfplumber
import nltk
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# Ensure NLTK stopwords are available
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# Load SBERT model for embedding resumes and culture statements
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define path to resume folder
RESUME_DIR = 'resumes/'  # folder should contain .txt or .pdf files

# Define company culture statement
company_culture = """
At our company, we value collaboration, innovation, transparency, inclusivity, and a growth mindset.
We encourage continuous learning, diversity of thought, and respect for all individuals.
"""

# Define culture-related keyword categories
culture_keywords = {
    'collaboration': ['teamwork', 'collaborate', 'partner', 'group work', 'cross-functional'],
    'innovation': ['innovative', 'creative', 'ideas', 'disruptive', 'design thinking'],
    'transparency': ['honest', 'clear', 'open', 'direct', 'authentic'],
    'inclusivity': ['inclusive', 'diverse', 'equality', 'respectful', 'belonging'],
    'growth': ['learning', 'adapt', 'grow', 'resilient', 'curious']
}

# Function to extract text from PDF or TXT
def extract_text(filepath):
    text = ""
    if filepath.lower().endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
    elif filepath.lower().endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
    return text

# Function to clean and preprocess text
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(tokens)

# Function to count keyword matches
def keyword_score(text, keyword_dict):
    scores = {}
    low = text.lower()
    for key, keywords in keyword_dict.items():
        count = sum(low.count(kw) for kw in keywords)
        scores[key] = count
    return scores

def main():
    # Embed the company culture
    culture_embedding = model.encode([company_culture])[0]

    # Store all scores
    results = []

    # Ensure resume directory exists
    if not os.path.isdir(RESUME_DIR):
        raise FileNotFoundError(f"Resume directory not found: {RESUME_DIR}")

    # Process resumes
    for file in os.listdir(RESUME_DIR):
        if file.lower().endswith('.pdf') or file.lower().endswith('.txt'):
            path = os.path.join(RESUME_DIR, file)
            raw_text = extract_text(path)
            clean_text = preprocess(raw_text)

            # Semantic similarity
            resume_embedding = model.encode([clean_text])[0]
            similarity = cosine_similarity([culture_embedding], [resume_embedding])[0][0]

            # Keyword analysis
            scores = keyword_score(raw_text, culture_keywords)
            total_keyword_score = sum(scores.values())

            # Record results
            result = {
                'filename': file,
                'similarity_score': round(float(similarity), 3),
                'keyword_score': int(total_keyword_score),
                **scores
            }
            results.append(result)

    # Create DataFrame
    df = pd.DataFrame(results)
    if df.empty:
        raise ValueError("No resumes processed. Ensure there are .pdf or .txt files in the 'resumes/' folder.")

    # Normalize keyword score safely
    max_kw = df['keyword_score'].max()
    kw_norm = (df['keyword_score'] / max_kw) if max_kw > 0 else 0

    df['final_score'] = df['similarity_score'] * 0.6 + kw_norm * 0.4
    df_sorted = df.sort_values(by='final_score', ascending=False)

    # Export results to Excel
    df_sorted.to_excel('ranked_resumes_cultural_fit.xlsx', index=False)

    # Plot top 5 candidates
    top5 = df_sorted.head(5)
    plt.figure(figsize=(10, 6))
    plt.bar(top5['filename'], top5['final_score'])
    plt.title("Top 5 Resume Matches to Company Culture")
    plt.ylabel("Cultural Fit Score")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig("top5_resume_fit.png")
    # plt.show()  # optional: enable for interactive environments

    print("Saved: ranked_resumes_cultural_fit.xlsx, top5_resume_fit.png")
if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------