├── mock_ranked_resumes_cultural_fit.xlsx
├── README.md
├── resume code
└── Resume code.py

/mock_ranked_resumes_cultural_fit.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Okes2024/AI-based-Resume-Screening-for-Cultural-Fit/HEAD/mock_ranked_resumes_cultural_fit.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
📌 Overview
This project uses AI and NLP techniques to screen resumes for cultural fit, helping recruiters identify candidates who align with an organization’s values, work environment, and team dynamics. Each resume is compared to a company culture statement using SBERT sentence embeddings and cosine similarity, combined with counts of culture-related keywords, to produce a ranked cultural-fit score.

📂 Features
- Resume text extraction from PDF and TXT files (pdfplumber)
- NLP preprocessing with NLTK stopword removal
- Semantic similarity between each resume and a company culture statement (SBERT + cosine similarity)
- Keyword scoring across culture categories (collaboration, innovation, transparency, inclusivity, growth)
- Ranked results exported to Excel, plus a bar chart of the top 5 candidates
- Scalable to large recruitment datasets

🛠️ Technologies Used
- Python
- Pandas
- Scikit-learn
- NLTK
- Sentence-Transformers (SBERT)
- pdfplumber
- Matplotlib

📊 Dataset
Synthetic or anonymized resumes placed in a local resumes/ folder, processed for semantic similarity to the culture statement and for culture-related keywords. A mock sample of the ranked output is included as mock_ranked_resumes_cultural_fit.xlsx.

🚀 How to Run
Clone the repository:

```bash
git clone https://github.com/Okes2024/AI-based-Resume-Screening-for-Cultural-Fit.git
```

Navigate to the project folder:

```bash
cd AI-based-Resume-Screening-for-Cultural-Fit
```

Install dependencies (a sketch of requirements.txt is given at the end of this README):

```bash
pip install -r requirements.txt
```

Place the resumes to screen (.pdf or .txt files) in a resumes/ folder inside the project, then run the script:

```bash
python "Resume code.py"
```

📜 License
This project is licensed under the MIT License – see the LICENSE file for details.

👨‍💻 Author
Okes Imoni
GitHub: github.com/Okes2024
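🧮 Scoring
Both scripts compute final_score as 0.6 × semantic similarity (SBERT cosine similarity between the resume and the culture statement) plus 0.4 × the keyword score normalized by the highest keyword count in the batch. A small worked example with hypothetical numbers:

```python
# Hypothetical values, for illustration only
similarity_score = 0.42             # cosine similarity to the culture statement
keyword_score, max_keyword = 7, 10  # this resume's keyword count vs. the batch maximum
final_score = similarity_score * 0.6 + (keyword_score / max_keyword) * 0.4
print(round(final_score, 3))        # 0.532
```

📦 Requirements (sketch)
The repository does not ship a requirements.txt, so the install step above assumes you create one. A minimal sketch, inferred from the imports in Resume code.py; openpyxl is an assumption, added because pandas needs an engine such as openpyxl to write .xlsx files:

```text
pandas
scikit-learn
nltk
sentence-transformers
pdfplumber
matplotlib
openpyxl
```

Pin versions as needed for your environment.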
--------------------------------------------------------------------------------
/resume code:
--------------------------------------------------------------------------------
import os
import pdfplumber
import nltk
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# Load SBERT model for embedding resumes and culture statements
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define path to resume folder
RESUME_DIR = 'resumes/'  # folder should contain .txt or .pdf files

# Define company culture statement
company_culture = """
At our company, we value collaboration, innovation, transparency, inclusivity, and a growth mindset.
We encourage continuous learning, diversity of thought, and respect for all individuals.
"""

# Define culture-related keyword categories
culture_keywords = {
    'collaboration': ['teamwork', 'collaborate', 'partner', 'group work', 'cross-functional'],
    'innovation': ['innovative', 'creative', 'ideas', 'disruptive', 'design thinking'],
    'transparency': ['honest', 'clear', 'open', 'direct', 'authentic'],
    'inclusivity': ['inclusive', 'diverse', 'equality', 'respectful', 'belonging'],
    'growth': ['learning', 'adapt', 'grow', 'resilient', 'curious']
}

# Function to extract text from PDF or TXT
def extract_text(filepath):
    text = ""
    if filepath.endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
    elif filepath.endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
    return text

# Function to clean and preprocess text
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(tokens)

# Function to count keyword matches
def keyword_score(text, keyword_dict):
    scores = {}
    for key, keywords in keyword_dict.items():
        count = sum(text.lower().count(kw) for kw in keywords)
        scores[key] = count
    return scores

# Embed the company culture
culture_embedding = model.encode([company_culture])[0]

# Store all scores
results = []

# Process resumes
for file in os.listdir(RESUME_DIR):
    if file.endswith('.pdf') or file.endswith('.txt'):
        path = os.path.join(RESUME_DIR, file)
        raw_text = extract_text(path)
        clean_text = preprocess(raw_text)

        # Semantic similarity
        resume_embedding = model.encode([clean_text])[0]
        similarity = cosine_similarity([culture_embedding], [resume_embedding])[0][0]

        # Keyword analysis
        scores = keyword_score(raw_text, culture_keywords)
        total_keyword_score = sum(scores.values())

        # Record results
        result = {
            'filename': file,
            'similarity_score': round(similarity, 3),
            'keyword_score': total_keyword_score,
            **scores
        }
        results.append(result)

# Create DataFrame
df = pd.DataFrame(results)

# Normalize the keyword score; guard against division by zero when no keywords match
max_kw = df['keyword_score'].max()
kw_norm = (df['keyword_score'] / max_kw) if max_kw > 0 else 0
df['final_score'] = df['similarity_score'] * 0.6 + kw_norm * 0.4
df_sorted = df.sort_values(by='final_score', ascending=False)

# Export results to Excel
df_sorted.to_excel('ranked_resumes_cultural_fit.xlsx', index=False)

# Plot top 5 candidates
top5 = df_sorted.head(5)
plt.figure(figsize=(10, 6))
plt.bar(top5['filename'], top5['final_score'], color='green')
plt.title("Top 5 Resume Matches to Company Culture")
plt.ylabel("Cultural Fit Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("top5_resume_fit.png")
plt.show()
--------------------------------------------------------------------------------
/Resume code.py:
--------------------------------------------------------------------------------
import os
import pdfplumber
import nltk
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# Ensure NLTK stopwords are available
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# Load SBERT model for embedding resumes and culture statements
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define path to resume folder
RESUME_DIR = 'resumes/'  # folder should contain .txt or .pdf files

# Define company culture statement
company_culture = """
At our company, we value collaboration, innovation, transparency, inclusivity, and a growth mindset.
We encourage continuous learning, diversity of thought, and respect for all individuals.
"""

# Define culture-related keyword categories
culture_keywords = {
    'collaboration': ['teamwork', 'collaborate', 'partner', 'group work', 'cross-functional'],
    'innovation': ['innovative', 'creative', 'ideas', 'disruptive', 'design thinking'],
    'transparency': ['honest', 'clear', 'open', 'direct', 'authentic'],
    'inclusivity': ['inclusive', 'diverse', 'equality', 'respectful', 'belonging'],
    'growth': ['learning', 'adapt', 'grow', 'resilient', 'curious']
}

# Function to extract text from PDF or TXT
def extract_text(filepath):
    text = ""
    if filepath.lower().endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
    elif filepath.lower().endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
    return text

# Function to clean and preprocess text
def preprocess(text):
    text = re.sub(r'\W+', ' ', text.lower())
    tokens = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(tokens)

# Function to count keyword matches
def keyword_score(text, keyword_dict):
    scores = {}
    low = text.lower()
    for key, keywords in keyword_dict.items():
        count = sum(low.count(kw) for kw in keywords)
        scores[key] = count
    return scores

def main():
    # Embed the company culture
    culture_embedding = model.encode([company_culture])[0]

    # Store all scores
    results = []

    # Ensure resume directory exists
    if not os.path.isdir(RESUME_DIR):
        raise FileNotFoundError(f"Resume directory not found: {RESUME_DIR}")

    # Process resumes
    for file in os.listdir(RESUME_DIR):
        if file.lower().endswith('.pdf') or file.lower().endswith('.txt'):
            path = os.path.join(RESUME_DIR, file)
            raw_text = extract_text(path)
            clean_text = preprocess(raw_text)

            # Semantic similarity
            resume_embedding = model.encode([clean_text])[0]
            similarity = cosine_similarity([culture_embedding], [resume_embedding])[0][0]

            # Keyword analysis
            scores = keyword_score(raw_text, culture_keywords)
            total_keyword_score = sum(scores.values())

            # Record results
            result = {
                'filename': file,
                'similarity_score': round(float(similarity), 3),
                'keyword_score': int(total_keyword_score),
                **scores
            }
            results.append(result)

    # Create DataFrame
    df = pd.DataFrame(results)
    if df.empty:
        raise ValueError("No resumes processed. Ensure there are .pdf or .txt files in the 'resumes/' folder.")

    # Normalize keyword score safely
    max_kw = df['keyword_score'].max()
    kw_norm = (df['keyword_score'] / max_kw) if max_kw > 0 else 0

    df['final_score'] = df['similarity_score'] * 0.6 + kw_norm * 0.4
    df_sorted = df.sort_values(by='final_score', ascending=False)

    # Export results to Excel
    df_sorted.to_excel('ranked_resumes_cultural_fit.xlsx', index=False)

    # Plot top 5 candidates
    top5 = df_sorted.head(5)
    plt.figure(figsize=(10, 6))
    plt.bar(top5['filename'], top5['final_score'])
    plt.title("Top 5 Resume Matches to Company Culture")
    plt.ylabel("Cultural Fit Score")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig("top5_resume_fit.png")
    # plt.show()  # optional: enable for interactive environments

    print("Saved: ranked_resumes_cultural_fit.xlsx, top5_resume_fit.png")
if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------