├── Demo
│   ├── Interface.png
│   ├── Interface_Results.png
│   └── Workflow.png
├── Models.py
├── README.md
├── Resume_scanner.py
├── __pycache__
│   ├── Models.cpython-37.pyc
│   └── Resume_Scanner.cpython-37.pyc
├── application.py
└── requirements.txt

/Demo/Interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface.png

--------------------------------------------------------------------------------
/Demo/Interface_Results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface_Results.png

--------------------------------------------------------------------------------
/Demo/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Workflow.png

--------------------------------------------------------------------------------
/Models.py:
--------------------------------------------------------------------------------
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import streamlit as st


# Mean pooling: take the attention mask into account for correct averaging,
# so that padding tokens do not contribute to the sentence embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


@st.cache_resource
def get_HF_embeddings(sentences):
    # Load model from the Hugging Face Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform mean pooling over the token embeddings
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return embeddings


@st.cache_data
def get_doc2vec_embeddings(JD, text_resume):
    nltk.download("punkt")
    data = [JD]
    resume_embeddings = []

    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

    model = Doc2Vec(vector_size=512, min_count=3, epochs=80)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
    # gensim 4.x exposes document vectors as model.dv (formerly model.docvecs)
    JD_embeddings = np.transpose(model.dv['0'].reshape(-1, 1))

    for i in text_resume:
        text = word_tokenize(i.lower())
        embeddings = model.infer_vector(text)
        resume_embeddings.append(np.transpose(embeddings.reshape(-1, 1)))
    return (JD_embeddings, resume_embeddings)


def cosine(embeddings1, embeddings2):
    # Score each resume embedding against the JD embedding
    score_list = []
    for i in embeddings1:
        matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
        matchPercentage = np.round(matchPercentage, 4) * 100  # e.g. 0.8756 -> 87.56
        print("Your resume matches about " + str(matchPercentage[0][0]) + "% of the job description.")
        score_list.append(str(matchPercentage[0][0]))
    return score_list
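
# Usage sketch (added for illustration; not part of the original module).
# The JD/resume strings below are made-up placeholders.
if __name__ == "__main__":
    jd = "Looking for a Python developer with NLP and PyTorch experience."
    cv = "Python developer; three years of NLP work with transformers and PyTorch."
    scores = cosine([get_HF_embeddings(cv)], get_HF_embeddings(jd))
    print(scores)  # a one-element list of percentage strings, e.g. ['72.3']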

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Resume Screening App
This app is built for employers screening candidates against a particular job description. Given a candidate's resume and the job description, it outputs a percentage similarity score.

App deployed on [Streamlit Community Cloud](https://soumee2000-applicant-tracking-system-application-tqrpm0.streamlit.app/)

## Intuition:
1. Get [context-aware BERT embeddings](https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b) or [doc2vec document embeddings](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) for the resume and the job description.
2. The [Hugging Face](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens) library was very useful here, along with doc2vec and NLTK.
3. Score each resume by its [cosine similarity](https://developers.google.com/machine-learning/clustering/similarity/measuring-similarity) to the job description, as sketched below.
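
The scoring step reduces to a single scikit-learn call. A minimal sketch, with random stand-in vectors in place of the real embeddings:

```
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

jd_vec = np.random.rand(1, 768)      # stand-in for the job-description embedding
resume_vec = np.random.rand(1, 768)  # stand-in for a resume embedding
score = cosine_similarity(resume_vec, jd_vec)[0][0] * 100
print(f"Resume matches about {score:.2f}% of the job description.")
```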

## Workflow:
![Workflow](Demo/Workflow.png)

## Interface
![Interface](Demo/Interface.png)
![Interface Results](Demo/Interface_Results.png)

## Usage

```
pip install -r requirements.txt
```
**Run**: ``` streamlit run application.py```

**CLI**: ``` python application.py <resume.pdf> <job_description.txt>``` scores a single resume without the web UI.

--------------------------------------------------------------------------------
/Resume_scanner.py:
--------------------------------------------------------------------------------
from Models import get_HF_embeddings, cosine, get_doc2vec_embeddings


def compare(resume_texts, JD_text, flag='HuggingFace-BERT'):
    if JD_text is None:
        return None

    if flag == 'HuggingFace-BERT':
        JD_embeddings = get_HF_embeddings(JD_text)
        resume_embeddings = [get_HF_embeddings(text) for text in resume_texts]
        return cosine(resume_embeddings, JD_embeddings)

    # Doc2Vec branch, wired to the helper Models.py already provides
    # (this was previously an unimplemented placeholder)
    if flag == 'Doc2Vec':
        JD_embeddings, resume_embeddings = get_doc2vec_embeddings(JD_text, resume_texts)
        return cosine(resume_embeddings, JD_embeddings)
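
# Usage sketch (added for illustration; the strings are placeholders):
#   scores = compare(["resume text one", "resume text two"],
#                    "job description text", flag='HuggingFace-BERT')
#   -> one percentage string per resume, e.g. ['84.1', '63.7']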

--------------------------------------------------------------------------------
/__pycache__/Models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Models.cpython-37.pyc

--------------------------------------------------------------------------------
/__pycache__/Resume_Scanner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Resume_Scanner.cpython-37.pyc

--------------------------------------------------------------------------------
/application.py:
--------------------------------------------------------------------------------
import sys
import streamlit as st
import pdfplumber
from Resume_scanner import compare


def extract_pdf_data(file_path):
    # pdfplumber accepts both filesystem paths and file-like objects
    data = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                data += text
    return data


def extract_text_data(file_path):
    with open(file_path, 'r') as file:
        return file.read()


# Command-line mode: python application.py <resume.pdf> <jd.txt>
if len(sys.argv) > 1:
    if len(sys.argv) == 3:
        resume_path = sys.argv[1]
        jd_path = sys.argv[2]

        resume_data = extract_pdf_data(resume_path)
        jd_data = extract_text_data(jd_path)

        result = compare([resume_data], jd_data, flag='HuggingFace-BERT')
        print(result)  # previously the score was computed but never shown
    else:
        print("Usage: python application.py <resume.pdf> <jd.txt>")
    sys.exit()

# Sidebar
with st.sidebar:
    st.markdown('**Which embedding do you want to use**')
    flag = st.selectbox('Which embedding do you want to use',
                        ['HuggingFace-BERT', 'Doc2Vec'],
                        label_visibility="collapsed")

# Main content
tab1, tab2 = st.tabs(["**Home**", "**Results**"])

# Tab Home
with tab1:
    st.title("Applicant Tracking System")
    uploaded_files = st.file_uploader(
        '**Choose your resume.pdf file:** ', type="pdf", accept_multiple_files=True)
    JD = st.text_area("**Enter the job description:**")
    comp_pressed = st.button("Compare!")
    if comp_pressed and uploaded_files:
        # file_uploader yields file-like objects, not paths;
        # extract_pdf_data handles them via pdfplumber
        uploaded_file_texts = [extract_pdf_data(file) for file in uploaded_files]
        score = compare(uploaded_file_texts, JD, flag)

# Tab Results
with tab2:
    st.header("Results")
    my_dict = {}
    if comp_pressed and uploaded_files:
        for i in range(len(score)):
            my_dict[uploaded_files[i].name] = score[i]
        # Rank resumes by match score, best first (scores arrive as strings)
        sorted_dict = dict(sorted(my_dict.items(), key=lambda kv: float(kv[1]), reverse=True))
        for name, match in sorted_dict.items():
            with st.expander(str(name)):
                st.write("Score is: ", match)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit
transformers
torch
pdfplumber
nltk
gensim
scikit-learn
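# Versions are unpinned; assumed minimums (untested): streamlit >= 1.18 for
# st.cache_resource / st.cache_data, gensim >= 4.0 for the model.dv attribute
# used in Models.py.
--------------------------------------------------------------------------------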