├── Demo
│   ├── Interface.png
│   ├── Interface_Results.png
│   └── Workflow.png
├── Models.py
├── README.md
├── Resume_scanner.py
├── __pycache__
│   ├── Models.cpython-37.pyc
│   └── Resume_Scanner.cpython-37.pyc
├── application.py
└── requirements.txt
/Demo/Interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface.png
--------------------------------------------------------------------------------
/Demo/Interface_Results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface_Results.png
--------------------------------------------------------------------------------
/Demo/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Workflow.png
--------------------------------------------------------------------------------
/Models.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
3 | from nltk.tokenize import word_tokenize
4 | import nltk
5 |
6 | from transformers import AutoTokenizer, AutoModel
7 | from sklearn.metrics.pairwise import cosine_similarity
8 | import torch
9 | import numpy as np
10 | import streamlit as st
11 |
12 | #Mean Pooling - Take attention mask into account for correct averaging
13 | def mean_pooling(model_output, attention_mask):
14 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings
15 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
16 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
17 |
18 |
19 | @st.cache_resource
20 | def get_HF_embeddings(sentences):
21 |
22 | # Load model from HuggingFace Hub
23 | tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
24 | model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
25 | # Tokenize sentences
26 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
27 | # Compute token embeddings
28 | with torch.no_grad():
29 | model_output = model(**encoded_input)
30 |     # Perform pooling. In this case, mean pooling (see mean_pooling above).
31 | embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
32 |
33 | # print("Sentence embeddings:")
34 | # print(embeddings)
35 | return embeddings
36 |
37 |
38 | @st.cache_data
39 | def get_doc2vec_embeddings(JD, text_resume):
40 |     nltk.download("punkt", quiet=True)  # newer nltk versions may also need the "punkt_tab" resource
41 | data = [JD]
42 | resume_embeddings = []
43 |
44 | tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
45 | #print (tagged_data)
46 |
47 |     model = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=3, epochs=80)  # trained on the JD alone
48 |     model.build_vocab(tagged_data)
49 |     model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
50 |     JD_embeddings = np.transpose(model.dv['0'].reshape(-1, 1))  # model.dv replaces model.docvecs in gensim 4
51 |
52 | for i in text_resume:
53 | text = word_tokenize(i.lower())
54 | embeddings = model.infer_vector(text)
55 | resume_embeddings.append(np.transpose(embeddings.reshape(-1,1)))
56 | return (JD_embeddings, resume_embeddings)
57 |
58 |
59 | def cosine(embeddings1, embeddings2):
60 | # get the match percentage
61 | score_list = []
62 | for i in embeddings1:
63 |         matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
64 |         matchPercentage = np.round(matchPercentage, 4) * 100  # round to two decimal places
65 |         print("Your resume matches about " + str(matchPercentage[0][0]) + "% of the job description.")
66 |         score_list.append(str(matchPercentage[0][0]))
67 | return score_list
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Resume Screening App
2 | This app is built for employers screening candidates against a particular job description. Given a candidate's resume and the job description, it outputs a percentage similarity score.
3 |
4 | App deployed on [Streamlit Community Cloud](https://soumee2000-applicant-tracking-system-application-tqrpm0.streamlit.app/)
5 |
6 | ## Intuition:
7 | 1. Get [context-aware BERT Embeddings](https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b) or [document doc2vec embeddings](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) for Resume and Job Description.
8 | 2. The [Hugging Face](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens) library was very useful, along with doc2vec and nltk.
9 | 3. Get their [cosine similarity](https://developers.google.com/machine-learning/clustering/similarity/measuring-similarity) (see the sketch below).
10 |
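A minimal sketch of how these pieces fit together, using the helper functions defined in `Models.py` (the sample strings are made-up placeholders; `get_doc2vec_embeddings` is the drop-in alternative for step 1):

```
from Models import get_HF_embeddings, cosine

jd_text = "Looking for a Python developer with NLP experience."
resume_text = "Software engineer with three years of Python and NLP projects."

# Step 1: embed the job description and the resume with the BERT sentence-transformer
jd_emb = get_HF_embeddings(jd_text)
resume_emb = get_HF_embeddings(resume_text)

# Step 3: cosine-similarity match score, returned as a list of percentage strings
print(cosine([resume_emb], jd_emb))  # e.g. ['73.45']
```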
11 | ## Workflow:
12 | ![Workflow](Demo/Workflow.png)
13 |
14 | ## Interface
15 | ![Interface](Demo/Interface.png)
16 | ![Results](Demo/Interface_Results.png)
17 |
18 | ## Usage
19 |
20 | ```
21 | pip install -r requirements.txt
22 | ```
23 | **Run**: `streamlit run application.py`
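There is also a simple command-line mode (see the top of `application.py`): pass a resume PDF and a plain-text job description, and the match score is printed to the console. The file names below are placeholders:

```
python application.py resume.pdf job_description.txt
```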
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/Resume_scanner.py:
--------------------------------------------------------------------------------
1 |
2 | import streamlit as st
3 | from Models import get_HF_embeddings, cosine, get_doc2vec_embeddings
4 |
5 | def compare(resume_texts, JD_text, flag='HuggingFace-BERT'):
6 | JD_embeddings = None
7 | resume_embeddings = []
8 |
9 |     if flag == 'HuggingFace-BERT':
10 |         if JD_text is not None:
11 |             JD_embeddings = get_HF_embeddings(JD_text)
12 |         for resume_text in resume_texts:
13 |             resume_embeddings.append(get_HF_embeddings(resume_text))
14 |     else:
15 |         # Doc2Vec: a single model is trained on the JD and resume vectors are inferred from it
16 |         if JD_text is not None:
17 |             JD_embeddings, resume_embeddings = get_doc2vec_embeddings(JD_text, resume_texts)
18 |
19 |     if JD_embeddings is not None:
20 |         cos_scores = cosine(resume_embeddings, JD_embeddings)
21 |         return cos_scores
22 |     return None
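23 |
24 | if __name__ == '__main__':
25 |     # Quick sanity check outside the Streamlit UI; the strings below are made-up examples
26 |     print(compare(["Python developer with three years of NLP experience"],
27 |                   "Hiring a Python developer to build NLP pipelines"))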
--------------------------------------------------------------------------------
/__pycache__/Models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Models.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/Resume_Scanner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Resume_Scanner.cpython-37.pyc
--------------------------------------------------------------------------------
/application.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import streamlit as st
3 | import pdfplumber
4 | from Resume_scanner import compare
5 |
6 |
7 | def extract_pdf_data(file_path):
8 | data = ""
9 | with pdfplumber.open(file_path) as pdf:
10 | for page in pdf.pages:
11 | text = page.extract_text()
12 | if text:
13 | data += text
14 | return data
15 |
16 |
17 | def extract_text_data(file_path):
18 | with open(file_path, 'r') as file:
19 | data = file.read()
20 | return data
21 |
22 |
23 | # Command-line argument processing
24 | if len(sys.argv) > 1:
25 |
26 | if len(sys.argv) == 3:
27 | resume_path = sys.argv[1]
28 | jd_path = sys.argv[2]
29 |
30 | resume_data = extract_pdf_data(resume_path)
31 | jd_data = extract_text_data(jd_path)
32 |
33 |         result = compare([resume_data], jd_data, flag='HuggingFace-BERT')
34 |         print("Match scores:", result)
35 |     sys.exit()
36 |
37 | # Sidebar
38 | flag = 'HuggingFace-BERT'
39 | with st.sidebar:
40 | st.markdown('**Which embedding do you want to use**')
41 | options = st.selectbox('Which embedding do you want to use',
42 | ['HuggingFace-BERT', 'Doc2Vec'],
43 | label_visibility="collapsed")
44 | flag = options
45 |
46 | # Main content
47 | tab1, tab2 = st.tabs(["**Home**", "**Results**"])
48 |
49 | # Tab Home
50 | with tab1:
51 | st.title("Applicant Tracking System")
52 | uploaded_files = st.file_uploader(
53 | '**Choose your resume.pdf file:** ', type="pdf", accept_multiple_files=True)
54 | JD = st.text_area("**Enter the job description:**")
55 | comp_pressed = st.button("Compare!")
56 |     if comp_pressed and uploaded_files:
57 |         # Streamlit's file_uploader returns file-like objects, not paths;
58 |         # pdfplumber can open these directly
59 |         resume_texts = [extract_pdf_data(file) for file in uploaded_files]
60 |         score = compare(resume_texts, JD, flag)
61 |
62 | # Tab Results
63 | with tab2:
64 | st.header("Results")
65 | my_dict = {}
66 |     if comp_pressed and uploaded_files:
67 |         for uploaded_file, file_score in zip(uploaded_files, score):
68 |             my_dict[uploaded_file.name] = file_score
69 |         sorted_dict = dict(sorted(my_dict.items()))  # sort results by filename
70 |         for filename, match_score in sorted_dict.items():
71 |             with st.expander(str(filename)):
72 |                 st.write("Score is: ", match_score)
73 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | transformers
3 | torch
4 | pdfplumber
5 | nltk
6 | gensim
7 | scikit-learn
8 | numpy
--------------------------------------------------------------------------------