├── Demo
│   ├── Interface.png
│   ├── Interface_Results.png
│   └── Workflow.png
├── Models.py
├── README.md
├── Resume_scanner.py
├── __pycache__
│   ├── Models.cpython-37.pyc
│   └── Resume_Scanner.cpython-37.pyc
├── application.py
└── requirements.txt

/Demo/Interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface.png

--------------------------------------------------------------------------------
/Demo/Interface_Results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Interface_Results.png

--------------------------------------------------------------------------------
/Demo/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/Demo/Workflow.png

--------------------------------------------------------------------------------
/Models.py:
--------------------------------------------------------------------------------
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import streamlit as st


# Mean pooling: take the attention mask into account for correct averaging,
# so that padding tokens do not contribute to the sentence embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


@st.cache_resource
def get_HF_embeddings(sentences):
    # Load model from the Hugging Face Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform mean pooling over the token embeddings
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return embeddings


@st.cache_data
def get_doc2vec_embeddings(JD, text_resume):
    nltk.download("punkt")
    data = [JD]
    resume_embeddings = []

    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

    model = Doc2Vec(vector_size=512, min_count=3, epochs=80)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
    # gensim 4.x exposes document vectors as model.dv (formerly model.docvecs)
    JD_embeddings = np.transpose(model.dv['0'].reshape(-1, 1))

    for i in text_resume:
        text = word_tokenize(i.lower())
        embeddings = model.infer_vector(text)
        resume_embeddings.append(np.transpose(embeddings.reshape(-1, 1)))
    return (JD_embeddings, resume_embeddings)


def cosine(embeddings1, embeddings2):
    # Score each resume embedding against the JD embedding
    score_list = []
    for i in embeddings1:
        matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
        matchPercentage = np.round(matchPercentage, 4) * 100  # e.g. 0.8756 -> 87.56
        print("Your resume matches about " + str(matchPercentage[0][0]) + "% of the job description.")
        score_list.append(str(matchPercentage[0][0]))
    return score_list
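
# Usage sketch (added for illustration; not part of the original module).
# The JD/resume strings below are made-up placeholders.
if __name__ == "__main__":
    jd = "Looking for a Python developer with NLP and PyTorch experience."
    cv = "Python developer; three years of NLP work with transformers and PyTorch."
    scores = cosine([get_HF_embeddings(cv)], get_HF_embeddings(jd))
    print(scores)  # a one-element list of percentage strings, e.g. ['72.3']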

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Resume Screening App
This app is built for employers screening candidates against a particular job description. Given a candidate's resume and the job description, it outputs a percentage similarity score.

App deployed on [Streamlit Community Cloud](https://soumee2000-applicant-tracking-system-application-tqrpm0.streamlit.app/)

## Intuition:
1. Get [context-aware BERT embeddings](https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b) or [doc2vec document embeddings](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) for the resume and the job description.
2. The [Hugging Face](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens) library was very useful here, along with doc2vec and NLTK.
3. Score each resume by its [cosine similarity](https://developers.google.com/machine-learning/clustering/similarity/measuring-similarity) to the job description, as sketched below.
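
The scoring step reduces to a single scikit-learn call. A minimal sketch, with random stand-in vectors in place of the real embeddings:

```
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

jd_vec = np.random.rand(1, 768)      # stand-in for the job-description embedding
resume_vec = np.random.rand(1, 768)  # stand-in for a resume embedding
score = cosine_similarity(resume_vec, jd_vec)[0][0] * 100
print(f"Resume matches about {score:.2f}% of the job description.")
```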

## Workflow:
![Workflow](Demo/Workflow.png)

## Interface
![Interface](Demo/Interface.png)
![Interface Results](Demo/Interface_Results.png)

## Usage

```
pip install -r requirements.txt
```
**Run**: ``` streamlit run application.py```

**CLI**: ``` python application.py <resume.pdf> <job_description.txt>``` scores a single resume without the web UI.

--------------------------------------------------------------------------------
/Resume_scanner.py:
--------------------------------------------------------------------------------
from Models import get_HF_embeddings, cosine, get_doc2vec_embeddings


def compare(resume_texts, JD_text, flag='HuggingFace-BERT'):
    if JD_text is None:
        return None

    if flag == 'HuggingFace-BERT':
        JD_embeddings = get_HF_embeddings(JD_text)
        resume_embeddings = [get_HF_embeddings(text) for text in resume_texts]
        return cosine(resume_embeddings, JD_embeddings)

    # Doc2Vec branch, wired to the helper Models.py already provides
    # (this was previously an unimplemented placeholder)
    if flag == 'Doc2Vec':
        JD_embeddings, resume_embeddings = get_doc2vec_embeddings(JD_text, resume_texts)
        return cosine(resume_embeddings, JD_embeddings)
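
# Usage sketch (added for illustration; the strings are placeholders):
#   scores = compare(["resume text one", "resume text two"],
#                    "job description text", flag='HuggingFace-BERT')
#   -> one percentage string per resume, e.g. ['84.1', '63.7']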

--------------------------------------------------------------------------------
/__pycache__/Models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Models.cpython-37.pyc

--------------------------------------------------------------------------------
/__pycache__/Resume_Scanner.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SOUMEE2000/Applicant_Tracking_System/564da0b0af167e48abaee4dca30ef4a6d6f379a6/__pycache__/Resume_Scanner.cpython-37.pyc

--------------------------------------------------------------------------------
/application.py:
--------------------------------------------------------------------------------
import sys
import streamlit as st
import pdfplumber
from Resume_scanner import compare


def extract_pdf_data(file_path):
    # pdfplumber accepts both filesystem paths and file-like objects
    data = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                data += text
    return data


def extract_text_data(file_path):
    with open(file_path, 'r') as file:
        return file.read()


# Command-line mode: python application.py <resume.pdf> <jd.txt>
if len(sys.argv) > 1:
    if len(sys.argv) == 3:
        resume_path = sys.argv[1]
        jd_path = sys.argv[2]

        resume_data = extract_pdf_data(resume_path)
        jd_data = extract_text_data(jd_path)

        result = compare([resume_data], jd_data, flag='HuggingFace-BERT')
        print(result)  # previously the score was computed but never shown
    else:
        print("Usage: python application.py <resume.pdf> <jd.txt>")
    sys.exit()

# Sidebar
with st.sidebar:
    st.markdown('**Which embedding do you want to use**')
    flag = st.selectbox('Which embedding do you want to use',
                        ['HuggingFace-BERT', 'Doc2Vec'],
                        label_visibility="collapsed")

# Main content
tab1, tab2 = st.tabs(["**Home**", "**Results**"])

# Tab Home
with tab1:
    st.title("Applicant Tracking System")
    uploaded_files = st.file_uploader(
        '**Choose your resume.pdf file:** ', type="pdf", accept_multiple_files=True)
    JD = st.text_area("**Enter the job description:**")
    comp_pressed = st.button("Compare!")
    if comp_pressed and uploaded_files:
        # file_uploader yields file-like objects, not paths;
        # extract_pdf_data handles them via pdfplumber
        uploaded_file_texts = [extract_pdf_data(file) for file in uploaded_files]
        score = compare(uploaded_file_texts, JD, flag)

# Tab Results
with tab2:
    st.header("Results")
    my_dict = {}
    if comp_pressed and uploaded_files:
        for i in range(len(score)):
            my_dict[uploaded_files[i].name] = score[i]
        # Rank resumes by match score, best first (scores arrive as strings)
        sorted_dict = dict(sorted(my_dict.items(), key=lambda kv: float(kv[1]), reverse=True))
        for name, match in sorted_dict.items():
            with st.expander(str(name)):
                st.write("Score is: ", match)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit
transformers
torch
pdfplumber
nltk
gensim
scikit-learn
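# Versions are unpinned; assumed minimums (untested): streamlit >= 1.18 for
# st.cache_resource / st.cache_data, gensim >= 4.0 for the model.dv attribute
# used in Models.py.
--------------------------------------------------------------------------------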