├── Code
│   └── Resume parser.py
├── README.md
├── mail.py
├── mbl_number.py
├── name.py
├── qualification.py
├── skills.py
└── text.py

/Code/Resume parser.py:
--------------------------------------------------------------------------------
## Use to upload files in Google Colab
#from google.colab import files
#uploaded = files.upload()

# Install the dependencies first (run in a shell or a notebook cell):
#   pip install docx2txt pypdf2

import docx2txt
from PyPDF2 import PdfReader

# Extracting text from DOCX
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = ' '.join(resume_text)
    return text

# Extracting text from PDF
def pdftotext(m):
    text = ''
    with open(m, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # Loop over all the PDF pages and collect their text.
        for page in reader.pages:
            text += page.extract_text() or ''
    return text

if __name__ == '__main__':

    FilePath = 'AI.pdf'
    if FilePath.lower().endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.lower().endswith('.pdf'):
        textinput = pdftotext(FilePath)
    else:
        print('File type not supported')

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
import spacy
from spacy.matcher import Matcher

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the model's vocab
matcher = Matcher(nlp.vocab)


def extract_name(resume_text):
    nlp_text = nlp(resume_text)

    # A first name and last name usually appear as two adjacent proper nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    matcher.add('NAME', [pattern])

    matches = matcher(nlp_text)

    # Return the first two-proper-noun span as the candidate name
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

print('Name: ', extract_name(textinput))
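# Optional cross-check (a sketch, not part of the original flow): spaCy's
# built-in named-entity recognizer can validate the Matcher result. This
# assumes the small English model tags the candidate's name as a PERSON
# entity, which may not hold for every resume layout.
def extract_name_ner(resume_text):
    nlp_text = nlp(resume_text)
    for ent in nlp_text.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return None

print('Name (NER cross-check): ', extract_name_ner(textinput))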
import re
from nltk.corpus import stopwords

# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education degrees
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Strip punctuation so 'B.E.' and 'B.E' both match
            tex = re.sub(r'[?$.!,]', '', tex)
            if tex.upper() in EDUCATION and tex.lower() not in STOPWORDS:
                # Keep the matching sentence plus the next one for the year lookup
                nxt = nlp_text[index + 1] if index + 1 < len(nlp_text) else ''
                edu[tex] = text + ' ' + nxt

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(r'((20|19)\d{2})', edu[key])
        if year:
            education.append((key, year.group(0)))
        else:
            education.append(key)
    return education

print('Qualification: ', extract_education(textinput))

import pandas as pd

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # Remove stop words and tokenize
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Read the reference skill list from the CSV file
    data = pd.read_csv('skill.csv', names=['skill'])
    skills = [s.lower() for s in data.skill.tolist()]
    skillset = []

    # Check one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # Check multi-word skills via noun chunks (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)

    return [i.capitalize() for i in set(i.lower() for i in skillset)]

print('Skills: ', extract_skills(textinput))

def extract_mobile_number(resume_text):
    # Matches common US-style formats, with optional country code and extension
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)

    if phone:
        return ''.join(phone[0])
    return None

print('Mobile Number: ', extract_mobile_number(textinput))


def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

print('Mail id: ', extract_email_addresses(textinput))
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Resume-Parser-Using-NLP
## A resume parser built with Natural Language Processing (NLP) in Python.
The parser first checks the file extension (.pdf or .docx) and converts the PDF or DOCX content into plain text. It then uses the spaCy and NLTK libraries to extract the Name, Mobile Number, Mail id, Qualification and Technical Skills from the resume.

### NLP Libraries used
### 1. spaCy
spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python. spaCy is designed specifically for production use and helps you build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems.
### 2. NLTK
The Natural Language Toolkit (NLTK) is a platform for building Python programs that work with human language data in statistical natural language processing (NLP). It contains text processing libraries for tokenization, parsing, classification, stemming, tagging and semantic reasoning.
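### Example usage
A minimal end-to-end sketch of the intended flow. It is hypothetical: it assumes the extractor functions are collected into a single module named `resume_parser` and that a `skill.csv` file sits in the working directory.

```python
# Hypothetical usage; the module name `resume_parser` is an assumption.
from resume_parser import (pdftotext, extract_name, extract_mobile_number,
                           extract_email_addresses, extract_education,
                           extract_skills)

text = pdftotext('AI.pdf')
print('Name: ', extract_name(text))
print('Mobile Number: ', extract_mobile_number(text))
print('Mail id: ', extract_email_addresses(text))
print('Qualification: ', extract_education(text))
print('Skills: ', extract_skills(text))
```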
### .py files:
* Extract text: text.py
* Name: name.py
* Mobile Number: mbl_number.py
* Mail id: mail.py
* Qualification: qualification.py
* Technical Skills: skills.py
--------------------------------------------------------------------------------

/mail.py:
--------------------------------------------------------------------------------
import re

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

# `textinput` is the resume text produced by text.py
print('Mail id: ', extract_email_addresses(textinput))
--------------------------------------------------------------------------------

/mbl_number.py:
--------------------------------------------------------------------------------
import re

def extract_mobile_number(resume_text):
    # Matches common US-style formats, with optional country code and extension
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)

    if phone:
        return ''.join(phone[0])
    return None

# `textinput` is the resume text produced by text.py
print('Mobile Number: ', extract_mobile_number(textinput))
--------------------------------------------------------------------------------

/name.py:
--------------------------------------------------------------------------------
import spacy
from spacy.matcher import Matcher

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the model's vocab
matcher = Matcher(nlp.vocab)


def extract_name(resume_text):
    nlp_text = nlp(resume_text)

    # A first name and last name usually appear as two adjacent proper nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    matcher.add('NAME', [pattern])

    matches = matcher(nlp_text)

    # Return the first two-proper-noun span as the candidate name
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

# `textinput` is the resume text produced by text.py
print('Name: ', extract_name(textinput))
--------------------------------------------------------------------------------

/qualification.py:
--------------------------------------------------------------------------------
import re
import nltk
import spacy

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education degrees
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Strip punctuation so 'B.E.' and 'B.E' both match
            tex = re.sub(r'[?$.!,]', '', tex)
            if tex.upper() in EDUCATION and tex.lower() not in STOPWORDS:
                # Keep the matching sentence plus the next one for the year lookup
                nxt = nlp_text[index + 1] if index + 1 < len(nlp_text) else ''
                edu[tex] = text + ' ' + nxt

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(r'((20|19)\d{2})', edu[key])
        if year:
            education.append((key, year.group(0)))
        else:
            education.append(key)
    return education

# `textinput` is the resume text produced by text.py
print('Qualification: ', extract_education(textinput))
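# Quick self-check of the year pattern used above (a sketch, assuming
# graduation years appear as plain four-digit tokens such as "2016"):
sample = 'B.E Computer Science, Anna University, 2016'
match = re.search(r'((20|19)\d{2})', sample)
print('Year self-check:', match.group(0) if match else 'no year found')  # -> 2016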
--------------------------------------------------------------------------------

/skills.py:
--------------------------------------------------------------------------------
import pandas as pd
import spacy

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # Remove stop words and tokenize
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Read the reference skill list from the CSV file
    data = pd.read_csv('skill.csv', names=['skill'])
    skills = [s.lower() for s in data.skill.tolist()]
    skillset = []

    # Check one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # Check multi-word skills via noun chunks (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)

    return [i.capitalize() for i in set(i.lower() for i in skillset)]

# `textinput` is the resume text produced by text.py
print('Skills: ', extract_skills(textinput))
--------------------------------------------------------------------------------

/text.py:
--------------------------------------------------------------------------------
# Install the dependencies first (run in a shell or a notebook cell):
#   pip install docx2txt pypdf2

import docx2txt
from PyPDF2 import PdfReader

# Extracting text from DOCX
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = ' '.join(resume_text)
    return text

# Extracting text from PDF
def pdftotext(m):
    text = ''
    with open(m, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # Loop over all the PDF pages and collect their text.
        for page in reader.pages:
            text += page.extract_text() or ''
    return text

# main function
if __name__ == '__main__':

    FilePath = 'AI.pdf'
    if FilePath.lower().endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.lower().endswith('.pdf'):
        textinput = pdftotext(FilePath)
    else:
        print('File type not supported')
--------------------------------------------------------------------------------
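A tiny self-contained demo of the skill-matching idea used in skills.py (a sketch: it uses an in-memory skill list instead of the skill.csv file, and the sample sentence is made up):

import spacy

nlp = spacy.load('en_core_web_sm')
skills = ['python', 'sql', 'machine learning']
doc = nlp('Worked on Python and SQL pipelines for machine learning.')

# One-gram matches come from tokens, multi-word matches from noun chunks
found = {tok.text.lower() for tok in doc if tok.text.lower() in skills}
found |= {chunk.text.lower().strip() for chunk in doc.noun_chunks
          if chunk.text.lower().strip() in skills}
print(sorted(found))  # expected: ['machine learning', 'python', 'sql']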