├── Code
│   └── Resume parser.py
├── README.md
├── mail.py
├── mbl_number.py
├── name.py
├── qualification.py
├── skills.py
└── text.py

/Code/Resume parser.py:
--------------------------------------------------------------------------------
## Use to upload files in Google Colab
#from google.colab import files
#uploaded = files.upload()

# Install the dependencies first (run in a shell or a notebook cell):
#   pip install docx2txt pypdf2

import docx2txt
from PyPDF2 import PdfReader

# Extracting text from DOCX
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = ' '.join(resume_text)
    return text

# Extracting text from PDF
def pdftotext(m):
    text = ''
    with open(m, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # Loop over all the PDF pages and collect their text.
        for page in reader.pages:
            text += page.extract_text() or ''
    return text

if __name__ == '__main__':

    FilePath = 'AI.pdf'
    if FilePath.lower().endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.lower().endswith('.pdf'):
        textinput = pdftotext(FilePath)
    else:
        print('File type not supported')

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
import spacy
from spacy.matcher import Matcher

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the model's vocab
matcher = Matcher(nlp.vocab)


def extract_name(resume_text):
    nlp_text = nlp(resume_text)

    # A first name and last name usually appear as two adjacent proper nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    matcher.add('NAME', [pattern])

    matches = matcher(nlp_text)

    # Return the first two-proper-noun span as the candidate name
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

print('Name: ', extract_name(textinput))
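# Optional cross-check (a sketch, not part of the original flow): spaCy's
# built-in named-entity recognizer can validate the Matcher result. This
# assumes the small English model tags the candidate's name as a PERSON
# entity, which may not hold for every resume layout.
def extract_name_ner(resume_text):
    nlp_text = nlp(resume_text)
    for ent in nlp_text.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return None

print('Name (NER cross-check): ', extract_name_ner(textinput))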
import re
from nltk.corpus import stopwords

# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education degrees
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Strip punctuation so 'B.E.' and 'B.E' both match
            tex = re.sub(r'[?$.!,]', '', tex)
            if tex.upper() in EDUCATION and tex.lower() not in STOPWORDS:
                # Keep the matching sentence plus the next one for the year lookup
                nxt = nlp_text[index + 1] if index + 1 < len(nlp_text) else ''
                edu[tex] = text + ' ' + nxt

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(r'((20|19)\d{2})', edu[key])
        if year:
            education.append((key, year.group(0)))
        else:
            education.append(key)
    return education

print('Qualification: ', extract_education(textinput))

import pandas as pd

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # Remove stop words and tokenize
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Read the reference skill list from the CSV file
    data = pd.read_csv('skill.csv', names=['skill'])
    skills = [s.lower() for s in data.skill.tolist()]
    skillset = []

    # Check one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # Check multi-word skills via noun chunks (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)

    return [i.capitalize() for i in set(i.lower() for i in skillset)]

print('Skills: ', extract_skills(textinput))

def extract_mobile_number(resume_text):
    # Matches common US-style formats, with optional country code and extension
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)

    if phone:
        return ''.join(phone[0])
    return None

print('Mobile Number: ', extract_mobile_number(textinput))


def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

print('Mail id: ', extract_email_addresses(textinput))
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Resume-Parser-Using-NLP
## A resume parser built with Natural Language Processing (NLP) in Python.
The parser first checks the file extension (.pdf or .docx) and converts the PDF or DOCX content into plain text. It then uses the spaCy and NLTK libraries to extract the Name, Mobile Number, Mail id, Qualification and Technical Skills from the resume.

### NLP Libraries used
### 1. spaCy
spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python. spaCy is designed specifically for production use and helps you build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems.
### 2. NLTK
The Natural Language Toolkit (NLTK) is a platform for building Python programs that work with human language data in statistical natural language processing (NLP). It contains text processing libraries for tokenization, parsing, classification, stemming, tagging and semantic reasoning.
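### Example usage
A minimal end-to-end sketch of the intended flow. It is hypothetical: it assumes the extractor functions are collected into a single module named `resume_parser` and that a `skill.csv` file sits in the working directory.

```python
# Hypothetical usage; the module name `resume_parser` is an assumption.
from resume_parser import (pdftotext, extract_name, extract_mobile_number,
                           extract_email_addresses, extract_education,
                           extract_skills)

text = pdftotext('AI.pdf')
print('Name: ', extract_name(text))
print('Mobile Number: ', extract_mobile_number(text))
print('Mail id: ', extract_email_addresses(text))
print('Qualification: ', extract_education(text))
print('Skills: ', extract_skills(text))
```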
### .py files:
* Extract text: text.py
* Name: name.py
* Mobile Number: mbl_number.py
* Mail id: mail.py
* Qualification: qualification.py
* Technical Skills: skills.py
--------------------------------------------------------------------------------

/mail.py:
--------------------------------------------------------------------------------
import re

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

# `textinput` is the resume text produced by text.py
print('Mail id: ', extract_email_addresses(textinput))
--------------------------------------------------------------------------------

/mbl_number.py:
--------------------------------------------------------------------------------
import re

def extract_mobile_number(resume_text):
    # Matches common US-style formats, with optional country code and extension
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)

    if phone:
        return ''.join(phone[0])
    return None

# `textinput` is the resume text produced by text.py
print('Mobile Number: ', extract_mobile_number(textinput))
--------------------------------------------------------------------------------

/name.py:
--------------------------------------------------------------------------------
import spacy
from spacy.matcher import Matcher

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the model's vocab
matcher = Matcher(nlp.vocab)


def extract_name(resume_text):
    nlp_text = nlp(resume_text)

    # A first name and last name usually appear as two adjacent proper nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    matcher.add('NAME', [pattern])

    matches = matcher(nlp_text)

    # Return the first two-proper-noun span as the candidate name
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

# `textinput` is the resume text produced by text.py
print('Name: ', extract_name(textinput))
--------------------------------------------------------------------------------

/qualification.py:
--------------------------------------------------------------------------------
import re
import nltk
import spacy

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education degrees
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Strip punctuation so 'B.E.' and 'B.E' both match
            tex = re.sub(r'[?$.!,]', '', tex)
            if tex.upper() in EDUCATION and tex.lower() not in STOPWORDS:
                # Keep the matching sentence plus the next one for the year lookup
                nxt = nlp_text[index + 1] if index + 1 < len(nlp_text) else ''
                edu[tex] = text + ' ' + nxt

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(r'((20|19)\d{2})', edu[key])
        if year:
            education.append((key, year.group(0)))
        else:
            education.append(key)
    return education

# `textinput` is the resume text produced by text.py
print('Qualification: ', extract_education(textinput))
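# Quick self-check of the year pattern used above (a sketch, assuming
# graduation years appear as plain four-digit tokens such as "2016"):
sample = 'B.E Computer Science, Anna University, 2016'
match = re.search(r'((20|19)\d{2})', sample)
print('Year self-check:', match.group(0) if match else 'no year found')  # -> 2016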
--------------------------------------------------------------------------------

/skills.py:
--------------------------------------------------------------------------------
import pandas as pd
import spacy

# Load the pre-trained English model
nlp = spacy.load('en_core_web_sm')

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # Remove stop words and tokenize
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # Read the reference skill list from the CSV file
    data = pd.read_csv('skill.csv', names=['skill'])
    skills = [s.lower() for s in data.skill.tolist()]
    skillset = []

    # Check one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # Check multi-word skills via noun chunks (example: machine learning)
    for chunk in nlp_text.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        if chunk_text in skills:
            skillset.append(chunk_text)

    return [i.capitalize() for i in set(i.lower() for i in skillset)]

# `textinput` is the resume text produced by text.py
print('Skills: ', extract_skills(textinput))
--------------------------------------------------------------------------------

/text.py:
--------------------------------------------------------------------------------
# Install the dependencies first (run in a shell or a notebook cell):
#   pip install docx2txt pypdf2

import docx2txt
from PyPDF2 import PdfReader

# Extracting text from DOCX
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = ' '.join(resume_text)
    return text

# Extracting text from PDF
def pdftotext(m):
    text = ''
    with open(m, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)
        # Loop over all the PDF pages and collect their text.
        for page in reader.pages:
            text += page.extract_text() or ''
    return text

# main function
if __name__ == '__main__':

    FilePath = 'AI.pdf'
    if FilePath.lower().endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.lower().endswith('.pdf'):
        textinput = pdftotext(FilePath)
    else:
        print('File type not supported')
--------------------------------------------------------------------------------
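A tiny self-contained demo of the skill-matching idea used in skills.py (a sketch: it uses an in-memory skill list instead of the skill.csv file, and the sample sentence is made up):

import spacy

nlp = spacy.load('en_core_web_sm')
skills = ['python', 'sql', 'machine learning']
doc = nlp('Worked on Python and SQL pipelines for machine learning.')

# One-gram matches come from tokens, multi-word matches from noun chunks
found = {tok.text.lower() for tok in doc if tok.text.lower() in skills}
found |= {chunk.text.lower().strip() for chunk in doc.noun_chunks
          if chunk.text.lower().strip() in skills}
print(sorted(found))  # expected: ['machine learning', 'python', 'sql']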