├── src
│   ├── __init__.py
│   ├── ML_Pipeline
│   │   ├── dataset.py
│   │   ├── utils.py
│   │   ├── predict_model.py
│   │   ├── text_extractor.py
│   │   ├── entity_extractor.py
│   │   ├── json_spacy.py
│   │   └── train_model.py
│   └── engine.py
├── output
│   ├── Resume 2.pdf
│   └── Alice Clark CV.pdf
└── README.md

/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/output/Resume 2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Abhimanyu9539/NLP_Resume_Parser/HEAD/output/Resume 2.pdf
--------------------------------------------------------------------------------
/output/Alice Clark CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Abhimanyu9539/NLP_Resume_Parser/HEAD/output/Alice Clark CV.pdf
--------------------------------------------------------------------------------
/src/ML_Pipeline/dataset.py:
--------------------------------------------------------------------------------
from ML_Pipeline import json_spacy

# function to read the train and test data
# (the JSON file paths are placeholders to be filled in)
def read_data():
    train = json_spacy.convert_data_to_spacy("")  # train file
    test = json_spacy.convert_data_to_spacy("")   # test file
    return train, test
--------------------------------------------------------------------------------
/src/ML_Pipeline/utils.py:
--------------------------------------------------------------------------------
import spacy

# function to check whether a previously trained model exists on disk
def check_existing_model(model_name):  # take the model name as input
    try:
        spacy.load(model_name)
        print("Model Exists. Updating the model")
        return model_name
    except Exception:
        print("Model by this name does not exist. Building a new one")
        return None
--------------------------------------------------------------------------------
/src/ML_Pipeline/predict_model.py:
--------------------------------------------------------------------------------
import spacy
from ML_Pipeline import text_extractor

# function for prediction: run the trained NER model over every PDF in a directory
def predict(path):
    # output = {}
    nlp = spacy.load("model")  # load the trained model
    test_text = text_extractor.convert_pdf_to_text(path)  # extract text from the PDFs
    for text in test_text:
        text = text.replace('\n', ' ')  # flatten line breaks
        doc = nlp(text)
        for ent in doc.ents:
            print(f'{ent.label_.upper():{30}}-{ent.text}')
            # output[ent.label_.upper()] = ent.text
    # return output
--------------------------------------------------------------------------------
/src/engine.py:
--------------------------------------------------------------------------------
from ML_Pipeline import json_spacy
from ML_Pipeline import train_model
from ML_Pipeline import predict_model
from ML_Pipeline import utils

##### First, let's create training data out of the tagged data #####

train = json_spacy.convert_data_to_spacy(r"../input/training/Entity Recognition in Resumes.json")

# print(train[0])
print("Done. Converted into spaCy format")

print("Checking if a previously built spaCy model exists. If not, we will train a new one")

model = utils.check_existing_model("nlp_model")

model = train_model.build_spacy_model(train, model)

predict_model.predict("../output/")
--------------------------------------------------------------------------------
/src/ML_Pipeline/text_extractor.py:
--------------------------------------------------------------------------------
from tika import parser
import os

# function for text conversion: walk a directory and extract the text of every PDF
def convert_pdf_to_text(directory):
    output = []
    for root, dirs, files in os.walk(directory):
        print(files)
        for file in files:
            path_to_pdf = os.path.join(root, file)
            stem, ext = os.path.splitext(path_to_pdf)
            if ext == '.pdf':
                print("Processing " + path_to_pdf)
                pdf_contents = parser.from_file(path_to_pdf, service='text')
                # Optionally write the extracted text next to the PDF:
                # path_to_txt = stem + '.txt'
                # with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                #     print("Writing contents to " + path_to_txt)
                #     txt_file.write(pdf_contents['content'])
                output.append(pdf_contents['content'])
    return output
--------------------------------------------------------------------------------
/src/ML_Pipeline/entity_extractor.py:
--------------------------------------------------------------------------------
import spacy
import re

# Function to extract names from a string using spaCy's multilingual NER model
def extract_name(string):
    text = str(string)
    nlp = spacy.load('xx_ent_wiki_sm')  # load the multilingual entity model
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'PER':
            print(ent.text)
            break

# Function to extract phone numbers from a string using regular expressions
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

# Function to extract email addresses from a string using regular expressions
def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)
--------------------------------------------------------------------------------
/src/ML_Pipeline/json_spacy.py:
--------------------------------------------------------------------------------
import json
import logging

# function to convert the annotated JSON data into spaCy's training format
def convert_data_to_spacy(JSON_FilePath):
    try:
        training_data = []  # list of (text, {"entities": [...]}) tuples
        with open(JSON_FilePath, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                # only a single point per text annotation
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both a list of labels and a single label
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    entities.append((point['start'], point['end'] + 1, label))

            training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:  # in case of an exception, log it and return None
        logging.exception("Unable to process " + JSON_FilePath + "\n" + "error = " + str(e))
        return None
--------------------------------------------------------------------------------
/src/ML_Pipeline/train_model.py:
--------------------------------------------------------------------------------
import spacy
import random

# function to train (or update) the spaCy NER model - uses the spaCy v2 training API
def build_spacy_model(train, model):

    if model is not None:
        nlp = spacy.load(model)  # load an existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create a blank Language class
        print("Created blank 'en' model")

    TRAIN_DATA = train

    # create the built-in 'ner' pipeline component and add it to the pipeline,
    # or reuse it if it is already present
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add the labels seen in the training data
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get the names of the other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()
        for itn in range(2):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # (a minibatch-based loop using spacy.util.minibatch/compounding
            #  could be used here instead of updating one example at a time)
            for text, annotations in TRAIN_DATA:
                try:
                    nlp.update(
                        [text],         # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,       # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update the weights
                        losses=losses)
                except Exception:
                    # skip examples that spaCy cannot align with the text
                    pass
            print(losses)

    nlp.to_disk("model")
    return nlp
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NLP_Resume_Parser

NLP Project to Build a Resume Parser in Python using SpaCy


## NLP Tools and Techniques Used:

### Tokenization

Tokenization is the process of splitting textual data into smaller pieces called tokens. A sentence can be broken into tokens of words or of characters; the choice depends on the problem being solved. It is usually the first step in any NLP project, and that is the case for this resume parser as well. Tokenization also feeds the later steps of an NLP pipeline, which typically weight each word by its significance in the corpus.
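As a quick illustration (this snippet is not part of the project source), tokenization with SpaCy looks roughly like the sketch below; it assumes the small English model `en_core_web_sm` has been downloaded via `python -m spacy download en_core_web_sm`:

```python
import spacy

# Load a small English pipeline (assumed to be installed separately).
nlp = spacy.load("en_core_web_sm")

doc = nlp("Alice Clark has 9 years of experience in data science.")

# Each token is a word-level piece of the original text.
print([token.text for token in doc])
# ['Alice', 'Clark', 'has', '9', 'years', 'of', 'experience', 'in', 'data', 'science', '.']
```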


### Lemmatization

The larger goal of this resume parsing Python application is to decode the semantics of the text, and for that the particular form of a verb has little impact. Lemmatization is therefore used to convert words into their root form, called the 'lemma.' For example, 'drive,' 'driving,' and 'drove' all share the same lemma, 'drive.'
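Again as an illustrative sketch (assuming the same `en_core_web_sm` model), SpaCy exposes the lemma of every token through `token.lemma_`:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the small English model is installed
doc = nlp("She drove to the office and was driving back by noon.")

# token.lemma_ holds the root form of each word;
# 'drove' and 'driving' should both map to the lemma 'drive'.
for token in doc:
    print(f"{token.text:<10} -> {token.lemma_}")
```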


### Parts-of-Speech Tagging

Consider the word "Apple": it can have two meanings in a sentence. Depending on whether it is used as a proper noun or a common noun, you know whether the sentence is about the multinational tech company or the fruit. This CV parser Python project shows how POS tagging is implemented in Python.


### Stopwords Elimination

Stopwords are words like 'a', 'the', 'am', and 'is' that add little meaning to a sentence. They are usually removed to save processing power and time. In a CV, an applicant may describe their work experience in long paragraphs containing many stopwords. For such cases, it becomes essential to know how to extract experience from a resume in Python, which you will learn in this project.
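The sketch below (illustrative only, again assuming `en_core_web_sm`) combines the last two ideas: `token.pos_` gives the part of speech, and `token.is_stop` flags stopwords so they can be filtered out:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
doc = nlp("Apple is looking at buying a startup in the UK for $1 billion.")

# Part-of-speech tag for every token ('Apple' should come out as PROPN here).
for token in doc:
    print(f"{token.text:<10} {token.pos_}")

# Drop stopwords and punctuation before any downstream processing.
content_words = [t.text for t in doc if not t.is_stop and not t.is_punct]
print(content_words)
```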


### SpaCy

SpaCy is a Python library that is widely used in NLP projects by data scientists because it offers quick implementations of the techniques mentioned above. Additionally, one can use SpaCy to visualize entities in text data through its built-in visualizer, displacy. Furthermore, SpaCy supports rule-based matching, shallow parsing, dependency parsing, and more. This NLP resume parser project will guide you on using SpaCy for Named Entity Recognition (NER).


### OCR using Tika

This project uses Apache Tika, an open-source library, to implement OCR. OCR stands for optical character recognition: it converts images and documents into text, and it is used in this resume extraction Python project to decode the textual information in the PDF files. The extracted text is then processed with the NLP methods above to extract meaningful information.

--------------------------------------------------------------------------------
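As a closing illustration of how the files above fit together, here is a minimal sketch (not part of the repository) that extracts the text of one of the sample resumes with Tika and runs the trained NER model over it. It assumes the model has already been trained and saved to the `model` directory by `train_model.py`, and that a Java runtime is available for the Tika server that tika-python starts on first use:

```python
from tika import parser  # tika-python starts a local Tika server on first use
import spacy

# 1. Text extraction: pull the raw text out of a resume PDF.
#    The path mirrors this repository's layout; adjust it to your setup.
pdf = parser.from_file("../output/Resume 2.pdf", service="text")
resume_text = (pdf.get("content") or "").replace("\n", " ")

# 2. Named Entity Recognition: load the model saved by train_model.build_spacy_model
#    (written to the "model" directory) and print the entities it finds.
nlp = spacy.load("model")
doc = nlp(resume_text)
for ent in doc.ents:
    print(f"{ent.label_.upper():<30} {ent.text}")
```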