├── CVs
    ├── .gitkeep
    ├── Ali Abuharb's CV.pdf
    └── Deemah_Alabdulaali_Resume.pdf
├── Output
    ├── CVs_Info_Extracted.xlsx
    └── CVs_Info_Extracted.csv
├── ResumeGPT_Workflow
    └── ResumeGPT_Workflow.PNG
├── .gitignore
├── requirements.txt
├── LICENSE
├── ResumeGPT
    ├── main.py
    ├── OCR_Reader.py
    └── ChatGPT_Pipeline.py
├── Engineered_Prompt
    └── Prompt.txt
└── README.md

/CVs/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/CVs/Ali Abuharb's CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/CVs/Ali Abuharb's CV.pdf
--------------------------------------------------------------------------------
/Output/CVs_Info_Extracted.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/Output/CVs_Info_Extracted.xlsx
--------------------------------------------------------------------------------
/CVs/Deemah_Alabdulaali_Resume.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/CVs/Deemah_Alabdulaali_Resume.pdf
--------------------------------------------------------------------------------
/ResumeGPT_Workflow/ResumeGPT_Workflow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/ResumeGPT_Workflow/ResumeGPT_Workflow.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | CVs/*.pdf
2 | !CVs/Ali\ Abuharb's\ CV.pdf
3 | !CVs/Deemah_Alabdulaali_Resume.pdf
4 | Notebooks
5 | resumegpt_venv
6 | ResumeGPT_Workflow/ResumeGPT_Workflow.pptx
7 | __pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.4
2 | aiosignal==1.3.1
3 | async-timeout==4.0.2
4 | attrs==23.1.0
5 | certifi==2023.5.7
6 | charset-normalizer==3.1.0
7 | colorama==0.4.6
8 | et-xmlfile==1.1.0
9 | frozenlist==1.3.3
10 | idna==3.4
11 | multidict==6.0.4
12 | numpy==1.24.3
13 | openai==0.27.7
14 | openpyxl==3.1.2
15 | pandas==2.0.1
16 | PyPDF2==3.0.1
17 | python-dateutil==2.8.2
18 | pytz==2023.3
19 | requests==2.31.0
20 | six==1.16.0
21 | tqdm==4.65.0
22 | typing_extensions==4.5.0
23 | tzdata==2023.3
24 | urllib3==2.0.2
25 | yarl==1.9.2
26 | 
--------------------------------------------------------------------------------
/Output/CVs_Info_Extracted.csv:
--------------------------------------------------------------------------------
1 | CV_Filename,Education Bachelor University,Education Bachelor GPA,Education Bachelor Major,Education Bachelor Graduation Date,Education Masters University,Education Masters GPA,Education Masters Major,Education Masters Graduation Date,Education PhD University,Education PhD GPA,Education PhD Major,Education PhD Graduation Date,Years of Experience,Experience Companies,Top 5 Responsibilities/Projects,Top 5 Courses/Certifications,Top 3 Technical Skills,Top 3 Soft Skills,Current Employment Status,Nationality,Current Residence,Suitable Position,Candidate Rating (Out of 10),All_Info_JSON
2 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Ali Abuharb
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/ResumeGPT/main.py:
--------------------------------------------------------------------------------
1 | # Import modules:
2 | # 1. CVsReader from the OCR_Reader module, which is used for reading CVs
3 | # 2. CVsInfoExtractor from the ChatGPT_Pipeline module, which is used for extracting specific information from the CVs
4 | from OCR_Reader import CVsReader
5 | from ChatGPT_Pipeline import CVsInfoExtractor
6 | import sys
7 | 
8 | # Fetching command line arguments
9 | cvs_directory_path_arg, openai_api_key_arg, desired_positions_arg = sys.argv[1], sys.argv[2], sys.argv[3].split(",")
10 | 
11 | # Remove any leading or trailing whitespace from each desired position
12 | desired_positions = [position.strip() for position in desired_positions_arg]
13 | 
14 | 
15 | # Create an instance of CVsReader.
16 | # It takes the cvs_directory_path argument, which represents the directory where the CV files are located.
17 | cvs_reader = CVsReader(cvs_directory_path = cvs_directory_path_arg)
18 | 
19 | # Use the read_cv method of the CVsReader instance to read all CVs in the specified directory.
20 | # The result is a dataframe where each row corresponds to a different CV's file name and content.
21 | cvs_content_df = cvs_reader.read_cv()
22 | 
23 | # Create an instance of CVsInfoExtractor.
24 | # It takes the dataframe returned by the read_cv method of the CVsReader instance, the OpenAI API key, and the desired positions as a list.
25 | cvs_info_extractor = CVsInfoExtractor(cvs_df = cvs_content_df, openai_api_key = openai_api_key_arg, desired_positions = desired_positions)
26 | 
27 | # Use the extract_cv_info method of the CVsInfoExtractor instance to extract the desired information from the CVs.
28 | # This method returns a single dataframe in which each row holds the information extracted from one CV.
29 | extract_cv_info_dfs = cvs_info_extractor.extract_cv_info()
--------------------------------------------------------------------------------
/Engineered_Prompt/Prompt.txt:
--------------------------------------------------------------------------------
1 | You are a human resource specialist who is responsible for reviewing candidates' CVs. You will be given the CV of the candidate and your job is to extract the information mentioned below. Also, you must follow the desired output format.
2 | 
3 | Information To Extract:
4 | 1. Education Bachelor University: name of university where bachelor degree was taken
5 | 2. Education Bachelor GPA: GPA of bachelor degree (Example: 4.5/5)
6 | 3. Education Bachelor Major: major of bachelor degree
7 | 4. Education Bachelor Graduation Date: date of graduation from bachelor degree (in format: Month_Name, YYYY)
8 | 5. Education Masters University: name of university where masters degree was taken
9 | 6. Education Masters GPA: GPA of masters degree (Example: 4.5/5)
10 | 7. Education Masters Major: major of masters degree
11 | 8. Education Masters Graduation Date: date of graduation from masters degree (in format: Month_Name, YYYY)
12 | 9. Education PhD University: name of university where PhD degree was taken
13 | 10. Education PhD GPA: GPA of PhD degree (Example: 4.5/5)
14 | 11. Education PhD Major: major of PhD degree
15 | 12. Education PhD Graduation Date: date of graduation from PhD degree (in format: Month_Name, YYYY)
16 | 13. Years of Experience: total years of experience in all jobs (Example: 3)
17 | 14. Experience Companies: list of all companies that the candidate worked with (Example: [Company1, Company2])
18 | 15. Top 5 Responsibilities/Projects Titles: list of top 5 responsibilities/projects titles that the candidate worked on (Example: [Project1, Project2, Project3, Project4, Project5])
19 | 16. Top 5 Courses/Certifications Titles: list of top 5 courses/certifications titles that the candidate took (Example: [Course1, Course2, Course3, Course4, Course5])
20 | 17. Top 3 Technical Skills: list of top 3 technical skills (Example: [Skill1, Skill2, Skill3])
21 | 18. Top 3 Soft Skills: list of top 3 soft skills (Example: [Skill1, Skill2, Skill3])
22 | 19. Current Employment Status: classify to one of the following (Full-time, Part-Time, Intern, Freelancer, Consultant, Unemployed)
23 | 20. Nationality: nationality of the candidate
24 | 21. Current Residence: where the candidate currently lives
25 | 22. Suitable Position: classify to one of the following suitable positions for the candidate (suitable position for the candidate)
26 | 23. Candidate Rating (Out of 10): rate the candidate's suitability for the position classified in point 22 (Example: 7.5)
27 | 
28 | 
29 | Desired Output: JSON format like the following:
30 | ###
31 | {"Education Bachelor University":"Information To Extract Number 1",
32 | "Education Bachelor GPA":"Information To Extract Number 2",
33 | "Education Bachelor Major":"Information To Extract Number 3",
34 | "Education Bachelor Graduation Date":"Information To Extract Number 4",
35 | "Education Masters University":"Information To Extract Number 5",
36 | "Education Masters GPA":"Information To Extract Number 6",
37 | "Education Masters Major":"Information To Extract Number 7",
38 | "Education Masters Graduation Date":"Information To Extract Number 8",
39 | "Education PhD University":"Information To Extract Number 9",
40 | "Education PhD GPA":"Information To Extract Number 10",
41 | "Education PhD Major":"Information To Extract Number 11",
42 | "Education PhD Graduation Date":"Information To Extract Number 12",
43 | "Years of Experience":"Information To Extract Number 13",
44 | "Experience Companies":"Information To Extract Number 14",
45 | "Top 5 Responsibilities/Projects":"Information To Extract Number 15",
46 | "Top 5 Courses/Certifications":"Information To Extract Number 16",
47 | "Top 3 Technical Skills":"Information To Extract Number 17",
48 | "Top 3 Soft Skills":"Information To Extract Number 18",
49 | "Current Employment Status":"Information To Extract Number 19",
50 | "Nationality":"Information To Extract Number 20",
51 | "Current Residence":"Information To Extract Number 21",
52 | "Suitable Position":"Information To Extract Number 22",
53 | "Candidate Rating (Out of 10)":"Information To Extract Number 23"}
54 | ###
55 | 
56 | Note: if any of the information is not mentioned in the CV, just leave it blank (empty string)
57 | 
--------------------------------------------------------------------------------
/ResumeGPT/OCR_Reader.py:
--------------------------------------------------------------------------------
1 | # import modules
2 | import os
3 | from PyPDF2 import PdfReader
4 | import pandas as pd
5 | from tqdm import tqdm
6 | 
7 | 
8 | # Define a class to read CVs from a directory
9 | class CVsReader:
10 | 
11 |     # Initialize the class with the directory path where CVs are located
12 |     def __init__(self, cvs_directory_path):
13 |         self.cvs_directory_path = cvs_directory_path
14 | 
15 | 
16 |     # Method to read new CV files from the given directory
17 |     def _read_new_directory_files(self):
18 | 
19 |         # Store the directory path of CVs
20 |         cvs_directory_path = self.cvs_directory_path
21 | 
22 |         # Store the path of the CSV file where previously extracted CVs are stored
23 |         previously_extracted_cvs_path = '../Output/CVs_Info_Extracted.csv'
24 | 
25 |         # Get a list of all files in the CVs directory
26 |         all_cvs = os.listdir(cvs_directory_path)
27 | 
28 |         # If there is a CSV file of previously extracted CVs
29 |         if os.path.isfile(previously_extracted_cvs_path):
30 | 
31 |             # Read that file and get the filenames of CVs
32 |             previously_extracted_cvs = pd.read_csv(previously_extracted_cvs_path, usecols = ['CV_Filename'])
33 | 
34 |             # Convert those filenames to a list
35 |             previously_extracted_cvs = previously_extracted_cvs.CV_Filename.to_list()
36 | 
37 |             # Filter out the CVs that have already been processed
38 |             all_cvs = [cv for cv in all_cvs if cv not in previously_extracted_cvs]
39 | 
40 |         # Print the number of CVs that are left to be processed
41 |         print(f'Number of CVs to be processed: {len(all_cvs)}')
42 | 
43 |         # Return the list of CVs to be processed
44 |         return all_cvs
45 | 
46 | 
47 |     # Method to extract text from a PDF file
48 |     def _extract_text_from_pdf(self, pdf_path):
49 | 
50 |         # Print the name of the file being processed
51 |         print(f"Extracting text from file: {pdf_path}")
52 | 
53 |         # Create a PdfReader object
54 |         pdf = PdfReader(pdf_path)
55 | 
56 |         # Initialize an empty string to store the extracted text
57 |         text = ''
58 | 
59 |         # Loop over the pages in the pdf
60 |         for page in range(len(pdf.pages)):
61 | 
62 |             # Extract text from each page and append it to the text string
63 |             text += pdf.pages[page].extract_text()
64 | 
65 |         # Return the extracted text
66 |         return text
67 | 
68 | 
69 |     # Define a method that reads PDF content from a directory
70 |     def _read_pdfs_content_from_directory(self, directory_path):
71 | 
72 |         # Initialize a dictionary to hold the filenames and contents of the CVs
73 |         data = {'CV_Filename': [], 'CV_Content': []}
74 | 
75 |         # Read all the new files in the directory
76 |         all_cvs = self._read_new_directory_files()
77 | 
78 |         # For each file in the directory
79 |         for filename in tqdm(all_cvs, desc='CVs'):
80 |             # If the file is a PDF
81 |             if filename.endswith('.pdf'):
82 |                 # Construct the full file path
83 |                 file_path = os.path.join(directory_path, filename)
84 |                 try:
85 |                     # Extract the text content from the PDF
86 |                     content = self._extract_text_from_pdf(file_path)
87 |                     # Add the filename to the dictionary
88 |                     data['CV_Filename'].append(filename)
89 |                     # Add the content to the dictionary
90 |                     data['CV_Content'].append(content)
91 |                 except Exception as e:
92 |                     # Print the exception if there is an error in reading the file
93 |                     print(f"Error reading file {filename}: {e}")
94 |         # Return the data as a DataFrame
95 |         return pd.DataFrame(data)
96 | 
97 | 
98 |     # Define a method that reads and cleans CVs
99 |     def read_cv(self):
100 | 
101 |         # Print a message indicating the start of the CV extraction process
102 |         print('---- Executing CVs Content Extraction Process ----')
103 | 
104 |         # Read the PDFs from the directory and store their content in a DataFrame
105 |         df = self._read_pdfs_content_from_directory(self.cvs_directory_path)
106 | 
107 |         # Print a message indicating the start of the CV content cleaning process
108 |         print('Cleaning CVs Content...')
109 |         # Clean the CV content by collapsing each newline and any whitespace that follows it into a single newline character
110 |         df['CV_Content'] = df['CV_Content'].str.replace(r"\n(?:\s*)", "\n", regex=True)
111 | 
112 |         # Print a message indicating the end of the CV extraction process
113 |         print('CVs Content Extraction Process Completed!')
114 |         print('----------------------------------------------')
115 |         # Return the DataFrame
116 |         return df
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ResumeGPT
2 | 
3 | ResumeGPT is a Python package designed to extract structured information from PDF Curriculum Vitae (CV)/resume documents. It leverages OCR technology and the capabilities of the ChatGPT AI language models (GPT-3.5 and GPT-4) to extract pieces of information from the CV content and organize them in a structured, Excel-friendly format.
4 | 
5 | 
6 | ## Features
7 | 
8 | - Extracts text from PDF CVs: Uses OCR technology to extract the CV's PDF content as text.
9 | - Extracts information using GPT: Sends the extracted text to GPT for information extraction according to a predefined prompt.
10 | - Structures information into an Excel file: Processes the information returned by GPT and converts it from JSON into an Excel-friendly format.
11 | 
12 | 
13 | ## Module Overview
14 | 
15 | ![ResumeGPT Workflow](ResumeGPT_Workflow/ResumeGPT_Workflow.PNG)
16 | 
17 | 
18 | 1. OCR Reader (CVsReader module): This process reads CVs from a specified directory and extracts the text from PDF files.
19 | 
20 | 2. Engineered Prompt and ChatGPT Pipeline (CVsInfoExtractor module): This process takes as input the extracted text generated by the OCR Reader and uses ChatGPT to extract specific information in JSON format.
21 | 
22 | 3. Extracted Information Structuring (CVsInfoExtractor module): This process takes the JSON output from the ChatGPT Pipeline, which contains the information extracted from each CV. This information is then structured and organized into a clear and easy-to-understand Excel format.
23 | 
24 | 
25 | ## Requirements
26 | 
27 | 1. Python: Python 3.8 or newer.
28 | 
29 | 2. GPT-4 API Access: If the CV content does not fit within the GPT-3.5 token limit, the package falls back to GPT-4 to extract the information from the CVs, so you'll need access to the GPT-4 API.
30 | 
31 | 
32 | ## How to Use
33 | 
34 | 1. Prepare Your CVs: Make sure all the CVs you want to analyze are in the “CVs” directory.
35 | 
36 | 2. Run the Script: Run the following commands. They will clone the project, prepare the environment, and execute the code.
37 |    - Clone the project
38 |    ```bash
39 |    git clone https://github.com/Aillian/ResumeGPT.git
40 |    ```
41 |    - Change into the project directory
42 |    ```bash
43 |    cd ResumeGPT
44 |    ```
45 |    - Create a virtual environment
46 |    ```bash
47 |    python -m venv resumegpt_venv
48 |    ```
49 |    - Activate the virtual environment
50 |    ```bash
51 |    source resumegpt_venv/Scripts/activate
52 |    ```
53 |    - Upgrade pip
54 |    ```bash
55 |    pip install --upgrade pip
56 |    ```
57 |    - Install requirements.txt
58 |    ```bash
59 |    pip install -r requirements.txt
60 |    ```
61 |    - Change into the code directory
62 |    ```bash
63 |    cd ResumeGPT
64 |    ```
65 |    - Run main.py and provide the 3 required arguments:
66 |      - CVs Directory Path: use "../CVs" to read from the 'CVs' directory
67 |      - OpenAI API Key: the key should include GPT-4 model access
68 |      - Desired Positions: written as a comma-separated string, for example "Data Scientist,Data Analyst,Data Engineer"
69 |    ```bash
70 |    python main.py "../CVs" "sk-ldbuDCjkgJHiFnbLVCJvvcfKNBDFJTYCVfvRedevDdf" "Data Scientist, Data Analyst, Data Engineer"
71 |    ```
72 | 
73 | 3. Examine the Results: After the script finishes, you will find the output in the “Output” directory: two files (CSV & Excel) containing the information extracted from each CV. (If you prefer to call ResumeGPT from your own Python code, see the Programmatic Usage sketch at the end of this README.)
74 | 
75 | 
76 | ## Extracted Information
77 | 
78 | ResumeGPT is designed to extract 23 features from each CV:
79 | 
80 | - Education:
81 |   1. Education Bachelor University: name of university where bachelor degree was taken
82 |   2. Education Bachelor GPA: GPA of bachelor degree (Example: 4.5/5)
83 |   3. Education Bachelor Major: major of bachelor degree
84 |   4. Education Bachelor Graduation Date: date of graduation from bachelor degree (in format: Month_Name, YYYY)
85 |   5. Education Masters University: name of university where masters degree was taken
86 |   6. Education Masters GPA: GPA of masters degree (Example: 4.5/5)
87 |   7. Education Masters Major: major of masters degree
88 |   8. Education Masters Graduation Date: date of graduation from masters degree (in format: Month_Name, YYYY)
89 |   9. Education PhD University: name of university where PhD degree was taken
90 |   10. Education PhD GPA: GPA of PhD degree (Example: 4.5/5)
91 |   11. Education PhD Major: major of PhD degree
92 |   12. Education PhD Graduation Date: date of graduation from PhD degree (in format: Month_Name, YYYY)
93 | 
94 | - Work Experience:
95 |   13. Years of Experience: total years of experience in all jobs (Example: 3)
96 |   14. Experience Companies: list of all companies that the candidate worked with (Example: [Company1, Company2])
97 |   15. Top 5 Responsibilities/Projects Titles: list of top 5 responsibilities/projects titles that the candidate worked on (Example: [Project1, Project2, Project3, Project4, Project5])
98 | 
99 | - Courses/Certifications:
100 |   16. Top 5 Courses/Certifications Titles: list of top 5 courses/certifications titles that the candidate took (Example: [Course1, Course2, Course3, Course4, Course5])
101 | 
102 | - Skills:
103 |   17. Top 3 Technical Skills: list of top 3 technical skills (Example: [Skill1, Skill2, Skill3])
104 |   18. Top 3 Soft Skills: list of top 3 soft skills (Example: [Skill1, Skill2, Skill3])
105 | 
106 | - Employment Status:
107 |   19. Current Employment Status: one of the following (Full-time, Part-Time, Intern, Freelancer, Consultant, Unemployed)
108 | 
109 | - Personal Information:
110 |   20. Nationality: nationality of the candidate
111 |   21. Current Residence: where the candidate currently lives
112 | 
113 | - Suitable Position:
114 |   22. Suitable Position: the most suitable position for the candidate; the list of possible positions is taken from the user and dynamically inserted into the prompt
115 | 
116 | - Rating Score:
117 |   23. Candidate Rating (Out of 10): score of the candidate's suitability for the position classified in point 22 (Example: 7.5)
118 | 
119 | 
120 | This information is then organized into a structured Excel file.
121 | 
122 | 
123 | ## Contributing
124 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
125 | 
126 | Possible additional features and optimizations:
127 | 1. Add additional features to the prompt.
128 | 2. Handle cases where the token limit is exceeded by further cleaning the CV content.
129 | 3. The code calls the gpt-3.5-turbo model first and falls back to gpt-4 when the token limit is exceeded. This has two problems: (1) it is costly, and (2) the provided API key may not have access to the gpt-4 model.
130 | 4. Catch the GPT-4 "service is down" error and retry the API call after a short sleep.
131 | 5. Can the prompt be shortened to save tokens for the CV content?
132 | 6. Separate the "Information To Extract" section of the prompt into its own file so users can add new features, which would then be injected into the prompt dynamically; any added features should also be reflected as column names in "CVs_Info_Extracted.csv".
133 | 7. Additional error handling.
134 | 8. What about extending the usage to other LLMs?
135 | 
136 | 
137 | ## License
138 | ResumeGPT is released under the MIT License. See the LICENSE file for more details.
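139 | 
140 | 
141 | ## Programmatic Usage
142 | 
143 | If you prefer to drive ResumeGPT from your own Python code instead of through main.py, the minimal sketch below simply mirrors what main.py already does. The directory path, the placeholder API key, and the position list are illustrative assumptions; adjust them to your setup. Run the snippet from inside the ResumeGPT code directory, because the modules read and write relative paths such as ../Engineered_Prompt/Prompt.txt and ../Output/CVs_Info_Extracted.csv.
144 | 
145 | ```python
146 | from OCR_Reader import CVsReader
147 | from ChatGPT_Pipeline import CVsInfoExtractor
148 | 
149 | # Illustrative values -- replace with your own CVs path, OpenAI API key, and desired positions.
150 | cvs_directory_path = "../CVs"
151 | openai_api_key = "sk-your-own-key"  # placeholder, not a real key
152 | desired_positions = ["Data Scientist", "Data Analyst", "Data Engineer"]
153 | 
154 | # Read every new PDF CV into a dataframe of filenames and raw text content.
155 | cvs_reader = CVsReader(cvs_directory_path=cvs_directory_path)
156 | cvs_content_df = cvs_reader.read_cv()
157 | 
158 | # Send each CV through the ChatGPT pipeline; this appends rows to
159 | # Output/CVs_Info_Extracted.csv, rewrites the Excel file, and returns a dataframe.
160 | cvs_info_extractor = CVsInfoExtractor(
161 |     cvs_df=cvs_content_df,
162 |     openai_api_key=openai_api_key,
163 |     desired_positions=desired_positions,
164 | )
165 | extracted_info_df = cvs_info_extractor.extract_cv_info()
166 | print(extracted_info_df.head())
167 | ```
168 | 
169 | The keyword arguments above are exactly the ones main.py passes from the command line; the only difference is that the returned dataframe is kept in memory for further inspection instead of only being written to disk.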
--------------------------------------------------------------------------------
/ResumeGPT/ChatGPT_Pipeline.py:
--------------------------------------------------------------------------------
1 | # import modules
2 | import os
3 | import pandas as pd
4 | import openai
5 | from openai import InvalidRequestError
6 | import time
7 | import json
8 | from json import JSONDecodeError
9 | from tqdm import tqdm
10 | # add a progress bar to pandas operations
11 | tqdm.pandas(desc='CVs')
12 | 
13 | # define the path to the output CSV file
14 | output_csv_file_path = '../Output/CVs_Info_Extracted.csv'
15 | 
16 | # define the path to the output Excel file
17 | output_excel_file_path = '../Output/CVs_Info_Extracted.xlsx'
18 | 
19 | 
20 | # define a class to extract CV information
21 | class CVsInfoExtractor:
22 |     # define a constructor that initializes the class with a DataFrame of CVs
23 |     def __init__(self, cvs_df, openai_api_key, desired_positions):
24 |         self.cvs_df = cvs_df
25 | 
26 |         # open the prompt file in read mode and read its contents into a variable
27 |         with open('../Engineered_Prompt/Prompt.txt', 'r') as file:
28 |             self.prompt = file.read()
29 | 
30 |         # Join the desired positions into a comma-separated string
31 |         suitable_positions_str = "(" + ", ".join(desired_positions) + ")"
32 | 
33 |         # Replace the placeholder in the prompt with the formatted suitable positions string
34 |         self.prompt = self.prompt.replace('(suitable position for the candidate)', suitable_positions_str)
35 | 
36 | 
37 |         # set the OpenAI API key
38 |         openai.api_key = openai_api_key
39 | 
40 | 
41 |     # define internal function to call GPT for CV info extraction
42 |     def _call_gpt_for_cv_info_extraction(self, prompt, cv_content, model, temperature = 0):
43 | 
44 |         # create a dict of parameters for the ChatCompletion API
45 |         completion_params = {
46 |             'model': model,
47 |             'messages': [{"role": "system", "content": prompt},
48 |                          {"role": "user", "content": cv_content}],
49 |             'temperature': temperature}
50 | 
51 |         # send a request to the ChatCompletion API and store the response
52 |         response = openai.ChatCompletion.create(**completion_params)
53 |         # if the response contains at least one choice, extract the message content
54 |         if 'choices' in response and len(response.choices) > 0:
55 |             cleaned_response = response['choices'][0]['message']['content']
56 |             try:
57 |                 # try to convert the message content to a JSON object
58 |                 json_response = json.loads(cleaned_response)
59 |             except JSONDecodeError:
60 |                 # if the conversion fails, set the JSON response to None
61 |                 json_response = None
62 |         else:
63 |             # if the response contains no choices, set the JSON response to None
64 |             json_response = None
65 | 
66 |         # return the JSON response
67 |         return json_response
68 | 
69 | 
70 |     # Defines internal function to normalize a JSON response from GPT
71 |     def _normalize_gpt_json_response(self, CV_Filename, json_response):
72 | 
73 |         # Creates a DataFrame with one column "CV_Filename" whose value is the given CV filename
74 |         CV_Filename_df = pd.DataFrame([CV_Filename], columns = ['CV_Filename'])
75 | 
76 |         # Creates a DataFrame with one column "All_Info_JSON" whose value is the raw JSON response
77 |         df_CV_Info_Json = pd.DataFrame([[json_response]], columns = ['All_Info_JSON'])
78 | 
79 |         # Normalize the JSON response, flattening it into a table
80 |         df_CV_Info_Json_normalized = pd.json_normalize(json_response)
81 | 
82 |         # Concatenates the three DataFrames along the columns axis
83 |         df = pd.concat([CV_Filename_df, df_CV_Info_Json_normalized, df_CV_Info_Json], axis=1)
84 | 
85 |         # Returns the final DataFrame
86 |         return df
87 | 
88 | 
89 |     # Defines internal function to write the DataFrame into a CSV file
90 |     def _write_response_to_file(self, df):
91 | 
92 |         # Checks if the output CSV file already exists
93 |         if os.path.isfile(output_csv_file_path):
94 |             # If the file exists, append the DataFrame into the CSV file without writing headers
95 |             df.to_csv(output_csv_file_path, mode='a', index=False, header=False)
96 |         else:
97 |             # If the file doesn't exist, write the DataFrame into a new CSV file
98 |             df.to_csv(output_csv_file_path, mode='w', index=False)
99 | 
100 | 
101 |     # Define the internal function _gpt_pipeline
102 |     def _gpt_pipeline(self, row, model = 'gpt-3.5-turbo'):
103 | 
104 |         # Retrieve the CV Filename and Content from the given row
105 |         CV_Filename = row['CV_Filename']
106 |         CV_Content = row['CV_Content']
107 | 
108 |         # Sleep for 5 seconds to delay the next operation
109 |         time.sleep(5)
110 | 
111 |         try:
112 |             # Print status message indicating GPT is being called for CV info extraction
113 |             print('Calling GPT For CV Info Extraction...')
114 | 
115 |             # Call the GPT model for CV information extraction
116 |             json_response = self._call_gpt_for_cv_info_extraction(prompt=self.prompt, cv_content=CV_Content, model=model)
117 | 
118 |             # Print status message indicating normalization of GPT response
119 |             print('Normalizing GPT Response...')
120 | 
121 |             # Normalize the GPT JSON response
122 |             df = self._normalize_gpt_json_response(CV_Filename, json_response)
123 | 
124 |             # Print status message indicating that the results are being appended to the CSV file
125 |             print('Appending Results To The CSV File...')
126 | 
127 |             # Write the normalized response to a file
128 |             self._write_response_to_file(df)
129 | 
130 |             # Print a line for clarity in the output
131 |             print('----------------------------------------------')
132 | 
133 |             # Return the GPT JSON response
134 |             return json_response
135 | 
136 |         # Catch an exception when the tokens don't fit in the chosen GPT model
137 |         except InvalidRequestError as e:
138 |             # Print the error that occurred
139 |             print('An Error Occurred:', str(e))
140 | 
141 |             # Print status message indicating that gpt-4 is being called instead
142 |             print("Tokens don't fit gpt-3.5-turbo, calling gpt-4...")
143 | 
144 |             # Retry the pipeline with the gpt-4 model
145 |             return self._gpt_pipeline(row, model = 'gpt-4')
146 | 
147 | 
148 |     # Define the internal function _write_final_results_to_excel
149 |     def _write_final_results_to_excel(self):
150 |         # Load the CSV file into a pandas DataFrame
151 |         df_to_excel = pd.read_csv(output_csv_file_path)
152 | 
153 |         # Write the DataFrame to an Excel file
154 |         df_to_excel.to_excel(output_excel_file_path)
155 | 
156 |         # Return the DataFrame
157 |         return df_to_excel
158 | 
159 | 
160 |     # Define the main function extract_cv_info
161 |     def extract_cv_info(self):
162 |         # Print a status message indicating the start of the ResumeGPT Pipeline
163 |         print('---- Executing ResumeGPT Pipeline ----')
164 |         print('----------------------------------------------')
165 | 
166 |         # Apply the _gpt_pipeline function to each row in the cvs_df DataFrame
167 |         self.cvs_df['CV_Info_Json'] = self.cvs_df.progress_apply(self._gpt_pipeline, axis=1)
168 | 
169 |         # Print a status message indicating the completion of the extraction
170 |         print('Extraction Completed!')
171 | 
172 |         # Print a status message indicating that results are being saved to Excel
173 |         print('Saving Results to Excel...')
174 | 
175 |         # Write the final results to an Excel file
176 |         final_df = self._write_final_results_to_excel()
177 | 
178 |         # Return the final DataFrame
179 |         return final_df
180 | 
--------------------------------------------------------------------------------