├── CVs
    ├── .gitkeep
    ├── Ali Abuharb's CV.pdf
    └── Deemah_Alabdulaali_Resume.pdf
├── Output
    ├── CVs_Info_Extracted.xlsx
    └── CVs_Info_Extracted.csv
├── ResumeGPT_Workflow
    └── ResumeGPT_Workflow.PNG
├── .gitignore
├── requirements.txt
├── LICENSE
├── ResumeGPT
    ├── main.py
    ├── OCR_Reader.py
    └── ChatGPT_Pipeline.py
├── Engineered_Prompt
    └── Prompt.txt
└── README.md

/CVs/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/CVs/Ali Abuharb's CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/CVs/Ali Abuharb's CV.pdf
--------------------------------------------------------------------------------
/Output/CVs_Info_Extracted.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/Output/CVs_Info_Extracted.xlsx
--------------------------------------------------------------------------------
/CVs/Deemah_Alabdulaali_Resume.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/CVs/Deemah_Alabdulaali_Resume.pdf
--------------------------------------------------------------------------------
/ResumeGPT_Workflow/ResumeGPT_Workflow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aillian/CVsAgent/HEAD/ResumeGPT_Workflow/ResumeGPT_Workflow.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | CVs/*.pdf
2 | !CVs/Ali\ Abuharb's\ CV.pdf
3 | !CVs/Deemah_Alabdulaali_Resume.pdf
4 | Notebooks
5 | resumegpt_venv
6 | ResumeGPT_Workflow/ResumeGPT_Workflow.pptx
7 | __pycache__
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.4
2 | aiosignal==1.3.1
3 | async-timeout==4.0.2
4 | attrs==23.1.0
5 | certifi==2023.5.7
6 | charset-normalizer==3.1.0
7 | colorama==0.4.6
8 | et-xmlfile==1.1.0
9 | frozenlist==1.3.3
10 | idna==3.4
11 | multidict==6.0.4
12 | numpy==1.24.3
13 | openai==0.27.7
14 | openpyxl==3.1.2
15 | pandas==2.0.1
16 | PyPDF2==3.0.1
17 | python-dateutil==2.8.2
18 | pytz==2023.3
19 | requests==2.31.0
20 | six==1.16.0
21 | tqdm==4.65.0
22 | typing_extensions==4.5.0
23 | tzdata==2023.3
24 | urllib3==2.0.2
25 | yarl==1.9.2
26 | 
--------------------------------------------------------------------------------
/Output/CVs_Info_Extracted.csv:
--------------------------------------------------------------------------------
1 | CV_Filename,Education Bachelor University,Education Bachelor GPA,Education Bachelor Major,Education Bachelor Graduation Date,Education Masters University,Education Masters GPA,Education Masters Major,Education Masters Graduation Date,Education PhD University,Education PhD GPA,Education PhD Major,Education PhD Graduation Date,Years of Experience,Experience Companies,Top 5 Responsibilities/Projects,Top 5 Courses/Certifications,Top 3 Technical Skills,Top 3 Soft Skills,Current Employment Status,Nationality,Current Residence,Suitable Position,Candidate Rating (Out of 10),All_Info_JSON
2 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Ali Abuharb
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/ResumeGPT/main.py:
--------------------------------------------------------------------------------
1 | # Import modules:
2 | # 1. CVsReader from the OCR_Reader module, which is used for reading CVs
3 | # 2. CVsInfoExtractor from the ChatGPT_Pipeline module, which is used for extracting specific information from the CVs
4 | from OCR_Reader import CVsReader
5 | from ChatGPT_Pipeline import CVsInfoExtractor
6 | import sys
7 | 
8 | # Fetching command line arguments
9 | cvs_directory_path_arg, openai_api_key_arg, desired_positions_arg = sys.argv[1], sys.argv[2], sys.argv[3].split(",")
10 | 
11 | # Remove any leading or trailing whitespace from each desired position
12 | desired_positions = [position.strip() for position in desired_positions_arg]
13 | 
14 | 
15 | # Create an instance of CVsReader.
16 | # It takes the cvs_directory_path argument, which represents the directory where the CV files are located.
17 | cvs_reader = CVsReader(cvs_directory_path = cvs_directory_path_arg)
18 | 
19 | # Use the read_cv method of the CVsReader instance to read all CVs in the specified directory.
20 | # The result is a dataframe where each row corresponds to a different CV's file name and content.
21 | cvs_content_df = cvs_reader.read_cv()
22 | 
23 | # Create an instance of CVsInfoExtractor.
24 | # It takes the dataframe returned by the read_cv method of the CVsReader instance, the OpenAI API key, and the desired positions as a list.
25 | cvs_info_extractor = CVsInfoExtractor(cvs_df = cvs_content_df, openai_api_key = openai_api_key_arg, desired_positions = desired_positions)
26 | 
27 | # Use the extract_cv_info method of the CVsInfoExtractor instance to extract the desired information from the CVs.
28 | # This method returns a single dataframe in which each row holds the information extracted from one CV.
29 | extract_cv_info_dfs = cvs_info_extractor.extract_cv_info()
--------------------------------------------------------------------------------
/Engineered_Prompt/Prompt.txt:
--------------------------------------------------------------------------------
1 | You are a human resource specialist who is responsible for reviewing candidates' CVs. You will be given the CV of the candidate and your job is to extract the information mentioned below. Also, you must follow the desired output format.
2 | 
3 | Information To Extract:
4 | 1. Education Bachelor University: name of university where bachelor degree was taken
5 | 2. Education Bachelor GPA: GPA of bachelor degree (Example: 4.5/5)
6 | 3. Education Bachelor Major: major of bachelor degree
7 | 4. Education Bachelor Graduation Date: date of graduation from bachelor degree (in format: Month_Name, YYYY)
8 | 5. Education Masters University: name of university where masters degree was taken
9 | 6. Education Masters GPA: GPA of masters degree (Example: 4.5/5)
10 | 7. Education Masters Major: major of masters degree
11 | 8. Education Masters Graduation Date: date of graduation from masters degree (in format: Month_Name, YYYY)
12 | 9. Education PhD University: name of university where PhD degree was taken
13 | 10. Education PhD GPA: GPA of PhD degree (Example: 4.5/5)
14 | 11. Education PhD Major: major of PhD degree
15 | 12. Education PhD Graduation Date: date of graduation from PhD degree (in format: Month_Name, YYYY)
16 | 13. Years of Experience: total years of experience in all jobs (Example: 3)
17 | 14. Experience Companies: list of all companies that the candidate worked with (Example: [Company1, Company2])
18 | 15. Top 5 Responsibilities/Projects Titles: list of top 5 responsibilities/projects titles that the candidate worked on (Example: [Project1, Project2, Project3, Project4, Project5])
19 | 16. Top 5 Courses/Certifications Titles: list of top 5 courses/certifications titles that the candidate took (Example: [Course1, Course2, Course3, Course4, Course5])
20 | 17. Top 3 Technical Skills: list of top 3 technical skills (Example: [Skill1, Skill2, Skill3])
21 | 18. Top 3 Soft Skills: list of top 3 soft skills (Example: [Skill1, Skill2, Skill3])
22 | 19. Current Employment Status: classify to one of the following (Full-time, Part-Time, Intern, Freelancer, Consultant, Unemployed)
23 | 20. Nationality: nationality of the candidate
24 | 21. Current Residence: where the candidate currently lives
25 | 22. Suitable Position: classify to one of the following suitable positions for the candidate (suitable position for the candidate)
26 | 23. Candidate Rating (Out of 10): rate the candidate's suitability for the position classified in point 22 (Example: 7.5)
27 | 
28 | 
29 | Desired Output: JSON format like the following:
30 | ###
31 | {"Education Bachelor University":"Information To Extract Number 1",
32 | "Education Bachelor GPA":"Information To Extract Number 2",
33 | "Education Bachelor Major":"Information To Extract Number 3",
34 | "Education Bachelor Graduation Date":"Information To Extract Number 4",
35 | "Education Masters University":"Information To Extract Number 5",
36 | "Education Masters GPA":"Information To Extract Number 6",
37 | "Education Masters Major":"Information To Extract Number 7",
38 | "Education Masters Graduation Date":"Information To Extract Number 8",
39 | "Education PhD University":"Information To Extract Number 9",
40 | "Education PhD GPA":"Information To Extract Number 10",
41 | "Education PhD Major":"Information To Extract Number 11",
42 | "Education PhD Graduation Date":"Information To Extract Number 12",
43 | "Years of Experience":"Information To Extract Number 13",
44 | "Experience Companies":"Information To Extract Number 14",
45 | "Top 5 Responsibilities/Projects":"Information To Extract Number 15",
46 | "Top 5 Courses/Certifications":"Information To Extract Number 16",
47 | "Top 3 Technical Skills":"Information To Extract Number 17",
48 | "Top 3 Soft Skills":"Information To Extract Number 18",
49 | "Current Employment Status":"Information To Extract Number 19",
50 | "Nationality":"Information To Extract Number 20",
51 | "Current Residence":"Information To Extract Number 21",
52 | "Suitable Position":"Information To Extract Number 22",
53 | "Candidate Rating (Out of 10)":"Information To Extract Number 23"}
54 | ###
55 | 
56 | Note: if any of the information is not mentioned in the CV, just leave it blank (empty string)
57 | 
--------------------------------------------------------------------------------
/ResumeGPT/OCR_Reader.py:
--------------------------------------------------------------------------------
1 | # import modules
2 | import os
3 | from PyPDF2 import PdfReader
4 | import pandas as pd
5 | from tqdm import tqdm
6 | 
7 | 
8 | # Define a class to read CVs from a directory
9 | class CVsReader:
10 | 
11 |     # Initialize the class with the directory path where CVs are located
12 |     def __init__(self, cvs_directory_path):
13 |         self.cvs_directory_path = cvs_directory_path
14 | 
15 | 
16 |     # Method to read new CV files from the given directory
17 |     def _read_new_directory_files(self):
18 | 
19 |         # Store the directory path of CVs
20 |         cvs_directory_path = self.cvs_directory_path
21 | 
22 |         # Store the path of the CSV file where previously extracted CVs are stored
23 |         previously_extracted_cvs_path = '../Output/CVs_Info_Extracted.csv'
24 | 
25 |         # Get a list of all files in the CVs directory
26 |         all_cvs = os.listdir(cvs_directory_path)
27 | 
28 |         # If there is a CSV file of previously extracted CVs
29 |         if os.path.isfile(previously_extracted_cvs_path):
30 | 
31 |             # Read that file and get the filenames of CVs
32 |             previously_extracted_cvs = pd.read_csv(previously_extracted_cvs_path, usecols = ['CV_Filename'])
33 | 
34 |             # Convert those filenames to a list
35 |             previously_extracted_cvs = previously_extracted_cvs.CV_Filename.to_list()
36 | 
37 |             # Filter out the CVs that have already been processed
38 |             all_cvs = [cv for cv in all_cvs if cv not in previously_extracted_cvs]
39 | 
40 |         # Print the number of CVs that are left to be processed
41 |         print(f'Number of CVs to be processed: {len(all_cvs)}')
42 | 
43 |         # Return the list of CVs to be processed
44 |         return all_cvs
45 | 
46 | 
47 |     # Method to extract text from a PDF file
48 |     def _extract_text_from_pdf(self, pdf_path):
49 | 
50 |         # Print the name of the file being processed
51 |         print(f"Extracting text from file: {pdf_path}")
52 | 
53 |         # Create a PdfReader object
54 |         pdf = PdfReader(pdf_path)
55 | 
56 |         # Initialize an empty string to store the extracted text
57 |         text = ''
58 | 
59 |         # Loop over the pages in the pdf
60 |         for page in range(len(pdf.pages)):
61 | 
62 |             # Extract text from each page and append it to the text string
63 |             text += pdf.pages[page].extract_text()
64 | 
65 |         # Return the extracted text
66 |         return text
67 | 
68 | 
69 |     # Define a method that reads PDF content from a directory
70 |     def _read_pdfs_content_from_directory(self, directory_path):
71 | 
72 |         # Initialize a dictionary to hold the filenames and contents of the CVs
73 |         data = {'CV_Filename': [], 'CV_Content': []}
74 | 
75 |         # Read all the new files in the directory
76 |         all_cvs = self._read_new_directory_files()
77 | 
78 |         # For each file in the directory
79 |         for filename in tqdm(all_cvs, desc='CVs'):
80 |             # If the file is a PDF
81 |             if filename.endswith('.pdf'):
82 |                 # Construct the full file path
83 |                 file_path = os.path.join(directory_path, filename)
84 |                 try:
85 |                     # Extract the text content from the PDF
86 |                     content = self._extract_text_from_pdf(file_path)
87 |                     # Add the filename to the dictionary
88 |                     data['CV_Filename'].append(filename)
89 |                     # Add the content to the dictionary
90 |                     data['CV_Content'].append(content)
91 |                 except Exception as e:
92 |                     # Print the exception if there is an error in reading the file
93 |                     print(f"Error reading file {filename}: {e}")
94 |         # Return the data as a DataFrame
95 |         return pd.DataFrame(data)
96 | 
97 | 
98 |     # Define a method that reads and cleans CVs
99 |     def read_cv(self):
100 | 
101 |         # Print a message indicating the start of the CV extraction process
102 |         print('---- Executing CVs Content Extraction Process ----')
103 | 
104 |         # Read the PDFs from the directory and store their content in a DataFrame
105 |         df = self._read_pdfs_content_from_directory(self.cvs_directory_path)
106 | 
107 |         # Print a message indicating the start of the CV content cleaning process
108 |         print('Cleaning CVs Content...')
109 |         # Clean the CV content by collapsing each newline and any whitespace that follows it into a single newline character
110 |         df['CV_Content'] = df['CV_Content'].str.replace(r"\n(?:\s*)", "\n", regex=True)
111 | 
112 |         # Print a message indicating the end of the CV extraction process
113 |         print('CVs Content Extraction Process Completed!')
114 |         print('----------------------------------------------')
115 |         # Return the DataFrame
116 |         return df
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ResumeGPT
2 | 
3 | ResumeGPT is a Python package designed to extract structured information from PDF Curriculum Vitae (CV)/resume documents. It leverages OCR technology and the capabilities of the ChatGPT AI language models (GPT-3.5 and GPT-4) to extract pieces of information from the CV content and organize them in a structured, Excel-friendly format.
4 | 
5 | 
6 | ## Features
7 | 
8 | - Extracts text from PDF CVs: Uses OCR technology to extract the CV's PDF content as text.
9 | - Extracts information using GPT: Sends the extracted text to GPT for information extraction according to a predefined prompt.
10 | - Structures information into an Excel file: Processes the information returned by GPT and converts it from JSON into an Excel-friendly format.
11 | 
12 | 
13 | ## Module Overview
14 | 
15 | ![ResumeGPT Workflow](ResumeGPT_Workflow/ResumeGPT_Workflow.PNG)
16 | 
17 | 
18 | 1. OCR Reader (CVsReader module): This process reads CVs from a specified directory and extracts the text from PDF files.
19 | 
20 | 2. Engineered Prompt and ChatGPT Pipeline (CVsInfoExtractor module): This process takes as input the extracted text generated by the OCR Reader and uses ChatGPT to extract specific information in JSON format.
21 | 
22 | 3. Extracted Information Structuring (CVsInfoExtractor module): This process takes the JSON output from the ChatGPT Pipeline, which contains the information extracted from each CV. This information is then structured and organized into a clear and easy-to-understand Excel format.
23 | 
24 | 
25 | ## Requirements
26 | 
27 | 1. Python: Python 3.8 or newer.
28 | 
29 | 2. GPT-4 API Access: If the CV content does not fit within the GPT-3.5 token limit, the package falls back to GPT-4 to extract the information from the CVs, so you'll need access to the GPT-4 API.
30 | 
31 | 
32 | ## How to Use
33 | 
34 | 1. Prepare Your CVs: Make sure all the CVs you want to analyze are in the “CVs” directory.
35 | 
36 | 2. Run the Script: Run the following commands. They will clone the project, prepare the environment, and execute the code.
37 |    - Clone the project
38 |    ```bash
39 |    git clone https://github.com/Aillian/ResumeGPT.git
40 |    ```
41 |    - Change into the project directory
42 |    ```bash
43 |    cd ResumeGPT
44 |    ```
45 |    - Create a virtual environment
46 |    ```bash
47 |    python -m venv resumegpt_venv
48 |    ```
49 |    - Activate the virtual environment
50 |    ```bash
51 |    source resumegpt_venv/Scripts/activate
52 |    ```
53 |    - Upgrade pip
54 |    ```bash
55 |    pip install --upgrade pip
56 |    ```
57 |    - Install requirements.txt
58 |    ```bash
59 |    pip install -r requirements.txt
60 |    ```
61 |    - Change into the code directory
62 |    ```bash
63 |    cd ResumeGPT
64 |    ```
65 |    - Run main.py and provide the 3 required arguments:
66 |      - CVs Directory Path: use "../CVs" to read from the 'CVs' directory
67 |      - OpenAI API Key: the key should include GPT-4 model access
68 |      - Desired Positions: written as a comma-separated string, for example "Data Scientist,Data Analyst,Data Engineer"
69 |    ```bash
70 |    python main.py "../CVs" "sk-ldbuDCjkgJHiFnbLVCJvvcfKNBDFJTYCVfvRedevDdf" "Data Scientist, Data Analyst, Data Engineer"
71 |    ```
72 | 
73 | 3. Examine the Results: After the script finishes, you will find the output in the “Output” directory: two files (CSV & Excel) containing the information extracted from each CV. (If you prefer to call ResumeGPT from your own Python code, see the Programmatic Usage sketch at the end of this README.)
74 | 
75 | 
76 | ## Extracted Information
77 | 
78 | ResumeGPT is designed to extract 23 features from each CV:
79 | 
80 | - Education:
81 |   1. Education Bachelor University: name of university where bachelor degree was taken
82 |   2. Education Bachelor GPA: GPA of bachelor degree (Example: 4.5/5)
83 |   3. Education Bachelor Major: major of bachelor degree
84 |   4. Education Bachelor Graduation Date: date of graduation from bachelor degree (in format: Month_Name, YYYY)
85 |   5. Education Masters University: name of university where masters degree was taken
86 |   6. Education Masters GPA: GPA of masters degree (Example: 4.5/5)
87 |   7. Education Masters Major: major of masters degree
88 |   8. Education Masters Graduation Date: date of graduation from masters degree (in format: Month_Name, YYYY)
89 |   9. Education PhD University: name of university where PhD degree was taken
90 |   10. Education PhD GPA: GPA of PhD degree (Example: 4.5/5)
91 |   11. Education PhD Major: major of PhD degree
92 |   12. Education PhD Graduation Date: date of graduation from PhD degree (in format: Month_Name, YYYY)
93 | 
94 | - Work Experience:
95 |   13. Years of Experience: total years of experience in all jobs (Example: 3)
96 |   14. Experience Companies: list of all companies that the candidate worked with (Example: [Company1, Company2])
97 |   15. Top 5 Responsibilities/Projects Titles: list of top 5 responsibilities/projects titles that the candidate worked on (Example: [Project1, Project2, Project3, Project4, Project5])
98 | 
99 | - Courses/Certifications:
100 |   16. Top 5 Courses/Certifications Titles: list of top 5 courses/certifications titles that the candidate took (Example: [Course1, Course2, Course3, Course4, Course5])
101 | 
102 | - Skills:
103 |   17. Top 3 Technical Skills: list of top 3 technical skills (Example: [Skill1, Skill2, Skill3])
104 |   18. Top 3 Soft Skills: list of top 3 soft skills (Example: [Skill1, Skill2, Skill3])
105 | 
106 | - Employment Status:
107 |   19. Current Employment Status: one of the following (Full-time, Part-Time, Intern, Freelancer, Consultant, Unemployed)
108 | 
109 | - Personal Information:
110 |   20. Nationality: nationality of the candidate
111 |   21. Current Residence: where the candidate currently lives
112 | 
113 | - Suitable Position:
114 |   22. Suitable Position: the most suitable position for the candidate; the list of possible positions is taken from the user and dynamically inserted into the prompt
115 | 
116 | - Rating Score:
117 |   23. Candidate Rating (Out of 10): score of the candidate's suitability for the position classified in point 22 (Example: 7.5)
118 | 
119 | 
120 | This information is then organized into a structured Excel file.
121 | 
122 | 
123 | ## Contributing
124 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
125 | 
126 | Possible additional features and optimizations:
127 | 1. Add additional features to the prompt.
128 | 2. Handle cases where the token limit is exceeded by further cleaning the CV content.
129 | 3. The code calls the gpt-3.5-turbo model first and falls back to gpt-4 when the token limit is exceeded. This has two problems: (1) it is costly, and (2) the provided API key may not have access to the gpt-4 model.
130 | 4. Catch the GPT-4 "service is down" error and retry the API call after a short sleep.
131 | 5. Can the prompt be shortened to save tokens for the CV content?
132 | 6. Separate the "Information To Extract" section of the prompt into its own file so users can add new features, which would then be injected into the prompt dynamically; any added features should also be reflected as column names in "CVs_Info_Extracted.csv".
133 | 7. Additional error handling.
134 | 8. What about extending the usage to other LLMs?
135 | 
136 | 
137 | ## License
138 | ResumeGPT is released under the MIT License. See the LICENSE file for more details.
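139 | 
140 | 
141 | ## Programmatic Usage
142 | 
143 | If you prefer to drive ResumeGPT from your own Python code instead of through main.py, the minimal sketch below simply mirrors what main.py already does. The directory path, the placeholder API key, and the position list are illustrative assumptions; adjust them to your setup. Run the snippet from inside the ResumeGPT code directory, because the modules read and write relative paths such as ../Engineered_Prompt/Prompt.txt and ../Output/CVs_Info_Extracted.csv.
144 | 
145 | ```python
146 | from OCR_Reader import CVsReader
147 | from ChatGPT_Pipeline import CVsInfoExtractor
148 | 
149 | # Illustrative values -- replace with your own CVs path, OpenAI API key, and desired positions.
150 | cvs_directory_path = "../CVs"
151 | openai_api_key = "sk-your-own-key"  # placeholder, not a real key
152 | desired_positions = ["Data Scientist", "Data Analyst", "Data Engineer"]
153 | 
154 | # Read every new PDF CV into a dataframe of filenames and raw text content.
155 | cvs_reader = CVsReader(cvs_directory_path=cvs_directory_path)
156 | cvs_content_df = cvs_reader.read_cv()
157 | 
158 | # Send each CV through the ChatGPT pipeline; this appends rows to
159 | # Output/CVs_Info_Extracted.csv, rewrites the Excel file, and returns a dataframe.
160 | cvs_info_extractor = CVsInfoExtractor(
161 |     cvs_df=cvs_content_df,
162 |     openai_api_key=openai_api_key,
163 |     desired_positions=desired_positions,
164 | )
165 | extracted_info_df = cvs_info_extractor.extract_cv_info()
166 | print(extracted_info_df.head())
167 | ```
168 | 
169 | The keyword arguments above are exactly the ones main.py passes from the command line; the only difference is that the returned dataframe is kept in memory for further inspection instead of only being written to disk.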
--------------------------------------------------------------------------------
/ResumeGPT/ChatGPT_Pipeline.py:
--------------------------------------------------------------------------------
1 | # import modules
2 | import os
3 | import pandas as pd
4 | import openai
5 | from openai import InvalidRequestError
6 | import time
7 | import json
8 | from json import JSONDecodeError
9 | from tqdm import tqdm
10 | # add a progress bar to pandas operations
11 | tqdm.pandas(desc='CVs')
12 | 
13 | # define the path to the output CSV file
14 | output_csv_file_path = '../Output/CVs_Info_Extracted.csv'
15 | 
16 | # define the path to the output Excel file
17 | output_excel_file_path = '../Output/CVs_Info_Extracted.xlsx'
18 | 
19 | 
20 | # define a class to extract CV information
21 | class CVsInfoExtractor:
22 |     # define a constructor that initializes the class with a DataFrame of CVs
23 |     def __init__(self, cvs_df, openai_api_key, desired_positions):
24 |         self.cvs_df = cvs_df
25 | 
26 |         # open the prompt file in read mode and read its contents into a variable
27 |         with open('../Engineered_Prompt/Prompt.txt', 'r') as file:
28 |             self.prompt = file.read()
29 | 
30 |         # Join the desired positions into a comma-separated string
31 |         suitable_positions_str = "(" + ", ".join(desired_positions) + ")"
32 | 
33 |         # Replace the placeholder in the prompt with the formatted suitable positions string
34 |         self.prompt = self.prompt.replace('(suitable position for the candidate)', suitable_positions_str)
35 | 
36 | 
37 |         # set the OpenAI API key
38 |         openai.api_key = openai_api_key
39 | 
40 | 
41 |     # define internal function to call GPT for CV info extraction
42 |     def _call_gpt_for_cv_info_extraction(self, prompt, cv_content, model, temperature = 0):
43 | 
44 |         # create a dict of parameters for the ChatCompletion API
45 |         completion_params = {
46 |             'model': model,
47 |             'messages': [{"role": "system", "content": prompt},
48 |                          {"role": "user", "content": cv_content}],
49 |             'temperature': temperature}
50 | 
51 |         # send a request to the ChatCompletion API and store the response
52 |         response = openai.ChatCompletion.create(**completion_params)
53 |         # if the response contains at least one choice, extract the message content
54 |         if 'choices' in response and len(response.choices) > 0:
55 |             cleaned_response = response['choices'][0]['message']['content']
56 |             try:
57 |                 # try to convert the message content to a JSON object
58 |                 json_response = json.loads(cleaned_response)
59 |             except JSONDecodeError:
60 |                 # if the conversion fails, set the JSON response to None
61 |                 json_response = None
62 |         else:
63 |             # if the response contains no choices, set the JSON response to None
64 |             json_response = None
65 | 
66 |         # return the JSON response
67 |         return json_response
68 | 
69 | 
70 |     # Defines internal function to normalize a JSON response from GPT
71 |     def _normalize_gpt_json_response(self, CV_Filename, json_response):
72 | 
73 |         # Creates a DataFrame with one column "CV_Filename" whose value is the given CV filename
74 |         CV_Filename_df = pd.DataFrame([CV_Filename], columns = ['CV_Filename'])
75 | 
76 |         # Creates a DataFrame with one column "All_Info_JSON" whose value is the raw JSON response
77 |         df_CV_Info_Json = pd.DataFrame([[json_response]], columns = ['All_Info_JSON'])
78 | 
79 |         # Normalize the JSON response, flattening it into a table
80 |         df_CV_Info_Json_normalized = pd.json_normalize(json_response)
81 | 
82 |         # Concatenates the three DataFrames along the columns axis
83 |         df = pd.concat([CV_Filename_df, df_CV_Info_Json_normalized, df_CV_Info_Json], axis=1)
84 | 
85 |         # Returns the final DataFrame
86 |         return df
87 | 
88 | 
89 |     # Defines internal function to write the DataFrame into a CSV file
90 |     def _write_response_to_file(self, df):
91 | 
92 |         # Checks if the output CSV file already exists
93 |         if os.path.isfile(output_csv_file_path):
94 |             # If the file exists, append the DataFrame into the CSV file without writing headers
95 |             df.to_csv(output_csv_file_path, mode='a', index=False, header=False)
96 |         else:
97 |             # If the file doesn't exist, write the DataFrame into a new CSV file
98 |             df.to_csv(output_csv_file_path, mode='w', index=False)
99 | 
100 | 
101 |     # Define the internal function _gpt_pipeline
102 |     def _gpt_pipeline(self, row, model = 'gpt-3.5-turbo'):
103 | 
104 |         # Retrieve the CV Filename and Content from the given row
105 |         CV_Filename = row['CV_Filename']
106 |         CV_Content = row['CV_Content']
107 | 
108 |         # Sleep for 5 seconds to delay the next operation
109 |         time.sleep(5)
110 | 
111 |         try:
112 |             # Print status message indicating GPT is being called for CV info extraction
113 |             print('Calling GPT For CV Info Extraction...')
114 | 
115 |             # Call the GPT model for CV information extraction
116 |             json_response = self._call_gpt_for_cv_info_extraction(prompt=self.prompt, cv_content=CV_Content, model=model)
117 | 
118 |             # Print status message indicating normalization of GPT response
119 |             print('Normalizing GPT Response...')
120 | 
121 |             # Normalize the GPT JSON response
122 |             df = self._normalize_gpt_json_response(CV_Filename, json_response)
123 | 
124 |             # Print status message indicating that the results are being appended to the CSV file
125 |             print('Appending Results To The CSV File...')
126 | 
127 |             # Write the normalized response to a file
128 |             self._write_response_to_file(df)
129 | 
130 |             # Print a line for clarity in the output
131 |             print('----------------------------------------------')
132 | 
133 |             # Return the GPT JSON response
134 |             return json_response
135 | 
136 |         # Catch an exception when the tokens don't fit in the chosen GPT model
137 |         except InvalidRequestError as e:
138 |             # Print the error that occurred
139 |             print('An Error Occurred:', str(e))
140 | 
141 |             # Print status message indicating that gpt-4 is being called instead
142 |             print("Tokens don't fit gpt-3.5-turbo, calling gpt-4...")
143 | 
144 |             # Retry the pipeline with the gpt-4 model
145 |             return self._gpt_pipeline(row, model = 'gpt-4')
146 | 
147 | 
148 |     # Define the internal function _write_final_results_to_excel
149 |     def _write_final_results_to_excel(self):
150 |         # Load the CSV file into a pandas DataFrame
151 |         df_to_excel = pd.read_csv(output_csv_file_path)
152 | 
153 |         # Write the DataFrame to an Excel file
154 |         df_to_excel.to_excel(output_excel_file_path)
155 | 
156 |         # Return the DataFrame
157 |         return df_to_excel
158 | 
159 | 
160 |     # Define the main function extract_cv_info
161 |     def extract_cv_info(self):
162 |         # Print a status message indicating the start of the ResumeGPT Pipeline
163 |         print('---- Executing ResumeGPT Pipeline ----')
164 |         print('----------------------------------------------')
165 | 
166 |         # Apply the _gpt_pipeline function to each row in the cvs_df DataFrame
167 |         self.cvs_df['CV_Info_Json'] = self.cvs_df.progress_apply(self._gpt_pipeline, axis=1)
168 | 
169 |         # Print a status message indicating the completion of the extraction
170 |         print('Extraction Completed!')
171 | 
172 |         # Print a status message indicating that results are being saved to Excel
173 |         print('Saving Results to Excel...')
174 | 
175 |         # Write the final results to an Excel file
176 |         final_df = self._write_final_results_to_excel()
177 | 
178 |         # Return the final DataFrame
179 |         return final_df
180 | 
--------------------------------------------------------------------------------