├── .devcontainer
│   └── devcontainer.json
├── Internship_extractor
│   ├── agents.py
│   ├── final.json
│   ├── main.py
│   ├── requirements.txt
│   └── tasks.py
├── LICENSE.txt
├── README.md
├── internships_dataset.json
├── main.py
├── requirements.txt
├── resume_temp.json
└── tools.py

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
{
    "name": "Python 3",
    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
    "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
    "customizations": {
        "codespaces": {
            "openFiles": [
                "README.md",
                "main.py"
            ]
        },
        "vscode": {
            "settings": {},
            "extensions": [
                "ms-python.python",
                "ms-python.vscode-pylance"
            ]
        }
    },
    "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt"
}

--------------------------------------------------------------------------------
/Internship_extractor/agents.py:
--------------------------------------------------------------------------------
from crewai import Agent
from crewai_tools import WebsiteSearchTool, FileReadTool

# Commented-out sketch of a custom CSV reader tool (see csv_reader_tool below):
# def _run(self, csv_file_path: str) -> str:
#     try:
#         # Implementation to read CSV file row by row with explicit encoding
#         with open(csv_file_path, 'r', encoding='utf-8') as file:
#             for line in file:
#                 # Process each row as needed
#                 print(line)
#         return "CSV file read successfully"
#     except UnicodeDecodeError as e:
#         return f"Error reading CSV file: {e}"

# Define tools
web_search_tool = WebsiteSearchTool()
file_read_tool = FileReadTool(
    file_path='input.json',
    description='A tool to read the internship job description file.'
)
# csv_reader_tool = CSVReaderTool(file_path='internships.csv')

class Agents:
    def research_agent(self):
        return Agent(
            role='Research Analyst',
            goal='Analyze the internship details and provided descriptions to extract a complete summary of the company job/internship posting.',
            tools=[web_search_tool, file_read_tool],
            backstory='Expert in analyzing internship descriptions and identifying key values and needs from various sources.',
            verbose=True
        )

    def writer_agent(self):
        return Agent(
            role='Job Description Writer',
            goal='Use insights from the Research Analyst to create a detailed, engaging, and enticing internship posting.',
            tools=[web_search_tool, file_read_tool],
            backstory="Skilled in crafting compelling internship descriptions that attract the right candidates.",
            verbose=True
        )

--------------------------------------------------------------------------------
/Internship_extractor/final.json:
--------------------------------------------------------------------------------
{
    "name": "",
    "company": "",
    "apply_link": "",
    "date_published": "",
    "country": "",
    "city": "",
    "skills": [],
    "degree": "",
    "field": [],
    "experience": [],
    "summary": ""
}

--------------------------------------------------------------------------------
/Internship_extractor/main.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
from crewai import Crew
from tasks import Tasks
from agents import Agents
import json
import pandas as pd
import time
import os

load_dotenv()
tasks = Tasks()
agents = Agents()

csv_file = 'internships.csv'
df = pd.read_csv(csv_file)

final_output_file = 'internships_dataset.json'  # Final output JSON file

# Create or open the final output file in append mode
with open(final_output_file, 'a') as final_output:
    for index, row in df.iterrows():
        company_data = row.to_dict()
        company_json = json.dumps(company_data)

        output_json_file = f'output_{index}.json'  # Per-row output JSON file

        with open(output_json_file, 'w') as json_file:
            json.dump(company_data, json_file)

        research_agent = agents.research_agent()
        review = tasks.extract_info(research_agent, company_json, output_json_file)

        crew = Crew(
            agents=[research_agent],
            tasks=[review]
        )

        # Kick off the process
        result = crew.kickoff()

        # Append the content of the output file to the final output file
        with open(output_json_file, 'r') as output_file:
            final_output.write(output_file.read() + ',\n')

        # Delete the per-row output file
        os.remove(output_json_file)

        time.sleep(5)

--------------------------------------------------------------------------------
/Internship_extractor/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv==1.0.1
crewai==0.14.3

--------------------------------------------------------------------------------
/Internship_extractor/tasks.py:
--------------------------------------------------------------------------------
from textwrap import dedent
from crewai import Task

class Tasks():
    def extract_info(self, agent, json_file, output_file):
        return Task(
            description=dedent(f"""\
                Extract the key information for each company, in order (company, apply_link, date_published, location, country, city, state, roles, skills, eligibility, degree, field, experience, summary, etc.), from this internships.csv record: {json_file}, and write an overall summary of the job posting.
                Also include the most important skills (not basic skills) needed for the job, and make the summary a little longer. Don't write [Here] in apply_link.
                In the experience field, give only the important experience needed for the job; don't make it longer."""),
            expected_output=dedent("""\
                Give output strictly as a JSON object like this:
                {
                    "name": "",
                    "company": "",
                    "apply_link": "",
                    "date_published": "",
                    "country": "",
                    "city": "",
                    "skills": [],
                    "degree": "",
                    "field": [],
                    "experience": [],
                    "summary": ""
                }
                Please give a summary a little longer, of about 60 words."""),
            agent=agent,
            output_file=output_file  # Pass the output file path to the Task
        )

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Muratcan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🔍 Internship Finder

## 📝 Project Overview

JeezAI Internship Finder is an AI-powered web application that connects students with the most compatible internship opportunities. It leverages advanced natural language processing techniques and the DSPy framework from Stanford NLP (Omar Khattab, Herumb Shandilya, Arnav Singhvi) to analyze resumes, generate search queries, and provide detailed match analyses between students' credentials and internship requirements.

![image](https://github.com/JeezAI/DSPy_matchmaking/assets/114735073/50934dc0-0b03-4fc2-946d-21794d1a489a)

## 🔑 Key Features

- **Resume Parsing:** Automated extraction of pertinent information from resumes.
- **Keyword-Based Search Queries:** Generates dynamic queries to find the best internships.
- **Match Analysis:** Compares student profiles with internship requirements for precise matches.
- **Scalable Data Storage:** Uses Weaviate, a vector database, for efficient data storage and retrieval.
- **Streamlit Web Interface:** Provides an easy-to-navigate interface for users to upload resumes and explore internship opportunities.

## 🖥️ Technical Architecture

The Internship Finder is built with a stack that includes DSPy for structured AI programming, Cohere for text processing, and Streamlit for the front end, encapsulated within a modular architectural framework.

### Core Technologies

- **DSPy:** A framework for declarative structured programming in AI. It enables the modularization of NLP tasks into reusable components, enhancing both development efficiency and system scalability. DSPy allows defining declarative language model calls that get compiled into self-improving pipelines, shifting the focus from manual prompt engineering to treating LMs as devices. It introduces concepts like natural-language type signatures, modules, and optimizers to specify transformations, encapsulate prompting techniques, and update LM weights to achieve target metrics. DSPy can be used to build complex pipelines, like multi-hop question answering systems that break down questions, retrieve relevant passages, and synthesize answers. The DSPy compiler enables systematic optimization of prompts by running inputs through the pipeline, analyzing traces, and treating prompt engineering as a discrete AI optimization problem.
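To make that concrete, here is a minimal sketch of the DSPy programming style used in this project (a signature plus a module), simplified from the signatures defined later in `main.py`; it assumes an LM has already been configured via `dspy.settings.configure`:

```python
import dspy

# A natural-language "type signature": declare what goes in and what comes out,
# and let DSPy compile the actual prompt text.
class GenerateQuery(dspy.Signature):
    """Produce a keyword search query for an internship database from a resume."""
    context = dspy.InputField(desc="resume")
    query = dspy.OutputField(desc="query in simple string format")

# A module wraps a prompting technique (here, chain-of-thought reasoning)
# around the signature and can later be optimized by DSPy rather than hand-tuned.
generate_query = dspy.ChainOfThought(GenerateQuery)

# prediction = generate_query(context=resume_text)
# prediction.query now holds the generated search string
```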
![image](https://github.com/JeezAI/DSPy_matchmaking/assets/114735073/59ebbbdb-d382-4422-a530-361b112b8eb5)

Some exciting applications mentioned include:
- Generating structured outputs with type predictors, JSON templates, and retry logic
- Building self-correcting pipelines with DSPy assertions and custom guard rails
- Optimizing Chain-of-Thought reasoning for complex question answering
- Integrating with retrieval systems and APIs for grounded language modeling

You can learn more about the DSPy framework by visiting the Stanford NLP GitHub repo here [https://github.com/stanfordnlp/dspy].

Overall, DSPy provides an impressive framework for building reliable, optimized, and complex language model applications in a more automated and scalable way than manual prompt engineering. We're excited to try it out for our matchmaking projects.

We also use:
- **Cohere:** Utilized for its newest and most powerful model, Command R+, in generating search queries and performing deep linguistic analyses. [https://docs.cohere.com/docs/command-r-plus]
- **Weaviate:** Chosen for its vector search capabilities, allowing quick retrieval of internship opportunities from large datasets. We use Weaviate hybrid search, which combines multiple search algorithms to improve the accuracy and relevance of results: each hybrid query runs both keyword matching and semantic vector search to return the most relevant internship opportunities from the database. [https://weaviate.io/]
- **Streamlit:** Facilitates rapid development of interactive web apps, used here to craft the user interface.

### 📚 Directory Structure

- `main.py` - Orchestrates the user interaction and integrates the various modules.
- `tools.py` - Contains utility functions and custom methods for data processing.
- `resume_temp.json` - Template for standardizing the resume data format.
- `Internship_extractor/` - Crew AI agents and tasks that build the internship dataset.

## 🤖 DSPy Integration

DSPy is instrumental in building the core functionality of the Internship Finder: as a framework, it facilitates the creation of modular, reusable components for natural language processing tasks. The implementation in our application is outlined as follows:

### Initialization and Configuration
- DSPy modules `dspy` and `dsp`, along with `WeaviateRM` (Weaviate Retrieval Model), are imported for the foundational setup.
- The connection to the Weaviate database is established using configurations like cluster URL, API key, and timeouts.
- DSPy settings are configured to integrate the language model (Cohere) and the retrieval model (WeaviateRM) for smooth operation.

### Defining the Internship Finder Module
- A custom DSPy module, `Internship_finder`, derived from `dspy.Module`, encapsulates the logic for internship matching.
- Within this module:
  - `generate_query`: Employs `dspy.ChainOfThought` to instantiate the signature for dynamic query generation.
  - `generate_analysis`: Utilizes `dspy.Predict` for conducting a thorough match analysis.

### Defining Signatures
- Two DSPy signatures are crafted (see the sketch after this list):
  - `generate_analysis`: Accepts internship context and resume to output structured matches in JSON format.
  - `generate_query`: Analyzes resumes to produce a targeted search query for the Weaviate database.
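The condensed sketch below, adapted from the implementation in `main.py`, previews how the module wires these two signatures together in the matching process described next; `generate_query`, `generate_analysis`, `search_database`, and `deduplicate` are the signatures and helpers defined in `main.py`:

```python
import dspy

class Internship_finder(dspy.Module):
    def __init__(self):
        super().__init__()
        # Three independent chain-of-thought generators, one per search hop
        self.generate_query = [dspy.ChainOfThought(generate_query) for _ in range(3)]
        self.generate_analysis = dspy.Predict(generate_analysis, max_tokens=4000)

    def forward(self, resume):
        passages = []
        for hop in range(3):
            # Each hop derives a fresh keyword query from the resume...
            query = self.generate_query[hop](context=str(resume)).query
            # ...and runs it against the Weaviate hybrid index.
            passages.append(search_database(query))
        # Drop repeated listings, then analyze the matches that remain.
        context = deduplicate(passages)
        return self.generate_analysis(resume=str(resume), context=context).output
```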
### Internship Matching Process
- The module's `forward` method orchestrates the matching process:
  - Executes query generation through the `generate_query` signature, iterating as needed.
  - Performs a database search for each query, collating results into a list of passages.
  - Deduplicates the list to remove any repeated internship listings.
  - Carries out a detailed match analysis using the `generate_analysis` signature.
- The output is a comprehensive analysis detailing the matched internships.

By adopting DSPy, the Internship Finder application benefits from a structured, maintainable, and extensible framework. It demonstrates the efficient utilization of modules and signatures, streamlining the integration with various models and databases.

## 📊 Data Extraction with Crew AI LLM Agents

Crew AI enhances our data extraction capabilities by automatically pulling structured insights from unstructured internship descriptions. This enriched data supports improved matching accuracy and produces a JSON file for each database record (see `Internship_extractor/`).

## 🚀 Getting Started

### Installation

**Clone the repository:**

```bash
git clone https://github.com/JeezAI/DSPy_matchmaking.git
```

### Environment Setup

Set environment variables for API keys:

```bash
export CO_API_KEY="Your_Cohere_API_Key"
export WCS_API_KEY="Your_Weaviate_API_Key"
export OPENAI_API_KEY="Your_OpenAI_API_Key"
```

### Running the Application

Launch the application with Streamlit:

```bash
streamlit run main.py
```

Visit http://localhost:8501 in your web browser to interact with the application.

### Future Enhancements

- Data Source Expansion: Link to more databases for a broader internship selection.
- Personalized Recommendations: Adapt search results based on individual career aspirations and feedback.
- Interactive User Feedback: Use collaborative filtering to refine matching algorithms based on user interactions.
- Real-time Notifications: Implement a system to notify users of new opportunities.
- Integrate with LinkedIn Analyzer and Career Roadmap Planner [https://github.com/JeezAI/careerbuilder_Linkedin2CareerRoadmap]

### 📝 Conclusion

The Internship Finder exemplifies the powerful combination of DSPy for structured AI development and Cohere for sophisticated text analysis, providing a robust solution for internship matching. This platform not only streamlines the search process but also offers a scalable framework for future enhancements.

### 🤝 Contribution Guidelines

We welcome contributions from the community. Please read our contribution guidelines to learn how you can help improve the Internship Finder.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import dspy
from dspy import dsp
import os
from dspy.retrieve.weaviate_rm import WeaviateRM
from weaviate.classes.init import AdditionalConfig, Timeout
import weaviate
import json
import streamlit as st
from typing import Optional
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl
from tools import company_url, resume_into_json
import nltk
from PyPDF2 import PdfReader

co_api_key = os.getenv("CO_API_KEY")
nltk.download('punkt')

# Weaviate client configuration
url = "https://internship-finder-52en6hka.weaviate.network"
apikey = os.getenv("WCS_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Connect to Weaviate
weaviate_client = weaviate.connect_to_wcs(
    cluster_url=url,
    auth_credentials=weaviate.auth.AuthApiKey(apikey),
    headers={
        "X-OpenAI-Api-Key": openai_api_key
    },
    additional_config=AdditionalConfig(
        timeout=Timeout(init=2, query=45, insert=120)  # Values in seconds
    )
)

cohere = dsp.Cohere(model='command-r-plus', api_key=co_api_key)

retriever_model = WeaviateRM("Internship", weaviate_client=weaviate_client)

dspy.settings.configure(lm=cohere, rm=retriever_model)

# Streamlit UI
st.title("Internship Finder")
my_bar = st.progress(0)

class JobListing(BaseModel):
    city: str
    date_published: datetime  # Assuming the date can be parsed into a datetime object
    apply_link: HttpUrl  # Pydantic will validate this is a valid URL
    company: str
    location: Optional[str]  # 'location' could be a string or None
    country: str
    name: str

class Out_Internship(BaseModel):
    output: list[JobListing] = Field(description="list of internships")

def search_database(query):
    url = "https://internship-finder-52en6hka.weaviate.network"
    apikey = os.getenv("WCS_API_KEY")
    openai_api_key = os.getenv("OPENAI_API_KEY")

    # Connect to Weaviate (a fresh connection per query, closed below)
    weaviate_client = weaviate.connect_to_wcs(
        cluster_url=url,
        auth_credentials=weaviate.auth.AuthApiKey(apikey),
        headers={
            "X-OpenAI-Api-Key": openai_api_key
        }
    )
    questions = weaviate_client.collections.get("Internship")

    # Hybrid search combines keyword matching with semantic vector search
    response = questions.query.hybrid(
        query=query,
        limit=10
    )

    # Collect the matched internships
    interns = []
    for item in response.objects:
        interns.append(item.properties)

    context = json.dumps(interns)
    weaviate_client.close()
    return json.loads(context)

def check_resume(resume):
    if resume is None:
        return False

    pdf_reader = PdfReader(resume)
    text = ""
    for page in pdf_reader.pages:
        # Extract text from each page
        text += page.extract_text()

    tokens = nltk.word_tokenize(text)

    # Reject resumes whose total character count exceeds the limit
    total_length = sum(len(token) for token in tokens)
    if total_length >= 16000:
        return False

    tokens_to_check = ["summary", "skills", "experience", "projects", "education"]
    # Convert tokens to lower case for case-insensitive comparison
    text_tokens_lower = [token.lower() for token in tokens]

    # Check whether any of the expected resume sections appear in the tokenized text
    tokens_found = [token for token in tokens_to_check if token.lower() in text_tokens_lower]

    # Return False if none of the expected sections were found, True otherwise
    return bool(tokens_found)

class Internship_finder(dspy.Module):
    cohere = dsp.Cohere(model='command-r-plus', api_key=co_api_key)
    dspy.settings.configure(lm=cohere)

    def __init__(self):
        super().__init__()
        self.generate_query = [dspy.ChainOfThought(generate_query) for _ in range(3)]
        self.generate_analysis = dspy.Predict(generate_analysis, max_tokens=4000)

    def forward(self, resume):
        # The resume is passed as context for query generation
        passages = []

        for hop in range(3):
            query = self.generate_query[hop](context=str(resume)).query
            info = search_database(query)
            passages.append(info)

        context = deduplicate(passages)
        my_bar.progress(60, text="Doing Analysis")

        analysis = self.generate_analysis(resume=str(resume), context=context).output

        return analysis

def deduplicate(context):
    """
    Removes duplicate internships from the collected query results
    while preserving order.

    Parameters:
    context (list): List of per-query result lists.

    Returns:
    list: Flat list of internships with duplicates removed.
    """
    # Each element of `context` is one query's result list, so flatten first
    flattened = [item for results in context for item in results]

    # Serialize each internship so identical listings compare equal, and use
    # dict.fromkeys to drop duplicates while preserving insertion order
    json_strings = [json.dumps(d, sort_keys=True) for d in flattened]
    unique_json_strings = dict.fromkeys(json_strings)

    # Convert the JSON strings back to dictionaries
    unique_dicts = [json.loads(s) for s in unique_json_strings]
    return unique_dicts

def check_answer(assessment_answer):
    if assessment_answer == "no":
        return False
    return True

def get_resume():
    with open('resume.json', 'r') as file:
        resume = json.load(file)
    return resume

class generate_analysis(dspy.Signature):
    """
    Your Role:
    You are a Matchmaking Manager, an expert at connecting students with their ideal internship opportunities.

    Input:
    You will be provided with a student's resume and a list of potential internship opportunities. Your task is to carefully analyze and match the student's credentials with the requirements of each internship, following the specific criteria outlined below.

    Matching Criteria:

    Educational Background:
    Degree Level and Major: Seek exact matches or close alignments between the student's degree level (bachelor's, master's, etc.) and major and the educational requirements specified in the internships.
    Related Fields of Study: Consider closely related fields of study as a potential match. For example, a student majoring in Computer Science could be a good fit for internships seeking IT or Software Engineering majors.
    Relevant Coursework: Give bonus points to internships that specifically mention or prefer certain courses that the student has completed. For example, if an internship seeks candidates with a background in Data Structures and the student has taken an advanced course in that area, it strengthens the match.
    Skill and Experience Match:
    Required Skills: Look for strong overlaps between the technical skills listed on the student's resume and the required skills outlined in the internship descriptions.
    Tools and Frameworks: Prioritize internships that specifically mention tools, programming languages, or frameworks that the student has hands-on experience with. For example, if an internship seeks proficiency in Python and the student has worked on Python projects, it is a strong match.
    Applied Skills: Value projects or previous work experiences that demonstrate the practical application of the required skills. For instance, if an internship seeks candidates with web development skills and the student has built and deployed websites, it is a clear indication of a good fit.

    Project Relevance:
    Project Experience: Analyze the student's project portfolio to identify technical skills and areas of expertise that align with the internships' requirements.
    AI/ML and Data Focus: Match internships that specifically seek experience or interest in AI/ML model development, data analysis, or similar areas. Look for keywords like "machine learning," "data engineering," or "data-driven solutions" in the internship descriptions.
    Ensure that the internships do not include "research" in their titles, skills, or descriptions.
    Practical Implementation: Prioritize internships that emphasize hands-on experience in development, engineering, application development, or implementation roles over theoretical or research-focused roles.

    For Match Analysis: Do a detailed match analysis for each internship, explaining how the resume matches the internship. Provide a brief summary of the match analysis for each internship.

    For Output:
    Use the following JSON array format to provide the top-matched internships in a single array:

    [
        {
            "name": "Job Title",
            "company": "Company Name",
            "apply_link": "Application Link",
            "match_analysis": "Detailed match analysis here. Explain how the student's background matches the internship requirements, using specific examples."
        },
        {
            "name": "Another Job Title",
            "company": "Another Company",
            "apply_link": "Application Link",
            "match_analysis": "Provide a detailed match analysis for this internship opportunity as well, highlighting relevant matches."
        }
    ]

    If there are no suitable matches, return None. Ensure that no additional words or JSON annotations are included outside the code block.
    """

    context = dspy.InputField(desc="Internships")
    resume = dspy.InputField(desc="resume")
    output = dspy.OutputField(desc="list of internships", type=list[JobListing])

class generate_query(dspy.Signature):
    """
    Generate a query for hybrid search in the Weaviate database by following these rules:
    1. Analyze the resume and extract keywords from skills, education, experience, and projects.
    2. Use those keywords to generate a query to search the Weaviate database.
    3. The query should be keyword-based, to find the best internships for the resume.
    """

    context = dspy.InputField(desc="Resume")
    query = dspy.OutputField(desc="query in simple string format")

def main():
    file = st.file_uploader("Upload Resume to get started", type=["pdf"])
    my_bar.progress(0, text="Starting...")

    if file is not None:
        msg = st.toast("Resume Uploaded")
        if check_resume(file):
            with st.status("Extracting Details from Resume"):
                resume = resume_into_json(file)
                st.write(resume)

            analysis = Internship_finder()

            my_bar.progress(30, text="Finding Internships")

            generate = analysis(resume)
            print(generate)

            # The LM returns the literal string "None" when nothing matches
            if generate != "None":
                st.subheader("List of Internships:")
                col_company, col_url = st.columns([2, 6])
                interns = json.loads(generate)
                my_bar.progress(100, "Internships Found !!")

                with col_company:
                    for intern in interns:
                        st.link_button(intern["company"], company_url(intern["company"]))

                with col_url:
                    for intern in interns:
                        st.link_button(intern["name"], intern["apply_link"])
                        with st.status("Match Analysis"):
                            st.write(intern["match_analysis"])

            else:
                my_bar.progress(100, "Sorry, No Internships Found for you !!")
                st.write("We are adding more internships every day, please check back later.")

        else:
            st.warning("Invalid File Uploaded !!")
            my_bar.progress(0, text="Invalid File Uploaded")

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/stanfordnlp/dspy.git
weaviate-client==4.5.4
streamlit==1.32.2
pydantic
PyPDF2
nltk
cohere

--------------------------------------------------------------------------------
/resume_temp.json:
--------------------------------------------------------------------------------
{
    "links": [{}],
    "Summary": "",
    "Education": [
        {
            "Institution": "",
            "Degree": "",
            "FieldOfStudy": "",
            "StartDate": "",
            "EndDate": "",
            "Achievements": []
        }
    ],
    "Skills": [{}],
    "Projects": [
        {
            "ProjectName": "",
            "StartDate": "",
            "EndDate": "",
            "Responsibilities": []
        }
    ],
    "Experience": [
        {
            "CompanyName": "",
            "Role": "",
            "Location": "",
            "StartDate": "",
            "EndDate": "",
            "Responsibilities": []
        }
    ],
    "Activities": [],
    "Interests": [],
    "AdditionalInformation": ""
}

--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
import json
from PyPDF2 import PdfReader
import os
import cohere
import requests

# Load the resume template that defines the expected JSON shape
with open("resume_temp.json") as f:
    file = json.load(f)

temp = json.dumps(file)

def resume_into_json(resume):
    cohere_api_key = os.getenv("CO_API_KEY")
    co = cohere.Client(cohere_api_key)

    # Extract the raw text from the uploaded PDF
    pdf_reader = PdfReader(resume)
    text = ""
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

    prompt = f"Act as a master at extracting data from resumes. Don't give any explanation. Please analyze and convert the resume data from this {text} into JSON, remove data like name, email, or other personal information, and return only the JSON file."
    response = co.generate(
        model='command-r-plus',
        prompt=prompt,
        max_tokens=10000,
        num_generations=1,
        temperature=0.2
    )

    return json.loads(response.generations[0].text)

def company_url(company):
    # Special case: this company's slug does not follow the generic pattern
    if company == "Astranis":
        return "https://www.jeezai.com/companies/astranis-space-technologies"

    company = company.lower().replace(" ", "-")

    return f"https://www.jeezai.com/companies/{company}/"

def get_company_info(company):
    data = requests.post(
        "https://advanced-research-agents.onrender.com",
        json={
            "query": company,
        }
    )
    return data.json()
--------------------------------------------------------------------------------