├── src ├── notebook │ ├── CV.pdf │ ├── skills_extraction.py │ ├── url_data_scientist_loc_bangalore.json │ ├── skills.csv │ ├── jd_data_extraction.ipynb │ └── job_recommendation.ipynb ├── components │ ├── job_recommender.py │ ├── jd_data_cleaner.py │ └── jd_data_extractor.py └── data │ ├── url_data_scientist_loc_bangalore.json │ └── skills.csv ├── myenv ├── Scripts │ ├── f2py.exe │ ├── ftfy.exe │ ├── nltk.exe │ ├── pip.exe │ ├── pip3.exe │ ├── tqdm.exe │ ├── ttx.exe │ ├── ipython.exe │ ├── jupyter.exe │ ├── pathy.exe │ ├── pip3.10.exe │ ├── python.exe │ ├── pythonw.exe │ ├── spacy.exe │ ├── wheel.exe │ ├── fonttools.exe │ ├── ipython3.exe │ ├── pyftmerge.exe │ ├── python_d.exe │ ├── pythonw_d.exe │ ├── streamlit.exe │ ├── watchmedo.exe │ ├── chardetect.exe │ ├── jsonschema.exe │ ├── jupyter-run.exe │ ├── markdown-it.exe │ ├── normalizer.exe │ ├── pyftsubset.exe │ ├── pygmentize.exe │ ├── pyresparser.exe │ ├── jupyter-kernel.exe │ ├── jupyter-migrate.exe │ ├── jupyter-kernelspec.exe │ ├── jupyter-troubleshoot.exe │ ├── docx2txt │ ├── deactivate.bat │ ├── streamlit.cmd │ ├── activate.bat │ ├── activate │ ├── plac_runner.py │ ├── pywin32_testall.py │ ├── pdf2txt.py │ └── dumppdf.py ├── pyvenv.cfg ├── etc │ └── jupyter │ │ └── nbconfig │ │ └── notebook.d │ │ └── pydeck.json ├── docx-template │ ├── docProps │ │ └── thumbnail.jpeg │ ├── _rels │ │ └── .rels │ └── word │ │ ├── settings.xml │ │ ├── fontTable.xml │ │ ├── styles.xml │ │ ├── theme │ │ └── theme1.xml │ │ └── numbering.xml └── share │ ├── jupyter │ ├── kernels │ │ └── python3 │ │ │ ├── logo-32x32.png │ │ │ ├── logo-64x64.png │ │ │ ├── kernel.json │ │ │ └── logo-svg.svg │ └── nbextensions │ │ └── pydeck │ │ └── extensionRequires.js │ └── man │ └── man1 │ ├── ipython.1 │ └── ttx.1 ├── utilities └── resumes │ └── CV.pdf ├── chromedriver_win32 └── chromedriver.exe ├── requirements.txt ├── setup.py ├── __init__.py ├── .gitignore └── README.md /src/notebook/CV.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/src/notebook/CV.pdf -------------------------------------------------------------------------------- /myenv/Scripts/f2py.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/f2py.exe -------------------------------------------------------------------------------- /myenv/Scripts/ftfy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ftfy.exe -------------------------------------------------------------------------------- /myenv/Scripts/nltk.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/nltk.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip3.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip3.exe -------------------------------------------------------------------------------- /myenv/Scripts/tqdm.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/tqdm.exe -------------------------------------------------------------------------------- 
/myenv/Scripts/ttx.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ttx.exe -------------------------------------------------------------------------------- /myenv/pyvenv.cfg: -------------------------------------------------------------------------------- 1 | home = C:\Program Files\Python310 2 | include-system-site-packages = false 3 | version = 3.10.10 4 | -------------------------------------------------------------------------------- /myenv/Scripts/ipython.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ipython.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter.exe -------------------------------------------------------------------------------- /myenv/Scripts/pathy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pathy.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip3.10.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip3.10.exe -------------------------------------------------------------------------------- /myenv/Scripts/python.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/python.exe 
-------------------------------------------------------------------------------- /myenv/Scripts/pythonw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pythonw.exe -------------------------------------------------------------------------------- /myenv/Scripts/spacy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/spacy.exe -------------------------------------------------------------------------------- /myenv/Scripts/wheel.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/wheel.exe -------------------------------------------------------------------------------- /utilities/resumes/CV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/utilities/resumes/CV.pdf -------------------------------------------------------------------------------- /myenv/Scripts/fonttools.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/fonttools.exe -------------------------------------------------------------------------------- /myenv/Scripts/ipython3.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ipython3.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyftmerge.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyftmerge.exe -------------------------------------------------------------------------------- /myenv/Scripts/python_d.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/python_d.exe -------------------------------------------------------------------------------- /myenv/Scripts/pythonw_d.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pythonw_d.exe -------------------------------------------------------------------------------- /myenv/Scripts/streamlit.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/streamlit.exe -------------------------------------------------------------------------------- /myenv/Scripts/watchmedo.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/watchmedo.exe -------------------------------------------------------------------------------- /myenv/Scripts/chardetect.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/chardetect.exe -------------------------------------------------------------------------------- /myenv/Scripts/jsonschema.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jsonschema.exe -------------------------------------------------------------------------------- 
/myenv/Scripts/jupyter-run.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-run.exe -------------------------------------------------------------------------------- /myenv/Scripts/markdown-it.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/markdown-it.exe -------------------------------------------------------------------------------- /myenv/Scripts/normalizer.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/normalizer.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyftsubset.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyftsubset.exe -------------------------------------------------------------------------------- /myenv/Scripts/pygmentize.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pygmentize.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyresparser.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyresparser.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-kernel.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-kernel.exe -------------------------------------------------------------------------------- /myenv/etc/jupyter/nbconfig/notebook.d/pydeck.json: -------------------------------------------------------------------------------- 1 | { 2 | "load_extensions": { 3 | "pydeck/extension": true 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /chromedriver_win32/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/chromedriver_win32/chromedriver.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-migrate.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-migrate.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-kernelspec.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-kernelspec.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-troubleshoot.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-troubleshoot.exe -------------------------------------------------------------------------------- /myenv/docx-template/docProps/thumbnail.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/docx-template/docProps/thumbnail.jpeg 
-------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/share/jupyter/kernels/python3/logo-32x32.png -------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-64x64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/share/jupyter/kernels/python3/logo-64x64.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | seaborn 4 | matplotlib 5 | selenium 6 | tqdm 7 | bs4 8 | regex 9 | docx 10 | nltk 11 | spacy 12 | python-docx 13 | scikit-learn 14 | ftfy 15 | PyPDF2 16 | pyresparser 17 | streamlit 18 | -e . 
-------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "python", 4 | "-m", 5 | "ipykernel_launcher", 6 | "-f", 7 | "{connection_file}" 8 | ], 9 | "display_name": "Python 3 (ipykernel)", 10 | "language": "python", 11 | "metadata": { 12 | "debugger": true 13 | } 14 | } -------------------------------------------------------------------------------- /myenv/Scripts/docx2txt: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | 3 | import docx2txt 4 | 5 | if __name__ == '__main__': 6 | import sys 7 | args = docx2txt.process_args() 8 | text = docx2txt.process(args.docx, args.img_dir) 9 | output = getattr(sys.stdout, 'buffer', sys.stdout) 10 | output.write(text.encode('utf-8')) 11 | -------------------------------------------------------------------------------- /myenv/share/jupyter/nbextensions/pydeck/extensionRequires.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | define(function() { 3 | 'use strict'; 4 | requirejs.config({ 5 | map: { 6 | '*': { 7 | '@deck.gl/jupyter-widget': 'nbextensions/pydeck/index' 8 | } 9 | } 10 | }); 11 | // Export the required load_ipython_extension function 12 | return { 13 | load_ipython_extension: function() {} 14 | }; 15 | }); 16 | -------------------------------------------------------------------------------- /myenv/Scripts/deactivate.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if defined _OLD_VIRTUAL_PROMPT ( 4 | set "PROMPT=%_OLD_VIRTUAL_PROMPT%" 5 | ) 6 | set _OLD_VIRTUAL_PROMPT= 7 | 8 | if defined _OLD_VIRTUAL_PYTHONHOME ( 9 | set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%" 10 | set 
_OLD_VIRTUAL_PYTHONHOME= 11 | ) 12 | 13 | if defined _OLD_VIRTUAL_PATH ( 14 | set "PATH=%_OLD_VIRTUAL_PATH%" 15 | ) 16 | 17 | set _OLD_VIRTUAL_PATH= 18 | 19 | set VIRTUAL_ENV= 20 | set VIRTUAL_ENV_PROMPT= 21 | 22 | :END 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages,setup 2 | 3 | HYPHEN_DOT_E='-e .' 4 | def get_requirements(file_path:str)->list[str]: 5 | requirements=[] 6 | with open(file_path) as file_obj: 7 | requirements=file_obj.readlines() 8 | requirements=[req.replace("\n"," ") for req in requirements] 9 | if HYPHEN_DOT_E in requirements: 10 | requirements.remove(HYPHEN_DOT_E) 11 | return requirements 12 | 13 | 14 | 15 | setup( 16 | name='Job_Recommendation_System', 17 | version='0.0.1', 18 | author='Abbas Behrainwala', 19 | author_email='abbasbehrain95@gmail.com', 20 | packages=find_packages(), 21 | install_requires=get_requirements('requirements.txt') 22 | 23 | ) -------------------------------------------------------------------------------- /myenv/Scripts/streamlit.cmd: -------------------------------------------------------------------------------- 1 | rem Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | rem 3 | rem Licensed under the Apache License, Version 2.0 (the "License"); 4 | rem you may not use this file except in compliance with the License. 5 | rem You may obtain a copy of the License at 6 | rem 7 | rem http://www.apache.org/licenses/LICENSE-2.0 8 | rem 9 | rem Unless required by applicable law or agreed to in writing, software 10 | rem distributed under the License is distributed on an "AS IS" BASIS, 11 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | rem See the License for the specific language governing permissions and 13 | rem limitations under the License. 
14 | 15 | @echo OFF 16 | python -m streamlit %* 17 | -------------------------------------------------------------------------------- /myenv/docx-template/_rels/.rels: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /myenv/Scripts/activate.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem This file is UTF-8 encoded, so we need to update the current code page while executing it 4 | for /f "tokens=2 delims=:." %%a in ('"%SystemRoot%\System32\chcp.com"') do ( 5 | set _OLD_CODEPAGE=%%a 6 | ) 7 | if defined _OLD_CODEPAGE ( 8 | "%SystemRoot%\System32\chcp.com" 65001 > nul 9 | ) 10 | 11 | set VIRTUAL_ENV=C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv 12 | 13 | if not defined PROMPT set PROMPT=$P$G 14 | 15 | if defined _OLD_VIRTUAL_PROMPT set PROMPT=%_OLD_VIRTUAL_PROMPT% 16 | if defined _OLD_VIRTUAL_PYTHONHOME set PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME% 17 | 18 | set _OLD_VIRTUAL_PROMPT=%PROMPT% 19 | set PROMPT=(myenv) %PROMPT% 20 | 21 | if defined PYTHONHOME set _OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME% 22 | set PYTHONHOME= 23 | 24 | if defined _OLD_VIRTUAL_PATH set PATH=%_OLD_VIRTUAL_PATH% 25 | if not defined _OLD_VIRTUAL_PATH set _OLD_VIRTUAL_PATH=%PATH% 26 | 27 | set PATH=%VIRTUAL_ENV%\Scripts;%PATH% 28 | set VIRTUAL_ENV_PROMPT=(myenv) 29 | 30 | :END 31 | if defined _OLD_CODEPAGE ( 32 | "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul 33 | set _OLD_CODEPAGE= 34 | ) 35 | -------------------------------------------------------------------------------- /src/notebook/skills_extraction.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | import PyPDF2 4 | import os 5 | 6 | # Load the Spacy English model 7 | nlp = 
spacy.load('en_core_web_sm') 8 | import csv 9 | from spacy.matcher import Matcher 10 | import csv 11 | 12 | # Read skills from CSV file 13 | file_path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\skills.csv' 14 | with open(file_path, 'r') as file: 15 | csv_reader = csv.reader(file) 16 | skills = [row for row in csv_reader] 17 | 18 | # Create pattern dictionaries from skills 19 | skill_patterns = [[{'LOWER': skill}] for skill in skills[0]] 20 | 21 | # Create a Matcher object 22 | matcher = Matcher(nlp.vocab) 23 | 24 | # Add skill patterns to the matcher 25 | for pattern in skill_patterns: 26 | matcher.add('Skills', [pattern]) 27 | 28 | # Function to extract skills from text 29 | def extract_skills(text): 30 | doc = nlp(text) 31 | matches = matcher(doc) 32 | skills = set() 33 | for match_id, start, end in matches: 34 | skill = doc[start:end].text 35 | skills.add(skill) 36 | return skills 37 | 38 | # Function to extract text from PDF 39 | def extract_text_from_pdf(file_path:str): 40 | with open(file_path, 'rb') as f: 41 | pdf_reader = PyPDF2.PdfReader(f) 42 | text = '' 43 | for page in pdf_reader.pages: 44 | text += page.extract_text() 45 | return text 46 | 47 | def skills_extractor(file_path): 48 | # Extract text from PDF 49 | path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\notebook' 50 | full_file_path = os.path.join(path, file_path) 51 | resume_text = extract_text_from_pdf(full_file_path) 52 | 53 | # Extract skills from resume text 54 | skills = list(extract_skills(resume_text)) 55 | return skills 56 | 57 | 58 | -------------------------------------------------------------------------------- /myenv/docx-template/word/settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 
37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import PyPDF2 4 | from pyresparser import ResumeParser 5 | from sklearn.neighbors import NearestNeighbors 6 | from src.components.job_recommender import ngrams,getNearestN,jd_df 7 | import src.notebook.skills_extraction as skills_extraction 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | 10 | 11 | # Function to process the resume and recommend jobs 12 | def process_resume(file_path): 13 | # Extract text from PDF resume 14 | resume_skills=skills_extraction.skills_extractor(file_path) 15 | 16 | # Perform job recommendation based on parsed resume data 17 | skills=[] 18 | skills.append(' '.join(word for word in resume_skills)) 19 | 20 | 21 | # Feature Engineering: 22 | vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False) 23 | tfidf = vectorizer.fit_transform(skills) 24 | 25 | 26 | nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) 27 | jd_test = (jd_df['Processed_JD'].values.astype('U')) 28 | 29 | distances, indices = getNearestN(jd_test) 30 | test = list(jd_test) 31 | matches = [] 32 | 33 | for i,j in enumerate(indices): 34 | dist=round(distances[i][0],2) 35 | temp = [dist] 36 | matches.append(temp) 37 | 38 | matches = pd.DataFrame(matches, columns=['Match confidence']) 39 | 40 | # Following recommends Top 5 Jobs based on candidate resume: 41 | jd_df['match']=matches['Match confidence'] 42 | 43 | return jd_df.head(5).sort_values('match') 44 | 45 | # Streamlit app 46 | def main(): 47 | st.title("Job Recommendation App") 48 | st.write("Upload your resume in PDF format") 49 | 50 | # File uploader 51 | uploaded_file = st.file_uploader("Choose a file", type=['pdf']) 52 | 53 | if uploaded_file is not None: 54 | # Process resume and recommend jobs 55 | 
file_path=uploaded_file.name 56 | df_jobs = process_resume(file_path) 57 | 58 | # Display recommended jobs as DataFrame 59 | st.write("Recommended Jobs:") 60 | st.dataframe(df_jobs[['Job Title','Company Name','Location','Industry','Sector','Average Salary']]) 61 | 62 | # Run the Streamlit app 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /myenv/share/man/man1/ipython.1: -------------------------------------------------------------------------------- 1 | .\" Hey, EMACS: -*- nroff -*- 2 | .\" First parameter, NAME, should be all caps 3 | .\" Second parameter, SECTION, should be 1-8, maybe w/ subsection 4 | .\" other parameters are allowed: see man(7), man(1) 5 | .TH IPYTHON 1 "July 15, 2011" 6 | .\" Please adjust this date whenever revising the manpage. 7 | .\" 8 | .\" Some roff macros, for reference: 9 | .\" .nh disable hyphenation 10 | .\" .hy enable hyphenation 11 | .\" .ad l left justify 12 | .\" .ad b justify to both left and right margins 13 | .\" .nf disable filling 14 | .\" .fi enable filling 15 | .\" .br insert line break 16 | .\" .sp insert n+1 empty lines 17 | .\" for manpage-specific macros, see man(7) and groff_man(7) 18 | .\" .SH section heading 19 | .\" .SS secondary section heading 20 | .\" 21 | .\" 22 | .\" To preview this page as plain text: nroff -man ipython.1 23 | .\" 24 | .SH NAME 25 | ipython \- Tools for Interactive Computing in Python. 26 | .SH SYNOPSIS 27 | .B ipython 28 | .RI [ options ] " files" ... 29 | 30 | .B ipython subcommand 31 | .RI [ options ] ... 32 | 33 | .SH DESCRIPTION 34 | An interactive Python shell with automatic history (input and output), dynamic 35 | object introspection, easier configuration, command completion, access to the 36 | system shell, integration with numerical and scientific computing tools, 37 | web notebook, Qt console, and more. 
38 | 39 | For more information on how to use IPython, see 'ipython \-\-help', 40 | or 'ipython \-\-help\-all' for all available command\(hyline options. 41 | 42 | .SH "ENVIRONMENT VARIABLES" 43 | .sp 44 | .PP 45 | \fIIPYTHONDIR\fR 46 | .RS 4 47 | This is the location where IPython stores all its configuration files. The default 48 | is $HOME/.ipython if IPYTHONDIR is not defined. 49 | 50 | You can see the computed value of IPYTHONDIR with `ipython locate`. 51 | 52 | .SH FILES 53 | 54 | IPython uses various configuration files stored in profiles within IPYTHONDIR. 55 | To generate the default configuration files and start configuring IPython, 56 | do 'ipython profile create', and edit '*_config.py' files located in 57 | IPYTHONDIR/profile_default. 58 | 59 | .SH AUTHORS 60 | IPython is written by the IPython Development Team . 61 | -------------------------------------------------------------------------------- /myenv/Scripts/activate: -------------------------------------------------------------------------------- 1 | # This file must be used with "source bin/activate" *from bash* 2 | # you cannot run it directly 3 | 4 | deactivate () { 5 | # reset old environment variables 6 | if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then 7 | PATH="${_OLD_VIRTUAL_PATH:-}" 8 | export PATH 9 | unset _OLD_VIRTUAL_PATH 10 | fi 11 | if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then 12 | PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" 13 | export PYTHONHOME 14 | unset _OLD_VIRTUAL_PYTHONHOME 15 | fi 16 | 17 | # This should detect bash and zsh, which have a hash command that must 18 | # be called to get it to forget past commands. 
Without forgetting 19 | # past commands the $PATH changes we made may not be respected 20 | if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then 21 | hash -r 2> /dev/null 22 | fi 23 | 24 | if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then 25 | PS1="${_OLD_VIRTUAL_PS1:-}" 26 | export PS1 27 | unset _OLD_VIRTUAL_PS1 28 | fi 29 | 30 | unset VIRTUAL_ENV 31 | unset VIRTUAL_ENV_PROMPT 32 | if [ ! "${1:-}" = "nondestructive" ] ; then 33 | # Self destruct! 34 | unset -f deactivate 35 | fi 36 | } 37 | 38 | # unset irrelevant variables 39 | deactivate nondestructive 40 | 41 | VIRTUAL_ENV="C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv" 42 | export VIRTUAL_ENV 43 | 44 | _OLD_VIRTUAL_PATH="$PATH" 45 | PATH="$VIRTUAL_ENV/Scripts:$PATH" 46 | export PATH 47 | 48 | # unset PYTHONHOME if set 49 | # this will fail if PYTHONHOME is set to the empty string (which is bad anyway) 50 | # could use `if (set -u; : $PYTHONHOME) ;` in bash 51 | if [ -n "${PYTHONHOME:-}" ] ; then 52 | _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" 53 | unset PYTHONHOME 54 | fi 55 | 56 | if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then 57 | _OLD_VIRTUAL_PS1="${PS1:-}" 58 | PS1="(myenv) ${PS1:-}" 59 | export PS1 60 | VIRTUAL_ENV_PROMPT="(myenv) " 61 | export VIRTUAL_ENV_PROMPT 62 | fi 63 | 64 | # This should detect bash and zsh, which have a hash command that must 65 | # be called to get it to forget past commands. 
Without forgetting 66 | # past commands the $PATH changes we made may not be respected 67 | if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then 68 | hash -r 2> /dev/null 69 | fi 70 | -------------------------------------------------------------------------------- /myenv/docx-template/word/fontTable.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/components/job_recommender.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ftfy import fix_text 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import re 5 | from sklearn.neighbors import NearestNeighbors 6 | import numpy as np 7 | import pandas as pd 8 | import nltk 9 | from nltk.corpus import stopwords 10 | stopw = set(stopwords.words('english')) 11 | from pyresparser import ResumeParser 12 | import os 13 | from docx import Document 14 | import src.notebook.skills_extraction as skills_extraction 15 | 16 | # Load dataset: 17 | jd_df=pd.read_csv(r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_structured_data.csv') 18 | 19 | # Load the extracted resume skills: 20 | file_path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\utilities\resumes\CV.pdf' 21 | skills=[] 22 | skills.append(' '.join(word for word in skills_extraction.skills_extractor(file_path))) 23 | 24 | def ngrams(string, n=3): 25 | string = fix_text(string) # fix text 26 | string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars 27 | string = string.lower() 28 | chars_to_remove = 
[")","(",".","|","[","]","{","}","'"] 29 | rx = '[' + re.escape(''.join(chars_to_remove)) + ']' 30 | string = re.sub(rx, '', string) 31 | string = string.replace('&', 'and') 32 | string = string.replace(',', ' ') 33 | string = string.replace('-', ' ') 34 | string = string.title() # normalise case - capital at start of each word 35 | string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single 36 | string = ' '+ string +' ' # pad names for ngrams... 37 | string = re.sub(r'[,-./]|\sBD',r'', string) 38 | ngrams = zip(*[string[i:] for i in range(n)]) 39 | return [''.join(ngram) for ngram in ngrams] 40 | 41 | vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False) 42 | tfidf = vectorizer.fit_transform(skills) 43 | 44 | nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) 45 | jd_test = (jd_df['Processed_JD'].values.astype('U')) 46 | 47 | def getNearestN(query): 48 | queryTFIDF_ = vectorizer.transform(query) 49 | distances, indices = nbrs.kneighbors(queryTFIDF_) 50 | return distances, indices 51 | 52 | distances, indices = getNearestN(jd_test) 53 | test = list(jd_test) 54 | matches = [] 55 | 56 | for i,j in enumerate(indices): 57 | dist=round(distances[i][0],2) 58 | 59 | temp = [dist] 60 | matches.append(temp) 61 | 62 | matches = pd.DataFrame(matches, columns=['Match confidence']) 63 | 64 | # Following recommends Top 5 Jobs based on candidate resume: 65 | jd_df['match']=matches['Match confidence'] 66 | jd_df.head(5).sort_values('match') -------------------------------------------------------------------------------- /myenv/Scripts/plac_runner.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | from __future__ import with_statement 3 | import os 4 | import sys 5 | import shlex 6 | import plac 7 | 8 | 9 | def run(fnames, cmd, verbose): 10 | "Run batch scripts and tests" 
11 | for fname in fnames: 12 | with open(fname) as f: 13 | lines = list(f) 14 | if not lines[0].startswith('#!'): 15 | sys.exit('Missing or incorrect shebang line!') 16 | firstline = lines[0][2:] # strip the shebang 17 | init_args = shlex.split(firstline) 18 | tool = plac.import_main(*init_args) 19 | command = getattr(plac.Interpreter(tool), cmd) # doctest or execute 20 | if verbose: 21 | sys.stdout.write('Running %s with %s' % (fname, firstline)) 22 | command(lines[1:], verbose=verbose) 23 | 24 | 25 | @plac.annotations( 26 | verbose=('verbose mode', 'flag', 'v'), 27 | interactive=('run plac tool in interactive mode', 'flag', 'i'), 28 | multiline=('run plac tool in multiline mode', 'flag', 'm'), 29 | serve=('run plac server', 'option', 's', int), 30 | batch=('run plac batch files', 'flag', 'b'), 31 | test=('run plac test files', 'flag', 't'), 32 | fname='script to run (.py or .plac or .placet)', 33 | extra='additional arguments', 34 | ) 35 | def main(verbose, interactive, multiline, serve, batch, test, fname='', 36 | *extra): 37 | "Runner for plac tools, plac batch files and plac tests" 38 | baseparser = plac.parser_from(main) 39 | if not fname: 40 | baseparser.print_help() 41 | elif sys.argv[1] == fname: # script mode 42 | plactool = plac.import_main(fname) 43 | plactool.prog = os.path.basename(sys.argv[0]) + ' ' + fname 44 | out = plac.call(plactool, sys.argv[2:], eager=False) 45 | if plac.iterable(out): 46 | for output in out: 47 | print(output) 48 | else: 49 | print(out) 50 | elif interactive or multiline or serve: 51 | plactool = plac.import_main(fname, *extra) 52 | plactool.prog = '' 53 | i = plac.Interpreter(plactool) 54 | if interactive: 55 | i.interact(verbose=verbose) 56 | elif multiline: 57 | i.multiline(verbose=verbose) 58 | elif serve: 59 | i.start_server(serve) 60 | elif batch: 61 | run((fname,) + extra, 'execute', verbose) 62 | elif test: 63 | run((fname,) + extra, 'doctest', verbose) 64 | print('run %s plac test(s)' % (len(extra) + 1)) 65 | else: 66 
| baseparser.print_usage() 67 | 68 | 69 | main.add_help = False 70 | 71 | if __name__ == '__main__': 72 | plac.call(main) 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /src/components/jd_data_cleaner.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import re 5 | import nltk 6 | from nltk.corpus import stopwords 7 | stopw = set(stopwords.words('english')) 8 | 9 | # Load the dataset: 10 | unstructured_df=pd.read_csv('jd_unstructured_data.csv') 11 | 12 | def convert_salary(value): 13 | if 'Unknown' in value: 14 | return None 15 | elif '-' in value: 16 | values = re.findall(r'\$\d+K', value) 17 | min_value = int(values[0].replace('$', '').replace('K', '')) if values else None 18 | max_value = int(values[1].replace('$', '').replace('K', '')) if len(values) > 1 else None 19 | if min_value and max_value: 20 | return (min_value + max_value) / 2 21 | elif min_value: 22 | return min_value 23 | elif max_value: 24 | return max_value 25 | else: 26 | return None 27 | else: 28 | return int(re.findall(r'\$\d+K', value)[0].replace('$', '').replace('K', '')) 29 | 30 | def convert_revenue(value): 31 | if 'Unknown' in value: 32 | return None 33 | elif ' to ' in value: 34 | values = re.findall(r'\d+\.?\d*', value) 35 | min_revenue = float(values[0]) 36 | max_revenue = float(values[1]) 37 | unit = value.split()[-2] 38 | if unit == 'billion': 39 | min_revenue *= 1000 40 | max_revenue *= 1000 41 | return (min_revenue + max_revenue) / 2 42 | else: 43 | numerical_values = re.findall(r'\d+\.?\d*', value) 44 | if numerical_values: 45 | return float(numerical_values[0]) 46 | else: 47 | return None 48 | 49 | # Define a function to convert the size value 50 | def convert_size(value): 51 | if 'Unknown' in value: 52 | return None 53 | elif ' to ' in value: 54 | sizes = value.split(' to ') 55 | min_size = int(sizes[0].replace('+', '').replace(',', '').split()[0]) 56 | max_size = int(sizes[1].replace('+', '').replace(',', '').split()[0]) 57 
| return (min_size + max_size) / 2 58 | else: 59 | return int(value.replace('+', '').replace(',', '').split()[0]) 60 | 61 | # Apply the conversion function to the "Salary Column" column 62 | unstructured_df['Average Salary'] = unstructured_df['Salary Estimate'].apply(convert_salary) 63 | 64 | # Apply the conversion function to the "Revenue" column 65 | unstructured_df['Average Revenue'] = unstructured_df['Revenue'].apply(convert_revenue) 66 | 67 | # Extract the company name by splitting on '\r\n' and selecting the first element 68 | unstructured_df['Company Name'] = unstructured_df['Company Name'].str.split('\r\n').str[0] 69 | 70 | 71 | # Apply the conversion function to the "Size" column 72 | unstructured_df['Size'] = unstructured_df['Size'].apply(convert_size) 73 | 74 | # remove stopwords and pre-process Job Description Column: 75 | unstructured_df['Processed_JD']=unstructured_df['Job Description'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>2 and word not in (stopw)])) 76 | 77 | 78 | # Drop Unwanted Columns: 79 | unstructured_df=unstructured_df.drop(['Unnamed: 0','Salary Estimate','Revenue','Job Description'],axis=1) 80 | 81 | # Check for Null Value after data pre-processing: 82 | unstructured_df.isnull().sum() 83 | 84 | # Replace the null values with average value of each columns: 85 | # Calculate the average value of column B 86 | size_average = unstructured_df['Size'].mean() 87 | salary_average=unstructured_df['Average Salary'].mean() 88 | revenue_average=unstructured_df['Average Revenue'].mean() 89 | 90 | # Replace null values with the average 91 | unstructured_df['Size'].fillna(size_average, inplace=True) 92 | unstructured_df['Average Salary'].fillna(salary_average, inplace=True) 93 | unstructured_df['Average Revenue'].fillna(revenue_average, inplace=True) 94 | 95 | # Convert DataFrame to CSV file 96 | 
unstructured_df.to_csv(r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_structured_data.csv', index=False) -------------------------------------------------------------------------------- /myenv/Scripts/pywin32_testall.py: -------------------------------------------------------------------------------- 1 | """A test runner for pywin32""" 2 | import os 3 | import site 4 | import subprocess 5 | import sys 6 | 7 | # locate the dirs based on where this script is - it may be either in the 8 | # source tree, or in an installed Python 'Scripts' tree. 9 | this_dir = os.path.dirname(__file__) 10 | site_packages = [ 11 | site.getusersitepackages(), 12 | ] + site.getsitepackages() 13 | 14 | failures = [] 15 | 16 | 17 | # Run a test using subprocess and wait for the result. 18 | # If we get an returncode != 0, we know that there was an error, but we don't 19 | # abort immediately - we run as many tests as we can. 20 | def run_test(script, cmdline_extras): 21 | dirname, scriptname = os.path.split(script) 22 | # some tests prefer to be run from their directory. 23 | cmd = [sys.executable, "-u", scriptname] + cmdline_extras 24 | print("--- Running '%s' ---" % script) 25 | sys.stdout.flush() 26 | result = subprocess.run(cmd, check=False, cwd=dirname) 27 | print("*** Test script '%s' exited with %s" % (script, result.returncode)) 28 | sys.stdout.flush() 29 | if result.returncode: 30 | failures.append(script) 31 | 32 | 33 | def find_and_run(possible_locations, extras): 34 | for maybe in possible_locations: 35 | if os.path.isfile(maybe): 36 | run_test(maybe, extras) 37 | break 38 | else: 39 | raise RuntimeError( 40 | "Failed to locate a test script in one of %s" % possible_locations 41 | ) 42 | 43 | 44 | def main(): 45 | import argparse 46 | 47 | code_directories = [this_dir] + site_packages 48 | 49 | parser = argparse.ArgumentParser( 50 | description="A script to trigger tests in all subprojects of PyWin32." 
51 | ) 52 | parser.add_argument( 53 | "-no-user-interaction", 54 | default=False, 55 | action="store_true", 56 | help="(This is now the default - use `-user-interaction` to include them)", 57 | ) 58 | 59 | parser.add_argument( 60 | "-user-interaction", 61 | action="store_true", 62 | help="Include tests which require user interaction", 63 | ) 64 | 65 | parser.add_argument( 66 | "-skip-adodbapi", 67 | default=False, 68 | action="store_true", 69 | help="Skip the adodbapi tests; useful for CI where there's no provider", 70 | ) 71 | 72 | args, remains = parser.parse_known_args() 73 | 74 | # win32, win32ui / Pythonwin 75 | 76 | extras = [] 77 | if args.user_interaction: 78 | extras += ["-user-interaction"] 79 | extras.extend(remains) 80 | scripts = [ 81 | "win32/test/testall.py", 82 | "Pythonwin/pywin/test/all.py", 83 | ] 84 | for script in scripts: 85 | maybes = [os.path.join(directory, script) for directory in code_directories] 86 | find_and_run(maybes, extras) 87 | 88 | # win32com 89 | maybes = [ 90 | os.path.join(directory, "win32com", "test", "testall.py") 91 | for directory in [ 92 | os.path.join(this_dir, "com"), 93 | ] 94 | + site_packages 95 | ] 96 | extras = remains + ["1"] # only run "level 1" tests in CI 97 | find_and_run(maybes, extras) 98 | 99 | # adodbapi 100 | if not args.skip_adodbapi: 101 | maybes = [ 102 | os.path.join(directory, "adodbapi", "test", "adodbapitest.py") 103 | for directory in code_directories 104 | ] 105 | find_and_run(maybes, remains) 106 | # This script has a hard-coded sql server name in it, (and markh typically 107 | # doesn't have a different server to test on) but there is now supposed to be a server out there on the Internet 108 | # just to run these tests, so try it... 
109 | maybes = [ 110 | os.path.join(directory, "adodbapi", "test", "test_adodbapi_dbapi20.py") 111 | for directory in code_directories 112 | ] 113 | find_and_run(maybes, remains) 114 | 115 | if failures: 116 | print("The following scripts failed") 117 | for failure in failures: 118 | print(">", failure) 119 | sys.exit(1) 120 | print("All tests passed \\o/") 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![jsr2](https://github.com/abbas99-hub/Job-Recommendation-System/assets/60792939/ffa4a634-42d7-491e-89f8-07a137322876) 2 | 3 | # Job Recommendation System using Machine Learning 4 | This repository contains the code and instructions to build a job recommendation system using machine learning. The system is designed to provide personalized job recommendations based on user preferences and historical job data. The data for this project is scraped from Glassdoor, and the system is deployed using the Azure cloud platform. 5 | 6 | ## Business Understanding 7 | The goal of this project is to develop a job recommendation system that helps users find relevant job opportunities based on their preferences and historical data. By leveraging machine learning techniques, we aim to provide personalized recommendations that align with the user's skills, interests, and career goals. The system will take into account various factors such as job title, salary estimate, company rating, location, industry, and more to generate accurate recommendations. 8 | 9 | ## Data Scraping 10 | To collect the necessary data for training our recommendation system, we will scrape job-related information from Glassdoor. 
The following columns will be extracted: 11 | 12 | Job Title 13 | Salary Estimate 14 | Job Description 15 | Rating 16 | Company Name 17 | Location 18 | Headquarters 19 | Size 20 | Founded 21 | Type of Ownership 22 | Industry 23 | Sector 24 | Revenue 25 | Competitors 26 | 27 | ## Feature Engineering 28 | Once the data is collected, we will perform feature engineering to preprocess and transform the raw data into a suitable format for training our recommendation model. This step includes: 29 | 30 | Handling Missing Data: Deal with missing values in the dataset by either imputing them or removing the corresponding rows/columns. 31 | Encoding Categorical Variables: Convert categorical variables such as job title, location, industry, and sector into numerical representations using techniques like one-hot encoding or label encoding. 32 | Feature Scaling: Normalize numerical features, such as salary estimate and company rating, to ensure they have a similar scale and prevent dominance of certain features in the model. 33 | 34 | ## Machine Learning Techniques: 35 | To provide personalized job recommendations, we employ the TF-IDF (Term Frequency-Inverse Document Frequency) vectorization technique. The "job_recommender.py" component plays a crucial role in this process. It utilizes the TF-IDF vectorizer from the scikit-learn library to transform job descriptions and user preferences into numerical feature vectors. These vectors capture the importance of each word in the documents, enabling the system to find similar job opportunities based on user preferences. The Nearest Neighbors algorithm is then used to identify the most relevant job recommendations. 36 | 37 | skill extractor segment provides functions and utilities to extract skills from a PDF file using the Spacy library and perform text processing and matching operations. These extracted skills can be used for further analysis and processing in the job recommendation system. 
38 | 39 | ## Streamlit Application 40 | To make the job recommendation system easily accessible and user-friendly, we have developed a Streamlit application. Streamlit provides an intuitive web interface where users can upload their resumes. The application processes the user input, applies the machine learning models, and displays the top-recommended jobs based on the user's preferences and historical data. 41 | 42 | ## Model Deployment using Azure Cloud 43 | To make the job recommendation system accessible to users, we will deploy the model on the Azure cloud platform. The deployment process involves the following steps: 44 | 45 | * Model Serialization: Serialize the trained model to a format compatible with the Azure cloud deployment. 46 | * Model Containerization: Package the serialized model along with the necessary dependencies and environment specifications into a container using tools like Docker. 47 | * Azure Container Registry: Create a container registry on Azure to store the model container and related artifacts securely. 48 | * Azure Kubernetes Service (AKS): Deploy the model container as a scalable microservice using AKS, which provides orchestration and management capabilities. 49 | * API Development: Develop an API that allows users to interact with the deployed model and request personalized job recommendations. 50 | * Integration and Testing: Integrate the API with other components of the job recommendation system, and perform thorough testing to ensure its functionality and performance. 51 | * Deployment Monitoring: Monitor the deployed model and API to track usage, and performance metrics, and address any potential issues or errors. 
52 | 53 | ## Usage 54 | To use the job recommendation system, follow the instructions below: 55 | 56 | * Clone this repository: git clone 57 | * Install the required dependencies: pip install -r requirements.txt 58 | * Run the command: streamlit run __init__.py ( For Local Server ) 59 | * Access the deployed job recommendation API and make requests to receive personalized recommendations. 60 | 61 | #### Please feel free to contribute to this project by submitting pull requests or opening issues. 62 | -------------------------------------------------------------------------------- /myenv/docx-template/word/styles.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /myenv/share/man/man1/ttx.1: -------------------------------------------------------------------------------- 1 | .Dd May 18, 2004 2 | .\" ttx is not specific to any OS, but contrary to what groff_mdoc(7) 3 | .\" seems to imply, entirely omitting the .Os macro causes 'BSD' to 4 | .\" be used, so I give a zero-width space as its argument. 5 | .Os \& 6 | .\" The "FontTools Manual" argument apparently has no effect in 7 | .\" groff 1.18.1. I think it is a bug in the -mdoc groff package. 8 | .Dt TTX 1 "FontTools Manual" 9 | .Sh NAME 10 | .Nm ttx 11 | .Nd tool for manipulating TrueType and OpenType fonts 12 | .Sh SYNOPSIS 13 | .Nm 14 | .Bk 15 | .Op Ar option ... 16 | .Ek 17 | .Bk 18 | .Ar file ... 19 | .Ek 20 | .Sh DESCRIPTION 21 | .Nm 22 | is a tool for manipulating TrueType and OpenType fonts. It can convert 23 | TrueType and OpenType fonts to and from an 24 | .Tn XML Ns -based format called 25 | .Tn TTX . 26 | .Tn TTX 27 | files have a 28 | .Ql .ttx 29 | extension. 
30 | .Pp 31 | For each 32 | .Ar file 33 | argument it is given, 34 | .Nm 35 | detects whether it is a 36 | .Ql .ttf , 37 | .Ql .otf 38 | or 39 | .Ql .ttx 40 | file and acts accordingly: if it is a 41 | .Ql .ttf 42 | or 43 | .Ql .otf 44 | file, it generates a 45 | .Ql .ttx 46 | file; if it is a 47 | .Ql .ttx 48 | file, it generates a 49 | .Ql .ttf 50 | or 51 | .Ql .otf 52 | file. 53 | .Pp 54 | By default, every output file is created in the same directory as the 55 | corresponding input file and with the same name except for the 56 | extension, which is substituted appropriately. 57 | .Nm 58 | never overwrites existing files; if necessary, it appends a suffix to 59 | the output file name before the extension, as in 60 | .Pa Arial#1.ttf . 61 | .Ss "General options" 62 | .Bl -tag -width ".Fl t Ar table" 63 | .It Fl h 64 | Display usage information. 65 | .It Fl d Ar dir 66 | Write the output files to directory 67 | .Ar dir 68 | instead of writing every output file to the same directory as the 69 | corresponding input file. 70 | .It Fl o Ar file 71 | Write the output to 72 | .Ar file 73 | instead of writing it to the same directory as the 74 | corresponding input file. 75 | .It Fl v 76 | Be verbose. Write more messages to the standard output describing what 77 | is being done. 78 | .It Fl a 79 | Allow virtual glyphs ID's on compile or decompile. 80 | .El 81 | .Ss "Dump options" 82 | The following options control the process of dumping font files 83 | (TrueType or OpenType) to 84 | .Tn TTX 85 | files. 86 | .Bl -tag -width ".Fl t Ar table" 87 | .It Fl l 88 | List table information. Instead of dumping the font to a 89 | .Tn TTX 90 | file, display minimal information about each table. 91 | .It Fl t Ar table 92 | Dump table 93 | .Ar table . 94 | This option may be given multiple times to dump several tables at 95 | once. When not specified, all tables are dumped. 96 | .It Fl x Ar table 97 | Exclude table 98 | .Ar table 99 | from the list of tables to dump. 
This option may be given multiple 100 | times to exclude several tables from the dump. The 101 | .Fl t 102 | and 103 | .Fl x 104 | options are mutually exclusive. 105 | .It Fl s 106 | Split tables. Dump each table to a separate 107 | .Tn TTX 108 | file and write (under the name that would have been used for the output 109 | file if the 110 | .Fl s 111 | option had not been given) one small 112 | .Tn TTX 113 | file containing references to the individual table dump files. This 114 | file can be used as input to 115 | .Nm 116 | as long as the referenced files can be found in the same directory. 117 | .It Fl i 118 | .\" XXX: I suppose OpenType programs (exist and) are also affected. 119 | Don't disassemble TrueType instructions. When this option is specified, 120 | all TrueType programs (glyph programs, the font program and the 121 | pre-program) are written to the 122 | .Tn TTX 123 | file as hexadecimal data instead of 124 | assembly. This saves some time and results in smaller 125 | .Tn TTX 126 | files. 127 | .It Fl y Ar n 128 | When decompiling a TrueType Collection (TTC) file, 129 | decompile font number 130 | .Ar n , 131 | starting from 0. 132 | .El 133 | .Ss "Compilation options" 134 | The following options control the process of compiling 135 | .Tn TTX 136 | files into font files (TrueType or OpenType): 137 | .Bl -tag -width ".Fl t Ar table" 138 | .It Fl m Ar fontfile 139 | Merge the input 140 | .Tn TTX 141 | file 142 | .Ar file 143 | with 144 | .Ar fontfile . 145 | No more than one 146 | .Ar file 147 | argument can be specified when this option is used. 148 | .It Fl b 149 | Don't recalculate glyph bounding boxes. Use the values in the 150 | .Tn TTX 151 | file as is. 152 | .El 153 | .Sh "THE TTX FILE FORMAT" 154 | You can find some information about the 155 | .Tn TTX 156 | file format in 157 | .Pa documentation.html . 
158 | In particular, you will find in that file the list of tables understood by 159 | .Nm 160 | and the relations between TrueType GlyphIDs and the glyph names used in 161 | .Tn TTX 162 | files. 163 | .Sh EXAMPLES 164 | In the following examples, all files are read from and written to the 165 | current directory. Additionally, the name given for the output file 166 | assumes in every case that it did not exist before 167 | .Nm 168 | was invoked. 169 | .Pp 170 | Dump the TrueType font contained in 171 | .Pa FreeSans.ttf 172 | to 173 | .Pa FreeSans.ttx : 174 | .Pp 175 | .Dl ttx FreeSans.ttf 176 | .Pp 177 | Compile 178 | .Pa MyFont.ttx 179 | into a TrueType or OpenType font file: 180 | .Pp 181 | .Dl ttx MyFont.ttx 182 | .Pp 183 | List the tables in 184 | .Pa FreeSans.ttf 185 | along with some information: 186 | .Pp 187 | .Dl ttx -l FreeSans.ttf 188 | .Pp 189 | Dump the 190 | .Sq cmap 191 | table from 192 | .Pa FreeSans.ttf 193 | to 194 | .Pa FreeSans.ttx : 195 | .Pp 196 | .Dl ttx -t cmap FreeSans.ttf 197 | .Sh NOTES 198 | On MS\-Windows and MacOS, 199 | .Nm 200 | is available as a graphical application to which files can be dropped. 201 | .Sh SEE ALSO 202 | .Pa documentation.html 203 | .Pp 204 | .Xr fontforge 1 , 205 | .Xr ftinfo 1 , 206 | .Xr gfontview 1 , 207 | .Xr xmbdfed 1 , 208 | .Xr Font::TTF 3pm 209 | .Sh AUTHORS 210 | .Nm 211 | was written by 212 | .An -nosplit 213 | .An "Just van Rossum" Aq just@letterror.com . 214 | .Pp 215 | This manual page was written by 216 | .An "Florent Rougon" Aq f.rougon@free.fr 217 | for the Debian GNU/Linux system based on the existing FontTools 218 | documentation. It may be freely used, modified and distributed without 219 | restrictions. 
220 | .\" For Emacs: 221 | .\" Local Variables: 222 | .\" fill-column: 72 223 | .\" sentence-end: "[.?!][]\"')}]*\\($\\| $\\| \\| \\)[ \n]*" 224 | .\" sentence-end-double-space: t 225 | .\" End: -------------------------------------------------------------------------------- /src/data/url_data_scientist_loc_bangalore.json: -------------------------------------------------------------------------------- 1 | [ 2 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=117&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_24fdd564&cb=1586328300996&jobListingId=3548697507", 3 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=120&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_385c924b&cb=1586328300998&jobListingId=3334835027", 4 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=129&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_07047cd3&cb=1586328301010&jobListingId=3255119944", 5 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=124&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_12a2f280&cb=1586328301002&jobListingId=3309508178", 6 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=125&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f977b9a1&cb=1586328301002&jobListingId=3463137315", 7 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=121&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f41df190&cb=1586328300999&jobListingId=3522398014", 8 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=114&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eed74d53&cb=1586328300994&jobListingId=3548424285", 9 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=101&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b527f613&cb=1586328300984&jobListingId=3463396953", 10 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=102&ao=883172&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_4cce804c&cb=1586328300985&jobListingId=3463302895", 11 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=108&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_d4e51354&cb=1586328300990&jobListingId=3548552003", 12 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=118&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_86e6a5cf&cb=1586328300997&jobListingId=3361772952", 13 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=122&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9a16a794&cb=1586328301000&jobListingId=3224747590", 14 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=109&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b44be053&cb=1586328300991&jobListingId=3284143205", 15 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=103&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b9149c3f&cb=1586328300986&jobListingId=3488569582", 16 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=127&ao=140609&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_1bd12ebc&cb=1586328301004&jobListingId=3463632306", 17 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=110&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b987a82e&cb=1586328300992&jobListingId=3394241447", 18 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=112&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0f7674db&cb=1586328300993&jobListingId=3442340171", 19 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=126&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_5bd8e316&cb=1586328301003&jobListingId=3549041922", 20 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=111&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_588dd741&cb=1586328300992&jobListingId=3255119711", 21 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=113&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_779c66c0&cb=1586328300994&jobListingId=3285076786", 22 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=123&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_175a925f&cb=1586328301001&jobListingId=3442891307", 23 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=116&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_86d86e28&cb=1586328300996&jobListingId=3548427144", 24 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=104&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eca49f2c&cb=1586328300987&jobListingId=3463894344", 25 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=130&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_78b076c2&cb=1586328301010&jobListingId=3548552092", 26 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=106&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_a61639ea&cb=1586328300988&jobListingId=3548704688", 27 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=105&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0e5497d0&cb=1586328300987&jobListingId=3488034422", 28 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=128&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_64c1d4e8&cb=1586328301009&jobListingId=3393519661", 29 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=119&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9264dd03&cb=1586328300998&jobListingId=3361772911", 30 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=115&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_37b252b0&cb=1586328300995&jobListingId=3548425012", 31 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=107&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_68954416&cb=1586328300989&jobListingId=3548552132" 32 | ] -------------------------------------------------------------------------------- 
/src/notebook/url_data_scientist_loc_bangalore.json: -------------------------------------------------------------------------------- 1 | [ 2 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=117&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_24fdd564&cb=1586328300996&jobListingId=3548697507", 3 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=120&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_385c924b&cb=1586328300998&jobListingId=3334835027", 4 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=129&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_07047cd3&cb=1586328301010&jobListingId=3255119944", 5 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=124&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_12a2f280&cb=1586328301002&jobListingId=3309508178", 6 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=125&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f977b9a1&cb=1586328301002&jobListingId=3463137315", 7 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=121&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f41df190&cb=1586328300999&jobListingId=3522398014", 8 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=114&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eed74d53&cb=1586328300994&jobListingId=3548424285", 9 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=101&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b527f613&cb=1586328300984&jobListingId=3463396953", 10 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=102&ao=883172&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_4cce804c&cb=1586328300985&jobListingId=3463302895", 11 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=108&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_d4e51354&cb=1586328300990&jobListingId=3548552003", 12 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=118&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_86e6a5cf&cb=1586328300997&jobListingId=3361772952", 13 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=122&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9a16a794&cb=1586328301000&jobListingId=3224747590", 14 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=109&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b44be053&cb=1586328300991&jobListingId=3284143205", 15 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=103&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b9149c3f&cb=1586328300986&jobListingId=3488569582", 16 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=127&ao=140609&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_1bd12ebc&cb=1586328301004&jobListingId=3463632306", 17 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=110&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b987a82e&cb=1586328300992&jobListingId=3394241447", 18 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=112&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0f7674db&cb=1586328300993&jobListingId=3442340171", 19 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=126&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_5bd8e316&cb=1586328301003&jobListingId=3549041922", 20 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=111&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_588dd741&cb=1586328300992&jobListingId=3255119711", 21 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=113&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_779c66c0&cb=1586328300994&jobListingId=3285076786", 22 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=123&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_175a925f&cb=1586328301001&jobListingId=3442891307", 23 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=116&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_86d86e28&cb=1586328300996&jobListingId=3548427144", 24 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=104&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eca49f2c&cb=1586328300987&jobListingId=3463894344", 25 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=130&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_78b076c2&cb=1586328301010&jobListingId=3548552092", 26 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=106&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_a61639ea&cb=1586328300988&jobListingId=3548704688", 27 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=105&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0e5497d0&cb=1586328300987&jobListingId=3488034422", 28 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=128&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_64c1d4e8&cb=1586328301009&jobListingId=3393519661", 29 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=119&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9264dd03&cb=1586328300998&jobListingId=3361772911", 30 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=115&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_37b252b0&cb=1586328300995&jobListingId=3548425012", 31 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=107&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_68954416&cb=1586328300989&jobListingId=3548552132" 32 | ] -------------------------------------------------------------------------------- /myenv/docx-template/word/theme/theme1.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-svg.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 23 | 25 | 26 | 28 | image/svg+xml 29 | 31 | 32 | 33 | 34 | 61 | 63 | 65 | 69 | 73 | 74 | 76 | 80 | 84 | 85 | 87 | 91 | 95 | 96 | 98 | 102 | 106 | 107 | 109 | 113 | 117 | 118 | 120 | 124 | 128 | 129 | 138 | 147 | 157 | 167 | 177 | 187 | 197 | 207 | 218 | 228 | 238 | 249 | 250 | 254 | 258 | 265 | 266 | -------------------------------------------------------------------------------- /myenv/Scripts/pdf2txt.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | """A command line tool for extracting text and images from PDF and 3 | output it to plain text, html, xml or tags.""" 4 | import argparse 5 | import logging 6 | import sys 7 | from typing import Any, Container, Iterable, List, Optional 8 | 9 | import pdfminer.high_level 10 | from pdfminer.layout import LAParams 11 | from pdfminer.utils import AnyIO 12 | 13 | logging.basicConfig() 14 | 15 | OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag")) 16 | 17 | 18 | def float_or_disabled(x: str) -> Optional[float]: 19 | if x.lower().strip() == "disabled": 20 | return None 21 | try: 22 | return float(x) 23 | except ValueError: 24 | raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) 25 | 26 | 27 | def extract_text( 28 | files: Iterable[str] = [], 29 | outfile: str = "-", 30 | laparams: Optional[LAParams] = None, 31 | output_type: str = "text", 32 | codec: str = "utf-8", 33 | strip_control: bool = False, 34 | maxpages: int = 0, 35 | page_numbers: Optional[Container[int]] = None, 36 | password: str = "", 37 | scale: float = 1.0, 38 | rotation: int = 0, 39 | layoutmode: str = "normal", 40 | output_dir: Optional[str] = None, 41 | debug: bool = False, 42 | disable_caching: bool = False, 43 | **kwargs: Any 44 | ) -> AnyIO: 45 | if not files: 
46 | raise ValueError("Must provide files to work upon!") 47 | 48 | if output_type == "text" and outfile != "-": 49 | for override, alttype in OUTPUT_TYPES: 50 | if outfile.endswith(override): 51 | output_type = alttype 52 | 53 | if outfile == "-": 54 | outfp: AnyIO = sys.stdout 55 | if sys.stdout.encoding is not None: 56 | codec = "utf-8" 57 | else: 58 | outfp = open(outfile, "wb") 59 | 60 | for fname in files: 61 | with open(fname, "rb") as fp: 62 | pdfminer.high_level.extract_text_to_fp(fp, **locals()) 63 | return outfp 64 | 65 | 66 | def create_parser() -> argparse.ArgumentParser: 67 | parser = argparse.ArgumentParser(description=__doc__, add_help=True) 68 | parser.add_argument( 69 | "files", 70 | type=str, 71 | default=None, 72 | nargs="+", 73 | help="One or more paths to PDF files.", 74 | ) 75 | 76 | parser.add_argument( 77 | "--version", 78 | "-v", 79 | action="version", 80 | version="pdfminer.six v{}".format(pdfminer.__version__), 81 | ) 82 | parser.add_argument( 83 | "--debug", 84 | "-d", 85 | default=False, 86 | action="store_true", 87 | help="Use debug logging level.", 88 | ) 89 | parser.add_argument( 90 | "--disable-caching", 91 | "-C", 92 | default=False, 93 | action="store_true", 94 | help="If caching or resources, such as fonts, should be disabled.", 95 | ) 96 | 97 | parse_params = parser.add_argument_group( 98 | "Parser", description="Used during PDF parsing" 99 | ) 100 | parse_params.add_argument( 101 | "--page-numbers", 102 | type=int, 103 | default=None, 104 | nargs="+", 105 | help="A space-seperated list of page numbers to parse.", 106 | ) 107 | parse_params.add_argument( 108 | "--pagenos", 109 | "-p", 110 | type=str, 111 | help="A comma-separated list of page numbers to parse. 
" 112 | "Included for legacy applications, use --page-numbers " 113 | "for more idiomatic argument entry.", 114 | ) 115 | parse_params.add_argument( 116 | "--maxpages", 117 | "-m", 118 | type=int, 119 | default=0, 120 | help="The maximum number of pages to parse.", 121 | ) 122 | parse_params.add_argument( 123 | "--password", 124 | "-P", 125 | type=str, 126 | default="", 127 | help="The password to use for decrypting PDF file.", 128 | ) 129 | parse_params.add_argument( 130 | "--rotation", 131 | "-R", 132 | default=0, 133 | type=int, 134 | help="The number of degrees to rotate the PDF " 135 | "before other types of processing.", 136 | ) 137 | 138 | la_params = LAParams() # will be used for defaults 139 | la_param_group = parser.add_argument_group( 140 | "Layout analysis", description="Used during layout analysis." 141 | ) 142 | la_param_group.add_argument( 143 | "--no-laparams", 144 | "-n", 145 | default=False, 146 | action="store_true", 147 | help="If layout analysis parameters should be ignored.", 148 | ) 149 | la_param_group.add_argument( 150 | "--detect-vertical", 151 | "-V", 152 | default=la_params.detect_vertical, 153 | action="store_true", 154 | help="If vertical text should be considered during layout analysis", 155 | ) 156 | la_param_group.add_argument( 157 | "--line-overlap", 158 | type=float, 159 | default=la_params.line_overlap, 160 | help="If two characters have more overlap than this they " 161 | "are considered to be on the same line. The overlap is specified " 162 | "relative to the minimum height of both characters.", 163 | ) 164 | la_param_group.add_argument( 165 | "--char-margin", 166 | "-M", 167 | type=float, 168 | default=la_params.char_margin, 169 | help="If two characters are closer together than this margin they " 170 | "are considered to be part of the same line. 
The margin is " 171 | "specified relative to the width of the character.", 172 | ) 173 | la_param_group.add_argument( 174 | "--word-margin", 175 | "-W", 176 | type=float, 177 | default=la_params.word_margin, 178 | help="If two characters on the same line are further apart than this " 179 | "margin then they are considered to be two separate words, and " 180 | "an intermediate space will be added for readability. The margin " 181 | "is specified relative to the width of the character.", 182 | ) 183 | la_param_group.add_argument( 184 | "--line-margin", 185 | "-L", 186 | type=float, 187 | default=la_params.line_margin, 188 | help="If two lines are close together they are considered to " 189 | "be part of the same paragraph. The margin is specified " 190 | "relative to the height of a line.", 191 | ) 192 | la_param_group.add_argument( 193 | "--boxes-flow", 194 | "-F", 195 | type=float_or_disabled, 196 | default=la_params.boxes_flow, 197 | help="Specifies how much a horizontal and vertical position of a " 198 | "text matters when determining the order of lines. The value " 199 | "should be within the range of -1.0 (only horizontal position " 200 | "matters) to +1.0 (only vertical position matters). You can also " 201 | "pass `disabled` to disable advanced layout analysis, and " 202 | "instead return text based on the position of the bottom left " 203 | "corner of the text box.", 204 | ) 205 | la_param_group.add_argument( 206 | "--all-texts", 207 | "-A", 208 | default=la_params.all_texts, 209 | action="store_true", 210 | help="If layout analysis should be performed on text in figures.", 211 | ) 212 | 213 | output_params = parser.add_argument_group( 214 | "Output", description="Used during output generation." 215 | ) 216 | output_params.add_argument( 217 | "--outfile", 218 | "-o", 219 | type=str, 220 | default="-", 221 | help="Path to file where output is written. 
" 222 | 'Or "-" (default) to write to stdout.', 223 | ) 224 | output_params.add_argument( 225 | "--output_type", 226 | "-t", 227 | type=str, 228 | default="text", 229 | help="Type of output to generate {text,html,xml,tag}.", 230 | ) 231 | output_params.add_argument( 232 | "--codec", 233 | "-c", 234 | type=str, 235 | default="utf-8", 236 | help="Text encoding to use in output file.", 237 | ) 238 | output_params.add_argument( 239 | "--output-dir", 240 | "-O", 241 | default=None, 242 | help="The output directory to put extracted images in. If not given, " 243 | "images are not extracted.", 244 | ) 245 | output_params.add_argument( 246 | "--layoutmode", 247 | "-Y", 248 | default="normal", 249 | type=str, 250 | help="Type of layout to use when generating html " 251 | "{normal,exact,loose}. If normal,each line is" 252 | " positioned separately in the html. If exact" 253 | ", each character is positioned separately in" 254 | " the html. If loose, same result as normal " 255 | "but with an additional newline after each " 256 | "text line. Only used when output_type is html.", 257 | ) 258 | output_params.add_argument( 259 | "--scale", 260 | "-s", 261 | type=float, 262 | default=1.0, 263 | help="The amount of zoom to use when generating html file. " 264 | "Only used when output_type is html.", 265 | ) 266 | output_params.add_argument( 267 | "--strip-control", 268 | "-S", 269 | default=False, 270 | action="store_true", 271 | help="Remove control statement from text. 
" 272 | "Only used when output_type is xml.", 273 | ) 274 | 275 | return parser 276 | 277 | 278 | def parse_args(args: Optional[List[str]]) -> argparse.Namespace: 279 | parsed_args = create_parser().parse_args(args=args) 280 | 281 | # Propagate parsed layout parameters to LAParams object 282 | if parsed_args.no_laparams: 283 | parsed_args.laparams = None 284 | else: 285 | parsed_args.laparams = LAParams( 286 | line_overlap=parsed_args.line_overlap, 287 | char_margin=parsed_args.char_margin, 288 | line_margin=parsed_args.line_margin, 289 | word_margin=parsed_args.word_margin, 290 | boxes_flow=parsed_args.boxes_flow, 291 | detect_vertical=parsed_args.detect_vertical, 292 | all_texts=parsed_args.all_texts, 293 | ) 294 | 295 | if parsed_args.page_numbers: 296 | parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers} 297 | 298 | if parsed_args.pagenos: 299 | parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")} 300 | 301 | if parsed_args.output_type == "text" and parsed_args.outfile != "-": 302 | for override, alttype in OUTPUT_TYPES: 303 | if parsed_args.outfile.endswith(override): 304 | parsed_args.output_type = alttype 305 | 306 | return parsed_args 307 | 308 | 309 | def main(args: Optional[List[str]] = None) -> int: 310 | parsed_args = parse_args(args) 311 | outfp = extract_text(**vars(parsed_args)) 312 | outfp.close() 313 | return 0 314 | 315 | 316 | if __name__ == "__main__": 317 | sys.exit(main()) 318 | -------------------------------------------------------------------------------- /src/data/skills.csv: -------------------------------------------------------------------------------- 1 | technical 
skills,ajenti,django-suit,django-xadmin,flask-admin,flower,grappelli,wooey,algorithms,pypattyrn,python-patterns,sortedcontainers,django-simple-captcha,django-simple-spam-blocker,django-compressor,django-pipeline,django-storages,fanstatic,fileconveyor,flask-assets,jinja-assets-compressor,webassets,audiolazy,audioread,beets,dejavu,django-elastic-transcoder,eyed3,id3reader,m3u8,mingus,pyaudioanalysis,pydub,pyechonest,talkbox,timeside,tinytag,authomatic,django-allauth,django-oauth-toolkit,flask-oauthlib,oauthlib,python-oauth2,python-social-auth,rauth,sanction,jose,pyjwt,python-jws,python-jwt,bitbake,buildout,platformio,pybuilder,scons,django-cms,djedi-cms,feincms,kotti,mezzanine,opps,plone,quokka,wagtail,widgy,beaker,diskcache,django-cache-machine,django-cacheops,django-viewlet,dogpile.cache,hermescache,johnny-cache,pylibmc,errbot,coala,code2flow,pycallgraph,flake8,pylama,pylint,mypy,asciimatics,cement,click,cliff,clint,colorama,docopt,gooey,python-fire,python-prompt-toolkit,aws-cli,bashplotlib,caniusepython3,cookiecutter,doitlive,howdoi,httpie,mycli,pathpicker,percol,pgcli,saws,thefuck,try,python-future,python-modernize,six,opencv,pyocr,pytesseract,simplecv,eventlet,gevent,multiprocessing,threading,tomorrow,uvloop,config,configobj,configparser,profig,python-decouple,cryptography,hashids,paramiko,passlib,pynacl,blaze,orange,pandas,cerberus,colander,jsonschema,schematics,valideer,voluptuous,altair,bokeh,ggplot,matplotlib,pygal,pygraphviz,pyqtgraph,seaborn,vispy,pickledb,pipelinedb,tinydb,zodb,mysql,mysql-python,mysqlclient,oursql,pymysql,postgresql,psycopg2,queries,txpostgres,apsw,pymssql,nosql,cassandra-python-driver,happybase,plyvel,py2neo,pycassa,pymongo,redis-py,telephus,txredis,arrow,chronyk,dateutil,delorean,moment,pendulum,pytime,pytz,when.py,ipdb,pdb++,pudb,remote-pdb,wdb,line_profiler,memory_profiler,profiling,vprof,caffe,keras,mxnet,neupy,pytorch,tensorflow,theano,ansible,cloud-init,cuisine,docker,fabric,fabtools,honcho,openstack,pexpect,psutil,saltstack,superv
isor,dh-virtualenv,nuitka,py2app,py2exe,pyinstaller,pynsist,sphinx,awesome-sphinxdoc,mkdocs,pdoc,pycco,s3cmd,s4cmd,you-get,youtube-dl,alipay,cartridge,django-oscar,django-shop,merchant,money,python-currencies,forex-python,shoop,emacs,elpy,sublime,anaconda,sublimejedi,vim,jedi-vim,python-mode,youcompleteme,ptvs,visual,python,magic,liclipse,pycharm,spyder,envelopes,flanker,imbox,inbox.py,lamson,marrow,modoboa,nylas,yagmail,pipenv,p,pyenv,venv,virtualenv,virtualenvwrapper,imghdr,mimetypes,path.py,pathlib,python-magic,unipath,watchdog,cffi,ctypes,pycuda,swig,deform,django-bootstrap3,django-crispy-forms,django-remote-forms,wtforms,cytoolz,fn.py,funcy,toolz,curses,enaml,flexx,kivy,pyglet,pygobject,pyqt,pyside,pywebview,tkinter,toga,urwid,wxpython,cocos2d,panda3d,pygame,pyogre,pyopengl,pysdl2,renpy,django-countries,geodjango,geoip,geojson,geopy,pygeoip,beautifulsoup,bleach,cssutils,html5lib,lxml,markupsafe,pyquery,untangle,weasyprint,xmldataset,xmltodict,grequests,httplib2,requests,treq,urllib3,ino,keyboard,mouse,pingo,pyro,pyuserinput,scapy,wifi,hmap,imgseek,nude.py,pagan,pillow,pybarcode,pygram,python-qrcode,quads,scikit-image,thumbor,wand,clpython,cpython,cython,grumpy,ironpython,jython,micropython,numba,peachpy,pyjion,pypy,pysec,pyston,stackless,interactive,bpython,jupyter,ptpython,babel,pyicu,apscheduler,django-schedule,doit,gunnery,joblib,plan,schedule,spiff,taskflow,eliot,logbook,logging,sentry,metrics,nupic,scikit-learn,spark,vowpal_porpoise,xgboost,pyspark,luigi,mrjob,streamparse,dask,python(x,y),pythonlibs,pythonnet,pywin32,winpython,gensim,jieba,langid.py,nltk,pattern,polyglot,snownlp,spacy,textblob,mininet,pox,pyretic,sdx,asyncio,diesel,pulsar,pyzmq,twisted,txzmq,napalm,django-activity-stream,stream-framework,django,sqlalchemy,awesome-sqlalchemy,orator,peewee,ponyorm,pydal,python-sql,pip,conda,curdling,pip-tools,wheel,warehouse,bandersnatch,devpi,localshop,carteblanche,django-guardian,django-rules,delegator.py subprocesses 
for,sarge,sh,celery,huey,mrq,rq,simpleq,annoy,fastfm,implicit,libffm,lightfm,surprise,tensorrec,django-rest-framework,django-tastypie,flask,eve,flask-api-utils,flask-api,flask-restful,flask-restless,pyramid,cornice,falcon,hug,restless,ripozo,sandman,apistar,simplejsonrpcserver,simplexmlrpcserver,zerorpc,astropy,bcbio-nextgen,bccb,biopython,cclib,networkx,nipy,numpy,obspy,pydy,pymc,rdkit,scipy,statsmodels,sympy,zipline,simpy,django-haystack,elasticsearch-dsl-py,elasticsearch-py,esengine,pysolr,solrpy,whoosh,marshmallow,apex,python-lambda,zappa,tablib,marmir,openpyxl,pyexcel,python-docx,relatorio,unoconv,xlsxwriter,xlwings,xlwt / xlrd,pdf,pdfminer,pypdf2,reportlab,markdown,mistune,python-markdown,yaml,pyyaml,csvkit,unp,cactus,hyde,lektor,nikola,pelican,tinkerer,django-taggit,genshi,jinja2,mako,hypothesis,mamba,nose,nose2,pytest,robot,unittest,green,tox,locust,pyautogui,selenium,sixpack,splinter,doublex,freezegun,httmock,httpretty,mock,responses,vcr.py,factory_boy,mixer,model_mommy,mimesis,fake2db,faker,radar,chardet,difflib,ftfy,fuzzywuzzy,levenshtein,pangu.py,pyfiglet,pypinyin,shortuuid,unidecode,uniout,xpinyin,slugify,awesome-slugify,python-slugify,unicode-slugify,parser,phonenumbers,ply,pygments,pyparsing,python-nameparser,python-user-agents,sqlparse,apache-libcloud,boto3,django-wordpress,facebook-sdk,facepy,gmail,google-api-python-client,gspread,twython,furl,purl,pyshorteners,short_url,webargs,moviepy,scikit-video,wsgi-compatible,bjoern,fapws3,gunicorn,meinheld,netius,paste,rocket,uwsgi,waitress,werkzeug,haul,html2text,lassie,micawber,newspaper,opengraph,python-goose,python-readability,sanitize,sumy,textract,cola,demiurge,feedparser,grab,mechanicalsoup,portia,pyspider,robobrowser,scrapy,bottle,cherrypy,awesome-django,awesome-flask,awesome-pyramid,sanic,tornado,turbogears,web2py,github,autobahnpython,crossbar,django-socketio,websocket-for-python,javascript,php,c#,c++,ruby,css,c,objective-c,shell,scala,swift,matlab,clojure,octave,machine learning,data 
analytics,predictive analytics,html,js,accounts payable,receivables,inventory controls,payroll,deposits,bank reconciliation,planning and enacting cash-flows,report preparation,financial models,financial controls,documentation,time management,schedules,benchmarking,future state assessment,business process re-engineering,as-is analysis,defining solutions and scope,gap analysis,role change,wireframing,prototyping,user stories,financial analysis/modeling,swot analysis,quickbooks,quicken,erp,enterprise resource planning,spanish,german,rest,soap,json,website,ui,ux,design,crm,cms,communication,coding,windows,servers,unix,linux,redhat,solaris,java,perl,vb script,xml,database,oracle,microsoft sql,sql,microsoft word,microsoft powerpoint,powerpoint,word,excel,visio,microsoft visio,microsoft excel,adobe,photoshop,hadoop,hbase,hive,zookeeper,openserver,auto cad,pl/sql,ruby on rails,asp,jsp,operations,technical,training,sales,marketing,reporting,compliance,strategy,research,analytical,engineering,policies,budget,finance,project management,health,customer service,content,presentation,brand,presentations,safety,certification,seo,digital marketing,accounting,regulations,legal,engagement,analytics,distribution,coaching,testing,vendors,consulting,writing,contracts,inventory,retail,healthcare,regulatory,scheduling,construction,logistics,mobile,c�(programming language),correspondence,controls,human resources,specifications,recruitment,procurement,partnership,partnerships,management experience,negotiation,hardware,programming,agile,forecasting,advertising,business development,audit,architecture,supply chain,governance,staffing,continuous improvement,product development,networking,recruiting,product management,sap,troubleshooting,computer science,budgeting,electrical,customer experience,economics,information technology,transportation,social media,automation,lifecycle,filing,modeling,investigation,editing,purchasing,kpis,hospital,forecasts,acquisition,expenses,billing,workflow,product 
owner,analyze,cross functional,business process,process,improvement,pivot tables,pivot,vlookups,sharepoint,microsoft sharepoint,access database,access,test case,jira,tfs,hp alm,tableau,business object,business intelligence,jad,solicitation,kaban,vue.js,sketch,indesign,illustrator,english,french,active directory,data center,solution architecture,dns,network design,open source,desktop support,application support,administration,change management,video,invoices,administrative support,payments,lean,process improvement,installation,risk management,transactions,investigations,r (programming language),data analysis,statistics,protocols,program management,quality assurance,banking,outreach,sourcing,microsoft office,merchandising,r,teaching,pharmaceutical,fulfillment,positioning,tax,service delivery,investigate,editorial,account management,valid drivers license,electronics,pr,public relations,assembly,facebook,spreadsheets,recruit,proposal,data entry,hotel,ordering,branding,life cycle,real estate,relationship management,researching,process improvements,chemistry,saas,cad,sales experience,mathematics,customer-facing,audio,project management skills,six sigma,hospitality,mechanical engineering,auditing,employee relations,android,security clearance,licensing,fundraising,repairs,iso,market research,business strategy,pmp,data management,quality control,reconciliation,conversion,business analysis,financial analysis,ecommerce,client service,publishing,supervising,complex projects,key performance indicators,scrum,sports,e-commerce,journalism,d (programming language),data collection,higher education,marketing programs,financial management,business plans,user experience,client relationships,cloud,analytical skills,cisco,internal stakeholders,product marketing,regulatory requirements,itil,information security,aviation,supply chain management,industry experience,autocad,purchase orders,acquisitions,tv,instrumentation,strategic direction,law enforcement,call center,experiments,technical 
skills,human resource,business cases,build relationships,invoicing,support services,marketing strategy,operating systems,biology,start-up,electrical engineering,workflows,routing,non-profit,marketing plans,due diligence,business management,iphone,architectures,reconcile,dynamic environment,external partners,asset management,emea,intranet,sops,sas,digital media,prospecting,financial reporting,project delivery,operational excellence,standard operating procedures,technical knowledge,on-call,talent management,stakeholder management,tablets,analyze data,financial statements,microsoft office suite,fitness,case management,value proposition,industry trends,rfp,broadcast,portfolio management,fabrication,financial performance,customer requirements,psychology,marketing materials,resource management,physics,mortgage,development activities,end user,business planning,root cause,analysis,leadership development,relationship building,sdlc,on-boarding,quality standards,regulatory compliance,aws,kpi,status reports,product line,drafting,phone calls,product knowledge,business stakeholders,technical issues,admissions,supervisory experience,usability,pharmacy,commissioning,project plan,ms excel,fda,test plans,variances,financing,travel arrangements,internal customers,medical device,counsel,inventory management,performance metrics,lighting,outsourcing,performance improvement,management consulting,graphic design,transport,information management,.net,startup,matrix,front-end,project planning,business systems,accounts receivable,public health,hris,instructional design,in-store,employee engagement,cost effective,sales management,api,adobe creative suite,twitter,program development,event planning,cash flow,strategic plans,vendor management,trade shows,hotels,segmentation,contract management,gis,talent acquisition,photography,internal communications,client services,ibm,financial reports,product quality,beverage,strong analytical skills,underwriting,cpr,mining,sales 
goals,chemicals,scripting,migration,software engineering,mis,therapeutic,general ledger,ms project,standardization,retention,spelling,media relations,os,daily operations,immigration,product design,etl,field sales,driving record,peoplesoft,benchmark,quality management,apis,test cases,internal controls,telecom,business issues,research projects,data quality,strategic initiatives,office software,cfa,co-op,big data,journal entries,vmware,help desk,statistical analysis,datasets,alliances,solidworks,prototype,lan,sci,budget management,rfps,flex,gaap,experimental,cpg,information system,customer facing,process development,web services,international,travel,revenue growth,software development life cycle,operations management,computer applications,risk assessments,sales operations,raw materials,internal audit,physical security,sql server,affiliate,computer software,manage projects,business continuity,litigation,it infrastructure,cost reduction,small business,annual budget,ios,html5,real-time,consulting experience,circuits,risk assessment,cross-functional team,public policy,analyzing data,consulting services,google drive,ad words,pay per click,email,db2,expense tracking,reports,wordpress,yoast,ghostwriting,corel draw,automated billing,system,customer management,debugging,system administration,network configuration,software installation,security,tech support,updates,tci/ip,dhcp,wan/lan,ubuntu,virtualized networks,network automation,cloud management,ai,salesforce,mango db,math,calculus,product launch,mvp 2 | -------------------------------------------------------------------------------- /src/notebook/skills.csv: -------------------------------------------------------------------------------- 1 | technical 
skills,ajenti,django-suit,django-xadmin,flask-admin,flower,grappelli,wooey,algorithms,pypattyrn,python-patterns,sortedcontainers,django-simple-captcha,django-simple-spam-blocker,django-compressor,django-pipeline,django-storages,fanstatic,fileconveyor,flask-assets,jinja-assets-compressor,webassets,audiolazy,audioread,beets,dejavu,django-elastic-transcoder,eyed3,id3reader,m3u8,mingus,pyaudioanalysis,pydub,pyechonest,talkbox,timeside,tinytag,authomatic,django-allauth,django-oauth-toolkit,flask-oauthlib,oauthlib,python-oauth2,python-social-auth,rauth,sanction,jose,pyjwt,python-jws,python-jwt,bitbake,buildout,platformio,pybuilder,scons,django-cms,djedi-cms,feincms,kotti,mezzanine,opps,plone,quokka,wagtail,widgy,beaker,diskcache,django-cache-machine,django-cacheops,django-viewlet,dogpile.cache,hermescache,johnny-cache,pylibmc,errbot,coala,code2flow,pycallgraph,flake8,pylama,pylint,mypy,asciimatics,cement,click,cliff,clint,colorama,docopt,gooey,python-fire,python-prompt-toolkit,aws-cli,bashplotlib,caniusepython3,cookiecutter,doitlive,howdoi,httpie,mycli,pathpicker,percol,pgcli,saws,thefuck,try,python-future,python-modernize,six,opencv,pyocr,pytesseract,simplecv,eventlet,gevent,multiprocessing,threading,tomorrow,uvloop,config,configobj,configparser,profig,python-decouple,cryptography,hashids,paramiko,passlib,pynacl,blaze,orange,pandas,cerberus,colander,jsonschema,schematics,valideer,voluptuous,altair,bokeh,ggplot,matplotlib,pygal,pygraphviz,pyqtgraph,seaborn,vispy,pickledb,pipelinedb,tinydb,zodb,mysql,mysql-python,mysqlclient,oursql,pymysql,postgresql,psycopg2,queries,txpostgres,apsw,pymssql,nosql,cassandra-python-driver,happybase,plyvel,py2neo,pycassa,pymongo,redis-py,telephus,txredis,arrow,chronyk,dateutil,delorean,moment,pendulum,pytime,pytz,when.py,ipdb,pdb++,pudb,remote-pdb,wdb,line_profiler,memory_profiler,profiling,vprof,caffe,keras,mxnet,neupy,pytorch,tensorflow,theano,ansible,cloud-init,cuisine,docker,fabric,fabtools,honcho,openstack,pexpect,psutil,saltstack,superv
isor,dh-virtualenv,nuitka,py2app,py2exe,pyinstaller,pynsist,sphinx,awesome-sphinxdoc,mkdocs,pdoc,pycco,s3cmd,s4cmd,you-get,youtube-dl,alipay,cartridge,django-oscar,django-shop,merchant,money,python-currencies,forex-python,shoop,emacs,elpy,sublime,anaconda,sublimejedi,vim,jedi-vim,python-mode,youcompleteme,ptvs,visual,python,magic,liclipse,pycharm,spyder,envelopes,flanker,imbox,inbox.py,lamson,marrow,modoboa,nylas,yagmail,pipenv,p,pyenv,venv,virtualenv,virtualenvwrapper,imghdr,mimetypes,path.py,pathlib,python-magic,unipath,watchdog,cffi,ctypes,pycuda,swig,deform,django-bootstrap3,django-crispy-forms,django-remote-forms,wtforms,cytoolz,fn.py,funcy,toolz,curses,enaml,flexx,kivy,pyglet,pygobject,pyqt,pyside,pywebview,tkinter,toga,urwid,wxpython,cocos2d,panda3d,pygame,pyogre,pyopengl,pysdl2,renpy,django-countries,geodjango,geoip,geojson,geopy,pygeoip,beautifulsoup,bleach,cssutils,html5lib,lxml,markupsafe,pyquery,untangle,weasyprint,xmldataset,xmltodict,grequests,httplib2,requests,treq,urllib3,ino,keyboard,mouse,pingo,pyro,pyuserinput,scapy,wifi,hmap,imgseek,nude.py,pagan,pillow,pybarcode,pygram,python-qrcode,quads,scikit-image,thumbor,wand,clpython,cpython,cython,grumpy,ironpython,jython,micropython,numba,peachpy,pyjion,pypy,pysec,pyston,stackless,interactive,bpython,jupyter,ptpython,babel,pyicu,apscheduler,django-schedule,doit,gunnery,joblib,plan,schedule,spiff,taskflow,eliot,logbook,logging,sentry,metrics,nupic,scikit-learn,spark,vowpal_porpoise,xgboost,pyspark,luigi,mrjob,streamparse,dask,python(x,y),pythonlibs,pythonnet,pywin32,winpython,gensim,jieba,langid.py,nltk,pattern,polyglot,snownlp,spacy,textblob,mininet,pox,pyretic,sdx,asyncio,diesel,pulsar,pyzmq,twisted,txzmq,napalm,django-activity-stream,stream-framework,django,sqlalchemy,awesome-sqlalchemy,orator,peewee,ponyorm,pydal,python-sql,pip,conda,curdling,pip-tools,wheel,warehouse,bandersnatch,devpi,localshop,carteblanche,django-guardian,django-rules,delegator.py subprocesses 
for,sarge,sh,celery,huey,mrq,rq,simpleq,annoy,fastfm,implicit,libffm,lightfm,surprise,tensorrec,django-rest-framework,django-tastypie,flask,eve,flask-api-utils,flask-api,flask-restful,flask-restless,pyramid,cornice,falcon,hug,restless,ripozo,sandman,apistar,simplejsonrpcserver,simplexmlrpcserver,zerorpc,astropy,bcbio-nextgen,bccb,biopython,cclib,networkx,nipy,numpy,obspy,pydy,pymc,rdkit,scipy,statsmodels,sympy,zipline,simpy,django-haystack,elasticsearch-dsl-py,elasticsearch-py,esengine,pysolr,solrpy,whoosh,marshmallow,apex,python-lambda,zappa,tablib,marmir,openpyxl,pyexcel,python-docx,relatorio,unoconv,xlsxwriter,xlwings,xlwt / xlrd,pdf,pdfminer,pypdf2,reportlab,markdown,mistune,python-markdown,yaml,pyyaml,csvkit,unp,cactus,hyde,lektor,nikola,pelican,tinkerer,django-taggit,genshi,jinja2,mako,hypothesis,mamba,nose,nose2,pytest,robot,unittest,green,tox,locust,pyautogui,selenium,sixpack,splinter,doublex,freezegun,httmock,httpretty,mock,responses,vcr.py,factory_boy,mixer,model_mommy,mimesis,fake2db,faker,radar,chardet,difflib,ftfy,fuzzywuzzy,levenshtein,pangu.py,pyfiglet,pypinyin,shortuuid,unidecode,uniout,xpinyin,slugify,awesome-slugify,python-slugify,unicode-slugify,parser,phonenumbers,ply,pygments,pyparsing,python-nameparser,python-user-agents,sqlparse,apache-libcloud,boto3,django-wordpress,facebook-sdk,facepy,gmail,google-api-python-client,gspread,twython,furl,purl,pyshorteners,short_url,webargs,moviepy,scikit-video,wsgi-compatible,bjoern,fapws3,gunicorn,meinheld,netius,paste,rocket,uwsgi,waitress,werkzeug,haul,html2text,lassie,micawber,newspaper,opengraph,python-goose,python-readability,sanitize,sumy,textract,cola,demiurge,feedparser,grab,mechanicalsoup,portia,pyspider,robobrowser,scrapy,bottle,cherrypy,awesome-django,awesome-flask,awesome-pyramid,sanic,tornado,turbogears,web2py,github,autobahnpython,crossbar,django-socketio,websocket-for-python,javascript,php,c#,c++,ruby,css,c,objective-c,shell,scala,swift,matlab,clojure,octave,machine learning,data 
analytics,predictive analytics,html,js,accounts payable,receivables,inventory controls,payroll,deposits,bank reconciliation,planning and enacting cash-flows,report preparation,financial models,financial controls,documentation,time management,schedules,benchmarking,future state assessment,business process re-engineering,as-is analysis,defining solutions and scope,gap analysis,role change,wireframing,prototyping,user stories,financial analysis/modeling,swot analysis,quickbooks,quicken,erp,enterprise resource planning,spanish,german,rest,soap,json,website,ui,ux,design,crm,cms,communication,coding,windows,servers,unix,linux,redhat,solaris,java,perl,vb script,xml,database,oracle,microsoft sql,sql,microsoft word,microsoft powerpoint,powerpoint,word,excel,visio,microsoft visio,microsoft excel,adobe,photoshop,hadoop,hbase,hive,zookeeper,openserver,auto cad,pl/sql,ruby on rails,asp,jsp,operations,technical,training,sales,marketing,reporting,compliance,strategy,research,analytical,engineering,policies,budget,finance,project management,health,customer service,content,presentation,brand,presentations,safety,certification,seo,digital marketing,accounting,regulations,legal,engagement,analytics,distribution,coaching,testing,vendors,consulting,writing,contracts,inventory,retail,healthcare,regulatory,scheduling,construction,logistics,mobile,c�(programming language),correspondence,controls,human resources,specifications,recruitment,procurement,partnership,partnerships,management experience,negotiation,hardware,programming,agile,forecasting,advertising,business development,audit,architecture,supply chain,governance,staffing,continuous improvement,product development,networking,recruiting,product management,sap,troubleshooting,computer science,budgeting,electrical,customer experience,economics,information technology,transportation,social media,automation,lifecycle,filing,modeling,investigation,editing,purchasing,kpis,hospital,forecasts,acquisition,expenses,billing,workflow,product 
owner,analyze,cross functional,business process,process,improvement,pivot tables,pivot,vlookups,sharepoint,microsoft sharepoint,access database,access,test case,jira,tfs,hp alm,tableau,business object,business intelligence,jad,solicitation,kaban,vue.js,sketch,indesign,illustrator,english,french,active directory,data center,solution architecture,dns,network design,open source,desktop support,application support,administration,change management,video,invoices,administrative support,payments,lean,process improvement,installation,risk management,transactions,investigations,r (programming language),data analysis,statistics,protocols,program management,quality assurance,banking,outreach,sourcing,microsoft office,merchandising,r,teaching,pharmaceutical,fulfillment,positioning,tax,service delivery,investigate,editorial,account management,valid drivers license,electronics,pr,public relations,assembly,facebook,spreadsheets,recruit,proposal,data entry,hotel,ordering,branding,life cycle,real estate,relationship management,researching,process improvements,chemistry,saas,cad,sales experience,mathematics,customer-facing,audio,project management skills,six sigma,hospitality,mechanical engineering,auditing,employee relations,android,security clearance,licensing,fundraising,repairs,iso,market research,business strategy,pmp,data management,quality control,reconciliation,conversion,business analysis,financial analysis,ecommerce,client service,publishing,supervising,complex projects,key performance indicators,scrum,sports,e-commerce,journalism,d (programming language),data collection,higher education,marketing programs,financial management,business plans,user experience,client relationships,cloud,analytical skills,cisco,internal stakeholders,product marketing,regulatory requirements,itil,information security,aviation,supply chain management,industry experience,autocad,purchase orders,acquisitions,tv,instrumentation,strategic direction,law enforcement,call center,experiments,technical 
skills,human resource,business cases,build relationships,invoicing,support services,marketing strategy,operating systems,biology,start-up,electrical engineering,workflows,routing,non-profit,marketing plans,due diligence,business management,iphone,architectures,reconcile,dynamic environment,external partners,asset management,emea,intranet,sops,sas,digital media,prospecting,financial reporting,project delivery,operational excellence,standard operating procedures,technical knowledge,on-call,talent management,stakeholder management,tablets,analyze data,financial statements,microsoft office suite,fitness,case management,value proposition,industry trends,rfp,broadcast,portfolio management,fabrication,financial performance,customer requirements,psychology,marketing materials,resource management,physics,mortgage,development activities,end user,business planning,root cause,analysis,leadership development,relationship building,sdlc,on-boarding,quality standards,regulatory compliance,aws,kpi,status reports,product line,drafting,phone calls,product knowledge,business stakeholders,technical issues,admissions,supervisory experience,usability,pharmacy,commissioning,project plan,ms excel,fda,test plans,variances,financing,travel arrangements,internal customers,medical device,counsel,inventory management,performance metrics,lighting,outsourcing,performance improvement,management consulting,graphic design,transport,information management,.net,startup,matrix,front-end,project planning,business systems,accounts receivable,public health,hris,instructional design,in-store,employee engagement,cost effective,sales management,api,adobe creative suite,twitter,program development,event planning,cash flow,strategic plans,vendor management,trade shows,hotels,segmentation,contract management,gis,talent acquisition,photography,internal communications,client services,ibm,financial reports,product quality,beverage,strong analytical skills,underwriting,cpr,mining,sales 
goals,chemicals,scripting,migration,software engineering,mis,therapeutic,general ledger,ms project,standardization,retention,spelling,media relations,os,daily operations,immigration,product design,etl,field sales,driving record,peoplesoft,benchmark,quality management,apis,test cases,internal controls,telecom,business issues,research projects,data quality,strategic initiatives,office software,cfa,co-op,big data,journal entries,vmware,help desk,statistical analysis,datasets,alliances,solidworks,prototype,lan,sci,budget management,rfps,flex,gaap,experimental,cpg,information system,customer facing,process development,web services,international,travel,revenue growth,software development life cycle,operations management,computer applications,risk assessments,sales operations,raw materials,internal audit,physical security,sql server,affiliate,computer software,manage projects,business continuity,litigation,it infrastructure,cost reduction,small business,annual budget,ios,html5,real-time,consulting experience,circuits,risk assessment,cross-functional team,public policy,analyzing data,consulting services,google drive,ad words,pay per click,email,db2,expense tracking,reports,wordpress,yoast,ghostwriting,corel draw,automated billing,system,customer management,debugging,system administration,network configuration,software installation,security,tech support,updates,tci/ip,dhcp,wan/lan,ubuntu,virtualized networks,network automation,cloud management,ai,salesforce,mango db,math,calculus,product launch,mvp 2 | -------------------------------------------------------------------------------- /src/components/jd_data_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from time import sleep 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import 
# ---------------------------------------------------------------------------
# Glassdoor job-description scraper.
#   1) openbrowser/geturl harvest posting URLs for a keyword + location id.
#   2) scrape_job_details visits each URL and extracts the posting fields.
#   3) get_jobs is a generic Glassdoor scraper that also reads company info.
# NOTE(review): written against the Selenium 3 API (executable_path,
# find_element_by_*) and Glassdoor's old page markup; both are brittle, and
# the hard-coded Windows paths only exist on the original author's machine.
# ---------------------------------------------------------------------------

CHROMEDRIVER_PATH = r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\chromedriver_win32\chromedriver.exe'
OUTPUT_CSV = r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_unstructured_data.csv'
URL_JSON = 'url_data_scientist_loc_bangalore.json'


def openbrowser(locid, key):
    """Open Chrome on the Glassdoor search-results page.

    Args:
        locid: Glassdoor numeric location id (e.g. 4477468 = Bangalore).
        key:   search keyword; spaces are joined with '+' for the query string.

    Returns:
        The live WebDriver positioned on the first results page.
    """
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    driver.wait = WebDriverWait(driver, 5)
    driver.maximize_window()
    query = '+'.join(key.split())
    driver.get(
        "https://www.glassdoor.co.in/Job/jobs.htm?suggestCount=0&suggestChosen=true"
        "&clickSource=searchBtn&typedKeyword={0}&sc.keyword={0}&locT=C&locId={1}"
        "&jobType=fulltime&fromAge=1&radius=6&cityId=-1&minRating=0.0&industryId=-1"
        "&sgocId=-1&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0"
        .format(query, locid))
    return driver


def geturl(driver, min_urls=20):
    """Collect posting URLs by paging through results until *min_urls* found.

    The driver is always quit before returning (the original leaked it when
    enough URLs were found before the last page).
    """
    urls = set()
    while len(urls) < min_urls:
        print(len(urls))
        soup = BeautifulSoup(driver.page_source, "lxml")
        for listing in soup.find_all("li", {"class": "jl"}):
            urls.add('https://www.glassdoor.co.in{}'.format(listing.find('a')['href']))
        # Stop when the "next page" arrow is missing or disabled.
        next_li = soup.find("li", {"class": "next"})
        if next_li is None or next_li.find('a') is None:
            break
        try:
            driver.find_element_by_class_name("next").click()
            time.sleep(2)
        except ElementClickInterceptedException:
            # An overlay intercepted the click; retry on the next pass.
            pass
    driver.quit()
    return list(urls)


def scrape_job_details(urls):
    """Visit each posting URL and extract its fields.

    Returns a dict keyed by 1-based index with url, Position, Company,
    Location and Job_Description per posting.  Postings whose markup cannot
    be parsed are skipped — the original reused the previous posting's
    values (a bug) and raised NameError if the very first posting failed.
    """
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    data = {}
    for index, page_url in enumerate(tqdm(urls), start=1):
        driver.wait = WebDriverWait(driver, 2)
        driver.maximize_window()
        driver.get(page_url)
        try:
            position = driver.find_element_by_tag_name('h2').text
            company = driver.find_element_by_xpath("//span[@class='strong ib']").text
            location = driver.find_element_by_xpath("//span[@class='subtle ib']").text
            jd = driver.find_element_by_id("JobDescriptionContainer").text
        except NoSuchElementException:
            continue  # skip postings with unexpected markup
        data[index] = {
            'url': page_url,
            'Position': position,
            'Company': company,
            'Location': location,
            'Job_Description': jd,
        }
    driver.quit()
    return data


def _element_text_or_default(driver, xpath, default=-1):
    """Return ``.text`` of the element at *xpath*, or *default* when absent."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default


def _company_overview(driver):
    """Read the company "Overview" tab; every missing field defaults to -1.

    Rarely, some job postings do not have the "Company" tab at all — then
    all fields are -1 (matches the original behaviour).
    """
    labels = ("Headquarters", "Size", "Founded", "Type",
              "Industry", "Sector", "Revenue", "Competitors")
    try:
        driver.find_element_by_xpath(
            './/div[@class="tab" and @data-tab-type="overview"]').click()
    except NoSuchElementException:
        return {label: -1 for label in labels}
    xpath = ('.//div[@class="infoEntity"]//label[text()="{}"]'
             '//following-sibling::*')
    return {label: _element_text_or_default(driver, xpath.format(label))
            for label in labels}


def get_jobs(keyword, num_jobs, verbose, path, slp_time):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Args:
        keyword:  search phrase typed into the Glassdoor search box.
        num_jobs: stop after this many postings have been collected.
        verbose:  print every scraped field for debugging.
        path:     filesystem path to the chromedriver executable.
        slp_time: seconds to wait for each results page to load.

    Returns:
        pandas.DataFrame with one row per posting.
    '''
    options = webdriver.ChromeOptions()
    # Uncomment to scrape without opening a new Chrome window every time.
    # options.add_argument('headless')
    driver = webdriver.Chrome(executable_path=path, options=options)
    driver.set_window_size(1120, 1000)

    driver.get("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0"
               "&suggestChosen=false&clickSource=searchBtn&typedKeyword="
               + keyword + "&sc.keyword=" + keyword + "&locT=&locId=&jobType=")
    jobs = []

    while len(jobs) < num_jobs:
        # Let the page load; tune slp_time to your connection speed.
        time.sleep(slp_time)

        # Dismiss the "Sign Up" prompt if it appears.
        try:
            driver.find_element_by_class_name("selected").click()
        except ElementClickInterceptedException:
            pass
        time.sleep(.1)
        try:
            driver.find_element_by_css_selector('[alt="Close"]').click()
            print(' x out worked')
        except NoSuchElementException:
            print(' x out failed')

        # One clickable button per listing on the current results page.
        job_buttons = driver.find_elements_by_class_name("jl")
        for job_button in job_buttons:
            print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
            if len(jobs) >= num_jobs:
                break
            job_button.click()
            time.sleep(1)

            # Bounded retry while the details pane loads.  The original used
            # a bare `except` in an unbounded loop, which could hang forever
            # on a posting that never renders.
            details = None
            for _ in range(5):
                try:
                    details = {
                        "Company Name": driver.find_element_by_xpath('.//div[@class="employerName"]').text,
                        "Location": driver.find_element_by_xpath('.//div[@class="location"]').text,
                        "Job Title": driver.find_element_by_xpath('.//div[contains(@class, "title")]').text,
                        "Job Description": driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text,
                    }
                    break
                except NoSuchElementException:
                    time.sleep(5)
            if details is None:
                continue  # give up on this posting

            # -1 is the "not found" sentinel; downstream code relies on it.
            salary_estimate = _element_text_or_default(driver, './/span[@class="gray salary"]')
            rating = _element_text_or_default(driver, './/span[@class="rating"]')
            overview = _company_overview(driver)

            if verbose:
                print("Job Title: {}".format(details["Job Title"]))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(details["Job Description"][:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(details["Company Name"]))
                print("Location: {}".format(details["Location"]))
                print("Headquarters: {}".format(overview["Headquarters"]))
                print("Size: {}".format(overview["Size"]))
                print("Founded: {}".format(overview["Founded"]))
                print("Type of Ownership: {}".format(overview["Type"]))
                print("Industry: {}".format(overview["Industry"]))
                print("Sector: {}".format(overview["Sector"]))
                print("Revenue: {}".format(overview["Revenue"]))
                print("Competitors: {}".format(overview["Competitors"]))

            jobs.append({"Job Title": details["Job Title"],
                         "Salary Estimate": salary_estimate,
                         "Job Description": details["Job Description"],
                         "Rating": rating,
                         "Company Name": details["Company Name"],
                         "Location": details["Location"],
                         "Headquarters": overview["Headquarters"],
                         "Size": overview["Size"],
                         "Founded": overview["Founded"],
                         "Type of ownership": overview["Type"],
                         "Industry": overview["Industry"],
                         "Sector": overview["Sector"],
                         "Revenue": overview["Revenue"],
                         "Competitors": overview["Competitors"]})

        # Advance to the next results page, or stop if there is none.
        try:
            driver.find_element_by_xpath('.//li[@class="next"]//a').click()
        except NoSuchElementException:
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break

    return pd.DataFrame(jobs)


def main():
    """Run the full pipeline: URL harvest -> detail scrape -> CSV export."""
    search_driver = openbrowser(locid=4477468, key='"Data Scientist"')
    with open(URL_JSON, 'w') as f:
        json.dump(geturl(search_driver), f, indent=4)
    print("file created")

    with open(URL_JSON, 'r') as f:
        urls = json.load(f)

    jd = pd.DataFrame(scrape_job_details(urls)).transpose()
    jd = jd[['url', 'Position', 'Company', 'Location', 'Job_Description']]
    jd.to_csv(OUTPUT_CSV)
    print('file created')

    # BUG FIX: the original passed the live WebDriver object as the
    # chromedriver *path* argument of get_jobs; pass the path instead.
    unstructured_data_df = get_jobs('data scientist', 1000, False,
                                    CHROMEDRIVER_PATH, 15)
    unstructured_data_df.to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()
| 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /myenv/Scripts/dumppdf.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | """Extract pdf structure in XML format""" 3 | import logging 4 | import os.path 5 | import re 6 | import sys 7 | from typing import Any, 
logging.basicConfig()
logger = logging.getLogger(__name__)

# Characters that must become numeric XML entities: C0 controls, the XML
# metacharacters & < > ", PDF string delimiters ( ) ' \, and all non-ASCII.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s: Union[str, bytes]) -> str:
    """Return *s* with XML-unsafe characters replaced by ``&#NNN;`` entities.

    Bytes input is interpreted as latin-1 so every byte maps to one char.
    """
    if isinstance(s, bytes):
        us = str(s, "latin-1")
    else:
        us = s
    return ESC_PAT.sub(lambda m: "&#%d;" % ord(m.group(0)), us)


def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
    """Recursively serialize a pdfminer PDF object as XML onto *out*.

    The render this file was recovered from stripped every XML tag literal
    out of the ``out.write`` format strings (e.g. ``'\\n' % len(obj)``, a
    guaranteed TypeError); they are restored here from the pdfminer.six
    ``dumppdf`` tool.
    """
    if obj is None:
        out.write("<null />")
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            out.write("<key>%s</key>\n" % k)
            out.write("<value>")
            dumpxml(out, v)
            out.write("</value>\n")
        out.write("</dict>")
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write("\n")
        out.write("</list>")
        return

    if isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == "raw":
            # Bug (upstream): writing bytes to a text stream raises TypeError.
            out.write(obj.get_rawdata())  # type: ignore [arg-type]
        elif codec == "binary":
            # Bug (upstream): writing bytes to a text stream raises TypeError.
            out.write(obj.get_data())  # type: ignore [arg-type]
        else:
            out.write("<stream>\n<props>\n")
            dumpxml(out, obj.attrs)
            out.write("\n</props>\n")
            if codec == "text":
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
            out.write("</stream>")
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        # Likely bug (upstream): obj.name may be bytes, not str.
        out.write("<keyword>%s</keyword>" % obj.name)  # type: ignore [str-bytes-safe]
        return

    if isinstance(obj, PSLiteral):
        # Likely bug (upstream): obj.name may be bytes, not str.
        out.write("<literal>%s</literal>" % obj.name)  # type: ignore [str-bytes-safe]
        return

    if isnumber(obj):
        out.write("<number>%s</number>" % obj)
        return

    raise TypeError(obj)


def dumptrailers(
    out: TextIO, doc: PDFDocument, show_fallback_xref: bool = False
) -> None:
    """Dump every trailer dictionary in *doc* as ``<trailer>`` elements."""
    for xref in doc.xrefs:
        if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
            out.write("<trailer>\n")
            dumpxml(out, xref.get_trailer())
            out.write("\n</trailer>\n\n")
    no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
    if no_xrefs and not show_fallback_xref:
        msg = (
            "This PDF does not have an xref. Use --show-fallback-xref if "
            "you want to display the content of a fallback xref that "
            "contains all objects."
        )
        logger.warning(msg)
    return
def dumpallobjs(
    out: TextIO,
    doc: PDFDocument,
    codec: Optional[str] = None,
    show_fallback_xref: bool = False,
) -> None:
    """Dump every object in *doc* (plus trailers) wrapped in a ``<pdf>`` root.

    XML tag literals were stripped by the render this file was recovered
    from; restored from the pdfminer.six ``dumppdf`` tool.
    """
    visited = set()
    out.write("<pdf>")
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue  # the same objid can appear in several xref tables
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, obj, codec=codec)
                out.write("\n</object>\n\n")
            except PDFObjectNotFound as e:
                print("not found: %r" % e)
    dumptrailers(out, doc, show_fallback_xref)
    out.write("</pdf>")
    return


def dumpoutline(
    outfp: TextIO,
    fname: str,
    objids: Any,
    pagenos: Container[int],
    password: str = "",
    dumpall: bool = False,
    codec: Optional[str] = None,
    extractdir: Optional[str] = None,
) -> None:
    """Dump the document outline (table of contents) of *fname* as XML.

    objids/pagenos/dumpall/codec/extractdir are unused here; the signature
    is kept parallel with the other sub-commands so the CLI can dispatch
    uniformly.
    """
    fp = open(fname, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    # Map page object ids to 1-based page numbers so destinations can be
    # reported as printable page numbers.
    pages = {
        page.pageid: pageno
        for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
    }

    def resolve_dest(dest: object) -> Any:
        # Follow named destinations, /D dictionaries and indirect refs
        # until we reach the concrete destination array.
        if isinstance(dest, (str, bytes)):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest["D"]
        if isinstance(dest, PDFObjRef):
            dest = dest.resolve()
        return dest

    try:
        outlines = doc.get_outlines()
        outfp.write("<outlines>\n")
        for (level, title, dest, a, se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                # No direct destination: try a /GoTo action instead.
                action = a
                if isinstance(action, dict):
                    subtype = action.get("S")
                    if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                        dest = resolve_dest(action["D"])
                        pageno = pages[dest[0].objid]
            s = escape(title)
            outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
            if dest is not None:
                outfp.write("<dest>")
                dumpxml(outfp, dest)
                outfp.write("</dest>\n")
            if pageno is not None:
                outfp.write("<pageno>%r</pageno>\n" % pageno)
            outfp.write("</outline>\n")
        outfp.write("</outlines>\n")
    except PDFNoOutlines:
        # A PDF without an outline is not an error; emit nothing.
        pass
    parser.close()
    fp.close()
    return


LITERAL_FILESPEC = LIT("Filespec")
LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")


def extractembedded(fname: str, password: str, extractdir: str) -> None:
    """Extract every embedded file in *fname* into *extractdir*.

    Raises PDFValueError for malformed Filespec entries and IOError if a
    target file already exists (never overwrites).
    """

    def extract1(objid: int, obj: Dict[str, Any]) -> None:
        # Prefer the Unicode file name (/UF) over the legacy /F entry.
        filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
        fileref = obj["EF"].get("UF") or obj["EF"].get("F")
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = (
                "unable to process PDF: reference for %r is not a "
                "PDFStream" % filename
            )
            raise PDFValueError(error_msg)
        if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                "unable to process PDF: reference for %r "
                "is not an EmbeddedFile" % (filename)
            )
        path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
        if os.path.exists(path):
            raise IOError("file exists: %r" % path)
        print("extracting: %r" % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # `with` closes the output file even if get_data() raises
        # (the original leaked the handle on error).
        with open(path, "wb") as out:
            out.write(fileobj.get_data())
        return

    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if (
                    objid not in extracted_objids
                    and isinstance(obj, dict)
                    and obj.get("Type") is LITERAL_FILESPEC
                ):
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
248 | extract1(objid, obj) 249 | return 250 | 251 | 252 | def dumppdf( 253 | outfp: TextIO, 254 | fname: str, 255 | objids: Iterable[int], 256 | pagenos: Container[int], 257 | password: str = "", 258 | dumpall: bool = False, 259 | codec: Optional[str] = None, 260 | extractdir: Optional[str] = None, 261 | show_fallback_xref: bool = False, 262 | ) -> None: 263 | fp = open(fname, "rb") 264 | parser = PDFParser(fp) 265 | doc = PDFDocument(parser, password) 266 | if objids: 267 | for objid in objids: 268 | obj = doc.getobj(objid) 269 | dumpxml(outfp, obj, codec=codec) 270 | if pagenos: 271 | for (pageno, page) in enumerate(PDFPage.create_pages(doc)): 272 | if pageno in pagenos: 273 | if codec: 274 | for obj in page.contents: 275 | obj = stream_value(obj) 276 | dumpxml(outfp, obj, codec=codec) 277 | else: 278 | dumpxml(outfp, page.attrs) 279 | if dumpall: 280 | dumpallobjs(outfp, doc, codec, show_fallback_xref) 281 | if (not objids) and (not pagenos) and (not dumpall): 282 | dumptrailers(outfp, doc, show_fallback_xref) 283 | fp.close() 284 | if codec not in ("raw", "binary"): 285 | outfp.write("\n") 286 | return 287 | 288 | 289 | def create_parser() -> ArgumentParser: 290 | parser = ArgumentParser(description=__doc__, add_help=True) 291 | parser.add_argument( 292 | "files", 293 | type=str, 294 | default=None, 295 | nargs="+", 296 | help="One or more paths to PDF files.", 297 | ) 298 | 299 | parser.add_argument( 300 | "--version", 301 | "-v", 302 | action="version", 303 | version="pdfminer.six v{}".format(pdfminer.__version__), 304 | ) 305 | parser.add_argument( 306 | "--debug", 307 | "-d", 308 | default=False, 309 | action="store_true", 310 | help="Use debug logging level.", 311 | ) 312 | procedure_parser = parser.add_mutually_exclusive_group() 313 | procedure_parser.add_argument( 314 | "--extract-toc", 315 | "-T", 316 | default=False, 317 | action="store_true", 318 | help="Extract structure of outline", 319 | ) 320 | procedure_parser.add_argument( 321 | 
"--extract-embedded", "-E", type=str, help="Extract embedded files" 322 | ) 323 | 324 | parse_params = parser.add_argument_group( 325 | "Parser", description="Used during PDF parsing" 326 | ) 327 | parse_params.add_argument( 328 | "--page-numbers", 329 | type=int, 330 | default=None, 331 | nargs="+", 332 | help="A space-seperated list of page numbers to parse.", 333 | ) 334 | parse_params.add_argument( 335 | "--pagenos", 336 | "-p", 337 | type=str, 338 | help="A comma-separated list of page numbers to parse. Included for " 339 | "legacy applications, use --page-numbers for more idiomatic " 340 | "argument entry.", 341 | ) 342 | parse_params.add_argument( 343 | "--objects", 344 | "-i", 345 | type=str, 346 | help="Comma separated list of object numbers to extract", 347 | ) 348 | parse_params.add_argument( 349 | "--all", 350 | "-a", 351 | default=False, 352 | action="store_true", 353 | help="If the structure of all objects should be extracted", 354 | ) 355 | parse_params.add_argument( 356 | "--show-fallback-xref", 357 | action="store_true", 358 | help="Additionally show the fallback xref. Use this if the PDF " 359 | "has zero or only invalid xref's. This setting is ignored if " 360 | "--extract-toc or --extract-embedded is used.", 361 | ) 362 | parse_params.add_argument( 363 | "--password", 364 | "-P", 365 | type=str, 366 | default="", 367 | help="The password to use for decrypting PDF file.", 368 | ) 369 | 370 | output_params = parser.add_argument_group( 371 | "Output", description="Used during output generation." 372 | ) 373 | output_params.add_argument( 374 | "--outfile", 375 | "-o", 376 | type=str, 377 | default="-", 378 | help='Path to file where output is written. 
Or "-" (default) to ' 379 | "write to stdout.", 380 | ) 381 | codec_parser = output_params.add_mutually_exclusive_group() 382 | codec_parser.add_argument( 383 | "--raw-stream", 384 | "-r", 385 | default=False, 386 | action="store_true", 387 | help="Write stream objects without encoding", 388 | ) 389 | codec_parser.add_argument( 390 | "--binary-stream", 391 | "-b", 392 | default=False, 393 | action="store_true", 394 | help="Write stream objects with binary encoding", 395 | ) 396 | codec_parser.add_argument( 397 | "--text-stream", 398 | "-t", 399 | default=False, 400 | action="store_true", 401 | help="Write stream objects as plain text", 402 | ) 403 | 404 | return parser 405 | 406 | 407 | def main(argv: Optional[List[str]] = None) -> None: 408 | parser = create_parser() 409 | args = parser.parse_args(args=argv) 410 | 411 | if args.debug: 412 | logging.getLogger().setLevel(logging.DEBUG) 413 | 414 | if args.outfile == "-": 415 | outfp = sys.stdout 416 | else: 417 | outfp = open(args.outfile, "w") 418 | 419 | if args.objects: 420 | objids = [int(x) for x in args.objects.split(",")] 421 | else: 422 | objids = [] 423 | 424 | if args.page_numbers: 425 | pagenos = {x - 1 for x in args.page_numbers} 426 | elif args.pagenos: 427 | pagenos = {int(x) - 1 for x in args.pagenos.split(",")} 428 | else: 429 | pagenos = set() 430 | 431 | password = args.password 432 | 433 | if args.raw_stream: 434 | codec: Optional[str] = "raw" 435 | elif args.binary_stream: 436 | codec = "binary" 437 | elif args.text_stream: 438 | codec = "text" 439 | else: 440 | codec = None 441 | 442 | for fname in args.files: 443 | if args.extract_toc: 444 | dumpoutline( 445 | outfp, 446 | fname, 447 | objids, 448 | pagenos, 449 | password=password, 450 | dumpall=args.all, 451 | codec=codec, 452 | extractdir=None, 453 | ) 454 | elif args.extract_embedded: 455 | extractembedded(fname, password=password, extractdir=args.extract_embedded) 456 | else: 457 | dumppdf( 458 | outfp, 459 | fname, 460 | objids, 461 | 
pagenos, 462 | password=password, 463 | dumpall=args.all, 464 | codec=codec, 465 | extractdir=None, 466 | show_fallback_xref=args.show_fallback_xref, 467 | ) 468 | 469 | outfp.close() 470 | 471 | 472 | if __name__ == "__main__": 473 | main() 474 | -------------------------------------------------------------------------------- /src/notebook/jd_data_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from tqdm import tqdm\n", 11 | "from time import sleep\n", 12 | "from selenium import webdriver\n", 13 | "from selenium.webdriver.common.by import By\n", 14 | "from selenium.webdriver.support.ui import WebDriverWait\n", 15 | "from selenium.webdriver.support import expected_conditions as EC\n", 16 | "from selenium.common.exceptions import TimeoutException\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "from selenium.common.exceptions import ElementClickInterceptedException\n", 19 | "from selenium.common.exceptions import NoSuchElementException\n", 20 | "import json\n", 21 | "import urllib\n", 22 | "import time" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "driver = webdriver.Chrome(executable_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe')\n", 32 | "\n", 33 | "def openbrowser(locid, key):\n", 34 | " driver.wait = WebDriverWait(driver, 5)\n", 35 | " driver.maximize_window()\n", 36 | " words = key.split()\n", 37 | " txt ='' \n", 38 | " for w in words:\n", 39 | " txt +=(w+'+')\n", 40 | " #print (txt)\n", 41 | " driver.get(\"https://www.glassdoor.co.in/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword={}\n", 42 | " 
&sc.keyword\"={}&locT=C&locId={}&jobType=fulltime&fromAge=1&radius=6&cityId=-1&minRating=0.0&industryId=-1\n", 43 | " &sgocId=-1&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0\".format(txt[:-1],txt[:-1], locid))\n", 44 | " return driver\n", 45 | "\n", 46 | "def geturl(driver):\n", 47 | " url = set()\n", 48 | " while True:\n", 49 | " print(len(url))\n", 50 | " if len(url)>=20:\n", 51 | " break\n", 52 | " soup1 = BeautifulSoup(driver.page_source, \"lxml\")\n", 53 | " \n", 54 | " main = soup1.find_all(\"li\",{\"class\":\"jl\"})\n", 55 | " \n", 56 | " for m in main:\n", 57 | " url.add('https://www.glassdoor.co.in{}'.format(m.find('a')['href'])) \n", 58 | " try:\n", 59 | " next_element = soup1.find(\"li\", {\"class\": \"next\"})\n", 60 | " try:\n", 61 | " next_exist = next_element.find('a')\n", 62 | " except AttributeError:\n", 63 | " driver.quit()\n", 64 | " break\n", 65 | " except NoSuchElementException:\n", 66 | " driver.quit()\n", 67 | " break\n", 68 | " if next_exist:\n", 69 | " \n", 70 | " driver.find_element_by_class_name(\"next\").click()\n", 71 | " time.sleep(2)\n", 72 | " else:\n", 73 | " driver.quit()\n", 74 | " break\n", 75 | " except ElementClickInterceptedException:\n", 76 | " pass\n", 77 | " \n", 78 | " return list(url)\n", 79 | "\n", 80 | "x =openbrowser(locid =4477468, key='\"Data Scientist\"')\n", 81 | "with open('url_data_scientist_loc_bangalore.json','w') as f:\n", 82 | " json.dump(geturl(driver),f, indent = 4)\n", 83 | " print(\"file created\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "with open('url_data_scientist_loc_bangalore.json','r') as f:\n", 93 | " url = json.load(f)\n", 94 | "data ={} \n", 95 | "i = 1\n", 96 | "jd_df = pd.DataFrame()\n", 97 | "driver = webdriver.Chrome(executable_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe')\n", 98 | "\n", 
99 | "for u in tqdm(url):\n", 100 | " driver.wait = WebDriverWait(driver, 2)\n", 101 | " driver.maximize_window()\n", 102 | " driver.get(u)\n", 103 | " soup = BeautifulSoup(driver.page_source, \"lxml\")\n", 104 | " try:\n", 105 | " \n", 106 | " header = soup.find(\"div\",{\"class\":\"header cell info\"})\n", 107 | " position = driver.find_element_by_tag_name('h2').text\n", 108 | " company = driver.find_element_by_xpath(\"//span[@class='strong ib']\").text\n", 109 | " location = driver.find_element_by_xpath(\"//span[@class='subtle ib']\").text\n", 110 | " jd_temp = driver.find_element_by_id(\"JobDescriptionContainer\")\n", 111 | " jd = jd_temp.text\n", 112 | " info = soup.find_all(\"infoEntity\")\n", 113 | " except IndexError:\n", 114 | " print('IndexError: list index out of range')\n", 115 | " except NoSuchElementException:\n", 116 | " pass\n", 117 | " data[i] = {\n", 118 | " 'url' :u,\n", 119 | " 'Position':position,\n", 120 | " 'Company': company,\n", 121 | " 'Location' :location,\n", 122 | " 'Job_Description' :jd\n", 123 | " }\n", 124 | " i+=1 \n", 125 | "driver.quit()\n", 126 | "jd_df = pd.DataFrame(data)\n", 127 | "jd = jd_df.transpose()\n", 128 | "\n", 129 | "jd = jd[['url','Position','Company','Location','Job_Description']]\n", 130 | "jd.to_csv('unstructured_data.csv')\n", 131 | "print('file created')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "def get_jobs(keyword, num_jobs, verbose, path, slp_time):\n", 141 | " \n", 142 | " '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n", 143 | " \n", 144 | " #Initializing the webdriver\n", 145 | " options = webdriver.ChromeOptions()\n", 146 | " \n", 147 | " #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n", 148 | " #options.add_argument('headless')\n", 149 | " \n", 150 | " #Change the path to where chromedriver is in your home folder.\n", 151 | " driver = 
webdriver.Chrome(executable_path=path, options=options)\n", 152 | " driver.set_window_size(1120, 1000)\n", 153 | " \n", 154 | " url = \"https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=\"+keyword+\"&sc.keyword=\"+keyword+\"&locT=&locId=&jobType=\"\n", 155 | " #url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n", 156 | " driver.get(url)\n", 157 | " jobs = []\n", 158 | "\n", 159 | " while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n", 160 | "\n", 161 | " #Let the page load. Change this number based on your internet speed.\n", 162 | " #Or, wait until the webpage is loaded, instead of hardcoding it.\n", 163 | " time.sleep(slp_time)\n", 164 | "\n", 165 | " #Test for the \"Sign Up\" prompt and get rid of it.\n", 166 | " try:\n", 167 | " driver.find_element_by_class_name(\"selected\").click()\n", 168 | " except ElementClickInterceptedException:\n", 169 | " pass\n", 170 | "\n", 171 | " time.sleep(.1)\n", 172 | "\n", 173 | " try:\n", 174 | " driver.find_element_by_css_selector('[alt=\"Close\"]').click() #clicking to the X.\n", 175 | " print(' x out worked')\n", 176 | " except NoSuchElementException:\n", 177 | " print(' x out failed')\n", 178 | " pass\n", 179 | "\n", 180 | " \n", 181 | " #Going through each job in this page\n", 182 | " job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. 
These are the buttons we're going to click.\n", 183 | " for job_button in job_buttons: \n", 184 | "\n", 185 | " print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n", 186 | " if len(jobs) >= num_jobs:\n", 187 | " break\n", 188 | "\n", 189 | " job_button.click() #You might \n", 190 | " time.sleep(1)\n", 191 | " collected_successfully = False\n", 192 | " \n", 193 | " while not collected_successfully:\n", 194 | " try:\n", 195 | " company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n", 196 | " location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n", 197 | " job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n", 198 | " job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n", 199 | " collected_successfully = True\n", 200 | " except:\n", 201 | " time.sleep(5)\n", 202 | "\n", 203 | " try:\n", 204 | " salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray salary\"]').text\n", 205 | " except NoSuchElementException:\n", 206 | " salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n", 207 | " \n", 208 | " try:\n", 209 | " rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n", 210 | " except NoSuchElementException:\n", 211 | " rating = -1 #You need to set a \"not found value. It's important.\"\n", 212 | "\n", 213 | " #Printing for debugging\n", 214 | " if verbose:\n", 215 | " print(\"Job Title: {}\".format(job_title))\n", 216 | " print(\"Salary Estimate: {}\".format(salary_estimate))\n", 217 | " print(\"Job Description: {}\".format(job_description[:500]))\n", 218 | " print(\"Rating: {}\".format(rating))\n", 219 | " print(\"Company Name: {}\".format(company_name))\n", 220 | " print(\"Location: {}\".format(location))\n", 221 | "\n", 222 | " #Going to the Company tab...\n", 223 | " #clicking on this:\n", 224 | " #
Company
\n", 225 | " try:\n", 226 | " driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n", 227 | "\n", 228 | " try:\n", 229 | " #
\n", 230 | " # \n", 231 | " # San Francisco, CA\n", 232 | " #
\n", 233 | " headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n", 234 | " except NoSuchElementException:\n", 235 | " headquarters = -1\n", 236 | "\n", 237 | " try:\n", 238 | " size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n", 239 | " except NoSuchElementException:\n", 240 | " size = -1\n", 241 | "\n", 242 | " try:\n", 243 | " founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n", 244 | " except NoSuchElementException:\n", 245 | " founded = -1\n", 246 | "\n", 247 | " try:\n", 248 | " type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n", 249 | " except NoSuchElementException:\n", 250 | " type_of_ownership = -1\n", 251 | "\n", 252 | " try:\n", 253 | " industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n", 254 | " except NoSuchElementException:\n", 255 | " industry = -1\n", 256 | "\n", 257 | " try:\n", 258 | " sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n", 259 | " except NoSuchElementException:\n", 260 | " sector = -1\n", 261 | "\n", 262 | " try:\n", 263 | " revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n", 264 | " except NoSuchElementException:\n", 265 | " revenue = -1\n", 266 | "\n", 267 | " try:\n", 268 | " competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n", 269 | " except NoSuchElementException:\n", 270 | " competitors = -1\n", 271 | "\n", 272 | " except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" 
tab.\n", 273 | " headquarters = -1\n", 274 | " size = -1\n", 275 | " founded = -1\n", 276 | " type_of_ownership = -1\n", 277 | " industry = -1\n", 278 | " sector = -1\n", 279 | " revenue = -1\n", 280 | " competitors = -1\n", 281 | "\n", 282 | " \n", 283 | " if verbose:\n", 284 | " print(\"Headquarters: {}\".format(headquarters))\n", 285 | " print(\"Size: {}\".format(size))\n", 286 | " print(\"Founded: {}\".format(founded))\n", 287 | " print(\"Type of Ownership: {}\".format(type_of_ownership))\n", 288 | " print(\"Industry: {}\".format(industry))\n", 289 | " print(\"Sector: {}\".format(sector))\n", 290 | " print(\"Revenue: {}\".format(revenue))\n", 291 | " print(\"Competitors: {}\".format(competitors))\n", 292 | "\n", 293 | " jobs.append({\"Job Title\" : job_title,\n", 294 | " \"Salary Estimate\" : salary_estimate,\n", 295 | " \"Job Description\" : job_description,\n", 296 | " \"Rating\" : rating,\n", 297 | " \"Company Name\" : company_name,\n", 298 | " \"Location\" : location,\n", 299 | " \"Headquarters\" : headquarters,\n", 300 | " \"Size\" : size,\n", 301 | " \"Founded\" : founded,\n", 302 | " \"Type of ownership\" : type_of_ownership,\n", 303 | " \"Industry\" : industry,\n", 304 | " \"Sector\" : sector,\n", 305 | " \"Revenue\" : revenue,\n", 306 | " \"Competitors\" : competitors})\n", 307 | " #add job to jobs\n", 308 | " \n", 309 | " \n", 310 | " #Clicking on the \"next page\" button\n", 311 | " try:\n", 312 | " driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n", 313 | " except NoSuchElementException:\n", 314 | " print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n", 315 | " break\n", 316 | "\n", 317 | " return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "path = r\"C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe\"\n", 327 | "\n", 328 | "unstructured_data_df = get_jobs('data scientist',1000, False, path, 15)\n", 329 | "\n", 330 | "unstructured_data_df.to_csv('unstructured_data.csv', index = False)" 331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "myenv", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.10.10" 351 | }, 352 | "orig_nbformat": 4, 353 | "vscode": { 354 | "interpreter": { 355 | "hash": "ae6b9c19ba8290d367f751939abe8de5af7ecdf4fdf442937bc3215b661f3d40" 356 | } 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /src/notebook/job_recommendation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "from ftfy import fix_text\n", 11 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 12 | "import re\n", 13 | "from sklearn.neighbors import NearestNeighbors\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import nltk\n", 17 | "from nltk.corpus import stopwords\n", 18 | "stopw = set(stopwords.words('english'))\n", 19 | "from pyresparser import ResumeParser\n", 20 | "import os\n", 21 | "from docx import Document\n", 22 | "from 
skills_extraction import skills_extractor" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
Job TitleRatingCompany NameLocationHeadquartersSizeFoundedType of ownershipIndustrySectorCompetitorsAverage SalaryAverage RevenueProcessed_JD
0Data Scientist3.8Tecolote ResearchAlbuquerque, NMGoleta, CA750.51973Company - PrivateAerospace & DefenseAerospace & Defense-172.075.000000Data Scientist Location: Albuquerque, Educatio...
1Healthcare Data Scientist3.4University of Maryland Medical SystemLinthicum, MDBaltimore, MD10000.01984Other OrganizationHealth Care Services & HospitalsHealth Care-187.53500.000000What You Will Do: General Summary The Healthca...
2Data Scientist4.8KnowBe4Clearwater, FLClearwater, FL750.52010Company - PrivateSecurity ServicesBusiness Services-185.0300.000000KnowBe4, Inc. high growth information security...
3Data Scientist3.8PNNLRichland, WARichland, WA3000.51965GovernmentEnergyOil, Gas, Energy & UtilitiesOak Ridge National Laboratory, National Renewa...76.5250500.000000*Organization Job ID** Job ID: 310709 Director...
4Data Scientist2.9Affinity SolutionsNew York, NYNew York, NY125.51998Company - PrivateAdvertising & MarketingBusiness ServicesCommerce Signals, Cardlytics, Yodlee114.524319.000761Data Scientist Affinity Solutions Marketing Cl...
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " Job Title Rating Company Name \n", 159 | "0 Data Scientist 3.8 Tecolote Research \\\n", 160 | "1 Healthcare Data Scientist 3.4 University of Maryland Medical System \n", 161 | "2 Data Scientist 4.8 KnowBe4 \n", 162 | "3 Data Scientist 3.8 PNNL \n", 163 | "4 Data Scientist 2.9 Affinity Solutions \n", 164 | "\n", 165 | " Location Headquarters Size Founded Type of ownership \n", 166 | "0 Albuquerque, NM Goleta, CA 750.5 1973 Company - Private \\\n", 167 | "1 Linthicum, MD Baltimore, MD 10000.0 1984 Other Organization \n", 168 | "2 Clearwater, FL Clearwater, FL 750.5 2010 Company - Private \n", 169 | "3 Richland, WA Richland, WA 3000.5 1965 Government \n", 170 | "4 New York, NY New York, NY 125.5 1998 Company - Private \n", 171 | "\n", 172 | " Industry Sector \n", 173 | "0 Aerospace & Defense Aerospace & Defense \\\n", 174 | "1 Health Care Services & Hospitals Health Care \n", 175 | "2 Security Services Business Services \n", 176 | "3 Energy Oil, Gas, Energy & Utilities \n", 177 | "4 Advertising & Marketing Business Services \n", 178 | "\n", 179 | " Competitors Average Salary \n", 180 | "0 -1 72.0 \\\n", 181 | "1 -1 87.5 \n", 182 | "2 -1 85.0 \n", 183 | "3 Oak Ridge National Laboratory, National Renewa... 76.5 \n", 184 | "4 Commerce Signals, Cardlytics, Yodlee 114.5 \n", 185 | "\n", 186 | " Average Revenue Processed_JD \n", 187 | "0 75.000000 Data Scientist Location: Albuquerque, Educatio... \n", 188 | "1 3500.000000 What You Will Do: General Summary The Healthca... \n", 189 | "2 300.000000 KnowBe4, Inc. high growth information security... \n", 190 | "3 250500.000000 *Organization Job ID** Job ID: 310709 Director... \n", 191 | "4 24319.000761 Data Scientist Affinity Solutions Marketing Cl... 
" 192 | ] 193 | }, 194 | "execution_count": 4, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "# Load dataset:\n", 201 | "jd_df=pd.read_csv('jd_structured_data.csv')\n", 202 | "jd_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 7, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Load the extracted resume skills:\n", 212 | "file_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\src\\notebook\\CV.pdf'\n", 213 | "skills=[]\n", 214 | "skills.append(' '.join(word for word in skills_extractor(file_path)))" 215 | ] 216 | }, 217 | { 218 | "attachments": {}, 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "# Feature Engineering" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 8, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "def ngrams(string, n=3):\n", 232 | " string = fix_text(string) # fix text\n", 233 | " string = string.encode(\"ascii\", errors=\"ignore\").decode() #remove non ascii chars\n", 234 | " string = string.lower()\n", 235 | " chars_to_remove = [\")\",\"(\",\".\",\"|\",\"[\",\"]\",\"{\",\"}\",\"'\"]\n", 236 | " rx = '[' + re.escape(''.join(chars_to_remove)) + ']'\n", 237 | " string = re.sub(rx, '', string)\n", 238 | " string = string.replace('&', 'and')\n", 239 | " string = string.replace(',', ' ')\n", 240 | " string = string.replace('-', ' ')\n", 241 | " string = string.title() # normalise case - capital at start of each word\n", 242 | " string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single\n", 243 | " string = ' '+ string +' ' # pad names for ngrams...\n", 244 | " string = re.sub(r'[,-./]|\\sBD',r'', string)\n", 245 | " ngrams = zip(*[string[i:] for i in range(n)])\n", 246 | " return [''.join(ngram) for ngram in ngrams]\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | 
"execution_count": 9, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)\n", 256 | "tfidf = vectorizer.fit_transform(skills)\n" 257 | ] 258 | }, 259 | { 260 | "attachments": {}, 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "# Job Recommender" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 12, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)\n", 274 | "jd_test = (jd_df['Processed_JD'].values.astype('U'))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 11, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "def getNearestN(query):\n", 284 | " queryTFIDF_ = vectorizer.transform(query)\n", 285 | " distances, indices = nbrs.kneighbors(queryTFIDF_)\n", 286 | " return distances, indices\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 13, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "distances, indices = getNearestN(jd_test)\n", 296 | "test = list(jd_test) \n", 297 | "matches = []" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 14, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "for i,j in enumerate(indices):\n", 307 | " dist=round(distances[i][0],2)\n", 308 | " \n", 309 | " temp = [dist]\n", 310 | " matches.append(temp)\n", 311 | " \n", 312 | "matches = pd.DataFrame(matches, columns=['Match confidence'])" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
Job TitleRatingCompany NameLocationHeadquartersSizeFoundedType of ownershipIndustrySectorCompetitorsAverage SalaryAverage RevenueProcessed_JDmatch
4Data Scientist2.9Affinity SolutionsNew York, NYNew York, NY125.51998Company - PrivateAdvertising & MarketingBusiness ServicesCommerce Signals, Cardlytics, Yodlee114.524319.000761Data Scientist Affinity Solutions Marketing Cl...0.73
0Data Scientist3.8Tecolote ResearchAlbuquerque, NMGoleta, CA750.51973Company - PrivateAerospace & DefenseAerospace & Defense-172.075.000000Data Scientist Location: Albuquerque, Educatio...0.74
2Data Scientist4.8KnowBe4Clearwater, FLClearwater, FL750.52010Company - PrivateSecurity ServicesBusiness Services-185.0300.000000KnowBe4, Inc. high growth information security...0.79
3Data Scientist3.8PNNLRichland, WARichland, WA3000.51965GovernmentEnergyOil, Gas, Energy & UtilitiesOak Ridge National Laboratory, National Renewa...76.5250500.000000*Organization Job ID** Job ID: 310709 Director...0.80
1Healthcare Data Scientist3.4University of Maryland Medical SystemLinthicum, MDBaltimore, MD10000.01984Other OrganizationHealth Care Services & HospitalsHealth Care-187.53500.000000What You Will Do: General Summary The Healthca...0.85
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " Job Title Rating Company Name \n", 455 | "4 Data Scientist 2.9 Affinity Solutions \\\n", 456 | "0 Data Scientist 3.8 Tecolote Research \n", 457 | "2 Data Scientist 4.8 KnowBe4 \n", 458 | "3 Data Scientist 3.8 PNNL \n", 459 | "1 Healthcare Data Scientist 3.4 University of Maryland Medical System \n", 460 | "\n", 461 | " Location Headquarters Size Founded Type of ownership \n", 462 | "4 New York, NY New York, NY 125.5 1998 Company - Private \\\n", 463 | "0 Albuquerque, NM Goleta, CA 750.5 1973 Company - Private \n", 464 | "2 Clearwater, FL Clearwater, FL 750.5 2010 Company - Private \n", 465 | "3 Richland, WA Richland, WA 3000.5 1965 Government \n", 466 | "1 Linthicum, MD Baltimore, MD 10000.0 1984 Other Organization \n", 467 | "\n", 468 | " Industry Sector \n", 469 | "4 Advertising & Marketing Business Services \\\n", 470 | "0 Aerospace & Defense Aerospace & Defense \n", 471 | "2 Security Services Business Services \n", 472 | "3 Energy Oil, Gas, Energy & Utilities \n", 473 | "1 Health Care Services & Hospitals Health Care \n", 474 | "\n", 475 | " Competitors Average Salary \n", 476 | "4 Commerce Signals, Cardlytics, Yodlee 114.5 \\\n", 477 | "0 -1 72.0 \n", 478 | "2 -1 85.0 \n", 479 | "3 Oak Ridge National Laboratory, National Renewa... 76.5 \n", 480 | "1 -1 87.5 \n", 481 | "\n", 482 | " Average Revenue Processed_JD match \n", 483 | "4 24319.000761 Data Scientist Affinity Solutions Marketing Cl... 0.73 \n", 484 | "0 75.000000 Data Scientist Location: Albuquerque, Educatio... 0.74 \n", 485 | "2 300.000000 KnowBe4, Inc. high growth information security... 0.79 \n", 486 | "3 250500.000000 *Organization Job ID** Job ID: 310709 Director... 0.80 \n", 487 | "1 3500.000000 What You Will Do: General Summary The Healthca... 
0.85 " 488 | ] 489 | }, 490 | "execution_count": 23, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "# Following recommends Top 5 Jobs based on candidate resume:\n", 497 | "jd_df['match']=matches['Match confidence']\n", 498 | "jd_df.head(5).sort_values('match')" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [] 507 | } 508 | ], 509 | "metadata": { 510 | "kernelspec": { 511 | "display_name": "myenv", 512 | "language": "python", 513 | "name": "python3" 514 | }, 515 | "language_info": { 516 | "codemirror_mode": { 517 | "name": "ipython", 518 | "version": 3 519 | }, 520 | "file_extension": ".py", 521 | "mimetype": "text/x-python", 522 | "name": "python", 523 | "nbconvert_exporter": "python", 524 | "pygments_lexer": "ipython3", 525 | "version": "3.10.10" 526 | }, 527 | "orig_nbformat": 4, 528 | "vscode": { 529 | "interpreter": { 530 | "hash": "ae6b9c19ba8290d367f751939abe8de5af7ecdf4fdf442937bc3215b661f3d40" 531 | } 532 | } 533 | }, 534 | "nbformat": 4, 535 | "nbformat_minor": 2 536 | } 537 | --------------------------------------------------------------------------------